Skip to content

Commit

Permalink
apply workaround for when partition values are NULL
Browse files Browse the repository at this point in the history
  • Loading branch information
samansmink committed Jun 20, 2024
1 parent 5cb6f1d commit c1f44a3
Show file tree
Hide file tree
Showing 3 changed files with 82 additions and 12 deletions.
68 changes: 67 additions & 1 deletion src/functions/delta_scan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -467,13 +467,79 @@ unique_ptr<MultiFileReaderGlobalState> DeltaMultiFileReader::InitializeGlobalSta
return std::move(res);
}

// This code is duplicated from MultiFileReader::CreateNameMapping. The difference is that for global columns that
// are not found in the parquet file we insert NULL constants instead of throwing, which works around files whose
// partition value is NULL (the partition column is then absent from the file entirely).
//
// Parameters mirror MultiFileReader::CreateNameMapping:
// - local_*  : the columns actually present in the current parquet file
// - global_* : the columns of the overall (table-level) schema
// - global_column_ids : the projection over the global schema requested by the scan
// - reader_data : receives the resulting column mapping, casts and constant columns
static void CustomMulfiFileNameMapping(const string &file_name, const vector<LogicalType> &local_types,
                                       const vector<string> &local_names, const vector<LogicalType> &global_types,
                                       const vector<string> &global_names, const vector<column_t> &global_column_ids,
                                       MultiFileReaderData &reader_data, const string &initial_file,
                                       optional_ptr<MultiFileReaderGlobalState> global_state) {
	D_ASSERT(global_types.size() == global_names.size());
	D_ASSERT(local_types.size() == local_names.size());
	// we have expected types: create a map of name -> column index for the columns present in this file
	case_insensitive_map_t<idx_t> name_map;
	for (idx_t col_idx = 0; col_idx < local_names.size(); col_idx++) {
		name_map[local_names[col_idx]] = col_idx;
	}
	for (idx_t i = 0; i < global_column_ids.size(); i++) {
		// check if this column was already registered as a constant (e.g. a partition value) - if so, skip it
		bool constant = false;
		for (auto &entry : reader_data.constant_map) {
			if (entry.column_id == i) {
				constant = true;
				break;
			}
		}
		if (constant) {
			// this column is constant for this file
			continue;
		}
		// not constant - look up the column in the name map
		auto global_id = global_column_ids[i];
		if (global_id >= global_types.size()) {
			throw InternalException(
			    "MultiFileReader::CreatePositionalMapping - global_id is out of range in global_types for this file");
		}
		auto &global_name = global_names[global_id];
		auto entry = name_map.find(global_name);
		if (entry == name_map.end()) {
			// column not present in this parquet file
			// FIXME: this override is pretty hacky: for missing columns we just insert NULL constants
			// (Value(type) constructs a NULL of that type); upstream MultiFileReader throws here instead
			auto &global_type = global_types[global_id];
			Value val(global_type);
			reader_data.constant_map.push_back({i, val});
			continue;
		}
		// we found the column in the local file - check if the types are the same
		auto local_id = entry->second;
		D_ASSERT(global_id < global_types.size());
		D_ASSERT(local_id < local_types.size());
		auto &global_type = global_types[global_id];
		auto &local_type = local_types[local_id];
		if (global_type != local_type) {
			// types differ: register a cast from the file's type to the global type
			reader_data.cast_map[local_id] = global_type;
		}
		// create the mapping from global projection slot to local column
		reader_data.column_mapping.push_back(i);
		reader_data.column_ids.push_back(local_id);
	}

	// if no local columns were mapped, the reader only produces constant/NULL columns
	reader_data.empty_columns = reader_data.column_ids.empty();
}

void DeltaMultiFileReader::CreateNameMapping(const string &file_name, const vector<LogicalType> &local_types,
const vector<string> &local_names, const vector<LogicalType> &global_types,
const vector<string> &global_names, const vector<column_t> &global_column_ids,
MultiFileReaderData &reader_data, const string &initial_file,
optional_ptr<MultiFileReaderGlobalState> global_state) {
// First call the base implementation to do most mapping
MultiFileReader::CreateNameMapping(file_name, local_types, local_names, global_types, global_names, global_column_ids, reader_data, initial_file, global_state);
CustomMulfiFileNameMapping(file_name, local_types, local_names, global_types, global_names, global_column_ids, reader_data, initial_file, global_state);

// Then we handle delta specific mapping
D_ASSERT(global_state);
Expand Down
14 changes: 8 additions & 6 deletions test/sql/dat/all.test
Original file line number Diff line number Diff line change
Expand Up @@ -54,12 +54,6 @@ SELECT *
FROM parquet_scan('${DAT_PATH}/out/reader_tests/generated/with_schema_change/expected/latest/**/*.parquet')
----


### FAILING DAT TESTS

# TODO fix all of these
mode skip

# basic_partitioned
query I rowsort basic_partitioned
SELECT *
Expand All @@ -71,6 +65,14 @@ SELECT *
FROM parquet_scan('${DAT_PATH}/out/reader_tests/generated/basic_partitioned/expected/latest/**/*.parquet')
----

### FAILING DAT TESTS

# TODO fix all of these
mode skip

# Fetches a path containing `letter=%252F%252520%2525f` from the kernel
# It should be `letter=%2F%2520%25f`, which means it's doubly URL-encoded

# multi_partitioned
query I rowsort multi_partitioned
SELECT *
Expand Down
12 changes: 7 additions & 5 deletions test/sql/delta_kernel_rs/basic_partitioned.test
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,12 @@ require delta

require-env DELTA_KERNEL_TESTS_PATH

# FIXME: this fails due to some weird error
mode skip

statement error
query III
SELECT * FROM delta_scan('${DELTA_KERNEL_TESTS_PATH}/basic_partitioned')
----
Failed to read file "/Users/sam/Development/delta-kernel-testing/delta-kernel-rs/kernel/tests/data/basic_partitioned/letter=__HIVE_DEFAULT_PARTITION__
NULL 6 6.6
a 4 4.4
e 5 5.5
a 1 1.1
b 2 2.2
c 3 3.3

0 comments on commit c1f44a3

Please sign in to comment.