From c1f44a31c092ef5907f336dec0d9ef6ca3a983b9 Mon Sep 17 00:00:00 2001 From: Sam Ansmink Date: Thu, 20 Jun 2024 18:16:45 +0200 Subject: [PATCH] apply workaround for when partition values are NULL --- src/functions/delta_scan.cpp | 68 ++++++++++++++++++- test/sql/dat/all.test | 14 ++-- .../delta_kernel_rs/basic_partitioned.test | 12 ++-- 3 files changed, 82 insertions(+), 12 deletions(-) diff --git a/src/functions/delta_scan.cpp b/src/functions/delta_scan.cpp index ed968a2..1065a7e 100644 --- a/src/functions/delta_scan.cpp +++ b/src/functions/delta_scan.cpp @@ -467,13 +467,79 @@ unique_ptr DeltaMultiFileReader::InitializeGlobalSta return std::move(res); } +// This code is duplicated from MultiFileReader::CreateNameMapping the difference is that for columns that are not found +// in the parquet files, we just add null constant columns +static void CustomMulfiFileNameMapping(const string &file_name, const vector &local_types, + const vector &local_names, const vector &global_types, + const vector &global_names, const vector &global_column_ids, + MultiFileReaderData &reader_data, const string &initial_file, + optional_ptr global_state) { + D_ASSERT(global_types.size() == global_names.size()); + D_ASSERT(local_types.size() == local_names.size()); + // we have expected types: create a map of name -> column index + case_insensitive_map_t name_map; + for (idx_t col_idx = 0; col_idx < local_names.size(); col_idx++) { + name_map[local_names[col_idx]] = col_idx; + } + for (idx_t i = 0; i < global_column_ids.size(); i++) { + // check if this is a constant column + bool constant = false; + for (auto &entry : reader_data.constant_map) { + if (entry.column_id == i) { + constant = true; + break; + } + } + if (constant) { + // this column is constant for this file + continue; + } + // not constant - look up the column in the name map + auto global_id = global_column_ids[i]; + if (global_id >= global_types.size()) { + throw InternalException( + "MultiFileReader::CreatePositionalMapping - global_id is out of range in global_types for this file"); + } + auto &global_name = global_names[global_id]; + auto entry = name_map.find(global_name); + if (entry == name_map.end()) { + string candidate_names; + for (auto &local_name : local_names) { + if (!candidate_names.empty()) { + candidate_names += ", "; + } + candidate_names += local_name; + } + // FIXME: this override is pretty hacky: for missing columns we just insert NULL constants + auto &global_type = global_types[global_id]; + Value val (global_type); + reader_data.constant_map.push_back({i, val}); + continue; + } + // we found the column in the local file - check if the types are the same + auto local_id = entry->second; + D_ASSERT(global_id < global_types.size()); + D_ASSERT(local_id < local_types.size()); + auto &global_type = global_types[global_id]; + auto &local_type = local_types[local_id]; + if (global_type != local_type) { + reader_data.cast_map[local_id] = global_type; + } + // the types are the same - create the mapping + reader_data.column_mapping.push_back(i); + reader_data.column_ids.push_back(local_id); + } + + reader_data.empty_columns = reader_data.column_ids.empty(); +} + void DeltaMultiFileReader::CreateNameMapping(const string &file_name, const vector &local_types, const vector &local_names, const vector &global_types, const vector &global_names, const vector &global_column_ids, MultiFileReaderData &reader_data, const string &initial_file, optional_ptr global_state) { // First call the base implementation to do most mapping - MultiFileReader::CreateNameMapping(file_name, local_types, local_names, global_types, global_names, global_column_ids, reader_data, initial_file, global_state); + CustomMulfiFileNameMapping(file_name, local_types, local_names, global_types, global_names, global_column_ids, reader_data, initial_file, global_state); // Then we handle delta specific mapping D_ASSERT(global_state); diff --git a/test/sql/dat/all.test b/test/sql/dat/all.test index 6afeb84..b3ba2d8 100644 --- a/test/sql/dat/all.test +++ b/test/sql/dat/all.test @@ -54,12 +54,6 @@ SELECT * FROM parquet_scan('${DAT_PATH}/out/reader_tests/generated/with_schema_change/expected/latest/**/*.parquet') ---- - -### FAILING DAT TESTS - -# TODO fix all of these -mode skip - # basic_partitioned query I rowsort basic_partitioned SELECT * @@ -71,6 +65,14 @@ SELECT * FROM parquet_scan('${DAT_PATH}/out/reader_tests/generated/basic_partitioned/expected/latest/**/*.parquet') ---- +### FAILING DAT TESTS + +# TODO fix all of these +mode skip + +# Fetches path containing`letter=%252F%252520%2525f` from kernel +# Should be letter= %2F%2520%25f, which means its doubly url encoded + # multi_partitioned query I rowsort multi_partitioned SELECT * diff --git a/test/sql/delta_kernel_rs/basic_partitioned.test b/test/sql/delta_kernel_rs/basic_partitioned.test index 79804d1..d66d012 100644 --- a/test/sql/delta_kernel_rs/basic_partitioned.test +++ b/test/sql/delta_kernel_rs/basic_partitioned.test @@ -8,10 +8,12 @@ require delta require-env DELTA_KERNEL_TESTS_PATH -# FIXME: this fails due some weird error -mode skip - -statement error +query III SELECT * FROM delta_scan('${DELTA_KERNEL_TESTS_PATH}/basic_partitioned') ---- -Failed to read file "/Users/sam/Development/delta-kernel-testing/delta-kernel-rs/kernel/tests/data/basic_partitioned/letter=__HIVE_DEFAULT_PARTITION__ +NULL 6 6.6 +a 4 4.4 +e 5 5.5 +a 1 1.1 +b 2 2.2 +c 3 3.3