Skip to content

Commit

Permalink
apply workaround for when partition values are NULL
Browse files Browse the repository at this point in the history
  • Loading branch information
samansmink committed Jun 20, 2024
1 parent 5cb6f1d commit c1f44a3
Show file tree
Hide file tree
Showing 3 changed files with 82 additions and 12 deletions.
68 changes: 67 additions & 1 deletion src/functions/delta_scan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -467,13 +467,79 @@ unique_ptr<MultiFileReaderGlobalState> DeltaMultiFileReader::InitializeGlobalSta
return std::move(res);
}

// This code is duplicated from MultiFileReader::CreateNameMapping. The difference is that for global columns that
// are not found in the parquet file we insert NULL constants instead of throwing, which works around files whose
// partition value is NULL (the partition column is then absent from the file entirely).
//
// Parameters mirror MultiFileReader::CreateNameMapping:
// - local_*  : the columns actually present in the current parquet file
// - global_* : the columns of the overall (table-level) schema
// - global_column_ids : the projection over the global schema requested by the scan
// - reader_data : receives the resulting column mapping, casts and constant columns
static void CustomMulfiFileNameMapping(const string &file_name, const vector<LogicalType> &local_types,
                                       const vector<string> &local_names, const vector<LogicalType> &global_types,
                                       const vector<string> &global_names, const vector<column_t> &global_column_ids,
                                       MultiFileReaderData &reader_data, const string &initial_file,
                                       optional_ptr<MultiFileReaderGlobalState> global_state) {
	D_ASSERT(global_types.size() == global_names.size());
	D_ASSERT(local_types.size() == local_names.size());
	// we have expected types: create a map of name -> column index for the columns present in this file
	case_insensitive_map_t<idx_t> name_map;
	for (idx_t col_idx = 0; col_idx < local_names.size(); col_idx++) {
		name_map[local_names[col_idx]] = col_idx;
	}
	for (idx_t i = 0; i < global_column_ids.size(); i++) {
		// check if this column was already registered as a constant (e.g. a partition value) - if so, skip it
		bool constant = false;
		for (auto &entry : reader_data.constant_map) {
			if (entry.column_id == i) {
				constant = true;
				break;
			}
		}
		if (constant) {
			// this column is constant for this file
			continue;
		}
		// not constant - look up the column in the name map
		auto global_id = global_column_ids[i];
		if (global_id >= global_types.size()) {
			throw InternalException(
			    "MultiFileReader::CreatePositionalMapping - global_id is out of range in global_types for this file");
		}
		auto &global_name = global_names[global_id];
		auto entry = name_map.find(global_name);
		if (entry == name_map.end()) {
			// column not present in this parquet file
			// FIXME: this override is pretty hacky: for missing columns we just insert NULL constants
			// (Value(type) constructs a NULL of that type); upstream MultiFileReader throws here instead
			auto &global_type = global_types[global_id];
			Value val(global_type);
			reader_data.constant_map.push_back({i, val});
			continue;
		}
		// we found the column in the local file - check if the types are the same
		auto local_id = entry->second;
		D_ASSERT(global_id < global_types.size());
		D_ASSERT(local_id < local_types.size());
		auto &global_type = global_types[global_id];
		auto &local_type = local_types[local_id];
		if (global_type != local_type) {
			// types differ: register a cast from the file's type to the global type
			reader_data.cast_map[local_id] = global_type;
		}
		// create the mapping from global projection slot to local column
		reader_data.column_mapping.push_back(i);
		reader_data.column_ids.push_back(local_id);
	}

	// if no local columns were mapped, the reader only produces constant/NULL columns
	reader_data.empty_columns = reader_data.column_ids.empty();
}

void DeltaMultiFileReader::CreateNameMapping(const string &file_name, const vector<LogicalType> &local_types,
const vector<string> &local_names, const vector<LogicalType> &global_types,
const vector<string> &global_names, const vector<column_t> &global_column_ids,
MultiFileReaderData &reader_data, const string &initial_file,
optional_ptr<MultiFileReaderGlobalState> global_state) {
// First call the base implementation to do most mapping
MultiFileReader::CreateNameMapping(file_name, local_types, local_names, global_types, global_names, global_column_ids, reader_data, initial_file, global_state);
CustomMulfiFileNameMapping(file_name, local_types, local_names, global_types, global_names, global_column_ids, reader_data, initial_file, global_state);

// Then we handle delta specific mapping
D_ASSERT(global_state);
Expand Down
14 changes: 8 additions & 6 deletions test/sql/dat/all.test
Original file line number Diff line number Diff line change
Expand Up @@ -54,12 +54,6 @@ SELECT *
FROM parquet_scan('${DAT_PATH}/out/reader_tests/generated/with_schema_change/expected/latest/**/*.parquet')
----


### FAILING DAT TESTS

# TODO fix all of these
mode skip

# basic_partitioned
query I rowsort basic_partitioned
SELECT *
Expand All @@ -71,6 +65,14 @@ SELECT *
FROM parquet_scan('${DAT_PATH}/out/reader_tests/generated/basic_partitioned/expected/latest/**/*.parquet')
----

### FAILING DAT TESTS

# TODO fix all of these
mode skip

# Fetches a path containing `letter=%252F%252520%2525f` from the kernel
# It should be `letter=%2F%2520%25f`, which means it's doubly URL-encoded

# multi_partitioned
query I rowsort multi_partitioned
SELECT *
Expand Down
12 changes: 7 additions & 5 deletions test/sql/delta_kernel_rs/basic_partitioned.test
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,12 @@ require delta

require-env DELTA_KERNEL_TESTS_PATH

# FIXME: this fails due to some weird error
mode skip

statement error
query III
SELECT * FROM delta_scan('${DELTA_KERNEL_TESTS_PATH}/basic_partitioned')
----
Failed to read file "/Users/sam/Development/delta-kernel-testing/delta-kernel-rs/kernel/tests/data/basic_partitioned/letter=__HIVE_DEFAULT_PARTITION__
NULL 6 6.6
a 4 4.4
e 5 5.5
a 1 1.1
b 2 2.2
c 3 3.3

0 comments on commit c1f44a3

Please sign in to comment.