Skip to content

Commit

Permalink
fix string encoding-related dat failures
Browse files Browse the repository at this point in the history
  • Loading branch information
samansmink committed Jun 27, 2024
1 parent c1f44a3 commit 53e6d95
Show file tree
Hide file tree
Showing 4 changed files with 39 additions and 23 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ ExternalProject_Add(
GIT_REPOSITORY "https://github.com/delta-incubator/delta-kernel-rs"
# WARNING: the FFI headers are currently pinned due to the C linkage issue of the c++ headers. Currently, when bumping
# the kernel version, the produced header in ./src/include/delta_kernel_ffi.hpp should be also bumped, applying the fix
GIT_TAG 6f95fd3bfaaa57698d72f539f8c6a0475a52c4e7
GIT_TAG ed2b80b127984481adba8e59879f39b9e5f871d1
# Prints the env variables passed to the cargo build to the terminal, useful in debugging because passing them
# through CMake is an error-prone mess
CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env ${RUST_UNSET_ENV_VARS} ${RUST_ENV_VARS} env
Expand Down
23 changes: 22 additions & 1 deletion src/functions/delta_scan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,33 @@ static void* allocate_string(const struct ffi::KernelStringSlice slice) {
return new string(slice.ptr, slice.len);
}

static void visit_callback(ffi::NullableCvoid engine_context, struct ffi::KernelStringSlice path, int64_t size, const ffi::DvInfo *dv_info, const struct ffi::CStringMap *partition_values) {
string url_decode(string input) {
string result;
result.reserve(input.size());
char ch;
replace(input.begin(), input.end(), '+', ' ');
for (idx_t i = 0; i < input.length(); i++) {
if (int(input[i]) == 37) {
unsigned int ii;
sscanf(input.substr(i + 1, 2).c_str(), "%x", &ii);
ch = static_cast<char>(ii);
result += ch;
i += 2;
} else {
result += input[i];
}
}
return result;
}

static void visit_callback(ffi::NullableCvoid engine_context, struct ffi::KernelStringSlice path, int64_t size, const ffi::Stats *, const ffi::DvInfo *dv_info, const struct ffi::CStringMap *partition_values) {
auto context = (DeltaSnapshot *) engine_context;
auto path_string = context->GetPath();
StringUtil::RTrim(path_string, "/");
path_string += "/" + KernelUtils::FromDeltaString(path);

path_string = url_decode(path_string);

// First we append the file to our resolved files
context->resolved_files.push_back(DeltaSnapshot::ToDuckDBPath(path_string));
context->metadata.emplace_back(make_uniq<DeltaFileMetaData>());
Expand Down
11 changes: 10 additions & 1 deletion src/include/delta_kernel_ffi.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -304,9 +304,19 @@ using NullableCvoid = void*;
/// function is that `kernel_str` is _only_ valid until the return from this function
using AllocateStringFn = NullableCvoid(*)(KernelStringSlice kernel_str);

/// Give engines an easy way to consume stats
struct Stats {
/// For any file where the deletion vector is not present (see [`DvInfo::has_vector`]), the
/// `num_records` statistic must be present and accurate, and must equal the number of records
/// in the data file. In the presence of Deletion Vectors the statistics may be somewhat
/// outdated, i.e. not reflecting deleted rows yet.
uint64_t num_records;
};

using CScanCallback = void(*)(NullableCvoid engine_context,
KernelStringSlice path,
int64_t size,
const Stats *stats,
const DvInfo *dv_info,
const CStringMap *partition_map);

Expand All @@ -324,7 +334,6 @@ struct im_an_unused_struct_that_tricks_msvc_into_compilation {
ExternResult<Handle<SharedScan>> field10;
};


extern "C" {

/// # Safety
Expand Down
26 changes: 6 additions & 20 deletions test/sql/dat/all.test
Original file line number Diff line number Diff line change
Expand Up @@ -65,33 +65,14 @@ SELECT *
FROM parquet_scan('${DAT_PATH}/out/reader_tests/generated/basic_partitioned/expected/latest/**/*.parquet')
----

### FAILING DAT TESTS

# TODO fix all of these
mode skip

# Fetches path containing`letter=%252F%252520%2525f` from kernel
# Should be letter= %2F%2520%25f, which means its doubly url encoded

# multi_partitioned
query I rowsort multi_partitioned
SELECT *
FROM delta_scan('${DAT_PATH}/out/reader_tests/generated/multi_partitioned/delta')
----

query I rowsort multi_partitioned
SELECT *
FROM parquet_scan('${DAT_PATH}/out/reader_tests/generated/multi_partitioned/expected/latest/**/*.parquet')
----

# multi_partitioned
query I rowsort multi_partitioned
SELECT *
FROM delta_scan('${DAT_PATH}/out/reader_tests/generated/multi_partitioned/delta')
----

query I rowsort multi_partitioned
SELECT *
SELECT letter, date, decode(data) as data, number
FROM parquet_scan('${DAT_PATH}/out/reader_tests/generated/multi_partitioned/expected/latest/**/*.parquet')
----

Expand All @@ -106,6 +87,11 @@ SELECT *
FROM parquet_scan('${DAT_PATH}/out/reader_tests/generated/multi_partitioned_2/expected/latest/**/*.parquet')
----

### FAILING DAT TESTS

# TODO fix all of these
mode skip

# no_replay
query I rowsort no_replay
SELECT *
Expand Down

0 comments on commit 53e6d95

Please sign in to comment.