From 251d3bb20af7f666f3e7ffd9ba412f417149894c Mon Sep 17 00:00:00 2001 From: Sam Ansmink Date: Sat, 15 Jun 2024 10:54:26 +0200 Subject: [PATCH] bump delta to 181232a45562, enable cardinalty estimation, fix varchar pushdown --- CMakeLists.txt | 4 +-- scripts/generate_test_data.py | 3 +- src/delta_utils.cpp | 60 ++++++++++++++++++++--------------- src/functions/delta_scan.cpp | 3 -- 4 files changed, 38 insertions(+), 32 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 28ea1d2..58e3d39 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -59,8 +59,8 @@ endif() # Add rust_example as a CMake target ExternalProject_Add( ${KERNEL_NAME} - GIT_REPOSITORY "https://github.com/delta-incubator/delta-kernel-rs" - GIT_TAG c901665b98b2fed5ff1c713a9666eba9d16ea281 + GIT_REPOSITORY "https://github.com/nicklan/delta-kernel-rs" + GIT_TAG 181232a45562ca78be763c2f5fb46b88a2463b5c CONFIGURE_COMMAND "" UPDATE_COMMAND "" BUILD_IN_SOURCE 1 diff --git a/scripts/generate_test_data.py b/scripts/generate_test_data.py index cb1d2f7..e7bf588 100644 --- a/scripts/generate_test_data.py +++ b/scripts/generate_test_data.py @@ -78,7 +78,8 @@ def generate_test_data_pyspark(name, current_path, input_path, delete_predicate ## CREATE ## CONFIGURE USAGE OF DELETION VECTORS - spark.sql(f"ALTER TABLE test_table_{name} SET TBLPROPERTIES ('delta.enableDeletionVectors' = true);") + if (delete_predicate): + spark.sql(f"ALTER TABLE test_table_{name} SET TBLPROPERTIES ('delta.enableDeletionVectors' = true);") ## ADDING DELETES deltaTable = DeltaTable.forPath(spark, delta_table_path) diff --git a/src/delta_utils.cpp b/src/delta_utils.cpp index a805d15..c299e8e 100644 --- a/src/delta_utils.cpp +++ b/src/delta_utils.cpp @@ -200,6 +200,10 @@ static bool CanHandleFilter(TableFilter *filter) { switch (filter->filter_type) { case TableFilterType::CONSTANT_COMPARISON: return true; + case TableFilterType::IS_NULL: + return true; + case TableFilterType::IS_NOT_NULL: + return true; case TableFilterType::CONJUNCTION_AND: { auto &conjunction = static_cast(*filter); bool can_handle = true; @@ -258,28 +262,28 @@ uintptr_t PredicateVisitor::VisitConstantFilter(const string &col_name, const Co case LogicalType::BIGINT: right = visit_expression_literal_long(state, BigIntValue::Get(value)); break; - // case LogicalType::INTEGER: - // right = visit_expression_literal_int(state, IntegerValue::Get(value)); - // break; - // case LogicalType::SMALLINT: - // right = visit_expression_literal_short(state, SmallIntValue::Get(value)); - // break; - // case LogicalType::TINYINT: - // right = visit_expression_literal_byte(state, TinyIntValue::Get(value)); - // break; - // case LogicalType::FLOAT: - // right = visit_expression_literal_float(state, FloatValue::Get(value)); - // break; - // case LogicalType::DOUBLE: - // right = visit_expression_literal_double(state, DoubleValue::Get(value)); - // break; - // case LogicalType::BOOLEAN: - // right = visit_expression_literal_bool(state, BooleanValue::Get(value)); - // break; + case LogicalType::INTEGER: + right = visit_expression_literal_int(state, IntegerValue::Get(value)); + break; + case LogicalType::SMALLINT: + right = visit_expression_literal_short(state, SmallIntValue::Get(value)); + break; + case LogicalType::TINYINT: + right = visit_expression_literal_byte(state, TinyIntValue::Get(value)); + break; + case LogicalType::FLOAT: + right = visit_expression_literal_float(state, FloatValue::Get(value)); + break; + case LogicalType::DOUBLE: + right = visit_expression_literal_double(state, DoubleValue::Get(value)); + break; + case LogicalType::BOOLEAN: + right = visit_expression_literal_bool(state, BooleanValue::Get(value)); + break; case LogicalType::VARCHAR: { // WARNING: C++ lifetime extension rules don't protect calls of the form foo(std::string(...).c_str()) auto str = StringValue::Get(value); - auto maybe_right = ffi::visit_expression_literal_string(state, KernelUtils::ToDeltaString(col_name), DuckDBEngineError::AllocateError); + auto maybe_right = ffi::visit_expression_literal_string(state, KernelUtils::ToDeltaString(str), DuckDBEngineError::AllocateError); right = KernelUtils::UnpackResult(maybe_right, "VisitConstantFilter failed to visit_expression_literal_string"); break; } @@ -315,6 +319,10 @@ uintptr_t PredicateVisitor::VisitAndFilter(const string &col_name, const Conjunc return 0; } auto &child_filter = *it++; + + if (child_filter->filter_type == TableFilterType::IS_NULL || child_filter->filter_type == TableFilterType::IS_NOT_NULL) { + return 0; + } return VisitFilter(col_name, *child_filter, state); }; auto eit = EngineIteratorFromCallable(get_next); @@ -322,9 +330,9 @@ uintptr_t PredicateVisitor::VisitAndFilter(const string &col_name, const Conjunc } uintptr_t PredicateVisitor::VisitIsNull(const string &col_name, ffi::KernelExpressionVisitorState *state) { - auto maybe_left = ffi::visit_expression_column(state, KernelUtils::ToDeltaString(col_name), DuckDBEngineError::AllocateError); - uintptr_t left = KernelUtils::UnpackResult(maybe_left, "VisitIsNull failed to visit_expression_column"); - return ffi::visit_expression_is_null(state, left); + auto maybe_inner = ffi::visit_expression_column(state, KernelUtils::ToDeltaString(col_name), DuckDBEngineError::AllocateError); + uintptr_t inner = KernelUtils::UnpackResult(maybe_inner, "VisitIsNull failed to visit_expression_column"); + return ffi::visit_expression_is_null(state, inner); } uintptr_t PredicateVisitor::VisitIsNotNull(const string &col_name, ffi::KernelExpressionVisitorState *state) { @@ -337,10 +345,10 @@ uintptr_t PredicateVisitor::VisitFilter(const string &col_name, const TableFilte return VisitConstantFilter(col_name, static_cast(filter), state); case TableFilterType::CONJUNCTION_AND: return VisitAndFilter(col_name, static_cast(filter), state); - // case TableFilterType::IS_NULL: - // return VisitIsNull(col_name, state); - // case TableFilterType::IS_NOT_NULL: - // return VisitIsNotNull(col_name, state); + case TableFilterType::IS_NULL: + return VisitIsNull(col_name, state); + case TableFilterType::IS_NOT_NULL: + return VisitIsNotNull(col_name, state); default: return ~0; } diff --git a/src/functions/delta_scan.cpp b/src/functions/delta_scan.cpp index d4320e5..ed968a2 100644 --- a/src/functions/delta_scan.cpp +++ b/src/functions/delta_scan.cpp @@ -31,8 +31,6 @@ static void visit_callback(ffi::NullableCvoid engine_context, struct ffi::Kernel StringUtil::RTrim(path_string, "/"); path_string += "/" + KernelUtils::FromDeltaString(path); - printf("Got File %s\n", path_string.c_str()); - // First we append the file to our resolved files context->resolved_files.push_back(DeltaSnapshot::ToDuckDBPath(path_string)); context->metadata.emplace_back(make_uniq()); @@ -589,7 +587,6 @@ TableFunctionSet DeltaFunctions::GetDeltaScanFunction(DatabaseInstance &instance function.deserialize = nullptr; function.statistics = nullptr; function.table_scan_progress = nullptr; - function.cardinality = nullptr; function.get_bind_info = nullptr; // Schema param is just confusing here