diff --git a/scripts/plot.py b/scripts/plot.py
index c5ea201..9090f3f 100644
--- a/scripts/plot.py
+++ b/scripts/plot.py
@@ -1,13 +1,24 @@
 import duckdb
+import argparse
+
+### Parse script parameters
+parser = argparse.ArgumentParser(description='Plot the results in ./benchmark_results')
+parser.add_argument('-p','--pattern', help='Pattern to match result csv files to', required=False, default='*.csv')
+parser.add_argument('-w','--width', help='Width of graph, adjust to fit data', required=False, default=20, type=int)
+args = vars(parser.parse_args())
 
 ### Parse Query Results
-parse_benchmark_result_query = """
+parse_benchmark_result_query = f"""
 SELECT
     parse_filename(name, true) as benchmark,
     parse_filename(filename, true) as config,
     avg(timing) as timing
 FROM
-    read_csv('benchmark_results/*.csv', filename=1)
+    read_csv('benchmark_results/{args['pattern']}', filename=1, columns = {{
+        'name': 'VARCHAR',
+        'run': 'BIGINT',
+        'timing': 'double'
+    }})
 GROUP BY
     config,
     benchmark
@@ -22,6 +33,6 @@
 import matplotlib.pyplot as plt
 import numpy as np
 
-plt.rcParams["figure.figsize"] = [10, 5]
+plt.rcParams["figure.figsize"] = [args['width'], 5]
 fig = benchmark_results.pivot(index='benchmark', columns='config', values='timing').plot(kind='bar', title='', ylabel='runtime [s]').get_figure()
 fig.savefig('benchmark_results/result.png')
\ No newline at end of file
diff --git a/src/functions/delta_scan.cpp b/src/functions/delta_scan.cpp
index 9e1b222..d4e537f 100644
--- a/src/functions/delta_scan.cpp
+++ b/src/functions/delta_scan.cpp
@@ -45,7 +45,7 @@ string url_decode(string input) {
     return result;
 }
 
-static void visit_callback(ffi::NullableCvoid engine_context, struct ffi::KernelStringSlice path, int64_t size, const ffi::Stats *, const ffi::DvInfo *dv_info, const struct ffi::CStringMap *partition_values) {
+static void visit_callback(ffi::NullableCvoid engine_context, struct ffi::KernelStringSlice path, int64_t size, const ffi::Stats * stats, const ffi::DvInfo *dv_info, const struct ffi::CStringMap *partition_values) {
     auto context = (DeltaSnapshot *) engine_context;
     auto path_string = context->GetPath();
     StringUtil::RTrim(path_string, "/");
@@ -62,6 +62,10 @@ static void visit_callback(ffi::NullableCvoid engine_context, struct ffi::Kernel
     // Initialize the file metadata
     context->metadata.back()->delta_snapshot_version = context->version;
     context->metadata.back()->file_number = context->resolved_files.size() - 1;
+    // Stats may be absent for a file: only record the cardinality when they were passed in
+    if (stats) {
+        context->metadata.back()->cardinality = stats->num_records;
+    }
 
     // Fetch the deletion vector
     auto selection_vector_res = ffi::selection_vector_from_dv(dv_info, context->extern_engine.get(), context->global_state.get());
@@ -485,6 +489,27 @@ idx_t DeltaSnapshot::GetTotalFileCount() {
     return resolved_files.size();
 }
 
+unique_ptr<NodeStatistics> DeltaSnapshot::GetCardinality(ClientContext &context) {
+    // This also ensures all files are expanded
+    auto total_file_count = DeltaSnapshot::GetTotalFileCount();
+
+    if (total_file_count == 0) {
+        return make_uniq<NodeStatistics>(0, 0);
+    }
+
+    idx_t total_tuple_count = 0;
+    for (auto &metadatum : metadata) {
+        // A file without stats keeps its cardinality at INVALID_INDEX; adding that sentinel
+        // would overflow the total, so report the cardinality as unknown instead
+        if (metadatum->cardinality == DConstants::INVALID_INDEX) {
+            return nullptr;
+        }
+        total_tuple_count += metadatum->cardinality;
+    }
+
+    return make_uniq<NodeStatistics>(total_tuple_count, total_tuple_count);
+}
+
 unique_ptr DeltaMultiFileReader::CreateInstance() {
     return std::move(make_uniq());
 }
diff --git a/src/include/functions/delta_scan.hpp b/src/include/functions/delta_scan.hpp
index 25e312f..aac35cc 100644
--- a/src/include/functions/delta_scan.hpp
+++ b/src/include/functions/delta_scan.hpp
@@ -28,6 +28,8 @@ struct DeltaFileMetaData {
 
     idx_t delta_snapshot_version = DConstants::INVALID_INDEX;
     idx_t file_number = DConstants::INVALID_INDEX;
+    //! Number of records in the file, taken from the file stats; INVALID_INDEX when no stats were available
+    idx_t cardinality = DConstants::INVALID_INDEX;
     ffi::KernelBoolSlice selection_vector = {nullptr, 0};
     case_insensitive_map_t partition_map;
 };
@@ -49,6 +51,9 @@ struct DeltaSnapshot : public MultiFileList {
     FileExpandResult GetExpandResult() override;
     idx_t GetTotalFileCount() override;
 
+    //! Total record count over all files, or nullptr when a file has no recorded cardinality
+    unique_ptr<NodeStatistics> GetCardinality(ClientContext &context) override;
+
 protected:
     //! Get the i-th expanded file
     string GetFile(idx_t i) override;
diff --git a/test/sql/dat/basic_append.test b/test/sql/dat/basic_append.test
index 87930b8..2286eed 100644
--- a/test/sql/dat/basic_append.test
+++ b/test/sql/dat/basic_append.test
@@ -20,6 +20,12 @@ FROM delta_scan('${DAT_PATH}/out/reader_tests/generated/basic_append/delta')
 ----
 5
 
+# Cardinality estimation should correctly show this
+query II
+EXPLAIN FROM delta_scan('${DAT_PATH}/out/reader_tests/generated/basic_append/delta')
+----
+physical_plan	<REGEX>:.*EC: 5 .*
+
 query I
 SELECT count(number)
 FROM delta_scan('${DAT_PATH}/out/reader_tests/generated/basic_append/delta')
@@ -78,3 +84,16 @@ SELECT a_float, number, letter
 FROM delta_scan('${DAT_PATH}/out/reader_tests/generated/basic_append/delta')
 WHERE number > 6
 ----
+
+# Filters are reflected in cardinality estimation: filtering out all files shows 0 EC
+query II
+EXPLAIN FROM delta_scan('${DAT_PATH}/out/reader_tests/generated/basic_append/delta')
+WHERE number > 6
+----
+physical_plan	<REGEX>:.*EC: 0 .*
+
+query II
+EXPLAIN FROM delta_scan('${DAT_PATH}/out/reader_tests/generated/basic_append/delta')
+WHERE number > 4
+----
+physical_plan	<REGEX>:.*EC: 1 .*
diff --git a/test/sql/delta_kernel_rs/basic_partitioned.test b/test/sql/delta_kernel_rs/basic_partitioned.test
index d66d012..4446477 100644
--- a/test/sql/delta_kernel_rs/basic_partitioned.test
+++ b/test/sql/delta_kernel_rs/basic_partitioned.test
@@ -17,3 +17,8 @@ e	5	5.5
 a	1	1.1
 b	2	2.2
 c	3	3.3
+
+query II
+EXPLAIN FROM delta_scan('${DELTA_KERNEL_TESTS_PATH}/basic_partitioned')
+----
+physical_plan	<REGEX>:.*EC: 6 .*