diff --git a/CMakeLists.txt b/CMakeLists.txt index b49ee4e3..a6cacc0f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,7 +25,7 @@ add_definitions(-DDUCKDB_EXTENSION_AUTOLOAD_DEFAULT=1 -DDUCKDB_EXTENSION_AUTOINS file(GLOB_RECURSE JAVA_SRC_FILES src/main/java/org/duckdb/*.java) file(GLOB_RECURSE JAVA_TEST_FILES src/test/java/org/duckdb/*.java) -set(DUCKDB_SRC_FILES src/duckdb/ub_src_catalog.cpp src/duckdb/ub_src_catalog_catalog_entry.cpp src/duckdb/ub_src_catalog_catalog_entry_dependency.cpp src/duckdb/ub_src_catalog_default.cpp src/duckdb/ub_src_common_adbc.cpp src/duckdb/ub_src_common_adbc_nanoarrow.cpp src/duckdb/ub_src_common.cpp src/duckdb/ub_src_common_arrow_appender.cpp src/duckdb/ub_src_common_arrow.cpp src/duckdb/ub_src_common_crypto.cpp src/duckdb/ub_src_common_enums.cpp src/duckdb/ub_src_common_exception.cpp src/duckdb/ub_src_common_operator.cpp src/duckdb/ub_src_common_progress_bar.cpp src/duckdb/ub_src_common_row_operations.cpp src/duckdb/ub_src_common_serializer.cpp src/duckdb/ub_src_common_sort.cpp src/duckdb/ub_src_common_tree_renderer.cpp src/duckdb/ub_src_common_types.cpp src/duckdb/ub_src_common_types_column.cpp src/duckdb/ub_src_common_types_row.cpp src/duckdb/ub_src_common_value_operations.cpp src/duckdb/src/common/vector_operations/boolean_operators.cpp src/duckdb/src/common/vector_operations/comparison_operators.cpp src/duckdb/src/common/vector_operations/generators.cpp src/duckdb/src/common/vector_operations/is_distinct_from.cpp src/duckdb/src/common/vector_operations/null_operations.cpp src/duckdb/src/common/vector_operations/numeric_inplace_operators.cpp src/duckdb/src/common/vector_operations/vector_cast.cpp src/duckdb/src/common/vector_operations/vector_copy.cpp src/duckdb/src/common/vector_operations/vector_hash.cpp src/duckdb/src/common/vector_operations/vector_storage.cpp src/duckdb/ub_src_execution.cpp src/duckdb/ub_src_execution_expression_executor.cpp src/duckdb/ub_src_execution_index_art.cpp src/duckdb/ub_src_execution_index.cpp src/duckdb/ub_src_execution_nested_loop_join.cpp src/duckdb/ub_src_execution_operator_aggregate.cpp src/duckdb/ub_src_execution_operator_csv_scanner_buffer_manager.cpp src/duckdb/ub_src_execution_operator_csv_scanner_encode.cpp src/duckdb/ub_src_execution_operator_csv_scanner_scanner.cpp src/duckdb/ub_src_execution_operator_csv_scanner_sniffer.cpp src/duckdb/ub_src_execution_operator_csv_scanner_state_machine.cpp src/duckdb/ub_src_execution_operator_csv_scanner_table_function.cpp src/duckdb/ub_src_execution_operator_csv_scanner_util.cpp src/duckdb/ub_src_execution_operator_filter.cpp src/duckdb/ub_src_execution_operator_helper.cpp src/duckdb/ub_src_execution_operator_join.cpp src/duckdb/ub_src_execution_operator_order.cpp src/duckdb/ub_src_execution_operator_persistent.cpp src/duckdb/ub_src_execution_operator_projection.cpp src/duckdb/ub_src_execution_operator_scan.cpp src/duckdb/ub_src_execution_operator_schema.cpp src/duckdb/ub_src_execution_operator_set.cpp src/duckdb/ub_src_execution_physical_plan.cpp src/duckdb/ub_src_function_aggregate_distributive.cpp src/duckdb/ub_src_function_aggregate.cpp src/duckdb/ub_src_function.cpp src/duckdb/ub_src_function_cast.cpp src/duckdb/ub_src_function_cast_union.cpp src/duckdb/ub_src_function_pragma.cpp src/duckdb/ub_src_function_scalar_compressed_materialization.cpp src/duckdb/ub_src_function_scalar.cpp src/duckdb/ub_src_function_scalar_date.cpp src/duckdb/ub_src_function_scalar_generic.cpp src/duckdb/ub_src_function_scalar_list.cpp src/duckdb/ub_src_function_scalar_map.cpp 
src/duckdb/ub_src_function_scalar_operator.cpp src/duckdb/ub_src_function_scalar_sequence.cpp src/duckdb/ub_src_function_scalar_string.cpp src/duckdb/ub_src_function_scalar_string_regexp.cpp src/duckdb/ub_src_function_scalar_struct.cpp src/duckdb/ub_src_function_scalar_system.cpp src/duckdb/ub_src_function_table_arrow.cpp src/duckdb/ub_src_function_table.cpp src/duckdb/ub_src_function_table_system.cpp src/duckdb/ub_src_function_table_version.cpp src/duckdb/ub_src_function_window.cpp src/duckdb/ub_src_main.cpp src/duckdb/ub_src_main_buffered_data.cpp src/duckdb/ub_src_main_capi.cpp src/duckdb/ub_src_main_capi_cast.cpp src/duckdb/ub_src_main_chunk_scan_state.cpp src/duckdb/ub_src_main_extension.cpp src/duckdb/ub_src_main_relation.cpp src/duckdb/ub_src_main_secret.cpp src/duckdb/ub_src_main_settings.cpp src/duckdb/ub_src_optimizer.cpp src/duckdb/ub_src_optimizer_compressed_materialization.cpp src/duckdb/ub_src_optimizer_join_order.cpp src/duckdb/ub_src_optimizer_matcher.cpp src/duckdb/ub_src_optimizer_pullup.cpp src/duckdb/ub_src_optimizer_pushdown.cpp src/duckdb/ub_src_optimizer_rule.cpp src/duckdb/ub_src_optimizer_statistics_expression.cpp src/duckdb/ub_src_optimizer_statistics_operator.cpp src/duckdb/ub_src_parallel.cpp src/duckdb/ub_src_parser.cpp src/duckdb/ub_src_parser_constraints.cpp src/duckdb/ub_src_parser_expression.cpp src/duckdb/ub_src_parser_parsed_data.cpp src/duckdb/ub_src_parser_query_node.cpp src/duckdb/ub_src_parser_statement.cpp src/duckdb/ub_src_parser_tableref.cpp src/duckdb/ub_src_parser_transform_constraint.cpp src/duckdb/ub_src_parser_transform_expression.cpp src/duckdb/ub_src_parser_transform_helpers.cpp src/duckdb/ub_src_parser_transform_statement.cpp src/duckdb/ub_src_parser_transform_tableref.cpp src/duckdb/ub_src_planner.cpp src/duckdb/ub_src_planner_binder_expression.cpp src/duckdb/ub_src_planner_binder_query_node.cpp src/duckdb/ub_src_planner_binder_statement.cpp src/duckdb/ub_src_planner_binder_tableref.cpp src/duckdb/ub_src_planner_expression.cpp src/duckdb/ub_src_planner_expression_binder.cpp src/duckdb/ub_src_planner_filter.cpp src/duckdb/ub_src_planner_operator.cpp src/duckdb/ub_src_planner_subquery.cpp src/duckdb/ub_src_storage.cpp src/duckdb/ub_src_storage_buffer.cpp src/duckdb/ub_src_storage_checkpoint.cpp src/duckdb/ub_src_storage_compression_alp.cpp src/duckdb/ub_src_storage_compression.cpp src/duckdb/ub_src_storage_compression_chimp.cpp src/duckdb/ub_src_storage_compression_roaring.cpp src/duckdb/ub_src_storage_metadata.cpp src/duckdb/ub_src_storage_serialization.cpp src/duckdb/ub_src_storage_statistics.cpp src/duckdb/ub_src_storage_table.cpp src/duckdb/ub_src_transaction.cpp src/duckdb/src/verification/copied_statement_verifier.cpp src/duckdb/src/verification/deserialized_statement_verifier.cpp src/duckdb/src/verification/external_statement_verifier.cpp src/duckdb/src/verification/fetch_row_verifier.cpp src/duckdb/src/verification/no_operator_caching_verifier.cpp src/duckdb/src/verification/parsed_statement_verifier.cpp src/duckdb/src/verification/prepared_statement_verifier.cpp src/duckdb/src/verification/statement_verifier.cpp src/duckdb/src/verification/unoptimized_statement_verifier.cpp src/duckdb/third_party/fmt/format.cc src/duckdb/third_party/fsst/libfsst.cpp src/duckdb/third_party/miniz/miniz.cpp src/duckdb/third_party/re2/re2/bitmap256.cc src/duckdb/third_party/re2/re2/bitstate.cc src/duckdb/third_party/re2/re2/compile.cc src/duckdb/third_party/re2/re2/dfa.cc src/duckdb/third_party/re2/re2/filtered_re2.cc 
src/duckdb/third_party/re2/re2/mimics_pcre.cc src/duckdb/third_party/re2/re2/nfa.cc src/duckdb/third_party/re2/re2/onepass.cc src/duckdb/third_party/re2/re2/parse.cc src/duckdb/third_party/re2/re2/perl_groups.cc src/duckdb/third_party/re2/re2/prefilter.cc src/duckdb/third_party/re2/re2/prefilter_tree.cc src/duckdb/third_party/re2/re2/prog.cc src/duckdb/third_party/re2/re2/re2.cc src/duckdb/third_party/re2/re2/regexp.cc src/duckdb/third_party/re2/re2/set.cc src/duckdb/third_party/re2/re2/simplify.cc src/duckdb/third_party/re2/re2/stringpiece.cc src/duckdb/third_party/re2/re2/tostring.cc src/duckdb/third_party/re2/re2/unicode_casefold.cc src/duckdb/third_party/re2/re2/unicode_groups.cc src/duckdb/third_party/re2/util/rune.cc src/duckdb/third_party/re2/util/strutil.cc src/duckdb/third_party/hyperloglog/hyperloglog.cpp src/duckdb/third_party/hyperloglog/sds.cpp src/duckdb/third_party/skiplist/SkipList.cpp src/duckdb/third_party/fastpforlib/bitpacking.cpp src/duckdb/third_party/utf8proc/utf8proc.cpp src/duckdb/third_party/utf8proc/utf8proc_wrapper.cpp src/duckdb/third_party/libpg_query/pg_functions.cpp src/duckdb/third_party/libpg_query/postgres_parser.cpp src/duckdb/third_party/libpg_query/src_backend_nodes_list.cpp src/duckdb/third_party/libpg_query/src_backend_nodes_makefuncs.cpp src/duckdb/third_party/libpg_query/src_backend_nodes_value.cpp src/duckdb/third_party/libpg_query/src_backend_parser_gram.cpp src/duckdb/third_party/libpg_query/src_backend_parser_parser.cpp src/duckdb/third_party/libpg_query/src_backend_parser_scan.cpp src/duckdb/third_party/libpg_query/src_backend_parser_scansup.cpp src/duckdb/third_party/libpg_query/src_common_keywords.cpp src/duckdb/third_party/mbedtls/library/aes.cpp src/duckdb/third_party/mbedtls/library/aria.cpp src/duckdb/third_party/mbedtls/library/asn1parse.cpp src/duckdb/third_party/mbedtls/library/base64.cpp src/duckdb/third_party/mbedtls/library/bignum.cpp src/duckdb/third_party/mbedtls/library/camellia.cpp src/duckdb/third_party/mbedtls/library/cipher.cpp src/duckdb/third_party/mbedtls/library/cipher_wrap.cpp src/duckdb/third_party/mbedtls/library/constant_time.cpp src/duckdb/third_party/mbedtls/library/entropy.cpp src/duckdb/third_party/mbedtls/library/entropy_poll.cpp src/duckdb/third_party/mbedtls/library/gcm.cpp src/duckdb/third_party/mbedtls/library/md.cpp src/duckdb/third_party/mbedtls/library/oid.cpp src/duckdb/third_party/mbedtls/library/pem.cpp src/duckdb/third_party/mbedtls/library/pk.cpp src/duckdb/third_party/mbedtls/library/pk_wrap.cpp src/duckdb/third_party/mbedtls/library/pkparse.cpp src/duckdb/third_party/mbedtls/library/platform_util.cpp src/duckdb/third_party/mbedtls/library/rsa.cpp src/duckdb/third_party/mbedtls/library/rsa_alt_helpers.cpp src/duckdb/third_party/mbedtls/library/sha1.cpp src/duckdb/third_party/mbedtls/library/sha256.cpp src/duckdb/third_party/mbedtls/library/sha512.cpp src/duckdb/third_party/mbedtls/mbedtls_wrapper.cpp src/duckdb/third_party/yyjson/yyjson.cpp src/duckdb/third_party/zstd/common/debug.cpp src/duckdb/third_party/zstd/common/entropy_common.cpp src/duckdb/third_party/zstd/common/error_private.cpp src/duckdb/third_party/zstd/common/fse_decompress.cpp src/duckdb/third_party/zstd/common/pool.cpp src/duckdb/third_party/zstd/common/threading.cpp src/duckdb/third_party/zstd/common/xxhash.cpp src/duckdb/third_party/zstd/common/zstd_common.cpp src/duckdb/third_party/zstd/compress/fse_compress.cpp src/duckdb/third_party/zstd/compress/hist.cpp src/duckdb/third_party/zstd/compress/huf_compress.cpp 
src/duckdb/third_party/zstd/compress/zstd_compress.cpp src/duckdb/third_party/zstd/compress/zstd_compress_literals.cpp src/duckdb/third_party/zstd/compress/zstd_compress_sequences.cpp src/duckdb/third_party/zstd/compress/zstd_compress_superblock.cpp src/duckdb/third_party/zstd/compress/zstd_double_fast.cpp src/duckdb/third_party/zstd/compress/zstd_fast.cpp src/duckdb/third_party/zstd/compress/zstd_lazy.cpp src/duckdb/third_party/zstd/compress/zstd_ldm.cpp src/duckdb/third_party/zstd/compress/zstd_opt.cpp src/duckdb/third_party/zstd/compress/zstdmt_compress.cpp src/duckdb/third_party/zstd/decompress/huf_decompress.cpp src/duckdb/third_party/zstd/decompress/zstd_ddict.cpp src/duckdb/third_party/zstd/decompress/zstd_decompress.cpp src/duckdb/third_party/zstd/decompress/zstd_decompress_block.cpp src/duckdb/third_party/zstd/deprecated/zbuff_common.cpp src/duckdb/third_party/zstd/deprecated/zbuff_compress.cpp src/duckdb/third_party/zstd/deprecated/zbuff_decompress.cpp src/duckdb/third_party/zstd/dict/cover.cpp src/duckdb/third_party/zstd/dict/divsufsort.cpp src/duckdb/third_party/zstd/dict/fastcover.cpp src/duckdb/third_party/zstd/dict/zdict.cpp src/duckdb/extension/core_functions/core_functions_extension.cpp src/duckdb/extension/core_functions/lambda_functions.cpp src/duckdb/extension/core_functions/function_list.cpp src/duckdb/ub_extension_core_functions_aggregate_regression.cpp src/duckdb/ub_extension_core_functions_aggregate_algebraic.cpp src/duckdb/ub_extension_core_functions_aggregate_nested.cpp src/duckdb/ub_extension_core_functions_aggregate_holistic.cpp src/duckdb/ub_extension_core_functions_aggregate_distributive.cpp src/duckdb/ub_extension_core_functions_scalar_random.cpp src/duckdb/ub_extension_core_functions_scalar_string.cpp src/duckdb/ub_extension_core_functions_scalar_math.cpp src/duckdb/ub_extension_core_functions_scalar_generic.cpp src/duckdb/ub_extension_core_functions_scalar_enum.cpp src/duckdb/ub_extension_core_functions_scalar_map.cpp src/duckdb/ub_extension_core_functions_scalar_operators.cpp src/duckdb/ub_extension_core_functions_scalar_date.cpp src/duckdb/ub_extension_core_functions_scalar_list.cpp src/duckdb/ub_extension_core_functions_scalar_blob.cpp src/duckdb/ub_extension_core_functions_scalar_debug.cpp src/duckdb/ub_extension_core_functions_scalar_array.cpp src/duckdb/ub_extension_core_functions_scalar_union.cpp src/duckdb/ub_extension_core_functions_scalar_struct.cpp src/duckdb/ub_extension_core_functions_scalar_bit.cpp src/duckdb/extension/parquet/column_reader.cpp src/duckdb/extension/parquet/column_writer.cpp src/duckdb/extension/parquet/parquet_crypto.cpp src/duckdb/extension/parquet/parquet_extension.cpp src/duckdb/extension/parquet/parquet_metadata.cpp src/duckdb/extension/parquet/parquet_reader.cpp src/duckdb/extension/parquet/parquet_statistics.cpp src/duckdb/extension/parquet/parquet_timestamp.cpp src/duckdb/extension/parquet/parquet_writer.cpp src/duckdb/extension/parquet/serialize_parquet.cpp src/duckdb/extension/parquet/zstd_file_system.cpp src/duckdb/extension/parquet/geo_parquet.cpp src/duckdb/third_party/parquet/parquet_types.cpp src/duckdb/third_party/thrift/thrift/protocol/TProtocol.cpp src/duckdb/third_party/thrift/thrift/transport/TTransportException.cpp src/duckdb/third_party/thrift/thrift/transport/TBufferTransports.cpp src/duckdb/third_party/snappy/snappy.cc src/duckdb/third_party/snappy/snappy-sinksource.cc src/duckdb/third_party/lz4/lz4.cpp src/duckdb/third_party/brotli/common/constants.cpp src/duckdb/third_party/brotli/common/context.cpp 
src/duckdb/third_party/brotli/common/dictionary.cpp src/duckdb/third_party/brotli/common/platform.cpp src/duckdb/third_party/brotli/common/shared_dictionary.cpp src/duckdb/third_party/brotli/common/transform.cpp src/duckdb/third_party/brotli/dec/bit_reader.cpp src/duckdb/third_party/brotli/dec/decode.cpp src/duckdb/third_party/brotli/dec/huffman.cpp src/duckdb/third_party/brotli/dec/state.cpp src/duckdb/third_party/brotli/enc/backward_references.cpp src/duckdb/third_party/brotli/enc/backward_references_hq.cpp src/duckdb/third_party/brotli/enc/bit_cost.cpp src/duckdb/third_party/brotli/enc/block_splitter.cpp src/duckdb/third_party/brotli/enc/brotli_bit_stream.cpp src/duckdb/third_party/brotli/enc/cluster.cpp src/duckdb/third_party/brotli/enc/command.cpp src/duckdb/third_party/brotli/enc/compound_dictionary.cpp src/duckdb/third_party/brotli/enc/compress_fragment.cpp src/duckdb/third_party/brotli/enc/compress_fragment_two_pass.cpp src/duckdb/third_party/brotli/enc/dictionary_hash.cpp src/duckdb/third_party/brotli/enc/encode.cpp src/duckdb/third_party/brotli/enc/encoder_dict.cpp src/duckdb/third_party/brotli/enc/entropy_encode.cpp src/duckdb/third_party/brotli/enc/fast_log.cpp src/duckdb/third_party/brotli/enc/histogram.cpp src/duckdb/third_party/brotli/enc/literal_cost.cpp src/duckdb/third_party/brotli/enc/memory.cpp src/duckdb/third_party/brotli/enc/metablock.cpp src/duckdb/third_party/brotli/enc/static_dict.cpp src/duckdb/third_party/brotli/enc/utf8_util.cpp src/duckdb/extension/icu/./icu-table-range.cpp src/duckdb/extension/icu/./icu-datefunc.cpp src/duckdb/extension/icu/./icu-datepart.cpp src/duckdb/extension/icu/./icu-datetrunc.cpp src/duckdb/extension/icu/./icu_extension.cpp src/duckdb/extension/icu/./icu-timezone.cpp src/duckdb/extension/icu/./icu-timebucket.cpp src/duckdb/extension/icu/./icu-makedate.cpp src/duckdb/extension/icu/./icu-datesub.cpp src/duckdb/extension/icu/./icu-dateadd.cpp src/duckdb/extension/icu/./icu-list-range.cpp src/duckdb/extension/icu/./icu-strptime.cpp src/duckdb/ub_extension_icu_third_party_icu_common.cpp src/duckdb/ub_extension_icu_third_party_icu_i18n.cpp src/duckdb/extension/icu/third_party/icu/stubdata/stubdata.cpp src/duckdb/extension/json/buffered_json_reader.cpp src/duckdb/extension/json/json_enums.cpp src/duckdb/extension/json/json_extension.cpp src/duckdb/extension/json/json_common.cpp src/duckdb/extension/json/json_functions.cpp src/duckdb/extension/json/json_scan.cpp src/duckdb/extension/json/json_serializer.cpp src/duckdb/extension/json/json_deserializer.cpp src/duckdb/extension/json/serialize_json.cpp src/duckdb/ub_extension_json_json_functions.cpp) +set(DUCKDB_SRC_FILES src/duckdb/ub_src_catalog.cpp src/duckdb/ub_src_catalog_catalog_entry.cpp src/duckdb/ub_src_catalog_catalog_entry_dependency.cpp src/duckdb/ub_src_catalog_default.cpp src/duckdb/ub_src_common_adbc.cpp src/duckdb/ub_src_common_adbc_nanoarrow.cpp src/duckdb/ub_src_common.cpp src/duckdb/ub_src_common_arrow_appender.cpp src/duckdb/ub_src_common_arrow.cpp src/duckdb/ub_src_common_crypto.cpp src/duckdb/ub_src_common_enums.cpp src/duckdb/ub_src_common_exception.cpp src/duckdb/ub_src_common_operator.cpp src/duckdb/ub_src_common_progress_bar.cpp src/duckdb/ub_src_common_row_operations.cpp src/duckdb/ub_src_common_serializer.cpp src/duckdb/ub_src_common_sort.cpp src/duckdb/ub_src_common_tree_renderer.cpp src/duckdb/ub_src_common_types.cpp src/duckdb/ub_src_common_types_column.cpp src/duckdb/ub_src_common_types_row.cpp src/duckdb/ub_src_common_value_operations.cpp 
src/duckdb/src/common/vector_operations/boolean_operators.cpp src/duckdb/src/common/vector_operations/comparison_operators.cpp src/duckdb/src/common/vector_operations/generators.cpp src/duckdb/src/common/vector_operations/is_distinct_from.cpp src/duckdb/src/common/vector_operations/null_operations.cpp src/duckdb/src/common/vector_operations/numeric_inplace_operators.cpp src/duckdb/src/common/vector_operations/vector_cast.cpp src/duckdb/src/common/vector_operations/vector_copy.cpp src/duckdb/src/common/vector_operations/vector_hash.cpp src/duckdb/src/common/vector_operations/vector_storage.cpp src/duckdb/ub_src_execution.cpp src/duckdb/ub_src_execution_expression_executor.cpp src/duckdb/ub_src_execution_index_art.cpp src/duckdb/ub_src_execution_index.cpp src/duckdb/ub_src_execution_nested_loop_join.cpp src/duckdb/ub_src_execution_operator_aggregate.cpp src/duckdb/ub_src_execution_operator_csv_scanner_buffer_manager.cpp src/duckdb/ub_src_execution_operator_csv_scanner_encode.cpp src/duckdb/ub_src_execution_operator_csv_scanner_scanner.cpp src/duckdb/ub_src_execution_operator_csv_scanner_sniffer.cpp src/duckdb/ub_src_execution_operator_csv_scanner_state_machine.cpp src/duckdb/ub_src_execution_operator_csv_scanner_table_function.cpp src/duckdb/ub_src_execution_operator_csv_scanner_util.cpp src/duckdb/ub_src_execution_operator_filter.cpp src/duckdb/ub_src_execution_operator_helper.cpp src/duckdb/ub_src_execution_operator_join.cpp src/duckdb/ub_src_execution_operator_order.cpp src/duckdb/ub_src_execution_operator_persistent.cpp src/duckdb/ub_src_execution_operator_projection.cpp src/duckdb/ub_src_execution_operator_scan.cpp src/duckdb/ub_src_execution_operator_schema.cpp src/duckdb/ub_src_execution_operator_set.cpp src/duckdb/ub_src_execution_physical_plan.cpp src/duckdb/ub_src_execution_sample.cpp src/duckdb/ub_src_function_aggregate_distributive.cpp src/duckdb/ub_src_function_aggregate.cpp src/duckdb/ub_src_function.cpp src/duckdb/ub_src_function_cast.cpp src/duckdb/ub_src_function_cast_union.cpp src/duckdb/ub_src_function_pragma.cpp src/duckdb/ub_src_function_scalar_compressed_materialization.cpp src/duckdb/ub_src_function_scalar.cpp src/duckdb/ub_src_function_scalar_date.cpp src/duckdb/ub_src_function_scalar_generic.cpp src/duckdb/ub_src_function_scalar_list.cpp src/duckdb/ub_src_function_scalar_map.cpp src/duckdb/ub_src_function_scalar_operator.cpp src/duckdb/ub_src_function_scalar_sequence.cpp src/duckdb/ub_src_function_scalar_string.cpp src/duckdb/ub_src_function_scalar_string_regexp.cpp src/duckdb/ub_src_function_scalar_struct.cpp src/duckdb/ub_src_function_scalar_system.cpp src/duckdb/ub_src_function_table_arrow.cpp src/duckdb/ub_src_function_table.cpp src/duckdb/ub_src_function_table_system.cpp src/duckdb/ub_src_function_table_version.cpp src/duckdb/ub_src_function_window.cpp src/duckdb/ub_src_main.cpp src/duckdb/ub_src_main_buffered_data.cpp src/duckdb/ub_src_main_capi.cpp src/duckdb/ub_src_main_capi_cast.cpp src/duckdb/ub_src_main_chunk_scan_state.cpp src/duckdb/ub_src_main_extension.cpp src/duckdb/ub_src_main_relation.cpp src/duckdb/ub_src_main_secret.cpp src/duckdb/ub_src_main_settings.cpp src/duckdb/ub_src_optimizer.cpp src/duckdb/ub_src_optimizer_compressed_materialization.cpp src/duckdb/ub_src_optimizer_join_order.cpp src/duckdb/ub_src_optimizer_matcher.cpp src/duckdb/ub_src_optimizer_pullup.cpp src/duckdb/ub_src_optimizer_pushdown.cpp src/duckdb/ub_src_optimizer_rule.cpp src/duckdb/ub_src_optimizer_statistics_expression.cpp src/duckdb/ub_src_optimizer_statistics_operator.cpp 
src/duckdb/ub_src_parallel.cpp src/duckdb/ub_src_parser.cpp src/duckdb/ub_src_parser_constraints.cpp src/duckdb/ub_src_parser_expression.cpp src/duckdb/ub_src_parser_parsed_data.cpp src/duckdb/ub_src_parser_query_node.cpp src/duckdb/ub_src_parser_statement.cpp src/duckdb/ub_src_parser_tableref.cpp src/duckdb/ub_src_parser_transform_constraint.cpp src/duckdb/ub_src_parser_transform_expression.cpp src/duckdb/ub_src_parser_transform_helpers.cpp src/duckdb/ub_src_parser_transform_statement.cpp src/duckdb/ub_src_parser_transform_tableref.cpp src/duckdb/ub_src_planner.cpp src/duckdb/ub_src_planner_binder_expression.cpp src/duckdb/ub_src_planner_binder_query_node.cpp src/duckdb/ub_src_planner_binder_statement.cpp src/duckdb/ub_src_planner_binder_tableref.cpp src/duckdb/ub_src_planner_expression.cpp src/duckdb/ub_src_planner_expression_binder.cpp src/duckdb/ub_src_planner_filter.cpp src/duckdb/ub_src_planner_operator.cpp src/duckdb/ub_src_planner_subquery.cpp src/duckdb/ub_src_storage.cpp src/duckdb/ub_src_storage_buffer.cpp src/duckdb/ub_src_storage_checkpoint.cpp src/duckdb/ub_src_storage_compression_alp.cpp src/duckdb/ub_src_storage_compression.cpp src/duckdb/ub_src_storage_compression_chimp.cpp src/duckdb/ub_src_storage_compression_dictionary.cpp src/duckdb/ub_src_storage_compression_roaring.cpp src/duckdb/ub_src_storage_metadata.cpp src/duckdb/ub_src_storage_serialization.cpp src/duckdb/ub_src_storage_statistics.cpp src/duckdb/ub_src_storage_table.cpp src/duckdb/ub_src_transaction.cpp src/duckdb/src/verification/copied_statement_verifier.cpp src/duckdb/src/verification/deserialized_statement_verifier.cpp src/duckdb/src/verification/external_statement_verifier.cpp src/duckdb/src/verification/fetch_row_verifier.cpp src/duckdb/src/verification/no_operator_caching_verifier.cpp src/duckdb/src/verification/parsed_statement_verifier.cpp src/duckdb/src/verification/prepared_statement_verifier.cpp src/duckdb/src/verification/statement_verifier.cpp src/duckdb/src/verification/unoptimized_statement_verifier.cpp src/duckdb/third_party/fmt/format.cc src/duckdb/third_party/fsst/libfsst.cpp src/duckdb/third_party/miniz/miniz.cpp src/duckdb/third_party/re2/re2/bitmap256.cc src/duckdb/third_party/re2/re2/bitstate.cc src/duckdb/third_party/re2/re2/compile.cc src/duckdb/third_party/re2/re2/dfa.cc src/duckdb/third_party/re2/re2/filtered_re2.cc src/duckdb/third_party/re2/re2/mimics_pcre.cc src/duckdb/third_party/re2/re2/nfa.cc src/duckdb/third_party/re2/re2/onepass.cc src/duckdb/third_party/re2/re2/parse.cc src/duckdb/third_party/re2/re2/perl_groups.cc src/duckdb/third_party/re2/re2/prefilter.cc src/duckdb/third_party/re2/re2/prefilter_tree.cc src/duckdb/third_party/re2/re2/prog.cc src/duckdb/third_party/re2/re2/re2.cc src/duckdb/third_party/re2/re2/regexp.cc src/duckdb/third_party/re2/re2/set.cc src/duckdb/third_party/re2/re2/simplify.cc src/duckdb/third_party/re2/re2/stringpiece.cc src/duckdb/third_party/re2/re2/tostring.cc src/duckdb/third_party/re2/re2/unicode_casefold.cc src/duckdb/third_party/re2/re2/unicode_groups.cc src/duckdb/third_party/re2/util/rune.cc src/duckdb/third_party/re2/util/strutil.cc src/duckdb/third_party/hyperloglog/hyperloglog.cpp src/duckdb/third_party/hyperloglog/sds.cpp src/duckdb/third_party/skiplist/SkipList.cpp src/duckdb/third_party/fastpforlib/bitpacking.cpp src/duckdb/third_party/utf8proc/utf8proc.cpp src/duckdb/third_party/utf8proc/utf8proc_wrapper.cpp src/duckdb/third_party/libpg_query/pg_functions.cpp src/duckdb/third_party/libpg_query/postgres_parser.cpp 
src/duckdb/third_party/libpg_query/src_backend_nodes_list.cpp src/duckdb/third_party/libpg_query/src_backend_nodes_makefuncs.cpp src/duckdb/third_party/libpg_query/src_backend_nodes_value.cpp src/duckdb/third_party/libpg_query/src_backend_parser_gram.cpp src/duckdb/third_party/libpg_query/src_backend_parser_parser.cpp src/duckdb/third_party/libpg_query/src_backend_parser_scan.cpp src/duckdb/third_party/libpg_query/src_backend_parser_scansup.cpp src/duckdb/third_party/libpg_query/src_common_keywords.cpp src/duckdb/third_party/mbedtls/library/aes.cpp src/duckdb/third_party/mbedtls/library/aria.cpp src/duckdb/third_party/mbedtls/library/asn1parse.cpp src/duckdb/third_party/mbedtls/library/base64.cpp src/duckdb/third_party/mbedtls/library/bignum.cpp src/duckdb/third_party/mbedtls/library/camellia.cpp src/duckdb/third_party/mbedtls/library/cipher.cpp src/duckdb/third_party/mbedtls/library/cipher_wrap.cpp src/duckdb/third_party/mbedtls/library/constant_time.cpp src/duckdb/third_party/mbedtls/library/entropy.cpp src/duckdb/third_party/mbedtls/library/entropy_poll.cpp src/duckdb/third_party/mbedtls/library/gcm.cpp src/duckdb/third_party/mbedtls/library/md.cpp src/duckdb/third_party/mbedtls/library/oid.cpp src/duckdb/third_party/mbedtls/library/pem.cpp src/duckdb/third_party/mbedtls/library/pk.cpp src/duckdb/third_party/mbedtls/library/pk_wrap.cpp src/duckdb/third_party/mbedtls/library/pkparse.cpp src/duckdb/third_party/mbedtls/library/platform_util.cpp src/duckdb/third_party/mbedtls/library/rsa.cpp src/duckdb/third_party/mbedtls/library/rsa_alt_helpers.cpp src/duckdb/third_party/mbedtls/library/sha1.cpp src/duckdb/third_party/mbedtls/library/sha256.cpp src/duckdb/third_party/mbedtls/library/sha512.cpp src/duckdb/third_party/mbedtls/mbedtls_wrapper.cpp src/duckdb/third_party/yyjson/yyjson.cpp src/duckdb/third_party/zstd/common/debug.cpp src/duckdb/third_party/zstd/common/entropy_common.cpp src/duckdb/third_party/zstd/common/error_private.cpp src/duckdb/third_party/zstd/common/fse_decompress.cpp src/duckdb/third_party/zstd/common/pool.cpp src/duckdb/third_party/zstd/common/threading.cpp src/duckdb/third_party/zstd/common/xxhash.cpp src/duckdb/third_party/zstd/common/zstd_common.cpp src/duckdb/third_party/zstd/compress/fse_compress.cpp src/duckdb/third_party/zstd/compress/hist.cpp src/duckdb/third_party/zstd/compress/huf_compress.cpp src/duckdb/third_party/zstd/compress/zstd_compress.cpp src/duckdb/third_party/zstd/compress/zstd_compress_literals.cpp src/duckdb/third_party/zstd/compress/zstd_compress_sequences.cpp src/duckdb/third_party/zstd/compress/zstd_compress_superblock.cpp src/duckdb/third_party/zstd/compress/zstd_double_fast.cpp src/duckdb/third_party/zstd/compress/zstd_fast.cpp src/duckdb/third_party/zstd/compress/zstd_lazy.cpp src/duckdb/third_party/zstd/compress/zstd_ldm.cpp src/duckdb/third_party/zstd/compress/zstd_opt.cpp src/duckdb/third_party/zstd/compress/zstdmt_compress.cpp src/duckdb/third_party/zstd/decompress/huf_decompress.cpp src/duckdb/third_party/zstd/decompress/zstd_ddict.cpp src/duckdb/third_party/zstd/decompress/zstd_decompress.cpp src/duckdb/third_party/zstd/decompress/zstd_decompress_block.cpp src/duckdb/third_party/zstd/deprecated/zbuff_common.cpp src/duckdb/third_party/zstd/deprecated/zbuff_compress.cpp src/duckdb/third_party/zstd/deprecated/zbuff_decompress.cpp src/duckdb/third_party/zstd/dict/cover.cpp src/duckdb/third_party/zstd/dict/divsufsort.cpp src/duckdb/third_party/zstd/dict/fastcover.cpp src/duckdb/third_party/zstd/dict/zdict.cpp 
src/duckdb/extension/core_functions/core_functions_extension.cpp src/duckdb/extension/core_functions/lambda_functions.cpp src/duckdb/extension/core_functions/function_list.cpp src/duckdb/ub_extension_core_functions_aggregate_regression.cpp src/duckdb/ub_extension_core_functions_aggregate_algebraic.cpp src/duckdb/ub_extension_core_functions_aggregate_nested.cpp src/duckdb/ub_extension_core_functions_aggregate_holistic.cpp src/duckdb/ub_extension_core_functions_aggregate_distributive.cpp src/duckdb/ub_extension_core_functions_scalar_random.cpp src/duckdb/ub_extension_core_functions_scalar_string.cpp src/duckdb/ub_extension_core_functions_scalar_math.cpp src/duckdb/ub_extension_core_functions_scalar_generic.cpp src/duckdb/ub_extension_core_functions_scalar_enum.cpp src/duckdb/ub_extension_core_functions_scalar_map.cpp src/duckdb/ub_extension_core_functions_scalar_operators.cpp src/duckdb/ub_extension_core_functions_scalar_date.cpp src/duckdb/ub_extension_core_functions_scalar_list.cpp src/duckdb/ub_extension_core_functions_scalar_blob.cpp src/duckdb/ub_extension_core_functions_scalar_debug.cpp src/duckdb/ub_extension_core_functions_scalar_array.cpp src/duckdb/ub_extension_core_functions_scalar_union.cpp src/duckdb/ub_extension_core_functions_scalar_struct.cpp src/duckdb/ub_extension_core_functions_scalar_bit.cpp src/duckdb/extension/parquet/column_reader.cpp src/duckdb/extension/parquet/column_writer.cpp src/duckdb/extension/parquet/parquet_crypto.cpp src/duckdb/extension/parquet/parquet_extension.cpp src/duckdb/extension/parquet/parquet_metadata.cpp src/duckdb/extension/parquet/parquet_reader.cpp src/duckdb/extension/parquet/parquet_statistics.cpp src/duckdb/extension/parquet/parquet_timestamp.cpp src/duckdb/extension/parquet/parquet_writer.cpp src/duckdb/extension/parquet/serialize_parquet.cpp src/duckdb/extension/parquet/zstd_file_system.cpp src/duckdb/extension/parquet/geo_parquet.cpp src/duckdb/third_party/parquet/parquet_types.cpp src/duckdb/third_party/thrift/thrift/protocol/TProtocol.cpp src/duckdb/third_party/thrift/thrift/transport/TTransportException.cpp src/duckdb/third_party/thrift/thrift/transport/TBufferTransports.cpp src/duckdb/third_party/snappy/snappy.cc src/duckdb/third_party/snappy/snappy-sinksource.cc src/duckdb/third_party/lz4/lz4.cpp src/duckdb/third_party/brotli/common/constants.cpp src/duckdb/third_party/brotli/common/context.cpp src/duckdb/third_party/brotli/common/dictionary.cpp src/duckdb/third_party/brotli/common/platform.cpp src/duckdb/third_party/brotli/common/shared_dictionary.cpp src/duckdb/third_party/brotli/common/transform.cpp src/duckdb/third_party/brotli/dec/bit_reader.cpp src/duckdb/third_party/brotli/dec/decode.cpp src/duckdb/third_party/brotli/dec/huffman.cpp src/duckdb/third_party/brotli/dec/state.cpp src/duckdb/third_party/brotli/enc/backward_references.cpp src/duckdb/third_party/brotli/enc/backward_references_hq.cpp src/duckdb/third_party/brotli/enc/bit_cost.cpp src/duckdb/third_party/brotli/enc/block_splitter.cpp src/duckdb/third_party/brotli/enc/brotli_bit_stream.cpp src/duckdb/third_party/brotli/enc/cluster.cpp src/duckdb/third_party/brotli/enc/command.cpp src/duckdb/third_party/brotli/enc/compound_dictionary.cpp src/duckdb/third_party/brotli/enc/compress_fragment.cpp src/duckdb/third_party/brotli/enc/compress_fragment_two_pass.cpp src/duckdb/third_party/brotli/enc/dictionary_hash.cpp src/duckdb/third_party/brotli/enc/encode.cpp src/duckdb/third_party/brotli/enc/encoder_dict.cpp src/duckdb/third_party/brotli/enc/entropy_encode.cpp 
src/duckdb/third_party/brotli/enc/fast_log.cpp src/duckdb/third_party/brotli/enc/histogram.cpp src/duckdb/third_party/brotli/enc/literal_cost.cpp src/duckdb/third_party/brotli/enc/memory.cpp src/duckdb/third_party/brotli/enc/metablock.cpp src/duckdb/third_party/brotli/enc/static_dict.cpp src/duckdb/third_party/brotli/enc/utf8_util.cpp src/duckdb/extension/icu/./icu-table-range.cpp src/duckdb/extension/icu/./icu-datefunc.cpp src/duckdb/extension/icu/./icu-datepart.cpp src/duckdb/extension/icu/./icu-datetrunc.cpp src/duckdb/extension/icu/./icu_extension.cpp src/duckdb/extension/icu/./icu-timezone.cpp src/duckdb/extension/icu/./icu-timebucket.cpp src/duckdb/extension/icu/./icu-makedate.cpp src/duckdb/extension/icu/./icu-datesub.cpp src/duckdb/extension/icu/./icu-dateadd.cpp src/duckdb/extension/icu/./icu-list-range.cpp src/duckdb/extension/icu/./icu-strptime.cpp src/duckdb/ub_extension_icu_third_party_icu_common.cpp src/duckdb/ub_extension_icu_third_party_icu_i18n.cpp src/duckdb/extension/icu/third_party/icu/stubdata/stubdata.cpp src/duckdb/extension/json/buffered_json_reader.cpp src/duckdb/extension/json/json_enums.cpp src/duckdb/extension/json/json_extension.cpp src/duckdb/extension/json/json_common.cpp src/duckdb/extension/json/json_functions.cpp src/duckdb/extension/json/json_scan.cpp src/duckdb/extension/json/json_serializer.cpp src/duckdb/extension/json/json_deserializer.cpp src/duckdb/extension/json/serialize_json.cpp src/duckdb/ub_extension_json_json_functions.cpp) set(CMAKE_JAVA_COMPILE_FLAGS -source 1.8 -target 1.8 -encoding utf-8) diff --git a/src/duckdb/extension/core_functions/aggregate/holistic/reservoir_quantile.cpp b/src/duckdb/extension/core_functions/aggregate/holistic/reservoir_quantile.cpp index e9991276..8c332500 100644 --- a/src/duckdb/extension/core_functions/aggregate/holistic/reservoir_quantile.cpp +++ b/src/duckdb/extension/core_functions/aggregate/holistic/reservoir_quantile.cpp @@ -39,7 +39,7 @@ struct ReservoirQuantileState { void FillReservoir(idx_t sample_size, T element) { if (pos < sample_size) { v[pos++] = element; - r_samp->InitializeReservoir(pos, len); + r_samp->InitializeReservoirWeights(pos, len); } else { D_ASSERT(r_samp->next_index_to_sample >= r_samp->num_entries_to_skip_b4_next_sample); if (r_samp->next_index_to_sample == r_samp->num_entries_to_skip_b4_next_sample) { diff --git a/src/duckdb/extension/json/buffered_json_reader.cpp b/src/duckdb/extension/json/buffered_json_reader.cpp index c5dc68f5..f99fe032 100644 --- a/src/duckdb/extension/json/buffered_json_reader.cpp +++ b/src/duckdb/extension/json/buffered_json_reader.cpp @@ -337,6 +337,11 @@ void BufferedJSONReader::ThrowTransformError(idx_t buf_index, idx_t line_or_obje error_message); } +bool BufferedJSONReader::HasThrown() { + lock_guard guard(lock); + return thrown; +} + double BufferedJSONReader::GetProgress() const { lock_guard guard(lock); if (HasFileHandle()) { diff --git a/src/duckdb/extension/json/include/buffered_json_reader.hpp b/src/duckdb/extension/json/include/buffered_json_reader.hpp index 6fddc78a..3069546b 100644 --- a/src/duckdb/extension/json/include/buffered_json_reader.hpp +++ b/src/duckdb/extension/json/include/buffered_json_reader.hpp @@ -136,6 +136,8 @@ class BufferedJSONReader { void ThrowParseError(idx_t buf_index, idx_t line_or_object_in_buf, yyjson_read_err &err, const string &extra = ""); //! Throws a transform error that mentions the file name and line number void ThrowTransformError(idx_t buf_index, idx_t line_or_object_in_buf, const string &error_message); + //! 
Whether this reader has thrown an error + bool HasThrown(); //! Scan progress double GetProgress() const; diff --git a/src/duckdb/extension/json/json_scan.cpp b/src/duckdb/extension/json/json_scan.cpp index a1dd4e0d..69915155 100644 --- a/src/duckdb/extension/json/json_scan.cpp +++ b/src/duckdb/extension/json/json_scan.cpp @@ -676,8 +676,8 @@ void JSONScanLocalState::ReadAndAutoDetect(JSONScanGlobalState &gstate, Allocate if (!bind_data.ignore_errors && bind_data.options.record_type == JSONRecordType::RECORDS && current_reader->GetRecordType() != JSONRecordType::RECORDS) { - throw InvalidInputException("Expected file \"%s\" to contain records, detected non-record JSON instead.", - current_reader->GetFileName()); + current_reader->ThrowTransformError(buffer_index.GetIndex(), 0, + "Expected records, detected non-record JSON instead."); } } @@ -829,6 +829,9 @@ bool JSONScanLocalState::ReconstructFirstObject(JSONScanGlobalState &gstate) { // Spinlock until the previous batch index has also read its buffer optional_ptr<JSONBufferHandle> previous_buffer_handle; while (!previous_buffer_handle) { + if (current_reader->HasThrown()) { + return false; + } previous_buffer_handle = current_reader->GetBuffer(current_buffer_handle->buffer_index - 1); } diff --git a/src/duckdb/extension/parquet/column_reader.cpp b/src/duckdb/extension/parquet/column_reader.cpp index 6ea2cae7..cf1f5ba5 100644 --- a/src/duckdb/extension/parquet/column_reader.cpp +++ b/src/duckdb/extension/parquet/column_reader.cpp @@ -533,6 +533,7 @@ idx_t ColumnReader::Read(uint64_t num_values, parquet_filter_t &filter, data_ptr idx_t result_offset = 0; auto to_read = num_values; + D_ASSERT(to_read <= STANDARD_VECTOR_SIZE); while (to_read > 0) { while (page_rows_available == 0) { @@ -542,7 +543,7 @@ idx_t ColumnReader::Read(uint64_t num_values, parquet_filter_t &filter, data_ptr D_ASSERT(block); auto read_now = MinValue<idx_t>(to_read, page_rows_available); - D_ASSERT(read_now <= STANDARD_VECTOR_SIZE); + D_ASSERT(read_now + result_offset <= STANDARD_VECTOR_SIZE); if (HasRepeats()) { D_ASSERT(repeated_decoder); @@ -565,7 +566,7 @@ idx_t ColumnReader::Read(uint64_t num_values, parquet_filter_t &filter, data_ptr if (result_offset != 0 && result.GetVectorType() != VectorType::FLAT_VECTOR) { result.Flatten(result_offset); - result.Resize(result_offset, result_offset + read_now); + result.Resize(result_offset, STANDARD_VECTOR_SIZE); } if (dict_decoder) { diff --git a/src/duckdb/src/catalog/catalog_entry/duck_table_entry.cpp b/src/duckdb/src/catalog/catalog_entry/duck_table_entry.cpp index e1d0c66a..6b7f72c5 100644 --- a/src/duckdb/src/catalog/catalog_entry/duck_table_entry.cpp +++ b/src/duckdb/src/catalog/catalog_entry/duck_table_entry.cpp @@ -136,6 +136,10 @@ unique_ptr<BaseStatistics> DuckTableEntry::GetStatistics(ClientContext &context, return storage->GetStatistics(context, column.StorageOid()); } +unique_ptr<BlockingSample> DuckTableEntry::GetSample() { + return storage->GetSample(); +} + unique_ptr<CatalogEntry> DuckTableEntry::AlterEntry(CatalogTransaction transaction, AlterInfo &info) { if (transaction.HasContext()) { return AlterEntry(transaction.GetContext(), info); diff --git a/src/duckdb/src/catalog/catalog_entry/table_catalog_entry.cpp b/src/duckdb/src/catalog/catalog_entry/table_catalog_entry.cpp index ce0078e5..4be35415 100644 --- a/src/duckdb/src/catalog/catalog_entry/table_catalog_entry.cpp +++ b/src/duckdb/src/catalog/catalog_entry/table_catalog_entry.cpp @@ -43,6 +43,10 @@ LogicalIndex TableCatalogEntry::GetColumnIndex(string &column_name, bool if_exis return entry; } +unique_ptr<BlockingSample>
TableCatalogEntry::GetSample() { + return nullptr; +} + bool TableCatalogEntry::ColumnExists(const string &name) const { return columns.ColumnExists(name); } @@ -251,28 +255,6 @@ static void BindExtraColumns(TableCatalogEntry &table, LogicalGet &get, LogicalP } } -static bool TypeSupportsRegularUpdate(const LogicalType &type) { - switch (type.id()) { - case LogicalTypeId::LIST: - case LogicalTypeId::ARRAY: - case LogicalTypeId::MAP: - case LogicalTypeId::UNION: - // lists and maps and unions don't support updates directly - return false; - case LogicalTypeId::STRUCT: { - auto &child_types = StructType::GetChildTypes(type); - for (auto &entry : child_types) { - if (!TypeSupportsRegularUpdate(entry.second)) { - return false; - } - } - return true; - } - default: - return true; - } -} - vector TableCatalogEntry::GetColumnSegmentInfo() { return {}; } @@ -317,7 +299,7 @@ void TableCatalogEntry::BindUpdateConstraints(Binder &binder, LogicalGet &get, L // we also convert any updates on LIST columns into delete + insert for (auto &col_index : update.columns) { auto &column = GetColumns().GetColumn(col_index); - if (!TypeSupportsRegularUpdate(column.Type())) { + if (!column.Type().SupportsRegularUpdate()) { update.update_is_del_and_insert = true; break; } diff --git a/src/duckdb/src/common/enum_util.cpp b/src/duckdb/src/common/enum_util.cpp index 41c5adf1..1e6af405 100644 --- a/src/duckdb/src/common/enum_util.cpp +++ b/src/duckdb/src/common/enum_util.cpp @@ -146,6 +146,44 @@ namespace duckdb { +const StringUtil::EnumStringLiteral *GetARTAppendModeValues() { + static constexpr StringUtil::EnumStringLiteral values[] { + { static_cast(ARTAppendMode::DEFAULT), "DEFAULT" }, + { static_cast(ARTAppendMode::IGNORE_DUPLICATES), "IGNORE_DUPLICATES" }, + { static_cast(ARTAppendMode::INSERT_DUPLICATES), "INSERT_DUPLICATES" } + }; + return values; +} + +template<> +const char* EnumUtil::ToChars(ARTAppendMode value) { + return StringUtil::EnumToString(GetARTAppendModeValues(), 3, "ARTAppendMode", static_cast(value)); +} + +template<> +ARTAppendMode EnumUtil::FromString(const char *value) { + return static_cast(StringUtil::StringToEnum(GetARTAppendModeValues(), 3, "ARTAppendMode", value)); +} + +const StringUtil::EnumStringLiteral *GetARTConflictTypeValues() { + static constexpr StringUtil::EnumStringLiteral values[] { + { static_cast(ARTConflictType::NO_CONFLICT), "NO_CONFLICT" }, + { static_cast(ARTConflictType::CONSTRAINT), "CONSTRAINT" }, + { static_cast(ARTConflictType::TRANSACTION), "TRANSACTION" } + }; + return values; +} + +template<> +const char* EnumUtil::ToChars(ARTConflictType value) { + return StringUtil::EnumToString(GetARTConflictTypeValues(), 3, "ARTConflictType", static_cast(value)); +} + +template<> +ARTConflictType EnumUtil::FromString(const char *value) { + return static_cast(StringUtil::StringToEnum(GetARTConflictTypeValues(), 3, "ARTConflictType", value)); +} + const StringUtil::EnumStringLiteral *GetAccessModeValues() { static constexpr StringUtil::EnumStringLiteral values[] { { static_cast(AccessMode::UNDEFINED), "UNDEFINED" }, @@ -1426,19 +1464,20 @@ const StringUtil::EnumStringLiteral *GetExtensionABITypeValues() { static constexpr StringUtil::EnumStringLiteral values[] { { static_cast(ExtensionABIType::UNKNOWN), "UNKNOWN" }, { static_cast(ExtensionABIType::CPP), "CPP" }, - { static_cast(ExtensionABIType::C_STRUCT), "C_STRUCT" } + { static_cast(ExtensionABIType::C_STRUCT), "C_STRUCT" }, + { static_cast(ExtensionABIType::C_STRUCT_UNSTABLE), "C_STRUCT_UNSTABLE" } }; return values; } 
template<> const char* EnumUtil::ToChars(ExtensionABIType value) { - return StringUtil::EnumToString(GetExtensionABITypeValues(), 3, "ExtensionABIType", static_cast(value)); + return StringUtil::EnumToString(GetExtensionABITypeValues(), 4, "ExtensionABIType", static_cast(value)); } template<> ExtensionABIType EnumUtil::FromString(const char *value) { - return static_cast(StringUtil::StringToEnum(GetExtensionABITypeValues(), 3, "ExtensionABIType", value)); + return static_cast(StringUtil::StringToEnum(GetExtensionABITypeValues(), 4, "ExtensionABIType", value)); } const StringUtil::EnumStringLiteral *GetExtensionInstallModeValues() { @@ -3086,6 +3125,24 @@ SampleType EnumUtil::FromString(const char *value) { return static_cast(StringUtil::StringToEnum(GetSampleTypeValues(), 3, "SampleType", value)); } +const StringUtil::EnumStringLiteral *GetSamplingStateValues() { + static constexpr StringUtil::EnumStringLiteral values[] { + { static_cast(SamplingState::RANDOM), "RANDOM" }, + { static_cast(SamplingState::RESERVOIR), "RESERVOIR" } + }; + return values; +} + +template<> +const char* EnumUtil::ToChars(SamplingState value) { + return StringUtil::EnumToString(GetSamplingStateValues(), 2, "SamplingState", static_cast(value)); +} + +template<> +SamplingState EnumUtil::FromString(const char *value) { + return static_cast(StringUtil::StringToEnum(GetSamplingStateValues(), 2, "SamplingState", value)); +} + const StringUtil::EnumStringLiteral *GetScanTypeValues() { static constexpr StringUtil::EnumStringLiteral values[] { { static_cast(ScanType::TABLE), "TABLE" }, diff --git a/src/duckdb/src/common/random_engine.cpp b/src/duckdb/src/common/random_engine.cpp index 704992f0..ebc0abd4 100644 --- a/src/duckdb/src/common/random_engine.cpp +++ b/src/duckdb/src/common/random_engine.cpp @@ -55,6 +55,10 @@ uint32_t RandomEngine::NextRandomInteger(uint32_t min, uint32_t max) { return min + static_cast(NextRandom() * double(max - min)); } +uint32_t RandomEngine::NextRandomInteger32(uint32_t min, uint32_t max) { + return min + static_cast(NextRandom32() * double(max - min)); +} + void RandomEngine::SetSeed(uint32_t seed) { random_state->pcg.seed(seed); } diff --git a/src/duckdb/src/common/tree_renderer/text_tree_renderer.cpp b/src/duckdb/src/common/tree_renderer/text_tree_renderer.cpp index 8e0fa425..fa24ee10 100644 --- a/src/duckdb/src/common/tree_renderer/text_tree_renderer.cpp +++ b/src/duckdb/src/common/tree_renderer/text_tree_renderer.cpp @@ -168,13 +168,12 @@ void TextTreeRenderer::RenderBoxContent(RenderTree &root, std::ostream &ss, idx_ for (idx_t x = 0; x < root.width; x++) { auto node = root.GetNode(x, y); if (node) { - SplitUpExtraInfo(node->extra_text, extra_info[x]); + SplitUpExtraInfo(node->extra_text, extra_info[x], config.max_extra_lines); if (extra_info[x].size() > extra_height) { extra_height = extra_info[x].size(); } } } - extra_height = MinValue(extra_height, config.max_extra_lines); idx_t halfway_point = (extra_height + 1) / 2; // now we render the actual node for (idx_t render_y = 0; render_y <= extra_height; render_y++) { @@ -405,7 +404,8 @@ void TextTreeRenderer::SplitStringBuffer(const string &source, vector &r } } -void TextTreeRenderer::SplitUpExtraInfo(const InsertionOrderPreservingMap &extra_info, vector &result) { +void TextTreeRenderer::SplitUpExtraInfo(const InsertionOrderPreservingMap &extra_info, vector &result, + idx_t max_lines) { if (extra_info.empty()) { return; } @@ -467,6 +467,18 @@ void TextTreeRenderer::SplitUpExtraInfo(const InsertionOrderPreservingMap max_lines) { + // 
truncate this entry + vector truncated_splits; + for (idx_t i = 0; i < max_lines / 2; i++) { + truncated_splits.push_back(std::move(splits[i])); + } + truncated_splits.push_back("..."); + for (idx_t i = splits.size() - max_lines / 2; i < splits.size(); i++) { + truncated_splits.push_back(std::move(splits[i])); + } + splits = std::move(truncated_splits); + } for (auto &split : splits) { SplitStringBuffer(split, result); } diff --git a/src/duckdb/src/common/types.cpp b/src/duckdb/src/common/types.cpp index e59ba561..6c3a7913 100644 --- a/src/duckdb/src/common/types.cpp +++ b/src/duckdb/src/common/types.cpp @@ -734,6 +734,27 @@ bool LogicalType::IsComplete() const { }); } +bool LogicalType::SupportsRegularUpdate() const { + switch (id()) { + case LogicalTypeId::LIST: + case LogicalTypeId::ARRAY: + case LogicalTypeId::MAP: + case LogicalTypeId::UNION: + return false; + case LogicalTypeId::STRUCT: { + auto &child_types = StructType::GetChildTypes(*this); + for (auto &entry : child_types) { + if (!entry.second.SupportsRegularUpdate()) { + return false; + } + } + return true; + } + default: + return true; + } +} + bool LogicalType::GetDecimalProperties(uint8_t &width, uint8_t &scale) const { switch (id_) { case LogicalTypeId::SQLNULL: diff --git a/src/duckdb/src/common/types/conflict_manager.cpp b/src/duckdb/src/common/types/conflict_manager.cpp index d06e7244..409d0278 100644 --- a/src/duckdb/src/common/types/conflict_manager.cpp +++ b/src/duckdb/src/common/types/conflict_manager.cpp @@ -212,18 +212,24 @@ idx_t ConflictManager::ConflictCount() const { return conflicts.Count(); } -void ConflictManager::AddIndex(BoundIndex &index) { - matched_indexes.insert(&index); +void ConflictManager::AddIndex(BoundIndex &index, optional_ptr delete_index) { + matched_indexes.push_back(index); + matched_delete_indexes.push_back(delete_index); + matched_index_names.insert(index.name); } bool ConflictManager::MatchedIndex(BoundIndex &index) { - return matched_indexes.count(&index); + return matched_index_names.find(index.name) != matched_index_names.end(); } -const unordered_set &ConflictManager::MatchedIndexes() const { +const vector> &ConflictManager::MatchedIndexes() const { return matched_indexes; } +const vector> &ConflictManager::MatchedDeleteIndexes() const { + return matched_delete_indexes; +} + void ConflictManager::Finalize() { D_ASSERT(!finalized); if (SingleIndexTarget()) { @@ -246,8 +252,8 @@ void ConflictManager::Finalize() { } } // Now create the row_ids Vector, aligned with the selection vector - auto &row_ids = InternalRowIds(); - auto row_id_data = FlatVector::GetData(row_ids); + auto &internal_row_ids = InternalRowIds(); + auto row_id_data = FlatVector::GetData(internal_row_ids); for (idx_t i = 0; i < selection.Count(); i++) { D_ASSERT(!row_id_map.empty()); @@ -260,7 +266,7 @@ void ConflictManager::Finalize() { } VerifyExistenceType ConflictManager::LookupType() const { - return this->lookup_type; + return lookup_type; } } // namespace duckdb diff --git a/src/duckdb/src/execution/aggregate_hashtable.cpp b/src/duckdb/src/execution/aggregate_hashtable.cpp index 65619c3f..589fe4fb 100644 --- a/src/duckdb/src/execution/aggregate_hashtable.cpp +++ b/src/duckdb/src/execution/aggregate_hashtable.cpp @@ -41,7 +41,8 @@ GroupedAggregateHashTable::GroupedAggregateHashTable(ClientContext &context, All vector aggregate_objects_p, idx_t initial_capacity, idx_t radix_bits) : BaseAggregateHashTable(context, allocator, aggregate_objects_p, std::move(payload_types_p)), - radix_bits(radix_bits), count(0), 
capacity(0), aggregate_allocator(make_shared_ptr(allocator)) { + radix_bits(radix_bits), count(0), capacity(0), skip_lookups(false), + aggregate_allocator(make_shared_ptr(allocator)) { // Append hash column to the end and initialise the row layout group_types_p.emplace_back(LogicalType::HASH); @@ -204,6 +205,9 @@ idx_t GroupedAggregateHashTable::ApplyBitMask(hash_t hash) const { void GroupedAggregateHashTable::Verify() { #ifdef DEBUG + if (skip_lookups) { + return; + } idx_t total_count = 0; for (idx_t i = 0; i < capacity; i++) { const auto &entry = entries[i]; @@ -230,6 +234,14 @@ idx_t GroupedAggregateHashTable::GetRadixBits() const { return radix_bits; } +idx_t GroupedAggregateHashTable::GetSinkCount() const { + return sink_count; +} + +void GroupedAggregateHashTable::SkipLookups() { + skip_lookups = true; +} + void GroupedAggregateHashTable::Resize(idx_t size) { D_ASSERT(size >= STANDARD_VECTOR_SIZE); D_ASSERT(IsPowerOfTwo(size)); @@ -458,6 +470,8 @@ optional_idx GroupedAggregateHashTable::TryAddCompressedGroups(DataChunk &groups } idx_t GroupedAggregateHashTable::AddChunk(DataChunk &groups, DataChunk &payload, const unsafe_vector &filter) { + sink_count += groups.size(); + // check if we can use an optimized path that utilizes compressed vectors auto result = TryAddCompressedGroups(groups, payload, filter); if (result.IsValid()) { @@ -563,23 +577,6 @@ idx_t GroupedAggregateHashTable::FindOrCreateGroupsInternal(DataChunk &groups, V } D_ASSERT(capacity - Count() >= chunk_size); // we need to be able to fit at least one vector of data - group_hashes_v.Flatten(chunk_size); - const auto hashes = FlatVector::GetData(group_hashes_v); - - addresses_v.Flatten(chunk_size); - const auto addresses = FlatVector::GetData(addresses_v); - - // Compute the entry in the table based on the hash using a modulo, - // and precompute the hash salts for faster comparison below - const auto ht_offsets = FlatVector::GetData(state.ht_offsets); - const auto hash_salts = FlatVector::GetData(state.hash_salts); - for (idx_t r = 0; r < chunk_size; r++) { - const auto &hash = hashes[r]; - ht_offsets[r] = ApplyBitMask(hash); - D_ASSERT(ht_offsets[r] == hash % capacity); - hash_salts[r] = ht_entry_t::ExtractSalt(hash); - } - // we start out with all entries [0, 1, 2, ..., chunk_size] const SelectionVector *sel_vector = FlatVector::IncrementalSelectionVector(); @@ -601,6 +598,42 @@ idx_t GroupedAggregateHashTable::FindOrCreateGroupsInternal(DataChunk &groups, V } TupleDataCollection::GetVectorData(state.partitioned_append_state.chunk_state, state.group_data.get()); + group_hashes_v.Flatten(chunk_size); + const auto hashes = FlatVector::GetData(group_hashes_v); + + addresses_v.Flatten(chunk_size); + const auto addresses = FlatVector::GetData(addresses_v); + + if (skip_lookups) { + // Just appending now + partitioned_data->AppendUnified(state.partitioned_append_state, state.group_chunk, + *FlatVector::IncrementalSelectionVector(), chunk_size); + RowOperations::InitializeStates(layout, state.partitioned_append_state.chunk_state.row_locations, + *FlatVector::IncrementalSelectionVector(), chunk_size); + + const auto row_locations = + FlatVector::GetData(state.partitioned_append_state.chunk_state.row_locations); + const auto &row_sel = state.partitioned_append_state.reverse_partition_sel; + for (idx_t i = 0; i < chunk_size; i++) { + const auto &row_idx = row_sel[i]; + const auto &row_location = row_locations[row_idx]; + addresses[i] = row_location; + } + count += chunk_size; + return chunk_size; + } + + // Compute the entry 
in the table based on the hash using a modulo, + // and precompute the hash salts for faster comparison below + const auto ht_offsets = FlatVector::GetData(state.ht_offsets); + const auto hash_salts = FlatVector::GetData(state.hash_salts); + for (idx_t r = 0; r < chunk_size; r++) { + const auto &hash = hashes[r]; + ht_offsets[r] = ApplyBitMask(hash); + D_ASSERT(ht_offsets[r] == hash % capacity); + hash_salts[r] = ht_entry_t::ExtractSalt(hash); + } + idx_t new_group_count = 0; idx_t remaining_entries = chunk_size; idx_t iteration_count; diff --git a/src/duckdb/src/execution/index/art/art.cpp b/src/duckdb/src/execution/index/art/art.cpp index be4beef1..4848c48f 100644 --- a/src/duckdb/src/execution/index/art/art.cpp +++ b/src/duckdb/src/execution/index/art/art.cpp @@ -45,7 +45,7 @@ ART::ART(const string &name, const IndexConstraintType index_constraint_type, co const shared_ptr, ALLOCATOR_COUNT>> &allocators_ptr, const IndexStorageInfo &info) : BoundIndex(name, ART::TYPE_NAME, index_constraint_type, column_ids, table_io_manager, unbound_expressions, db), - allocators(allocators_ptr), owns_data(false) { + allocators(allocators_ptr), owns_data(false), append_mode(ARTAppendMode::DEFAULT) { // FIXME: Use the new byte representation function to support nested types. for (idx_t i = 0; i < types.size(); i++) { @@ -477,32 +477,44 @@ bool ART::Construct(unsafe_vector &keys, unsafe_vector &row_ids, // Insert and Constraint Checking //===--------------------------------------------------------------------===// -ErrorData ART::Insert(IndexLock &lock, DataChunk &input, Vector &row_ids) { +ErrorData ART::Insert(IndexLock &l, DataChunk &chunk, Vector &row_ids) { + return Insert(l, chunk, row_ids, nullptr); +} + +ErrorData ART::Insert(IndexLock &l, DataChunk &chunk, Vector &row_ids, optional_ptr delete_index) { D_ASSERT(row_ids.GetType().InternalType() == ROW_TYPE); - auto row_count = input.size(); + auto row_count = chunk.size(); ArenaAllocator allocator(BufferAllocator::Get(db)); unsafe_vector keys(row_count); unsafe_vector row_id_keys(row_count); - GenerateKeyVectors(allocator, input, row_ids, keys, row_id_keys); + GenerateKeyVectors(allocator, chunk, row_ids, keys, row_id_keys); - // Insert the entries into the index. - idx_t failed_index = DConstants::INVALID_INDEX; + optional_ptr delete_art; + if (delete_index) { + delete_art = delete_index->Cast(); + } + + auto conflict_type = ARTConflictType::NO_CONFLICT; + optional_idx conflict_idx; auto was_empty = !tree.HasMetadata(); + + // Insert the entries into the index. for (idx_t i = 0; i < row_count; i++) { if (keys[i].Empty()) { continue; } - if (!Insert(tree, keys[i], 0, row_id_keys[i], tree.GetGateStatus())) { - // Insertion failure due to a constraint violation. - failed_index = i; + conflict_type = Insert(tree, keys[i], 0, row_id_keys[i], tree.GetGateStatus(), delete_art); + if (conflict_type != ARTConflictType::NO_CONFLICT) { + conflict_idx = i; break; } } // Remove any previously inserted entries. 
- if (failed_index != DConstants::INVALID_INDEX) { - for (idx_t i = 0; i < failed_index; i++) { + if (conflict_type != ARTConflictType::NO_CONFLICT) { + D_ASSERT(conflict_idx.IsValid()); + for (idx_t i = 0; i < conflict_idx.GetIndex(); i++) { if (keys[i].Empty()) { continue; } @@ -515,9 +527,14 @@ ErrorData ART::Insert(IndexLock &lock, DataChunk &input, Vector &row_ids) { VerifyAllocationsInternal(); } - if (failed_index != DConstants::INVALID_INDEX) { - auto msg = AppendRowError(input, failed_index); - return ErrorData(ConstraintException("PRIMARY KEY or UNIQUE constraint violated: duplicate key \"%s\"", msg)); + if (conflict_type == ARTConflictType::TRANSACTION) { + auto msg = AppendRowError(chunk, conflict_idx.GetIndex()); + return ErrorData(TransactionException("write-write conflict on key: \"%s\"", msg)); + } + + if (conflict_type == ARTConflictType::CONSTRAINT) { + auto msg = AppendRowError(chunk, conflict_idx.GetIndex()); + return ErrorData(ConstraintException("PRIMARY KEY or UNIQUE constraint violation: duplicate key \"%s\"", msg)); } #ifdef DEBUG @@ -531,22 +548,34 @@ ErrorData ART::Insert(IndexLock &lock, DataChunk &input, Vector &row_ids) { return ErrorData(); } -ErrorData ART::Append(IndexLock &lock, DataChunk &input, Vector &row_ids) { +ErrorData ART::Append(IndexLock &l, DataChunk &chunk, Vector &row_ids) { // Execute all column expressions before inserting the data chunk. DataChunk expr_chunk; expr_chunk.Initialize(Allocator::DefaultAllocator(), logical_types); - ExecuteExpressions(input, expr_chunk); - return Insert(lock, expr_chunk, row_ids); + ExecuteExpressions(chunk, expr_chunk); + + // Now insert the data chunk. + return Insert(l, expr_chunk, row_ids, nullptr); } -void ART::VerifyAppend(DataChunk &chunk) { - ConflictManager conflict_manager(VerifyExistenceType::APPEND, chunk.size()); - CheckConstraintsForChunk(chunk, conflict_manager); +ErrorData ART::AppendWithDeleteIndex(IndexLock &l, DataChunk &chunk, Vector &row_ids, + optional_ptr delete_index) { + // Execute all column expressions before inserting the data chunk. + DataChunk expr_chunk; + expr_chunk.Initialize(Allocator::DefaultAllocator(), logical_types); + ExecuteExpressions(chunk, expr_chunk); + + // Now insert the data chunk. 
+ return Insert(l, expr_chunk, row_ids, delete_index); } -void ART::VerifyAppend(DataChunk &chunk, ConflictManager &conflict_manager) { - D_ASSERT(conflict_manager.LookupType() == VerifyExistenceType::APPEND); - CheckConstraintsForChunk(chunk, conflict_manager); +void ART::VerifyAppend(DataChunk &chunk, optional_ptr delete_index, optional_ptr manager) { + if (manager) { + D_ASSERT(manager->LookupType() == VerifyExistenceType::APPEND); + return VerifyConstraint(chunk, delete_index, *manager); + } + ConflictManager local_manager(VerifyExistenceType::APPEND, chunk.size()); + VerifyConstraint(chunk, delete_index, local_manager); } void ART::InsertIntoEmpty(Node &node, const ARTKey &key, const idx_t depth, const ARTKey &row_id, @@ -566,26 +595,61 @@ void ART::InsertIntoEmpty(Node &node, const ARTKey &key, const idx_t depth, cons Leaf::New(ref, row_id.GetRowId()); } -bool ART::InsertIntoNode(Node &node, const ARTKey &key, const idx_t depth, const ARTKey &row_id, - const GateStatus status) { +ARTConflictType ART::InsertIntoInlined(Node &node, const ARTKey &key, const idx_t depth, const ARTKey &row_id, + const GateStatus status, optional_ptr delete_art) { + + if (!IsUnique() || append_mode == ARTAppendMode::INSERT_DUPLICATES) { + Leaf::InsertIntoInlined(*this, node, row_id, depth, status); + return ARTConflictType::NO_CONFLICT; + } + + if (!delete_art) { + if (append_mode == ARTAppendMode::IGNORE_DUPLICATES) { + return ARTConflictType::NO_CONFLICT; + } + return ARTConflictType::CONSTRAINT; + } + + // Lookup in the delete_art. + auto delete_leaf = delete_art->Lookup(delete_art->tree, key, 0); + if (!delete_leaf) { + return ARTConflictType::CONSTRAINT; + } + + // The row ID has changed. + // Thus, the local index has a newer (local) row ID, and this is a constraint violation. + D_ASSERT(delete_leaf->GetType() == NType::LEAF_INLINED); + auto deleted_row_id = delete_leaf->GetRowId(); + auto this_row_id = node.GetRowId(); + if (deleted_row_id != this_row_id) { + return ARTConflictType::CONSTRAINT; + } + + // The deleted key and its row ID match the current key and its row ID. + Leaf::InsertIntoInlined(*this, node, row_id, depth, status); + return ARTConflictType::NO_CONFLICT; +} + +ARTConflictType ART::InsertIntoNode(Node &node, const ARTKey &key, const idx_t depth, const ARTKey &row_id, + const GateStatus status, optional_ptr delete_art) { D_ASSERT(depth < key.len); auto child = node.GetChildMutable(*this, key[depth]); // Recurse, if a child exists at key[depth]. if (child) { D_ASSERT(child->HasMetadata()); - bool success = Insert(*child, key, depth + 1, row_id, status); + auto conflict_type = Insert(*child, key, depth + 1, row_id, status, delete_art); node.ReplaceChild(*this, key[depth], *child); - return success; + return conflict_type; } // Create an inlined prefix at key[depth]. if (status == GateStatus::GATE_SET) { Node remainder; auto byte = key[depth]; - auto success = Insert(remainder, key, depth + 1, row_id, status); + auto conflict_type = Insert(remainder, key, depth + 1, row_id, status, delete_art); Node::InsertChild(*this, node, byte, remainder); - return success; + return conflict_type; } // Insert an inlined leaf at key[depth]. @@ -601,49 +665,56 @@ bool ART::InsertIntoNode(Node &node, const ARTKey &key, const idx_t depth, const // Create the inlined leaf. 
 	Leaf::New(ref, row_id.GetRowId());
 	Node::InsertChild(*this, node, key[depth], leaf);
-	return true;
+	return ARTConflictType::NO_CONFLICT;
 }

-bool ART::Insert(Node &node, const ARTKey &key, idx_t depth, const ARTKey &row_id, const GateStatus status) {
+ARTConflictType ART::Insert(Node &node, const ARTKey &key, idx_t depth, const ARTKey &row_id, const GateStatus status,
+                            optional_ptr<ART> delete_art) {
 	if (!node.HasMetadata()) {
 		InsertIntoEmpty(node, key, depth, row_id, status);
-		return true;
+		return ARTConflictType::NO_CONFLICT;
 	}

 	// Enter a nested leaf.
 	if (status == GateStatus::GATE_NOT_SET && node.GetGateStatus() == GateStatus::GATE_SET) {
-		return Insert(node, row_id, 0, row_id, GateStatus::GATE_SET);
+		if (IsUnique()) {
+			// A unique index can temporarily contain duplicates if another transaction
+			// performs a DELETE followed by an INSERT of the same key. In that case, the
+			// previous value must be kept alive until no other transaction depends on it anymore.

+			// We restrict this transactionality to two-value leaves, so any subsequent
+			// incoming transaction must fail here.
+			return ARTConflictType::TRANSACTION;
+		}
+		return Insert(node, row_id, 0, row_id, GateStatus::GATE_SET, delete_art);
 	}

 	auto type = node.GetType();
 	switch (type) {
 	case NType::LEAF_INLINED: {
-		if (IsUnique()) {
-			return false;
-		}
-		Leaf::InsertIntoInlined(*this, node, row_id, depth, status);
-		return true;
+		return InsertIntoInlined(node, key, depth, row_id, status, delete_art);
 	}
 	case NType::LEAF: {
 		Leaf::TransformToNested(*this, node);
-		return Insert(node, key, depth, row_id, status);
+		return Insert(node, key, depth, row_id, status, delete_art);
 	}
 	case NType::NODE_7_LEAF:
 	case NType::NODE_15_LEAF:
 	case NType::NODE_256_LEAF: {
+		// Row IDs are unique, so there are never any duplicate byte conflicts here.
 		auto byte = key[Prefix::ROW_ID_COUNT];
 		Node::InsertChild(*this, node, byte);
-		return true;
+		return ARTConflictType::NO_CONFLICT;
 	}
 	case NType::NODE_4:
 	case NType::NODE_16:
 	case NType::NODE_48:
 	case NType::NODE_256:
-		return InsertIntoNode(node, key, depth, row_id, status);
+		return InsertIntoNode(node, key, depth, row_id, status, delete_art);
 	case NType::PREFIX:
-		return Prefix::Insert(*this, node, key, depth, row_id, status);
+		return Prefix::Insert(*this, node, key, depth, row_id, status, delete_art);
 	default:
-		throw InternalException("Invalid node type for Insert.");
+		throw InternalException("Invalid node type for ART::Insert.");
 	}
 }
@@ -953,11 +1024,7 @@ string ART::GenerateConstraintErrorMessage(VerifyExistenceType verify_type, cons
 	case VerifyExistenceType::APPEND: {
 		// APPEND to PK/UNIQUE table, but node/key already exists in PK/UNIQUE table.
 		string type = IsPrimary() ? "primary key" : "unique";
-		return StringUtil::Format("Duplicate key \"%s\" violates %s constraint. "
-		                          "If this is an unexpected constraint violation please double "
-		                          "check with the known index limitations section in our documentation "
-		                          "(https://duckdb.org/docs/sql/indexes).",
-		                          key_name, type);
+		return StringUtil::Format("Duplicate key \"%s\" violates %s constraint.", key_name, type);
 	}
 	case VerifyExistenceType::APPEND_FK: {
 		// APPEND_FK to FK table, node/key does not exist in PK/UNIQUE table.
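// (Illustrative sketch, not part of the patch: how a caller consumes the tri-state
// result of the reworked Insert, mirroring the error handling in ART::Insert above;
// `key`, `row_id_key`, and `delete_art` are assumed to be prepared as in this file.)
//
//     switch (art.Insert(art.tree, key, 0, row_id_key, art.tree.GetGateStatus(), delete_art)) {
//     case ARTConflictType::NO_CONFLICT:
//         break; // Inserted, or skipped under ARTAppendMode::IGNORE_DUPLICATES.
//     case ARTConflictType::CONSTRAINT:
//         throw ConstraintException("PRIMARY KEY or UNIQUE constraint violation: duplicate key");
//     case ARTConflictType::TRANSACTION:
//         throw TransactionException("write-write conflict on key");
//     }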
@@ -975,50 +1042,117 @@ string ART::GenerateConstraintErrorMessage(VerifyExistenceType verify_type, cons
 	}
 }

-void ART::CheckConstraintsForChunk(DataChunk &input, ConflictManager &conflict_manager) {
+void ART::VerifyLeaf(const Node &leaf, const ARTKey &key, optional_ptr<ART> delete_art, ConflictManager &manager,
+                     optional_idx &conflict_idx, idx_t i) {
+	// Fast path: the leaf is inlined, and the delete ART does not exist.
+	if (leaf.GetType() == NType::LEAF_INLINED && !delete_art) {
+		if (manager.AddHit(i, leaf.GetRowId())) {
+			conflict_idx = i;
+		}
+		return;
+	}
+
+	// Get the delete_leaf.
+	// All leaves in the delete ART are inlined.
+	auto deleted_leaf = delete_art->Lookup(delete_art->tree, key, 0);
+
+	// The leaf is inlined, and the same key does not exist in the delete ART.
+	if (leaf.GetType() == NType::LEAF_INLINED && !deleted_leaf) {
+		if (manager.AddHit(i, leaf.GetRowId())) {
+			conflict_idx = i;
+		}
+		return;
+	}
+
+	// The leaf is inlined, and the same key exists in the delete ART.
+	if (leaf.GetType() == NType::LEAF_INLINED && deleted_leaf) {
+		auto deleted_row_id = deleted_leaf->GetRowId();
+		auto this_row_id = leaf.GetRowId();
+
+		if (deleted_row_id == this_row_id) {
+			if (manager.AddMiss(i)) {
+				conflict_idx = i;
+			}
+			return;
+		}
+
+		if (manager.AddHit(i, this_row_id)) {
+			conflict_idx = i;
+		}
+		return;
+	}
+
+	// Scan the two row IDs in the leaf.
+	Iterator it(*this);
+	it.FindMinimum(leaf);
+	ARTKey empty_key = ARTKey();
+	unsafe_vector<row_t> row_ids;
+	it.Scan(empty_key, 2, row_ids, false);
+
+	if (!deleted_leaf) {
+		if (manager.AddHit(i, row_ids[0]) || manager.AddHit(i, row_ids[1])) {
+			conflict_idx = i;
+		}
+		return;
+	}
+
+	auto deleted_row_id = deleted_leaf->GetRowId();
+
+	if (deleted_row_id == row_ids[0] || deleted_row_id == row_ids[1]) {
+		if (manager.AddMiss(i)) {
+			conflict_idx = i;
+		}
+		return;
+	}
+
+	if (manager.AddHit(i, row_ids[0]) || manager.AddHit(i, row_ids[1])) {
+		conflict_idx = i;
+	}
+}
+
+void ART::VerifyConstraint(DataChunk &chunk, optional_ptr<BoundIndex> delete_index, ConflictManager &manager) {
 	// Lock the index during constraint checking.
 	lock_guard<mutex> l(lock);

 	DataChunk expr_chunk;
 	expr_chunk.Initialize(Allocator::DefaultAllocator(), logical_types);
-	ExecuteExpressions(input, expr_chunk);
+	ExecuteExpressions(chunk, expr_chunk);

 	ArenaAllocator arena_allocator(BufferAllocator::Get(db));
 	unsafe_vector<ARTKey> keys(expr_chunk.size());
 	GenerateKeys<>(arena_allocator, expr_chunk, keys);

-	auto found_conflict = DConstants::INVALID_INDEX;
-	for (idx_t i = 0; found_conflict == DConstants::INVALID_INDEX && i < input.size(); i++) {
+	optional_ptr<ART> delete_art;
+	if (delete_index) {
+		delete_art = delete_index->Cast<ART>();
+	}
+
+	optional_idx conflict_idx;
+	for (idx_t i = 0; !conflict_idx.IsValid() && i < chunk.size(); i++) {
 		if (keys[i].Empty()) {
-			if (conflict_manager.AddNull(i)) {
-				found_conflict = i;
+			if (manager.AddNull(i)) {
+				conflict_idx = i;
 			}
 			continue;
 		}

 		auto leaf = Lookup(tree, keys[i], 0);
 		if (!leaf) {
-			if (conflict_manager.AddMiss(i)) {
-				found_conflict = i;
+			if (manager.AddMiss(i)) {
+				conflict_idx = i;
 			}
 			continue;
 		}
-
-		// If we find a node, we need to update the 'matches' and 'row_ids'.
-		// We only perform constraint checking on unique indexes, i.e., all leaves are inlined.
- D_ASSERT(leaf->GetType() == NType::LEAF_INLINED); - if (conflict_manager.AddHit(i, leaf->GetRowId())) { - found_conflict = i; - } + VerifyLeaf(*leaf, keys[i], delete_art, manager, conflict_idx, i); } - conflict_manager.FinishLookup(); - if (found_conflict == DConstants::INVALID_INDEX) { + manager.FinishLookup(); + if (!conflict_idx.IsValid()) { return; } - auto key_name = GenerateErrorKeyName(input, found_conflict); - auto exception_msg = GenerateConstraintErrorMessage(conflict_manager.LookupType(), key_name); + auto key_name = GenerateErrorKeyName(chunk, conflict_idx.GetIndex()); + auto exception_msg = GenerateConstraintErrorMessage(manager.LookupType(), key_name); throw ConstraintException(exception_msg); } diff --git a/src/duckdb/src/execution/index/art/leaf.cpp b/src/duckdb/src/execution/index/art/leaf.cpp index 4a5b346c..5cde0d5d 100644 --- a/src/duckdb/src/execution/index/art/leaf.cpp +++ b/src/duckdb/src/execution/index/art/leaf.cpp @@ -13,13 +13,9 @@ namespace duckdb { void Leaf::New(Node &node, const row_t row_id) { D_ASSERT(row_id < MAX_ROW_ID_LOCAL); - - auto status = node.GetGateStatus(); node.Clear(); - node.SetMetadata(static_cast(INLINED)); node.SetRowId(row_id); - node.SetGateStatus(status); } void Leaf::New(ART &art, reference &node, const unsafe_vector &row_ids, const idx_t start, @@ -30,7 +26,7 @@ void Leaf::New(ART &art, reference &node, const unsafe_vector &row // We cannot recurse into the leaf during Construct(...) because row IDs are not sorted. for (idx_t i = 0; i < count; i++) { idx_t offset = start + i; - art.Insert(node, row_ids[offset], 0, row_ids[offset], GateStatus::GATE_SET); + art.Insert(node, row_ids[offset], 0, row_ids[offset], GateStatus::GATE_SET, nullptr); } node.get().SetGateStatus(GateStatus::GATE_SET); } @@ -40,7 +36,7 @@ void Leaf::MergeInlined(ART &art, Node &l_node, Node &r_node) { ArenaAllocator arena_allocator(Allocator::Get(art.db)); auto key = ARTKey::CreateARTKey(arena_allocator, r_node.GetRowId()); - art.Insert(l_node, key, 0, key, l_node.GetGateStatus()); + art.Insert(l_node, key, 0, key, l_node.GetGateStatus(), nullptr); r_node.Clear(); } @@ -100,17 +96,26 @@ void Leaf::TransformToNested(ART &art, Node &node) { ArenaAllocator allocator(Allocator::Get(art.db)); Node root = Node(); + // Temporarily disable constraint checking. + if (art.IsUnique() && art.append_mode == ARTAppendMode::DEFAULT) { + art.append_mode = ARTAppendMode::INSERT_DUPLICATES; + } + // Move all row IDs into the nested leaf. 
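// (Illustrative: the "keys" inserted below are the row IDs themselves, which are
// unique by construction, so a duplicate can never legitimately occur here. For a
// unique ART, the INSERT_DUPLICATES mode set above bypasses the duplicate check in
// InsertIntoInlined while the leaf is rebuilt; append_mode is reset to DEFAULT
// once the loop below completes.)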
reference leaf_ref(node); while (leaf_ref.get().HasMetadata()) { auto &leaf = Node::Ref(art, leaf_ref, LEAF); for (uint8_t i = 0; i < leaf.count; i++) { auto row_id = ARTKey::CreateARTKey(allocator, leaf.row_ids[i]); - art.Insert(root, row_id, 0, row_id, GateStatus::GATE_SET); + auto conflict_type = art.Insert(root, row_id, 0, row_id, GateStatus::GATE_SET, nullptr); + if (conflict_type != ARTConflictType::NO_CONFLICT) { + throw InternalException("invalid conflict type in Leaf::TransformToNested"); + } } leaf_ref = leaf.ptr; } + art.append_mode = ARTAppendMode::DEFAULT; root.SetGateStatus(GateStatus::GATE_SET); Node::Free(art, node); node = root; diff --git a/src/duckdb/src/execution/index/art/node.cpp b/src/duckdb/src/execution/index/art/node.cpp index 8a39d832..25c1dd5f 100644 --- a/src/duckdb/src/execution/index/art/node.cpp +++ b/src/duckdb/src/execution/index/art/node.cpp @@ -572,7 +572,7 @@ bool Node::MergeInternal(ART &art, Node &other, const GateStatus status) { ArenaAllocator allocator(Allocator::Get(art.db)); for (idx_t i = 0; i < row_ids.size(); i++) { auto row_id = ARTKey::CreateARTKey(allocator, row_ids[i]); - art.Insert(*this, row_id, 0, row_id, GateStatus::GATE_SET); + art.Insert(*this, row_id, 0, row_id, GateStatus::GATE_SET, nullptr); } return true; } @@ -649,6 +649,7 @@ void Node::TransformToDeprecated(ART &art, Node &node, unsafe_unique_ptr &node, Node &child, const uin return GateStatus::GATE_NOT_SET; } -bool Prefix::Insert(ART &art, Node &node, const ARTKey &key, idx_t depth, const ARTKey &row_id, - const GateStatus status) { +ARTConflictType Prefix::Insert(ART &art, Node &node, const ARTKey &key, idx_t depth, const ARTKey &row_id, + const GateStatus status, optional_ptr delete_art) { reference next(node); auto pos = TraverseMutable(art, next, key, depth); @@ -307,7 +301,7 @@ bool Prefix::Insert(ART &art, Node &node, const ARTKey &key, idx_t depth, const // (2) we reach a gate. if (pos == DConstants::INVALID_INDEX) { if (next.get().GetType() != NType::PREFIX || next.get().GetGateStatus() == GateStatus::GATE_SET) { - return art.Insert(next, key, depth, row_id, status); + return art.Insert(next, key, depth, row_id, status, delete_art); } } @@ -325,7 +319,7 @@ bool Prefix::Insert(ART &art, Node &node, const ARTKey &key, idx_t depth, const Node new_row_id; Leaf::New(new_row_id, key.GetRowId()); Node::InsertChild(art, next, key[depth], new_row_id); - return true; + return ARTConflictType::NO_CONFLICT; } Node leaf; @@ -338,7 +332,7 @@ bool Prefix::Insert(ART &art, Node &node, const ARTKey &key, idx_t depth, const // Create the inlined leaf. 
Leaf::New(ref, row_id.GetRowId()); Node4::InsertChild(art, next, key[depth], leaf); - return true; + return ARTConflictType::NO_CONFLICT; } string Prefix::VerifyAndToString(ART &art, const Node &node, const bool only_verify) { diff --git a/src/duckdb/src/execution/index/bound_index.cpp b/src/duckdb/src/execution/index/bound_index.cpp index 017f7f5b..2437e3d4 100644 --- a/src/duckdb/src/execution/index/bound_index.cpp +++ b/src/duckdb/src/execution/index/bound_index.cpp @@ -32,10 +32,31 @@ void BoundIndex::InitializeLock(IndexLock &state) { state.index_lock = unique_lock(lock); } -ErrorData BoundIndex::Append(DataChunk &entries, Vector &row_identifiers) { - IndexLock state; - InitializeLock(state); - return Append(state, entries, row_identifiers); +ErrorData BoundIndex::Append(DataChunk &chunk, Vector &row_ids) { + IndexLock l; + InitializeLock(l); + return Append(l, chunk, row_ids); +} + +ErrorData BoundIndex::AppendWithDeleteIndex(IndexLock &l, DataChunk &chunk, Vector &row_ids, + optional_ptr delete_index) { + // Fallback to the old Append. + return Append(l, chunk, row_ids); +} + +ErrorData BoundIndex::AppendWithDeleteIndex(DataChunk &chunk, Vector &row_ids, optional_ptr delete_index) { + IndexLock l; + InitializeLock(l); + return AppendWithDeleteIndex(l, chunk, row_ids, delete_index); +} + +void BoundIndex::VerifyAppend(DataChunk &chunk, optional_ptr delete_index, + optional_ptr manager) { + throw NotImplementedException("this implementation of VerifyAppend does not exist."); +} + +void BoundIndex::VerifyConstraint(DataChunk &chunk, optional_ptr delete_index, ConflictManager &manager) { + throw NotImplementedException("this implementation of VerifyConstraint does not exist."); } void BoundIndex::CommitDrop() { @@ -50,6 +71,10 @@ void BoundIndex::Delete(DataChunk &entries, Vector &row_identifiers) { Delete(state, entries, row_identifiers); } +ErrorData BoundIndex::Insert(IndexLock &l, DataChunk &chunk, Vector &row_ids, optional_ptr delete_index) { + throw NotImplementedException("this implementation of Insert does not exist."); +} + bool BoundIndex::MergeIndexes(BoundIndex &other_index) { IndexLock state; InitializeLock(state); diff --git a/src/duckdb/src/execution/operator/persistent/physical_batch_insert.cpp b/src/duckdb/src/execution/operator/persistent/physical_batch_insert.cpp index 58c7a060..2e546c47 100644 --- a/src/duckdb/src/execution/operator/persistent/physical_batch_insert.cpp +++ b/src/duckdb/src/execution/operator/persistent/physical_batch_insert.cpp @@ -517,7 +517,8 @@ SinkResultType PhysicalBatchInsert::Sink(ExecutionContext &context, DataChunk &c if (!lstate.constraint_state) { lstate.constraint_state = table.GetStorage().InitializeConstraintState(table, bound_constraints); } - table.GetStorage().VerifyAppendConstraints(*lstate.constraint_state, context.client, lstate.insert_chunk); + auto &storage = table.GetStorage(); + storage.VerifyAppendConstraints(*lstate.constraint_state, context.client, lstate.insert_chunk, nullptr, nullptr); auto new_row_group = lstate.current_collection->Append(lstate.insert_chunk, lstate.current_append_state); if (new_row_group) { @@ -628,7 +629,7 @@ SinkFinalizeType PhysicalBatchInsert::Finalize(Pipeline &pipeline, Event &event, memory_manager.ReduceUnflushedMemory(entry.unflushed_memory); entry.collection->Scan(transaction, [&](DataChunk &insert_chunk) { - storage.LocalAppend(append_state, table, context, insert_chunk); + storage.LocalAppend(append_state, context, insert_chunk, false); return true; }); } diff --git 
a/src/duckdb/src/execution/operator/persistent/physical_delete.cpp b/src/duckdb/src/execution/operator/persistent/physical_delete.cpp index acbb8b9d..e92e2ec6 100644 --- a/src/duckdb/src/execution/operator/persistent/physical_delete.cpp +++ b/src/duckdb/src/execution/operator/persistent/physical_delete.cpp @@ -1,13 +1,14 @@ #include "duckdb/execution/operator/persistent/physical_delete.hpp" +#include "duckdb/catalog/catalog_entry/table_catalog_entry.hpp" #include "duckdb/common/atomic.hpp" #include "duckdb/common/types/column/column_data_collection.hpp" #include "duckdb/execution/expression_executor.hpp" +#include "duckdb/execution/index/bound_index.hpp" #include "duckdb/storage/data_table.hpp" +#include "duckdb/storage/table/delete_state.hpp" #include "duckdb/storage/table/scan_state.hpp" #include "duckdb/transaction/duck_transaction.hpp" -#include "duckdb/storage/table/delete_state.hpp" -#include "duckdb/catalog/catalog_entry/table_catalog_entry.hpp" namespace duckdb { @@ -23,13 +24,23 @@ PhysicalDelete::PhysicalDelete(vector types, TableCatalogEntry &tab //===--------------------------------------------------------------------===// class DeleteGlobalState : public GlobalSinkState { public: - explicit DeleteGlobalState(ClientContext &context, const vector &return_types) - : deleted_count(0), return_collection(context, return_types) { + explicit DeleteGlobalState(ClientContext &context, const vector &return_types, + TableCatalogEntry &table, const vector> &bound_constraints) + : deleted_count(0), return_collection(context, return_types), has_unique_indexes(false) { + + // We need to append deletes to the local delete-ART. + auto &storage = table.GetStorage(); + if (storage.HasUniqueIndexes()) { + storage.InitializeLocalStorage(delete_index_append_state, table, context, bound_constraints); + has_unique_indexes = true; + } } mutex delete_lock; idx_t deleted_count; ColumnDataCollection return_collection; + LocalAppendState delete_index_append_state; + bool has_unique_indexes; }; class DeleteLocalState : public LocalSinkState { @@ -37,39 +48,68 @@ class DeleteLocalState : public LocalSinkState { DeleteLocalState(ClientContext &context, TableCatalogEntry &table, const vector> &bound_constraints) { delete_chunk.Initialize(Allocator::Get(context), table.GetTypes()); - delete_state = table.GetStorage().InitializeDelete(table, context, bound_constraints); + auto &storage = table.GetStorage(); + delete_state = storage.InitializeDelete(table, context, bound_constraints); } + +public: DataChunk delete_chunk; unique_ptr delete_state; }; SinkResultType PhysicalDelete::Sink(ExecutionContext &context, DataChunk &chunk, OperatorSinkInput &input) const { - auto &gstate = input.global_state.Cast(); - auto &ustate = input.local_state.Cast(); + auto &g_state = input.global_state.Cast(); + auto &l_state = input.local_state.Cast(); - // get rows and auto &transaction = DuckTransaction::Get(context.client, table.db); - auto &row_identifiers = chunk.data[row_id_index]; + auto &row_ids = chunk.data[row_id_index]; vector column_ids; for (idx_t i = 0; i < table.ColumnCount(); i++) { column_ids.emplace_back(i); }; - auto cfs = ColumnFetchState(); + auto fetch_state = ColumnFetchState(); - lock_guard delete_guard(gstate.delete_lock); + lock_guard delete_guard(g_state.delete_lock); + if (!return_chunk && !g_state.has_unique_indexes) { + g_state.deleted_count += table.Delete(*l_state.delete_state, context.client, row_ids, chunk.size()); + return SinkResultType::NEED_MORE_INPUT; + } + + // Fetch the to-be-deleted chunk. 
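// (Sketch of the required ordering, using this patch's names: the old column values
// must be fetched before table.Delete, because the delete ART is keyed on indexed
// column values, not on row IDs:
//     table.Fetch(transaction, delete_chunk, column_ids, row_ids, count, fetch_state);
//     bound_index.Append(delete_chunk, row_ids);  // record key -> row ID in the delete index
//     table.Delete(delete_state, context, row_ids, count);
// Fetching after the delete may no longer return the old values.)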
+ l_state.delete_chunk.Reset(); + row_ids.Flatten(chunk.size()); + table.Fetch(transaction, l_state.delete_chunk, column_ids, row_ids, chunk.size(), fetch_state); + + // Append the deleted row IDs to the delete indexes. + // If we only delete local row IDs, then the delete_chunk is empty. + if (g_state.has_unique_indexes && l_state.delete_chunk.size() != 0) { + auto &local_storage = LocalStorage::Get(context.client, table.db); + auto storage = local_storage.GetStorage(table); + storage->delete_indexes.Scan([&](Index &index) { + if (!index.IsBound() || !index.IsUnique()) { + return false; + } + auto &bound_index = index.Cast(); + auto error = bound_index.Append(l_state.delete_chunk, row_ids); + if (error.HasError()) { + throw InternalException("failed to update delete ART in physical delete: ", error.Message()); + } + return false; + }); + } + + // Append the return_chunk to the return collection. if (return_chunk) { - ustate.delete_chunk.Reset(); - row_identifiers.Flatten(chunk.size()); - table.Fetch(transaction, ustate.delete_chunk, column_ids, row_identifiers, chunk.size(), cfs); - gstate.return_collection.Append(ustate.delete_chunk); + g_state.return_collection.Append(l_state.delete_chunk); } - gstate.deleted_count += table.Delete(*ustate.delete_state, context.client, row_identifiers, chunk.size()); + + g_state.deleted_count += table.Delete(*l_state.delete_state, context.client, row_ids, chunk.size()); return SinkResultType::NEED_MORE_INPUT; } unique_ptr PhysicalDelete::GetGlobalSinkState(ClientContext &context) const { - return make_uniq(context, GetTypes()); + return make_uniq(context, GetTypes(), tableref, bound_constraints); } unique_ptr PhysicalDelete::GetLocalSinkState(ExecutionContext &context) const { @@ -107,7 +147,6 @@ SourceResultType PhysicalDelete::GetData(ExecutionContext &context, DataChunk &c } g.return_collection.Scan(state.scan_state, chunk); - return chunk.size() == 0 ? 
SourceResultType::FINISHED : SourceResultType::HAVE_MORE_OUTPUT; } diff --git a/src/duckdb/src/execution/operator/persistent/physical_insert.cpp b/src/duckdb/src/execution/operator/persistent/physical_insert.cpp index 35d2f425..736dffc9 100644 --- a/src/duckdb/src/execution/operator/persistent/physical_insert.cpp +++ b/src/duckdb/src/execution/operator/persistent/physical_insert.cpp @@ -28,14 +28,14 @@ PhysicalInsert::PhysicalInsert( vector> set_expressions, vector set_columns, vector set_types, idx_t estimated_cardinality, bool return_chunk, bool parallel, OnConflictAction action_type, unique_ptr on_conflict_condition_p, unique_ptr do_update_condition_p, - unordered_set conflict_target_p, vector columns_to_fetch_p) + unordered_set conflict_target_p, vector columns_to_fetch_p, bool update_is_del_and_insert) : PhysicalOperator(PhysicalOperatorType::INSERT, std::move(types_p), estimated_cardinality), column_index_map(std::move(column_index_map)), insert_table(&table), insert_types(table.GetTypes()), bound_defaults(std::move(bound_defaults)), bound_constraints(std::move(bound_constraints_p)), return_chunk(return_chunk), parallel(parallel), action_type(action_type), set_expressions(std::move(set_expressions)), set_columns(std::move(set_columns)), set_types(std::move(set_types)), on_conflict_condition(std::move(on_conflict_condition_p)), do_update_condition(std::move(do_update_condition_p)), - conflict_target(std::move(conflict_target_p)) { + conflict_target(std::move(conflict_target_p)), update_is_del_and_insert(update_is_del_and_insert) { if (action_type == OnConflictAction::THROW) { return; @@ -58,7 +58,7 @@ PhysicalInsert::PhysicalInsert(LogicalOperator &op, SchemaCatalogEntry &schema, idx_t estimated_cardinality, bool parallel) : PhysicalOperator(PhysicalOperatorType::CREATE_TABLE_AS, op.types, estimated_cardinality), insert_table(nullptr), return_chunk(false), schema(&schema), info(std::move(info_p)), parallel(parallel), - action_type(OnConflictAction::THROW) { + action_type(OnConflictAction::THROW), update_is_del_and_insert(false) { GetInsertInfo(*info, insert_types, bound_defaults); } @@ -84,17 +84,28 @@ InsertLocalState::InsertLocalState(ClientContext &context, const vector> &bound_defaults, const vector> &bound_constraints) : default_executor(context, bound_defaults), bound_constraints(bound_constraints) { - insert_chunk.Initialize(Allocator::Get(context), types); - update_chunk.Initialize(Allocator::Get(context), types); + + auto &allocator = Allocator::Get(context); + insert_chunk.Initialize(allocator, types); + update_chunk.Initialize(allocator, types); + append_chunk.Initialize(allocator, types); } -ConstraintState &InsertLocalState::GetConstraintState(DataTable &table, TableCatalogEntry &tableref) { +ConstraintState &InsertLocalState::GetConstraintState(DataTable &table, TableCatalogEntry &table_ref) { if (!constraint_state) { - constraint_state = table.InitializeConstraintState(tableref, bound_constraints); + constraint_state = table.InitializeConstraintState(table_ref, bound_constraints); } return *constraint_state; } +TableDeleteState &InsertLocalState::GetDeleteState(DataTable &table, TableCatalogEntry &table_ref, + ClientContext &context) { + if (!delete_state) { + delete_state = table.InitializeDelete(table_ref, context, bound_constraints); + } + return *delete_state; +} + unique_ptr PhysicalInsert::GetGlobalSinkState(ClientContext &context) const { optional_ptr table; if (info) { @@ -252,27 +263,49 @@ static void CreateUpdateChunk(ExecutionContext &context, DataChunk 
&chunk, Table } template -static idx_t PerformOnConflictAction(ExecutionContext &context, DataChunk &chunk, TableCatalogEntry &table, - Vector &row_ids, const PhysicalInsert &op) { - +static idx_t PerformOnConflictAction(InsertLocalState &lstate, ExecutionContext &context, DataChunk &chunk, + TableCatalogEntry &table, Vector &row_ids, const PhysicalInsert &op) { + // Early-out, if we do nothing on conflicting rows. if (op.action_type == OnConflictAction::NOTHING) { return 0; } - auto &set_columns = op.set_columns; + auto &set_columns = op.set_columns; DataChunk update_chunk; CreateUpdateChunk(context, chunk, table, row_ids, update_chunk, op); - auto &data_table = table.GetStorage(); - // Perform the update, using the results of the SET expressions + + // Perform the UPDATE on the (global) storage. + if (!op.update_is_del_and_insert) { + if (GLOBAL) { + auto update_state = data_table.InitializeUpdate(table, context.client, op.bound_constraints); + data_table.Update(*update_state, context.client, row_ids, set_columns, update_chunk); + return update_chunk.size(); + } + auto &local_storage = LocalStorage::Get(context.client, data_table.db); + local_storage.Update(data_table, row_ids, set_columns, update_chunk); + return update_chunk.size(); + } + + // Arrange the columns in the standard table order. + DataChunk &append_chunk = lstate.append_chunk; + append_chunk.SetCardinality(update_chunk); + for (idx_t i = 0; i < append_chunk.ColumnCount(); i++) { + append_chunk.data[i].Reference(chunk.data[i]); + } + for (idx_t i = 0; i < set_columns.size(); i++) { + append_chunk.data[set_columns[i].index].Reference(update_chunk.data[i]); + } + if (GLOBAL) { - auto update_state = data_table.InitializeUpdate(table, context.client, op.bound_constraints); - data_table.Update(*update_state, context.client, row_ids, set_columns, update_chunk); + auto &delete_state = lstate.GetDeleteState(data_table, table, context.client); + data_table.Delete(delete_state, context.client, row_ids, update_chunk.size()); } else { auto &local_storage = LocalStorage::Get(context.client, data_table.db); - // Perform the update, using the results of the SET expressions - local_storage.Update(data_table, row_ids, set_columns, update_chunk); + local_storage.Delete(data_table, row_ids, update_chunk.size()); } + + data_table.LocalAppend(table, context.client, append_chunk, op.bound_constraints, row_ids, append_chunk); return update_chunk.size(); } @@ -378,24 +411,31 @@ static void VerifyOnConflictCondition(ExecutionContext &context, DataChunk &comb DataChunk conflict_condition_result; CheckOnConflictCondition(context, combined_chunk, on_conflict_condition, conflict_condition_result); bool conditions_met = AllConflictsMeetCondition(conflict_condition_result); - if (!conditions_met) { - // Filter out the tuples that did pass the filter, then run the verify again - ManagedSelection sel(combined_chunk.size()); - auto data = FlatVector::GetData(conflict_condition_result.data[0]); - for (idx_t i = 0; i < combined_chunk.size(); i++) { - if (!data[i]) { - // Only populate the selection vector with the tuples that did not meet the condition - sel.Append(i); - } - } - combined_chunk.Slice(sel.Selection(), sel.Count()); - if (GLOBAL) { - data_table.VerifyAppendConstraints(constraint_state, context.client, combined_chunk, nullptr); - } else { - DataTable::VerifyUniqueIndexes(local_storage.GetIndexes(data_table), context.client, tuples, nullptr); + if (conditions_met) { + return; + } + + // We need to throw. 
Filter all tuples that passed, and verify again with those that violate the constraint. + ManagedSelection sel(combined_chunk.size()); + auto data = FlatVector::GetData(conflict_condition_result.data[0]); + for (idx_t i = 0; i < combined_chunk.size(); i++) { + if (!data[i]) { + // This tuple did not meet the condition. + sel.Append(i); } - throw InternalException("The previous operation was expected to throw but didn't"); } + combined_chunk.Slice(sel.Selection(), sel.Count()); + + // Verify and throw. + if (GLOBAL) { + data_table.VerifyAppendConstraints(constraint_state, context.client, combined_chunk, nullptr, nullptr); + throw InternalException("VerifyAppendConstraints was expected to throw but didn't"); + } + + auto &indexes = local_storage.GetIndexes(data_table); + auto storage = local_storage.GetStorage(data_table); + DataTable::VerifyUniqueIndexes(indexes, storage, tuples, nullptr); + throw InternalException("VerifyUniqueIndexes was expected to throw but didn't"); } template @@ -413,9 +453,12 @@ static idx_t HandleInsertConflicts(TableCatalogEntry &table, ExecutionContext &c ConflictManager conflict_manager(VerifyExistenceType::APPEND, tuples.size(), &conflict_info); if (GLOBAL) { auto &constraint_state = lstate.GetConstraintState(data_table, table); - data_table.VerifyAppendConstraints(constraint_state, context.client, tuples, &conflict_manager); + auto storage = local_storage.GetStorage(data_table); + data_table.VerifyAppendConstraints(constraint_state, context.client, tuples, storage, &conflict_manager); } else { - DataTable::VerifyUniqueIndexes(local_storage.GetIndexes(data_table), context.client, tuples, &conflict_manager); + auto &indexes = local_storage.GetIndexes(data_table); + auto storage = local_storage.GetStorage(data_table); + DataTable::VerifyUniqueIndexes(indexes, storage, tuples, &conflict_manager); } conflict_manager.Finalize(); @@ -467,7 +510,7 @@ static idx_t HandleInsertConflicts(TableCatalogEntry &table, ExecutionContext &c RegisterUpdatedRows(lstate, row_ids, combined_chunk.size()); } - affected_tuples += PerformOnConflictAction(context, combined_chunk, table, row_ids, op); + affected_tuples += PerformOnConflictAction(lstate, context, combined_chunk, table, row_ids, op); // Remove the conflicting tuples from the insert chunk SelectionVector sel_vec(tuples.size()); @@ -480,13 +523,15 @@ static idx_t HandleInsertConflicts(TableCatalogEntry &table, ExecutionContext &c idx_t PhysicalInsert::OnConflictHandling(TableCatalogEntry &table, ExecutionContext &context, InsertGlobalState &gstate, InsertLocalState &lstate) const { auto &data_table = table.GetStorage(); + auto &local_storage = LocalStorage::Get(context.client, data_table.db); + if (action_type == OnConflictAction::THROW) { auto &constraint_state = lstate.GetConstraintState(data_table, table); - data_table.VerifyAppendConstraints(constraint_state, context.client, lstate.insert_chunk, nullptr); + auto storage = local_storage.GetStorage(data_table); + data_table.VerifyAppendConstraints(constraint_state, context.client, lstate.insert_chunk, storage, nullptr); return 0; } - auto &local_storage = LocalStorage::Get(context.client, data_table.db); ConflictInfo conflict_info(conflict_target); auto &global_indexes = data_table.GetDataTableInfo()->GetIndexes(); @@ -597,7 +642,7 @@ SinkResultType PhysicalInsert::Sink(ExecutionContext &context, DataChunk &chunk, } gstate.insert_count += lstate.insert_chunk.size(); gstate.insert_count += updated_tuples; - storage.LocalAppend(gstate.append_state, table, context.client, 
lstate.insert_chunk, true); + storage.LocalAppend(gstate.append_state, context.client, lstate.insert_chunk, true); if (action_type == OnConflictAction::UPDATE && lstate.update_chunk.size() != 0) { // Flush the append so we can target the data we just appended with the update storage.FinalizeLocalAppend(gstate.append_state); @@ -660,14 +705,13 @@ SinkCombineResultType PhysicalInsert::Combine(ExecutionContext &context, Operato storage.InitializeLocalAppend(gstate.append_state, table, context.client, bound_constraints); auto &transaction = DuckTransaction::Get(context.client, table.catalog); lstate.local_collection->Scan(transaction, [&](DataChunk &insert_chunk) { - storage.LocalAppend(gstate.append_state, table, context.client, insert_chunk); + storage.LocalAppend(gstate.append_state, context.client, insert_chunk, false); return true; }); storage.FinalizeLocalAppend(gstate.append_state); } else { // we have written rows to disk optimistically - merge directly into the transaction-local storage lstate.writer->WriteLastRowGroup(*lstate.local_collection); - gstate.table.GetStorage().LocalMerge(context.client, *lstate.local_collection); gstate.table.GetStorage().FinalizeOptimisticWriter(context.client, *lstate.writer); } diff --git a/src/duckdb/src/execution/operator/persistent/physical_update.cpp b/src/duckdb/src/execution/operator/persistent/physical_update.cpp index f314eb12..fadb1514 100644 --- a/src/duckdb/src/execution/operator/persistent/physical_update.cpp +++ b/src/duckdb/src/execution/operator/persistent/physical_update.cpp @@ -9,7 +9,9 @@ #include "duckdb/planner/expression/bound_reference_expression.hpp" #include "duckdb/storage/data_table.hpp" #include "duckdb/storage/table/delete_state.hpp" +#include "duckdb/storage/table/scan_state.hpp" #include "duckdb/storage/table/update_state.hpp" +#include "duckdb/transaction/duck_transaction.hpp" namespace duckdb { @@ -21,7 +23,26 @@ PhysicalUpdate::PhysicalUpdate(vector types, TableCatalogEntry &tab : PhysicalOperator(PhysicalOperatorType::UPDATE, std::move(types), estimated_cardinality), tableref(tableref), table(table), columns(std::move(columns)), expressions(std::move(expressions)), bound_defaults(std::move(bound_defaults)), bound_constraints(std::move(bound_constraints)), - return_chunk(return_chunk) { + return_chunk(return_chunk), index_update(false) { + + auto &indexes = table.GetDataTableInfo().get()->GetIndexes(); + auto index_columns = indexes.GetRequiredColumns(); + + unordered_set update_columns; + for (const auto col : this->columns) { + update_columns.insert(col.index); + } + + for (const auto &col : table.Columns()) { + if (index_columns.find(col.Logical().index) == index_columns.end()) { + continue; + } + if (update_columns.find(col.Physical().index) == update_columns.end()) { + continue; + } + index_update = true; + break; + } } //===--------------------------------------------------------------------===// @@ -35,7 +56,7 @@ class UpdateGlobalState : public GlobalSinkState { mutex lock; idx_t updated_count; - unordered_set updated_columns; + unordered_set updated_rows; ColumnDataCollection return_collection; }; @@ -45,7 +66,8 @@ class UpdateLocalState : public LocalSinkState { const vector &table_types, const vector> &bound_defaults, const vector> &bound_constraints) : default_executor(context, bound_defaults), bound_constraints(bound_constraints) { - // initialize the update chunk + + // Initialize the update chunk. 
auto &allocator = Allocator::Get(context); vector update_types; update_types.reserve(expressions.size()); @@ -53,12 +75,15 @@ class UpdateLocalState : public LocalSinkState { update_types.push_back(expr->return_type); } update_chunk.Initialize(allocator, update_types); - // initialize the mock chunk + + // Initialize the mock and delete chunk. mock_chunk.Initialize(allocator, table_types); + delete_chunk.Initialize(allocator, table_types); } DataChunk update_chunk; DataChunk mock_chunk; + DataChunk delete_chunk; ExpressionExecutor default_executor; unique_ptr delete_state; unique_ptr update_state; @@ -80,79 +105,103 @@ class UpdateLocalState : public LocalSinkState { }; SinkResultType PhysicalUpdate::Sink(ExecutionContext &context, DataChunk &chunk, OperatorSinkInput &input) const { - auto &gstate = input.global_state.Cast(); - auto &lstate = input.local_state.Cast(); - - DataChunk &update_chunk = lstate.update_chunk; - DataChunk &mock_chunk = lstate.mock_chunk; + auto &g_state = input.global_state.Cast(); + auto &l_state = input.local_state.Cast(); chunk.Flatten(); - lstate.default_executor.SetChunk(chunk); + l_state.default_executor.SetChunk(chunk); - // update data in the base table - // the row ids are given to us as the last column of the child chunk - auto &row_ids = chunk.data[chunk.ColumnCount() - 1]; + DataChunk &update_chunk = l_state.update_chunk; update_chunk.Reset(); update_chunk.SetCardinality(chunk); for (idx_t i = 0; i < expressions.size(); i++) { + // Default expression, set to the default value of the column. if (expressions[i]->type == ExpressionType::VALUE_DEFAULT) { - // default expression, set to the default value of the column - lstate.default_executor.ExecuteExpression(columns[i].index, update_chunk.data[i]); - } else { - D_ASSERT(expressions[i]->type == ExpressionType::BOUND_REF); - // index into child chunk - auto &binding = expressions[i]->Cast(); - update_chunk.data[i].Reference(chunk.data[binding.index]); + l_state.default_executor.ExecuteExpression(columns[i].index, update_chunk.data[i]); + continue; } + + D_ASSERT(expressions[i]->type == ExpressionType::BOUND_REF); + auto &binding = expressions[i]->Cast(); + update_chunk.data[i].Reference(chunk.data[binding.index]); } - lock_guard glock(gstate.lock); - if (update_is_del_and_insert) { - // index update or update on complex type, perform a delete and an append instead - - // figure out which rows have not yet been deleted in this update - // this is required since we might see the same row_id multiple times - // in the case of an UPDATE query that e.g. 
has joins - auto row_id_data = FlatVector::GetData(row_ids); - SelectionVector sel(STANDARD_VECTOR_SIZE); - idx_t update_count = 0; - for (idx_t i = 0; i < update_chunk.size(); i++) { - auto row_id = row_id_data[i]; - if (gstate.updated_columns.find(row_id) == gstate.updated_columns.end()) { - gstate.updated_columns.insert(row_id); - sel.set_index(update_count++, i); - } - } - if (update_count != update_chunk.size()) { - // we need to slice here - update_chunk.Slice(sel, update_count); - } - auto &delete_state = lstate.GetDeleteState(table, tableref, context.client); - table.Delete(delete_state, context.client, row_ids, update_chunk.size()); - // for the append we need to arrange the columns in a specific manner (namely the "standard table order") - mock_chunk.SetCardinality(update_chunk); - for (idx_t i = 0; i < columns.size(); i++) { - mock_chunk.data[columns[i].index].Reference(update_chunk.data[i]); - } - table.LocalAppend(tableref, context.client, mock_chunk, bound_constraints); - } else { + lock_guard glock(g_state.lock); + auto &row_ids = chunk.data[chunk.ColumnCount() - 1]; + DataChunk &mock_chunk = l_state.mock_chunk; + + // Regular in-place update. + if (!update_is_del_and_insert) { if (return_chunk) { mock_chunk.SetCardinality(update_chunk); for (idx_t i = 0; i < columns.size(); i++) { mock_chunk.data[columns[i].index].Reference(update_chunk.data[i]); } } - auto &update_state = lstate.GetUpdateState(table, tableref, context.client); + auto &update_state = l_state.GetUpdateState(table, tableref, context.client); table.Update(update_state, context.client, row_ids, columns, update_chunk); + + if (return_chunk) { + g_state.return_collection.Append(mock_chunk); + } + g_state.updated_count += chunk.size(); + return SinkResultType::NEED_MORE_INPUT; } - if (return_chunk) { - gstate.return_collection.Append(mock_chunk); + // We update an index or a complex type, so we need to split the UPDATE into DELETE + INSERT. + + // Keep track of the rows that have not yet been deleted in this UPDATE. + // This is required since we might see the same row_id multiple times, e.g., + // during an UPDATE containing joins. + SelectionVector sel(update_chunk.size()); + idx_t update_count = 0; + auto row_id_data = FlatVector::GetData(row_ids); + + for (idx_t i = 0; i < update_chunk.size(); i++) { + auto row_id = row_id_data[i]; + if (g_state.updated_rows.find(row_id) == g_state.updated_rows.end()) { + g_state.updated_rows.insert(row_id); + sel.set_index(update_count++, i); + } + } + + // The update chunk now contains exactly those rows that we are deleting. + Vector del_row_ids(row_ids); + if (update_count != update_chunk.size()) { + update_chunk.Slice(sel, update_count); + del_row_ids.Slice(row_ids, sel, update_count); } - gstate.updated_count += chunk.size(); + auto &delete_chunk = index_update ? l_state.delete_chunk : l_state.mock_chunk; + delete_chunk.SetCardinality(update_count); + + if (index_update) { + auto &transaction = DuckTransaction::Get(context.client, table.db); + vector column_ids; + for (idx_t i = 0; i < table.ColumnCount(); i++) { + column_ids.emplace_back(i); + }; + // We need to fetch the previous index keys to add them to the delete index. + auto fetch_state = ColumnFetchState(); + table.Fetch(transaction, delete_chunk, column_ids, row_ids, update_count, fetch_state); + } + + auto &delete_state = l_state.GetDeleteState(table, tableref, context.client); + table.Delete(delete_state, context.client, del_row_ids, update_count); + + // Arrange the columns in the standard table order. 
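// (Hypothetical two-column example: with columns = {1, 0}, update_chunk holds the
// new values in SET order, so the loop below scatters them back into the table's
// physical column order:
//     mock_chunk.data[1].Reference(update_chunk.data[0]); // new value of column 1
//     mock_chunk.data[0].Reference(update_chunk.data[1]); // new value of column 0
// yielding a chunk laid out like the table, ready for LocalAppend.)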
+ mock_chunk.SetCardinality(update_count); + for (idx_t i = 0; i < columns.size(); i++) { + mock_chunk.data[columns[i].index].Reference(update_chunk.data[i]); + } + + table.LocalAppend(tableref, context.client, mock_chunk, bound_constraints, del_row_ids, delete_chunk); + if (return_chunk) { + g_state.return_collection.Append(mock_chunk); + } + g_state.updated_count += chunk.size(); return SinkResultType::NEED_MORE_INPUT; } diff --git a/src/duckdb/src/execution/operator/schema/physical_create_art_index.cpp b/src/duckdb/src/execution/operator/schema/physical_create_art_index.cpp index 78b96cff..e34e49de 100644 --- a/src/duckdb/src/execution/operator/schema/physical_create_art_index.cpp +++ b/src/duckdb/src/execution/operator/schema/physical_create_art_index.cpp @@ -83,11 +83,14 @@ SinkResultType PhysicalCreateARTIndex::SinkUnsorted(OperatorSinkInput &input) co auto &l_state = input.local_state.Cast(); auto row_count = l_state.key_chunk.size(); + auto &art = l_state.local_index->Cast(); // Insert each key and its corresponding row ID. - auto &art = l_state.local_index->Cast(); for (idx_t i = 0; i < row_count; i++) { - if (!art.Insert(art.tree, l_state.keys[i], 0, l_state.row_ids[i], art.tree.GetGateStatus())) { + auto status = art.tree.GetGateStatus(); + auto conflict_type = art.Insert(art.tree, l_state.keys[i], 0, l_state.row_ids[i], status, nullptr); + D_ASSERT(conflict_type != ARTConflictType::TRANSACTION); + if (conflict_type == ARTConflictType::CONSTRAINT) { throw ConstraintException("Data contains duplicates on indexed column(s)"); } } diff --git a/src/duckdb/src/execution/physical_plan/plan_insert.cpp b/src/duckdb/src/execution/physical_plan/plan_insert.cpp index 0ce031d0..2342a1ef 100644 --- a/src/duckdb/src/execution/physical_plan/plan_insert.cpp +++ b/src/duckdb/src/execution/physical_plan/plan_insert.cpp @@ -101,7 +101,7 @@ unique_ptr DuckCatalog::PlanInsert(ClientContext &context, Log std::move(op.expressions), std::move(op.set_columns), std::move(op.set_types), op.estimated_cardinality, op.return_chunk, parallel_streaming_insert && num_threads > 1, op.action_type, std::move(op.on_conflict_condition), std::move(op.do_update_condition), std::move(op.on_conflict_filter), - std::move(op.columns_to_fetch)); + std::move(op.columns_to_fetch), op.update_is_del_and_insert); } D_ASSERT(plan); insert->children.push_back(std::move(plan)); diff --git a/src/duckdb/src/execution/radix_partitioned_hashtable.cpp b/src/duckdb/src/execution/radix_partitioned_hashtable.cpp index a9df807d..cfd92655 100644 --- a/src/duckdb/src/execution/radix_partitioned_hashtable.cpp +++ b/src/duckdb/src/execution/radix_partitioned_hashtable.cpp @@ -464,8 +464,21 @@ void RadixPartitionedHashTable::Sink(ExecutionContext &context, DataChunk &chunk if (gstate.number_of_threads > RadixHTConfig::GROW_STRATEGY_THREAD_THRESHOLD || gstate.external) { // 'Reset' the HT without taking its data, we can just keep appending to the same collection // This only works because we never resize the HT - ht.Abandon(); // We don't do this when running with 1 or 2 threads, it only makes sense when there's many threads + ht.Abandon(); + + // Once we've inserted more than SKIP_LOOKUP_THRESHOLD tuples, + // and more than UNIQUE_PERCENTAGE_THRESHOLD were unique, + // we set the HT to skip doing lookups, which makes it blindly append data to the HT. + // This speeds up adding data, at the cost of no longer de-duplicating. 
+ // The data will be de-duplicated later anyway + static constexpr idx_t SKIP_LOOKUP_THRESHOLD = 262144; + static constexpr double UNIQUE_PERCENTAGE_THRESHOLD = 0.95; + const auto unique_percentage = + static_cast(ht.GetPartitionedData().Count()) / static_cast(ht.GetSinkCount()); + if (ht.GetSinkCount() > SKIP_LOOKUP_THRESHOLD && unique_percentage > UNIQUE_PERCENTAGE_THRESHOLD) { + ht.SkipLookups(); + } } // Check if we need to repartition diff --git a/src/duckdb/src/execution/reservoir_sample.cpp b/src/duckdb/src/execution/reservoir_sample.cpp deleted file mode 100644 index 284e03fa..00000000 --- a/src/duckdb/src/execution/reservoir_sample.cpp +++ /dev/null @@ -1,324 +0,0 @@ -#include "duckdb/execution/reservoir_sample.hpp" -#include "duckdb/common/types/data_chunk.hpp" -#include "duckdb/common/pair.hpp" - -namespace duckdb { - -void ReservoirChunk::Serialize(Serializer &serializer) const { - chunk.Serialize(serializer); -} - -unique_ptr ReservoirChunk::Deserialize(Deserializer &deserializer) { - auto result = make_uniq(); - result->chunk.Deserialize(deserializer); - return result; -} - -ReservoirSample::ReservoirSample(Allocator &allocator, idx_t sample_count, int64_t seed) - : BlockingSample(seed), allocator(allocator), sample_count(sample_count), reservoir_initialized(false) { -} - -ReservoirSample::ReservoirSample(idx_t sample_count, int64_t seed) - : ReservoirSample(Allocator::DefaultAllocator(), sample_count, seed) { -} - -void ReservoirSample::AddToReservoir(DataChunk &input) { - if (sample_count == 0) { - // sample count is 0, means no samples were requested - return; - } - old_base_reservoir_sample.num_entries_seen_total += input.size(); - // Input: A population V of n weighted items - // Output: A reservoir R with a size m - // 1: The first m items of V are inserted into R - // first we need to check if the reservoir already has "m" elements - if (!reservoir_data_chunk || reservoir_data_chunk->size() < sample_count) { - if (FillReservoir(input) == 0) { - // entire chunk was consumed by reservoir - return; - } - } - D_ASSERT(reservoir_data_chunk); - D_ASSERT(reservoir_data_chunk->size() == sample_count); - // Initialize the weights if they have not been already - if (old_base_reservoir_sample.reservoir_weights.empty()) { - old_base_reservoir_sample.InitializeReservoir(reservoir_data_chunk->size(), sample_count); - } - // find the position of next_index_to_sample relative to number of seen entries (num_entries_to_skip_b4_next_sample) - idx_t remaining = input.size(); - idx_t base_offset = 0; - while (true) { - idx_t offset = old_base_reservoir_sample.next_index_to_sample - - old_base_reservoir_sample.num_entries_to_skip_b4_next_sample; - if (offset >= remaining) { - // not in this chunk! increment current count and go to the next chunk - old_base_reservoir_sample.num_entries_to_skip_b4_next_sample += remaining; - return; - } - // in this chunk! 
replace the element - ReplaceElement(input, base_offset + offset); - // shift the chunk forward - remaining -= offset; - base_offset += offset; - } -} - -unique_ptr ReservoirSample::GetChunk() { - if (!reservoir_data_chunk || reservoir_data_chunk->size() == 0) { - return nullptr; - } - auto collected_sample_count = reservoir_data_chunk->size(); - if (collected_sample_count > STANDARD_VECTOR_SIZE) { - // get from the back to avoid creating two selection vectors - // one to return the first STANDARD_VECTOR_SIZE - // another to replace the reservoir_data_chunk with the first STANDARD VECTOR SIZE missing - auto ret = make_uniq(); - auto samples_remaining = collected_sample_count - STANDARD_VECTOR_SIZE; - auto reservoir_types = reservoir_data_chunk->GetTypes(); - SelectionVector sel(STANDARD_VECTOR_SIZE); - for (idx_t i = samples_remaining; i < collected_sample_count; i++) { - sel.set_index(i - samples_remaining, i); - } - ret->Initialize(allocator, reservoir_types); - ret->Slice(*reservoir_data_chunk, sel, STANDARD_VECTOR_SIZE); - ret->SetCardinality(STANDARD_VECTOR_SIZE); - // reduce capacity and cardinality of the sample data chunk - reservoir_data_chunk->SetCardinality(samples_remaining); - return ret; - } - return std::move(reservoir_data_chunk); -} - -void ReservoirSample::ReplaceElement(DataChunk &input, idx_t index_in_chunk, double with_weight) { - // replace the entry in the reservoir - // 8. The item in R with the minimum key is replaced by item vi - D_ASSERT(input.ColumnCount() == reservoir_data_chunk->ColumnCount()); - for (idx_t col_idx = 0; col_idx < input.ColumnCount(); col_idx++) { - reservoir_data_chunk->SetValue(col_idx, old_base_reservoir_sample.min_weighted_entry_index, - input.GetValue(col_idx, index_in_chunk)); - } - old_base_reservoir_sample.ReplaceElement(with_weight); -} - -void ReservoirSample::InitializeReservoir(DataChunk &input) { - reservoir_data_chunk = make_uniq(); - reservoir_data_chunk->Initialize(allocator, input.GetTypes(), sample_count); - for (idx_t col_idx = 0; col_idx < reservoir_data_chunk->ColumnCount(); col_idx++) { - FlatVector::Validity(reservoir_data_chunk->data[col_idx]).Initialize(sample_count); - } - reservoir_initialized = true; -} - -idx_t ReservoirSample::FillReservoir(DataChunk &input) { - idx_t chunk_count = input.size(); - input.Flatten(); - auto num_added_samples = reservoir_data_chunk ? reservoir_data_chunk->size() : 0; - D_ASSERT(num_added_samples <= sample_count); - - // required count is what we still need to add to the reservoir - idx_t required_count; - if (num_added_samples + chunk_count >= sample_count) { - // have to limit the count of the chunk - required_count = sample_count - num_added_samples; - } else { - // we copy the entire chunk - required_count = chunk_count; - } - input.SetCardinality(required_count); - - // initialize the reservoir - if (!reservoir_initialized) { - InitializeReservoir(input); - } - reservoir_data_chunk->Append(input, false, nullptr, required_count); - old_base_reservoir_sample.InitializeReservoir(required_count, sample_count); - - // check if there are still elements remaining in the Input data chunk that should be - // randomly sampled and potentially added. 
This happens if we are on a boundary - // for example, input.size() is 1024, but our sample size is 10 - if (required_count == chunk_count) { - // we are done here - return 0; - } - // we still need to process a part of the chunk - // create a selection vector of the remaining elements - SelectionVector sel(STANDARD_VECTOR_SIZE); - for (idx_t i = required_count; i < chunk_count; i++) { - sel.set_index(i - required_count, i); - } - // slice the input vector and continue - input.Slice(sel, chunk_count - required_count); - return input.size(); -} - -void ReservoirSample::Finalize() { - return; -} - -ReservoirSamplePercentage::ReservoirSamplePercentage(Allocator &allocator, double percentage, int64_t seed) - : BlockingSample(seed), allocator(allocator), sample_percentage(percentage / 100.0), current_count(0), - is_finalized(false) { - reservoir_sample_size = idx_t(sample_percentage * RESERVOIR_THRESHOLD); - current_sample = make_uniq(allocator, reservoir_sample_size, random.NextRandomInteger()); -} - -ReservoirSamplePercentage::ReservoirSamplePercentage(double percentage, int64_t seed) - : ReservoirSamplePercentage(Allocator::DefaultAllocator(), percentage, seed) { -} - -void ReservoirSamplePercentage::AddToReservoir(DataChunk &input) { - old_base_reservoir_sample.num_entries_seen_total += input.size(); - if (current_count + input.size() > RESERVOIR_THRESHOLD) { - // we don't have enough space in our current reservoir - // first check what we still need to append to the current sample - idx_t append_to_current_sample_count = RESERVOIR_THRESHOLD - current_count; - idx_t append_to_next_sample = input.size() - append_to_current_sample_count; - if (append_to_current_sample_count > 0) { - // we have elements remaining, first add them to the current sample - if (append_to_next_sample > 0) { - // we need to also add to the next sample - DataChunk new_chunk; - new_chunk.InitializeEmpty(input.GetTypes()); - new_chunk.Slice(input, *FlatVector::IncrementalSelectionVector(), append_to_current_sample_count); - new_chunk.Flatten(); - current_sample->AddToReservoir(new_chunk); - } else { - input.Flatten(); - input.SetCardinality(append_to_current_sample_count); - current_sample->AddToReservoir(input); - } - } - if (append_to_next_sample > 0) { - // slice the input for the remainder - SelectionVector sel(append_to_next_sample); - for (idx_t i = append_to_current_sample_count; i < append_to_next_sample + append_to_current_sample_count; - i++) { - sel.set_index(i - append_to_current_sample_count, i); - } - input.Slice(sel, append_to_next_sample); - } - // now our first sample is filled: append it to the set of finished samples - finished_samples.push_back(std::move(current_sample)); - - // allocate a new sample, and potentially add the remainder of the current input to that sample - current_sample = make_uniq(allocator, reservoir_sample_size, random.NextRandomInteger()); - if (append_to_next_sample > 0) { - current_sample->AddToReservoir(input); - } - current_count = append_to_next_sample; - } else { - // we can just append to the current sample - current_count += input.size(); - current_sample->AddToReservoir(input); - } -} - -unique_ptr ReservoirSamplePercentage::GetChunk() { - if (!is_finalized) { - Finalize(); - } - while (!finished_samples.empty()) { - auto &front = finished_samples.front(); - auto chunk = front->GetChunk(); - if (chunk && chunk->size() > 0) { - return chunk; - } - // move to the next sample - finished_samples.erase(finished_samples.begin()); - } - return nullptr; -} - -void 
ReservoirSamplePercentage::Finalize() { - // need to finalize the current sample, if any - // we are finializing, so we are starting to return chunks. Our last chunk has - // sample_percentage * RESERVOIR_THRESHOLD entries that hold samples. - // if our current count is less than the sample_percentage * RESERVOIR_THRESHOLD - // then we have sampled too much for the current_sample and we need to redo the sample - // otherwise we can just push the current sample back - // Imagine sampling 70% of 100 rows (so 70 rows). We allocate sample_percentage * RESERVOIR_THRESHOLD - // ----------------------------------------- - auto sampled_more_than_required = - static_cast(current_count) > sample_percentage * RESERVOIR_THRESHOLD || finished_samples.empty(); - if (current_count > 0 && sampled_more_than_required) { - // create a new sample - auto new_sample_size = idx_t(round(sample_percentage * static_cast(current_count))); - auto new_sample = make_uniq(allocator, new_sample_size, random.NextRandomInteger()); - while (true) { - auto chunk = current_sample->GetChunk(); - if (!chunk || chunk->size() == 0) { - break; - } - new_sample->AddToReservoir(*chunk); - } - finished_samples.push_back(std::move(new_sample)); - } else { - finished_samples.push_back(std::move(current_sample)); - } - // when finalizing, current_sample is null. All samples are now in finished samples. - current_sample = nullptr; - is_finalized = true; -} - -BaseReservoirSampling::BaseReservoirSampling(int64_t seed) : random(seed) { - next_index_to_sample = 0; - min_weight_threshold = 0; - min_weighted_entry_index = 0; - num_entries_to_skip_b4_next_sample = 0; - num_entries_seen_total = 0; -} - -BaseReservoirSampling::BaseReservoirSampling() : BaseReservoirSampling(-1) { -} - -void BaseReservoirSampling::InitializeReservoir(idx_t cur_size, idx_t sample_size) { - //! 1: The first m items of V are inserted into R - //! first we need to check if the reservoir already has "m" elements - if (cur_size == sample_size) { - //! 2. For each item vi ∈ R: Calculate a key ki = random(0, 1) - //! we then define the threshold to enter the reservoir T_w as the minimum key of R - //! we use a priority queue to extract the minimum key in O(1) time - for (idx_t i = 0; i < sample_size; i++) { - double k_i = random.NextRandom(); - reservoir_weights.emplace(-k_i, i); - } - SetNextEntry(); - } -} - -void BaseReservoirSampling::SetNextEntry() { - //! 4. Let r = random(0, 1) and Xw = log(r) / log(T_w) - auto &min_key = reservoir_weights.top(); - double t_w = -min_key.first; - double r = random.NextRandom(); - double x_w = log(r) / log(t_w); - //! 5. From the current item vc skip items until item vi , such that: - //! 6. wc +wc+1 +···+wi−1 < Xw <= wc +wc+1 +···+wi−1 +wi - //! since all our weights are 1 (uniform sampling), we can just determine the amount of elements to skip - min_weight_threshold = t_w; - min_weighted_entry_index = min_key.second; - next_index_to_sample = MaxValue(1, idx_t(round(x_w))); - num_entries_to_skip_b4_next_sample = 0; -} - -void BaseReservoirSampling::ReplaceElement(double with_weight) { - //! replace the entry in the reservoir - //! pop the minimum entry - reservoir_weights.pop(); - //! now update the reservoir - //! 8. Let tw = Tw i , r2 = random(tw,1) and vi’s key: ki = (r2)1/wi - //! 9. The new threshold Tw is the new minimum key of R - //! we generate a random number between (min_weight_threshold, 1) - double r2 = random.NextRandom(min_weight_threshold, 1); - - //! 
if we are merging two reservoir samples use the weight passed - if (with_weight >= 0) { - r2 = with_weight; - } - //! now we insert the new weight into the reservoir - reservoir_weights.emplace(-r2, min_weighted_entry_index); - //! we update the min entry with the new min entry in the reservoir - SetNextEntry(); -} - -} // namespace duckdb diff --git a/src/duckdb/src/execution/sample/base_reservoir_sample.cpp b/src/duckdb/src/execution/sample/base_reservoir_sample.cpp new file mode 100644 index 00000000..0f0fcdf7 --- /dev/null +++ b/src/duckdb/src/execution/sample/base_reservoir_sample.cpp @@ -0,0 +1,136 @@ +#include "duckdb/execution/reservoir_sample.hpp" +#include <cmath> + +namespace duckdb { + +double BaseReservoirSampling::GetMinWeightFromTuplesSeen(idx_t rows_seen_total) { + // this function was obtained by fitting a curve to multiple (x, y) data points + // using https://mycurvefit.com + switch (rows_seen_total) { + case 0: + return 0; + case 1: + return 0.000161; + case 2: + return 0.530136; + case 3: + return 0.693454; + default: { + return (0.99 - 0.355 * std::exp(-0.07 * static_cast<double>(rows_seen_total))); + } + } +} + +BaseReservoirSampling::BaseReservoirSampling(int64_t seed) : random(seed) { + next_index_to_sample = 0; + min_weight_threshold = 0; + min_weighted_entry_index = 0; + num_entries_to_skip_b4_next_sample = 0; + num_entries_seen_total = 0; +} + +BaseReservoirSampling::BaseReservoirSampling() : BaseReservoirSampling(1) { +} + +unique_ptr<BaseReservoirSampling> BaseReservoirSampling::Copy() { + auto ret = make_uniq<BaseReservoirSampling>(1); + ret->reservoir_weights = reservoir_weights; + ret->next_index_to_sample = next_index_to_sample; + ret->min_weight_threshold = min_weight_threshold; + ret->min_weighted_entry_index = min_weighted_entry_index; + ret->num_entries_to_skip_b4_next_sample = num_entries_to_skip_b4_next_sample; + ret->num_entries_seen_total = num_entries_seen_total; + return ret; +} + +void BaseReservoirSampling::InitializeReservoirWeights(idx_t cur_size, idx_t sample_size) { + //! 1: The first m items of V are inserted into R + //! first we need to check if the reservoir already has "m" elements + if (cur_size == sample_size) { + //! 2. For each item vi ∈ R: Calculate a key ki = random(0, 1) + //! we then define the threshold to enter the reservoir T_w as the minimum key of R + //! we use a priority queue to extract the minimum key in O(1) time + for (idx_t i = 0; i < sample_size; i++) { + double k_i = random.NextRandom(); + reservoir_weights.emplace(-k_i, i); + } + SetNextEntry(); + } +}
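The fitted curve above can be checked in isolation. A minimal standalone sketch (the function name and the sampled inputs are illustrative, not part of the patch):

```cpp
// Evaluate the fitted minimum-weight curve for a few normalized row counts.
#include <cmath>
#include <cstdio>

static double MinWeightFromTuplesSeen(unsigned long long n) {
	switch (n) {
	case 0: return 0;
	case 1: return 0.000161;
	case 2: return 0.530136;
	case 3: return 0.693454;
	default: return 0.99 - 0.355 * std::exp(-0.07 * static_cast<double>(n));
	}
}

int main() {
	// the curve rises quickly and approaches 0.99 as more tuples are seen
	for (unsigned long long n : {4ULL, 10ULL, 100ULL}) {
		std::printf("%llu -> %f\n", n, MinWeightFromTuplesSeen(n));
	}
	return 0;
}
```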
+ +void BaseReservoirSampling::SetNextEntry() { + D_ASSERT(!reservoir_weights.empty()); + //! 4. Let r = random(0, 1) and Xw = log(r) / log(T_w) + auto &min_key = reservoir_weights.top(); + double t_w = -min_key.first; + double r = random.NextRandom32(); + double x_w = log(r) / log(t_w); + //! 5. From the current item vc skip items until item vi, such that: + //! 6. wc + wc+1 + ··· + wi−1 < Xw <= wc + wc+1 + ··· + wi−1 + wi + //! since all our weights are 1 (uniform sampling), we can just determine the number of elements to skip + min_weight_threshold = t_w; + min_weighted_entry_index = min_key.second; + next_index_to_sample = MaxValue<idx_t>(1, idx_t(round(x_w))); + num_entries_to_skip_b4_next_sample = 0; +} + +void BaseReservoirSampling::ReplaceElementWithIndex(idx_t entry_index, double with_weight, bool pop) { + if (pop) { + reservoir_weights.pop(); + } + double r2 = with_weight; + //! now we insert the new weight into the reservoir + reservoir_weights.emplace(-r2, entry_index); + //! we update the min entry with the new min entry in the reservoir + SetNextEntry(); +} + +void BaseReservoirSampling::ReplaceElement(double with_weight) { + //! replace the entry in the reservoir + //! pop the minimum entry + reservoir_weights.pop(); + //! now update the reservoir + //! 8. Let tw = Tw^wi, r2 = random(tw, 1) and vi's key: ki = (r2)^(1/wi) + //! 9. The new threshold Tw is the new minimum key of R + //! we generate a random number between (min_weight_threshold, 1) + double r2 = random.NextRandom(min_weight_threshold, 1); + + //! if we are merging two reservoir samples use the weight passed + if (with_weight >= 0) { + r2 = with_weight; + } + //! now we insert the new weight into the reservoir + reservoir_weights.emplace(-r2, min_weighted_entry_index); + //! we update the min entry with the new min entry in the reservoir + SetNextEntry(); +} + +void BaseReservoirSampling::UpdateMinWeightThreshold() { + if (!reservoir_weights.empty()) { + min_weight_threshold = -reservoir_weights.top().first; + min_weighted_entry_index = reservoir_weights.top().second; + return; + } + min_weight_threshold = 1; +} + +void BaseReservoirSampling::FillWeights(SelectionVector &sel, idx_t &sel_size) { + if (!reservoir_weights.empty()) { + return; + } + D_ASSERT(reservoir_weights.empty()); + auto num_entries_seen_normalized = num_entries_seen_total / FIXED_SAMPLE_SIZE; + auto min_weight = GetMinWeightFromTuplesSeen(num_entries_seen_normalized); + for (idx_t i = 0; i < sel_size; i++) { + auto weight = random.NextRandom(min_weight, 1); + reservoir_weights.emplace(-weight, i); + } + D_ASSERT(reservoir_weights.size() <= sel_size); + SetNextEntry(); +} + +} // namespace duckdb
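The skip length in SetNextEntry follows the exponential-jump formulation of weighted reservoir sampling; a standalone sketch of the arithmetic with illustrative values:

```cpp
// Given the current minimum key t_w in the reservoir and a fresh random r,
// X_w = log(r) / log(t_w) is how many incoming tuples can be skipped before
// the next candidate replacement.
#include <cmath>
#include <cstdio>

int main() {
	double t_w = 0.8; // current threshold (minimum key in the reservoir)
	double r = 0.5;   // random(0, 1)
	double x_w = std::log(r) / std::log(t_w);
	std::printf("skip about %.0f tuples\n", std::round(x_w)); // ~3
	return 0;
}
```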
diff --git a/src/duckdb/src/execution/sample/reservoir_sample.cpp b/src/duckdb/src/execution/sample/reservoir_sample.cpp new file mode 100644 index 00000000..eef10fca --- /dev/null +++ b/src/duckdb/src/execution/sample/reservoir_sample.cpp @@ -0,0 +1,930 @@ +#include "duckdb/execution/reservoir_sample.hpp" +#include "duckdb/common/types/data_chunk.hpp" +#include "duckdb/common/vector_operations/vector_operations.hpp" +#include + +namespace duckdb { + +std::pair<double, idx_t> BlockingSample::PopFromWeightQueue() { + D_ASSERT(base_reservoir_sample && !base_reservoir_sample->reservoir_weights.empty()); + auto ret = base_reservoir_sample->reservoir_weights.top(); + base_reservoir_sample->reservoir_weights.pop(); + + base_reservoir_sample->UpdateMinWeightThreshold(); + D_ASSERT(base_reservoir_sample->min_weight_threshold > 0); + return ret; +} + +double BlockingSample::GetMinWeightThreshold() { + return base_reservoir_sample->min_weight_threshold; +} + +idx_t BlockingSample::GetPriorityQueueSize() { + return base_reservoir_sample->reservoir_weights.size(); +} + +void BlockingSample::Destroy() { + destroyed = true; +} + +void ReservoirChunk::Serialize(Serializer &serializer) const { + chunk.Serialize(serializer); +} + +unique_ptr<ReservoirChunk> ReservoirChunk::Deserialize(Deserializer &deserializer) { + auto result = make_uniq<ReservoirChunk>(); + result->chunk.Deserialize(deserializer); + return result; +} + +unique_ptr<ReservoirChunk> ReservoirChunk::Copy() const { + auto copy = make_uniq<ReservoirChunk>(); + copy->chunk.Initialize(Allocator::DefaultAllocator(), chunk.GetTypes()); + + chunk.Copy(copy->chunk); + return copy; +} + +ReservoirSample::ReservoirSample(idx_t sample_count, unique_ptr<ReservoirChunk> reservoir_chunk) + : ReservoirSample(Allocator::DefaultAllocator(), sample_count, 1) { + if (reservoir_chunk) { + this->reservoir_chunk = std::move(reservoir_chunk); + sel_size = this->reservoir_chunk->chunk.size(); + sel = SelectionVector(0, sel_size); + ExpandSerializedSample(); + } + stats_sample = true; +} + +ReservoirSample::ReservoirSample(Allocator &allocator, idx_t sample_count, int64_t seed) + : BlockingSample(seed), sample_count(sample_count), allocator(allocator) { + base_reservoir_sample = make_uniq<BaseReservoirSampling>(seed); + type = SampleType::RESERVOIR_SAMPLE; + reservoir_chunk = nullptr; + stats_sample = false; + sel = SelectionVector(sample_count); + sel_size = 0; +} + +idx_t ReservoirSample::GetSampleCount() { + return sample_count; +} + +idx_t ReservoirSample::NumSamplesCollected() const { + if (!reservoir_chunk) { + return 0; + } + return reservoir_chunk->chunk.size(); +} + +SamplingState ReservoirSample::GetSamplingState() const { + if (base_reservoir_sample->reservoir_weights.empty()) { + return SamplingState::RANDOM; + } + return SamplingState::RESERVOIR; +} + +idx_t ReservoirSample::GetActiveSampleCount() const { + switch (GetSamplingState()) { + case SamplingState::RANDOM: + return sel_size; + case SamplingState::RESERVOIR: + return base_reservoir_sample->reservoir_weights.size(); + default: + throw InternalException("Sampling State is INVALID"); + } +} + +idx_t ReservoirSample::GetTuplesSeen() const { + return base_reservoir_sample->num_entries_seen_total; +} + +DataChunk &ReservoirSample::Chunk() { + D_ASSERT(reservoir_chunk); + return reservoir_chunk->chunk; +}
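PopFromWeightQueue depends on the weight queue being a max-heap over negated keys, which makes the minimum original weight available in O(1); the trick in isolation:

```cpp
// A std::priority_queue is a max-heap; storing (-weight, index) pairs makes
// top() yield the entry with the minimum original weight in O(1).
#include <cstdio>
#include <queue>
#include <utility>

int main() {
	std::priority_queue<std::pair<double, unsigned>> reservoir_weights;
	reservoir_weights.emplace(-0.91, 0U);
	reservoir_weights.emplace(-0.15, 1U);
	reservoir_weights.emplace(-0.47, 2U);
	// prints: min weight 0.15 at index 1
	std::printf("min weight %.2f at index %u\n", -reservoir_weights.top().first, reservoir_weights.top().second);
	return 0;
}
```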
+ +unique_ptr<DataChunk> ReservoirSample::GetChunk() { + if (destroyed || !reservoir_chunk || Chunk().size() == 0) { + return nullptr; + } + // cannot destroy internal samples. + auto ret = make_uniq<DataChunk>(); + + SelectionVector ret_sel(STANDARD_VECTOR_SIZE); + idx_t collected_samples = GetActiveSampleCount(); + + if (collected_samples == 0) { + return nullptr; + } + + idx_t samples_remaining; + idx_t return_chunk_size; + if (collected_samples > STANDARD_VECTOR_SIZE) { + samples_remaining = collected_samples - STANDARD_VECTOR_SIZE; + return_chunk_size = STANDARD_VECTOR_SIZE; + } else { + samples_remaining = 0; + return_chunk_size = collected_samples; + } + + for (idx_t i = samples_remaining; i < collected_samples; i++) { + // pop samples and reduce size of selection vector. + if (GetSamplingState() == SamplingState::RESERVOIR) { + auto top = PopFromWeightQueue(); + ret_sel.set_index(i - samples_remaining, sel.get_index(top.second)); + } else { + ret_sel.set_index(i - samples_remaining, sel.get_index(i)); + } + sel_size -= 1; + } + + auto reservoir_types = Chunk().GetTypes(); + + ret->Initialize(allocator, reservoir_types, STANDARD_VECTOR_SIZE); + ret->Slice(Chunk(), ret_sel, return_chunk_size); + ret->SetCardinality(return_chunk_size); + return ret; +} + +unique_ptr<ReservoirChunk> ReservoirSample::CreateNewSampleChunk(vector<LogicalType> &types, idx_t size) const { + auto new_sample_chunk = make_uniq<ReservoirChunk>(); + new_sample_chunk->chunk.Initialize(Allocator::DefaultAllocator(), types, size); + + // set the NULL columns correctly + for (idx_t col_idx = 0; col_idx < types.size(); col_idx++) { + if (!ValidSampleType(types[col_idx]) && stats_sample) { + new_sample_chunk->chunk.data[col_idx].SetVectorType(VectorType::CONSTANT_VECTOR); + ConstantVector::SetNull(new_sample_chunk->chunk.data[col_idx], true); + } + } + return new_sample_chunk; +} + +void ReservoirSample::Vacuum() { + Verify(); + if (NumSamplesCollected() <= FIXED_SAMPLE_SIZE || !reservoir_chunk || destroyed) { + // sample is destroyed or too small to shrink + return; + } + + auto ret = Copy(); + auto ret_reservoir = duckdb::unique_ptr_cast<BlockingSample, ReservoirSample>(std::move(ret)); + reservoir_chunk = std::move(ret_reservoir->reservoir_chunk); + sel = std::move(ret_reservoir->sel); + sel_size = ret_reservoir->sel_size; + + Verify(); + // We should only have one sample chunk now. + D_ASSERT(Chunk().size() > 0 && Chunk().size() <= sample_count); +} + +unique_ptr<BlockingSample> ReservoirSample::Copy() const { + + auto ret = make_uniq<ReservoirSample>(sample_count); + ret->stats_sample = stats_sample; + + ret->base_reservoir_sample = base_reservoir_sample->Copy(); + ret->destroyed = destroyed; + + if (!reservoir_chunk || destroyed) { + return unique_ptr_cast<ReservoirSample, BlockingSample>(std::move(ret)); + } + + D_ASSERT(reservoir_chunk); + + // create a new sample chunk to store new samples + auto types = reservoir_chunk->chunk.GetTypes(); + // how many values should be copied + idx_t values_to_copy = MinValue(GetActiveSampleCount(), sample_count); + + auto new_sample_chunk = CreateNewSampleChunk(types, GetReservoirChunkCapacity()); + + SelectionVector sel_copy(sel); + + ret->reservoir_chunk = std::move(new_sample_chunk); + ret->UpdateSampleAppend(ret->reservoir_chunk->chunk, reservoir_chunk->chunk, sel_copy, values_to_copy); + ret->sel = SelectionVector(values_to_copy); + for (idx_t i = 0; i < values_to_copy; i++) { + ret->sel.set_index(i, i); + } + ret->sel_size = sel_size; + D_ASSERT(ret->reservoir_chunk->chunk.size() <= sample_count); + ret->Verify(); + return unique_ptr_cast<ReservoirSample, BlockingSample>(std::move(ret)); +} + +void ReservoirSample::ConvertToReservoirSample() { + D_ASSERT(sel_size <= sample_count); + base_reservoir_sample->FillWeights(sel, sel_size); +} + +vector<uint32_t> ReservoirSample::GetRandomizedVector(uint32_t range, uint32_t size) const { + vector<uint32_t> ret; + ret.reserve(range); + for (uint32_t i = 0; i < range; i++) { + ret.push_back(i); + } + if (size == FIXED_SAMPLE_SIZE) { + std::shuffle(ret.begin(), ret.end(), base_reservoir_sample->random); + return ret; + } + for (uint32_t i = 0; i < size; i++) { + uint32_t random_shuffle = base_reservoir_sample->random.NextRandomInteger32(i, range); + if (random_shuffle == i) { + // leave the value where it is + continue; + } + uint32_t tmp = ret[random_shuffle]; + // basically replacing the tuple that was at index actual_sample_indexes[random_shuffle] + ret[random_shuffle] = ret[i]; + ret[i] = tmp; + } + return ret; +}
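GetRandomizedVector only needs the first `size` slots to be uniformly random, so it stops the Fisher-Yates shuffle early; the same idea as a standalone sketch (uses its own RNG rather than DuckDB's RandomEngine):

```cpp
// Partial Fisher-Yates shuffle: only the first `size` positions need to be
// random, so the loop stops after `size` swaps instead of shuffling everything.
#include <cstdio>
#include <random>
#include <vector>

int main() {
	std::mt19937 rng(42);
	std::vector<unsigned> ret(16);
	for (unsigned i = 0; i < 16; i++) ret[i] = i;
	unsigned size = 4; // only need 4 random positions out of 16
	for (unsigned i = 0; i < size; i++) {
		std::uniform_int_distribution<unsigned> dist(i, 15);
		std::swap(ret[i], ret[dist(rng)]);
	}
	for (unsigned i = 0; i < size; i++) std::printf("%u ", ret[i]);
	std::printf("\n");
	return 0;
}
```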
+ +void ReservoirSample::SimpleMerge(ReservoirSample &other) { + D_ASSERT(GetPriorityQueueSize() == 0); + D_ASSERT(other.GetPriorityQueueSize() == 0); + D_ASSERT(GetSamplingState() == SamplingState::RANDOM); + D_ASSERT(other.GetSamplingState() == SamplingState::RANDOM); + + if (other.GetActiveSampleCount() == 0 && other.GetTuplesSeen() == 0) { + return; + } + + if (GetActiveSampleCount() == 0 && GetTuplesSeen() == 0) { + sel = SelectionVector(other.sel); + sel_size = other.sel_size; + base_reservoir_sample->num_entries_seen_total = other.GetTuplesSeen(); + return; + } + + idx_t total_seen = GetTuplesSeen() + other.GetTuplesSeen(); + + auto weight_tuples_this = static_cast<double>(GetTuplesSeen()) / static_cast<double>(total_seen); + auto weight_tuples_other = static_cast<double>(other.GetTuplesSeen()) / static_cast<double>(total_seen); + + // If the weights don't add up to 1, most likely a simple merge occurred and no new samples were added. + // If that is the case, add the missing weight to the lower-weighted sample to adjust. + // This is to avoid cases where, if you have a 20k row table and add another 20k rows row by row, + // the missing weights would eventually add up; the adjustment keeps the distribution even. + if (weight_tuples_this + weight_tuples_other < 1) { + weight_tuples_other += 1 - (weight_tuples_other + weight_tuples_this); + } + + idx_t keep_from_this = 0; + idx_t keep_from_other = 0; + D_ASSERT(stats_sample); + D_ASSERT(sample_count == FIXED_SAMPLE_SIZE); + D_ASSERT(sample_count == other.sample_count); + auto sample_count_double = static_cast<double>(sample_count); + + if (weight_tuples_this > weight_tuples_other) { + keep_from_this = MinValue<idx_t>(static_cast<idx_t>(round(sample_count_double * weight_tuples_this)), + GetActiveSampleCount()); + keep_from_other = MinValue<idx_t>(sample_count - keep_from_this, other.GetActiveSampleCount()); + } else { + keep_from_other = MinValue<idx_t>(static_cast<idx_t>(round(sample_count_double * weight_tuples_other)), + other.GetActiveSampleCount()); + keep_from_this = MinValue<idx_t>(sample_count - keep_from_other, GetActiveSampleCount()); + } + + D_ASSERT(keep_from_this <= GetActiveSampleCount()); + D_ASSERT(keep_from_other <= other.GetActiveSampleCount()); + D_ASSERT(keep_from_other + keep_from_this <= FIXED_SAMPLE_SIZE); + idx_t size_after_merge = MinValue<idx_t>(keep_from_other + keep_from_this, FIXED_SAMPLE_SIZE); + + // Check if appending the other samples to this will go over the sample chunk size + if (reservoir_chunk->chunk.size() + keep_from_other > GetReservoirChunkCapacity()) { + Vacuum(); + } + + D_ASSERT(size_after_merge <= other.GetActiveSampleCount() + GetActiveSampleCount()); + SelectionVector chunk_sel(keep_from_other); + auto offset = reservoir_chunk->chunk.size(); + for (idx_t i = keep_from_this; i < size_after_merge; i++) { + if (i >= GetActiveSampleCount()) { + sel.set_index(GetActiveSampleCount(), offset); + sel_size += 1; + } else { + sel.set_index(i, offset); + } + chunk_sel.set_index(i - keep_from_this, other.sel.get_index(i - keep_from_this)); + offset += 1; + } + + D_ASSERT(GetActiveSampleCount() == size_after_merge); + + // Copy the rows that make it into the sample from other and put them into this. + UpdateSampleAppend(reservoir_chunk->chunk, other.reservoir_chunk->chunk, chunk_sel, keep_from_other); + base_reservoir_sample->num_entries_seen_total += other.GetTuplesSeen(); + + // if THIS now has too many samples, we convert it to the slower sampling method. + if (GetTuplesSeen() >= FIXED_SAMPLE_SIZE * FAST_TO_SLOW_THRESHOLD) { + ConvertToReservoirSample(); + } + Verify(); +}
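SimpleMerge splits the sample budget proportionally to how many tuples each side has seen; the arithmetic as a standalone sketch, assuming FIXED_SAMPLE_SIZE is 2048 (consistent with the "2048 rows" comment later in this patch):

```cpp
// Proportional split of a fixed sample budget between two merged samples.
#include <cmath>
#include <cstdio>

int main() {
	double seen_this = 60000, seen_other = 20000;
	double w_this = seen_this / (seen_this + seen_other);  // 0.75
	unsigned sample_count = 2048;
	unsigned keep_from_this = static_cast<unsigned>(std::round(sample_count * w_this)); // 1536
	unsigned keep_from_other = sample_count - keep_from_this;                           // 512
	std::printf("keep %u from this, %u from other\n", keep_from_this, keep_from_other);
	return 0;
}
```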
+ +void ReservoirSample::WeightedMerge(ReservoirSample &other_sample) { + D_ASSERT(GetSamplingState() == SamplingState::RESERVOIR); + D_ASSERT(other_sample.GetSamplingState() == SamplingState::RESERVOIR); + + // Find out how many samples we want to keep. + idx_t total_samples = GetActiveSampleCount() + other_sample.GetActiveSampleCount(); + idx_t total_samples_seen = + base_reservoir_sample->num_entries_seen_total + other_sample.base_reservoir_sample->num_entries_seen_total; + idx_t num_samples_to_keep = MinValue<idx_t>(total_samples, MinValue<idx_t>(sample_count, total_samples_seen)); + + D_ASSERT(GetActiveSampleCount() <= num_samples_to_keep); + D_ASSERT(total_samples <= FIXED_SAMPLE_SIZE * 2); + + // pop from the base reservoir weights until there are num_samples_to_keep left. + vector<idx_t> this_indexes_to_replace; + for (idx_t i = num_samples_to_keep; i < total_samples; i++) { + auto min_weight_this = base_reservoir_sample->min_weight_threshold; + auto min_weight_other = other_sample.base_reservoir_sample->min_weight_threshold; + // the min weight threshold is always positive + if (min_weight_this > min_weight_other) { + // pop from other + other_sample.base_reservoir_sample->reservoir_weights.pop(); + other_sample.base_reservoir_sample->UpdateMinWeightThreshold(); + } else { + auto top_this = PopFromWeightQueue(); + this_indexes_to_replace.push_back(top_this.second); + base_reservoir_sample->UpdateMinWeightThreshold(); + } + } + + D_ASSERT(other_sample.GetPriorityQueueSize() + GetPriorityQueueSize() <= FIXED_SAMPLE_SIZE); + D_ASSERT(other_sample.GetPriorityQueueSize() + GetPriorityQueueSize() == num_samples_to_keep); + D_ASSERT(other_sample.reservoir_chunk->chunk.GetTypes() == reservoir_chunk->chunk.GetTypes()); + + // Prepare a selection vector to copy data from the other sample chunk to this sample chunk + SelectionVector sel_other(other_sample.GetPriorityQueueSize()); + D_ASSERT(GetPriorityQueueSize() <= num_samples_to_keep); + D_ASSERT(other_sample.GetPriorityQueueSize() >= this_indexes_to_replace.size()); + idx_t chunk_offset = 0; + + // Now push weights from other.base_reservoir_sample to this. + // Depending on how many sample values "this" has, we either need to add to the selection vector + // or replace values in "this'" selection vector. + idx_t i = 0; + while (other_sample.GetPriorityQueueSize() > 0) { + auto other_top = other_sample.PopFromWeightQueue(); + idx_t index_for_new_pair = chunk_offset + reservoir_chunk->chunk.size(); + + // update the sel used to copy values from other to this + sel_other.set_index(chunk_offset, other_top.second); + if (i < this_indexes_to_replace.size()) { + auto replacement_index = this_indexes_to_replace[i]; + sel.set_index(replacement_index, index_for_new_pair); + other_top.second = replacement_index; + } else { + sel.set_index(sel_size, index_for_new_pair); + other_top.second = sel_size; + sel_size += 1; + } + + // make sure that the sample indexes are (this.sample_chunk.size() + chunk_offset) + base_reservoir_sample->reservoir_weights.push(other_top); + chunk_offset += 1; + i += 1; + } + + D_ASSERT(GetPriorityQueueSize() == num_samples_to_keep); + + base_reservoir_sample->UpdateMinWeightThreshold(); + D_ASSERT(base_reservoir_sample->min_weight_threshold > 0); + base_reservoir_sample->num_entries_seen_total = GetTuplesSeen() + other_sample.GetTuplesSeen(); + + UpdateSampleAppend(reservoir_chunk->chunk, other_sample.reservoir_chunk->chunk, sel_other, chunk_offset); + if (reservoir_chunk->chunk.size() > FIXED_SAMPLE_SIZE * (FIXED_SAMPLE_SIZE_MULTIPLIER - 3)) { + Vacuum(); + } + + Verify(); +}
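WeightedMerge keeps the num_samples_to_keep globally largest weights by repeatedly discarding the smaller of the two reservoirs' minimums; a standalone sketch with illustrative weights:

```cpp
// Keep the k highest weights across two reservoirs by always evicting the
// smaller of the two current minimums (max-heaps over negated weights).
#include <cstdio>
#include <queue>

int main() {
	std::priority_queue<double> a, b;
	for (double w : {0.2, 0.5, 0.9}) a.push(-w);
	for (double w : {0.1, 0.6, 0.8}) b.push(-w);
	size_t total = 6, keep = 3;
	for (size_t i = keep; i < total; i++) {
		// discard the globally smallest weight
		if (-a.top() < -b.top()) {
			a.pop();
		} else {
			b.pop();
		}
	}
	std::printf("kept %zu entries\n", a.size() + b.size()); // the weights 0.6, 0.8, 0.9 survive
	return 0;
}
```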
+ +void ReservoirSample::Merge(unique_ptr<BlockingSample> other) { + if (destroyed || other->destroyed) { + Destroy(); + return; + } + + D_ASSERT(other->type == SampleType::RESERVOIR_SAMPLE); + auto &other_sample = other->Cast<ReservoirSample>(); + + // if the other sample has not collected anything yet, return + if (!other_sample.reservoir_chunk || other_sample.reservoir_chunk->chunk.size() == 0) { + return; + } + + // this has not collected samples, take over the other + if (!reservoir_chunk || reservoir_chunk->chunk.size() == 0) { + base_reservoir_sample = std::move(other->base_reservoir_sample); + reservoir_chunk = std::move(other_sample.reservoir_chunk); + sel = SelectionVector(other_sample.sel); + sel_size = other_sample.sel_size; + Verify(); + return; + } + //! Both samples are still in the "fast sampling" method + if (GetSamplingState() == SamplingState::RANDOM && other_sample.GetSamplingState() == SamplingState::RANDOM) { + SimpleMerge(other_sample); + return; + } + + // One or none of the samples are in the "fast sampling" method. + // When this is the case, switch both to slow sampling. + ConvertToReservoirSample(); + other_sample.ConvertToReservoirSample(); + WeightedMerge(other_sample); +} + +void ReservoirSample::ShuffleSel(SelectionVector &sel, idx_t range, idx_t size) const { + auto randomized = GetRandomizedVector(static_cast<uint32_t>(range), static_cast<uint32_t>(size)); + SelectionVector original_sel(range); + for (idx_t i = 0; i < range; i++) { + original_sel.set_index(i, sel.get_index(i)); + } + for (idx_t i = 0; i < size; i++) { + sel.set_index(i, original_sel.get_index(randomized[i])); + } +} + +void ReservoirSample::NormalizeWeights() { + vector<std::pair<double, idx_t>> tmp_weights; + while (!base_reservoir_sample->reservoir_weights.empty()) { + auto top = base_reservoir_sample->reservoir_weights.top(); + tmp_weights.push_back(std::move(top)); + base_reservoir_sample->reservoir_weights.pop(); + } + std::sort(tmp_weights.begin(), tmp_weights.end(), + [&](std::pair<double, idx_t> a, std::pair<double, idx_t> b) { return a.second < b.second; }); + for (idx_t i = 0; i < tmp_weights.size(); i++) { + base_reservoir_sample->reservoir_weights.emplace(tmp_weights.at(i).first, i); + } + base_reservoir_sample->SetNextEntry(); +}
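NormalizeWeights re-indexes the surviving weights densely as 0..n-1, ordered by their old selection index; the drain, sort, and re-insert pattern in isolation:

```cpp
// Drain the heap, sort entries by their old row index, re-insert with dense
// indexes 0..n-1 so the weights line up with a compacted selection vector.
#include <algorithm>
#include <cstdio>
#include <queue>
#include <utility>
#include <vector>

int main() {
	std::priority_queue<std::pair<double, unsigned>> heap;
	heap.emplace(-0.4, 7U);
	heap.emplace(-0.9, 2U);
	heap.emplace(-0.6, 5U);
	std::vector<std::pair<double, unsigned>> tmp;
	while (!heap.empty()) {
		tmp.push_back(heap.top());
		heap.pop();
	}
	std::sort(tmp.begin(), tmp.end(), [](auto &a, auto &b) { return a.second < b.second; });
	for (unsigned i = 0; i < tmp.size(); i++) {
		heap.emplace(tmp[i].first, i); // old indexes 2, 5, 7 become 0, 1, 2
	}
	std::printf("reindexed %zu weights\n", heap.size());
	return 0;
}
```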
+ +void ReservoirSample::EvictOverBudgetSamples() { + Verify(); + if (!reservoir_chunk || destroyed) { + return; + } + + // since this is for serialization, we really need to make sure we keep a + // minimum of 1% of the rows or 2048 rows + idx_t num_samples_to_keep = + MinValue<idx_t>(FIXED_SAMPLE_SIZE, static_cast<idx_t>(SAVE_PERCENTAGE * static_cast<double>(GetTuplesSeen()))); + + if (num_samples_to_keep <= 0) { + reservoir_chunk->chunk.SetCardinality(0); + return; + } + + if (num_samples_to_keep == sample_count) { + return; + } + + // if we oversampled, make sure we only keep the highest-weighted samples + std::unordered_set<idx_t> selections_to_delete; + + while (num_samples_to_keep < GetPriorityQueueSize()) { + auto top = PopFromWeightQueue(); + D_ASSERT(top.second < sel_size); + selections_to_delete.emplace(top.second); + } + + // set up the reservoir chunk for the reservoir sample + D_ASSERT(reservoir_chunk->chunk.size() <= sample_count); + // create a new sample chunk to store new samples + auto types = reservoir_chunk->chunk.GetTypes(); + D_ASSERT(num_samples_to_keep <= sample_count); + D_ASSERT(stats_sample); + D_ASSERT(sample_count == STANDARD_VECTOR_SIZE); + auto new_reservoir_chunk = CreateNewSampleChunk(types, STANDARD_VECTOR_SIZE); + + // The current selection vector can potentially have 2048 valid mappings. + // If we need to save a sample with fewer rows than that, we need to do the following: + // 1. Create a new selection vector that doesn't point to the rows we are evicting + SelectionVector new_sel(num_samples_to_keep); + idx_t offset = 0; + for (idx_t i = 0; i < num_samples_to_keep + selections_to_delete.size(); i++) { + if (selections_to_delete.find(i) == selections_to_delete.end()) { + D_ASSERT(i - offset < num_samples_to_keep); + new_sel.set_index(i - offset, sel.get_index(i)); + } else { + offset += 1; + } + } + // 2. Update the row ids in our weights so that they don't point to + // indexes in the selection vector that have been evicted. + if (!selections_to_delete.empty()) { + NormalizeWeights(); + } + + D_ASSERT(reservoir_chunk->chunk.GetTypes() == new_reservoir_chunk->chunk.GetTypes()); + + UpdateSampleAppend(new_reservoir_chunk->chunk, reservoir_chunk->chunk, new_sel, num_samples_to_keep); + // set the cardinality + new_reservoir_chunk->chunk.SetCardinality(num_samples_to_keep); + reservoir_chunk = std::move(new_reservoir_chunk); + sel_size = num_samples_to_keep; + base_reservoir_sample->UpdateMinWeightThreshold(); +} + +void ReservoirSample::ExpandSerializedSample() { + if (!reservoir_chunk) { + return; + } + + auto types = reservoir_chunk->chunk.GetTypes(); + auto new_res_chunk = CreateNewSampleChunk(types, GetReservoirChunkCapacity()); + auto copy_count = reservoir_chunk->chunk.size(); + SelectionVector tmp_sel = SelectionVector(0, copy_count); + UpdateSampleAppend(new_res_chunk->chunk, reservoir_chunk->chunk, tmp_sel, copy_count); + new_res_chunk->chunk.SetCardinality(copy_count); + std::swap(reservoir_chunk, new_res_chunk); +} + +idx_t ReservoirSample::GetReservoirChunkCapacity() const { + return sample_count + (FIXED_SAMPLE_SIZE_MULTIPLIER * FIXED_SAMPLE_SIZE); +}
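EvictOverBudgetSamples trims the persisted sample to min(FIXED_SAMPLE_SIZE, SAVE_PERCENTAGE * tuples seen); a sketch of that budget, assuming FIXED_SAMPLE_SIZE = 2048 and SAVE_PERCENTAGE = 0.01 per the "1%" comment above:

```cpp
// Serialization budget: keep at most 2048 rows, and no more than 1% of what was seen.
#include <algorithm>
#include <cstdio>

int main() {
	unsigned long long tuples_seen[] = {10000ULL, 1000000ULL};
	for (auto seen : tuples_seen) {
		auto keep = std::min(2048ULL, static_cast<unsigned long long>(0.01 * seen));
		std::printf("seen %llu -> serialize %llu rows\n", seen, keep); // 100, then 2048
	}
	return 0;
}
```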
+ +idx_t ReservoirSample::FillReservoir(DataChunk &chunk) { + + idx_t ingested_count = 0; + if (!reservoir_chunk) { + if (chunk.size() > FIXED_SAMPLE_SIZE) { + throw InternalException("Creating sample with DataChunk that is larger than the fixed sample size"); + } + auto types = chunk.GetTypes(); + // create a new sample chunk to store new samples + reservoir_chunk = CreateNewSampleChunk(types, GetReservoirChunkCapacity()); + } + + idx_t actual_sample_index_start = GetActiveSampleCount(); + D_ASSERT(reservoir_chunk->chunk.ColumnCount() == chunk.ColumnCount()); + + if (reservoir_chunk->chunk.size() < sample_count) { + ingested_count = MinValue<idx_t>(sample_count - reservoir_chunk->chunk.size(), chunk.size()); + auto random_other_sel = + GetRandomizedVector(static_cast<uint32_t>(ingested_count), static_cast<uint32_t>(ingested_count)); + SelectionVector sel_for_input_chunk(ingested_count); + for (idx_t i = 0; i < ingested_count; i++) { + sel.set_index(actual_sample_index_start + i, actual_sample_index_start + i); + sel_for_input_chunk.set_index(i, random_other_sel[i]); + } + UpdateSampleAppend(reservoir_chunk->chunk, chunk, sel_for_input_chunk, ingested_count); + sel_size += ingested_count; + } + D_ASSERT(GetActiveSampleCount() <= sample_count); + D_ASSERT(GetActiveSampleCount() >= ingested_count); + // always return how many tuples were ingested + return ingested_count; +} + +void ReservoirSample::Destroy() { + destroyed = true; +} + +SelectionVectorHelper ReservoirSample::GetReplacementIndexes(idx_t sample_chunk_offset, + idx_t theoretical_chunk_length) { + if (GetSamplingState() == SamplingState::RANDOM) { + return GetReplacementIndexesFast(sample_chunk_offset, theoretical_chunk_length); + } + return GetReplacementIndexesSlow(sample_chunk_offset, theoretical_chunk_length); +} + +SelectionVectorHelper ReservoirSample::GetReplacementIndexesFast(idx_t sample_chunk_offset, idx_t chunk_length) { + + // how much weight do the other tuples have compared to the ones in this chunk? + auto weight_tuples_other = static_cast<double>(chunk_length) / static_cast<double>(GetTuplesSeen() + chunk_length); + auto num_to_pop = static_cast<uint32_t>(round(weight_tuples_other * static_cast<double>(sample_count))); + D_ASSERT(num_to_pop <= sample_count); + D_ASSERT(num_to_pop <= sel_size); + SelectionVectorHelper ret; + + if (num_to_pop == 0) { + ret.sel = SelectionVector(num_to_pop); + ret.size = 0; + return ret; + } + std::unordered_map<idx_t, idx_t> replacement_indexes; + SelectionVector chunk_sel(num_to_pop); + + auto random_indexes_chunk = GetRandomizedVector(static_cast<uint32_t>(chunk_length), num_to_pop); + auto random_sel_indexes = GetRandomizedVector(static_cast<uint32_t>(sel_size), num_to_pop); + for (idx_t i = 0; i < num_to_pop; i++) { + // update the selection vector for the reservoir sample + chunk_sel.set_index(i, random_indexes_chunk[i]); + // sel is not guaranteed to be random, so we update the indexes according to our + // random sel indexes. + sel.set_index(random_sel_indexes[i], sample_chunk_offset + i); + } + + D_ASSERT(sel_size == sample_count); + + ret.sel = SelectionVector(chunk_sel); + ret.size = num_to_pop; + return ret; +}
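In the fast path, the expected number of sample rows to replace is proportional to the chunk's share of all tuples seen; a standalone sketch of the num_to_pop computation:

```cpp
// Expected number of replacements for a new chunk in the fast sampling path.
#include <cmath>
#include <cstdio>

int main() {
	double tuples_seen = 100000, chunk_length = 2048, sample_count = 2048;
	double weight_other = chunk_length / (tuples_seen + chunk_length);
	auto num_to_pop = static_cast<unsigned>(std::round(weight_other * sample_count)); // ~41
	std::printf("replace %u of %u sample rows\n", num_to_pop, static_cast<unsigned>(sample_count));
	return 0;
}
```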
+ +SelectionVectorHelper ReservoirSample::GetReplacementIndexesSlow(const idx_t sample_chunk_offset, + const idx_t chunk_length) { + idx_t remaining = chunk_length; + std::unordered_map<idx_t, idx_t> ret_map; + idx_t sample_chunk_index = 0; + + idx_t base_offset = 0; + + while (true) { + idx_t offset = + base_reservoir_sample->next_index_to_sample - base_reservoir_sample->num_entries_to_skip_b4_next_sample; + if (offset >= remaining) { + // not in this chunk! increment current count and go to the next chunk + base_reservoir_sample->num_entries_to_skip_b4_next_sample += remaining; + break; + } + // in this chunk! replace the element + // ret[index_in_new_chunk] = index_in_sample_chunk (the sample chunk offset will be applied later) + ret_map[base_offset + offset] = sample_chunk_index; + double r2 = base_reservoir_sample->random.NextRandom32(base_reservoir_sample->min_weight_threshold, 1); + // replace the element in our max-heap + // first get the topmost pair + const auto top = PopFromWeightQueue(); + const auto index = top.second; + const auto index_in_sample_chunk = sample_chunk_offset + sample_chunk_index; + sel.set_index(index, index_in_sample_chunk); + base_reservoir_sample->ReplaceElementWithIndex(index, r2, false); + + sample_chunk_index += 1; + // shift the chunk forward + remaining -= offset; + base_offset += offset; + } + + // create the selection vector to return + SelectionVector ret_sel(ret_map.size()); + D_ASSERT(sel_size == sample_count); + for (auto &kv : ret_map) { + ret_sel.set_index(kv.second, kv.first); + } + SelectionVectorHelper ret; + ret.sel = SelectionVector(ret_sel); + ret.size = static_cast<uint32_t>(ret_map.size()); + return ret; +} + +void ReservoirSample::Finalize() { +} + +bool ReservoirSample::ValidSampleType(const LogicalType &type) { + return type.IsNumeric(); +} + +void ReservoirSample::UpdateSampleAppend(DataChunk &this_, DataChunk &other, SelectionVector &other_sel, + idx_t append_count) const { + idx_t new_size = this_.size() + append_count; + if (other.size() == 0) { + return; + } + D_ASSERT(this_.GetTypes() == other.GetTypes()); + + auto types = reservoir_chunk->chunk.GetTypes(); + + for (idx_t i = 0; i < reservoir_chunk->chunk.ColumnCount(); i++) { + auto col_type = types[i]; + if (ValidSampleType(col_type) || !stats_sample) { + D_ASSERT(this_.data[i].GetVectorType() == VectorType::FLAT_VECTOR); + VectorOperations::Copy(other.data[i], this_.data[i], other_sel, append_count, 0, this_.size()); + } + } + this_.SetCardinality(new_size); +}
+ +void ReservoirSample::AddToReservoir(DataChunk &chunk) { + if (destroyed || chunk.size() == 0) { + return; + } + + idx_t tuples_consumed = FillReservoir(chunk); + base_reservoir_sample->num_entries_seen_total += tuples_consumed; + D_ASSERT(reservoir_chunk->chunk.size() >= 1); + + if (tuples_consumed == chunk.size()) { + return; + } + + // the chunk filled the first FIXED_SAMPLE_SIZE chunk but still has tuples remaining + // slice the chunk and call AddToReservoir again. + if (tuples_consumed != chunk.size() && tuples_consumed != 0) { + // FillReservoir consumed some of the chunk to reach FIXED_SAMPLE_SIZE, + // so we slice off the remainder and call AddToReservoir on it + auto slice = make_uniq<DataChunk>(); + auto samples_remaining = chunk.size() - tuples_consumed; + auto types = chunk.GetTypes(); + SelectionVector input_sel(samples_remaining); + for (idx_t i = 0; i < samples_remaining; i++) { + input_sel.set_index(i, tuples_consumed + i); + } + slice->Initialize(Allocator::DefaultAllocator(), types, samples_remaining); + slice->Slice(chunk, input_sel, samples_remaining); + slice->SetCardinality(samples_remaining); + AddToReservoir(*slice); + return; + } + + // at this point we should have collected at least sample_count samples + D_ASSERT(GetActiveSampleCount() >= sample_count); + + auto chunk_sel = GetReplacementIndexes(reservoir_chunk->chunk.size(), chunk.size()); + + if (chunk_sel.size == 0) { + // not adding any samples + return; + } + idx_t size = chunk_sel.size; + D_ASSERT(size <= chunk.size()); + + UpdateSampleAppend(reservoir_chunk->chunk, chunk, chunk_sel.sel, size); + + base_reservoir_sample->num_entries_seen_total += chunk.size(); + D_ASSERT(base_reservoir_sample->reservoir_weights.size() == 0 || + base_reservoir_sample->reservoir_weights.size() == sample_count); + + Verify(); + + // if we are over the threshold, we need to switch to slow sampling. + if (GetSamplingState() == SamplingState::RANDOM && GetTuplesSeen() >= FIXED_SAMPLE_SIZE * FAST_TO_SLOW_THRESHOLD) { + ConvertToReservoirSample(); + } + if (reservoir_chunk->chunk.size() >= (GetReservoirChunkCapacity() - (static_cast<idx_t>(FIXED_SAMPLE_SIZE) * 3))) { + Vacuum(); + } +}
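Taken together, FillReservoir, GetReplacementIndexes and UpdateSampleAppend give AddToReservoir its streaming behavior. A hypothetical in-tree usage sketch, using only signatures declared in this patch (the surrounding function is invented for illustration):

```cpp
#include "duckdb/execution/reservoir_sample.hpp"

namespace duckdb {

// Stream chunks into a fixed-size sample, then drain it.
void SampleChunks(Allocator &allocator, vector<DataChunk> &chunks) {
	ReservoirSample sample(allocator, 2048, /* seed */ 1);
	for (auto &chunk : chunks) {
		// may convert from fast random sampling to weighted reservoir sampling internally
		sample.AddToReservoir(chunk);
	}
	// GetChunk returns at most STANDARD_VECTOR_SIZE rows per call, nullptr when drained
	while (auto result = sample.GetChunk()) {
		(void)result; // consume *result here
	}
}

} // namespace duckdb
```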
+ +void ReservoirSample::Verify() { +#ifdef DEBUG + if (destroyed) { + return; + } + if (GetPriorityQueueSize() == 0) { + D_ASSERT(GetActiveSampleCount() <= sample_count); + D_ASSERT(GetTuplesSeen() >= GetActiveSampleCount()); + return; + } + if (NumSamplesCollected() > sample_count) { + D_ASSERT(GetPriorityQueueSize() == sample_count); + } else if (NumSamplesCollected() <= sample_count && GetPriorityQueueSize() > 0) { + // it's possible to collect more samples than your priority queue size. + // see sample_converts_to_reservoir_sample.test + D_ASSERT(NumSamplesCollected() >= GetPriorityQueueSize()); + } + auto base_reservoir_copy = base_reservoir_sample->Copy(); + std::unordered_map<idx_t, idx_t> index_count; + while (!base_reservoir_copy->reservoir_weights.empty()) { + auto &pair = base_reservoir_copy->reservoir_weights.top(); + if (index_count.find(pair.second) == index_count.end()) { + index_count[pair.second] = 1; + base_reservoir_copy->reservoir_weights.pop(); + } else { + index_count[pair.second] += 1; + base_reservoir_copy->reservoir_weights.pop(); + throw InternalException("Duplicate selection index in reservoir weights"); + } + } + // TODO: Verify the Sel as well. No duplicate indices. + + if (reservoir_chunk) { + reservoir_chunk->chunk.Verify(); + } +#endif +} + +ReservoirSamplePercentage::ReservoirSamplePercentage(double percentage, int64_t seed, idx_t reservoir_sample_size) + : BlockingSample(seed), allocator(Allocator::DefaultAllocator()), sample_percentage(percentage / 100.0), + reservoir_sample_size(reservoir_sample_size), current_count(0), is_finalized(false) { + current_sample = make_uniq<ReservoirSample>(allocator, reservoir_sample_size, base_reservoir_sample->random()); + type = SampleType::RESERVOIR_PERCENTAGE_SAMPLE; +} + +ReservoirSamplePercentage::ReservoirSamplePercentage(Allocator &allocator, double percentage, int64_t seed) + : BlockingSample(seed), allocator(allocator), sample_percentage(percentage / 100.0), current_count(0), + is_finalized(false) { + reservoir_sample_size = (idx_t)(sample_percentage * RESERVOIR_THRESHOLD); + current_sample = make_uniq<ReservoirSample>(allocator, reservoir_sample_size, base_reservoir_sample->random()); + type = SampleType::RESERVOIR_PERCENTAGE_SAMPLE; +} + +ReservoirSamplePercentage::ReservoirSamplePercentage(double percentage, int64_t seed) + : ReservoirSamplePercentage(Allocator::DefaultAllocator(), percentage, seed) { +} + +void ReservoirSamplePercentage::AddToReservoir(DataChunk &input) { + base_reservoir_sample->num_entries_seen_total += input.size(); + if (current_count + input.size() > RESERVOIR_THRESHOLD) { + // we don't have enough space in our current reservoir + // first check what we still need to append to the current sample + idx_t append_to_current_sample_count = RESERVOIR_THRESHOLD - current_count; + idx_t append_to_next_sample = input.size() - append_to_current_sample_count; + if (append_to_current_sample_count > 0) { + // we have elements remaining, first add them to the current sample + if (append_to_next_sample > 0) { + // we need to also add to the next sample + DataChunk new_chunk; + new_chunk.InitializeEmpty(input.GetTypes()); + new_chunk.Slice(input, *FlatVector::IncrementalSelectionVector(), append_to_current_sample_count); + new_chunk.Flatten(); + current_sample->AddToReservoir(new_chunk); + } else { + input.Flatten(); + input.SetCardinality(append_to_current_sample_count); + current_sample->AddToReservoir(input); + } + } + if (append_to_next_sample > 0) { + // slice the input for the remainder + SelectionVector sel(append_to_next_sample); + for (idx_t i = append_to_current_sample_count; i < append_to_next_sample + append_to_current_sample_count; + i++) { + sel.set_index(i - append_to_current_sample_count, i); + } + input.Slice(sel, append_to_next_sample); + } + // now our first sample is filled: append it to the set of finished samples + finished_samples.push_back(std::move(current_sample)); + + // allocate a new sample, and potentially add the remainder of the current input to that sample + current_sample = make_uniq<ReservoirSample>(allocator, reservoir_sample_size, base_reservoir_sample->random()); + if (append_to_next_sample > 0) { + current_sample->AddToReservoir(input); + } + current_count = append_to_next_sample; + } else { + // we can just append to the current sample + current_count += input.size(); + current_sample->AddToReservoir(input); + } +}
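ReservoirSamplePercentage splits its input into windows of RESERVOIR_THRESHOLD rows, each owning a fixed-size reservoir; a sketch of the window arithmetic (the threshold value is assumed for illustration):

```cpp
// Window arithmetic for a 25% percentage sample over 330k rows.
#include <cstdio>

int main() {
	unsigned long long reservoir_threshold = 100000; // rows per window (assumed)
	double sample_percentage = 0.25;
	unsigned long long rows = 330000;
	unsigned long long full_windows = rows / reservoir_threshold; // 3 finished samples
	auto per_window = static_cast<unsigned long long>(sample_percentage * reservoir_threshold); // 25000 rows each
	std::printf("%llu finished samples of %llu rows, plus a partial window of %llu rows\n",
	            full_windows, per_window, rows % reservoir_threshold);
	return 0;
}
```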
+ +unique_ptr<DataChunk> ReservoirSamplePercentage::GetChunk() { + // chunks are only handed out once the sample has been finalized + if (!is_finalized) { + Finalize(); + } + while (!finished_samples.empty()) { + auto &front = finished_samples.front(); + auto chunk = front->GetChunk(); + if (chunk && chunk->size() > 0) { + return chunk; + } + // move to the next sample + finished_samples.erase(finished_samples.begin()); + } + return nullptr; +} + +unique_ptr<BlockingSample> ReservoirSamplePercentage::Copy() const { + throw InternalException("Cannot call Copy on ReservoirSamplePercentage"); +} + +void ReservoirSamplePercentage::Finalize() { + // need to finalize the current sample, if any + // we are finalizing, so we are starting to return chunks. Our last chunk has + // sample_percentage * RESERVOIR_THRESHOLD entries that hold samples. + // if our current count is less than the sample_percentage * RESERVOIR_THRESHOLD + // then we have sampled too much for the current_sample and we need to redo the sample + // otherwise we can just push the current sample back + // Imagine sampling 70% of 100 rows (so 70 rows). We allocate sample_percentage * RESERVOIR_THRESHOLD + // entries up front, so the unfinished sample has to be re-sampled down to 70 rows. + auto sampled_more_than_required = + static_cast<double>(current_count) > sample_percentage * RESERVOIR_THRESHOLD || finished_samples.empty(); + if (current_count > 0 && sampled_more_than_required) { + // create a new sample + auto new_sample_size = static_cast<idx_t>(round(sample_percentage * static_cast<double>(current_count))); + auto new_sample = make_uniq<ReservoirSample>(allocator, new_sample_size, base_reservoir_sample->random()); + while (true) { + auto chunk = current_sample->GetChunk(); + if (!chunk || chunk->size() == 0) { + break; + } + new_sample->AddToReservoir(*chunk); + } + finished_samples.push_back(std::move(new_sample)); + } else { + finished_samples.push_back(std::move(current_sample)); + } + // when finalizing, current_sample is null. All samples are now in finished_samples. + current_sample = nullptr; + is_finalized = true; +} + +} // namespace duckdb
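Finalize re-samples the unfinished window when it would otherwise over-represent its rows; the decision and the new sample size as a standalone sketch (RESERVOIR_THRESHOLD assumed):

```cpp
// Finalize() decision for an unfinished 70% sample that saw only 100 rows.
#include <cmath>
#include <cstdio>

int main() {
	double sample_percentage = 0.70;
	double reservoir_threshold = 100000; // capacity of one percentage-sample window (assumed)
	unsigned long long current_count = 100;
	bool finished_samples_empty = true;
	bool sampled_more_than_required =
	    static_cast<double>(current_count) > sample_percentage * reservoir_threshold || finished_samples_empty;
	if (current_count > 0 && sampled_more_than_required) {
		// re-sample the unfinished window down to 70% of the rows it actually saw
		auto new_sample_size = static_cast<unsigned long long>(std::round(sample_percentage * current_count));
		std::printf("re-sample down to %llu rows\n", new_sample_size); // 70
	}
	return 0;
}
```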
diff --git a/src/duckdb/src/function/table/system/pragma_table_sample.cpp b/src/duckdb/src/function/table/system/pragma_table_sample.cpp new file mode 100644 index 00000000..7f4122b9 --- /dev/null +++ b/src/duckdb/src/function/table/system/pragma_table_sample.cpp @@ -0,0 +1,95 @@ +#include "duckdb/function/table/system_functions.hpp" + +#include "duckdb/catalog/catalog.hpp" +#include "duckdb/catalog/catalog_entry/table_catalog_entry.hpp" +#include "duckdb/catalog/catalog_entry/view_catalog_entry.hpp" +#include "duckdb/parser/qualified_name.hpp" +#include "duckdb/parser/constraints/not_null_constraint.hpp" +#include "duckdb/parser/constraints/unique_constraint.hpp" +#include "duckdb/planner/expression/bound_parameter_expression.hpp" +#include "duckdb/planner/binder.hpp" + +#include "duckdb/common/exception.hpp" +#include "duckdb/common/limits.hpp" + +#include + +namespace duckdb { + +struct DuckDBTableSampleFunctionData : public TableFunctionData { + explicit DuckDBTableSampleFunctionData(CatalogEntry &entry_p) : entry(entry_p) { + } + CatalogEntry &entry; +}; + +struct DuckDBTableSampleOperatorData : public GlobalTableFunctionState { + DuckDBTableSampleOperatorData() : sample_offset(0) { + sample = nullptr; + } + idx_t sample_offset; + unique_ptr<BlockingSample> sample; +}; + +static unique_ptr<FunctionData> DuckDBTableSampleBind(ClientContext &context, TableFunctionBindInput &input, + vector<LogicalType> &return_types, vector<string> &names) { + + // look up the table name in the catalog + auto qname = QualifiedName::Parse(input.inputs[0].GetValue<string>()); + Binder::BindSchemaOrCatalog(context, qname.catalog, qname.schema); + + auto &entry = Catalog::GetEntry(context, CatalogType::TABLE_ENTRY, qname.catalog, qname.schema, qname.name); + if (entry.type != CatalogType::TABLE_ENTRY) { + throw NotImplementedException("Invalid Catalog type passed to table_sample()"); + } + auto &table_entry = entry.Cast<TableCatalogEntry>(); + auto types = table_entry.GetTypes(); + for (auto &type : types) { + return_types.push_back(type); + } + for (idx_t i = 0; i < types.size(); i++) { + auto logical_index = LogicalIndex(i); + auto &col = table_entry.GetColumn(logical_index); + names.push_back(col.GetName()); + } + + return make_uniq<DuckDBTableSampleFunctionData>(entry); +} + +unique_ptr<GlobalTableFunctionState> DuckDBTableSampleInit(ClientContext &context, TableFunctionInitInput &input) { + return make_uniq<DuckDBTableSampleOperatorData>(); +} + +static void DuckDBTableSampleTable(ClientContext &context, DuckDBTableSampleOperatorData &data, + TableCatalogEntry &table, DataChunk &output) { + // if the table has statistics, copy the statistics sample into the output chunk + if (!data.sample) { + data.sample = table.GetSample(); + } + if (data.sample) { + auto sample_chunk = data.sample->GetChunk(); + if (sample_chunk) { + sample_chunk->Copy(output, 0); + data.sample_offset += sample_chunk->size(); + } + } +} + +static void DuckDBTableSampleFunction(ClientContext &context, TableFunctionInput &data_p, DataChunk &output) { + auto &bind_data = data_p.bind_data->Cast<DuckDBTableSampleFunctionData>(); + auto &state = data_p.global_state->Cast<DuckDBTableSampleOperatorData>(); + switch (bind_data.entry.type) { + case CatalogType::TABLE_ENTRY: + DuckDBTableSampleTable(context, state, bind_data.entry.Cast<TableCatalogEntry>(), output); + break; + default: + throw NotImplementedException("Unimplemented catalog type for pragma_table_sample"); + } +} + +void DuckDBTableSample::RegisterFunction(BuiltinFunctions &set) { + set.AddFunction(TableFunction("duckdb_table_sample", {LogicalType::VARCHAR}, DuckDBTableSampleFunction, + DuckDBTableSampleBind, DuckDBTableSampleInit)); +} + +} // namespace duckdb diff --git a/src/duckdb/src/function/table/system_functions.cpp b/src/duckdb/src/function/table/system_functions.cpp index 12e8bcc3..7560221c 100644 --- a/src/duckdb/src/function/table/system_functions.cpp +++ b/src/duckdb/src/function/table/system_functions.cpp @@ -34,6 +34,7 @@ void BuiltinFunctions::RegisterSQLiteFunctions() { DuckDBSequencesFun::RegisterFunction(*this); DuckDBSettingsFun::RegisterFunction(*this); DuckDBTablesFun::RegisterFunction(*this); + DuckDBTableSample::RegisterFunction(*this); DuckDBTemporaryFilesFun::RegisterFunction(*this); DuckDBTypesFun::RegisterFunction(*this); DuckDBVariablesFun::RegisterFunction(*this); diff --git a/src/duckdb/src/function/table/version/pragma_version.cpp b/src/duckdb/src/function/table/version/pragma_version.cpp index 4d7dde1f..bdcf6722 100644 --- a/src/duckdb/src/function/table/version/pragma_version.cpp +++ b/src/duckdb/src/function/table/version/pragma_version.cpp @@ -1,5 +1,5 @@ #ifndef DUCKDB_PATCH_VERSION -#define DUCKDB_PATCH_VERSION "4-dev3393" +#define DUCKDB_PATCH_VERSION "4-dev3588" #endif #ifndef DUCKDB_MINOR_VERSION #define DUCKDB_MINOR_VERSION 1 @@ -8,10 +8,10 @@ #define DUCKDB_MAJOR_VERSION 1 #endif #ifndef DUCKDB_VERSION -#define DUCKDB_VERSION "v1.1.4-dev3393" +#define DUCKDB_VERSION "v1.1.4-dev3588" #endif #ifndef DUCKDB_SOURCE_ID -#define DUCKDB_SOURCE_ID "bcd65821a9" +#define DUCKDB_SOURCE_ID "a91feadf8c" #endif #include "duckdb/function/table/system_functions.hpp" #include "duckdb/main/database.hpp" diff --git a/src/duckdb/src/include/duckdb/catalog/catalog_entry/duck_table_entry.hpp b/src/duckdb/src/include/duckdb/catalog/catalog_entry/duck_table_entry.hpp index ce09c4fe..fb9d5ae6 100644 --- a/src/duckdb/src/include/duckdb/catalog/catalog_entry/duck_table_entry.hpp +++ b/src/duckdb/src/include/duckdb/catalog/catalog_entry/duck_table_entry.hpp @@ -35,6 +35,8 @@ class DuckTableEntry : public TableCatalogEntry { //! 
Get statistics of a column (physical or virtual) within the table unique_ptr GetStatistics(ClientContext &context, column_t column_id) override; + unique_ptr GetSample() override; + unique_ptr Copy(ClientContext &context) const override; void SetAsRoot() override; diff --git a/src/duckdb/src/include/duckdb/catalog/catalog_entry/table_catalog_entry.hpp b/src/duckdb/src/include/duckdb/catalog/catalog_entry/table_catalog_entry.hpp index d2e47e6d..5ce45af7 100644 --- a/src/duckdb/src/include/duckdb/catalog/catalog_entry/table_catalog_entry.hpp +++ b/src/duckdb/src/include/duckdb/catalog/catalog_entry/table_catalog_entry.hpp @@ -13,6 +13,7 @@ #include "duckdb/parser/column_list.hpp" #include "duckdb/parser/constraint.hpp" #include "duckdb/planner/bound_constraint.hpp" +#include "duckdb/storage/table/table_statistics.hpp" #include "duckdb/planner/expression.hpp" #include "duckdb/common/case_insensitive_map.hpp" #include "duckdb/catalog/catalog_entry/table_column_type.hpp" @@ -82,6 +83,8 @@ class TableCatalogEntry : public StandardEntry { //! Get statistics of a column (physical or virtual) within the table virtual unique_ptr GetStatistics(ClientContext &context, column_t column_id) = 0; + virtual unique_ptr GetSample(); + //! Returns the column index of the specified column name. //! If the column does not exist: //! If if_column_exists is true, returns DConstants::INVALID_INDEX diff --git a/src/duckdb/src/include/duckdb/common/enum_util.hpp b/src/duckdb/src/include/duckdb/common/enum_util.hpp index ce92dd77..6057a89b 100644 --- a/src/duckdb/src/include/duckdb/common/enum_util.hpp +++ b/src/duckdb/src/include/duckdb/common/enum_util.hpp @@ -32,6 +32,10 @@ struct EnumUtil { static string ToString(T value) { return string(ToChars(value)); } }; +enum class ARTAppendMode : uint8_t; + +enum class ARTConflictType : uint8_t; + enum class AccessMode : uint8_t; enum class AggregateCombineType : uint8_t; @@ -278,6 +282,8 @@ enum class SampleMethod : uint8_t; enum class SampleType : uint8_t; +enum class SamplingState : uint8_t; + enum class ScanType : uint8_t; enum class SecretDisplayType : uint8_t; @@ -373,6 +379,12 @@ enum class WindowBoundary : uint8_t; enum class WindowExcludeMode : uint8_t; +template<> +const char* EnumUtil::ToChars(ARTAppendMode value); + +template<> +const char* EnumUtil::ToChars(ARTConflictType value); + template<> const char* EnumUtil::ToChars(AccessMode value); @@ -742,6 +754,9 @@ const char* EnumUtil::ToChars(SampleMethod value); template<> const char* EnumUtil::ToChars(SampleType value); +template<> +const char* EnumUtil::ToChars(SamplingState value); + template<> const char* EnumUtil::ToChars(ScanType value); @@ -884,6 +899,12 @@ template<> const char* EnumUtil::ToChars(WindowExcludeMode value); +template<> +ARTAppendMode EnumUtil::FromString(const char *value); + +template<> +ARTConflictType EnumUtil::FromString(const char *value); + template<> AccessMode EnumUtil::FromString(const char *value); @@ -1253,6 +1274,9 @@ SampleMethod EnumUtil::FromString(const char *value); template<> SampleType EnumUtil::FromString(const char *value); +template<> +SamplingState EnumUtil::FromString(const char *value); + template<> ScanType EnumUtil::FromString(const char *value); diff --git a/src/duckdb/src/include/duckdb/common/random_engine.hpp b/src/duckdb/src/include/duckdb/common/random_engine.hpp index 970db6ce..59531e1d 100644 --- a/src/duckdb/src/include/duckdb/common/random_engine.hpp +++ b/src/duckdb/src/include/duckdb/common/random_engine.hpp @@ -18,11 +18,11 @@ namespace duckdb { 
class ClientContext; struct RandomState; -struct RandomEngine { +class RandomEngine { +public: explicit RandomEngine(int64_t seed = -1); ~RandomEngine(); -public: //! Generate a random number between min and max double NextRandom(double min, double max); @@ -31,6 +31,7 @@ struct RandomEngine { //! Generate a random number between 0 and 1, using 32-bits as a base double NextRandom32(); double NextRandom32(double min, double max); + uint32_t NextRandomInteger32(uint32_t min, uint32_t max); uint32_t NextRandomInteger(); uint32_t NextRandomInteger(uint32_t min, uint32_t max); uint64_t NextRandomInteger64(); diff --git a/src/duckdb/src/include/duckdb/common/serializer/serializer.hpp b/src/duckdb/src/include/duckdb/common/serializer/serializer.hpp index 60531fd1..54b994e4 100644 --- a/src/duckdb/src/include/duckdb/common/serializer/serializer.hpp +++ b/src/duckdb/src/include/duckdb/common/serializer/serializer.hpp @@ -16,6 +16,7 @@ #include "duckdb/common/types/uhugeint.hpp" #include "duckdb/common/unordered_map.hpp" #include "duckdb/common/unordered_set.hpp" +#include "duckdb/common/queue.hpp" #include "duckdb/common/optional_idx.hpp" #include "duckdb/common/optionally_owned_ptr.hpp" #include "duckdb/common/value_operations/value_operations.hpp" diff --git a/src/duckdb/src/include/duckdb/common/tree_renderer/text_tree_renderer.hpp b/src/duckdb/src/include/duckdb/common/tree_renderer/text_tree_renderer.hpp index b34144a3..1bf43191 100644 --- a/src/duckdb/src/include/duckdb/common/tree_renderer/text_tree_renderer.hpp +++ b/src/duckdb/src/include/duckdb/common/tree_renderer/text_tree_renderer.hpp @@ -112,7 +112,8 @@ class TextTreeRenderer : public TreeRenderer { bool CanSplitOnThisChar(char l); bool IsPadding(char l); string RemovePadding(string l); - void SplitUpExtraInfo(const InsertionOrderPreservingMap &extra_info, vector &result); + void SplitUpExtraInfo(const InsertionOrderPreservingMap &extra_info, vector &result, + idx_t max_lines); void SplitStringBuffer(const string &source, vector &result); }; diff --git a/src/duckdb/src/include/duckdb/common/types.hpp b/src/duckdb/src/include/duckdb/common/types.hpp index 5c2492f8..216e1a36 100644 --- a/src/duckdb/src/include/duckdb/common/types.hpp +++ b/src/duckdb/src/include/duckdb/common/types.hpp @@ -346,6 +346,9 @@ struct LogicalType { DUCKDB_API bool IsValid() const; DUCKDB_API bool IsComplete() const; + //! True, if this type supports in-place updates. + bool SupportsRegularUpdate() const; + private: LogicalTypeId id_; // NOLINT: allow this naming for legacy reasons diff --git a/src/duckdb/src/include/duckdb/common/types/conflict_manager.hpp b/src/duckdb/src/include/duckdb/common/types/conflict_manager.hpp index 5d63365d..f120b91d 100644 --- a/src/duckdb/src/include/duckdb/common/types/conflict_manager.hpp +++ b/src/duckdb/src/include/duckdb/common/types/conflict_manager.hpp @@ -38,15 +38,24 @@ class ConflictManager { VerifyExistenceType LookupType() const; // This should be called before using the conflicts selection vector void Finalize(); - idx_t ConflictCount() const; - const ManagedSelection &Conflicts() const; + Vector &RowIds(); const ConflictInfo &GetConflictInfo() const; void FinishLookup(); void SetMode(ConflictManagerMode mode); - void AddIndex(BoundIndex &index); + + //! Returns a reference to all conflicts in this conflict manager. + const ManagedSelection &Conflicts() const; + //! Returns the number of conflicts in this conflict manager. + idx_t ConflictCount() const; + //! 
Adds an index and its respective delete_index to the conflict manager's matches. + void AddIndex(BoundIndex &index, optional_ptr delete_index); + //! Returns true, if the index is in this conflict manager. bool MatchedIndex(BoundIndex &index); - const unordered_set &MatchedIndexes() const; + //! Returns a reference to the matched indexes. + const vector> &MatchedIndexes() const; + //! Returns a reference to the matched delete indexes. + const vector> &MatchedDeleteIndexes() const; private: bool IsConflict(LookupResultType type); @@ -66,7 +75,6 @@ class ConflictManager { optional_ptr conflict_info; bool finalized = false; ManagedSelection conflicts; - unordered_set matched_indexes; unique_ptr row_ids; // Used to check if a given conflict is part of the conflict target or not unique_ptr> conflict_set; @@ -77,6 +85,13 @@ class ConflictManager { // Whether we have already found the one conflict target we're interested in bool single_index_finished = false; ConflictManagerMode mode; + + //! Indexes matching the conflict target. + vector> matched_indexes; + //! Delete indexes matching the conflict target. + vector> matched_delete_indexes; + //! All matched indexes by their name, which is their unique identifier. + case_insensitive_set_t matched_index_names; }; } // namespace duckdb diff --git a/src/duckdb/src/include/duckdb/common/types/uuid.hpp b/src/duckdb/src/include/duckdb/common/types/uuid.hpp index 5573aac6..bf5ade17 100644 --- a/src/duckdb/src/include/duckdb/common/types/uuid.hpp +++ b/src/duckdb/src/include/duckdb/common/types/uuid.hpp @@ -13,7 +13,7 @@ namespace duckdb { class ClientContext; -struct RandomEngine; +class RandomEngine; //! The UUID class contains static operations for the UUID type class UUID { diff --git a/src/duckdb/src/include/duckdb/execution/aggregate_hashtable.hpp b/src/duckdb/src/include/duckdb/execution/aggregate_hashtable.hpp index 4ceb56a7..e44c9036 100644 --- a/src/duckdb/src/include/duckdb/execution/aggregate_hashtable.hpp +++ b/src/duckdb/src/include/duckdb/execution/aggregate_hashtable.hpp @@ -94,6 +94,10 @@ class GroupedAggregateHashTable : public BaseAggregateHashTable { void SetRadixBits(idx_t radix_bits); //! Get the radix bits for this HT idx_t GetRadixBits() const; + //! Get the total amount of data sunk into this HT + idx_t GetSinkCount() const; + //! Skips lookups from here on out + void SkipLookups(); //! Executes the filter(if any) and update the aggregates void Combine(GroupedAggregateHashTable &other); @@ -159,6 +163,11 @@ class GroupedAggregateHashTable : public BaseAggregateHashTable { //! Bitmask for getting relevant bits from the hashes to determine the position hash_t bitmask; + //! How many tuples went into this HT (before de-duplication) + idx_t sink_count; + //! If true, we just append, skipping HT lookups + bool skip_lookups; + //! The active arena allocator used by the aggregates for their internal state shared_ptr aggregate_allocator; //! Owning arena allocators that this HT has data from diff --git a/src/duckdb/src/include/duckdb/execution/index/art/art.hpp b/src/duckdb/src/include/duckdb/execution/index/art/art.hpp index 00299952..31a560ad 100644 --- a/src/duckdb/src/include/duckdb/execution/index/art/art.hpp +++ b/src/duckdb/src/include/duckdb/execution/index/art/art.hpp @@ -14,14 +14,10 @@ namespace duckdb { -enum class VerifyExistenceType : uint8_t { - // Appends to a table. - APPEND = 0, - // Appends to a table that has a foreign key. - APPEND_FK = 1, - // Delete from a table that has a foreign key. 
diff --git a/src/duckdb/src/include/duckdb/common/types/uuid.hpp b/src/duckdb/src/include/duckdb/common/types/uuid.hpp index 5573aac6..bf5ade17 100644 --- a/src/duckdb/src/include/duckdb/common/types/uuid.hpp +++ b/src/duckdb/src/include/duckdb/common/types/uuid.hpp @@ -13,7 +13,7 @@ namespace duckdb { class ClientContext; -struct RandomEngine; +class RandomEngine; //! The UUID class contains static operations for the UUID type class UUID { diff --git a/src/duckdb/src/include/duckdb/execution/aggregate_hashtable.hpp b/src/duckdb/src/include/duckdb/execution/aggregate_hashtable.hpp index 4ceb56a7..e44c9036 100644 --- a/src/duckdb/src/include/duckdb/execution/aggregate_hashtable.hpp +++ b/src/duckdb/src/include/duckdb/execution/aggregate_hashtable.hpp @@ -94,6 +94,10 @@ class GroupedAggregateHashTable : public BaseAggregateHashTable { void SetRadixBits(idx_t radix_bits); //! Get the radix bits for this HT idx_t GetRadixBits() const; + //! Get the total amount of data sunk into this HT + idx_t GetSinkCount() const; + //! Skips lookups from here on out + void SkipLookups(); //! Executes the filter (if any) and updates the aggregates void Combine(GroupedAggregateHashTable &other); @@ -159,6 +163,11 @@ class GroupedAggregateHashTable : public BaseAggregateHashTable { //! Bitmask for getting relevant bits from the hashes to determine the position hash_t bitmask; + //! How many tuples went into this HT (before de-duplication) + idx_t sink_count; + //! If true, we just append, skipping HT lookups + bool skip_lookups; + //! The active arena allocator used by the aggregates for their internal state shared_ptr<ArenaAllocator> aggregate_allocator; //! Owning arena allocators that this HT has data from diff --git a/src/duckdb/src/include/duckdb/execution/index/art/art.hpp b/src/duckdb/src/include/duckdb/execution/index/art/art.hpp index 00299952..31a560ad 100644 --- a/src/duckdb/src/include/duckdb/execution/index/art/art.hpp +++ b/src/duckdb/src/include/duckdb/execution/index/art/art.hpp @@ -14,14 +14,10 @@ namespace duckdb { -enum class VerifyExistenceType : uint8_t { - // Appends to a table. - APPEND = 0, - // Appends to a table that has a foreign key. - APPEND_FK = 1, - // Delete from a table that has a foreign key. - DELETE_FK = 2 -}; +enum class VerifyExistenceType : uint8_t { APPEND = 0, APPEND_FK = 1, DELETE_FK = 2 }; +enum class ARTConflictType : uint8_t { NO_CONFLICT = 0, CONSTRAINT = 1, TRANSACTION = 2 }; +enum class ARTAppendMode : uint8_t { DEFAULT = 0, IGNORE_DUPLICATES = 1, INSERT_DUPLICATES = 2 }; + class ConflictManager; class ARTKey; class ARTKeySection; @@ -66,6 +62,8 @@ class ART : public BoundIndex { bool owns_data; //! The number of bytes fitting in the prefix. uint8_t prefix_count; + //! The append mode. + ARTAppendMode append_mode; public: //! Try to initialize a scan on the ART with the given expression and filter. @@ -74,15 +72,23 @@ class ART : public BoundIndex { //! If all row IDs were fetched, it returns true, else false. bool Scan(IndexScanState &state, idx_t max_count, unsafe_vector<row_t> &row_ids); - //! Append a chunk by first executing the ART's expressions. - ErrorData Append(IndexLock &lock, DataChunk &input, Vector &row_ids) override; + //! Appends data to the locked index. + ErrorData Append(IndexLock &l, DataChunk &chunk, Vector &row_ids) override; + //! Appends data to the locked index and verifies constraint violations against a delete index. + ErrorData AppendWithDeleteIndex(IndexLock &l, DataChunk &chunk, Vector &row_ids, + optional_ptr<BoundIndex> delete_index) override; + + //! Internally inserts a chunk. + ARTConflictType Insert(Node &node, const ARTKey &key, idx_t depth, const ARTKey &row_id, const GateStatus status, + optional_ptr<ART> delete_art); //! Insert a chunk. - bool Insert(Node &node, const ARTKey &key, idx_t depth, const ARTKey &row_id, const GateStatus status); - ErrorData Insert(IndexLock &lock, DataChunk &data, Vector &row_ids) override; + ErrorData Insert(IndexLock &l, DataChunk &chunk, Vector &row_ids) override; + //! Inserts a chunk and verifies constraint violations against a delete index. + ErrorData Insert(IndexLock &l, DataChunk &data, Vector &row_ids, optional_ptr<BoundIndex> delete_index) override; - //! Constraint verification for a chunk. - void VerifyAppend(DataChunk &chunk) override; - void VerifyAppend(DataChunk &chunk, ConflictManager &conflict_manager) override; + //! Verify that data can be appended to the index without a constraint violation. + void VerifyAppend(DataChunk &chunk, optional_ptr<BoundIndex> delete_index, + optional_ptr<ConflictManager> manager) override; //! Delete a chunk from the ART. void Delete(IndexLock &lock, DataChunk &entries, Vector &row_ids) override; @@ -124,12 +130,16 @@ class ART : public BoundIndex { void InsertIntoEmpty(Node &node, const ARTKey &key, const idx_t depth, const ARTKey &row_id, const GateStatus status); - bool InsertIntoNode(Node &node, const ARTKey &key, const idx_t depth, const ARTKey &row_id, - const GateStatus status); + ARTConflictType InsertIntoInlined(Node &node, const ARTKey &key, const idx_t depth, const ARTKey &row_id, + const GateStatus status, optional_ptr<ART> delete_art); + ARTConflictType InsertIntoNode(Node &node, const ARTKey &key, const idx_t depth, const ARTKey &row_id, + const GateStatus status, optional_ptr<ART> delete_art); string GenerateErrorKeyName(DataChunk &input, idx_t row); string GenerateConstraintErrorMessage(VerifyExistenceType verify_type, const string &key_name); - void CheckConstraintsForChunk(DataChunk &input, ConflictManager &conflict_manager) override; + void VerifyLeaf(const Node &leaf, const ARTKey &key, optional_ptr<ART> delete_art, ConflictManager &manager, + optional_idx &conflict_idx, idx_t i); + void VerifyConstraint(DataChunk &chunk, optional_ptr<BoundIndex> delete_index, ConflictManager &manager) override; string GetConstraintViolationMessage(VerifyExistenceType verify_type, idx_t failed_index, DataChunk &input) override;
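The delete_art parameter threaded through Insert, InsertIntoInlined, and InsertIntoNode drives the tri-state ARTConflictType above: a duplicate key only counts as a constraint violation if the conflicting row id is not covered by the delete index. A rough standalone analog of that decision, simplified to std::map/std::set instead of ART leaves:

```cpp
// Hypothetical analog of ARTConflictType-style checking: an insert into `art`
// conflicts only if the key exists AND its current row id is not pending
// deletion in `delete_art`. Illustrative only, not DuckDB's implementation.
#include <cstdint>
#include <iostream>
#include <map>
#include <set>
#include <string>

enum class MiniConflictType : uint8_t { NO_CONFLICT, CONSTRAINT };

MiniConflictType Insert(std::map<std::string, int64_t> &art, const std::set<int64_t> *delete_art,
                        const std::string &key, int64_t row_id) {
	auto it = art.find(key);
	if (it == art.end()) {
		art.emplace(key, row_id); // no existing entry: plain insert
		return MiniConflictType::NO_CONFLICT;
	}
	if (delete_art && delete_art->count(it->second)) {
		it->second = row_id; // the old row is being deleted: not a violation
		return MiniConflictType::NO_CONFLICT;
	}
	return MiniConflictType::CONSTRAINT;
}

int main() {
	std::map<std::string, int64_t> art {{"k", 1}};
	std::set<int64_t> delete_art {1}; // row 1 is deleted in this transaction
	// Re-inserting "k" succeeds because row 1 sits in the delete index.
	std::cout << (Insert(art, &delete_art, "k", 2) == MiniConflictType::NO_CONFLICT) << "\n"; // 1
	// Without the delete index, the same insert is a constraint violation.
	std::cout << (Insert(art, nullptr, "k", 3) == MiniConflictType::CONSTRAINT) << "\n"; // 1
}
```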
diff --git a/src/duckdb/src/include/duckdb/execution/index/art/node.hpp b/src/duckdb/src/include/duckdb/execution/index/art/node.hpp index 725cc2d3..316ecc5f 100644 --- a/src/duckdb/src/include/duckdb/execution/index/art/node.hpp +++ b/src/duckdb/src/include/duckdb/execution/index/art/node.hpp @@ -143,6 +143,7 @@ class Node : public IndexPointer { inline void SetGateStatus(const GateStatus status) { switch (status) { case GateStatus::GATE_SET: + D_ASSERT(GetType() != NType::LEAF_INLINED); SetMetadata(GetMetadata() | AND_GATE); break; case GateStatus::GATE_NOT_SET: diff --git a/src/duckdb/src/include/duckdb/execution/index/art/prefix.hpp b/src/duckdb/src/include/duckdb/execution/index/art/prefix.hpp index 670ab32e..5d656c74 100644 --- a/src/duckdb/src/include/duckdb/execution/index/art/prefix.hpp +++ b/src/duckdb/src/include/duckdb/execution/index/art/prefix.hpp @@ -84,8 +84,8 @@ class Prefix { static GateStatus Split(ART &art, reference<Node> &node, Node &child, const uint8_t pos); //! Insert a key into a prefix. - static bool Insert(ART &art, Node &node, const ARTKey &key, idx_t depth, const ARTKey &row_id, - const GateStatus status); + static ARTConflictType Insert(ART &art, Node &node, const ARTKey &key, idx_t depth, const ARTKey &row_id, + const GateStatus status, optional_ptr<ART> delete_art); //! Returns the string representation of the node, or only traverses and verifies the node and its subtree static string VerifyAndToString(ART &art, const Node &node, const bool only_verify); diff --git a/src/duckdb/src/include/duckdb/execution/index/bound_index.hpp b/src/duckdb/src/include/duckdb/execution/index/bound_index.hpp index a73ede0f..522c7c78 100644 --- a/src/duckdb/src/include/duckdb/execution/index/bound_index.hpp +++ b/src/duckdb/src/include/duckdb/execution/index/bound_index.hpp @@ -1,7 +1,7 @@ //===----------------------------------------------------------------------===// // DuckDB // -// duckdb/storage/index.hpp +// duckdb/execution/index/bound_index.hpp // // //===----------------------------------------------------------------------===// @@ -15,8 +15,8 @@ #include "duckdb/execution/expression_executor.hpp" #include "duckdb/parser/parsed_expression.hpp" #include "duckdb/planner/expression.hpp" -#include "duckdb/storage/table_storage_info.hpp" #include "duckdb/storage/index.hpp" +#include "duckdb/storage/table_storage_info.hpp" namespace duckdb { @@ -64,19 +64,24 @@ class BoundIndex : public Index { return index_constraint_type; } -public: // Index interface - //! Obtain a lock on the index +public: + //! Obtains a lock on the index. void InitializeLock(IndexLock &state); - //! Called when data is appended to the index. The lock obtained from InitializeLock must be held - virtual ErrorData Append(IndexLock &state, DataChunk &entries, Vector &row_identifiers) = 0; - //! Obtains a lock and calls Append while holding that lock - ErrorData Append(DataChunk &entries, Vector &row_identifiers); - //! Verify that data can be appended to the index without a constraint violation - virtual void VerifyAppend(DataChunk &chunk) = 0; - //! Verify that data can be appended to the index without a constraint violation using the conflict manager - virtual void VerifyAppend(DataChunk &chunk, ConflictManager &conflict_manager) = 0; - //! Performs constraint checking for a chunk of input data - virtual void CheckConstraintsForChunk(DataChunk &input, ConflictManager &conflict_manager) = 0; + //! Appends data to the locked index. + virtual ErrorData Append(IndexLock &l, DataChunk &chunk, Vector &row_ids) = 0; + //! Obtains a lock and calls Append while holding that lock. + ErrorData Append(DataChunk &chunk, Vector &row_ids); + //! Appends data to the locked index and verifies constraint violations against a delete index. + virtual ErrorData AppendWithDeleteIndex(IndexLock &l, DataChunk &chunk, Vector &row_ids, + optional_ptr<BoundIndex> delete_index); + //! Obtains a lock and calls Append with a delete_index while holding that lock. + ErrorData AppendWithDeleteIndex(DataChunk &chunk, Vector &row_ids, optional_ptr<BoundIndex> delete_index); + + //! Verify that data can be appended to the index without a constraint violation. + virtual void VerifyAppend(DataChunk &chunk, optional_ptr<BoundIndex> delete_index, + optional_ptr<ConflictManager> manager); + //! Verifies the constraint for a chunk of data. + virtual void VerifyConstraint(DataChunk &chunk, optional_ptr<BoundIndex> delete_index, ConflictManager &manager); //! Deletes all data from the index. The lock obtained from InitializeLock must be held virtual void CommitDrop(IndexLock &index_lock) = 0; @@ -87,8 +92,10 @@ class BoundIndex : public Index { //! Obtains a lock and calls Delete while holding that lock void Delete(DataChunk &entries, Vector &row_identifiers); - //! Insert a chunk of entries into the index - virtual ErrorData Insert(IndexLock &lock, DataChunk &input, Vector &row_identifiers) = 0; + //! Insert a chunk. + virtual ErrorData Insert(IndexLock &l, DataChunk &chunk, Vector &row_ids) = 0; + //! Inserts a chunk and verifies constraint violations against a delete index. + virtual ErrorData Insert(IndexLock &l, DataChunk &chunk, Vector &row_ids, optional_ptr<BoundIndex> delete_index); //! Merge another index into this index. The lock obtained from InitializeLock must be held, and the other //! index must also be locked during the merge diff --git a/src/duckdb/src/include/duckdb/execution/operator/persistent/physical_insert.hpp b/src/duckdb/src/include/duckdb/execution/operator/persistent/physical_insert.hpp index 29666460..ccb113c4 100644 --- a/src/duckdb/src/include/duckdb/execution/operator/persistent/physical_insert.hpp +++ b/src/duckdb/src/include/duckdb/execution/operator/persistent/physical_insert.hpp @@ -15,6 +15,7 @@ #include "duckdb/parser/statement/insert_statement.hpp" #include "duckdb/storage/table/append_state.hpp" #include "duckdb/catalog/catalog_entry/duck_table_entry.hpp" +#include "duckdb/storage/table/delete_state.hpp" namespace duckdb { @@ -42,7 +43,8 @@ class InsertLocalState : public LocalSinkState { const vector<unique_ptr<BoundConstraint>> &bound_constraints); public: - ConstraintState &GetConstraintState(DataTable &table, TableCatalogEntry &tableref); + ConstraintState &GetConstraintState(DataTable &table, TableCatalogEntry &table_ref); + TableDeleteState &GetDeleteState(DataTable &table, TableCatalogEntry &table_ref, ClientContext &context); public: //! The chunk that ends up getting inserted @@ -58,6 +60,10 @@ class InsertLocalState : public LocalSinkState { idx_t update_count = 0; unique_ptr<ConstraintState> constraint_state; const vector<unique_ptr<BoundConstraint>> &bound_constraints; + //! The delete state for ON CONFLICT handling that is rewritten into DELETE + INSERT. + unique_ptr<TableDeleteState> delete_state; + //! The append chunk for ON CONFLICT handling that is rewritten into DELETE + INSERT. + DataChunk append_chunk; }; //! Physically insert a set of data into a table @@ -73,7 +79,7 @@ class PhysicalInsert : public PhysicalOperator { vector<LogicalType> set_types, idx_t estimated_cardinality, bool return_chunk, bool parallel, OnConflictAction action_type, unique_ptr<Expression> on_conflict_condition, unique_ptr<Expression> do_update_condition, unordered_set<column_t> on_conflict_filter, - vector<column_t> columns_to_fetch); + vector<column_t> columns_to_fetch, bool update_is_del_and_insert); //! CREATE TABLE AS PhysicalInsert(LogicalOperator &op, SchemaCatalogEntry &schema, unique_ptr<BoundCreateTableInfo> info, idx_t estimated_cardinality, bool parallel); @@ -113,6 +119,8 @@ class PhysicalInsert : public PhysicalOperator { unique_ptr<Expression> do_update_condition; // The column ids to apply the ON CONFLICT on unordered_set<column_t> conflict_target; + //! True, if the INSERT OR REPLACE requires delete + insert. + bool update_is_del_and_insert; // Column ids from the original table to fetch vector<column_t> columns_to_fetch;
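The update_is_del_and_insert path above services statements like INSERT OR REPLACE when the conflicting row cannot be updated in place, for example when an indexed column changes. A small driver exercising that statement shape through the stable C API (the table and column names are made up; all calls appear in the function-pointer struct later in this diff):

```cpp
// Exercises the ON CONFLICT path that PhysicalInsert may rewrite into
// DELETE + INSERT, using only documented DuckDB C API calls.
#include <cstdio>
#include "duckdb.h"

int main() {
	duckdb_database db;
	duckdb_connection con;
	if (duckdb_open(nullptr, &db) != DuckDBSuccess || duckdb_connect(db, &con) != DuckDBSuccess) {
		return 1;
	}
	duckdb_query(con, "CREATE TABLE users (id INTEGER PRIMARY KEY, name VARCHAR)", nullptr);
	duckdb_query(con, "INSERT INTO users VALUES (1, 'alice')", nullptr);
	// Conflicts on id = 1; the existing row is replaced rather than appended.
	duckdb_state state = duckdb_query(con, "INSERT OR REPLACE INTO users VALUES (1, 'bob')", nullptr);
	printf("replace %s\n", state == DuckDBSuccess ? "ok" : "failed");
	duckdb_disconnect(&con);
	duckdb_close(&db);
	return 0;
}
```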
diff --git a/src/duckdb/src/include/duckdb/execution/operator/persistent/physical_update.hpp b/src/duckdb/src/include/duckdb/execution/operator/persistent/physical_update.hpp index 9556b48c..d86b805e 100644 --- a/src/duckdb/src/include/duckdb/execution/operator/persistent/physical_update.hpp +++ b/src/duckdb/src/include/duckdb/execution/operator/persistent/physical_update.hpp @@ -35,6 +35,8 @@ class PhysicalUpdate : public PhysicalOperator { bool update_is_del_and_insert; //! If the returning statement is present, return the whole chunk bool return_chunk; + //! Set to true, if we are updating an index column. + bool index_update; public: // Source interface diff --git a/src/duckdb/src/include/duckdb/execution/physical_operator.hpp b/src/duckdb/src/include/duckdb/execution/physical_operator.hpp index 822b5537..50529d59 100644 --- a/src/duckdb/src/include/duckdb/execution/physical_operator.hpp +++ b/src/duckdb/src/include/duckdb/execution/physical_operator.hpp @@ -164,7 +164,7 @@ class PhysicalOperator { virtual void PrepareFinalize(ClientContext &context, GlobalSinkState &sink_state) const; //! The finalize is called when ALL threads are finished execution. It is called only once per pipeline, and is //! entirely single threaded. - //! If Finalize returns SinkResultType::FINISHED, the sink is marked as finished + //! If Finalize returns SinkResultType::Finished, the sink is marked as finished virtual SinkFinalizeType Finalize(Pipeline &pipeline, Event &event, ClientContext &context, OperatorSinkFinalizeInput &input) const; //! For sinks with RequiresBatchIndex set to true, when a new batch starts being processed this method is called diff --git a/src/duckdb/src/include/duckdb/execution/reservoir_sample.hpp b/src/duckdb/src/include/duckdb/execution/reservoir_sample.hpp index 0edc7e07..b794328b 100644 --- a/src/duckdb/src/include/duckdb/execution/reservoir_sample.hpp +++ b/src/duckdb/src/include/duckdb/execution/reservoir_sample.hpp @@ -12,25 +12,64 @@ #include "duckdb/common/common.hpp" #include "duckdb/common/random_engine.hpp" #include "duckdb/common/types/data_chunk.hpp" +#include "duckdb/common/windows_undefs.hpp" #include "duckdb/common/queue.hpp" +// Originally intended to be the vector size, but in order to run on +// vector size = 2, we had to change it. +#define FIXED_SAMPLE_SIZE 2048 + namespace duckdb { enum class SampleType : uint8_t { BLOCKING_SAMPLE = 0, RESERVOIR_SAMPLE = 1, RESERVOIR_PERCENTAGE_SAMPLE = 2 }; +enum class SamplingState : uint8_t { RANDOM = 0, RESERVOIR = 1 }; + +class ReservoirRNG : public RandomEngine { +public: + // the return type must be called result_type to be a valid URNG + typedef uint32_t result_type; + + explicit ReservoirRNG(int64_t seed) : RandomEngine(seed) {}; + + result_type operator()() { + return NextRandomInteger(); + }; + + static constexpr result_type min() { + return NumericLimits<result_type>::Minimum(); + }; + static constexpr result_type max() { + return NumericLimits<result_type>::Maximum(); + }; +};
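ReservoirRNG exists so the engine's RandomEngine can be handed to <random> and <algorithm> facilities, which require a UniformRandomBitGenerator: a nested result_type plus static min()/max() and operator(). A self-contained illustration of the same pattern, wrapping std::mt19937 instead of RandomEngine so it compiles standalone:

```cpp
// Minimal UniformRandomBitGenerator wrapper, mirroring the ReservoirRNG
// pattern above; MiniRNG is an illustrative stand-in.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <limits>
#include <random>
#include <vector>

class MiniRNG {
public:
	typedef uint32_t result_type; // must be named exactly result_type

	explicit MiniRNG(uint32_t seed) : engine(seed) {
	}

	result_type operator()() {
		return engine();
	}

	static constexpr result_type min() {
		return std::numeric_limits<result_type>::min();
	}
	static constexpr result_type max() {
		return std::numeric_limits<result_type>::max();
	}

private:
	std::mt19937 engine;
};

int main() {
	MiniRNG rng(42);
	std::vector<int> v {1, 2, 3, 4, 5};
	std::shuffle(v.begin(), v.end(), rng); // accepted because MiniRNG satisfies the URNG requirements
	for (int x : v) {
		std::cout << x << ' ';
	}
	std::cout << '\n';
}
```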
+ +//! Reservoir sampling is based on the 2005 paper "Weighted Random Sampling" by Efraimidis and Spirakis class BaseReservoirSampling { public: explicit BaseReservoirSampling(int64_t seed); BaseReservoirSampling(); - void InitializeReservoir(idx_t cur_size, idx_t sample_size); + void InitializeReservoirWeights(idx_t cur_size, idx_t sample_size); void SetNextEntry(); + void ReplaceElementWithIndex(idx_t entry_index, double with_weight, bool pop = true); void ReplaceElement(double with_weight = -1); + + void UpdateMinWeightThreshold(); + + //! Go from the naive sampling to the reservoir sampling + //! Naive sampling will not collect weights, but when we serialize + //! we need to serialize weights again. + void FillWeights(SelectionVector &sel, idx_t &sel_size); + + unique_ptr<BaseReservoirSampling> Copy(); + //! The random generator - RandomEngine random; + ReservoirRNG random; + //! The next element to sample idx_t next_index_to_sample; //! The reservoir threshold of the current min entry @@ -48,6 +87,13 @@ class BaseReservoirSampling { void Serialize(Serializer &serializer) const; static unique_ptr<BaseReservoirSampling> Deserialize(Deserializer &deserializer); + + static double GetMinWeightFromTuplesSeen(idx_t rows_seen_total); + // static unordered_map tuples_to_min_weight_map; + // Blocking sample is a virtual class. It should be allowed to see the weights + // of tuples in the sample. The blocking sample can then easily maintain statistical properties + // from the sample point of view. + friend class BlockingSample; }; class BlockingSample { @@ -61,24 +107,31 @@ class BlockingSample { bool destroyed; public: - explicit BlockingSample(int64_t seed) : old_base_reservoir_sample(seed), random(old_base_reservoir_sample.random) { - base_reservoir_sample = nullptr; + explicit BlockingSample(int64_t seed = -1) + : base_reservoir_sample(make_uniq<BaseReservoirSampling>(seed)), type(SampleType::BLOCKING_SAMPLE), + destroyed(false) { } virtual ~BlockingSample() { } //! Add a chunk of data to the sample virtual void AddToReservoir(DataChunk &input) = 0; - + virtual unique_ptr<BlockingSample> Copy() const = 0; virtual void Finalize() = 0; - //! Fetches a chunk from the sample. Note that this method is destructive and should only be used after the - //! sample is completely built. + virtual void Destroy(); + + //! Fetches a chunk from the sample. destroy = true should only be used when + //! querying from a sample defined in a query and not a duckdb_table_sample. virtual unique_ptr<DataChunk> GetChunk() = 0; - BaseReservoirSampling old_base_reservoir_sample; virtual void Serialize(Serializer &serializer) const; static unique_ptr<BlockingSample> Deserialize(Deserializer &deserializer); + //! Helper functions needed to merge two reservoirs while respecting weights of sampled rows + std::pair<double, idx_t> PopFromWeightQueue(); + double GetMinWeightThreshold(); + idx_t GetPriorityQueueSize(); + public: template <class TARGET> TARGET &Cast() { @@ -95,8 +148,6 @@ class BlockingSample { } return reinterpret_cast<TARGET &>(*this); } - //! The reservoir sampling - RandomEngine &random; }; class ReservoirChunk { @@ -107,45 +158,120 @@ DataChunk chunk; void Serialize(Serializer &serializer) const; static unique_ptr<ReservoirChunk> Deserialize(Deserializer &deserializer); + + unique_ptr<ReservoirChunk> Copy() const; }; + +struct SelectionVectorHelper { + SelectionVector sel; + uint32_t size; }; -//! The reservoir sample class maintains a streaming sample of fixed size "sample_count" class ReservoirSample : public BlockingSample { public: static constexpr const SampleType TYPE = SampleType::RESERVOIR_SAMPLE; -public: + constexpr static idx_t FIXED_SAMPLE_SIZE_MULTIPLIER = 10; + constexpr static idx_t FAST_TO_SLOW_THRESHOLD = 60; + + // If the table has less than 204800 rows, this is the percentage + // of values we save when serializing/returning a sample. + constexpr static double SAVE_PERCENTAGE = 0.01; + ReservoirSample(Allocator &allocator, idx_t sample_count, int64_t seed = 1); - explicit ReservoirSample(idx_t sample_count, int64_t seed = 1); + explicit ReservoirSample(idx_t sample_count, unique_ptr<ReservoirChunk> = nullptr); + + //! methods used to help with serializing and deserializing + void EvictOverBudgetSamples(); + void ExpandSerializedSample(); + + SamplingState GetSamplingState() const; + + //! Vacuum the Reservoir Sample so it throws away tuples that are not in the + //! reservoir weights or in the selection vector + void Vacuum(); + + //! Transform to a sample based on the reservoir sampling paper + void ConvertToReservoirSample(); + + //! Get the capacity of the data chunk reserved for storing samples + idx_t GetReservoirChunkCapacity() const; + //! If for_serialization=true then the sample_chunk is not padded with extra spaces for + //! future sampling values + unique_ptr<BlockingSample> Copy() const override; + + //! create the first chunk, called by AddToReservoir() + idx_t FillReservoir(DataChunk &chunk); //! Add a chunk of data to the sample void AddToReservoir(DataChunk &input) override; + //! Merge two Reservoir Samples. Other must be a reservoir sample + void Merge(unique_ptr<BlockingSample> other); + + void ShuffleSel(SelectionVector &sel, idx_t range, idx_t size) const; + + //! Update the sample by pushing new sample rows to the end of the sample_chunk. + //! The new sample rows are the rows resulting from applying sel to other + void UpdateSampleAppend(DataChunk &this_, DataChunk &other, SelectionVector &other_sel, idx_t append_count) const; + + idx_t GetTuplesSeen() const; + idx_t NumSamplesCollected() const; + idx_t GetActiveSampleCount() const; + static bool ValidSampleType(const LogicalType &type); + + // get the chunk from the Reservoir chunk + DataChunk &Chunk(); //! Fetches a chunk from the sample. Note that this method is destructive and should only be used after the //! sample is completely built. + // unique_ptr GetChunkAndDestroy() override; unique_ptr<DataChunk> GetChunk() override; + void Destroy() override; void Finalize() override; + void Verify(); + + idx_t GetSampleCount(); + + // map is [index in input chunk] -> [index in sample chunk]. Both are zero-based + // [index in sample chunk] is incremented by 1 + // indexes in the input chunk have random values, however, they are increasing. + // The base_reservoir_sampling gets updated however, so the indexes point to (sample_chunk_offset + + // index_in_sample_chunk) this data is used to make a selection vector to copy samples from the input chunk to the + // sample chunk + //! Get indexes from the current sample that can be replaced. + SelectionVectorHelper GetReplacementIndexes(idx_t sample_chunk_offset, idx_t theoretical_chunk_length); + void Serialize(Serializer &serializer) const override; static unique_ptr<BlockingSample> Deserialize(Deserializer &deserializer); private: - //! Replace a single element of the input - void ReplaceElement(DataChunk &input, idx_t index_in_chunk, double with_weight = -1); - void InitializeReservoir(DataChunk &input); - //! Fills the reservoir up until sample_count entries, returns how many entries are still required - idx_t FillReservoir(DataChunk &input); + // when we serialize, we may have collected too many samples since we fill a standard vector size, then + // truncate if the table is still <=204800 values. The problem is, in our weights, we store indexes into + // the selection vector. If we throw away values at selection vector index i = 5, we need to update all indexes + // i > 5. Otherwise we will have indexes in the weights that are greater than the length of our sample. + void NormalizeWeights(); + + SelectionVectorHelper GetReplacementIndexesSlow(const idx_t sample_chunk_offset, const idx_t chunk_length); + SelectionVectorHelper GetReplacementIndexesFast(const idx_t sample_chunk_offset, const idx_t chunk_length); + void SimpleMerge(ReservoirSample &other); + void WeightedMerge(ReservoirSample &other_sample); + + // Helper methods for Shrink(). + // Shrink has different logic depending on if the Reservoir sample is still in + // "Random" mode or in "reservoir" mode. This function creates a new sample chunk + // to copy the old sample chunk into + unique_ptr<ReservoirChunk> CreateNewSampleChunk(vector<LogicalType> &types, idx_t size) const; + + // Get a vector where each index is a random int in the range [0, size). + // This is used to shuffle selection vector indexes + vector<uint32_t> GetRandomizedVector(uint32_t range, uint32_t size) const; -public: - Allocator &allocator; - //! The size of the reservoir sample. - //! when calculating percentages, it is set to reservoir_threshold * percentage - //! when explicit number used, sample_count = number idx_t sample_count; - bool reservoir_initialized; - - //! The current reservoir - unique_ptr<DataChunk> reservoir_data_chunk; + Allocator &allocator; unique_ptr<ReservoirChunk> reservoir_chunk; + bool stats_sample; + SelectionVector sel; + idx_t sel_size; };
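For reference, the Efraimidis and Spirakis scheme the weight machinery above implements ("Weighted Random Sampling", 2005, algorithm A-Res): draw u in (0,1) per row, key it by u^(1/w), and keep the k largest keys in a min-heap, so the heap top is exactly the min-weight threshold BaseReservoirSampling tracks. A compact sketch with unit weights (DuckDB's streaming variant layers thresholds, merging, and the RANDOM fast path on top of this):

```cpp
// A-Res weighted reservoir sampling, simplified to unit weights: keep the k
// items with the largest keys u^(1/w); the heap top is the eviction threshold.
#include <cmath>
#include <cstdint>
#include <functional>
#include <iostream>
#include <queue>
#include <random>
#include <utility>
#include <vector>

std::vector<int64_t> SampleK(const std::vector<int64_t> &input, size_t k, uint32_t seed) {
	std::mt19937 rng(seed);
	std::uniform_real_distribution<double> dist(0.0, 1.0);
	// min-heap of (key, value); the smallest key is the current threshold
	using Entry = std::pair<double, int64_t>;
	std::priority_queue<Entry, std::vector<Entry>, std::greater<Entry>> heap;
	const double weight = 1.0; // per-row weight; all equal in this sketch
	for (auto value : input) {
		double key = std::pow(dist(rng), 1.0 / weight);
		if (heap.size() < k) {
			heap.emplace(key, value);
		} else if (key > heap.top().first) {
			heap.pop(); // evict the minimum-key entry, as in ReplaceElement
			heap.emplace(key, value);
		}
	}
	std::vector<int64_t> result;
	while (!heap.empty()) {
		result.push_back(heap.top().second);
		heap.pop();
	}
	return result;
}

int main() {
	std::vector<int64_t> rows(1000);
	for (int64_t i = 0; i < 1000; i++) {
		rows[i] = i;
	}
	for (auto v : SampleK(rows, 5, 42)) {
		std::cout << v << ' ';
	}
	std::cout << '\n';
}
```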
//! The reservoir sample sample_size class maintains a streaming sample of variable size @@ -155,15 +281,16 @@ class ReservoirSamplePercentage : public BlockingSample { public: static constexpr const SampleType TYPE = SampleType::RESERVOIR_PERCENTAGE_SAMPLE; -public: ReservoirSamplePercentage(Allocator &allocator, double percentage, int64_t seed = -1); + ReservoirSamplePercentage(double percentage, int64_t seed, idx_t reservoir_sample_size); explicit ReservoirSamplePercentage(double percentage, int64_t seed = -1); //! Add a chunk of data to the sample void AddToReservoir(DataChunk &input) override; - //! Fetches a chunk from the sample. Note that this method is destructive and should only be used after the - //! sample is completely built. + unique_ptr<BlockingSample> Copy() const override; + + //! Fetches a chunk from the sample. If destroy = true, this method is destructive unique_ptr<DataChunk> GetChunk() override; void Finalize() override; @@ -182,9 +309,11 @@ class ReservoirSamplePercentage : public BlockingSample { //! The set of finished samples of the reservoir sample vector<unique_ptr<ReservoirSample>> finished_samples; + //! The amount of tuples that have been processed so far (not put in the reservoir, just processed) idx_t current_count = 0; - //! Whether or not the stream is finalized. The stream is automatically finalized on the first call to GetChunk(); + //! Whether or not the stream is finalized. The stream is automatically finalized on the first call to + //! GetChunkAndShrink(); bool is_finalized; }; diff --git a/src/duckdb/src/include/duckdb/function/table/system_functions.hpp b/src/duckdb/src/include/duckdb/function/table/system_functions.hpp index f74dc466..689b5520 100644 --- a/src/duckdb/src/include/duckdb/function/table/system_functions.hpp +++ b/src/duckdb/src/include/duckdb/function/table/system_functions.hpp @@ -107,6 +107,10 @@ struct DuckDBTablesFun { static void RegisterFunction(BuiltinFunctions &set); }; +struct DuckDBTableSample { + static void RegisterFunction(BuiltinFunctions &set); +}; + struct DuckDBTemporaryFilesFun { static void RegisterFunction(BuiltinFunctions &set); }; diff --git a/src/duckdb/src/include/duckdb/main/capi/extension_api.hpp b/src/duckdb/src/include/duckdb/main/capi/extension_api.hpp index b7d7b803..c8541ef1 100644 --- a/src/duckdb/src/include/duckdb/main/capi/extension_api.hpp +++ b/src/duckdb/src/include/duckdb/main/capi/extension_api.hpp @@ -6,7 +6,7 @@ // Function pointer struct //===--------------------------------------------------------------------===// typedef struct { - // v0.0.1 + // v1.2.0 duckdb_state (*duckdb_open)(const char *path, duckdb_database *out_database); duckdb_state (*duckdb_open_ext)(const char *path, duckdb_database *out_database, duckdb_config config, char **out_error); @@ -30,10 +30,14 @@ typedef struct { idx_t (*duckdb_column_count)(duckdb_result *result); idx_t (*duckdb_rows_changed)(duckdb_result *result); const char *(*duckdb_result_error)(duckdb_result *result); + duckdb_error_type (*duckdb_result_error_type)(duckdb_result *result); + duckdb_result_type (*duckdb_result_return_type)(duckdb_result result); void *(*duckdb_malloc)(size_t size); void (*duckdb_free)(void *ptr); idx_t (*duckdb_vector_size)(); bool (*duckdb_string_is_inlined)(duckdb_string_t string); + uint32_t (*duckdb_string_t_length)(duckdb_string_t string); + const char *(*duckdb_string_t_data)(duckdb_string_t *string); duckdb_date_struct (*duckdb_from_date)(duckdb_date date); duckdb_date (*duckdb_to_date)(duckdb_date_struct date); bool (*duckdb_is_finite_date)(duckdb_date date); @@ -57,6 +61,7 @@ typedef struct { idx_t (*duckdb_nparams)(duckdb_prepared_statement prepared_statement); const char *(*duckdb_parameter_name)(duckdb_prepared_statement prepared_statement, idx_t index); duckdb_type (*duckdb_param_type)(duckdb_prepared_statement prepared_statement, idx_t param_idx); + duckdb_logical_type (*duckdb_param_logical_type)(duckdb_prepared_statement prepared_statement, idx_t param_idx); duckdb_state (*duckdb_clear_bindings)(duckdb_prepared_statement prepared_statement); duckdb_statement_type (*duckdb_prepared_statement_type)(duckdb_prepared_statement statement); duckdb_state (*duckdb_bind_value)(duckdb_prepared_statement prepared_statement, idx_t param_idx, duckdb_value val); @@ -112,14 +117,70 @@ typedef struct { void (*duckdb_destroy_value)(duckdb_value *value); duckdb_value (*duckdb_create_varchar)(const char *text); duckdb_value (*duckdb_create_varchar_length)(const char *text, idx_t length); + duckdb_value (*duckdb_create_bool)(bool input); + duckdb_value (*duckdb_create_int8)(int8_t input); + duckdb_value (*duckdb_create_uint8)(uint8_t input); + duckdb_value (*duckdb_create_int16)(int16_t input); + duckdb_value (*duckdb_create_uint16)(uint16_t input); + duckdb_value (*duckdb_create_int32)(int32_t input); + duckdb_value (*duckdb_create_uint32)(uint32_t input); + duckdb_value (*duckdb_create_uint64)(uint64_t input); duckdb_value (*duckdb_create_int64)(int64_t val); + duckdb_value
(*duckdb_create_hugeint)(duckdb_hugeint input); + duckdb_value (*duckdb_create_uhugeint)(duckdb_uhugeint input); + duckdb_value (*duckdb_create_float)(float input); + duckdb_value (*duckdb_create_double)(double input); + duckdb_value (*duckdb_create_date)(duckdb_date input); + duckdb_value (*duckdb_create_time)(duckdb_time input); + duckdb_value (*duckdb_create_time_tz_value)(duckdb_time_tz value); + duckdb_value (*duckdb_create_timestamp)(duckdb_timestamp input); + duckdb_value (*duckdb_create_interval)(duckdb_interval input); + duckdb_value (*duckdb_create_blob)(const uint8_t *data, idx_t length); + duckdb_value (*duckdb_create_varint)(duckdb_varint input); + duckdb_value (*duckdb_create_decimal)(duckdb_decimal input); + duckdb_value (*duckdb_create_bit)(duckdb_bit input); + duckdb_value (*duckdb_create_uuid)(duckdb_uhugeint input); + bool (*duckdb_get_bool)(duckdb_value val); + int8_t (*duckdb_get_int8)(duckdb_value val); + uint8_t (*duckdb_get_uint8)(duckdb_value val); + int16_t (*duckdb_get_int16)(duckdb_value val); + uint16_t (*duckdb_get_uint16)(duckdb_value val); + int32_t (*duckdb_get_int32)(duckdb_value val); + uint32_t (*duckdb_get_uint32)(duckdb_value val); + int64_t (*duckdb_get_int64)(duckdb_value val); + uint64_t (*duckdb_get_uint64)(duckdb_value val); + duckdb_hugeint (*duckdb_get_hugeint)(duckdb_value val); + duckdb_uhugeint (*duckdb_get_uhugeint)(duckdb_value val); + float (*duckdb_get_float)(duckdb_value val); + double (*duckdb_get_double)(duckdb_value val); + duckdb_date (*duckdb_get_date)(duckdb_value val); + duckdb_time (*duckdb_get_time)(duckdb_value val); + duckdb_time_tz (*duckdb_get_time_tz)(duckdb_value val); + duckdb_timestamp (*duckdb_get_timestamp)(duckdb_value val); + duckdb_interval (*duckdb_get_interval)(duckdb_value val); + duckdb_logical_type (*duckdb_get_value_type)(duckdb_value val); + duckdb_blob (*duckdb_get_blob)(duckdb_value val); + duckdb_varint (*duckdb_get_varint)(duckdb_value val); + duckdb_decimal (*duckdb_get_decimal)(duckdb_value val); + duckdb_bit (*duckdb_get_bit)(duckdb_value val); + duckdb_uhugeint (*duckdb_get_uuid)(duckdb_value val); + char *(*duckdb_get_varchar)(duckdb_value value); duckdb_value (*duckdb_create_struct_value)(duckdb_logical_type type, duckdb_value *values); duckdb_value (*duckdb_create_list_value)(duckdb_logical_type type, duckdb_value *values, idx_t value_count); duckdb_value (*duckdb_create_array_value)(duckdb_logical_type type, duckdb_value *values, idx_t value_count); - char *(*duckdb_get_varchar)(duckdb_value value); - int64_t (*duckdb_get_int64)(duckdb_value val); + idx_t (*duckdb_get_map_size)(duckdb_value value); + duckdb_value (*duckdb_get_map_key)(duckdb_value value, idx_t index); + duckdb_value (*duckdb_get_map_value)(duckdb_value value, idx_t index); + bool (*duckdb_is_null_value)(duckdb_value value); + duckdb_value (*duckdb_create_null_value)(); + idx_t (*duckdb_get_list_size)(duckdb_value value); + duckdb_value (*duckdb_get_list_child)(duckdb_value value, idx_t index); + duckdb_value (*duckdb_create_enum_value)(duckdb_logical_type type, uint64_t value); + uint64_t (*duckdb_get_enum_value)(duckdb_value value); + duckdb_value (*duckdb_get_struct_child)(duckdb_value value, idx_t index); duckdb_logical_type (*duckdb_create_logical_type)(duckdb_type type); char *(*duckdb_logical_type_get_alias)(duckdb_logical_type type); + void (*duckdb_logical_type_set_alias)(duckdb_logical_type type, const char *alias); duckdb_logical_type (*duckdb_create_list_type)(duckdb_logical_type type); duckdb_logical_type 
(*duckdb_create_array_type)(duckdb_logical_type type, idx_t array_size); duckdb_logical_type (*duckdb_create_map_type)(duckdb_logical_type key_type, duckdb_logical_type value_type); @@ -148,7 +209,8 @@ typedef struct { char *(*duckdb_union_type_member_name)(duckdb_logical_type type, idx_t index); duckdb_logical_type (*duckdb_union_type_member_type)(duckdb_logical_type type, idx_t index); void (*duckdb_destroy_logical_type)(duckdb_logical_type *type); - duckdb_data_chunk (*duckdb_fetch_chunk)(duckdb_result result); + duckdb_state (*duckdb_register_logical_type)(duckdb_connection con, duckdb_logical_type type, + duckdb_create_type_info info); duckdb_data_chunk (*duckdb_create_data_chunk)(duckdb_logical_type *types, idx_t column_count); void (*duckdb_destroy_data_chunk)(duckdb_data_chunk *chunk); void (*duckdb_data_chunk_reset)(duckdb_data_chunk chunk); @@ -175,6 +237,9 @@ typedef struct { duckdb_scalar_function (*duckdb_create_scalar_function)(); void (*duckdb_destroy_scalar_function)(duckdb_scalar_function *scalar_function); void (*duckdb_scalar_function_set_name)(duckdb_scalar_function scalar_function, const char *name); + void (*duckdb_scalar_function_set_varargs)(duckdb_scalar_function scalar_function, duckdb_logical_type type); + void (*duckdb_scalar_function_set_special_handling)(duckdb_scalar_function scalar_function); + void (*duckdb_scalar_function_set_volatile)(duckdb_scalar_function scalar_function); void (*duckdb_scalar_function_add_parameter)(duckdb_scalar_function scalar_function, duckdb_logical_type type); void (*duckdb_scalar_function_set_return_type)(duckdb_scalar_function scalar_function, duckdb_logical_type type); void (*duckdb_scalar_function_set_extra_info)(duckdb_scalar_function scalar_function, void *extra_info, @@ -182,6 +247,39 @@ typedef struct { void (*duckdb_scalar_function_set_function)(duckdb_scalar_function scalar_function, duckdb_scalar_function_t function); duckdb_state (*duckdb_register_scalar_function)(duckdb_connection con, duckdb_scalar_function scalar_function); + void *(*duckdb_scalar_function_get_extra_info)(duckdb_function_info info); + void (*duckdb_scalar_function_set_error)(duckdb_function_info info, const char *error); + duckdb_scalar_function_set (*duckdb_create_scalar_function_set)(const char *name); + void (*duckdb_destroy_scalar_function_set)(duckdb_scalar_function_set *scalar_function_set); + duckdb_state (*duckdb_add_scalar_function_to_set)(duckdb_scalar_function_set set, duckdb_scalar_function function); + duckdb_state (*duckdb_register_scalar_function_set)(duckdb_connection con, duckdb_scalar_function_set set); + duckdb_aggregate_function (*duckdb_create_aggregate_function)(); + void (*duckdb_destroy_aggregate_function)(duckdb_aggregate_function *aggregate_function); + void (*duckdb_aggregate_function_set_name)(duckdb_aggregate_function aggregate_function, const char *name); + void (*duckdb_aggregate_function_add_parameter)(duckdb_aggregate_function aggregate_function, + duckdb_logical_type type); + void (*duckdb_aggregate_function_set_return_type)(duckdb_aggregate_function aggregate_function, + duckdb_logical_type type); + void (*duckdb_aggregate_function_set_functions)(duckdb_aggregate_function aggregate_function, + duckdb_aggregate_state_size state_size, + duckdb_aggregate_init_t state_init, + duckdb_aggregate_update_t update, + duckdb_aggregate_combine_t combine, + duckdb_aggregate_finalize_t finalize); + void (*duckdb_aggregate_function_set_destructor)(duckdb_aggregate_function aggregate_function, + duckdb_aggregate_destroy_t 
destroy); + duckdb_state (*duckdb_register_aggregate_function)(duckdb_connection con, + duckdb_aggregate_function aggregate_function); + void (*duckdb_aggregate_function_set_special_handling)(duckdb_aggregate_function aggregate_function); + void (*duckdb_aggregate_function_set_extra_info)(duckdb_aggregate_function aggregate_function, void *extra_info, + duckdb_delete_callback_t destroy); + void *(*duckdb_aggregate_function_get_extra_info)(duckdb_function_info info); + void (*duckdb_aggregate_function_set_error)(duckdb_function_info info, const char *error); + duckdb_aggregate_function_set (*duckdb_create_aggregate_function_set)(const char *name); + void (*duckdb_destroy_aggregate_function_set)(duckdb_aggregate_function_set *aggregate_function_set); + duckdb_state (*duckdb_add_aggregate_function_to_set)(duckdb_aggregate_function_set set, + duckdb_aggregate_function function); + duckdb_state (*duckdb_register_aggregate_function_set)(duckdb_connection con, duckdb_aggregate_function_set set); duckdb_table_function (*duckdb_create_table_function)(); void (*duckdb_destroy_table_function)(duckdb_table_function *table_function); void (*duckdb_table_function_set_name)(duckdb_table_function table_function, const char *name); @@ -222,14 +320,68 @@ typedef struct { void (*duckdb_replacement_scan_set_function_name)(duckdb_replacement_scan_info info, const char *function_name); void (*duckdb_replacement_scan_add_parameter)(duckdb_replacement_scan_info info, duckdb_value parameter); void (*duckdb_replacement_scan_set_error)(duckdb_replacement_scan_info info, const char *error); + duckdb_value (*duckdb_profiling_info_get_metrics)(duckdb_profiling_info info); + idx_t (*duckdb_profiling_info_get_child_count)(duckdb_profiling_info info); + duckdb_profiling_info (*duckdb_profiling_info_get_child)(duckdb_profiling_info info, idx_t index); duckdb_state (*duckdb_appender_create)(duckdb_connection connection, const char *schema, const char *table, duckdb_appender *out_appender); + duckdb_state (*duckdb_appender_create_ext)(duckdb_connection connection, const char *catalog, const char *schema, + const char *table, duckdb_appender *out_appender); idx_t (*duckdb_appender_column_count)(duckdb_appender appender); duckdb_logical_type (*duckdb_appender_column_type)(duckdb_appender appender, idx_t col_idx); const char *(*duckdb_appender_error)(duckdb_appender appender); duckdb_state (*duckdb_appender_flush)(duckdb_appender appender); duckdb_state (*duckdb_appender_close)(duckdb_appender appender); duckdb_state (*duckdb_appender_destroy)(duckdb_appender *appender); + duckdb_state (*duckdb_appender_add_column)(duckdb_appender appender, const char *name); + duckdb_state (*duckdb_appender_clear_columns)(duckdb_appender appender); + duckdb_state (*duckdb_append_data_chunk)(duckdb_appender appender, duckdb_data_chunk chunk); + duckdb_state (*duckdb_table_description_create)(duckdb_connection connection, const char *schema, const char *table, + duckdb_table_description *out); + duckdb_state (*duckdb_table_description_create_ext)(duckdb_connection connection, const char *catalog, + const char *schema, const char *table, + duckdb_table_description *out); + void (*duckdb_table_description_destroy)(duckdb_table_description *table_description); + const char *(*duckdb_table_description_error)(duckdb_table_description table_description); + duckdb_state (*duckdb_column_has_default)(duckdb_table_description table_description, idx_t index, bool *out); + char *(*duckdb_table_description_get_column_name)(duckdb_table_description 
table_description, idx_t index); + void (*duckdb_execute_tasks)(duckdb_database database, idx_t max_tasks); + duckdb_task_state (*duckdb_create_task_state)(duckdb_database database); + void (*duckdb_execute_tasks_state)(duckdb_task_state state); + idx_t (*duckdb_execute_n_tasks_state)(duckdb_task_state state, idx_t max_tasks); + void (*duckdb_finish_execution)(duckdb_task_state state); + bool (*duckdb_task_state_is_finished)(duckdb_task_state state); + void (*duckdb_destroy_task_state)(duckdb_task_state state); + bool (*duckdb_execution_is_finished)(duckdb_connection con); + duckdb_data_chunk (*duckdb_fetch_chunk)(duckdb_result result); + duckdb_cast_function (*duckdb_create_cast_function)(); + void (*duckdb_cast_function_set_source_type)(duckdb_cast_function cast_function, duckdb_logical_type source_type); + void (*duckdb_cast_function_set_target_type)(duckdb_cast_function cast_function, duckdb_logical_type target_type); + void (*duckdb_cast_function_set_implicit_cast_cost)(duckdb_cast_function cast_function, int64_t cost); + void (*duckdb_cast_function_set_function)(duckdb_cast_function cast_function, duckdb_cast_function_t function); + void (*duckdb_cast_function_set_extra_info)(duckdb_cast_function cast_function, void *extra_info, + duckdb_delete_callback_t destroy); + void *(*duckdb_cast_function_get_extra_info)(duckdb_function_info info); + duckdb_cast_mode (*duckdb_cast_function_get_cast_mode)(duckdb_function_info info); + void (*duckdb_cast_function_set_error)(duckdb_function_info info, const char *error); + void (*duckdb_cast_function_set_row_error)(duckdb_function_info info, const char *error, idx_t row, + duckdb_vector output); + duckdb_state (*duckdb_register_cast_function)(duckdb_connection con, duckdb_cast_function cast_function); + void (*duckdb_destroy_cast_function)(duckdb_cast_function *cast_function); + bool (*duckdb_is_finite_timestamp_s)(duckdb_timestamp_s ts); + bool (*duckdb_is_finite_timestamp_ms)(duckdb_timestamp_ms ts); + bool (*duckdb_is_finite_timestamp_ns)(duckdb_timestamp_ns ts); + duckdb_value (*duckdb_create_timestamp_tz)(duckdb_timestamp input); + duckdb_value (*duckdb_create_timestamp_s)(duckdb_timestamp_s input); + duckdb_value (*duckdb_create_timestamp_ms)(duckdb_timestamp_ms input); + duckdb_value (*duckdb_create_timestamp_ns)(duckdb_timestamp_ns input); + duckdb_timestamp (*duckdb_get_timestamp_tz)(duckdb_value val); + duckdb_timestamp_s (*duckdb_get_timestamp_s)(duckdb_value val); + duckdb_timestamp_ms (*duckdb_get_timestamp_ms)(duckdb_value val); + duckdb_timestamp_ns (*duckdb_get_timestamp_ns)(duckdb_value val); + duckdb_state (*duckdb_append_value)(duckdb_appender appender, duckdb_value value); + duckdb_profiling_info (*duckdb_get_profiling_info)(duckdb_connection connection); + duckdb_value (*duckdb_profiling_info_get_value)(duckdb_profiling_info info, const char *key); duckdb_state (*duckdb_appender_begin_row)(duckdb_appender appender); duckdb_state (*duckdb_appender_end_row)(duckdb_appender appender); duckdb_state (*duckdb_append_default)(duckdb_appender appender); @@ -254,127 +406,14 @@ typedef struct { duckdb_state (*duckdb_append_varchar_length)(duckdb_appender appender, const char *val, idx_t length); duckdb_state (*duckdb_append_blob)(duckdb_appender appender, const void *data, idx_t length); duckdb_state (*duckdb_append_null)(duckdb_appender appender); - duckdb_state (*duckdb_append_data_chunk)(duckdb_appender appender, duckdb_data_chunk chunk); - void (*duckdb_execute_tasks)(duckdb_database database, idx_t max_tasks); - duckdb_task_state 
(*duckdb_create_task_state)(duckdb_database database); - void (*duckdb_execute_tasks_state)(duckdb_task_state state); - idx_t (*duckdb_execute_n_tasks_state)(duckdb_task_state state, idx_t max_tasks); - void (*duckdb_finish_execution)(duckdb_task_state state); - bool (*duckdb_task_state_is_finished)(duckdb_task_state state); - void (*duckdb_destroy_task_state)(duckdb_task_state state); - bool (*duckdb_execution_is_finished)(duckdb_connection con); - duckdb_profiling_info (*duckdb_get_profiling_info)(duckdb_connection connection); - duckdb_value (*duckdb_profiling_info_get_value)(duckdb_profiling_info info, const char *key); - idx_t (*duckdb_profiling_info_get_child_count)(duckdb_profiling_info info); - duckdb_profiling_info (*duckdb_profiling_info_get_child)(duckdb_profiling_info info, idx_t index); - duckdb_value (*duckdb_profiling_info_get_metrics)(duckdb_profiling_info info); - void (*duckdb_scalar_function_set_varargs)(duckdb_scalar_function scalar_function, duckdb_logical_type type); - void (*duckdb_scalar_function_set_special_handling)(duckdb_scalar_function scalar_function); - void (*duckdb_scalar_function_set_volatile)(duckdb_scalar_function scalar_function); - void *(*duckdb_scalar_function_get_extra_info)(duckdb_function_info info); - void (*duckdb_scalar_function_set_error)(duckdb_function_info info, const char *error); - duckdb_state (*duckdb_table_description_create)(duckdb_connection connection, const char *schema, const char *table, - duckdb_table_description *out); - void (*duckdb_table_description_destroy)(duckdb_table_description *table_description); - const char *(*duckdb_table_description_error)(duckdb_table_description table_description); - duckdb_error_type (*duckdb_result_error_type)(duckdb_result *result); - uint32_t (*duckdb_string_t_length)(duckdb_string_t string); - const char *(*duckdb_string_t_data)(duckdb_string_t *string); - duckdb_value (*duckdb_create_bool)(bool input); - duckdb_value (*duckdb_create_int8)(int8_t input); - duckdb_value (*duckdb_create_uint8)(uint8_t input); - duckdb_value (*duckdb_create_int16)(int16_t input); - duckdb_value (*duckdb_create_uint16)(uint16_t input); - duckdb_value (*duckdb_create_int32)(int32_t input); - duckdb_value (*duckdb_create_uint32)(uint32_t input); - duckdb_value (*duckdb_create_uint64)(uint64_t input); - duckdb_value (*duckdb_create_hugeint)(duckdb_hugeint input); - duckdb_value (*duckdb_create_uhugeint)(duckdb_uhugeint input); - duckdb_value (*duckdb_create_float)(float input); - duckdb_value (*duckdb_create_double)(double input); - duckdb_value (*duckdb_create_date)(duckdb_date input); - duckdb_value (*duckdb_create_time)(duckdb_time input); - duckdb_value (*duckdb_create_time_tz_value)(duckdb_time_tz value); - duckdb_value (*duckdb_create_timestamp)(duckdb_timestamp input); - duckdb_value (*duckdb_create_interval)(duckdb_interval input); - duckdb_value (*duckdb_create_blob)(const uint8_t *data, idx_t length); - bool (*duckdb_get_bool)(duckdb_value val); - int8_t (*duckdb_get_int8)(duckdb_value val); - uint8_t (*duckdb_get_uint8)(duckdb_value val); - int16_t (*duckdb_get_int16)(duckdb_value val); - uint16_t (*duckdb_get_uint16)(duckdb_value val); - int32_t (*duckdb_get_int32)(duckdb_value val); - uint32_t (*duckdb_get_uint32)(duckdb_value val); - uint64_t (*duckdb_get_uint64)(duckdb_value val); - duckdb_hugeint (*duckdb_get_hugeint)(duckdb_value val); - duckdb_uhugeint (*duckdb_get_uhugeint)(duckdb_value val); - float (*duckdb_get_float)(duckdb_value val); - double (*duckdb_get_double)(duckdb_value val); - 
duckdb_date (*duckdb_get_date)(duckdb_value val); - duckdb_time (*duckdb_get_time)(duckdb_value val); - duckdb_time_tz (*duckdb_get_time_tz)(duckdb_value val); - duckdb_timestamp (*duckdb_get_timestamp)(duckdb_value val); - duckdb_interval (*duckdb_get_interval)(duckdb_value val); - duckdb_logical_type (*duckdb_get_value_type)(duckdb_value val); - duckdb_blob (*duckdb_get_blob)(duckdb_value val); - duckdb_scalar_function_set (*duckdb_create_scalar_function_set)(const char *name); - void (*duckdb_destroy_scalar_function_set)(duckdb_scalar_function_set *scalar_function_set); - duckdb_state (*duckdb_add_scalar_function_to_set)(duckdb_scalar_function_set set, duckdb_scalar_function function); - duckdb_state (*duckdb_register_scalar_function_set)(duckdb_connection con, duckdb_scalar_function_set set); - duckdb_aggregate_function_set (*duckdb_create_aggregate_function_set)(const char *name); - void (*duckdb_destroy_aggregate_function_set)(duckdb_aggregate_function_set *aggregate_function_set); - duckdb_state (*duckdb_add_aggregate_function_to_set)(duckdb_aggregate_function_set set, - duckdb_aggregate_function function); - duckdb_state (*duckdb_register_aggregate_function_set)(duckdb_connection con, duckdb_aggregate_function_set set); - idx_t (*duckdb_get_map_size)(duckdb_value value); - duckdb_value (*duckdb_get_map_key)(duckdb_value value, idx_t index); - duckdb_value (*duckdb_get_map_value)(duckdb_value value, idx_t index); - duckdb_aggregate_function (*duckdb_create_aggregate_function)(); - void (*duckdb_destroy_aggregate_function)(duckdb_aggregate_function *aggregate_function); - void (*duckdb_aggregate_function_set_name)(duckdb_aggregate_function aggregate_function, const char *name); - void (*duckdb_aggregate_function_add_parameter)(duckdb_aggregate_function aggregate_function, - duckdb_logical_type type); - void (*duckdb_aggregate_function_set_return_type)(duckdb_aggregate_function aggregate_function, - duckdb_logical_type type); - void (*duckdb_aggregate_function_set_functions)(duckdb_aggregate_function aggregate_function, - duckdb_aggregate_state_size state_size, - duckdb_aggregate_init_t state_init, - duckdb_aggregate_update_t update, - duckdb_aggregate_combine_t combine, - duckdb_aggregate_finalize_t finalize); - void (*duckdb_aggregate_function_set_destructor)(duckdb_aggregate_function aggregate_function, - duckdb_aggregate_destroy_t destroy); - duckdb_state (*duckdb_register_aggregate_function)(duckdb_connection con, - duckdb_aggregate_function aggregate_function); - void (*duckdb_aggregate_function_set_special_handling)(duckdb_aggregate_function aggregate_function); - void (*duckdb_aggregate_function_set_extra_info)(duckdb_aggregate_function aggregate_function, void *extra_info, - duckdb_delete_callback_t destroy); - void *(*duckdb_aggregate_function_get_extra_info)(duckdb_function_info info); - void (*duckdb_aggregate_function_set_error)(duckdb_function_info info, const char *error); - void (*duckdb_logical_type_set_alias)(duckdb_logical_type type, const char *alias); - duckdb_state (*duckdb_register_logical_type)(duckdb_connection con, duckdb_logical_type type, - duckdb_create_type_info info); - duckdb_cast_function (*duckdb_create_cast_function)(); - void (*duckdb_cast_function_set_source_type)(duckdb_cast_function cast_function, duckdb_logical_type source_type); - void (*duckdb_cast_function_set_target_type)(duckdb_cast_function cast_function, duckdb_logical_type target_type); - void (*duckdb_cast_function_set_implicit_cast_cost)(duckdb_cast_function cast_function, int64_t 
cost); - void (*duckdb_cast_function_set_function)(duckdb_cast_function cast_function, duckdb_cast_function_t function); - void (*duckdb_cast_function_set_extra_info)(duckdb_cast_function cast_function, void *extra_info, - duckdb_delete_callback_t destroy); - void *(*duckdb_cast_function_get_extra_info)(duckdb_function_info info); - duckdb_cast_mode (*duckdb_cast_function_get_cast_mode)(duckdb_function_info info); - void (*duckdb_cast_function_set_error)(duckdb_function_info info, const char *error); - void (*duckdb_cast_function_set_row_error)(duckdb_function_info info, const char *error, idx_t row, - duckdb_vector output); - duckdb_state (*duckdb_register_cast_function)(duckdb_connection con, duckdb_cast_function cast_function); - void (*duckdb_destroy_cast_function)(duckdb_cast_function *cast_function); + // These functions have been deprecated and may be removed in future versions of DuckDB + idx_t (*duckdb_row_count)(duckdb_result *result); void *(*duckdb_column_data)(duckdb_result *result, idx_t col); bool *(*duckdb_nullmask_data)(duckdb_result *result, idx_t col); duckdb_data_chunk (*duckdb_result_get_chunk)(duckdb_result result, idx_t chunk_index); bool (*duckdb_result_is_streaming)(duckdb_result result); idx_t (*duckdb_result_chunk_count)(duckdb_result result); - duckdb_result_type (*duckdb_result_return_type)(duckdb_result result); bool (*duckdb_value_boolean)(duckdb_result *result, idx_t col, idx_t row); int8_t (*duckdb_value_int8)(duckdb_result *result, idx_t col, idx_t row); int16_t (*duckdb_value_int16)(duckdb_result *result, idx_t col, idx_t row); @@ -403,7 +442,6 @@ typedef struct { duckdb_result *out_result); duckdb_state (*duckdb_pending_prepared_streaming)(duckdb_prepared_statement prepared_statement, duckdb_pending_result *out_result); - duckdb_state (*duckdb_column_has_default)(duckdb_table_description table_description, idx_t index, bool *out); duckdb_state (*duckdb_query_arrow)(duckdb_connection connection, const char *query, duckdb_arrow *out_result); duckdb_state (*duckdb_query_arrow_schema)(duckdb_arrow result, duckdb_arrow_schema *out_schema); duckdb_state (*duckdb_prepared_arrow_schema)(duckdb_prepared_statement prepared, duckdb_arrow_schema *out_schema); @@ -422,52 +460,13 @@ typedef struct { duckdb_arrow_schema arrow_schema, duckdb_arrow_array arrow_array, duckdb_arrow_stream *out_stream); duckdb_data_chunk (*duckdb_stream_fetch_chunk)(duckdb_result result); - // dev - // WARNING! 
the functions below are not (yet) stable - - duckdb_state (*duckdb_appender_create_ext)(duckdb_connection connection, const char *catalog, const char *schema, - const char *table, duckdb_appender *out_appender); - duckdb_state (*duckdb_table_description_create_ext)(duckdb_connection connection, const char *catalog, - const char *schema, const char *table, - duckdb_table_description *out); - char *(*duckdb_table_description_get_column_name)(duckdb_table_description table_description, idx_t index); - duckdb_logical_type (*duckdb_param_logical_type)(duckdb_prepared_statement prepared_statement, idx_t param_idx); - bool (*duckdb_is_null_value)(duckdb_value value); - duckdb_value (*duckdb_create_null_value)(); - idx_t (*duckdb_get_list_size)(duckdb_value value); - duckdb_value (*duckdb_get_list_child)(duckdb_value value, idx_t index); - duckdb_value (*duckdb_create_enum_value)(duckdb_logical_type type, uint64_t value); - uint64_t (*duckdb_get_enum_value)(duckdb_value value); - duckdb_value (*duckdb_get_struct_child)(duckdb_value value, idx_t index); - duckdb_state (*duckdb_appender_add_column)(duckdb_appender appender, const char *name); - duckdb_state (*duckdb_appender_clear_columns)(duckdb_appender appender); - bool (*duckdb_is_finite_timestamp_s)(duckdb_timestamp_s ts); - bool (*duckdb_is_finite_timestamp_ms)(duckdb_timestamp_ms ts); - bool (*duckdb_is_finite_timestamp_ns)(duckdb_timestamp_ns ts); - duckdb_value (*duckdb_create_timestamp_tz)(duckdb_timestamp input); - duckdb_value (*duckdb_create_timestamp_s)(duckdb_timestamp_s input); - duckdb_value (*duckdb_create_timestamp_ms)(duckdb_timestamp_ms input); - duckdb_value (*duckdb_create_timestamp_ns)(duckdb_timestamp_ns input); - duckdb_timestamp (*duckdb_get_timestamp_tz)(duckdb_value val); - duckdb_timestamp_s (*duckdb_get_timestamp_s)(duckdb_value val); - duckdb_timestamp_ms (*duckdb_get_timestamp_ms)(duckdb_value val); - duckdb_timestamp_ns (*duckdb_get_timestamp_ns)(duckdb_value val); - duckdb_state (*duckdb_append_value)(duckdb_appender appender, duckdb_value value); - duckdb_value (*duckdb_create_varint)(duckdb_varint input); - duckdb_value (*duckdb_create_decimal)(duckdb_decimal input); - duckdb_value (*duckdb_create_bit)(duckdb_bit input); - duckdb_value (*duckdb_create_uuid)(duckdb_uhugeint input); - duckdb_varint (*duckdb_get_varint)(duckdb_value val); - duckdb_decimal (*duckdb_get_decimal)(duckdb_value val); - duckdb_bit (*duckdb_get_bit)(duckdb_value val); - duckdb_uhugeint (*duckdb_get_uuid)(duckdb_value val); -} duckdb_ext_api_v0; +} duckdb_ext_api_v1; //===--------------------------------------------------------------------===// // Struct Create Method //===--------------------------------------------------------------------===// -inline duckdb_ext_api_v0 CreateAPIv0() { - duckdb_ext_api_v0 result; +inline duckdb_ext_api_v1 CreateAPIv1() { + duckdb_ext_api_v1 result; result.duckdb_open = duckdb_open; result.duckdb_open_ext = duckdb_open_ext; result.duckdb_close = duckdb_close; @@ -490,10 +489,14 @@ inline duckdb_ext_api_v0 CreateAPIv0() { result.duckdb_column_count = duckdb_column_count; result.duckdb_rows_changed = duckdb_rows_changed; result.duckdb_result_error = duckdb_result_error; + result.duckdb_result_error_type = duckdb_result_error_type; + result.duckdb_result_return_type = duckdb_result_return_type; result.duckdb_malloc = duckdb_malloc; result.duckdb_free = duckdb_free; result.duckdb_vector_size = duckdb_vector_size; result.duckdb_string_is_inlined = duckdb_string_is_inlined; + result.duckdb_string_t_length = 
duckdb_string_t_length; + result.duckdb_string_t_data = duckdb_string_t_data; result.duckdb_from_date = duckdb_from_date; result.duckdb_to_date = duckdb_to_date; result.duckdb_is_finite_date = duckdb_is_finite_date; @@ -516,6 +519,7 @@ inline duckdb_ext_api_v0 CreateAPIv0() { result.duckdb_nparams = duckdb_nparams; result.duckdb_parameter_name = duckdb_parameter_name; result.duckdb_param_type = duckdb_param_type; + result.duckdb_param_logical_type = duckdb_param_logical_type; result.duckdb_clear_bindings = duckdb_clear_bindings; result.duckdb_prepared_statement_type = duckdb_prepared_statement_type; result.duckdb_bind_value = duckdb_bind_value; @@ -558,14 +562,70 @@ inline duckdb_ext_api_v0 CreateAPIv0() { result.duckdb_destroy_value = duckdb_destroy_value; result.duckdb_create_varchar = duckdb_create_varchar; result.duckdb_create_varchar_length = duckdb_create_varchar_length; + result.duckdb_create_bool = duckdb_create_bool; + result.duckdb_create_int8 = duckdb_create_int8; + result.duckdb_create_uint8 = duckdb_create_uint8; + result.duckdb_create_int16 = duckdb_create_int16; + result.duckdb_create_uint16 = duckdb_create_uint16; + result.duckdb_create_int32 = duckdb_create_int32; + result.duckdb_create_uint32 = duckdb_create_uint32; + result.duckdb_create_uint64 = duckdb_create_uint64; result.duckdb_create_int64 = duckdb_create_int64; + result.duckdb_create_hugeint = duckdb_create_hugeint; + result.duckdb_create_uhugeint = duckdb_create_uhugeint; + result.duckdb_create_float = duckdb_create_float; + result.duckdb_create_double = duckdb_create_double; + result.duckdb_create_date = duckdb_create_date; + result.duckdb_create_time = duckdb_create_time; + result.duckdb_create_time_tz_value = duckdb_create_time_tz_value; + result.duckdb_create_timestamp = duckdb_create_timestamp; + result.duckdb_create_interval = duckdb_create_interval; + result.duckdb_create_blob = duckdb_create_blob; + result.duckdb_create_varint = duckdb_create_varint; + result.duckdb_create_decimal = duckdb_create_decimal; + result.duckdb_create_bit = duckdb_create_bit; + result.duckdb_create_uuid = duckdb_create_uuid; + result.duckdb_get_bool = duckdb_get_bool; + result.duckdb_get_int8 = duckdb_get_int8; + result.duckdb_get_uint8 = duckdb_get_uint8; + result.duckdb_get_int16 = duckdb_get_int16; + result.duckdb_get_uint16 = duckdb_get_uint16; + result.duckdb_get_int32 = duckdb_get_int32; + result.duckdb_get_uint32 = duckdb_get_uint32; + result.duckdb_get_int64 = duckdb_get_int64; + result.duckdb_get_uint64 = duckdb_get_uint64; + result.duckdb_get_hugeint = duckdb_get_hugeint; + result.duckdb_get_uhugeint = duckdb_get_uhugeint; + result.duckdb_get_float = duckdb_get_float; + result.duckdb_get_double = duckdb_get_double; + result.duckdb_get_date = duckdb_get_date; + result.duckdb_get_time = duckdb_get_time; + result.duckdb_get_time_tz = duckdb_get_time_tz; + result.duckdb_get_timestamp = duckdb_get_timestamp; + result.duckdb_get_interval = duckdb_get_interval; + result.duckdb_get_value_type = duckdb_get_value_type; + result.duckdb_get_blob = duckdb_get_blob; + result.duckdb_get_varint = duckdb_get_varint; + result.duckdb_get_decimal = duckdb_get_decimal; + result.duckdb_get_bit = duckdb_get_bit; + result.duckdb_get_uuid = duckdb_get_uuid; + result.duckdb_get_varchar = duckdb_get_varchar; result.duckdb_create_struct_value = duckdb_create_struct_value; result.duckdb_create_list_value = duckdb_create_list_value; result.duckdb_create_array_value = duckdb_create_array_value; - result.duckdb_get_varchar = duckdb_get_varchar; - 
result.duckdb_get_int64 = duckdb_get_int64; + result.duckdb_get_map_size = duckdb_get_map_size; + result.duckdb_get_map_key = duckdb_get_map_key; + result.duckdb_get_map_value = duckdb_get_map_value; + result.duckdb_is_null_value = duckdb_is_null_value; + result.duckdb_create_null_value = duckdb_create_null_value; + result.duckdb_get_list_size = duckdb_get_list_size; + result.duckdb_get_list_child = duckdb_get_list_child; + result.duckdb_create_enum_value = duckdb_create_enum_value; + result.duckdb_get_enum_value = duckdb_get_enum_value; + result.duckdb_get_struct_child = duckdb_get_struct_child; result.duckdb_create_logical_type = duckdb_create_logical_type; result.duckdb_logical_type_get_alias = duckdb_logical_type_get_alias; + result.duckdb_logical_type_set_alias = duckdb_logical_type_set_alias; result.duckdb_create_list_type = duckdb_create_list_type; result.duckdb_create_array_type = duckdb_create_array_type; result.duckdb_create_map_type = duckdb_create_map_type; @@ -592,7 +652,7 @@ inline duckdb_ext_api_v0 CreateAPIv0() { result.duckdb_union_type_member_name = duckdb_union_type_member_name; result.duckdb_union_type_member_type = duckdb_union_type_member_type; result.duckdb_destroy_logical_type = duckdb_destroy_logical_type; - result.duckdb_fetch_chunk = duckdb_fetch_chunk; + result.duckdb_register_logical_type = duckdb_register_logical_type; result.duckdb_create_data_chunk = duckdb_create_data_chunk; result.duckdb_destroy_data_chunk = duckdb_destroy_data_chunk; result.duckdb_data_chunk_reset = duckdb_data_chunk_reset; @@ -619,11 +679,36 @@ inline duckdb_ext_api_v0 CreateAPIv0() { result.duckdb_create_scalar_function = duckdb_create_scalar_function; result.duckdb_destroy_scalar_function = duckdb_destroy_scalar_function; result.duckdb_scalar_function_set_name = duckdb_scalar_function_set_name; + result.duckdb_scalar_function_set_varargs = duckdb_scalar_function_set_varargs; + result.duckdb_scalar_function_set_special_handling = duckdb_scalar_function_set_special_handling; + result.duckdb_scalar_function_set_volatile = duckdb_scalar_function_set_volatile; result.duckdb_scalar_function_add_parameter = duckdb_scalar_function_add_parameter; result.duckdb_scalar_function_set_return_type = duckdb_scalar_function_set_return_type; result.duckdb_scalar_function_set_extra_info = duckdb_scalar_function_set_extra_info; result.duckdb_scalar_function_set_function = duckdb_scalar_function_set_function; result.duckdb_register_scalar_function = duckdb_register_scalar_function; + result.duckdb_scalar_function_get_extra_info = duckdb_scalar_function_get_extra_info; + result.duckdb_scalar_function_set_error = duckdb_scalar_function_set_error; + result.duckdb_create_scalar_function_set = duckdb_create_scalar_function_set; + result.duckdb_destroy_scalar_function_set = duckdb_destroy_scalar_function_set; + result.duckdb_add_scalar_function_to_set = duckdb_add_scalar_function_to_set; + result.duckdb_register_scalar_function_set = duckdb_register_scalar_function_set; + result.duckdb_create_aggregate_function = duckdb_create_aggregate_function; + result.duckdb_destroy_aggregate_function = duckdb_destroy_aggregate_function; + result.duckdb_aggregate_function_set_name = duckdb_aggregate_function_set_name; + result.duckdb_aggregate_function_add_parameter = duckdb_aggregate_function_add_parameter; + result.duckdb_aggregate_function_set_return_type = duckdb_aggregate_function_set_return_type; + result.duckdb_aggregate_function_set_functions = duckdb_aggregate_function_set_functions; + 
result.duckdb_aggregate_function_set_destructor = duckdb_aggregate_function_set_destructor; + result.duckdb_register_aggregate_function = duckdb_register_aggregate_function; + result.duckdb_aggregate_function_set_special_handling = duckdb_aggregate_function_set_special_handling; + result.duckdb_aggregate_function_set_extra_info = duckdb_aggregate_function_set_extra_info; + result.duckdb_aggregate_function_get_extra_info = duckdb_aggregate_function_get_extra_info; + result.duckdb_aggregate_function_set_error = duckdb_aggregate_function_set_error; + result.duckdb_create_aggregate_function_set = duckdb_create_aggregate_function_set; + result.duckdb_destroy_aggregate_function_set = duckdb_destroy_aggregate_function_set; + result.duckdb_add_aggregate_function_to_set = duckdb_add_aggregate_function_to_set; + result.duckdb_register_aggregate_function_set = duckdb_register_aggregate_function_set; result.duckdb_create_table_function = duckdb_create_table_function; result.duckdb_destroy_table_function = duckdb_destroy_table_function; result.duckdb_table_function_set_name = duckdb_table_function_set_name; @@ -660,13 +745,61 @@ inline duckdb_ext_api_v0 CreateAPIv0() { result.duckdb_replacement_scan_set_function_name = duckdb_replacement_scan_set_function_name; result.duckdb_replacement_scan_add_parameter = duckdb_replacement_scan_add_parameter; result.duckdb_replacement_scan_set_error = duckdb_replacement_scan_set_error; + result.duckdb_profiling_info_get_metrics = duckdb_profiling_info_get_metrics; + result.duckdb_profiling_info_get_child_count = duckdb_profiling_info_get_child_count; + result.duckdb_profiling_info_get_child = duckdb_profiling_info_get_child; result.duckdb_appender_create = duckdb_appender_create; + result.duckdb_appender_create_ext = duckdb_appender_create_ext; result.duckdb_appender_column_count = duckdb_appender_column_count; result.duckdb_appender_column_type = duckdb_appender_column_type; result.duckdb_appender_error = duckdb_appender_error; result.duckdb_appender_flush = duckdb_appender_flush; result.duckdb_appender_close = duckdb_appender_close; result.duckdb_appender_destroy = duckdb_appender_destroy; + result.duckdb_appender_add_column = duckdb_appender_add_column; + result.duckdb_appender_clear_columns = duckdb_appender_clear_columns; + result.duckdb_append_data_chunk = duckdb_append_data_chunk; + result.duckdb_table_description_create = duckdb_table_description_create; + result.duckdb_table_description_create_ext = duckdb_table_description_create_ext; + result.duckdb_table_description_destroy = duckdb_table_description_destroy; + result.duckdb_table_description_error = duckdb_table_description_error; + result.duckdb_column_has_default = duckdb_column_has_default; + result.duckdb_table_description_get_column_name = duckdb_table_description_get_column_name; + result.duckdb_execute_tasks = duckdb_execute_tasks; + result.duckdb_create_task_state = duckdb_create_task_state; + result.duckdb_execute_tasks_state = duckdb_execute_tasks_state; + result.duckdb_execute_n_tasks_state = duckdb_execute_n_tasks_state; + result.duckdb_finish_execution = duckdb_finish_execution; + result.duckdb_task_state_is_finished = duckdb_task_state_is_finished; + result.duckdb_destroy_task_state = duckdb_destroy_task_state; + result.duckdb_execution_is_finished = duckdb_execution_is_finished; + result.duckdb_fetch_chunk = duckdb_fetch_chunk; + result.duckdb_create_cast_function = duckdb_create_cast_function; + result.duckdb_cast_function_set_source_type = duckdb_cast_function_set_source_type; + 
result.duckdb_cast_function_set_target_type = duckdb_cast_function_set_target_type; + result.duckdb_cast_function_set_implicit_cast_cost = duckdb_cast_function_set_implicit_cast_cost; + result.duckdb_cast_function_set_function = duckdb_cast_function_set_function; + result.duckdb_cast_function_set_extra_info = duckdb_cast_function_set_extra_info; + result.duckdb_cast_function_get_extra_info = duckdb_cast_function_get_extra_info; + result.duckdb_cast_function_get_cast_mode = duckdb_cast_function_get_cast_mode; + result.duckdb_cast_function_set_error = duckdb_cast_function_set_error; + result.duckdb_cast_function_set_row_error = duckdb_cast_function_set_row_error; + result.duckdb_register_cast_function = duckdb_register_cast_function; + result.duckdb_destroy_cast_function = duckdb_destroy_cast_function; + result.duckdb_is_finite_timestamp_s = duckdb_is_finite_timestamp_s; + result.duckdb_is_finite_timestamp_ms = duckdb_is_finite_timestamp_ms; + result.duckdb_is_finite_timestamp_ns = duckdb_is_finite_timestamp_ns; + result.duckdb_create_timestamp_tz = duckdb_create_timestamp_tz; + result.duckdb_create_timestamp_s = duckdb_create_timestamp_s; + result.duckdb_create_timestamp_ms = duckdb_create_timestamp_ms; + result.duckdb_create_timestamp_ns = duckdb_create_timestamp_ns; + result.duckdb_get_timestamp_tz = duckdb_get_timestamp_tz; + result.duckdb_get_timestamp_s = duckdb_get_timestamp_s; + result.duckdb_get_timestamp_ms = duckdb_get_timestamp_ms; + result.duckdb_get_timestamp_ns = duckdb_get_timestamp_ns; + result.duckdb_append_value = duckdb_append_value; + result.duckdb_get_profiling_info = duckdb_get_profiling_info; + result.duckdb_profiling_info_get_value = duckdb_profiling_info_get_value; result.duckdb_appender_begin_row = duckdb_appender_begin_row; result.duckdb_appender_end_row = duckdb_appender_end_row; result.duckdb_append_default = duckdb_append_default; @@ -691,112 +824,12 @@ inline duckdb_ext_api_v0 CreateAPIv0() { result.duckdb_append_varchar_length = duckdb_append_varchar_length; result.duckdb_append_blob = duckdb_append_blob; result.duckdb_append_null = duckdb_append_null; - result.duckdb_append_data_chunk = duckdb_append_data_chunk; - result.duckdb_execute_tasks = duckdb_execute_tasks; - result.duckdb_create_task_state = duckdb_create_task_state; - result.duckdb_execute_tasks_state = duckdb_execute_tasks_state; - result.duckdb_execute_n_tasks_state = duckdb_execute_n_tasks_state; - result.duckdb_finish_execution = duckdb_finish_execution; - result.duckdb_task_state_is_finished = duckdb_task_state_is_finished; - result.duckdb_destroy_task_state = duckdb_destroy_task_state; - result.duckdb_execution_is_finished = duckdb_execution_is_finished; - result.duckdb_get_profiling_info = duckdb_get_profiling_info; - result.duckdb_profiling_info_get_value = duckdb_profiling_info_get_value; - result.duckdb_profiling_info_get_child_count = duckdb_profiling_info_get_child_count; - result.duckdb_profiling_info_get_child = duckdb_profiling_info_get_child; - result.duckdb_profiling_info_get_metrics = duckdb_profiling_info_get_metrics; - result.duckdb_scalar_function_set_varargs = duckdb_scalar_function_set_varargs; - result.duckdb_scalar_function_set_special_handling = duckdb_scalar_function_set_special_handling; - result.duckdb_scalar_function_set_volatile = duckdb_scalar_function_set_volatile; - result.duckdb_scalar_function_get_extra_info = duckdb_scalar_function_get_extra_info; - result.duckdb_scalar_function_set_error = duckdb_scalar_function_set_error; - 
result.duckdb_table_description_create = duckdb_table_description_create; - result.duckdb_table_description_destroy = duckdb_table_description_destroy; - result.duckdb_table_description_error = duckdb_table_description_error; - result.duckdb_result_error_type = duckdb_result_error_type; - result.duckdb_string_t_length = duckdb_string_t_length; - result.duckdb_string_t_data = duckdb_string_t_data; - result.duckdb_create_bool = duckdb_create_bool; - result.duckdb_create_int8 = duckdb_create_int8; - result.duckdb_create_uint8 = duckdb_create_uint8; - result.duckdb_create_int16 = duckdb_create_int16; - result.duckdb_create_uint16 = duckdb_create_uint16; - result.duckdb_create_int32 = duckdb_create_int32; - result.duckdb_create_uint32 = duckdb_create_uint32; - result.duckdb_create_uint64 = duckdb_create_uint64; - result.duckdb_create_hugeint = duckdb_create_hugeint; - result.duckdb_create_uhugeint = duckdb_create_uhugeint; - result.duckdb_create_float = duckdb_create_float; - result.duckdb_create_double = duckdb_create_double; - result.duckdb_create_date = duckdb_create_date; - result.duckdb_create_time = duckdb_create_time; - result.duckdb_create_time_tz_value = duckdb_create_time_tz_value; - result.duckdb_create_timestamp = duckdb_create_timestamp; - result.duckdb_create_interval = duckdb_create_interval; - result.duckdb_create_blob = duckdb_create_blob; - result.duckdb_get_bool = duckdb_get_bool; - result.duckdb_get_int8 = duckdb_get_int8; - result.duckdb_get_uint8 = duckdb_get_uint8; - result.duckdb_get_int16 = duckdb_get_int16; - result.duckdb_get_uint16 = duckdb_get_uint16; - result.duckdb_get_int32 = duckdb_get_int32; - result.duckdb_get_uint32 = duckdb_get_uint32; - result.duckdb_get_uint64 = duckdb_get_uint64; - result.duckdb_get_hugeint = duckdb_get_hugeint; - result.duckdb_get_uhugeint = duckdb_get_uhugeint; - result.duckdb_get_float = duckdb_get_float; - result.duckdb_get_double = duckdb_get_double; - result.duckdb_get_date = duckdb_get_date; - result.duckdb_get_time = duckdb_get_time; - result.duckdb_get_time_tz = duckdb_get_time_tz; - result.duckdb_get_timestamp = duckdb_get_timestamp; - result.duckdb_get_interval = duckdb_get_interval; - result.duckdb_get_value_type = duckdb_get_value_type; - result.duckdb_get_blob = duckdb_get_blob; - result.duckdb_create_scalar_function_set = duckdb_create_scalar_function_set; - result.duckdb_destroy_scalar_function_set = duckdb_destroy_scalar_function_set; - result.duckdb_add_scalar_function_to_set = duckdb_add_scalar_function_to_set; - result.duckdb_register_scalar_function_set = duckdb_register_scalar_function_set; - result.duckdb_create_aggregate_function_set = duckdb_create_aggregate_function_set; - result.duckdb_destroy_aggregate_function_set = duckdb_destroy_aggregate_function_set; - result.duckdb_add_aggregate_function_to_set = duckdb_add_aggregate_function_to_set; - result.duckdb_register_aggregate_function_set = duckdb_register_aggregate_function_set; - result.duckdb_get_map_size = duckdb_get_map_size; - result.duckdb_get_map_key = duckdb_get_map_key; - result.duckdb_get_map_value = duckdb_get_map_value; - result.duckdb_create_aggregate_function = duckdb_create_aggregate_function; - result.duckdb_destroy_aggregate_function = duckdb_destroy_aggregate_function; - result.duckdb_aggregate_function_set_name = duckdb_aggregate_function_set_name; - result.duckdb_aggregate_function_add_parameter = duckdb_aggregate_function_add_parameter; - result.duckdb_aggregate_function_set_return_type = duckdb_aggregate_function_set_return_type; - 
result.duckdb_aggregate_function_set_functions = duckdb_aggregate_function_set_functions; - result.duckdb_aggregate_function_set_destructor = duckdb_aggregate_function_set_destructor; - result.duckdb_register_aggregate_function = duckdb_register_aggregate_function; - result.duckdb_aggregate_function_set_special_handling = duckdb_aggregate_function_set_special_handling; - result.duckdb_aggregate_function_set_extra_info = duckdb_aggregate_function_set_extra_info; - result.duckdb_aggregate_function_get_extra_info = duckdb_aggregate_function_get_extra_info; - result.duckdb_aggregate_function_set_error = duckdb_aggregate_function_set_error; - result.duckdb_logical_type_set_alias = duckdb_logical_type_set_alias; - result.duckdb_register_logical_type = duckdb_register_logical_type; - result.duckdb_create_cast_function = duckdb_create_cast_function; - result.duckdb_cast_function_set_source_type = duckdb_cast_function_set_source_type; - result.duckdb_cast_function_set_target_type = duckdb_cast_function_set_target_type; - result.duckdb_cast_function_set_implicit_cast_cost = duckdb_cast_function_set_implicit_cast_cost; - result.duckdb_cast_function_set_function = duckdb_cast_function_set_function; - result.duckdb_cast_function_set_extra_info = duckdb_cast_function_set_extra_info; - result.duckdb_cast_function_get_extra_info = duckdb_cast_function_get_extra_info; - result.duckdb_cast_function_get_cast_mode = duckdb_cast_function_get_cast_mode; - result.duckdb_cast_function_set_error = duckdb_cast_function_set_error; - result.duckdb_cast_function_set_row_error = duckdb_cast_function_set_row_error; - result.duckdb_register_cast_function = duckdb_register_cast_function; - result.duckdb_destroy_cast_function = duckdb_destroy_cast_function; result.duckdb_row_count = duckdb_row_count; result.duckdb_column_data = duckdb_column_data; result.duckdb_nullmask_data = duckdb_nullmask_data; result.duckdb_result_get_chunk = duckdb_result_get_chunk; result.duckdb_result_is_streaming = duckdb_result_is_streaming; result.duckdb_result_chunk_count = duckdb_result_chunk_count; - result.duckdb_result_return_type = duckdb_result_return_type; result.duckdb_value_boolean = duckdb_value_boolean; result.duckdb_value_int8 = duckdb_value_int8; result.duckdb_value_int16 = duckdb_value_int16; @@ -823,7 +856,6 @@ inline duckdb_ext_api_v0 CreateAPIv0() { result.duckdb_value_is_null = duckdb_value_is_null; result.duckdb_execute_prepared_streaming = duckdb_execute_prepared_streaming; result.duckdb_pending_prepared_streaming = duckdb_pending_prepared_streaming; - result.duckdb_column_has_default = duckdb_column_has_default; result.duckdb_query_arrow = duckdb_query_arrow; result.duckdb_query_arrow_schema = duckdb_query_arrow_schema; result.duckdb_prepared_arrow_schema = duckdb_prepared_arrow_schema; @@ -839,43 +871,10 @@ inline duckdb_ext_api_v0 CreateAPIv0() { result.duckdb_arrow_scan = duckdb_arrow_scan; result.duckdb_arrow_array_scan = duckdb_arrow_array_scan; result.duckdb_stream_fetch_chunk = duckdb_stream_fetch_chunk; - result.duckdb_appender_create_ext = duckdb_appender_create_ext; - result.duckdb_table_description_create_ext = duckdb_table_description_create_ext; - result.duckdb_table_description_get_column_name = duckdb_table_description_get_column_name; - result.duckdb_param_logical_type = duckdb_param_logical_type; - result.duckdb_is_null_value = duckdb_is_null_value; - result.duckdb_create_null_value = duckdb_create_null_value; - result.duckdb_get_list_size = duckdb_get_list_size; - result.duckdb_get_list_child = 
duckdb_get_list_child; - result.duckdb_create_enum_value = duckdb_create_enum_value; - result.duckdb_get_enum_value = duckdb_get_enum_value; - result.duckdb_get_struct_child = duckdb_get_struct_child; - result.duckdb_appender_add_column = duckdb_appender_add_column; - result.duckdb_appender_clear_columns = duckdb_appender_clear_columns; - result.duckdb_is_finite_timestamp_s = duckdb_is_finite_timestamp_s; - result.duckdb_is_finite_timestamp_ms = duckdb_is_finite_timestamp_ms; - result.duckdb_is_finite_timestamp_ns = duckdb_is_finite_timestamp_ns; - result.duckdb_create_timestamp_tz = duckdb_create_timestamp_tz; - result.duckdb_create_timestamp_s = duckdb_create_timestamp_s; - result.duckdb_create_timestamp_ms = duckdb_create_timestamp_ms; - result.duckdb_create_timestamp_ns = duckdb_create_timestamp_ns; - result.duckdb_get_timestamp_tz = duckdb_get_timestamp_tz; - result.duckdb_get_timestamp_s = duckdb_get_timestamp_s; - result.duckdb_get_timestamp_ms = duckdb_get_timestamp_ms; - result.duckdb_get_timestamp_ns = duckdb_get_timestamp_ns; - result.duckdb_append_value = duckdb_append_value; - result.duckdb_create_varint = duckdb_create_varint; - result.duckdb_create_decimal = duckdb_create_decimal; - result.duckdb_create_bit = duckdb_create_bit; - result.duckdb_create_uuid = duckdb_create_uuid; - result.duckdb_get_varint = duckdb_get_varint; - result.duckdb_get_decimal = duckdb_get_decimal; - result.duckdb_get_bit = duckdb_get_bit; - result.duckdb_get_uuid = duckdb_get_uuid; return result; } -#define DUCKDB_EXTENSION_API_VERSION_MAJOR 0 -#define DUCKDB_EXTENSION_API_VERSION_MINOR 0 -#define DUCKDB_EXTENSION_API_VERSION_PATCH 1 -#define DUCKDB_EXTENSION_API_VERSION_STRING "v0.0.1" +#define DUCKDB_EXTENSION_API_VERSION_MAJOR 1 +#define DUCKDB_EXTENSION_API_VERSION_MINOR 2 +#define DUCKDB_EXTENSION_API_VERSION_PATCH 0 +#define DUCKDB_EXTENSION_API_VERSION_STRING "v1.2.0" diff --git a/src/duckdb/src/include/duckdb/main/client_data.hpp b/src/duckdb/src/include/duckdb/main/client_data.hpp index 77785975..b3e32626 100644 --- a/src/duckdb/src/include/duckdb/main/client_data.hpp +++ b/src/duckdb/src/include/duckdb/main/client_data.hpp @@ -27,7 +27,7 @@ class QueryProfiler; class PreparedStatementData; class SchemaCatalogEntry; class HTTPLogger; -struct RandomEngine; +class RandomEngine; struct ClientData { explicit ClientData(ClientContext &context); diff --git a/src/duckdb/src/include/duckdb/main/database.hpp b/src/duckdb/src/include/duckdb/main/database.hpp index 2a6fffa9..12bb6fbe 100644 --- a/src/duckdb/src/include/duckdb/main/database.hpp +++ b/src/duckdb/src/include/duckdb/main/database.hpp @@ -59,7 +59,7 @@ class DatabaseInstance : public enable_shared_from_this { DUCKDB_API ValidChecker &GetValidChecker(); DUCKDB_API void SetExtensionLoaded(const string &extension_name, ExtensionInstallInfo &install_info); - DUCKDB_API const duckdb_ext_api_v0 GetExtensionAPIV0(); + DUCKDB_API const duckdb_ext_api_v1 GetExtensionAPIV1(); idx_t NumberOfThreads(); @@ -95,7 +95,7 @@ class DatabaseInstance : public enable_shared_from_this { unique_ptr db_file_system; shared_ptr db_cache_entry; - duckdb_ext_api_v0 (*create_api_v0)(); + duckdb_ext_api_v1 (*create_api_v1)(); }; //! The database object. 
This object holds the catalog and all the diff --git a/src/duckdb/src/include/duckdb/main/extension.hpp b/src/duckdb/src/include/duckdb/main/extension.hpp index 15408048..53de1481 100644 --- a/src/duckdb/src/include/duckdb/main/extension.hpp +++ b/src/duckdb/src/include/duckdb/main/extension.hpp @@ -28,8 +28,12 @@ class Extension { enum class ExtensionABIType : uint8_t { UNKNOWN = 0, + //! Uses the C++ ABI; the version needs to match precisely CPP = 1, + //! Uses the C ABI via the duckdb_ext_api_v1 struct; the version needs to be equal or higher C_STRUCT = 2, + //! Uses the C ABI via the duckdb_ext_api_v1 struct including "unstable" functions; the version needs to match precisely + C_STRUCT_UNSTABLE = 3 }; //! The parsed extension metadata footer @@ -44,9 +48,11 @@ struct ParsedExtensionMetaData { ExtensionABIType abi_type; string platform; - // (only for ExtensionABIType::CPP) the DuckDB version this extension is compiled for + // (For ExtensionABIType::CPP or ExtensionABIType::C_STRUCT_UNSTABLE) the DuckDB version this extension is compiled + // for string duckdb_version; - // (only for ExtensionABIType::C_STRUCT) the CAPI version of the C_STRUCT + // (only for ExtensionABIType::C_STRUCT) the CAPI version of the C_STRUCT (currently interpreted as the minimum + // DuckDB version) string duckdb_capi_version; string extension_version; string signature; diff --git a/src/duckdb/src/include/duckdb/main/extension_helper.hpp b/src/duckdb/src/include/duckdb/main/extension_helper.hpp index e1f3f7f3..829a225b 100644 --- a/src/duckdb/src/include/duckdb/main/extension_helper.hpp +++ b/src/duckdb/src/include/duckdb/main/extension_helper.hpp @@ -35,6 +35,7 @@ struct ExtensionAlias { struct ExtensionInitResult { string filename; string filebase; + ExtensionABIType abi_type = ExtensionABIType::UNKNOWN; // The deserialized install from the `.duckdb_extension.info` file unique_ptr install_info; diff --git a/src/duckdb/src/include/duckdb/planner/operator/logical_insert.hpp b/src/duckdb/src/include/duckdb/planner/operator/logical_insert.hpp index c8eb3997..c20c33a5 100644 --- a/src/duckdb/src/include/duckdb/planner/operator/logical_insert.hpp +++ b/src/duckdb/src/include/duckdb/planner/operator/logical_insert.hpp @@ -60,6 +60,8 @@ class LogicalInsert : public LogicalOperator { vector columns_to_fetch; // The columns to fetch from the 'source' table vector source_columns; + //! True, if the INSERT OR REPLACE requires delete + insert.
+ bool update_is_del_and_insert; public: void Serialize(Serializer &serializer) const override; diff --git a/src/duckdb/src/include/duckdb/storage/compression/dictionary/analyze.hpp b/src/duckdb/src/include/duckdb/storage/compression/dictionary/analyze.hpp new file mode 100644 index 00000000..99eb7215 --- /dev/null +++ b/src/duckdb/src/include/duckdb/storage/compression/dictionary/analyze.hpp @@ -0,0 +1,46 @@ +#pragma once + +#include "duckdb/storage/compression/dictionary/common.hpp" +#include "duckdb/common/string_map_set.hpp" +#include "duckdb/storage/table/column_data.hpp" + +namespace duckdb { + +//===--------------------------------------------------------------------===// +// Analyze +//===--------------------------------------------------------------------===// +struct DictionaryAnalyzeState : public DictionaryCompressionState { +public: + explicit DictionaryAnalyzeState(const CompressionInfo &info); + +public: + bool LookupString(string_t str) override; + void AddNewString(string_t str) override; + void AddLastLookup() override; + void AddNull() override; + bool CalculateSpaceRequirements(bool new_string, idx_t string_size) override; + void Flush(bool final = false) override; + void Verify() override; + +public: + idx_t segment_count; + idx_t current_tuple_count; + idx_t current_unique_count; + idx_t current_dict_size; + StringHeap heap; + string_set_t current_set; + bitpacking_width_t current_width; + bitpacking_width_t next_width; +}; + +struct DictionaryCompressionAnalyzeState : public AnalyzeState { +public: + explicit DictionaryCompressionAnalyzeState(const CompressionInfo &info) + : AnalyzeState(info), analyze_state(make_uniq(info)) { + } + +public: + unique_ptr analyze_state; +}; + +} // namespace duckdb diff --git a/src/duckdb/src/include/duckdb/storage/compression/dictionary/common.hpp b/src/duckdb/src/include/duckdb/storage/compression/dictionary/common.hpp new file mode 100644 index 00000000..79bd094b --- /dev/null +++ b/src/duckdb/src/include/duckdb/storage/compression/dictionary/common.hpp @@ -0,0 +1,60 @@ +#pragma once + +#include "duckdb/common/typedefs.hpp" +#include "duckdb/function/compression_function.hpp" +#include "duckdb/common/bitpacking.hpp" +#include "duckdb/storage/string_uncompressed.hpp" + +namespace duckdb { + +typedef struct { + uint32_t dict_size; + uint32_t dict_end; + uint32_t index_buffer_offset; + uint32_t index_buffer_count; + uint32_t bitpacking_width; +} dictionary_compression_header_t; + +struct DictionaryCompression { +public: + static constexpr float MINIMUM_COMPRESSION_RATIO = 1.2F; + //! Dictionary header size at the beginning of the string segment (offset + length) + static constexpr uint16_t DICTIONARY_HEADER_SIZE = sizeof(dictionary_compression_header_t); + +public: + static bool HasEnoughSpace(idx_t current_count, idx_t index_count, idx_t dict_size, + bitpacking_width_t packing_width, const idx_t block_size); + static idx_t RequiredSpace(idx_t current_count, idx_t index_count, idx_t dict_size, + bitpacking_width_t packing_width); + + static StringDictionaryContainer GetDictionary(ColumnSegment &segment, BufferHandle &handle); + static void SetDictionary(ColumnSegment &segment, BufferHandle &handle, StringDictionaryContainer container); +}; + +//! Abstract class managing the compression state for size analysis or compression. 
+class DictionaryCompressionState : public CompressionState { +public: + explicit DictionaryCompressionState(const CompressionInfo &info); + ~DictionaryCompressionState() override; + +public: + bool UpdateState(Vector &scan_vector, idx_t count); + +protected: + // Verifies the state + virtual void Verify() = 0; + // Performs a lookup of str, storing the result internally + virtual bool LookupString(string_t str) = 0; + // Add the most recently looked-up str to the compression state + virtual void AddLastLookup() = 0; + // Add a string to the state that is known not to have been seen yet + virtual void AddNewString(string_t str) = 0; + // Add a null value to the compression state + virtual void AddNull() = 0; + // Needs to be called before adding a value. Will return false if a flush is required first. + virtual bool CalculateSpaceRequirements(bool new_string, idx_t string_size) = 0; + // Flush the segment to disk if compressing or reset the counters if analyzing + virtual void Flush(bool final = false) = 0; +}; + +} // namespace duckdb diff --git a/src/duckdb/src/include/duckdb/storage/compression/dictionary/compression.hpp b/src/duckdb/src/include/duckdb/storage/compression/dictionary/compression.hpp new file mode 100644 index 00000000..1ffdb494 --- /dev/null +++ b/src/duckdb/src/include/duckdb/storage/compression/dictionary/compression.hpp @@ -0,0 +1,62 @@ +#pragma once + +#include "duckdb/common/typedefs.hpp" +#include "duckdb/storage/compression/dictionary/common.hpp" +#include "duckdb/function/compression_function.hpp" +#include "duckdb/common/string_map_set.hpp" +#include "duckdb/storage/table/column_data_checkpointer.hpp" + +namespace duckdb { + +// Dictionary compression uses a combination of bitpacking and a dictionary to compress string segments. The data is +// stored across three buffers: the index buffer, the selection buffer and the dictionary. First, the index buffer +// contains the offsets into the dictionary, which are also used to determine the string lengths. Each value in the +// dictionary gets a single unique index in the index buffer. Second, the selection buffer maps each tuple to an index +// in the index buffer. The selection buffer is compressed with bitpacking. Finally, the dictionary simply contains all +// the unique strings, without lengths or null termination, as the lengths can be deduced from the index buffer. The +// selection buffer exists for two reasons: first, it allows the scan to emit dictionary vectors by scanning the whole +// dictionary at once and then scanning only the selection buffer for each emitted vector. Second, it enables efficient +// bitpacking compression, as the selection values should remain relatively small.
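To make the three-buffer layout described in the comment above concrete, here is a small decoding sketch. It is illustrative only: the names (DictSegmentView, FetchString) are hypothetical, the selection buffer is shown already unpacked from its bitpacked form, and the end-offset convention for the index buffer is an assumption based on the comment (offsets doubling as length information), not a statement about the exact on-disk layout.

	// Hypothetical decoding sketch for a dictionary-compressed string segment; not DuckDB code.
	#include <cstdint>
	#include <string>
	#include <vector>

	struct DictSegmentView {
		std::vector<char> dictionary;       // all unique strings, back to back, no terminators
		std::vector<uint32_t> index_buffer; // assumed: end offset of each unique string
		std::vector<uint32_t> selection;    // one entry per tuple (bitpacked on disk)
	};

	// Fetch tuple `row`: selection buffer -> index buffer -> dictionary slice.
	inline std::string FetchString(const DictSegmentView &seg, size_t row) {
		uint32_t dict_idx = seg.selection[row];                         // which unique string
		uint32_t end = seg.index_buffer[dict_idx];                      // end offset in the dictionary
		uint32_t begin = dict_idx == 0 ? 0 : seg.index_buffer[dict_idx - 1];
		return std::string(seg.dictionary.data() + begin, end - begin); // length deduced from adjacent offsets
	}

This also shows why the scan can emit dictionary vectors cheaply: the dictionary is materialized once per segment, and the selection buffer then acts as a selection vector over it, so no per-tuple string copies are needed.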
+ +//===--------------------------------------------------------------------===// +// Compress +//===--------------------------------------------------------------------===// +struct DictionaryCompressionCompressState : public DictionaryCompressionState { +public: + DictionaryCompressionCompressState(ColumnDataCheckpointer &checkpointer_p, const CompressionInfo &info); + +public: + void CreateEmptySegment(idx_t row_start); + void Verify() override; + bool LookupString(string_t str) override; + void AddNewString(string_t str) override; + void AddNull() override; + void AddLastLookup() override; + bool CalculateSpaceRequirements(bool new_string, idx_t string_size) override; + void Flush(bool final = false) override; + idx_t Finalize(); + +public: + ColumnDataCheckpointer &checkpointer; + CompressionFunction &function; + + // State regarding current segment + unique_ptr current_segment; + BufferHandle current_handle; + StringDictionaryContainer current_dictionary; + data_ptr_t current_end_ptr; + + // Buffers and map for current segment + StringHeap heap; + string_map_t current_string_map; + vector index_buffer; + vector selection_buffer; + + bitpacking_width_t current_width = 0; + bitpacking_width_t next_width = 0; + + // Result of latest LookupString call + uint32_t latest_lookup_result; +}; + +} // namespace duckdb diff --git a/src/duckdb/src/include/duckdb/storage/compression/dictionary/decompression.hpp b/src/duckdb/src/include/duckdb/storage/compression/dictionary/decompression.hpp new file mode 100644 index 00000000..1656ec71 --- /dev/null +++ b/src/duckdb/src/include/duckdb/storage/compression/dictionary/decompression.hpp @@ -0,0 +1,50 @@ +#pragma once + +#include "duckdb/storage/compression/dictionary/common.hpp" + +namespace duckdb { + +//===--------------------------------------------------------------------===// +// Scan +//===--------------------------------------------------------------------===// +// FIXME: why is this StringScanState when we also define: `BufferHandle handle` ??? +struct CompressedStringScanState : public StringScanState { +public: + explicit CompressedStringScanState(BufferHandle &&handle_p) + : StringScanState(), owned_handle(std::move(handle_p)), handle(owned_handle) { + } + explicit CompressedStringScanState(BufferHandle &handle_p) : StringScanState(), owned_handle(), handle(handle_p) { + } + +public: + void Initialize(ColumnSegment &segment, bool initialize_dictionary = true); + void ScanToFlatVector(Vector &result, idx_t result_offset, idx_t start, idx_t scan_count); + void ScanToDictionaryVector(ColumnSegment &segment, Vector &result, idx_t result_offset, idx_t start, + idx_t scan_count); + +private: + string_t FetchStringFromDict(int32_t dict_offset, uint16_t string_len); + uint16_t GetStringLength(sel_t index); + +public: + BufferHandle owned_handle; + optional_ptr handle; + + bitpacking_width_t current_width; + buffer_ptr sel_vec; + idx_t sel_vec_size = 0; + + //! Start of the block (pointing to the dictionary_header) + data_ptr_t baseptr; + //! 
Start of the data (pointing to the start of the selection buffer) + data_ptr_t base_data; + uint32_t *index_buffer_ptr; + uint32_t index_buffer_count; + + buffer_ptr dictionary; + idx_t dictionary_size; + StringDictionaryContainer dict; + idx_t block_size; +}; + +} // namespace duckdb diff --git a/src/duckdb/src/include/duckdb/storage/compression/roaring/appender.hpp b/src/duckdb/src/include/duckdb/storage/compression/roaring/appender.hpp index 2928f2fd..31972d4c 100644 --- a/src/duckdb/src/include/duckdb/storage/compression/roaring/appender.hpp +++ b/src/duckdb/src/include/duckdb/storage/compression/roaring/appender.hpp @@ -69,10 +69,45 @@ struct RoaringStateAppender { idx_t to_append = MinValue(ROARING_CONTAINER_SIZE - STATE_TYPE::Count(state), input_size - appended); + // Deal with a ragged start, when 'appended' isn't a multiple of ValidityMask::BITS_PER_VALUE + // then we need to process the remainder of the entry + idx_t entry_offset = appended / ValidityMask::BITS_PER_VALUE; + auto previous_entry_remainder = appended % ValidityMask::BITS_PER_VALUE; + if (DUCKDB_UNLIKELY(previous_entry_remainder != 0)) { + auto validity_entry = validity.GetValidityEntry(entry_offset); + + idx_t to_process = ValidityMask::BITS_PER_VALUE - previous_entry_remainder; + validity_t mask; + if (to_append < to_process) { + // Limit the amount of bits to set + auto left_bits = to_process - to_append; + to_process = to_append; + mask = ValidityUncompressed::UPPER_MASKS[to_process] >> left_bits; + } else { + mask = ValidityUncompressed::UPPER_MASKS[to_process]; + } + + auto masked_entry = (validity_entry & mask) >> previous_entry_remainder; + if (masked_entry == ValidityUncompressed::LOWER_MASKS[to_process]) { + // All bits are set + STATE_TYPE::HandleAllValid(state, to_process); + } else if (masked_entry == 0) { + // None of the bits are set + STATE_TYPE::HandleNoneValid(state, to_process); + } else { + // Mixed set/unset bits + AppendBytes(state, masked_entry, to_process); + } + + to_append -= to_process; + appended += to_process; + entry_offset++; + } + auto entry_count = to_append / ValidityMask::BITS_PER_VALUE; for (idx_t entry_index = 0; entry_index < entry_count; entry_index++) { // get the validity entry at this index - auto validity_entry = validity.GetValidityEntry(entry_index); + auto validity_entry = validity.GetValidityEntry(entry_offset + entry_index); if (ValidityMask::AllValid(validity_entry)) { // All bits are set STATE_TYPE::HandleAllValid(state, ValidityMask::BITS_PER_VALUE); @@ -88,7 +123,7 @@ struct RoaringStateAppender { // Deal with a ragged end, when the validity entry isn't entirely used idx_t remainder = to_append % ValidityMask::BITS_PER_VALUE; if (DUCKDB_UNLIKELY(remainder != 0)) { - auto validity_entry = validity.GetValidityEntry(entry_count); + auto validity_entry = validity.GetValidityEntry(entry_offset + entry_count); auto masked = validity_entry & ValidityUncompressed::LOWER_MASKS[remainder]; if (masked == ValidityUncompressed::LOWER_MASKS[remainder]) { // All bits are set diff --git a/src/duckdb/src/include/duckdb/storage/data_table.hpp b/src/duckdb/src/include/duckdb/storage/data_table.hpp index 49c1d8cb..39795ed1 100644 --- a/src/duckdb/src/include/duckdb/storage/data_table.hpp +++ b/src/duckdb/src/include/duckdb/storage/data_table.hpp @@ -92,15 +92,21 @@ class DataTable { void Fetch(DuckTransaction &transaction, DataChunk &result, const vector &column_ids, const Vector &row_ids, idx_t fetch_count, ColumnFetchState &state); - //! 
Initializes an append to transaction-local storage + //! Initializes appending to transaction-local storage void InitializeLocalAppend(LocalAppendState &state, TableCatalogEntry &table, ClientContext &context, const vector> &bound_constraints); + //! Initializes only the delete-indexes of the transaction-local storage + void InitializeLocalStorage(LocalAppendState &state, TableCatalogEntry &table, ClientContext &context, + const vector> &bound_constraints); //! Append a DataChunk to the transaction-local storage of the table. - void LocalAppend(LocalAppendState &state, TableCatalogEntry &table, ClientContext &context, DataChunk &chunk, - bool unsafe = false); + void LocalAppend(LocalAppendState &state, ClientContext &context, DataChunk &chunk, bool unsafe); //! Finalizes a transaction-local append void FinalizeLocalAppend(LocalAppendState &state); - //! Append a chunk to the transaction-local storage of this table + //! Append a chunk to the transaction-local storage of this table and update the delete indexes. + void LocalAppend(TableCatalogEntry &table, ClientContext &context, DataChunk &chunk, + const vector> &bound_constraints, Vector &row_ids, + DataChunk &delete_chunk); + //! Append a chunk to the transaction-local storage of this table. void LocalAppend(TableCatalogEntry &table, ClientContext &context, DataChunk &chunk, const vector> &bound_constraints); //! Append a column data collection with default values to the transaction-local storage of this table. @@ -159,10 +165,11 @@ class DataTable { //! Merge a row group collection directly into this table - appending it to the end of the table without copying void MergeStorage(RowGroupCollection &data, TableIndexList &indexes, optional_ptr commit_state); - //! Append a chunk with the row ids [row_start, ..., row_start + chunk.size()] to all indexes of the table, returns - //! whether or not the append succeeded - ErrorData AppendToIndexes(DataChunk &chunk, row_t row_start); - static ErrorData AppendToIndexes(TableIndexList &indexes, DataChunk &chunk, row_t row_start); + //! Append a chunk with the row ids [row_start, ..., row_start + chunk.size()] to all indexes of the table. + //! Returns empty ErrorData, if the append was successful. + ErrorData AppendToIndexes(optional_ptr delete_indexes, DataChunk &chunk, row_t row_start); + static ErrorData AppendToIndexes(TableIndexList &indexes, optional_ptr delete_indexes, + DataChunk &chunk, row_t row_start); //! Remove a chunk with the row ids [row_start, ..., row_start + chunk.size()] from all indexes of the table void RemoveFromIndexes(TableAppendState &state, DataChunk &chunk, row_t row_start); //! Remove the chunk with the specified set of row identifiers from all indexes of the table @@ -180,6 +187,9 @@ class DataTable { //! Get statistics of a physical column within the table unique_ptr GetStatistics(ClientContext &context, column_t column_id); + + //! Get table sample + unique_ptr GetSample(); //! Sets statistics of a physical column within the table void SetDistinct(column_t column_id, unique_ptr distinct_stats); @@ -208,13 +218,14 @@ class DataTable { unique_ptr InitializeConstraintState(TableCatalogEntry &table, const vector> &bound_constraints); //! 
Verify constraints with a chunk from the Append containing all columns of the table - void VerifyAppendConstraints(ConstraintState &state, ClientContext &context, DataChunk &chunk, - optional_ptr conflict_manager = nullptr); + void VerifyAppendConstraints(ConstraintState &constraint_state, ClientContext &context, DataChunk &chunk, + optional_ptr local_storage, optional_ptr manager); shared_ptr &GetDataTableInfo(); void InitializeIndexes(ClientContext &context); bool HasIndexes() const; + bool HasUniqueIndexes() const; bool HasForeignKeyIndex(const vector &keys, ForeignKeyType type); void SetIndexStorageInfo(vector index_storage_info); void VacuumIndexes(); @@ -227,8 +238,8 @@ class DataTable { idx_t GetRowGroupSize() const; - static void VerifyUniqueIndexes(TableIndexList &indexes, ClientContext &context, DataChunk &chunk, - optional_ptr conflict_manager); + static void VerifyUniqueIndexes(TableIndexList &indexes, optional_ptr storage, DataChunk &chunk, + optional_ptr manager); //! AddIndex initializes an index and adds it to the table's index list. //! It is either empty, or initialized via its index storage information. diff --git a/src/duckdb/src/include/duckdb/storage/index.hpp b/src/duckdb/src/include/duckdb/storage/index.hpp index d838f60a..2b624c2c 100644 --- a/src/duckdb/src/include/duckdb/storage/index.hpp +++ b/src/duckdb/src/include/duckdb/storage/index.hpp @@ -59,9 +59,8 @@ class Index { //! Returns unique flag bool IsUnique() const { - auto index_constraint_type = GetConstraintType(); - return (index_constraint_type == IndexConstraintType::UNIQUE || - index_constraint_type == IndexConstraintType::PRIMARY); + auto type = GetConstraintType(); + return type == IndexConstraintType::UNIQUE || type == IndexConstraintType::PRIMARY; } //! Returns primary key flag diff --git a/src/duckdb/src/include/duckdb/storage/table/row_group_collection.hpp b/src/duckdb/src/include/duckdb/storage/table/row_group_collection.hpp index 5c47dcd6..19aa6452 100644 --- a/src/duckdb/src/include/duckdb/storage/table/row_group_collection.hpp +++ b/src/duckdb/src/include/duckdb/storage/table/row_group_collection.hpp @@ -124,6 +124,7 @@ class RowGroupCollection { void CopyStats(TableStatistics &stats); unique_ptr CopyStats(column_t column_id); + unique_ptr GetSample(); void SetDistinct(column_t column_id, unique_ptr distinct_stats); AttachedDatabase &GetAttached(); diff --git a/src/duckdb/src/include/duckdb/storage/table/table_index_list.hpp b/src/duckdb/src/include/duckdb/storage/table/table_index_list.hpp index 59f5e099..6ae66836 100644 --- a/src/duckdb/src/include/duckdb/storage/table/table_index_list.hpp +++ b/src/duckdb/src/include/duckdb/storage/table/table_index_list.hpp @@ -9,18 +9,20 @@ #pragma once #include "duckdb/common/mutex.hpp" +#include "duckdb/execution/index/bound_index.hpp" #include "duckdb/parser/constraint.hpp" #include "duckdb/storage/index.hpp" namespace duckdb { class ConflictManager; + struct IndexStorageInfo; struct DataTableInfo; class TableIndexList { public: - //! Scan the indexes, invoking the callback method for every entry + //! Scan the indexes, invoking the callback method for every entry. template void Scan(T &&callback) { lock_guard lock(indexes_lock); @@ -31,7 +33,7 @@ class TableIndexList { } } - //! Scan the indexes, invoking the callback method for every bound entry of a specific type + //! Scan the indexes, invoking the callback method for every bound entry of type T. 
template void ScanBound(FUNC &&callback) { lock_guard lock(indexes_lock); @@ -44,45 +46,50 @@ class TableIndexList { } } - // Bind any unbound indexes of the specified type and invoke the callback method for every bound entry of the - // specified type, regardless if it was bound before or not + // Bind any unbound indexes of type T and invoke the callback method. template void BindAndScan(ClientContext &context, DataTableInfo &table_info, FUNC &&callback) { - // FIXME: optimize this by only looping through the indexes once without re-acquiring the lock + // FIXME: optimize this by only looping through the indexes once without re-acquiring the lock. InitializeIndexes(context, table_info, T::TYPE_NAME); ScanBound(callback); } - //! Returns a reference to the indexes of this table + //! Returns a reference to the indexes. const vector> &Indexes() const { return indexes; } - //! Adds an index to the list of indexes of this table + //! Adds an index to the list of indexes. void AddIndex(unique_ptr index); - //! Removes an index from the list of indexes of this table + //! Removes an index from the list of indexes. void RemoveIndex(const string &name); - //! Completely removes all remaining memory of an index after dropping the catalog entry + //! Removes all remaining memory of an index after dropping the catalog entry. void CommitDrop(const string &name); - //! Returns true, if the index name does not exist + //! Returns true, if the index name does not exist. bool NameIsUnique(const string &name); - //! Initializes unknown indexes that might now be present after an extension load, optionally throwing an exception - //! if a index cant be initialized + //! Returns an optional pointer to the index matching the name. + optional_ptr Find(const string &name); + //! Initializes unknown indexes that are possibly present after an extension load, optionally throwing an exception + //! on failure. void InitializeIndexes(ClientContext &context, DataTableInfo &table_info, const char *index_type = nullptr); + //! Returns true, if there are no indexes in this list. bool Empty(); + //! Returns the number of indexes in this list. idx_t Count(); + //! Overwrite this list with the other list. void Move(TableIndexList &other); - - Index *FindForeignKeyIndex(const vector &fk_keys, ForeignKeyType fk_type); + //! Find the foreign key matching the keys. + Index *FindForeignKeyIndex(const vector &fk_keys, const ForeignKeyType fk_type); + //! Verify a foreign key constraint. void VerifyForeignKey(const vector &fk_keys, DataChunk &chunk, ConflictManager &conflict_manager); - + //! Get the combined column ids of the indexes in this list. + unordered_set GetRequiredColumns(); //! Serialize all indexes of this table. vector GetStorageInfos(const case_insensitive_map_t &options); - vector GetRequiredColumns(); - private: - //! Indexes associated with the current table mutex indexes_lock; + // Indexes associated with the table. vector> indexes; }; + } // namespace duckdb diff --git a/src/duckdb/src/include/duckdb/storage/table/table_statistics.hpp b/src/duckdb/src/include/duckdb/storage/table/table_statistics.hpp index 633d4694..628023df 100644 --- a/src/duckdb/src/include/duckdb/storage/table/table_statistics.hpp +++ b/src/duckdb/src/include/duckdb/storage/table/table_statistics.hpp @@ -48,6 +48,14 @@ class TableStatistics { //! Get a reference to the stats - this requires us to hold the lock. //! 
The reference can only be safely accessed while the lock is held ColumnStatistics &GetStats(TableStatisticsLock &lock, idx_t i); + //! Get a reference to the table sample - this requires us to hold the lock. + BlockingSample &GetTableSampleRef(TableStatisticsLock &lock); + //! Take ownership of the sample, needed for merging. Requires the lock + unique_ptr GetTableSample(TableStatisticsLock &lock); + void SetTableSample(TableStatisticsLock &lock, unique_ptr sample); + + void DestroyTableSample(TableStatisticsLock &lock) const; + void AppendToTableSample(TableStatisticsLock &lock, unique_ptr sample); bool Empty(); @@ -62,7 +70,6 @@ class TableStatistics { //! Column statistics vector> column_stats; //! The table sample - //! Sample for table unique_ptr table_sample; }; diff --git a/src/duckdb/src/include/duckdb/transaction/local_storage.hpp b/src/duckdb/src/include/duckdb/transaction/local_storage.hpp index 01371e6a..453a7ce4 100644 --- a/src/duckdb/src/include/duckdb/transaction/local_storage.hpp +++ b/src/duckdb/src/include/duckdb/transaction/local_storage.hpp @@ -44,8 +44,10 @@ class LocalTableStorage : public enable_shared_from_this { Allocator &allocator; //! The main chunk collection holding the data shared_ptr row_groups; - //! The set of unique indexes - TableIndexList indexes; + //! The set of unique append indexes. + TableIndexList append_indexes; + //! The set of delete indexes. + TableIndexList delete_indexes; //! The number of deleted rows idx_t deleted_rows; //! The main optimistic data writer @@ -65,10 +67,10 @@ class LocalTableStorage : public enable_shared_from_this { void Rollback(); idx_t EstimatedSize(); - void AppendToIndexes(DuckTransaction &transaction, TableAppendState &append_state, idx_t append_count, - bool append_to_table); + void AppendToIndexes(DuckTransaction &transaction, TableAppendState &append_state, bool append_to_table); ErrorData AppendToIndexes(DuckTransaction &transaction, RowGroupCollection &source, TableIndexList &index_list, const vector &table_types, row_t &start_row); + void AppendToDeleteIndexes(Vector &row_ids, DataChunk &delete_chunk); //! Creates an optimistic writer for this table OptimisticDataWriter &CreateOptimisticWriter(); @@ -118,6 +120,8 @@ class LocalStorage { //! Begin appending to the local storage void InitializeAppend(LocalAppendState &state, DataTable &table); + //! Initialize the storage and its indexes, but no row groups. + void InitializeStorage(LocalAppendState &state, DataTable &table); //! Append a chunk to the local storage static void Append(LocalAppendState &state, DataChunk &chunk); //! Finish appending to the local storage @@ -157,6 +161,7 @@ class LocalStorage { void FetchChunk(DataTable &table, Vector &row_ids, idx_t count, const vector &col_ids, DataChunk &chunk, ColumnFetchState &fetch_state); TableIndexList &GetIndexes(DataTable &table); + optional_ptr GetStorage(DataTable &table); void VerifyNewConstraint(DataTable &parent, const BoundConstraint &constraint); diff --git a/src/duckdb/src/include/duckdb_extension.h b/src/duckdb/src/include/duckdb_extension.h index f6a63eb3..f746830f 100644 --- a/src/duckdb/src/include/duckdb_extension.h +++ b/src/duckdb/src/include/duckdb_extension.h @@ -41,34 +41,36 @@ // Versioning //===--------------------------------------------------------------------===// //! 
Set version to latest if no explicit version is defined + #if !defined(DUCKDB_EXTENSION_API_VERSION_MAJOR) && !defined(DUCKDB_EXTENSION_API_VERSION_MINOR) && \ !defined(DUCKDB_EXTENSION_API_VERSION_PATCH) -#define DUCKDB_EXTENSION_API_VERSION_MAJOR 0 -#define DUCKDB_EXTENSION_API_VERSION_MINOR 0 -#define DUCKDB_EXTENSION_API_VERSION_PATCH 1 +#define DUCKDB_EXTENSION_API_VERSION_MAJOR 1 +#define DUCKDB_EXTENSION_API_VERSION_MINOR 2 +#define DUCKDB_EXTENSION_API_VERSION_PATCH 0 #elif !(defined(DUCKDB_EXTENSION_API_VERSION_MAJOR) && defined(DUCKDB_EXTENSION_API_VERSION_MINOR) && \ defined(DUCKDB_EXTENSION_API_VERSION_PATCH)) #error "either all or none of the DUCKDB_EXTENSION_API_VERSION_ defines should be defined" #endif //! Set the DUCKDB_EXTENSION_API_VERSION_STRING which is passed to DuckDB on extension load -#if DUCKDB_EXTENSION_API_VERSION_DEV -#define DUCKDB_EXTENSION_API_VERSION_STRING "dev" +#ifdef DUCKDB_EXTENSION_API_UNSTABLE_VERSION +#define DUCKDB_EXTENSION_API_VERSION_STRING DUCKDB_EXTENSION_API_UNSTABLE_VERSION #else #define DUCKDB_EXTENSION_API_VERSION_STRING \ DUCKDB_EXTENSION_SEMVER_STRING(DUCKDB_EXTENSION_API_VERSION_MAJOR, DUCKDB_EXTENSION_API_VERSION_MINOR, \ DUCKDB_EXTENSION_API_VERSION_PATCH) #endif -#if DUCKDB_EXTENSION_API_VERSION_MAJOR != 0 -#error "This version of the extension API header only supports API VERSION v0.x.x" +#if DUCKDB_EXTENSION_API_VERSION_MAJOR != 1 +#error "This version of the extension API header only supports API VERSION v1.x.x" #endif //===--------------------------------------------------------------------===// // Function pointer struct //===--------------------------------------------------------------------===// typedef struct { -#if DUCKDB_EXTENSION_API_VERSION_MINOR >= 0 && DUCKDB_EXTENSION_API_VERSION_PATCH >= 1 // v0.0.1 +#if DUCKDB_EXTENSION_API_VERSION_MINOR > 2 || \ + (DUCKDB_EXTENSION_API_VERSION_MINOR == 2 && DUCKDB_EXTENSION_API_VERSION_PATCH >= 0) // v1.2.0 duckdb_state (*duckdb_open)(const char *path, duckdb_database *out_database); duckdb_state (*duckdb_open_ext)(const char *path, duckdb_database *out_database, duckdb_config config, char **out_error); @@ -92,10 +94,14 @@ typedef struct { idx_t (*duckdb_column_count)(duckdb_result *result); idx_t (*duckdb_rows_changed)(duckdb_result *result); const char *(*duckdb_result_error)(duckdb_result *result); + duckdb_error_type (*duckdb_result_error_type)(duckdb_result *result); + duckdb_result_type (*duckdb_result_return_type)(duckdb_result result); void *(*duckdb_malloc)(size_t size); void (*duckdb_free)(void *ptr); idx_t (*duckdb_vector_size)(); bool (*duckdb_string_is_inlined)(duckdb_string_t string); + uint32_t (*duckdb_string_t_length)(duckdb_string_t string); + const char *(*duckdb_string_t_data)(duckdb_string_t *string); duckdb_date_struct (*duckdb_from_date)(duckdb_date date); duckdb_date (*duckdb_to_date)(duckdb_date_struct date); bool (*duckdb_is_finite_date)(duckdb_date date); @@ -119,6 +125,7 @@ typedef struct { idx_t (*duckdb_nparams)(duckdb_prepared_statement prepared_statement); const char *(*duckdb_parameter_name)(duckdb_prepared_statement prepared_statement, idx_t index); duckdb_type (*duckdb_param_type)(duckdb_prepared_statement prepared_statement, idx_t param_idx); + duckdb_logical_type (*duckdb_param_logical_type)(duckdb_prepared_statement prepared_statement, idx_t param_idx); duckdb_state (*duckdb_clear_bindings)(duckdb_prepared_statement prepared_statement); duckdb_statement_type (*duckdb_prepared_statement_type)(duckdb_prepared_statement statement); duckdb_state 
(*duckdb_bind_value)(duckdb_prepared_statement prepared_statement, idx_t param_idx, duckdb_value val); @@ -174,14 +181,70 @@ typedef struct { void (*duckdb_destroy_value)(duckdb_value *value); duckdb_value (*duckdb_create_varchar)(const char *text); duckdb_value (*duckdb_create_varchar_length)(const char *text, idx_t length); + duckdb_value (*duckdb_create_bool)(bool input); + duckdb_value (*duckdb_create_int8)(int8_t input); + duckdb_value (*duckdb_create_uint8)(uint8_t input); + duckdb_value (*duckdb_create_int16)(int16_t input); + duckdb_value (*duckdb_create_uint16)(uint16_t input); + duckdb_value (*duckdb_create_int32)(int32_t input); + duckdb_value (*duckdb_create_uint32)(uint32_t input); + duckdb_value (*duckdb_create_uint64)(uint64_t input); duckdb_value (*duckdb_create_int64)(int64_t val); + duckdb_value (*duckdb_create_hugeint)(duckdb_hugeint input); + duckdb_value (*duckdb_create_uhugeint)(duckdb_uhugeint input); + duckdb_value (*duckdb_create_float)(float input); + duckdb_value (*duckdb_create_double)(double input); + duckdb_value (*duckdb_create_date)(duckdb_date input); + duckdb_value (*duckdb_create_time)(duckdb_time input); + duckdb_value (*duckdb_create_time_tz_value)(duckdb_time_tz value); + duckdb_value (*duckdb_create_timestamp)(duckdb_timestamp input); + duckdb_value (*duckdb_create_interval)(duckdb_interval input); + duckdb_value (*duckdb_create_blob)(const uint8_t *data, idx_t length); + duckdb_value (*duckdb_create_varint)(duckdb_varint input); + duckdb_value (*duckdb_create_decimal)(duckdb_decimal input); + duckdb_value (*duckdb_create_bit)(duckdb_bit input); + duckdb_value (*duckdb_create_uuid)(duckdb_uhugeint input); + bool (*duckdb_get_bool)(duckdb_value val); + int8_t (*duckdb_get_int8)(duckdb_value val); + uint8_t (*duckdb_get_uint8)(duckdb_value val); + int16_t (*duckdb_get_int16)(duckdb_value val); + uint16_t (*duckdb_get_uint16)(duckdb_value val); + int32_t (*duckdb_get_int32)(duckdb_value val); + uint32_t (*duckdb_get_uint32)(duckdb_value val); + int64_t (*duckdb_get_int64)(duckdb_value val); + uint64_t (*duckdb_get_uint64)(duckdb_value val); + duckdb_hugeint (*duckdb_get_hugeint)(duckdb_value val); + duckdb_uhugeint (*duckdb_get_uhugeint)(duckdb_value val); + float (*duckdb_get_float)(duckdb_value val); + double (*duckdb_get_double)(duckdb_value val); + duckdb_date (*duckdb_get_date)(duckdb_value val); + duckdb_time (*duckdb_get_time)(duckdb_value val); + duckdb_time_tz (*duckdb_get_time_tz)(duckdb_value val); + duckdb_timestamp (*duckdb_get_timestamp)(duckdb_value val); + duckdb_interval (*duckdb_get_interval)(duckdb_value val); + duckdb_logical_type (*duckdb_get_value_type)(duckdb_value val); + duckdb_blob (*duckdb_get_blob)(duckdb_value val); + duckdb_varint (*duckdb_get_varint)(duckdb_value val); + duckdb_decimal (*duckdb_get_decimal)(duckdb_value val); + duckdb_bit (*duckdb_get_bit)(duckdb_value val); + duckdb_uhugeint (*duckdb_get_uuid)(duckdb_value val); + char *(*duckdb_get_varchar)(duckdb_value value); duckdb_value (*duckdb_create_struct_value)(duckdb_logical_type type, duckdb_value *values); duckdb_value (*duckdb_create_list_value)(duckdb_logical_type type, duckdb_value *values, idx_t value_count); duckdb_value (*duckdb_create_array_value)(duckdb_logical_type type, duckdb_value *values, idx_t value_count); - char *(*duckdb_get_varchar)(duckdb_value value); - int64_t (*duckdb_get_int64)(duckdb_value val); + idx_t (*duckdb_get_map_size)(duckdb_value value); + duckdb_value (*duckdb_get_map_key)(duckdb_value value, idx_t index); + duckdb_value 
(*duckdb_get_map_value)(duckdb_value value, idx_t index); + bool (*duckdb_is_null_value)(duckdb_value value); + duckdb_value (*duckdb_create_null_value)(); + idx_t (*duckdb_get_list_size)(duckdb_value value); + duckdb_value (*duckdb_get_list_child)(duckdb_value value, idx_t index); + duckdb_value (*duckdb_create_enum_value)(duckdb_logical_type type, uint64_t value); + uint64_t (*duckdb_get_enum_value)(duckdb_value value); + duckdb_value (*duckdb_get_struct_child)(duckdb_value value, idx_t index); duckdb_logical_type (*duckdb_create_logical_type)(duckdb_type type); char *(*duckdb_logical_type_get_alias)(duckdb_logical_type type); + void (*duckdb_logical_type_set_alias)(duckdb_logical_type type, const char *alias); duckdb_logical_type (*duckdb_create_list_type)(duckdb_logical_type type); duckdb_logical_type (*duckdb_create_array_type)(duckdb_logical_type type, idx_t array_size); duckdb_logical_type (*duckdb_create_map_type)(duckdb_logical_type key_type, duckdb_logical_type value_type); @@ -210,7 +273,8 @@ typedef struct { char *(*duckdb_union_type_member_name)(duckdb_logical_type type, idx_t index); duckdb_logical_type (*duckdb_union_type_member_type)(duckdb_logical_type type, idx_t index); void (*duckdb_destroy_logical_type)(duckdb_logical_type *type); - duckdb_data_chunk (*duckdb_fetch_chunk)(duckdb_result result); + duckdb_state (*duckdb_register_logical_type)(duckdb_connection con, duckdb_logical_type type, + duckdb_create_type_info info); duckdb_data_chunk (*duckdb_create_data_chunk)(duckdb_logical_type *types, idx_t column_count); void (*duckdb_destroy_data_chunk)(duckdb_data_chunk *chunk); void (*duckdb_data_chunk_reset)(duckdb_data_chunk chunk); @@ -237,6 +301,9 @@ typedef struct { duckdb_scalar_function (*duckdb_create_scalar_function)(); void (*duckdb_destroy_scalar_function)(duckdb_scalar_function *scalar_function); void (*duckdb_scalar_function_set_name)(duckdb_scalar_function scalar_function, const char *name); + void (*duckdb_scalar_function_set_varargs)(duckdb_scalar_function scalar_function, duckdb_logical_type type); + void (*duckdb_scalar_function_set_special_handling)(duckdb_scalar_function scalar_function); + void (*duckdb_scalar_function_set_volatile)(duckdb_scalar_function scalar_function); void (*duckdb_scalar_function_add_parameter)(duckdb_scalar_function scalar_function, duckdb_logical_type type); void (*duckdb_scalar_function_set_return_type)(duckdb_scalar_function scalar_function, duckdb_logical_type type); void (*duckdb_scalar_function_set_extra_info)(duckdb_scalar_function scalar_function, void *extra_info, @@ -244,6 +311,39 @@ typedef struct { void (*duckdb_scalar_function_set_function)(duckdb_scalar_function scalar_function, duckdb_scalar_function_t function); duckdb_state (*duckdb_register_scalar_function)(duckdb_connection con, duckdb_scalar_function scalar_function); + void *(*duckdb_scalar_function_get_extra_info)(duckdb_function_info info); + void (*duckdb_scalar_function_set_error)(duckdb_function_info info, const char *error); + duckdb_scalar_function_set (*duckdb_create_scalar_function_set)(const char *name); + void (*duckdb_destroy_scalar_function_set)(duckdb_scalar_function_set *scalar_function_set); + duckdb_state (*duckdb_add_scalar_function_to_set)(duckdb_scalar_function_set set, duckdb_scalar_function function); + duckdb_state (*duckdb_register_scalar_function_set)(duckdb_connection con, duckdb_scalar_function_set set); + duckdb_aggregate_function (*duckdb_create_aggregate_function)(); + void 
(*duckdb_destroy_aggregate_function)(duckdb_aggregate_function *aggregate_function); + void (*duckdb_aggregate_function_set_name)(duckdb_aggregate_function aggregate_function, const char *name); + void (*duckdb_aggregate_function_add_parameter)(duckdb_aggregate_function aggregate_function, + duckdb_logical_type type); + void (*duckdb_aggregate_function_set_return_type)(duckdb_aggregate_function aggregate_function, + duckdb_logical_type type); + void (*duckdb_aggregate_function_set_functions)(duckdb_aggregate_function aggregate_function, + duckdb_aggregate_state_size state_size, + duckdb_aggregate_init_t state_init, + duckdb_aggregate_update_t update, + duckdb_aggregate_combine_t combine, + duckdb_aggregate_finalize_t finalize); + void (*duckdb_aggregate_function_set_destructor)(duckdb_aggregate_function aggregate_function, + duckdb_aggregate_destroy_t destroy); + duckdb_state (*duckdb_register_aggregate_function)(duckdb_connection con, + duckdb_aggregate_function aggregate_function); + void (*duckdb_aggregate_function_set_special_handling)(duckdb_aggregate_function aggregate_function); + void (*duckdb_aggregate_function_set_extra_info)(duckdb_aggregate_function aggregate_function, void *extra_info, + duckdb_delete_callback_t destroy); + void *(*duckdb_aggregate_function_get_extra_info)(duckdb_function_info info); + void (*duckdb_aggregate_function_set_error)(duckdb_function_info info, const char *error); + duckdb_aggregate_function_set (*duckdb_create_aggregate_function_set)(const char *name); + void (*duckdb_destroy_aggregate_function_set)(duckdb_aggregate_function_set *aggregate_function_set); + duckdb_state (*duckdb_add_aggregate_function_to_set)(duckdb_aggregate_function_set set, + duckdb_aggregate_function function); + duckdb_state (*duckdb_register_aggregate_function_set)(duckdb_connection con, duckdb_aggregate_function_set set); duckdb_table_function (*duckdb_create_table_function)(); void (*duckdb_destroy_table_function)(duckdb_table_function *table_function); void (*duckdb_table_function_set_name)(duckdb_table_function table_function, const char *name); @@ -284,14 +384,68 @@ typedef struct { void (*duckdb_replacement_scan_set_function_name)(duckdb_replacement_scan_info info, const char *function_name); void (*duckdb_replacement_scan_add_parameter)(duckdb_replacement_scan_info info, duckdb_value parameter); void (*duckdb_replacement_scan_set_error)(duckdb_replacement_scan_info info, const char *error); + duckdb_value (*duckdb_profiling_info_get_metrics)(duckdb_profiling_info info); + idx_t (*duckdb_profiling_info_get_child_count)(duckdb_profiling_info info); + duckdb_profiling_info (*duckdb_profiling_info_get_child)(duckdb_profiling_info info, idx_t index); duckdb_state (*duckdb_appender_create)(duckdb_connection connection, const char *schema, const char *table, duckdb_appender *out_appender); + duckdb_state (*duckdb_appender_create_ext)(duckdb_connection connection, const char *catalog, const char *schema, + const char *table, duckdb_appender *out_appender); idx_t (*duckdb_appender_column_count)(duckdb_appender appender); duckdb_logical_type (*duckdb_appender_column_type)(duckdb_appender appender, idx_t col_idx); const char *(*duckdb_appender_error)(duckdb_appender appender); duckdb_state (*duckdb_appender_flush)(duckdb_appender appender); duckdb_state (*duckdb_appender_close)(duckdb_appender appender); duckdb_state (*duckdb_appender_destroy)(duckdb_appender *appender); + duckdb_state (*duckdb_appender_add_column)(duckdb_appender appender, const char *name); + duckdb_state 
(*duckdb_appender_clear_columns)(duckdb_appender appender); + duckdb_state (*duckdb_append_data_chunk)(duckdb_appender appender, duckdb_data_chunk chunk); + duckdb_state (*duckdb_table_description_create)(duckdb_connection connection, const char *schema, const char *table, + duckdb_table_description *out); + duckdb_state (*duckdb_table_description_create_ext)(duckdb_connection connection, const char *catalog, + const char *schema, const char *table, + duckdb_table_description *out); + void (*duckdb_table_description_destroy)(duckdb_table_description *table_description); + const char *(*duckdb_table_description_error)(duckdb_table_description table_description); + duckdb_state (*duckdb_column_has_default)(duckdb_table_description table_description, idx_t index, bool *out); + char *(*duckdb_table_description_get_column_name)(duckdb_table_description table_description, idx_t index); + void (*duckdb_execute_tasks)(duckdb_database database, idx_t max_tasks); + duckdb_task_state (*duckdb_create_task_state)(duckdb_database database); + void (*duckdb_execute_tasks_state)(duckdb_task_state state); + idx_t (*duckdb_execute_n_tasks_state)(duckdb_task_state state, idx_t max_tasks); + void (*duckdb_finish_execution)(duckdb_task_state state); + bool (*duckdb_task_state_is_finished)(duckdb_task_state state); + void (*duckdb_destroy_task_state)(duckdb_task_state state); + bool (*duckdb_execution_is_finished)(duckdb_connection con); + duckdb_data_chunk (*duckdb_fetch_chunk)(duckdb_result result); + duckdb_cast_function (*duckdb_create_cast_function)(); + void (*duckdb_cast_function_set_source_type)(duckdb_cast_function cast_function, duckdb_logical_type source_type); + void (*duckdb_cast_function_set_target_type)(duckdb_cast_function cast_function, duckdb_logical_type target_type); + void (*duckdb_cast_function_set_implicit_cast_cost)(duckdb_cast_function cast_function, int64_t cost); + void (*duckdb_cast_function_set_function)(duckdb_cast_function cast_function, duckdb_cast_function_t function); + void (*duckdb_cast_function_set_extra_info)(duckdb_cast_function cast_function, void *extra_info, + duckdb_delete_callback_t destroy); + void *(*duckdb_cast_function_get_extra_info)(duckdb_function_info info); + duckdb_cast_mode (*duckdb_cast_function_get_cast_mode)(duckdb_function_info info); + void (*duckdb_cast_function_set_error)(duckdb_function_info info, const char *error); + void (*duckdb_cast_function_set_row_error)(duckdb_function_info info, const char *error, idx_t row, + duckdb_vector output); + duckdb_state (*duckdb_register_cast_function)(duckdb_connection con, duckdb_cast_function cast_function); + void (*duckdb_destroy_cast_function)(duckdb_cast_function *cast_function); + bool (*duckdb_is_finite_timestamp_s)(duckdb_timestamp_s ts); + bool (*duckdb_is_finite_timestamp_ms)(duckdb_timestamp_ms ts); + bool (*duckdb_is_finite_timestamp_ns)(duckdb_timestamp_ns ts); + duckdb_value (*duckdb_create_timestamp_tz)(duckdb_timestamp input); + duckdb_value (*duckdb_create_timestamp_s)(duckdb_timestamp_s input); + duckdb_value (*duckdb_create_timestamp_ms)(duckdb_timestamp_ms input); + duckdb_value (*duckdb_create_timestamp_ns)(duckdb_timestamp_ns input); + duckdb_timestamp (*duckdb_get_timestamp_tz)(duckdb_value val); + duckdb_timestamp_s (*duckdb_get_timestamp_s)(duckdb_value val); + duckdb_timestamp_ms (*duckdb_get_timestamp_ms)(duckdb_value val); + duckdb_timestamp_ns (*duckdb_get_timestamp_ns)(duckdb_value val); + duckdb_state (*duckdb_append_value)(duckdb_appender appender, duckdb_value value); + 
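Editor's note, an illustrative aside that is not part of the diff: the entries above add the extended appender surface to the stable v1 struct. A hedged usage sketch, where the connection `con` and the names "memory"/"main"/"my_table" are invented for illustration and `duckdb_ext_api` is assumed to be initialized:

	duckdb_appender appender;
	if (duckdb_appender_create_ext(con, "memory", "main", "my_table", &appender) == DuckDBError) {
		// duckdb_appender_error(appender) describes the failure
	}
	duckdb_value val = duckdb_create_int64(42);
	duckdb_appender_begin_row(appender);
	duckdb_append_value(appender, val); // new in v1: append a whole duckdb_value
	duckdb_appender_end_row(appender);
	duckdb_destroy_value(&val);
	duckdb_appender_destroy(&appender); // flushes, closes and frees the appender
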
duckdb_profiling_info (*duckdb_get_profiling_info)(duckdb_connection connection); + duckdb_value (*duckdb_profiling_info_get_value)(duckdb_profiling_info info, const char *key); duckdb_state (*duckdb_appender_begin_row)(duckdb_appender appender); duckdb_state (*duckdb_appender_end_row)(duckdb_appender appender); duckdb_state (*duckdb_append_default)(duckdb_appender appender); @@ -316,127 +470,16 @@ typedef struct { duckdb_state (*duckdb_append_varchar_length)(duckdb_appender appender, const char *val, idx_t length); duckdb_state (*duckdb_append_blob)(duckdb_appender appender, const void *data, idx_t length); duckdb_state (*duckdb_append_null)(duckdb_appender appender); - duckdb_state (*duckdb_append_data_chunk)(duckdb_appender appender, duckdb_data_chunk chunk); - void (*duckdb_execute_tasks)(duckdb_database database, idx_t max_tasks); - duckdb_task_state (*duckdb_create_task_state)(duckdb_database database); - void (*duckdb_execute_tasks_state)(duckdb_task_state state); - idx_t (*duckdb_execute_n_tasks_state)(duckdb_task_state state, idx_t max_tasks); - void (*duckdb_finish_execution)(duckdb_task_state state); - bool (*duckdb_task_state_is_finished)(duckdb_task_state state); - void (*duckdb_destroy_task_state)(duckdb_task_state state); - bool (*duckdb_execution_is_finished)(duckdb_connection con); - duckdb_profiling_info (*duckdb_get_profiling_info)(duckdb_connection connection); - duckdb_value (*duckdb_profiling_info_get_value)(duckdb_profiling_info info, const char *key); - idx_t (*duckdb_profiling_info_get_child_count)(duckdb_profiling_info info); - duckdb_profiling_info (*duckdb_profiling_info_get_child)(duckdb_profiling_info info, idx_t index); - duckdb_value (*duckdb_profiling_info_get_metrics)(duckdb_profiling_info info); - void (*duckdb_scalar_function_set_varargs)(duckdb_scalar_function scalar_function, duckdb_logical_type type); - void (*duckdb_scalar_function_set_special_handling)(duckdb_scalar_function scalar_function); - void (*duckdb_scalar_function_set_volatile)(duckdb_scalar_function scalar_function); - void *(*duckdb_scalar_function_get_extra_info)(duckdb_function_info info); - void (*duckdb_scalar_function_set_error)(duckdb_function_info info, const char *error); - duckdb_state (*duckdb_table_description_create)(duckdb_connection connection, const char *schema, const char *table, - duckdb_table_description *out); - void (*duckdb_table_description_destroy)(duckdb_table_description *table_description); - const char *(*duckdb_table_description_error)(duckdb_table_description table_description); - duckdb_error_type (*duckdb_result_error_type)(duckdb_result *result); - uint32_t (*duckdb_string_t_length)(duckdb_string_t string); - const char *(*duckdb_string_t_data)(duckdb_string_t *string); - duckdb_value (*duckdb_create_bool)(bool input); - duckdb_value (*duckdb_create_int8)(int8_t input); - duckdb_value (*duckdb_create_uint8)(uint8_t input); - duckdb_value (*duckdb_create_int16)(int16_t input); - duckdb_value (*duckdb_create_uint16)(uint16_t input); - duckdb_value (*duckdb_create_int32)(int32_t input); - duckdb_value (*duckdb_create_uint32)(uint32_t input); - duckdb_value (*duckdb_create_uint64)(uint64_t input); - duckdb_value (*duckdb_create_hugeint)(duckdb_hugeint input); - duckdb_value (*duckdb_create_uhugeint)(duckdb_uhugeint input); - duckdb_value (*duckdb_create_float)(float input); - duckdb_value (*duckdb_create_double)(double input); - duckdb_value (*duckdb_create_date)(duckdb_date input); - duckdb_value (*duckdb_create_time)(duckdb_time input); - duckdb_value 
(*duckdb_create_time_tz_value)(duckdb_time_tz value); - duckdb_value (*duckdb_create_timestamp)(duckdb_timestamp input); - duckdb_value (*duckdb_create_interval)(duckdb_interval input); - duckdb_value (*duckdb_create_blob)(const uint8_t *data, idx_t length); - bool (*duckdb_get_bool)(duckdb_value val); - int8_t (*duckdb_get_int8)(duckdb_value val); - uint8_t (*duckdb_get_uint8)(duckdb_value val); - int16_t (*duckdb_get_int16)(duckdb_value val); - uint16_t (*duckdb_get_uint16)(duckdb_value val); - int32_t (*duckdb_get_int32)(duckdb_value val); - uint32_t (*duckdb_get_uint32)(duckdb_value val); - uint64_t (*duckdb_get_uint64)(duckdb_value val); - duckdb_hugeint (*duckdb_get_hugeint)(duckdb_value val); - duckdb_uhugeint (*duckdb_get_uhugeint)(duckdb_value val); - float (*duckdb_get_float)(duckdb_value val); - double (*duckdb_get_double)(duckdb_value val); - duckdb_date (*duckdb_get_date)(duckdb_value val); - duckdb_time (*duckdb_get_time)(duckdb_value val); - duckdb_time_tz (*duckdb_get_time_tz)(duckdb_value val); - duckdb_timestamp (*duckdb_get_timestamp)(duckdb_value val); - duckdb_interval (*duckdb_get_interval)(duckdb_value val); - duckdb_logical_type (*duckdb_get_value_type)(duckdb_value val); - duckdb_blob (*duckdb_get_blob)(duckdb_value val); - duckdb_scalar_function_set (*duckdb_create_scalar_function_set)(const char *name); - void (*duckdb_destroy_scalar_function_set)(duckdb_scalar_function_set *scalar_function_set); - duckdb_state (*duckdb_add_scalar_function_to_set)(duckdb_scalar_function_set set, duckdb_scalar_function function); - duckdb_state (*duckdb_register_scalar_function_set)(duckdb_connection con, duckdb_scalar_function_set set); - duckdb_aggregate_function_set (*duckdb_create_aggregate_function_set)(const char *name); - void (*duckdb_destroy_aggregate_function_set)(duckdb_aggregate_function_set *aggregate_function_set); - duckdb_state (*duckdb_add_aggregate_function_to_set)(duckdb_aggregate_function_set set, - duckdb_aggregate_function function); - duckdb_state (*duckdb_register_aggregate_function_set)(duckdb_connection con, duckdb_aggregate_function_set set); - idx_t (*duckdb_get_map_size)(duckdb_value value); - duckdb_value (*duckdb_get_map_key)(duckdb_value value, idx_t index); - duckdb_value (*duckdb_get_map_value)(duckdb_value value, idx_t index); - duckdb_aggregate_function (*duckdb_create_aggregate_function)(); - void (*duckdb_destroy_aggregate_function)(duckdb_aggregate_function *aggregate_function); - void (*duckdb_aggregate_function_set_name)(duckdb_aggregate_function aggregate_function, const char *name); - void (*duckdb_aggregate_function_add_parameter)(duckdb_aggregate_function aggregate_function, - duckdb_logical_type type); - void (*duckdb_aggregate_function_set_return_type)(duckdb_aggregate_function aggregate_function, - duckdb_logical_type type); - void (*duckdb_aggregate_function_set_functions)(duckdb_aggregate_function aggregate_function, - duckdb_aggregate_state_size state_size, - duckdb_aggregate_init_t state_init, - duckdb_aggregate_update_t update, - duckdb_aggregate_combine_t combine, - duckdb_aggregate_finalize_t finalize); - void (*duckdb_aggregate_function_set_destructor)(duckdb_aggregate_function aggregate_function, - duckdb_aggregate_destroy_t destroy); - duckdb_state (*duckdb_register_aggregate_function)(duckdb_connection con, - duckdb_aggregate_function aggregate_function); - void (*duckdb_aggregate_function_set_special_handling)(duckdb_aggregate_function aggregate_function); - void 
(*duckdb_aggregate_function_set_extra_info)(duckdb_aggregate_function aggregate_function, void *extra_info, - duckdb_delete_callback_t destroy); - void *(*duckdb_aggregate_function_get_extra_info)(duckdb_function_info info); - void (*duckdb_aggregate_function_set_error)(duckdb_function_info info, const char *error); - void (*duckdb_logical_type_set_alias)(duckdb_logical_type type, const char *alias); - duckdb_state (*duckdb_register_logical_type)(duckdb_connection con, duckdb_logical_type type, - duckdb_create_type_info info); - duckdb_cast_function (*duckdb_create_cast_function)(); - void (*duckdb_cast_function_set_source_type)(duckdb_cast_function cast_function, duckdb_logical_type source_type); - void (*duckdb_cast_function_set_target_type)(duckdb_cast_function cast_function, duckdb_logical_type target_type); - void (*duckdb_cast_function_set_implicit_cast_cost)(duckdb_cast_function cast_function, int64_t cost); - void (*duckdb_cast_function_set_function)(duckdb_cast_function cast_function, duckdb_cast_function_t function); - void (*duckdb_cast_function_set_extra_info)(duckdb_cast_function cast_function, void *extra_info, - duckdb_delete_callback_t destroy); - void *(*duckdb_cast_function_get_extra_info)(duckdb_function_info info); - duckdb_cast_mode (*duckdb_cast_function_get_cast_mode)(duckdb_function_info info); - void (*duckdb_cast_function_set_error)(duckdb_function_info info, const char *error); - void (*duckdb_cast_function_set_row_error)(duckdb_function_info info, const char *error, idx_t row, - duckdb_vector output); - duckdb_state (*duckdb_register_cast_function)(duckdb_connection con, duckdb_cast_function cast_function); - void (*duckdb_destroy_cast_function)(duckdb_cast_function *cast_function); +#endif + +// These functions have been deprecated and may be removed in future versions of DuckDB +#ifdef DUCKDB_EXTENSION_API_VERSION_UNSTABLE idx_t (*duckdb_row_count)(duckdb_result *result); void *(*duckdb_column_data)(duckdb_result *result, idx_t col); bool *(*duckdb_nullmask_data)(duckdb_result *result, idx_t col); duckdb_data_chunk (*duckdb_result_get_chunk)(duckdb_result result, idx_t chunk_index); bool (*duckdb_result_is_streaming)(duckdb_result result); idx_t (*duckdb_result_chunk_count)(duckdb_result result); - duckdb_result_type (*duckdb_result_return_type)(duckdb_result result); bool (*duckdb_value_boolean)(duckdb_result *result, idx_t col, idx_t row); int8_t (*duckdb_value_int8)(duckdb_result *result, idx_t col, idx_t row); int16_t (*duckdb_value_int16)(duckdb_result *result, idx_t col, idx_t row); @@ -465,7 +508,6 @@ typedef struct { duckdb_result *out_result); duckdb_state (*duckdb_pending_prepared_streaming)(duckdb_prepared_statement prepared_statement, duckdb_pending_result *out_result); - duckdb_state (*duckdb_column_has_default)(duckdb_table_description table_description, idx_t index, bool *out); duckdb_state (*duckdb_query_arrow)(duckdb_connection connection, const char *query, duckdb_arrow *out_result); duckdb_state (*duckdb_query_arrow_schema)(duckdb_arrow result, duckdb_arrow_schema *out_schema); duckdb_state (*duckdb_prepared_arrow_schema)(duckdb_prepared_statement prepared, duckdb_arrow_schema *out_schema); @@ -486,53 +528,12 @@ typedef struct { duckdb_data_chunk (*duckdb_stream_fetch_chunk)(duckdb_result result); #endif -#ifdef DUCKDB_EXTENSION_API_VERSION_DEV // dev - // WARNING! 
the functions below are not (yet) stable - - duckdb_state (*duckdb_appender_create_ext)(duckdb_connection connection, const char *catalog, const char *schema, - const char *table, duckdb_appender *out_appender); - duckdb_state (*duckdb_table_description_create_ext)(duckdb_connection connection, const char *catalog, - const char *schema, const char *table, - duckdb_table_description *out); - char *(*duckdb_table_description_get_column_name)(duckdb_table_description table_description, idx_t index); - duckdb_logical_type (*duckdb_param_logical_type)(duckdb_prepared_statement prepared_statement, idx_t param_idx); - bool (*duckdb_is_null_value)(duckdb_value value); - duckdb_value (*duckdb_create_null_value)(); - idx_t (*duckdb_get_list_size)(duckdb_value value); - duckdb_value (*duckdb_get_list_child)(duckdb_value value, idx_t index); - duckdb_value (*duckdb_create_enum_value)(duckdb_logical_type type, uint64_t value); - uint64_t (*duckdb_get_enum_value)(duckdb_value value); - duckdb_value (*duckdb_get_struct_child)(duckdb_value value, idx_t index); - duckdb_state (*duckdb_appender_add_column)(duckdb_appender appender, const char *name); - duckdb_state (*duckdb_appender_clear_columns)(duckdb_appender appender); - bool (*duckdb_is_finite_timestamp_s)(duckdb_timestamp_s ts); - bool (*duckdb_is_finite_timestamp_ms)(duckdb_timestamp_ms ts); - bool (*duckdb_is_finite_timestamp_ns)(duckdb_timestamp_ns ts); - duckdb_value (*duckdb_create_timestamp_tz)(duckdb_timestamp input); - duckdb_value (*duckdb_create_timestamp_s)(duckdb_timestamp_s input); - duckdb_value (*duckdb_create_timestamp_ms)(duckdb_timestamp_ms input); - duckdb_value (*duckdb_create_timestamp_ns)(duckdb_timestamp_ns input); - duckdb_timestamp (*duckdb_get_timestamp_tz)(duckdb_value val); - duckdb_timestamp_s (*duckdb_get_timestamp_s)(duckdb_value val); - duckdb_timestamp_ms (*duckdb_get_timestamp_ms)(duckdb_value val); - duckdb_timestamp_ns (*duckdb_get_timestamp_ns)(duckdb_value val); - duckdb_state (*duckdb_append_value)(duckdb_appender appender, duckdb_value value); - duckdb_value (*duckdb_create_varint)(duckdb_varint input); - duckdb_value (*duckdb_create_decimal)(duckdb_decimal input); - duckdb_value (*duckdb_create_bit)(duckdb_bit input); - duckdb_value (*duckdb_create_uuid)(duckdb_uhugeint input); - duckdb_varint (*duckdb_get_varint)(duckdb_value val); - duckdb_decimal (*duckdb_get_decimal)(duckdb_value val); - duckdb_bit (*duckdb_get_bit)(duckdb_value val); - duckdb_uhugeint (*duckdb_get_uuid)(duckdb_value val); -#endif - -} duckdb_ext_api_v0; +} duckdb_ext_api_v1; //===--------------------------------------------------------------------===// // Typedefs mapping functions to struct entries //===--------------------------------------------------------------------===// -// Version v0.0.1 +// Version v1.2.0 #define duckdb_open duckdb_ext_api.duckdb_open #define duckdb_open_ext duckdb_ext_api.duckdb_open_ext #define duckdb_close duckdb_ext_api.duckdb_close @@ -553,40 +554,10 @@ typedef struct { #define duckdb_result_statement_type duckdb_ext_api.duckdb_result_statement_type #define duckdb_column_logical_type duckdb_ext_api.duckdb_column_logical_type #define duckdb_column_count duckdb_ext_api.duckdb_column_count -#define duckdb_row_count duckdb_ext_api.duckdb_row_count #define duckdb_rows_changed duckdb_ext_api.duckdb_rows_changed -#define duckdb_column_data duckdb_ext_api.duckdb_column_data -#define duckdb_nullmask_data duckdb_ext_api.duckdb_nullmask_data #define duckdb_result_error duckdb_ext_api.duckdb_result_error #define 
duckdb_result_error_type duckdb_ext_api.duckdb_result_error_type -#define duckdb_result_get_chunk duckdb_ext_api.duckdb_result_get_chunk -#define duckdb_result_is_streaming duckdb_ext_api.duckdb_result_is_streaming -#define duckdb_result_chunk_count duckdb_ext_api.duckdb_result_chunk_count #define duckdb_result_return_type duckdb_ext_api.duckdb_result_return_type -#define duckdb_value_boolean duckdb_ext_api.duckdb_value_boolean -#define duckdb_value_int8 duckdb_ext_api.duckdb_value_int8 -#define duckdb_value_int16 duckdb_ext_api.duckdb_value_int16 -#define duckdb_value_int32 duckdb_ext_api.duckdb_value_int32 -#define duckdb_value_int64 duckdb_ext_api.duckdb_value_int64 -#define duckdb_value_hugeint duckdb_ext_api.duckdb_value_hugeint -#define duckdb_value_uhugeint duckdb_ext_api.duckdb_value_uhugeint -#define duckdb_value_decimal duckdb_ext_api.duckdb_value_decimal -#define duckdb_value_uint8 duckdb_ext_api.duckdb_value_uint8 -#define duckdb_value_uint16 duckdb_ext_api.duckdb_value_uint16 -#define duckdb_value_uint32 duckdb_ext_api.duckdb_value_uint32 -#define duckdb_value_uint64 duckdb_ext_api.duckdb_value_uint64 -#define duckdb_value_float duckdb_ext_api.duckdb_value_float -#define duckdb_value_double duckdb_ext_api.duckdb_value_double -#define duckdb_value_date duckdb_ext_api.duckdb_value_date -#define duckdb_value_time duckdb_ext_api.duckdb_value_time -#define duckdb_value_timestamp duckdb_ext_api.duckdb_value_timestamp -#define duckdb_value_interval duckdb_ext_api.duckdb_value_interval -#define duckdb_value_varchar duckdb_ext_api.duckdb_value_varchar -#define duckdb_value_string duckdb_ext_api.duckdb_value_string -#define duckdb_value_varchar_internal duckdb_ext_api.duckdb_value_varchar_internal -#define duckdb_value_string_internal duckdb_ext_api.duckdb_value_string_internal -#define duckdb_value_blob duckdb_ext_api.duckdb_value_blob -#define duckdb_value_is_null duckdb_ext_api.duckdb_value_is_null #define duckdb_malloc duckdb_ext_api.duckdb_malloc #define duckdb_free duckdb_ext_api.duckdb_free #define duckdb_vector_size duckdb_ext_api.duckdb_vector_size @@ -603,6 +574,9 @@ typedef struct { #define duckdb_from_timestamp duckdb_ext_api.duckdb_from_timestamp #define duckdb_to_timestamp duckdb_ext_api.duckdb_to_timestamp #define duckdb_is_finite_timestamp duckdb_ext_api.duckdb_is_finite_timestamp +#define duckdb_is_finite_timestamp_s duckdb_ext_api.duckdb_is_finite_timestamp_s +#define duckdb_is_finite_timestamp_ms duckdb_ext_api.duckdb_is_finite_timestamp_ms +#define duckdb_is_finite_timestamp_ns duckdb_ext_api.duckdb_is_finite_timestamp_ns #define duckdb_hugeint_to_double duckdb_ext_api.duckdb_hugeint_to_double #define duckdb_double_to_hugeint duckdb_ext_api.duckdb_double_to_hugeint #define duckdb_uhugeint_to_double duckdb_ext_api.duckdb_uhugeint_to_double @@ -615,6 +589,7 @@ typedef struct { #define duckdb_nparams duckdb_ext_api.duckdb_nparams #define duckdb_parameter_name duckdb_ext_api.duckdb_parameter_name #define duckdb_param_type duckdb_ext_api.duckdb_param_type +#define duckdb_param_logical_type duckdb_ext_api.duckdb_param_logical_type #define duckdb_clear_bindings duckdb_ext_api.duckdb_clear_bindings #define duckdb_prepared_statement_type duckdb_ext_api.duckdb_prepared_statement_type #define duckdb_bind_value duckdb_ext_api.duckdb_bind_value @@ -643,13 +618,11 @@ typedef struct { #define duckdb_bind_blob duckdb_ext_api.duckdb_bind_blob #define duckdb_bind_null duckdb_ext_api.duckdb_bind_null #define duckdb_execute_prepared duckdb_ext_api.duckdb_execute_prepared -#define 
duckdb_execute_prepared_streaming duckdb_ext_api.duckdb_execute_prepared_streaming #define duckdb_extract_statements duckdb_ext_api.duckdb_extract_statements #define duckdb_prepare_extracted_statement duckdb_ext_api.duckdb_prepare_extracted_statement #define duckdb_extract_statements_error duckdb_ext_api.duckdb_extract_statements_error #define duckdb_destroy_extracted duckdb_ext_api.duckdb_destroy_extracted #define duckdb_pending_prepared duckdb_ext_api.duckdb_pending_prepared -#define duckdb_pending_prepared_streaming duckdb_ext_api.duckdb_pending_prepared_streaming #define duckdb_destroy_pending duckdb_ext_api.duckdb_destroy_pending #define duckdb_pending_error duckdb_ext_api.duckdb_pending_error #define duckdb_pending_execute_task duckdb_ext_api.duckdb_pending_execute_task @@ -670,14 +643,22 @@ typedef struct { #define duckdb_create_int64 duckdb_ext_api.duckdb_create_int64 #define duckdb_create_hugeint duckdb_ext_api.duckdb_create_hugeint #define duckdb_create_uhugeint duckdb_ext_api.duckdb_create_uhugeint +#define duckdb_create_varint duckdb_ext_api.duckdb_create_varint +#define duckdb_create_decimal duckdb_ext_api.duckdb_create_decimal #define duckdb_create_float duckdb_ext_api.duckdb_create_float #define duckdb_create_double duckdb_ext_api.duckdb_create_double #define duckdb_create_date duckdb_ext_api.duckdb_create_date #define duckdb_create_time duckdb_ext_api.duckdb_create_time #define duckdb_create_time_tz_value duckdb_ext_api.duckdb_create_time_tz_value #define duckdb_create_timestamp duckdb_ext_api.duckdb_create_timestamp +#define duckdb_create_timestamp_tz duckdb_ext_api.duckdb_create_timestamp_tz +#define duckdb_create_timestamp_s duckdb_ext_api.duckdb_create_timestamp_s +#define duckdb_create_timestamp_ms duckdb_ext_api.duckdb_create_timestamp_ms +#define duckdb_create_timestamp_ns duckdb_ext_api.duckdb_create_timestamp_ns #define duckdb_create_interval duckdb_ext_api.duckdb_create_interval #define duckdb_create_blob duckdb_ext_api.duckdb_create_blob +#define duckdb_create_bit duckdb_ext_api.duckdb_create_bit +#define duckdb_create_uuid duckdb_ext_api.duckdb_create_uuid #define duckdb_get_bool duckdb_ext_api.duckdb_get_bool #define duckdb_get_int8 duckdb_ext_api.duckdb_get_int8 #define duckdb_get_uint8 duckdb_ext_api.duckdb_get_uint8 @@ -689,15 +670,23 @@ typedef struct { #define duckdb_get_uint64 duckdb_ext_api.duckdb_get_uint64 #define duckdb_get_hugeint duckdb_ext_api.duckdb_get_hugeint #define duckdb_get_uhugeint duckdb_ext_api.duckdb_get_uhugeint +#define duckdb_get_varint duckdb_ext_api.duckdb_get_varint +#define duckdb_get_decimal duckdb_ext_api.duckdb_get_decimal #define duckdb_get_float duckdb_ext_api.duckdb_get_float #define duckdb_get_double duckdb_ext_api.duckdb_get_double #define duckdb_get_date duckdb_ext_api.duckdb_get_date #define duckdb_get_time duckdb_ext_api.duckdb_get_time #define duckdb_get_time_tz duckdb_ext_api.duckdb_get_time_tz #define duckdb_get_timestamp duckdb_ext_api.duckdb_get_timestamp +#define duckdb_get_timestamp_tz duckdb_ext_api.duckdb_get_timestamp_tz +#define duckdb_get_timestamp_s duckdb_ext_api.duckdb_get_timestamp_s +#define duckdb_get_timestamp_ms duckdb_ext_api.duckdb_get_timestamp_ms +#define duckdb_get_timestamp_ns duckdb_ext_api.duckdb_get_timestamp_ns #define duckdb_get_interval duckdb_ext_api.duckdb_get_interval #define duckdb_get_value_type duckdb_ext_api.duckdb_get_value_type #define duckdb_get_blob duckdb_ext_api.duckdb_get_blob +#define duckdb_get_bit duckdb_ext_api.duckdb_get_bit +#define duckdb_get_uuid 
duckdb_ext_api.duckdb_get_uuid #define duckdb_get_varchar duckdb_ext_api.duckdb_get_varchar #define duckdb_create_struct_value duckdb_ext_api.duckdb_create_struct_value #define duckdb_create_list_value duckdb_ext_api.duckdb_create_list_value @@ -705,6 +694,13 @@ typedef struct { #define duckdb_get_map_size duckdb_ext_api.duckdb_get_map_size #define duckdb_get_map_key duckdb_ext_api.duckdb_get_map_key #define duckdb_get_map_value duckdb_ext_api.duckdb_get_map_value +#define duckdb_is_null_value duckdb_ext_api.duckdb_is_null_value +#define duckdb_create_null_value duckdb_ext_api.duckdb_create_null_value +#define duckdb_get_list_size duckdb_ext_api.duckdb_get_list_size +#define duckdb_get_list_child duckdb_ext_api.duckdb_get_list_child +#define duckdb_create_enum_value duckdb_ext_api.duckdb_create_enum_value +#define duckdb_get_enum_value duckdb_ext_api.duckdb_get_enum_value +#define duckdb_get_struct_child duckdb_ext_api.duckdb_get_struct_child #define duckdb_create_logical_type duckdb_ext_api.duckdb_create_logical_type #define duckdb_logical_type_get_alias duckdb_ext_api.duckdb_logical_type_get_alias #define duckdb_logical_type_set_alias duckdb_ext_api.duckdb_logical_type_set_alias @@ -834,12 +830,15 @@ typedef struct { #define duckdb_profiling_info_get_child_count duckdb_ext_api.duckdb_profiling_info_get_child_count #define duckdb_profiling_info_get_child duckdb_ext_api.duckdb_profiling_info_get_child #define duckdb_appender_create duckdb_ext_api.duckdb_appender_create +#define duckdb_appender_create_ext duckdb_ext_api.duckdb_appender_create_ext #define duckdb_appender_column_count duckdb_ext_api.duckdb_appender_column_count #define duckdb_appender_column_type duckdb_ext_api.duckdb_appender_column_type #define duckdb_appender_error duckdb_ext_api.duckdb_appender_error #define duckdb_appender_flush duckdb_ext_api.duckdb_appender_flush #define duckdb_appender_close duckdb_ext_api.duckdb_appender_close #define duckdb_appender_destroy duckdb_ext_api.duckdb_appender_destroy +#define duckdb_appender_add_column duckdb_ext_api.duckdb_appender_add_column +#define duckdb_appender_clear_columns duckdb_ext_api.duckdb_appender_clear_columns #define duckdb_appender_begin_row duckdb_ext_api.duckdb_appender_begin_row #define duckdb_appender_end_row duckdb_ext_api.duckdb_appender_end_row #define duckdb_append_default duckdb_ext_api.duckdb_append_default @@ -864,25 +863,14 @@ typedef struct { #define duckdb_append_varchar_length duckdb_ext_api.duckdb_append_varchar_length #define duckdb_append_blob duckdb_ext_api.duckdb_append_blob #define duckdb_append_null duckdb_ext_api.duckdb_append_null +#define duckdb_append_value duckdb_ext_api.duckdb_append_value #define duckdb_append_data_chunk duckdb_ext_api.duckdb_append_data_chunk #define duckdb_table_description_create duckdb_ext_api.duckdb_table_description_create +#define duckdb_table_description_create_ext duckdb_ext_api.duckdb_table_description_create_ext #define duckdb_table_description_destroy duckdb_ext_api.duckdb_table_description_destroy #define duckdb_table_description_error duckdb_ext_api.duckdb_table_description_error #define duckdb_column_has_default duckdb_ext_api.duckdb_column_has_default -#define duckdb_query_arrow duckdb_ext_api.duckdb_query_arrow -#define duckdb_query_arrow_schema duckdb_ext_api.duckdb_query_arrow_schema -#define duckdb_prepared_arrow_schema duckdb_ext_api.duckdb_prepared_arrow_schema -#define duckdb_result_arrow_array duckdb_ext_api.duckdb_result_arrow_array -#define duckdb_query_arrow_array 
duckdb_ext_api.duckdb_query_arrow_array -#define duckdb_arrow_column_count duckdb_ext_api.duckdb_arrow_column_count -#define duckdb_arrow_row_count duckdb_ext_api.duckdb_arrow_row_count -#define duckdb_arrow_rows_changed duckdb_ext_api.duckdb_arrow_rows_changed -#define duckdb_query_arrow_error duckdb_ext_api.duckdb_query_arrow_error -#define duckdb_destroy_arrow duckdb_ext_api.duckdb_destroy_arrow -#define duckdb_destroy_arrow_stream duckdb_ext_api.duckdb_destroy_arrow_stream -#define duckdb_execute_prepared_arrow duckdb_ext_api.duckdb_execute_prepared_arrow -#define duckdb_arrow_scan duckdb_ext_api.duckdb_arrow_scan -#define duckdb_arrow_array_scan duckdb_ext_api.duckdb_arrow_array_scan +#define duckdb_table_description_get_column_name duckdb_ext_api.duckdb_table_description_get_column_name #define duckdb_execute_tasks duckdb_ext_api.duckdb_execute_tasks #define duckdb_create_task_state duckdb_ext_api.duckdb_create_task_state #define duckdb_execute_tasks_state duckdb_ext_api.duckdb_execute_tasks_state @@ -891,7 +879,6 @@ typedef struct { #define duckdb_task_state_is_finished duckdb_ext_api.duckdb_task_state_is_finished #define duckdb_destroy_task_state duckdb_ext_api.duckdb_destroy_task_state #define duckdb_execution_is_finished duckdb_ext_api.duckdb_execution_is_finished -#define duckdb_stream_fetch_chunk duckdb_ext_api.duckdb_stream_fetch_chunk #define duckdb_fetch_chunk duckdb_ext_api.duckdb_fetch_chunk #define duckdb_create_cast_function duckdb_ext_api.duckdb_create_cast_function #define duckdb_cast_function_set_source_type duckdb_ext_api.duckdb_cast_function_set_source_type @@ -906,56 +893,70 @@ typedef struct { #define duckdb_register_cast_function duckdb_ext_api.duckdb_register_cast_function #define duckdb_destroy_cast_function duckdb_ext_api.duckdb_destroy_cast_function -// Version dev -#define duckdb_is_finite_timestamp_s duckdb_ext_api.duckdb_is_finite_timestamp_s -#define duckdb_is_finite_timestamp_ms duckdb_ext_api.duckdb_is_finite_timestamp_ms -#define duckdb_is_finite_timestamp_ns duckdb_ext_api.duckdb_is_finite_timestamp_ns -#define duckdb_param_logical_type duckdb_ext_api.duckdb_param_logical_type -#define duckdb_create_varint duckdb_ext_api.duckdb_create_varint -#define duckdb_create_decimal duckdb_ext_api.duckdb_create_decimal -#define duckdb_create_timestamp_tz duckdb_ext_api.duckdb_create_timestamp_tz -#define duckdb_create_timestamp_s duckdb_ext_api.duckdb_create_timestamp_s -#define duckdb_create_timestamp_ms duckdb_ext_api.duckdb_create_timestamp_ms -#define duckdb_create_timestamp_ns duckdb_ext_api.duckdb_create_timestamp_ns -#define duckdb_create_bit duckdb_ext_api.duckdb_create_bit -#define duckdb_create_uuid duckdb_ext_api.duckdb_create_uuid -#define duckdb_get_varint duckdb_ext_api.duckdb_get_varint -#define duckdb_get_decimal duckdb_ext_api.duckdb_get_decimal -#define duckdb_get_timestamp_tz duckdb_ext_api.duckdb_get_timestamp_tz -#define duckdb_get_timestamp_s duckdb_ext_api.duckdb_get_timestamp_s -#define duckdb_get_timestamp_ms duckdb_ext_api.duckdb_get_timestamp_ms -#define duckdb_get_timestamp_ns duckdb_ext_api.duckdb_get_timestamp_ns -#define duckdb_get_bit duckdb_ext_api.duckdb_get_bit -#define duckdb_get_uuid duckdb_ext_api.duckdb_get_uuid -#define duckdb_is_null_value duckdb_ext_api.duckdb_is_null_value -#define duckdb_create_null_value duckdb_ext_api.duckdb_create_null_value -#define duckdb_get_list_size duckdb_ext_api.duckdb_get_list_size -#define duckdb_get_list_child duckdb_ext_api.duckdb_get_list_child -#define duckdb_create_enum_value 
duckdb_ext_api.duckdb_create_enum_value -#define duckdb_get_enum_value duckdb_ext_api.duckdb_get_enum_value -#define duckdb_get_struct_child duckdb_ext_api.duckdb_get_struct_child -#define duckdb_appender_create_ext duckdb_ext_api.duckdb_appender_create_ext -#define duckdb_appender_add_column duckdb_ext_api.duckdb_appender_add_column -#define duckdb_appender_clear_columns duckdb_ext_api.duckdb_appender_clear_columns -#define duckdb_append_value duckdb_ext_api.duckdb_append_value -#define duckdb_table_description_create_ext duckdb_ext_api.duckdb_table_description_create_ext -#define duckdb_table_description_get_column_name duckdb_ext_api.duckdb_table_description_get_column_name +// Version unstable_deprecated +#define duckdb_row_count duckdb_ext_api.duckdb_row_count +#define duckdb_column_data duckdb_ext_api.duckdb_column_data +#define duckdb_nullmask_data duckdb_ext_api.duckdb_nullmask_data +#define duckdb_result_get_chunk duckdb_ext_api.duckdb_result_get_chunk +#define duckdb_result_is_streaming duckdb_ext_api.duckdb_result_is_streaming +#define duckdb_result_chunk_count duckdb_ext_api.duckdb_result_chunk_count +#define duckdb_value_boolean duckdb_ext_api.duckdb_value_boolean +#define duckdb_value_int8 duckdb_ext_api.duckdb_value_int8 +#define duckdb_value_int16 duckdb_ext_api.duckdb_value_int16 +#define duckdb_value_int32 duckdb_ext_api.duckdb_value_int32 +#define duckdb_value_int64 duckdb_ext_api.duckdb_value_int64 +#define duckdb_value_hugeint duckdb_ext_api.duckdb_value_hugeint +#define duckdb_value_uhugeint duckdb_ext_api.duckdb_value_uhugeint +#define duckdb_value_decimal duckdb_ext_api.duckdb_value_decimal +#define duckdb_value_uint8 duckdb_ext_api.duckdb_value_uint8 +#define duckdb_value_uint16 duckdb_ext_api.duckdb_value_uint16 +#define duckdb_value_uint32 duckdb_ext_api.duckdb_value_uint32 +#define duckdb_value_uint64 duckdb_ext_api.duckdb_value_uint64 +#define duckdb_value_float duckdb_ext_api.duckdb_value_float +#define duckdb_value_double duckdb_ext_api.duckdb_value_double +#define duckdb_value_date duckdb_ext_api.duckdb_value_date +#define duckdb_value_time duckdb_ext_api.duckdb_value_time +#define duckdb_value_timestamp duckdb_ext_api.duckdb_value_timestamp +#define duckdb_value_interval duckdb_ext_api.duckdb_value_interval +#define duckdb_value_varchar duckdb_ext_api.duckdb_value_varchar +#define duckdb_value_string duckdb_ext_api.duckdb_value_string +#define duckdb_value_varchar_internal duckdb_ext_api.duckdb_value_varchar_internal +#define duckdb_value_string_internal duckdb_ext_api.duckdb_value_string_internal +#define duckdb_value_blob duckdb_ext_api.duckdb_value_blob +#define duckdb_value_is_null duckdb_ext_api.duckdb_value_is_null +#define duckdb_execute_prepared_streaming duckdb_ext_api.duckdb_execute_prepared_streaming +#define duckdb_pending_prepared_streaming duckdb_ext_api.duckdb_pending_prepared_streaming +#define duckdb_query_arrow duckdb_ext_api.duckdb_query_arrow +#define duckdb_query_arrow_schema duckdb_ext_api.duckdb_query_arrow_schema +#define duckdb_prepared_arrow_schema duckdb_ext_api.duckdb_prepared_arrow_schema +#define duckdb_result_arrow_array duckdb_ext_api.duckdb_result_arrow_array +#define duckdb_query_arrow_array duckdb_ext_api.duckdb_query_arrow_array +#define duckdb_arrow_column_count duckdb_ext_api.duckdb_arrow_column_count +#define duckdb_arrow_row_count duckdb_ext_api.duckdb_arrow_row_count +#define duckdb_arrow_rows_changed duckdb_ext_api.duckdb_arrow_rows_changed +#define duckdb_query_arrow_error duckdb_ext_api.duckdb_query_arrow_error 
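Editor's note, an illustrative aside that is not part of the diff: the `unstable_deprecated` defines in this block are kept for backward compatibility only. A hedged sketch of migrating from the deprecated per-row accessors to the chunk API, assuming a materialized `duckdb_result res` whose column 0 is BIGINT:

	duckdb_data_chunk chunk;
	while ((chunk = duckdb_fetch_chunk(res)) != NULL) {
		idx_t count = duckdb_data_chunk_get_size(chunk);
		duckdb_vector col = duckdb_data_chunk_get_vector(chunk, 0);
		int64_t *data = (int64_t *)duckdb_vector_get_data(col);
		uint64_t *validity = duckdb_vector_get_validity(col);
		for (idx_t row = 0; row < count; row++) {
			if (duckdb_validity_row_is_valid(validity, row)) {
				// use data[row] instead of duckdb_value_int64(&res, 0, row)
			}
		}
		duckdb_destroy_data_chunk(&chunk);
	}
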
+#define duckdb_destroy_arrow duckdb_ext_api.duckdb_destroy_arrow +#define duckdb_destroy_arrow_stream duckdb_ext_api.duckdb_destroy_arrow_stream +#define duckdb_execute_prepared_arrow duckdb_ext_api.duckdb_execute_prepared_arrow +#define duckdb_arrow_scan duckdb_ext_api.duckdb_arrow_scan +#define duckdb_arrow_array_scan duckdb_ext_api.duckdb_arrow_array_scan +#define duckdb_stream_fetch_chunk duckdb_ext_api.duckdb_stream_fetch_chunk //===--------------------------------------------------------------------===// // Struct Global Macros //===--------------------------------------------------------------------===// // This goes in the c/c++ file containing the entrypoint (handle -#define DUCKDB_EXTENSION_GLOBAL duckdb_ext_api_v0 duckdb_ext_api = {0}; +#define DUCKDB_EXTENSION_GLOBAL duckdb_ext_api_v1 duckdb_ext_api = {0}; // Initializes the C Extension API: First thing to call in the extension entrypoint #define DUCKDB_EXTENSION_API_INIT(info, access, minimum_api_version) \ - duckdb_ext_api_v0 *res = (duckdb_ext_api_v0 *)access->get_api(info, minimum_api_version); \ + duckdb_ext_api_v1 *res = (duckdb_ext_api_v1 *)access->get_api(info, minimum_api_version); \ if (!res) { \ return false; \ }; \ duckdb_ext_api = *res; // Place in global scope of any C/C++ file that needs to access the extension API -#define DUCKDB_EXTENSION_EXTERN extern duckdb_ext_api_v0 duckdb_ext_api; +#define DUCKDB_EXTENSION_EXTERN extern duckdb_ext_api_v1 duckdb_ext_api; //===--------------------------------------------------------------------===// // Entrypoint Macros diff --git a/src/duckdb/src/main/database.cpp b/src/duckdb/src/main/database.cpp index 1a391f41..de60b1cf 100644 --- a/src/duckdb/src/main/database.cpp +++ b/src/duckdb/src/main/database.cpp @@ -59,7 +59,7 @@ DBConfig::~DBConfig() { DatabaseInstance::DatabaseInstance() { config.is_user_config = false; - create_api_v0 = nullptr; + create_api_v1 = nullptr; } DatabaseInstance::~DatabaseInstance() { @@ -262,8 +262,8 @@ void DatabaseInstance::LoadExtensionSettings() { } } -static duckdb_ext_api_v0 CreateAPIv0Wrapper() { - return CreateAPIv0(); +static duckdb_ext_api_v1 CreateAPIv1Wrapper() { + return CreateAPIv1(); } void DatabaseInstance::Initialize(const char *database_path, DBConfig *user_config) { @@ -275,7 +275,7 @@ void DatabaseInstance::Initialize(const char *database_path, DBConfig *user_conf Configure(*config_ptr, database_path); - create_api_v0 = CreateAPIv0Wrapper; + create_api_v1 = CreateAPIv1Wrapper; db_file_system = make_uniq(*this); db_manager = make_uniq(*this); @@ -516,9 +516,9 @@ ValidChecker &DatabaseInstance::GetValidChecker() { return db_validity; } -const duckdb_ext_api_v0 DatabaseInstance::GetExtensionAPIV0() { - D_ASSERT(create_api_v0); - return create_api_v0(); +const duckdb_ext_api_v1 DatabaseInstance::GetExtensionAPIV1() { + D_ASSERT(create_api_v1); + return create_api_v1(); } ValidChecker &ValidChecker::Get(DatabaseInstance &db) { diff --git a/src/duckdb/src/main/extension.cpp b/src/duckdb/src/main/extension.cpp index 7eba4e48..c13971b0 100644 --- a/src/duckdb/src/main/extension.cpp +++ b/src/duckdb/src/main/extension.cpp @@ -1,8 +1,9 @@ #include "duckdb/main/extension.hpp" + +#include "duckdb/common/operator/cast_operators.hpp" #include "duckdb/common/string_util.hpp" -#include "duckdb/main/extension_helper.hpp" #include "duckdb/main/capi/extension_api.hpp" -#include "duckdb/common/operator/cast_operators.hpp" +#include "duckdb/main/extension_helper.hpp" namespace duckdb { @@ -44,7 +45,8 @@ string 
ParsedExtensionMetaData::GetInvalidMetadataError() {
	string result;

-	if (abi_type == ExtensionABIType::CPP) {
+	// CPP or C_STRUCT_UNSTABLE ABI versioning needs to match the DuckDB version exactly
+	if (abi_type == ExtensionABIType::CPP || abi_type == ExtensionABIType::C_STRUCT_UNSTABLE) {
		const string engine_version = string(ExtensionHelper::GetVersionDirectoryName());
		if (engine_version != duckdb_version) {
@@ -52,14 +54,14 @@ string ParsedExtensionMetaData::GetInvalidMetadataError() {
		                             "built for DuckDB version '%s'.",
		                             PrettyPrintString(duckdb_version), engine_version);
		}
+		// C_STRUCT ABI versioning works when the current DuckDB C API version >= the version the extension requires
	} else if (abi_type == ExtensionABIType::C_STRUCT) {
		if (!VersioningUtils::IsSupportedCAPIVersion(duckdb_capi_version)) {
-			result +=
-			    StringUtil::Format("The file was built for DuckDB C API version '%s', but we can only load extensions "
-			                       "built for DuckDB C API 'v%lld.%lld.%lld' and lower.",
-			                       duckdb_capi_version, DUCKDB_EXTENSION_API_VERSION_MAJOR,
-			                       DUCKDB_EXTENSION_API_VERSION_MINOR, DUCKDB_EXTENSION_API_VERSION_PATCH);
+			result += StringUtil::Format("The file was built for DuckDB C API version '%s', but we can only load extensions "
+			                             "built for DuckDB C API 'v%lld.%lld.%lld' and lower.",
+			                             duckdb_capi_version, DUCKDB_EXTENSION_API_VERSION_MAJOR,
+			                             DUCKDB_EXTENSION_API_VERSION_MINOR, DUCKDB_EXTENSION_API_VERSION_PATCH);
		}
	} else {
		throw InternalException("Unknown ABI type for extension: " + extension_abi_metadata);
	}
@@ -89,11 +91,18 @@ bool VersioningUtils::IsSupportedCAPIVersion(string &capi_version_string) {
}

bool VersioningUtils::IsSupportedCAPIVersion(idx_t major, idx_t minor, idx_t patch) {
-	if (major > DUCKDB_EXTENSION_API_VERSION_MAJOR || minor > DUCKDB_EXTENSION_API_VERSION_MINOR ||
-	    patch > DUCKDB_EXTENSION_API_VERSION_PATCH) {
+	if (major != DUCKDB_EXTENSION_API_VERSION_MAJOR) {
+		return false;
+	}
+	if (minor > DUCKDB_EXTENSION_API_VERSION_MINOR) {
+		return false;
+	}
+	if (minor < DUCKDB_EXTENSION_API_VERSION_MINOR) {
+		return true;
+	}
+	if (patch > DUCKDB_EXTENSION_API_VERSION_PATCH) {
		return false;
	}
	return true;
}

diff --git a/src/duckdb/src/main/extension/extension_load.cpp b/src/duckdb/src/main/extension/extension_load.cpp
index bb2a8957..a7a7c62f 100644
--- a/src/duckdb/src/main/extension/extension_load.cpp
+++ b/src/duckdb/src/main/extension/extension_load.cpp
@@ -25,7 +25,8 @@ namespace duckdb {

//! State that is kept during the load phase of a C API extension
struct DuckDBExtensionLoadState {
-	explicit DuckDBExtensionLoadState(DatabaseInstance &db_p) : db(db_p), database_data(nullptr) {
+	explicit DuckDBExtensionLoadState(DatabaseInstance &db_p, ExtensionInitResult &init_result_p)
+	    : db(db_p), init_result(init_result_p), database_data(nullptr) {
	}

	//! Create a DuckDBExtensionLoadState reference from a C API opaque pointer
@@ -42,13 +43,16 @@ struct DuckDBExtensionLoadState {
	//! Ref to the database being loaded
	DatabaseInstance &db;

+	//! The init result from initializing the extension
+	ExtensionInitResult &init_result;
+
	//! This is the duckdb_database struct that will be passed to the extension during initialization. Note that the
	//! extension does not need to free it.
	unique_ptr<DatabaseWrapper> database_data;

	//! The function pointer struct passed to the extension. The extension is expected to copy this struct during
	//! initialization
-	duckdb_ext_api_v0 api_struct;
+	duckdb_ext_api_v1 api_struct;

	//! Error handling
	bool has_error = false;
@@ -99,22 +103,34 @@ struct ExtensionAccess {
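Editor's note, an illustrative aside that is not part of the diff: GetAPI below is reached when a C-ABI extension calls back through its exported `<name>_init_c_api` symbol. A hedged sketch of what such an entrypoint roughly looks like, using the macros defined in the header above; the extension name `quack` and the version literal are invented:

	DUCKDB_EXTENSION_GLOBAL // expands to: duckdb_ext_api_v1 duckdb_ext_api = {0};

	extern "C" bool quack_init_c_api(duckdb_extension_info info, duckdb_extension_access *access) {
		// fetches and copies the duckdb_ext_api_v1 struct that GetAPI hands out
		DUCKDB_EXTENSION_API_INIT(info, access, "v1.2.0");
		// ... register functions and types through the duckdb_ext_api table ...
		return true; // must return true on success, see the FatalException further down
	}
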
	//! Called by the extension to get a pointer to the correctly versioned extension C API struct.
	static const void *GetAPI(duckdb_extension_info info, const char *version) {
		string version_string = version;
-		idx_t major, minor, patch;
-		auto parsed = VersioningUtils::ParseSemver(version_string, major, minor, patch);
		auto &load_state = DuckDBExtensionLoadState::Get(info);
-		if (!parsed || !VersioningUtils::IsSupportedCAPIVersion(major, minor, patch)) {
+		if (load_state.init_result.abi_type == ExtensionABIType::C_STRUCT) {
+			idx_t major, minor, patch;
+			auto parsed = VersioningUtils::ParseSemver(version_string, major, minor, patch);
+
+			if (!parsed || !VersioningUtils::IsSupportedCAPIVersion(major, minor, patch)) {
+				load_state.has_error = true;
+				load_state.error_data = ErrorData(
+				    ExceptionType::UNKNOWN_TYPE,
+				    "Unsupported C API version detected during extension initialization: " + string(version));
+				return nullptr;
+			}
+		} else if (load_state.init_result.abi_type == ExtensionABIType::C_STRUCT_UNSTABLE) {
+			// NOTE: we currently don't check anything here: the version of extensions of ABI type C_STRUCT_UNSTABLE
+			// is ignored, because C_STRUCT_UNSTABLE extensions are tied 1:1 to DuckDB versions, meaning they always
+			// receive the whole function pointer struct
+		} else {
			load_state.has_error = true;
			load_state.error_data = ErrorData(ExceptionType::UNKNOWN_TYPE,
-			    "Unsupported C API version detected during extension initialization: " + string(version));
+			    StringUtil::Format("Unknown ABI Type '%s' found when loading extension '%s'",
+			                       load_state.init_result.abi_type, load_state.init_result.filename));
			return nullptr;
		}

-		load_state.api_struct = load_state.db.GetExtensionAPIV0();
+		load_state.api_struct = load_state.db.GetExtensionAPIV1();
		return &load_state.api_struct;
	}
};
@@ -207,6 +223,9 @@ ParsedExtensionMetaData ExtensionHelper::ParseExtensionMetaData(const char *meta
	if (extension_abi_metadata == "C_STRUCT") {
		result.abi_type = ExtensionABIType::C_STRUCT;
		result.duckdb_capi_version = FilterZeroAtEnd(metadata_field[2]);
+	} else if (extension_abi_metadata == "C_STRUCT_UNSTABLE") {
+		result.abi_type = ExtensionABIType::C_STRUCT_UNSTABLE;
+		result.duckdb_version = FilterZeroAtEnd(metadata_field[2]);
	} else if (extension_abi_metadata == "CPP" || extension_abi_metadata.empty()) {
		result.abi_type = ExtensionABIType::CPP;
		result.duckdb_version = FilterZeroAtEnd(metadata_field[2]);
	}
@@ -432,6 +451,7 @@ bool ExtensionHelper::TryInitialLoad(DatabaseInstance &db, FileSystem &fs, const
	result.filebase = lowercase_extension_name;
	result.filename = filename;
	result.lib_hdl = lib_hdl;
+	result.abi_type = parsed_metadata.abi_type;

	if (!direct_load) {
		auto info_file_name = filename + ".info";
@@ -505,60 +525,73 @@ void ExtensionHelper::LoadExternalExtension(DatabaseInstance &db, FileSystem &fs
#ifdef DUCKDB_DISABLE_EXTENSION_LOAD
	throw PermissionException("Loading external extensions is disabled through a compile time flag");
#else
-	auto res = InitialLoad(db, fs, extension);
-	auto init_fun_name = res.filebase + "_init";
+	auto extension_init_result = InitialLoad(db, fs, extension);
+
+	// C++ ABI
+	if (extension_init_result.abi_type == ExtensionABIType::CPP) {
+		auto init_fun_name = extension_init_result.filebase + "_init";
+		ext_init_fun_t init_fun = TryLoadFunctionFromDLL<ext_init_fun_t>(extension_init_result.lib_hdl, init_fun_name,
+		                                                                 extension_init_result.filename);
+		if (!init_fun) {
+			throw IOException("Extension '%s' did not contain the expected entrypoint function '%s'", extension,
+			                  init_fun_name);
+		}
-	// "OLD WAY" of loading extensions. If the _init exists, we choose that
-	ext_init_fun_t init_fun = TryLoadFunctionFromDLL<ext_init_fun_t>(res.lib_hdl, init_fun_name, res.filename);
-	if (init_fun) {
		try {
			(*init_fun)(db);
		} catch (std::exception &e) {
			ErrorData error(e);
			throw InvalidInputException("Initialization function \"%s\" from file \"%s\" threw an exception: \"%s\"",
-			                            init_fun_name, res.filename, error.RawMessage());
+			                            init_fun_name, extension_init_result.filename, error.RawMessage());
		}
-		D_ASSERT(res.install_info);
+		D_ASSERT(extension_init_result.install_info);
-		db.SetExtensionLoaded(extension, *res.install_info);
+		db.SetExtensionLoaded(extension, *extension_init_result.install_info);
		return;
	}
-	// TODO: make this the only way of calling extensions?
-	// "NEW WAY" of loading extensions enabling C API only
-	init_fun_name = res.filebase + "_init_c_api";
-	ext_init_c_api_fun_t init_fun_capi =
-	    TryLoadFunctionFromDLL<ext_init_c_api_fun_t>(res.lib_hdl, init_fun_name, res.filename);
+	// C ABI
+	if (extension_init_result.abi_type == ExtensionABIType::C_STRUCT ||
+	    extension_init_result.abi_type == ExtensionABIType::C_STRUCT_UNSTABLE) {
+		auto init_fun_name = extension_init_result.filebase + "_init_c_api";
+		ext_init_c_api_fun_t init_fun_capi = TryLoadFunctionFromDLL<ext_init_c_api_fun_t>(
+		    extension_init_result.lib_hdl, init_fun_name, extension_init_result.filename);
-	if (!init_fun_capi) {
-		throw IOException("File \"%s\" did not contain function \"%s\": %s", res.filename, init_fun_name, GetDLError());
-	}
+		if (!init_fun_capi) {
+			throw IOException("File \"%s\" did not contain function \"%s\": %s", extension_init_result.filename,
+			                  init_fun_name, GetDLError());
+		}
+		// Create the load state
+		DuckDBExtensionLoadState load_state(db, extension_init_result);
-	// Create the load state
-	DuckDBExtensionLoadState load_state(db);
+		auto access = ExtensionAccess::CreateAccessStruct();
+		auto result = (*init_fun_capi)(load_state.ToCStruct(), &access);
-	auto access = ExtensionAccess::CreateAccessStruct();
-	auto result = (*init_fun_capi)(load_state.ToCStruct(), &access);
+		// Throw any error that the extension might have encountered
+		if (load_state.has_error) {
+			load_state.error_data.Throw("An error was thrown during initialization of the extension '" + extension +
+			                            "': ");
+		}
-	// Throw any error that the extension might have encountered
-	if (load_state.has_error) {
-		load_state.error_data.Throw("An error was thrown during initialization of the extension '" + extension + "': ");
-	}
+		// Extensions are expected to either set an error or return true indicating successful initialization
+		if (result == false) {
+			throw FatalException(
+			    "Extension '%s' failed to initialize but did not return an error. This indicates an "
+			    "error in the extension: C API extensions should return a boolean `true` to indicate successful "
+			    "initialization. "
+			    "This means that the Extension may be partially initialized resulting in an inconsistent state of "
+			    "DuckDB.",
+			    extension);
+		}
-	// Extensions are expected to either set an error or return true indicating successful initialization
-	if (result == false) {
-		throw FatalException(
-		    "Extension '%s' failed to initialize but did not return an error. This indicates an "
-		    "error in the extension: C API extensions should return a boolean `true` to indicate succesful "
-		    "initialization. "
-		    "This means that the Extension may be partially initialized resulting in an inconsistent state of DuckDB.",
-		    extension);
-	}
+		D_ASSERT(extension_init_result.install_info);
-	D_ASSERT(res.install_info);
+		db.SetExtensionLoaded(extension, *extension_init_result.install_info);
+		return;
+	}
-	db.SetExtensionLoaded(extension, *res.install_info);
+	throw IOException("Unknown ABI type '%s' for extension '%s'", extension_init_result.abi_type, extension);
#endif
}

diff --git a/src/duckdb/src/planner/binder/statement/bind_insert.cpp b/src/duckdb/src/planner/binder/statement/bind_insert.cpp
index 5e390304..3b0d38a4 100644
--- a/src/duckdb/src/planner/binder/statement/bind_insert.cpp
+++ b/src/duckdb/src/planner/binder/statement/bind_insert.cpp
@@ -167,6 +167,11 @@ void Binder::BindDoUpdateSetExpressions(const string &table_alias, LogicalInsert
		    insert.set_columns.end()) {
			throw BinderException("Multiple assignments to same column \"%s\"", colname);
		}
+
+		if (!column.Type().SupportsRegularUpdate()) {
+			insert.update_is_del_and_insert = true;
+		}
+
		insert.set_columns.push_back(column.Physical());
		logical_column_ids.push_back(column.Oid());
		insert.set_types.push_back(column.Type());
@@ -196,13 +201,13 @@ void Binder::BindDoUpdateSetExpressions(const string &table_alias, LogicalInsert
		}
	}

-	// Verify that none of the columns that are targeted with a SET expression are indexed on
+	// If any column targeted by a SET expression has an index, then
+	// we need to rewrite this to a DELETE + INSERT.
	for (idx_t i = 0; i < logical_column_ids.size(); i++) {
		auto &column = logical_column_ids[i];
		if (indexed_columns.count(column)) {
-			throw BinderException("Can not assign to column '%s' because it has a UNIQUE/PRIMARY KEY constraint or is "
-			                      "referenced by an INDEX",
-			                      column_names[i]);
+			insert.update_is_del_and_insert = true;
+			break;
		}
	}
}

diff --git a/src/duckdb/src/planner/operator/logical_insert.cpp b/src/duckdb/src/planner/operator/logical_insert.cpp
index dd5bf92a..10f11569 100644
--- a/src/duckdb/src/planner/operator/logical_insert.cpp
+++ b/src/duckdb/src/planner/operator/logical_insert.cpp
@@ -8,7 +8,7 @@ namespace duckdb {
LogicalInsert::LogicalInsert(TableCatalogEntry &table, idx_t table_index)
    : LogicalOperator(LogicalOperatorType::LOGICAL_INSERT), table(table), table_index(table_index), return_chunk(false),
-      action_type(OnConflictAction::THROW) {
+      action_type(OnConflictAction::THROW), update_is_del_and_insert(false) {
}

LogicalInsert::LogicalInsert(ClientContext &context, const unique_ptr<CreateInfo> table_info)

diff --git a/src/duckdb/src/storage/compression/dictionary/analyze.cpp b/src/duckdb/src/storage/compression/dictionary/analyze.cpp
new file mode 100644
index 00000000..3d12bc2e
--- /dev/null
+++ b/src/duckdb/src/storage/compression/dictionary/analyze.cpp
@@ -0,0 +1,54 @@
+#include "duckdb/storage/compression/dictionary/analyze.hpp"
+
+namespace duckdb {
+
+DictionaryAnalyzeState::DictionaryAnalyzeState(const CompressionInfo &info)
+    : DictionaryCompressionState(info), segment_count(0), current_tuple_count(0), current_unique_count(0),
+      current_dict_size(0), current_width(0), next_width(0) {
+}
+
+bool DictionaryAnalyzeState::LookupString(string_t str) {
+	return current_set.count(str);
+}
+
+void DictionaryAnalyzeState::AddNewString(string_t str) {
+	current_tuple_count++;
+	current_unique_count++;
+	current_dict_size += str.GetSize();
+	if (str.IsInlined()) {
+		current_set.insert(str);
+	} else {
+		current_set.insert(heap.AddBlob(str));
+	}
+	current_width = next_width;
+}
+
+void DictionaryAnalyzeState::AddLastLookup() {
+	current_tuple_count++;
+}
+
+void DictionaryAnalyzeState::AddNull() {
+	current_tuple_count++;
+}
+
+bool DictionaryAnalyzeState::CalculateSpaceRequirements(bool new_string, idx_t string_size) {
+	if (!new_string) {
+		return DictionaryCompression::HasEnoughSpace(current_tuple_count + 1, current_unique_count, current_dict_size,
+		                                             current_width, info.GetBlockSize());
+	}
+	next_width = BitpackingPrimitives::MinimumBitWidth(current_unique_count + 2); // one for NULL, one for the new string
+	return DictionaryCompression::HasEnoughSpace(current_tuple_count + 1, current_unique_count + 1,
+	                                             current_dict_size + string_size, next_width, info.GetBlockSize());
+}
+
+void DictionaryAnalyzeState::Flush(bool final) {
+	segment_count++;
+	current_tuple_count = 0;
+	current_unique_count = 0;
+	current_dict_size = 0;
+	current_set.clear();
+}
+void DictionaryAnalyzeState::Verify() {
+}
+
+} // namespace duckdb

diff --git a/src/duckdb/src/storage/compression/dictionary/common.cpp b/src/duckdb/src/storage/compression/dictionary/common.cpp
new file mode 100644
index 00000000..05be9aae
--- /dev/null
+++ b/src/duckdb/src/storage/compression/dictionary/common.cpp
@@ -0,0 +1,90 @@
+#include "duckdb/storage/compression/dictionary/common.hpp"
+
+namespace duckdb {
+
+//===--------------------------------------------------------------------===//
+// Helper Functions
+//===--------------------------------------------------------------------===//
+bool DictionaryCompression::HasEnoughSpace(idx_t current_count, idx_t index_count, idx_t dict_size,
+                                           bitpacking_width_t packing_width, const idx_t block_size) {
+	return RequiredSpace(current_count, index_count, dict_size, packing_width) <= block_size;
+}
+
+idx_t DictionaryCompression::RequiredSpace(idx_t current_count, idx_t index_count, idx_t dict_size,
+                                           bitpacking_width_t packing_width) {
+	idx_t base_space = DICTIONARY_HEADER_SIZE + dict_size;
+	idx_t string_number_space = BitpackingPrimitives::GetRequiredSize(current_count, packing_width);
+	idx_t index_space = index_count * sizeof(uint32_t);
+
+	idx_t used_space = base_space + index_space + string_number_space;
+
+	return used_space;
+}
+
+StringDictionaryContainer DictionaryCompression::GetDictionary(ColumnSegment &segment, BufferHandle &handle) {
+	auto header_ptr = reinterpret_cast<dictionary_compression_header_t *>(handle.Ptr() + segment.GetBlockOffset());
+	StringDictionaryContainer container;
+	container.size = Load<uint32_t>(data_ptr_cast(&header_ptr->dict_size));
+	container.end = Load<uint32_t>(data_ptr_cast(&header_ptr->dict_end));
+	return container;
+}
+
+void DictionaryCompression::SetDictionary(ColumnSegment &segment, BufferHandle &handle,
+                                          StringDictionaryContainer container) {
+	auto header_ptr = reinterpret_cast<dictionary_compression_header_t *>(handle.Ptr() + segment.GetBlockOffset());
+	Store<uint32_t>(container.size, data_ptr_cast(&header_ptr->dict_size));
+	Store<uint32_t>(container.end, data_ptr_cast(&header_ptr->dict_end));
+}
+
+DictionaryCompressionState::DictionaryCompressionState(const CompressionInfo &info) : CompressionState(info) {
+}
+DictionaryCompressionState::~DictionaryCompressionState() {
+}
+
+bool DictionaryCompressionState::UpdateState(Vector &scan_vector, idx_t count) {
+	UnifiedVectorFormat vdata;
+	scan_vector.ToUnifiedFormat(count, vdata);
+	auto data = UnifiedVectorFormat::GetData<string_t>(vdata);
+	Verify();
+
+	for (idx_t i = 0; i < count; i++) {
+		auto idx = vdata.sel->get_index(i);
+		idx_t string_size = 0;
+		bool new_string = false;
+		auto row_is_valid = vdata.validity.RowIsValid(idx);
+
+		if (row_is_valid) {
+			string_size = data[idx].GetSize();
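// Editor's note, not part of the diff: the check below rejects strings that reach
// the uncompressed-string block limit, because the full string would have to be
// copied into the in-block dictionary; returning false makes the caller fall back
// to another compression method for this column. Note also that string_t only
// inlines values of up to 12 bytes; longer strings merely point into the scanned
// vector, which does not outlive this call, which is why AddNewString() copies
// them into the state's StringHeap (heap.AddBlob) before keeping them in the
// lookup structures.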
+ if (string_size >= StringUncompressed::GetStringBlockLimit(info.GetBlockSize())) { + // Big strings not implemented for dictionary compression + return false; + } + new_string = !LookupString(data[idx]); + } + + bool fits = CalculateSpaceRequirements(new_string, string_size); + if (!fits) { + Flush(); + new_string = true; + + fits = CalculateSpaceRequirements(new_string, string_size); + if (!fits) { + throw InternalException("Dictionary compression could not write to new segment"); + } + } + + if (!row_is_valid) { + AddNull(); + } else if (new_string) { + AddNewString(data[idx]); + } else { + AddLastLookup(); + } + + Verify(); + } + + return true; +} + +} // namespace duckdb diff --git a/src/duckdb/src/storage/compression/dictionary/compression.cpp b/src/duckdb/src/storage/compression/dictionary/compression.cpp new file mode 100644 index 00000000..eeb42953 --- /dev/null +++ b/src/duckdb/src/storage/compression/dictionary/compression.cpp @@ -0,0 +1,174 @@ +#include "duckdb/storage/compression/dictionary/compression.hpp" +#include "duckdb/storage/segment/uncompressed.hpp" + +namespace duckdb { + +DictionaryCompressionCompressState::DictionaryCompressionCompressState(ColumnDataCheckpointer &checkpointer_p, + const CompressionInfo &info) + : DictionaryCompressionState(info), checkpointer(checkpointer_p), + function(checkpointer.GetCompressionFunction(CompressionType::COMPRESSION_DICTIONARY)), + heap(BufferAllocator::Get(checkpointer.GetDatabase())) { + CreateEmptySegment(checkpointer.GetRowGroup().start); +} + +void DictionaryCompressionCompressState::CreateEmptySegment(idx_t row_start) { + auto &db = checkpointer.GetDatabase(); + auto &type = checkpointer.GetType(); + + auto compressed_segment = + ColumnSegment::CreateTransientSegment(db, function, type, row_start, info.GetBlockSize(), info.GetBlockSize()); + current_segment = std::move(compressed_segment); + current_segment->function = function; + + // Reset the buffers and the string map. + current_string_map.clear(); + index_buffer.clear(); + + // Reserve index 0 for null strings. + index_buffer.push_back(0); + selection_buffer.clear(); + + current_width = 0; + next_width = 0; + + // Reset the pointers into the current segment. 
+ auto &buffer_manager = BufferManager::GetBufferManager(checkpointer.GetDatabase()); + current_handle = buffer_manager.Pin(current_segment->block); + current_dictionary = DictionaryCompression::GetDictionary(*current_segment, current_handle); + current_end_ptr = current_handle.Ptr() + current_dictionary.end; +} + +void DictionaryCompressionCompressState::Verify() { + current_dictionary.Verify(info.GetBlockSize()); + D_ASSERT(current_segment->count == selection_buffer.size()); + D_ASSERT(DictionaryCompression::HasEnoughSpace(current_segment->count.load(), index_buffer.size(), + current_dictionary.size, current_width, info.GetBlockSize())); + D_ASSERT(current_dictionary.end == info.GetBlockSize()); + D_ASSERT(index_buffer.size() == current_string_map.size() + 1); // +1 is for null value +} + +bool DictionaryCompressionCompressState::LookupString(string_t str) { + auto search = current_string_map.find(str); + auto has_result = search != current_string_map.end(); + + if (has_result) { + latest_lookup_result = search->second; + } + return has_result; +} + +void DictionaryCompressionCompressState::AddNewString(string_t str) { + UncompressedStringStorage::UpdateStringStats(current_segment->stats, str); + + // Copy string to dict + current_dictionary.size += str.GetSize(); + auto dict_pos = current_end_ptr - current_dictionary.size; + memcpy(dict_pos, str.GetData(), str.GetSize()); + current_dictionary.Verify(info.GetBlockSize()); + D_ASSERT(current_dictionary.end == info.GetBlockSize()); + + // Update buffers and map + index_buffer.push_back(current_dictionary.size); + selection_buffer.push_back(UnsafeNumericCast(index_buffer.size() - 1)); + if (str.IsInlined()) { + current_string_map.insert({str, index_buffer.size() - 1}); + } else { + current_string_map.insert({heap.AddBlob(str), index_buffer.size() - 1}); + } + DictionaryCompression::SetDictionary(*current_segment, current_handle, current_dictionary); + + current_width = next_width; + current_segment->count++; +} + +void DictionaryCompressionCompressState::AddNull() { + selection_buffer.push_back(0); + current_segment->count++; +} + +void DictionaryCompressionCompressState::AddLastLookup() { + selection_buffer.push_back(latest_lookup_result); + current_segment->count++; +} + +bool DictionaryCompressionCompressState::CalculateSpaceRequirements(bool new_string, idx_t string_size) { + if (!new_string) { + return DictionaryCompression::HasEnoughSpace(current_segment->count.load() + 1, index_buffer.size(), + current_dictionary.size, current_width, info.GetBlockSize()); + } + next_width = BitpackingPrimitives::MinimumBitWidth(index_buffer.size() - 1 + new_string); + return DictionaryCompression::HasEnoughSpace(current_segment->count.load() + 1, index_buffer.size() + 1, + current_dictionary.size + string_size, next_width, + info.GetBlockSize()); +} + +void DictionaryCompressionCompressState::Flush(bool final) { + auto next_start = current_segment->start + current_segment->count; + + auto segment_size = Finalize(); + auto &state = checkpointer.GetCheckpointState(); + state.FlushSegment(std::move(current_segment), std::move(current_handle), segment_size); + + if (!final) { + CreateEmptySegment(next_start); + } +} + +idx_t DictionaryCompressionCompressState::Finalize() { + auto &buffer_manager = BufferManager::GetBufferManager(checkpointer.GetDatabase()); + auto handle = buffer_manager.Pin(current_segment->block); + D_ASSERT(current_dictionary.end == info.GetBlockSize()); + + // calculate sizes + auto compressed_selection_buffer_size = + 
BitpackingPrimitives::GetRequiredSize(current_segment->count, current_width); + auto index_buffer_size = index_buffer.size() * sizeof(uint32_t); + auto total_size = DictionaryCompression::DICTIONARY_HEADER_SIZE + compressed_selection_buffer_size + + index_buffer_size + current_dictionary.size; + + // calculate ptr and offsets + auto base_ptr = handle.Ptr(); + auto header_ptr = reinterpret_cast(base_ptr); + auto compressed_selection_buffer_offset = DictionaryCompression::DICTIONARY_HEADER_SIZE; + auto index_buffer_offset = compressed_selection_buffer_offset + compressed_selection_buffer_size; + + // Write compressed selection buffer + BitpackingPrimitives::PackBuffer(base_ptr + compressed_selection_buffer_offset, + (sel_t *)(selection_buffer.data()), current_segment->count, + current_width); + + // Write the index buffer + memcpy(base_ptr + index_buffer_offset, index_buffer.data(), index_buffer_size); + + // Store sizes and offsets in segment header + Store(NumericCast(index_buffer_offset), data_ptr_cast(&header_ptr->index_buffer_offset)); + Store(NumericCast(index_buffer.size()), data_ptr_cast(&header_ptr->index_buffer_count)); + Store((uint32_t)current_width, data_ptr_cast(&header_ptr->bitpacking_width)); + + D_ASSERT(current_width == BitpackingPrimitives::MinimumBitWidth(index_buffer.size() - 1)); + D_ASSERT(DictionaryCompression::HasEnoughSpace(current_segment->count, index_buffer.size(), current_dictionary.size, + current_width, info.GetBlockSize())); + D_ASSERT((uint64_t)*max_element(std::begin(selection_buffer), std::end(selection_buffer)) == + index_buffer.size() - 1); + + // Early-out, if the block is sufficiently full. + if (total_size >= info.GetCompactionFlushLimit()) { + return info.GetBlockSize(); + } + + // Sufficient space: calculate how much space we can save. + auto move_amount = info.GetBlockSize() - total_size; + + // Move the dictionary to align it with the offsets. + auto new_dictionary_offset = index_buffer_offset + index_buffer_size; + memmove(base_ptr + new_dictionary_offset, base_ptr + current_dictionary.end - current_dictionary.size, + current_dictionary.size); + current_dictionary.end -= move_amount; + D_ASSERT(current_dictionary.end == total_size); + + // Write the new dictionary with the updated "end". 
+ DictionaryCompression::SetDictionary(*current_segment, handle, current_dictionary); + return total_size; +} + +} // namespace duckdb diff --git a/src/duckdb/src/storage/compression/dictionary/decompression.cpp b/src/duckdb/src/storage/compression/dictionary/decompression.cpp new file mode 100644 index 00000000..a81c0e12 --- /dev/null +++ b/src/duckdb/src/storage/compression/dictionary/decompression.cpp @@ -0,0 +1,115 @@ +#include "duckdb/storage/compression/dictionary/decompression.hpp" + +namespace duckdb { + +uint16_t CompressedStringScanState::GetStringLength(sel_t index) { + if (index == 0) { + return 0; + } else { + return UnsafeNumericCast(index_buffer_ptr[index] - index_buffer_ptr[index - 1]); + } +} + +string_t CompressedStringScanState::FetchStringFromDict(int32_t dict_offset, uint16_t string_len) { + D_ASSERT(dict_offset >= 0 && dict_offset <= NumericCast(block_size)); + if (dict_offset == 0) { + return string_t(nullptr, 0); + } + + // normal string: read string from this block + auto dict_end = baseptr + dict.end; + auto dict_pos = dict_end - dict_offset; + + auto str_ptr = char_ptr_cast(dict_pos); + return string_t(str_ptr, string_len); + } + +void CompressedStringScanState::Initialize(ColumnSegment &segment, bool initialize_dictionary) { + baseptr = handle->Ptr() + segment.GetBlockOffset(); + + // Load header values + auto header_ptr = reinterpret_cast(baseptr); + auto index_buffer_offset = Load(data_ptr_cast(&header_ptr->index_buffer_offset)); + index_buffer_count = Load(data_ptr_cast(&header_ptr->index_buffer_count)); + current_width = (bitpacking_width_t)(Load(data_ptr_cast(&header_ptr->bitpacking_width))); + if (segment.GetBlockOffset() + index_buffer_offset + sizeof(uint32_t) * index_buffer_count > + segment.GetBlockManager().GetBlockSize()) { + throw IOException( + "Failed to scan dictionary string - index was out of range. Database file appears to be corrupted."); + } + index_buffer_ptr = reinterpret_cast(baseptr + index_buffer_offset); + base_data = data_ptr_cast(baseptr + DictionaryCompression::DICTIONARY_HEADER_SIZE); + + block_size = segment.GetBlockManager().GetBlockSize(); + + dict = DictionaryCompression::GetDictionary(segment, *handle); + dictionary = make_buffer(segment.type, index_buffer_count); + dictionary_size = index_buffer_count; + + if (!initialize_dictionary) { + // Used by fetch, as fetch will never produce a DictionaryVector + return; + } + + auto dict_child_data = FlatVector::GetData(*(dictionary)); + for (uint32_t i = 0; i < index_buffer_count; i++) { + // NOTE: dict_child_vector is not used here; it exists only for big strings + uint16_t str_len = GetStringLength(i); + dict_child_data[i] = FetchStringFromDict(UnsafeNumericCast(index_buffer_ptr[i]), str_len); + } +} + +void CompressedStringScanState::ScanToFlatVector(Vector &result, idx_t result_offset, idx_t start, idx_t scan_count) { + auto result_data = FlatVector::GetData(result); + + // Handling non-bitpacking-group-aligned start values. + idx_t start_offset = start % BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE; + + // We will scan in blocks of BITPACKING_ALGORITHM_GROUP_SIZE, so we may scan some extra values. + idx_t decompress_count = BitpackingPrimitives::RoundUpToAlgorithmGroupSize(scan_count + start_offset); + + // Create a decompression buffer of sufficient size if we don't already have one.
+ if (!sel_vec || sel_vec_size < decompress_count) { + sel_vec_size = decompress_count; + sel_vec = make_buffer(decompress_count); + } + + data_ptr_t src = &base_data[((start - start_offset) * current_width) / 8]; + sel_t *sel_vec_ptr = sel_vec->data(); + + BitpackingPrimitives::UnPackBuffer(data_ptr_cast(sel_vec_ptr), src, decompress_count, current_width); + + for (idx_t i = 0; i < scan_count; i++) { + // Lookup dict offset in index buffer + auto string_number = sel_vec->get_index(i + start_offset); + auto dict_offset = index_buffer_ptr[string_number]; + auto str_len = GetStringLength(UnsafeNumericCast(string_number)); + result_data[result_offset + i] = FetchStringFromDict(UnsafeNumericCast(dict_offset), str_len); + } +} + +void CompressedStringScanState::ScanToDictionaryVector(ColumnSegment &segment, Vector &result, idx_t result_offset, + idx_t start, idx_t scan_count) { + D_ASSERT(start % BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE == 0); + D_ASSERT(scan_count == STANDARD_VECTOR_SIZE); + D_ASSERT(result_offset == 0); + + idx_t decompress_count = BitpackingPrimitives::RoundUpToAlgorithmGroupSize(scan_count); + + // Create a selection vector of sufficient size if we don't already have one. + if (!sel_vec || sel_vec_size < decompress_count) { + sel_vec_size = decompress_count; + sel_vec = make_buffer(decompress_count); + } + + // Scanning 2048 values, emitting a dict vector + data_ptr_t dst = data_ptr_cast(sel_vec->data()); + data_ptr_t src = data_ptr_cast(&base_data[(start * current_width) / 8]); + + BitpackingPrimitives::UnPackBuffer(dst, src, scan_count, current_width); + + result.Dictionary(*(dictionary), dictionary_size, *sel_vec, scan_count); + DictionaryVector::SetDictionaryId(result, to_string(CastPointerToValue(&segment))); +} + +} // namespace duckdb diff --git a/src/duckdb/src/storage/compression/dictionary_compression.cpp b/src/duckdb/src/storage/compression/dictionary_compression.cpp index a531d12e..1ea1bb32 100644 --- a/src/duckdb/src/storage/compression/dictionary_compression.cpp +++ b/src/duckdb/src/storage/compression/dictionary_compression.cpp @@ -1,3 +1,7 @@ +#include "duckdb/storage/compression/dictionary/analyze.hpp" +#include "duckdb/storage/compression/dictionary/compression.hpp" +#include "duckdb/storage/compression/dictionary/decompression.hpp" + #include "duckdb/common/bitpacking.hpp" #include "duckdb/common/numeric_utils.hpp" #include "duckdb/common/operator/comparison_operators.hpp" @@ -9,90 +13,41 @@ #include "duckdb/storage/string_uncompressed.hpp" #include "duckdb/storage/table/column_data_checkpointer.hpp" -namespace duckdb { - -//! Abstract class managing the compression state for size analysis or compression. 
-class DictionaryCompressionState : public CompressionState { -public: - explicit DictionaryCompressionState(const CompressionInfo &info) : CompressionState(info) {}; - -public: - bool UpdateState(Vector &scan_vector, idx_t count) { - UnifiedVectorFormat vdata; - scan_vector.ToUnifiedFormat(count, vdata); - auto data = UnifiedVectorFormat::GetData(vdata); - Verify(); - - for (idx_t i = 0; i < count; i++) { - auto idx = vdata.sel->get_index(i); - idx_t string_size = 0; - bool new_string = false; - auto row_is_valid = vdata.validity.RowIsValid(idx); - - if (row_is_valid) { - string_size = data[idx].GetSize(); - if (string_size >= StringUncompressed::GetStringBlockLimit(info.GetBlockSize())) { - // Big strings not implemented for dictionary compression - return false; - } - new_string = !LookupString(data[idx]); - } - - bool fits = CalculateSpaceRequirements(new_string, string_size); - if (!fits) { - Flush(); - new_string = true; - - fits = CalculateSpaceRequirements(new_string, string_size); - if (!fits) { - throw InternalException("Dictionary compression could not write to new segment"); - } - } - - if (!row_is_valid) { - AddNull(); - } else if (new_string) { - AddNewString(data[idx]); - } else { - AddLastLookup(); - } +/* +Data layout per segment: ++------------------------------------------------------+ +| Header | +| +----------------------------------------------+ | +| | dictionary_compression_header_t header | | +| +----------------------------------------------+ | +| | ++------------------------------------------------------+ +| Selection Buffer | +| +------------------------------------+ | +| | uint16_t index_buffer_idx[] | | +| +------------------------------------+ | +| tuple index -> index buffer idx | +| | ++--------------------------------------------+ +| Index Buffer | +| +------------------------------------+ | +| | uint16_t dictionary_offset[] | | +| +------------------------------------+ | +| string_index -> offset in the dictionary | +| | ++--------------------------------------------+ +| Dictionary | +| +------------------------------------+ | +| | uint8_t *raw_string_data | | +| +------------------------------------+ | +| the string data without lengths | +| | ++--------------------------------------------+ +*/ - Verify(); - } - - return true; - } - -protected: - // Should verify the State - virtual void Verify() = 0; - // Performs a lookup of str, storing the result internally - virtual bool LookupString(string_t str) = 0; - // Add the most recently looked up str to compression state - virtual void AddLastLookup() = 0; - // Add string to the state that is known to not be seen yet - virtual void AddNewString(string_t str) = 0; - // Add a null value to the compression state - virtual void AddNull() = 0; - // Needs to be called before adding a value. Will return false if a flush is required first. - virtual bool CalculateSpaceRequirements(bool new_string, idx_t string_size) = 0; - // Flush the segment to disk if compressing or reset the counters if analyzing - virtual void Flush(bool final = false) = 0; -}; - -typedef struct { - uint32_t dict_size; - uint32_t dict_end; - uint32_t index_buffer_offset; - uint32_t index_buffer_count; - uint32_t bitpacking_width; -} dictionary_compression_header_t; +namespace duckdb { struct DictionaryCompressionStorage { - static constexpr float MINIMUM_COMPRESSION_RATIO = 1.2F; - //! 
Dictionary header size at the beginning of the string segment (offset + length) - static constexpr uint16_t DICTIONARY_HEADER_SIZE = sizeof(dictionary_compression_header_t); - static unique_ptr StringInitAnalyze(ColumnData &col_data, PhysicalType type); static bool StringAnalyze(AnalyzeState &state_p, Vector &input, idx_t count); static idx_t StringFinalAnalyze(AnalyzeState &state_p); @@ -109,292 +64,11 @@ struct DictionaryCompressionStorage { static void StringScan(ColumnSegment &segment, ColumnScanState &state, idx_t scan_count, Vector &result); static void StringFetchRow(ColumnSegment &segment, ColumnFetchState &state, row_t row_id, Vector &result, idx_t result_idx); - - static bool HasEnoughSpace(idx_t current_count, idx_t index_count, idx_t dict_size, - bitpacking_width_t packing_width, const idx_t block_size); - static idx_t RequiredSpace(idx_t current_count, idx_t index_count, idx_t dict_size, - bitpacking_width_t packing_width); - - static StringDictionaryContainer GetDictionary(ColumnSegment &segment, BufferHandle &handle); - static void SetDictionary(ColumnSegment &segment, BufferHandle &handle, StringDictionaryContainer container); - static string_t FetchStringFromDict(ColumnSegment &segment, StringDictionaryContainer dict, data_ptr_t baseptr, - int32_t dict_offset, uint16_t string_len); - static uint16_t GetStringLength(uint32_t *index_buffer_ptr, sel_t index); -}; - -// Dictionary compression uses a combination of bitpacking and a dictionary to compress string segments. The data is -// stored across three buffers: the index buffer, the selection buffer and the dictionary. Firstly the Index buffer -// contains the offsets into the dictionary which are also used to determine the string lengths. Each value in the -// dictionary gets a single unique index in the index buffer. Secondly, the selection buffer maps the tuples to an index -// in the index buffer. The selection buffer is compressed with bitpacking. Finally, the dictionary contains simply all -// the unique strings without lengths or null termination as we can deduce the lengths from the index buffer. The -// addition of the selection buffer is done for two reasons: firstly, to allow the scan to emit dictionary vectors by -// scanning the whole dictionary at once and then scanning the selection buffer for each emitted vector. Secondly, it -// allows for efficient bitpacking compression as the selection values should remain relatively small. 
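For intuition, the three-buffer scheme summarized in the comment above can be sketched outside DuckDB in a few lines of standalone C++. This is a simplified, hypothetical model (names are illustrative; the real format writes the dictionary back-to-front from the block end and bitpacks the selection values): the dictionary holds raw string bytes with no separators, the index buffer holds cumulative end offsets with slot 0 reserved for NULL, and the selection buffer maps each row to an index-buffer slot, so a string's length falls out of two adjacent offsets.

// toy_dictionary.cpp - minimal sketch of the index/selection/dictionary layout; not DuckDB code.
#include <cstdint>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

int main() {
	std::vector<std::string> rows = {"duck", "goose", "duck", "duck", "swan"};

	std::string dictionary;              // raw string bytes, no separators or null terminators
	std::vector<uint32_t> index_buffer;  // cumulative end offsets; slot 0 is reserved for NULL
	std::vector<uint32_t> selection;     // row -> index-buffer slot (bitpacked in the real format)
	std::unordered_map<std::string, uint32_t> seen;

	index_buffer.push_back(0); // slot 0: the NULL entry
	for (auto &s : rows) {
		auto it = seen.find(s);
		if (it == seen.end()) {
			dictionary += s; // append the unique string's bytes to the dictionary
			index_buffer.push_back(static_cast<uint32_t>(dictionary.size()));
			it = seen.emplace(s, static_cast<uint32_t>(index_buffer.size() - 1)).first;
		}
		selection.push_back(it->second); // repeated values only cost one selection entry
	}

	// Decode: the length of entry i is index_buffer[i] - index_buffer[i - 1].
	for (auto slot : selection) {
		if (slot == 0) {
			std::cout << "NULL\n";
			continue;
		}
		uint32_t len = index_buffer[slot] - index_buffer[slot - 1];
		std::cout << dictionary.substr(index_buffer[slot - 1], len) << "\n";
	}
}

The selection values only grow with the number of unique strings, which is what keeps them small enough for the bitpacked selection buffer to pay off, matching the rationale given in the comment above.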
-struct DictionaryCompressionCompressState : public DictionaryCompressionState { - DictionaryCompressionCompressState(ColumnDataCheckpointer &checkpointer_p, const CompressionInfo &info) - : DictionaryCompressionState(info), checkpointer(checkpointer_p), - function(checkpointer.GetCompressionFunction(CompressionType::COMPRESSION_DICTIONARY)), - heap(BufferAllocator::Get(checkpointer.GetDatabase())) { - CreateEmptySegment(checkpointer.GetRowGroup().start); - } - - ColumnDataCheckpointer &checkpointer; - CompressionFunction &function; - - // State regarding current segment - unique_ptr current_segment; - BufferHandle current_handle; - StringDictionaryContainer current_dictionary; - data_ptr_t current_end_ptr; - - // Buffers and map for current segment - StringHeap heap; - string_map_t current_string_map; - vector index_buffer; - vector selection_buffer; - - bitpacking_width_t current_width = 0; - bitpacking_width_t next_width = 0; - - // Result of latest LookupString call - uint32_t latest_lookup_result; - -public: - void CreateEmptySegment(idx_t row_start) { - auto &db = checkpointer.GetDatabase(); - auto &type = checkpointer.GetType(); - - auto compressed_segment = ColumnSegment::CreateTransientSegment(db, function, type, row_start, - info.GetBlockSize(), info.GetBlockSize()); - current_segment = std::move(compressed_segment); - current_segment->function = function; - - // Reset the buffers and the string map. - current_string_map.clear(); - index_buffer.clear(); - - // Reserve index 0 for null strings. - index_buffer.push_back(0); - selection_buffer.clear(); - - current_width = 0; - next_width = 0; - - // Reset the pointers into the current segment. - auto &buffer_manager = BufferManager::GetBufferManager(checkpointer.GetDatabase()); - current_handle = buffer_manager.Pin(current_segment->block); - current_dictionary = DictionaryCompressionStorage::GetDictionary(*current_segment, current_handle); - current_end_ptr = current_handle.Ptr() + current_dictionary.end; - } - - void Verify() override { - current_dictionary.Verify(info.GetBlockSize()); - D_ASSERT(current_segment->count == selection_buffer.size()); - D_ASSERT(DictionaryCompressionStorage::HasEnoughSpace(current_segment->count.load(), index_buffer.size(), - current_dictionary.size, current_width, - info.GetBlockSize())); - D_ASSERT(current_dictionary.end == info.GetBlockSize()); - D_ASSERT(index_buffer.size() == current_string_map.size() + 1); // +1 is for null value - } - - bool LookupString(string_t str) override { - auto search = current_string_map.find(str); - auto has_result = search != current_string_map.end(); - - if (has_result) { - latest_lookup_result = search->second; - } - return has_result; - } - - void AddNewString(string_t str) override { - UncompressedStringStorage::UpdateStringStats(current_segment->stats, str); - - // Copy string to dict - current_dictionary.size += str.GetSize(); - auto dict_pos = current_end_ptr - current_dictionary.size; - memcpy(dict_pos, str.GetData(), str.GetSize()); - current_dictionary.Verify(info.GetBlockSize()); - D_ASSERT(current_dictionary.end == info.GetBlockSize()); - - // Update buffers and map - index_buffer.push_back(current_dictionary.size); - selection_buffer.push_back(UnsafeNumericCast(index_buffer.size() - 1)); - if (str.IsInlined()) { - current_string_map.insert({str, index_buffer.size() - 1}); - } else { - current_string_map.insert({heap.AddBlob(str), index_buffer.size() - 1}); - } - DictionaryCompressionStorage::SetDictionary(*current_segment, current_handle, 
current_dictionary); - - current_width = next_width; - current_segment->count++; - } - - void AddNull() override { - selection_buffer.push_back(0); - current_segment->count++; - } - - void AddLastLookup() override { - selection_buffer.push_back(latest_lookup_result); - current_segment->count++; - } - - bool CalculateSpaceRequirements(bool new_string, idx_t string_size) override { - if (!new_string) { - return DictionaryCompressionStorage::HasEnoughSpace(current_segment->count.load() + 1, index_buffer.size(), - current_dictionary.size, current_width, - info.GetBlockSize()); - } - next_width = BitpackingPrimitives::MinimumBitWidth(index_buffer.size() - 1 + new_string); - return DictionaryCompressionStorage::HasEnoughSpace(current_segment->count.load() + 1, index_buffer.size() + 1, - current_dictionary.size + string_size, next_width, - info.GetBlockSize()); - } - - void Flush(bool final = false) override { - auto next_start = current_segment->start + current_segment->count; - - auto segment_size = Finalize(); - auto &state = checkpointer.GetCheckpointState(); - state.FlushSegment(std::move(current_segment), std::move(current_handle), segment_size); - - if (!final) { - CreateEmptySegment(next_start); - } - } - - idx_t Finalize() { - auto &buffer_manager = BufferManager::GetBufferManager(checkpointer.GetDatabase()); - auto handle = buffer_manager.Pin(current_segment->block); - D_ASSERT(current_dictionary.end == info.GetBlockSize()); - - // calculate sizes - auto compressed_selection_buffer_size = - BitpackingPrimitives::GetRequiredSize(current_segment->count, current_width); - auto index_buffer_size = index_buffer.size() * sizeof(uint32_t); - auto total_size = DictionaryCompressionStorage::DICTIONARY_HEADER_SIZE + compressed_selection_buffer_size + - index_buffer_size + current_dictionary.size; - - // calculate ptr and offsets - auto base_ptr = handle.Ptr(); - auto header_ptr = reinterpret_cast(base_ptr); - auto compressed_selection_buffer_offset = DictionaryCompressionStorage::DICTIONARY_HEADER_SIZE; - auto index_buffer_offset = compressed_selection_buffer_offset + compressed_selection_buffer_size; - - // Write compressed selection buffer - BitpackingPrimitives::PackBuffer(base_ptr + compressed_selection_buffer_offset, - (sel_t *)(selection_buffer.data()), current_segment->count, - current_width); - - // Write the index buffer - memcpy(base_ptr + index_buffer_offset, index_buffer.data(), index_buffer_size); - - // Store sizes and offsets in segment header - Store(NumericCast(index_buffer_offset), data_ptr_cast(&header_ptr->index_buffer_offset)); - Store(NumericCast(index_buffer.size()), data_ptr_cast(&header_ptr->index_buffer_count)); - Store((uint32_t)current_width, data_ptr_cast(&header_ptr->bitpacking_width)); - - D_ASSERT(current_width == BitpackingPrimitives::MinimumBitWidth(index_buffer.size() - 1)); - D_ASSERT(DictionaryCompressionStorage::HasEnoughSpace( - current_segment->count, index_buffer.size(), current_dictionary.size, current_width, info.GetBlockSize())); - D_ASSERT((uint64_t)*max_element(std::begin(selection_buffer), std::end(selection_buffer)) == - index_buffer.size() - 1); - - // Early-out, if the block is sufficiently full. - if (total_size >= info.GetCompactionFlushLimit()) { - return info.GetBlockSize(); - } - - // Sufficient space: calculate how much space we can save. - auto move_amount = info.GetBlockSize() - total_size; - - // Move the dictionary to align it with the offsets. 
- auto new_dictionary_offset = index_buffer_offset + index_buffer_size; - memmove(base_ptr + new_dictionary_offset, base_ptr + current_dictionary.end - current_dictionary.size, - current_dictionary.size); - current_dictionary.end -= move_amount; - D_ASSERT(current_dictionary.end == total_size); - - // Write the new dictionary with the updated "end". - DictionaryCompressionStorage::SetDictionary(*current_segment, handle, current_dictionary); - return total_size; - } }; //===--------------------------------------------------------------------===// // Analyze //===--------------------------------------------------------------------===// -struct DictionaryAnalyzeState : public DictionaryCompressionState { - explicit DictionaryAnalyzeState(const CompressionInfo &info) - : DictionaryCompressionState(info), segment_count(0), current_tuple_count(0), current_unique_count(0), - current_dict_size(0), current_width(0), next_width(0) { - } - - idx_t segment_count; - idx_t current_tuple_count; - idx_t current_unique_count; - idx_t current_dict_size; - StringHeap heap; - string_set_t current_set; - bitpacking_width_t current_width; - bitpacking_width_t next_width; - - bool LookupString(string_t str) override { - return current_set.count(str); - } - - void AddNewString(string_t str) override { - current_tuple_count++; - current_unique_count++; - current_dict_size += str.GetSize(); - if (str.IsInlined()) { - current_set.insert(str); - } else { - current_set.insert(heap.AddBlob(str)); - } - current_width = next_width; - } - - void AddLastLookup() override { - current_tuple_count++; - } - - void AddNull() override { - current_tuple_count++; - } - - bool CalculateSpaceRequirements(bool new_string, idx_t string_size) override { - if (!new_string) { - return DictionaryCompressionStorage::HasEnoughSpace(current_tuple_count + 1, current_unique_count, - current_dict_size, current_width, info.GetBlockSize()); - } - next_width = BitpackingPrimitives::MinimumBitWidth(current_unique_count + 2); // 1 for null, one for new string - return DictionaryCompressionStorage::HasEnoughSpace(current_tuple_count + 1, current_unique_count + 1, - current_dict_size + string_size, next_width, - info.GetBlockSize()); - } - - void Flush(bool final = false) override { - segment_count++; - current_tuple_count = 0; - current_unique_count = 0; - current_dict_size = 0; - current_set.clear(); - } - void Verify() override {}; -}; - -struct DictionaryCompressionAnalyzeState : public AnalyzeState { - explicit DictionaryCompressionAnalyzeState(const CompressionInfo &info) - : AnalyzeState(info), analyze_state(make_uniq(info)) { - } - - unique_ptr analyze_state; -}; - unique_ptr DictionaryCompressionStorage::StringInitAnalyze(ColumnData &col_data, PhysicalType type) { CompressionInfo info(col_data.GetBlockManager().GetBlockSize()); return make_uniq(info); @@ -410,11 +84,11 @@ idx_t DictionaryCompressionStorage::StringFinalAnalyze(AnalyzeState &state_p) { auto &state = *analyze_state.analyze_state; auto width = BitpackingPrimitives::MinimumBitWidth(state.current_unique_count + 1); - auto req_space = - RequiredSpace(state.current_tuple_count, state.current_unique_count, state.current_dict_size, width); + auto req_space = DictionaryCompression::RequiredSpace(state.current_tuple_count, state.current_unique_count, + state.current_dict_size, width); const auto total_space = state.segment_count * state.info.GetBlockSize() + req_space; - return LossyNumericCast(MINIMUM_COMPRESSION_RATIO * float(total_space)); + return 
LossyNumericCast(DictionaryCompression::MINIMUM_COMPRESSION_RATIO * float(total_space)); } //===--------------------------------------------------------------------===// @@ -438,47 +112,10 @@ void DictionaryCompressionStorage::FinalizeCompress(CompressionState &state_p) { //===--------------------------------------------------------------------===// // Scan //===--------------------------------------------------------------------===// -struct CompressedStringScanState : public StringScanState { - BufferHandle handle; - buffer_ptr dictionary; - idx_t dictionary_size; - bitpacking_width_t current_width; - buffer_ptr sel_vec; - idx_t sel_vec_size = 0; -}; - unique_ptr DictionaryCompressionStorage::StringInitScan(ColumnSegment &segment) { - auto state = make_uniq(); auto &buffer_manager = BufferManager::GetBufferManager(segment.db); - state->handle = buffer_manager.Pin(segment.block); - - auto baseptr = state->handle.Ptr() + segment.GetBlockOffset(); - - // Load header values - auto dict = DictionaryCompressionStorage::GetDictionary(segment, state->handle); - auto header_ptr = reinterpret_cast(baseptr); - auto index_buffer_offset = Load(data_ptr_cast(&header_ptr->index_buffer_offset)); - auto index_buffer_count = Load(data_ptr_cast(&header_ptr->index_buffer_count)); - state->current_width = (bitpacking_width_t)(Load(data_ptr_cast(&header_ptr->bitpacking_width))); - if (segment.GetBlockOffset() + index_buffer_offset + sizeof(uint32_t) * index_buffer_count > - segment.GetBlockManager().GetBlockSize()) { - throw IOException( - "Failed to scan dictionary string - index was out of range. Database file appears to be corrupted."); - } - - auto index_buffer_ptr = reinterpret_cast(baseptr + index_buffer_offset); - - state->dictionary = make_buffer(segment.type, index_buffer_count); - state->dictionary_size = index_buffer_count; - auto dict_child_data = FlatVector::GetData(*(state->dictionary)); - - for (uint32_t i = 0; i < index_buffer_count; i++) { - // NOTE: the passing of dict_child_vector, will not be used, its for big strings - uint16_t str_len = GetStringLength(index_buffer_ptr, i); - dict_child_data[i] = - FetchStringFromDict(segment, dict, baseptr, UnsafeNumericCast(index_buffer_ptr[i]), str_len); - } - + auto state = make_uniq(buffer_manager.Pin(segment.block)); + state->Initialize(segment, true); return std::move(state); } @@ -490,70 +127,13 @@ void DictionaryCompressionStorage::StringScanPartial(ColumnSegment &segment, Col Vector &result, idx_t result_offset) { // clear any previously locked buffers and get the primary buffer handle auto &scan_state = state.scan_state->Cast(); - auto start = segment.GetRelativeIndex(state.row_index); - - auto baseptr = scan_state.handle.Ptr() + segment.GetBlockOffset(); - auto dict = DictionaryCompressionStorage::GetDictionary(segment, scan_state.handle); - - auto header_ptr = reinterpret_cast(baseptr); - auto index_buffer_offset = Load(data_ptr_cast(&header_ptr->index_buffer_offset)); - auto index_buffer_ptr = reinterpret_cast(baseptr + index_buffer_offset); - - auto base_data = data_ptr_cast(baseptr + DICTIONARY_HEADER_SIZE); - auto result_data = FlatVector::GetData(result); + auto start = segment.GetRelativeIndex(state.row_index); if (!ALLOW_DICT_VECTORS || scan_count != STANDARD_VECTOR_SIZE || start % BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE != 0) { - // Emit regular vector - - // Handling non-bitpacking-group-aligned start values; - idx_t start_offset = start % BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE; - - // We will scan in 
blocks of BITPACKING_ALGORITHM_GROUP_SIZE, so we may scan some extra values. - idx_t decompress_count = BitpackingPrimitives::RoundUpToAlgorithmGroupSize(scan_count + start_offset); - - // Create a decompression buffer of sufficient size if we don't already have one. - if (!scan_state.sel_vec || scan_state.sel_vec_size < decompress_count) { - scan_state.sel_vec_size = decompress_count; - scan_state.sel_vec = make_buffer(decompress_count); - } - - data_ptr_t src = &base_data[((start - start_offset) * scan_state.current_width) / 8]; - sel_t *sel_vec_ptr = scan_state.sel_vec->data(); - - BitpackingPrimitives::UnPackBuffer(data_ptr_cast(sel_vec_ptr), src, decompress_count, - scan_state.current_width); - - for (idx_t i = 0; i < scan_count; i++) { - // Lookup dict offset in index buffer - auto string_number = scan_state.sel_vec->get_index(i + start_offset); - auto dict_offset = index_buffer_ptr[string_number]; - auto str_len = GetStringLength(index_buffer_ptr, UnsafeNumericCast(string_number)); - result_data[result_offset + i] = - FetchStringFromDict(segment, dict, baseptr, UnsafeNumericCast(dict_offset), str_len); - } - + scan_state.ScanToFlatVector(result, result_offset, start, scan_count); } else { - D_ASSERT(start % BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE == 0); - D_ASSERT(scan_count == STANDARD_VECTOR_SIZE); - D_ASSERT(result_offset == 0); - - idx_t decompress_count = BitpackingPrimitives::RoundUpToAlgorithmGroupSize(scan_count); - - // Create a selection vector of sufficient size if we don't already have one. - if (!scan_state.sel_vec || scan_state.sel_vec_size < decompress_count) { - scan_state.sel_vec_size = decompress_count; - scan_state.sel_vec = make_buffer(decompress_count); - } - - // Scanning 2048 values, emitting a dict vector - data_ptr_t dst = data_ptr_cast(scan_state.sel_vec->data()); - data_ptr_t src = data_ptr_cast(&base_data[(start * scan_state.current_width) / 8]); - - BitpackingPrimitives::UnPackBuffer(dst, src, scan_count, scan_state.current_width); - - result.Dictionary(*(scan_state.dictionary), scan_state.dictionary_size, *scan_state.sel_vec, scan_count); - DictionaryVector::SetDictionaryId(result, to_string(CastPointerToValue(&segment))); + scan_state.ScanToDictionaryVector(segment, result, result_offset, start, scan_count); } } @@ -568,91 +148,9 @@ void DictionaryCompressionStorage::StringScan(ColumnSegment &segment, ColumnScan void DictionaryCompressionStorage::StringFetchRow(ColumnSegment &segment, ColumnFetchState &state, row_t row_id, Vector &result, idx_t result_idx) { // fetch a single row from the string segment - // first pin the main buffer if it is not already pinned - auto &handle = state.GetOrInsertHandle(segment); - - auto baseptr = handle.Ptr() + segment.GetBlockOffset(); - auto header_ptr = reinterpret_cast(baseptr); - auto dict = DictionaryCompressionStorage::GetDictionary(segment, handle); - auto index_buffer_offset = Load(data_ptr_cast(&header_ptr->index_buffer_offset)); - auto width = (bitpacking_width_t)Load(data_ptr_cast(&header_ptr->bitpacking_width)); - auto index_buffer_ptr = reinterpret_cast(baseptr + index_buffer_offset); - auto base_data = data_ptr_cast(baseptr + DICTIONARY_HEADER_SIZE); - auto result_data = FlatVector::GetData(result); - - // Handling non-bitpacking-group-aligned start values; - idx_t start_offset = NumericCast(row_id) % BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE; - - // Decompress part of selection buffer we need for this value. 
- sel_t decompression_buffer[BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE]; - data_ptr_t src = data_ptr_cast(&base_data[((NumericCast(row_id) - start_offset) * width) / 8]); - BitpackingPrimitives::UnPackBuffer(data_ptr_cast(decompression_buffer), src, - BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE, width); - - auto selection_value = decompression_buffer[start_offset]; - auto dict_offset = index_buffer_ptr[selection_value]; - uint16_t str_len = GetStringLength(index_buffer_ptr, selection_value); - - result_data[result_idx] = FetchStringFromDict(segment, dict, baseptr, NumericCast(dict_offset), str_len); -} - -//===--------------------------------------------------------------------===// -// Helper Functions -//===--------------------------------------------------------------------===// -bool DictionaryCompressionStorage::HasEnoughSpace(idx_t current_count, idx_t index_count, idx_t dict_size, - bitpacking_width_t packing_width, const idx_t block_size) { - return RequiredSpace(current_count, index_count, dict_size, packing_width) <= block_size; -} - -idx_t DictionaryCompressionStorage::RequiredSpace(idx_t current_count, idx_t index_count, idx_t dict_size, - bitpacking_width_t packing_width) { - idx_t base_space = DICTIONARY_HEADER_SIZE + dict_size; - idx_t string_number_space = BitpackingPrimitives::GetRequiredSize(current_count, packing_width); - idx_t index_space = index_count * sizeof(uint32_t); - - idx_t used_space = base_space + index_space + string_number_space; - - return used_space; -} - -StringDictionaryContainer DictionaryCompressionStorage::GetDictionary(ColumnSegment &segment, BufferHandle &handle) { - auto header_ptr = reinterpret_cast(handle.Ptr() + segment.GetBlockOffset()); - StringDictionaryContainer container; - container.size = Load(data_ptr_cast(&header_ptr->dict_size)); - container.end = Load(data_ptr_cast(&header_ptr->dict_end)); - return container; -} - -void DictionaryCompressionStorage::SetDictionary(ColumnSegment &segment, BufferHandle &handle, - StringDictionaryContainer container) { - auto header_ptr = reinterpret_cast(handle.Ptr() + segment.GetBlockOffset()); - Store(container.size, data_ptr_cast(&header_ptr->dict_size)); - Store(container.end, data_ptr_cast(&header_ptr->dict_end)); -} - -string_t DictionaryCompressionStorage::FetchStringFromDict(ColumnSegment &segment, StringDictionaryContainer dict, - data_ptr_t baseptr, int32_t dict_offset, - uint16_t string_len) { - - D_ASSERT(dict_offset >= 0 && dict_offset <= NumericCast(segment.GetBlockManager().GetBlockSize())); - if (dict_offset == 0) { - return string_t(nullptr, 0); - } - - // normal string: read string from this block - auto dict_end = baseptr + dict.end; - auto dict_pos = dict_end - dict_offset; - - auto str_ptr = char_ptr_cast(dict_pos); - return string_t(str_ptr, string_len); -} - -uint16_t DictionaryCompressionStorage::GetStringLength(uint32_t *index_buffer_ptr, sel_t index) { - if (index == 0) { - return 0; - } else { - return UnsafeNumericCast(index_buffer_ptr[index] - index_buffer_ptr[index - 1]); - } + CompressedStringScanState scan_state(state.GetOrInsertHandle(segment)); + scan_state.Initialize(segment, false); + scan_state.ScanToFlatVector(result, result_idx, NumericCast(row_id), 1); } //===--------------------------------------------------------------------===// @@ -672,4 +170,5 @@ CompressionFunction DictionaryCompressionFun::GetFunction(PhysicalType data_type bool DictionaryCompressionFun::TypeIsSupported(const PhysicalType physical_type) { return physical_type == 
PhysicalType::VARCHAR; } + } // namespace duckdb diff --git a/src/duckdb/src/storage/data_table.cpp b/src/duckdb/src/storage/data_table.cpp index 082896b1..81bf5010 100644 --- a/src/duckdb/src/storage/data_table.cpp +++ b/src/duckdb/src/storage/data_table.cpp @@ -319,6 +319,21 @@ bool DataTable::HasIndexes() const { return !info->indexes.Empty(); } +bool DataTable::HasUniqueIndexes() const { + if (!HasIndexes()) { + return false; + } + bool has_unique_index = false; + info->indexes.Scan([&](Index &index) { + if (index.IsUnique()) { + has_unique_index = true; + return true; + } + return false; + }); + return has_unique_index; +} + void DataTable::AddIndex(unique_ptr index) { info->indexes.AddIndex(std::move(index)); } @@ -489,15 +504,12 @@ static idx_t FirstMissingMatch(const ManagedSelection &matches) { } idx_t LocateErrorIndex(bool is_append, const ManagedSelection &matches) { - idx_t failed_index = DConstants::INVALID_INDEX; + // We expected to find nothing, so the first error is the first match. if (!is_append) { - // We expected to find nothing, so the first error is the first match - failed_index = matches[0]; - } else { - // We expected to find matches for all of them, so the first missing match is the first error - return FirstMissingMatch(matches); + return matches[0]; } - return failed_index; + // We expected to find matches for all of them, so the first missing match is the first error. + return FirstMissingMatch(matches); } [[noreturn]] static void ThrowForeignKeyConstraintError(idx_t failed_index, bool is_append, Index &conflict_index, @@ -669,79 +681,82 @@ void DataTable::VerifyNewConstraint(LocalStorage &local_storage, DataTable &pare local_storage.VerifyNewConstraint(parent, constraint); } -bool HasUniqueIndexes(TableIndexList &list) { - bool has_unique_index = false; - list.Scan([&](Index &index) { - if (index.IsUnique()) { - has_unique_index = true; - return true; - } - return false; - }); - return has_unique_index; -} - -void DataTable::VerifyUniqueIndexes(TableIndexList &indexes, ClientContext &context, DataChunk &chunk, - optional_ptr conflict_manager) { - //! check whether or not the chunk can be inserted into the indexes - if (!conflict_manager) { - // Only need to verify that no unique constraints are violated - indexes.Scan([&](Index &index) { - if (!index.IsUnique()) { +void DataTable::VerifyUniqueIndexes(TableIndexList &indexes, optional_ptr storage, DataChunk &chunk, + optional_ptr manager) { + // Verify the constraint without a conflict manager. + if (!manager) { + return indexes.ScanBound([&](ART &art) { + if (!art.IsUnique()) { return false; } - D_ASSERT(index.IsBound()); - index.Cast().VerifyAppend(chunk); + + if (storage) { + auto delete_index = storage->delete_indexes.Find(art.GetIndexName()); + art.VerifyAppend(chunk, delete_index, nullptr); + } else { + art.VerifyAppend(chunk, nullptr, nullptr); + } return false; }); - return; } - D_ASSERT(conflict_manager); - // The conflict manager is only provided when a ON CONFLICT clause was provided to the INSERT statement + // The conflict manager is only provided for statements containing ON CONFLICT. + auto &conflict_info = manager->GetConflictInfo(); - auto &conflict_info = conflict_manager->GetConflictInfo(); - // First we figure out how many indexes match our conflict target - // So we can optimize accordingly - indexes.Scan([&](Index &index) { - if (!index.IsUnique()) { + // Find all indexes matching the conflict target. 
+ indexes.ScanBound([&](ART &art) { + if (!art.IsUnique()) { return false; } - if (conflict_info.ConflictTargetMatches(index)) { - D_ASSERT(index.IsBound()); - conflict_manager->AddIndex(index.Cast()); + if (!conflict_info.ConflictTargetMatches(art)) { + return false; + } + + if (storage) { + auto delete_index = storage->delete_indexes.Find(art.GetIndexName()); + manager->AddIndex(art, delete_index); + } else { + manager->AddIndex(art, nullptr); } return false; }); - conflict_manager->SetMode(ConflictManagerMode::SCAN); - // First we verify only the indexes that match our conflict target - for (auto index : conflict_manager->MatchedIndexes()) { - index->VerifyAppend(chunk, *conflict_manager); + + // Verify indexes matching the conflict target. + manager->SetMode(ConflictManagerMode::SCAN); + auto &matched_indexes = manager->MatchedIndexes(); + auto &matched_delete_indexes = manager->MatchedDeleteIndexes(); + for (idx_t i = 0; i < matched_indexes.size(); i++) { + matched_indexes[i].get().VerifyAppend(chunk, matched_delete_indexes[i], *manager); } - conflict_manager->SetMode(ConflictManagerMode::THROW); - // Then we scan the other indexes, throwing if they cause conflicts on tuples that were not found during - // the scan - indexes.Scan([&](Index &index) { - if (!index.IsUnique()) { + // Scan the other indexes and throw if there are any conflicts. + manager->SetMode(ConflictManagerMode::THROW); + indexes.ScanBound([&](ART &art) { + if (!art.IsUnique()) { return false; } - D_ASSERT(index.IsBound()); - auto &bound_index = index.Cast(); - if (conflict_manager->MatchedIndex(bound_index)) { - // Already checked this constraint + if (manager->MatchedIndex(art)) { return false; } - bound_index.VerifyAppend(chunk, *conflict_manager); + + if (storage) { + auto delete_index = storage->delete_indexes.Find(art.GetIndexName()); + art.VerifyAppend(chunk, delete_index, *manager); + } else { + art.VerifyAppend(chunk, nullptr, *manager); + } return false; }); } -void DataTable::VerifyAppendConstraints(ConstraintState &state, ClientContext &context, DataChunk &chunk, - optional_ptr conflict_manager) { - auto &table = state.table; +void DataTable::VerifyAppendConstraints(ConstraintState &constraint_state, ClientContext &context, DataChunk &chunk, + optional_ptr storage, + optional_ptr manager) { + + auto &table = constraint_state.table; + if (table.HasGeneratedColumns()) { - // Verify that the generated columns expression work with the inserted values + // Verify the generated columns against the inserted values.
auto binder = Binder::CreateBinder(context); physical_index_set_t bound_columns; CheckBinder generated_check_binder(*binder, context, table.name, table.GetColumns(), bound_columns); @@ -757,14 +772,14 @@ void DataTable::VerifyAppendConstraints(ConstraintState &state, ClientContext &c } } - if (HasUniqueIndexes(info->indexes)) { - VerifyUniqueIndexes(info->indexes, context, chunk, conflict_manager); + if (HasUniqueIndexes()) { + VerifyUniqueIndexes(info->indexes, storage, chunk, manager); } auto &constraints = table.GetConstraints(); - for (idx_t i = 0; i < state.bound_constraints.size(); i++) { + for (idx_t i = 0; i < constraint_state.bound_constraints.size(); i++) { auto &base_constraint = constraints[i]; - auto &constraint = state.bound_constraints[i]; + auto &constraint = constraint_state.bound_constraints[i]; switch (base_constraint->type) { case ConstraintType::NOT_NULL: { auto &bound_not_null = constraint->Cast(); @@ -780,7 +795,7 @@ void DataTable::VerifyAppendConstraints(ConstraintState &state, ClientContext &c break; } case ConstraintType::UNIQUE: { - // These were handled earlier on + // These were handled earlier. break; } case ConstraintType::FOREIGN_KEY: { @@ -792,7 +807,7 @@ void DataTable::VerifyAppendConstraints(ConstraintState &state, ClientContext &c break; } default: - throw NotImplementedException("Constraint type not implemented!"); + throw InternalException("invalid constraint type"); } } } @@ -810,28 +825,37 @@ void DataTable::InitializeLocalAppend(LocalAppendState &state, TableCatalogEntry } auto &local_storage = LocalStorage::Get(context, db); local_storage.InitializeAppend(state, *this); + state.constraint_state = InitializeConstraintState(table, bound_constraints); +} +void DataTable::InitializeLocalStorage(LocalAppendState &state, TableCatalogEntry &table, ClientContext &context, + const vector> &bound_constraints) { + if (!is_root) { + throw TransactionException("Transaction conflict: adding entries to a table that has been altered!"); + } + + auto &local_storage = LocalStorage::Get(context, db); + local_storage.InitializeStorage(state, *this); state.constraint_state = InitializeConstraintState(table, bound_constraints); } -void DataTable::LocalAppend(LocalAppendState &state, TableCatalogEntry &table, ClientContext &context, DataChunk &chunk, - bool unsafe) { +void DataTable::LocalAppend(LocalAppendState &state, ClientContext &context, DataChunk &chunk, bool unsafe) { if (chunk.size() == 0) { return; } - D_ASSERT(chunk.ColumnCount() == table.GetColumns().PhysicalColumnCount()); if (!is_root) { - throw TransactionException("Transaction conflict: adding entries to a table that has been altered!"); + throw TransactionException("write conflict: adding entries to a table that has been altered"); } - chunk.Verify(); - // verify any constraints on the new chunk + // Insert any row ids into the DELETE ART and verify constraints afterward. + // This happens only for the global indexes. if (!unsafe) { - VerifyAppendConstraints(*state.constraint_state, context, chunk); + auto &constraint_state = *state.constraint_state; + VerifyAppendConstraints(constraint_state, context, chunk, *state.storage, nullptr); } - // append to the transaction local data + // Append to the transaction-local data. 
LocalStorage::Append(state, chunk); } @@ -859,7 +883,20 @@ void DataTable::LocalAppend(TableCatalogEntry &table, ClientContext &context, Da LocalAppendState append_state; auto &storage = table.GetStorage(); storage.InitializeLocalAppend(append_state, table, context, bound_constraints); - storage.LocalAppend(append_state, table, context, chunk); + + storage.LocalAppend(append_state, context, chunk, false); + storage.FinalizeLocalAppend(append_state); +} + +void DataTable::LocalAppend(TableCatalogEntry &table, ClientContext &context, DataChunk &chunk, + const vector> &bound_constraints, Vector &row_ids, + DataChunk &delete_chunk) { + LocalAppendState append_state; + auto &storage = table.GetStorage(); + storage.InitializeLocalAppend(append_state, table, context, bound_constraints); + append_state.storage->AppendToDeleteIndexes(row_ids, delete_chunk); + + storage.LocalAppend(append_state, context, chunk, false); storage.FinalizeLocalAppend(append_state); } @@ -873,7 +910,7 @@ void DataTable::LocalAppend(TableCatalogEntry &table, ClientContext &context, Co if (!column_ids || column_ids->empty()) { for (auto &chunk : collection.Chunks()) { - storage.LocalAppend(append_state, table, context, chunk); + storage.LocalAppend(append_state, context, chunk, false); } storage.FinalizeLocalAppend(append_state); return; @@ -916,7 +953,7 @@ void DataTable::LocalAppend(TableCatalogEntry &table, ClientContext &context, Co for (auto &chunk : collection.Chunks()) { expression_executor.Execute(chunk, result); - storage.LocalAppend(append_state, table, context, result); + storage.LocalAppend(append_state, context, result, false); result.Reset(); } storage.FinalizeLocalAppend(append_state); @@ -1086,30 +1123,40 @@ void DataTable::RevertAppend(DuckTransaction &transaction, idx_t start_row, idx_ //===--------------------------------------------------------------------===// // Indexes //===--------------------------------------------------------------------===// -ErrorData DataTable::AppendToIndexes(TableIndexList &indexes, DataChunk &chunk, row_t row_start) { +ErrorData DataTable::AppendToIndexes(TableIndexList &indexes, optional_ptr delete_indexes, + DataChunk &chunk, row_t row_start) { ErrorData error; if (indexes.Empty()) { return error; } + // first generate the vector of row identifiers - Vector row_identifiers(LogicalType::ROW_TYPE); - VectorOperations::GenerateSequence(row_identifiers, chunk.size(), row_start, 1); + Vector row_ids(LogicalType::ROW_TYPE); + VectorOperations::GenerateSequence(row_ids, chunk.size(), row_start, 1); vector already_appended; bool append_failed = false; // now append the entries to the indices indexes.Scan([&](Index &index_to_append) { if (!index_to_append.IsBound()) { - error = ErrorData("Unbound index found in DataTable::AppendToIndexes"); - append_failed = true; - return true; + throw InternalException("unbound index in DataTable::AppendToIndexes"); } auto &index = index_to_append.Cast(); + + // Find the matching delete index. + optional_ptr delete_index; + if (index.IsUnique()) { + if (delete_indexes) { + delete_index = delete_indexes->Find(index.name); + } + } + try { - error = index.Append(chunk, row_identifiers); + error = index.AppendWithDeleteIndex(chunk, row_ids, delete_index); } catch (std::exception &ex) { error = ErrorData(ex); } + if (error.HasError()) { append_failed = true; return true; @@ -1122,15 +1169,15 @@ ErrorData DataTable::AppendToIndexes(TableIndexList &indexes, DataChunk &chunk, // constraint violation! 
// remove any appended entries from previous indexes (if any) for (auto *index : already_appended) { - index->Delete(chunk, row_identifiers); + index->Delete(chunk, row_ids); } } return error; } -ErrorData DataTable::AppendToIndexes(DataChunk &chunk, row_t row_start) { +ErrorData DataTable::AppendToIndexes(optional_ptr delete_indexes, DataChunk &chunk, row_t row_start) { D_ASSERT(is_root); - return AppendToIndexes(info->indexes, chunk, row_start); + return AppendToIndexes(info->indexes, delete_indexes, chunk, row_start); } void DataTable::RemoveFromIndexes(TableAppendState &state, DataChunk &chunk, row_t row_start) { @@ -1257,8 +1304,9 @@ idx_t DataTable::Delete(TableDeleteState &state, ClientContext &context, Vector idx_t current_count = pos - start; Vector offset_ids(row_identifiers, current_offset, pos); + + // This is a transaction-local DELETE. if (is_transaction_delete) { - // transaction-local delete if (state.has_delete_constraints) { // perform the constraint verification ColumnFetchState fetch_state; @@ -1267,16 +1315,17 @@ idx_t DataTable::Delete(TableDeleteState &state, ClientContext &context, Vector VerifyDeleteConstraints(state, context, state.verify_chunk); } delete_count += local_storage.Delete(*this, offset_ids, current_count); - } else { - // regular table delete - if (state.has_delete_constraints) { - // perform the constraint verification - ColumnFetchState fetch_state; - Fetch(transaction, state.verify_chunk, state.col_ids, offset_ids, current_count, fetch_state); - VerifyDeleteConstraints(state, context, state.verify_chunk); - } - delete_count += row_groups->Delete(transaction, *this, ids + current_offset, current_count); + continue; + } + + // This is a regular DELETE. + if (state.has_delete_constraints) { + // perform the constraint verification + ColumnFetchState fetch_state; + Fetch(transaction, state.verify_chunk, state.col_ids, offset_ids, current_count, fetch_state); + VerifyDeleteConstraints(state, context, state.verify_chunk); } + delete_count += row_groups->Delete(transaction, *this, ids + current_offset, current_count); } return delete_count; } @@ -1358,15 +1407,15 @@ void DataTable::VerifyUpdateConstraints(ConstraintState &state, ClientContext &c throw NotImplementedException("Constraint type not implemented!"); } } - // update should not be called for indexed columns! - // instead update should have been rewritten to delete + update on higher layer + #ifdef DEBUG + // Ensure that we never call UPDATE for indexed columns. + // Instead, we must rewrite these updates into DELETE + INSERT. info->indexes.Scan([&](Index &index) { D_ASSERT(index.IsBound()); D_ASSERT(!index.Cast().IndexIsUpdated(column_ids)); return false; }); - #endif } @@ -1468,6 +1517,10 @@ void DataTable::SetDistinct(column_t column_id, unique_ptr d row_groups->SetDistinct(column_id, std::move(distinct_stats)); } +unique_ptr DataTable::GetSample() { + return row_groups->GetSample(); +} + //===--------------------------------------------------------------------===// // Checkpoint //===--------------------------------------------------------------------===// @@ -1484,8 +1537,8 @@ void DataTable::Checkpoint(TableDataWriter &writer, Serializer &serializer) { TableStatistics global_stats; row_groups->CopyStats(global_stats); row_groups->Checkpoint(writer, global_stats); - // The row group payload data has been written. 
@@ -1468,6 +1517,10 @@ void DataTable::SetDistinct(column_t column_id, unique_ptr<DistinctStatistics> d
 	row_groups->SetDistinct(column_id, std::move(distinct_stats));
 }

+unique_ptr<BlockingSample> DataTable::GetSample() {
+	return row_groups->GetSample();
+}
+
 //===--------------------------------------------------------------------===//
 // Checkpoint
 //===--------------------------------------------------------------------===//
@@ -1484,8 +1537,8 @@ void DataTable::Checkpoint(TableDataWriter &writer, Serializer &serializer) {
 	TableStatistics global_stats;
 	row_groups->CopyStats(global_stats);
 	row_groups->Checkpoint(writer, global_stats);
-	// The row group payload data has been written. Now write:
+	// sample
 	// column stats
 	// row-group pointers
 	// table pointer
diff --git a/src/duckdb/src/storage/local_storage.cpp b/src/duckdb/src/storage/local_storage.cpp
index 76e5292d..71b20bd1 100644
--- a/src/duckdb/src/storage/local_storage.cpp
+++ b/src/duckdb/src/storage/local_storage.cpp
@@ -17,6 +17,7 @@ namespace duckdb {
 LocalTableStorage::LocalTableStorage(ClientContext &context, DataTable &table)
     : table_ref(table), allocator(Allocator::Get(table.db)), deleted_rows(0), optimistic_writer(table),
       merged_storage(false) {
+	auto types = table.GetTypes();
 	auto data_table_info = table.GetDataTableInfo();
 	auto &io_manager = TableIOManager::Get(table);
@@ -24,16 +25,28 @@ LocalTableStorage::LocalTableStorage(ClientContext &context, DataTable &table)
 	row_groups->InitializeEmpty();

 	data_table_info->GetIndexes().BindAndScan(context, *data_table_info, [&](ART &art) {
-		if (art.GetConstraintType() != IndexConstraintType::NONE) {
-			// unique index: create a local ART index that maintains the same unique constraint
-			vector<unique_ptr<Expression>> unbound_expressions;
-			unbound_expressions.reserve(art.unbound_expressions.size());
-			for (auto &expr : art.unbound_expressions) {
-				unbound_expressions.push_back(expr->Copy());
-			}
-			indexes.AddIndex(make_uniq<ART>(art.GetIndexName(), art.GetConstraintType(), art.GetColumnIds(),
-			                                art.table_io_manager, std::move(unbound_expressions), art.db));
+		auto constraint_type = art.GetConstraintType();
+		if (constraint_type == IndexConstraintType::NONE) {
+			return false;
 		}
+
+		// UNIQUE constraint.
+		vector<unique_ptr<Expression>> expressions;
+		vector<unique_ptr<Expression>> delete_expressions;
+		for (auto &expr : art.unbound_expressions) {
+			expressions.push_back(expr->Copy());
+			delete_expressions.push_back(expr->Copy());
+		}
+
+		// Create a delete index and a local index.
+		auto delete_index = make_uniq<ART>(art.GetIndexName(), constraint_type, art.GetColumnIds(),
+		                                   art.table_io_manager, std::move(delete_expressions), art.db);
+		delete_index->append_mode = ARTAppendMode::IGNORE_DUPLICATES;
+		delete_indexes.AddIndex(std::move(delete_index));
+
+		auto index = make_uniq<ART>(art.GetIndexName(), constraint_type, art.GetColumnIds(), art.table_io_manager,
+		                            std::move(expressions), art.db);
+		append_indexes.AddIndex(std::move(index));
 		return false;
 	});
 }
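The delete index above is created with ARTAppendMode::IGNORE_DUPLICATES: registering the same key for deletion twice must be a no-op rather than a violation. In a toy model:

    #include <string>
    #include <unordered_set>

    // Toy delete index with IGNORE_DUPLICATES semantics: re-registering a key is
    // harmless, and membership is all the append path needs to query.
    struct ToyDeleteIndex {
        std::unordered_set<std::string> keys;

        void Append(const std::string &key) {
            keys.insert(key); // inserting an existing key is simply ignored
        }
        bool Contains(const std::string &key) const {
            return keys.count(key) != 0;
        }
    };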
@@ -46,7 +59,7 @@ LocalTableStorage::LocalTableStorage(ClientContext &context, DataTable &new_dt,
       merged_storage(parent.merged_storage) {
 	row_groups = parent.row_groups->AlterType(context, changed_idx, target_type, bound_columns, cast_expr);
 	parent.row_groups.reset();
-	indexes.Move(parent.indexes);
+	append_indexes.Move(parent.append_indexes);
 }

 LocalTableStorage::LocalTableStorage(DataTable &new_dt, LocalTableStorage &parent, idx_t drop_idx)
@@ -55,7 +68,7 @@ LocalTableStorage::LocalTableStorage(DataTable &new_dt, LocalTableStorage &paren
       merged_storage(parent.merged_storage) {
 	row_groups = parent.row_groups->RemoveColumn(drop_idx);
 	parent.row_groups.reset();
-	indexes.Move(parent.indexes);
+	append_indexes.Move(parent.append_indexes);
 }

 LocalTableStorage::LocalTableStorage(ClientContext &context, DataTable &new_dt, LocalTableStorage &parent,
@@ -65,7 +78,7 @@ LocalTableStorage::LocalTableStorage(ClientContext &context, DataTable &new_dt,
       merged_storage(parent.merged_storage) {
 	row_groups = parent.row_groups->AddColumn(context, new_column, default_executor);
 	parent.row_groups.reset();
-	indexes.Move(parent.indexes);
+	append_indexes.Move(parent.append_indexes);
 }

 LocalTableStorage::~LocalTableStorage() {
@@ -91,7 +104,7 @@ idx_t LocalTableStorage::EstimatedSize() {
 	// get the index size
 	idx_t index_sizes = 0;
-	indexes.Scan([&](Index &index) {
+	append_indexes.Scan([&](Index &index) {
 		D_ASSERT(index.IsBound());
 		index_sizes += index.Cast<BoundIndex>().GetInMemorySize();
 		return false;
@@ -139,7 +152,7 @@ ErrorData LocalTableStorage::AppendToIndexes(DuckTransaction &transaction, RowGr
 		}
 		mock_chunk.SetCardinality(chunk);
 		// append this chunk to the indexes of the table
-		error = DataTable::AppendToIndexes(index_list, mock_chunk, start_row);
+		error = DataTable::AppendToIndexes(index_list, nullptr, mock_chunk, start_row);
 		if (error.HasError()) {
 			return false;
 		}
@@ -150,7 +163,7 @@ ErrorData LocalTableStorage::AppendToIndexes(DuckTransaction &transaction, RowGr
 }

 void LocalTableStorage::AppendToIndexes(DuckTransaction &transaction, TableAppendState &append_state,
-                                        idx_t append_count, bool append_to_table) {
+                                        bool append_to_table) {
 	auto &table = table_ref.get();
 	if (append_to_table) {
 		table.InitializeAppend(transaction, append_state);
@@ -160,7 +173,7 @@ void LocalTableStorage::AppendToIndexes(DuckTransaction &transaction, TableAppen
 		// appending: need to scan entire
 		row_groups->Scan(transaction, [&](DataChunk &chunk) -> bool {
 			// append this chunk to the indexes of the table
-			error = table.AppendToIndexes(chunk, append_state.current_row);
+			error = table.AppendToIndexes(delete_indexes, chunk, append_state.current_row);
 			if (error.HasError()) {
 				return false;
 			}
@@ -359,19 +372,44 @@ void LocalStorage::InitializeAppend(LocalAppendState &state, DataTable &table) {
 	state.storage->row_groups->InitializeAppend(TransactionData(transaction), state.append_state);
 }

+void LocalStorage::InitializeStorage(LocalAppendState &state, DataTable &table) {
+	table.InitializeIndexes(context);
+	state.storage = &table_manager.GetOrCreateStorage(context, table);
+}
+
+void LocalTableStorage::AppendToDeleteIndexes(Vector &row_ids, DataChunk &delete_chunk) {
+	if (delete_chunk.size() == 0) {
+		return;
+	}
+
+	delete_indexes.ScanBound<ART>([&](ART &art) {
+		if (!art.IsUnique()) {
+			return false;
+		}
+		auto result = art.Cast<ART>().Append(delete_chunk, row_ids);
+		if (result.HasError()) {
+			throw InternalException("unexpected constraint violation on delete ART: ", result.Message());
+		}
+		return false;
+	});
+}
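LocalStorage::Append below derives base_id from MAX_ROW_ID plus the rows already held locally, so transaction-local index entries occupy a row-id range that cannot collide with committed rows. A sketch of that arithmetic (the constant here is a placeholder, not DuckDB's actual MAX_ROW_ID):

    #include <cstdint>

    // Placeholder value; DuckDB defines its own MAX_ROW_ID constant.
    constexpr int64_t kToyMaxRowId = int64_t(1) << 62;

    // Row ids handed to transaction-local index entries start above the committed
    // range and advance with every chunk appended in this transaction.
    int64_t ToyLocalBaseRowId(int64_t local_total_rows, int64_t total_append_count) {
        return kToyMaxRowId + local_total_rows + total_append_count;
    }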
 void LocalStorage::Append(LocalAppendState &state, DataChunk &chunk) {
-	// append to unique indices (if any)
+	// Append to any unique indexes.
 	auto storage = state.storage;
-	idx_t base_id =
-	    NumericCast<idx_t>(MAX_ROW_ID) + storage->row_groups->GetTotalRows() + state.append_state.total_append_count;
-	auto error = DataTable::AppendToIndexes(storage->indexes, chunk, NumericCast<row_t>(base_id));
+	auto offset = NumericCast<idx_t>(MAX_ROW_ID) + storage->row_groups->GetTotalRows();
+	idx_t base_id = offset + state.append_state.total_append_count;
+
+	auto error = DataTable::AppendToIndexes(storage->append_indexes, storage->delete_indexes, chunk,
+	                                        NumericCast<row_t>(base_id));
 	if (error.HasError()) {
 		error.Throw();
 	}

-	//! Append the chunk to the local storage
+	// Append the chunk to the local storage.
 	auto new_row_group = storage->row_groups->Append(chunk, state.append_state);

-	//! Check if we should pre-emptively flush blocks to disk
+	// Check if we should pre-emptively flush blocks to disk.
 	if (new_row_group) {
 		storage->WriteNewRowGroup();
 	}
@@ -383,10 +421,11 @@ void LocalStorage::FinalizeAppend(LocalAppendState &state) {

 void LocalStorage::LocalMerge(DataTable &table, RowGroupCollection &collection) {
 	auto &storage = table_manager.GetOrCreateStorage(context, table);
-	if (!storage.indexes.Empty()) {
+	if (!storage.append_indexes.Empty()) {
 		// append data to indexes if required
 		row_t base_id = MAX_ROW_ID + NumericCast<row_t>(storage.row_groups->GetTotalRows());
-		auto error = storage.AppendToIndexes(transaction, collection, storage.indexes, table.GetTypes(), base_id);
+		auto error =
+		    storage.AppendToIndexes(transaction, collection, storage.append_indexes, table.GetTypes(), base_id);
 		if (error.HasError()) {
 			error.Throw();
 		}
@@ -422,8 +461,8 @@ idx_t LocalStorage::Delete(DataTable &table, Vector &row_ids, idx_t count) {
 	D_ASSERT(storage);

 	// delete from unique indices (if any)
-	if (!storage->indexes.Empty()) {
-		storage->row_groups->RemoveFromIndexes(storage->indexes, row_ids, count);
+	if (!storage->append_indexes.Empty()) {
+		storage->row_groups->RemoveFromIndexes(storage->append_indexes, row_ids, count);
 	}

 	auto ids = FlatVector::GetData<row_t>(row_ids);
@@ -468,17 +507,17 @@ void LocalStorage::Flush(DataTable &table, LocalTableStorage &storage, optional_
 		// FIXME: we should be able to merge the transaction-local index directly into the main table index
 		// as long as we just rewrite some row-ids
 		if (table.HasIndexes()) {
-			storage.AppendToIndexes(transaction, append_state, append_count, false);
+			storage.AppendToIndexes(transaction, append_state, false);
 		}
 		// finally move over the row groups
-		table.MergeStorage(*storage.row_groups, storage.indexes, commit_state);
+		table.MergeStorage(*storage.row_groups, storage.append_indexes, commit_state);
 	} else {
 		// check if we have written data
 		// if we have, we cannot merge to disk after all
 		// so we need to revert the data we have already written
 		storage.Rollback();
 		// append to the indexes and append to the base table
-		storage.AppendToIndexes(transaction, append_state, append_count, true);
+		storage.AppendToIndexes(transaction, append_state, true);
 	}

 	// possibly vacuum any excess index data
@@ -596,7 +635,11 @@ TableIndexList &LocalStorage::GetIndexes(DataTable &table) {
 	if (!storage) {
 		throw InternalException("LocalStorage::GetIndexes - local storage not found");
 	}
-	return storage->indexes;
+	return storage->append_indexes;
+}
+
+optional_ptr<LocalTableStorage> LocalStorage::GetStorage(DataTable &table) {
+	return table_manager.GetStorage(table);
 }

 void LocalStorage::VerifyNewConstraint(DataTable &parent, const BoundConstraint &constraint) {
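Flush above has two modes: when the local row groups can be merged wholesale, only the index entries are replayed (append_to_table = false); otherwise everything is re-appended through the regular table path. A toy illustration of the two modes (the merge condition is simplified and not shown in this hunk):

    #include <iostream>

    // Toy model of the flush modes; real logic lives in LocalStorage::Flush.
    struct ToyLocalStorage {
        void AppendToIndexes(bool append_to_table) {
            std::cout << (append_to_table ? "replay chunks through the table\n"
                                          : "replay index entries only\n");
        }
        void MergeStorage() {
            std::cout << "move local row groups into the table\n";
        }
        void Flush(bool can_merge) {
            if (can_merge) {
                AppendToIndexes(false); // indexes only
                MergeStorage();         // row groups move over wholesale
            } else {
                AppendToIndexes(true);  // full re-append
            }
        }
    };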
diff --git a/src/duckdb/src/storage/serialization/serialize_logical_operator.cpp b/src/duckdb/src/storage/serialization/serialize_logical_operator.cpp
index e5df17d5..9288cec2 100644
--- a/src/duckdb/src/storage/serialization/serialize_logical_operator.cpp
+++ b/src/duckdb/src/storage/serialization/serialize_logical_operator.cpp
@@ -508,6 +508,7 @@ void LogicalInsert::Serialize(Serializer &serializer) const {
 	serializer.WritePropertyWithDefault<vector<idx_t>>(215, "columns_to_fetch", columns_to_fetch);
 	serializer.WritePropertyWithDefault<vector<idx_t>>(216, "source_columns", source_columns);
 	serializer.WritePropertyWithDefault<vector<unique_ptr<Expression>>>(217, "expressions", expressions);
+	serializer.WritePropertyWithDefault<bool>(218, "update_is_del_and_insert", update_is_del_and_insert, false);
 }

 unique_ptr<LogicalOperator> LogicalInsert::Deserialize(Deserializer &deserializer) {
@@ -530,6 +531,7 @@ unique_ptr<LogicalOperator> LogicalInsert::Deserialize(Deserializer &deserializer) {
 	deserializer.ReadPropertyWithDefault<vector<idx_t>>(215, "columns_to_fetch", result->columns_to_fetch);
 	deserializer.ReadPropertyWithDefault<vector<idx_t>>(216, "source_columns", result->source_columns);
 	deserializer.ReadPropertyWithDefault<vector<unique_ptr<Expression>>>(217, "expressions", result->expressions);
+	deserializer.ReadPropertyWithExplicitDefault<bool>(218, "update_is_del_and_insert", result->update_is_del_and_insert, false);
 	return std::move(result);
 }
diff --git a/src/duckdb/src/storage/serialization/serialize_nodes.cpp b/src/duckdb/src/storage/serialization/serialize_nodes.cpp
index 440976e8..2c723038 100644
--- a/src/duckdb/src/storage/serialization/serialize_nodes.cpp
+++ b/src/duckdb/src/storage/serialization/serialize_nodes.cpp
@@ -596,8 +596,8 @@ void ReservoirSample::Serialize(Serializer &serializer) const {

 unique_ptr<BlockingSample> ReservoirSample::Deserialize(Deserializer &deserializer) {
 	auto sample_count = deserializer.ReadPropertyWithDefault<idx_t>(200, "sample_count");
-	auto result = duckdb::unique_ptr<ReservoirSample>(new ReservoirSample(sample_count));
-	deserializer.ReadPropertyWithDefault<unique_ptr<ReservoirChunk>>(201, "reservoir_chunk", result->reservoir_chunk);
+	auto reservoir_chunk = deserializer.ReadPropertyWithDefault<unique_ptr<ReservoirChunk>>(201, "reservoir_chunk");
+	auto result = duckdb::unique_ptr<ReservoirSample>(new ReservoirSample(sample_count, std::move(reservoir_chunk)));
 	return std::move(result);
 }
diff --git a/src/duckdb/src/storage/table/row_group_collection.cpp b/src/duckdb/src/storage/table/row_group_collection.cpp
index f2ff23fe..90a7236f 100644
--- a/src/duckdb/src/storage/table/row_group_collection.cpp
+++ b/src/duckdb/src/storage/table/row_group_collection.cpp
@@ -1,5 +1,4 @@
 #include "duckdb/storage/table/row_group_collection.hpp"
-
 #include "duckdb/common/serializer/binary_deserializer.hpp"
 #include "duckdb/execution/expression_executor.hpp"
 #include "duckdb/execution/index/bound_index.hpp"
@@ -397,11 +396,20 @@ bool RowGroupCollection::Append(DataChunk &chunk, TableAppendState &state) {
 		}
 	}
 	state.current_row += row_t(total_append_count);
+
 	auto local_stats_lock = state.stats.GetLock();
+
 	for (idx_t col_idx = 0; col_idx < types.size(); col_idx++) {
 		auto &column_stats = state.stats.GetStats(*local_stats_lock, col_idx);
 		column_stats.UpdateDistinctStatistics(chunk.data[col_idx], chunk.size(), state.hashes);
 	}
+
+	auto &table_sample = state.stats.GetTableSampleRef(*local_stats_lock);
+	if (!table_sample.destroyed) {
+		D_ASSERT(table_sample.type == SampleType::RESERVOIR_SAMPLE);
+		table_sample.AddToReservoir(chunk);
+	}
+
 	return new_row_group;
 }
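AddToReservoir above feeds every appended chunk into a bounded reservoir sample. For reference, the classic Algorithm R that this kind of sample is typically built on (toy, std-only; DuckDB's reservoir is weighted and chunk-based):

    #include <cstddef>
    #include <cstdint>
    #include <random>
    #include <vector>

    // Toy Algorithm R: keeps a uniform sample of `capacity` values from a stream.
    struct ToyReservoir {
        std::vector<int> sample;
        std::size_t capacity;
        uint64_t seen = 0;
        std::mt19937_64 rng{42};

        explicit ToyReservoir(std::size_t capacity_p) : capacity(capacity_p) {
        }

        void Add(int value) {
            seen++;
            if (sample.size() < capacity) {
                sample.push_back(value);
                return;
            }
            // keep the new value with probability capacity / seen
            std::uniform_int_distribution<uint64_t> dist(0, seen - 1);
            auto slot = dist(rng);
            if (slot < capacity) {
                sample[slot] = value;
            }
        }
    };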
@@ -421,8 +429,8 @@ void RowGroupCollection::FinalizeAppend(TransactionData transaction, TableAppend
 	state.total_append_count = 0;
 	state.start_row_group = nullptr;

-	auto global_stats_lock = stats.GetLock();
 	auto local_stats_lock = state.stats.GetLock();
+	auto global_stats_lock = stats.GetLock();
 	for (idx_t col_idx = 0; col_idx < types.size(); col_idx++) {
 		auto &global_stats = stats.GetStats(*global_stats_lock, col_idx);
 		if (!global_stats.HasDistinctStats()) {
@@ -435,6 +443,22 @@ void RowGroupCollection::FinalizeAppend(TransactionData transaction, TableAppend
 		global_stats.DistinctStats().Merge(local_stats.DistinctStats());
 	}

+	auto local_sample = state.stats.GetTableSample(*local_stats_lock);
+	auto global_sample = stats.GetTableSample(*global_stats_lock);
+
+	if (local_sample && global_sample) {
+		D_ASSERT(global_sample->type == SampleType::RESERVOIR_SAMPLE);
+		auto &reservoir_sample = global_sample->Cast<ReservoirSample>();
+		reservoir_sample.Merge(std::move(local_sample));
+		// initialize the thread-local sample again
+		auto new_local_sample = make_uniq<ReservoirSample>(reservoir_sample.GetSampleCount());
+		state.stats.SetTableSample(*local_stats_lock, std::move(new_local_sample));
+		stats.SetTableSample(*global_stats_lock, std::move(global_sample));
+	} else {
+		state.stats.SetTableSample(*local_stats_lock, std::move(local_sample));
+		stats.SetTableSample(*global_stats_lock, std::move(global_sample));
+	}
+
 	Verify();
 }
@@ -582,6 +606,11 @@ idx_t RowGroupCollection::Delete(TransactionData transaction, DataTable &table,
 		}
 		delete_count += row_group->Delete(transaction, table, ids + start, pos - start);
 	} while (pos < count);

+	// When deleting, destroy the sample.
+	auto stats_guard = stats.GetLock();
+	stats.DestroyTableSample(*stats_guard);
+
 	return delete_count;
 }
@@ -619,6 +648,9 @@ void RowGroupCollection::Update(TransactionData transaction, row_t *ids, const v
 			stats.MergeStats(*l, column_id.index, *row_group->GetStatistics(column_id.index));
 		}
 	} while (pos < updates.size());
+
+	// On update, destroy the sample.
+	auto stats_guard = stats.GetLock();
+	stats.DestroyTableSample(*stats_guard);
 }

 void RowGroupCollection::RemoveFromIndexes(TableIndexList &indexes, Vector &row_identifiers, idx_t count) {
@@ -1102,6 +1134,9 @@ shared_ptr<RowGroupCollection> RowGroupCollection::AddColumn(ClientContext &cont
 		result->row_groups->AppendSegment(std::move(new_row_group));
 	}

+	// When adding a column, destroy the sample.
+	stats.DestroyTableSample(*lock);
+
 	return result;
 }

@@ -1114,6 +1149,9 @@ shared_ptr<RowGroupCollection> RowGroupCollection::RemoveColumn(idx_t col_idx) {
 	                                                total_rows.load(), row_group_size);
 	result->stats.InitializeRemoveColumn(stats, col_idx);

+	auto result_lock = result->stats.GetLock();
+	result->stats.DestroyTableSample(*result_lock);
+
 	for (auto &current_row_group : row_groups->Segments()) {
 		auto new_row_group = current_row_group.RemoveColumn(*result, col_idx);
 		result->row_groups->AppendSegment(std::move(new_row_group));
@@ -1160,7 +1198,6 @@ shared_ptr<RowGroupCollection> RowGroupCollection::AlterType(ClientContext &cont
 		new_row_group->MergeIntoStatistics(changed_idx, changed_stats.Statistics());
 		result->row_groups->AppendSegment(std::move(new_row_group));
 	}
-
 	return result;
 }

@@ -1207,7 +1244,7 @@ void RowGroupCollection::VerifyNewConstraint(DataTable &parent, const BoundConst
 //===--------------------------------------------------------------------===//
 // Statistics
 //===--------------------------------------------------------------------===//
 void RowGroupCollection::CopyStats(TableStatistics &other_stats) {
 	stats.CopyStats(other_stats);
 }
@@ -1216,6 +1253,18 @@ unique_ptr<BaseStatistics> RowGroupCollection::CopyStats(column_t column_id) {
 	return stats.CopyStats(column_id);
 }

+unique_ptr<BlockingSample> RowGroupCollection::GetSample() {
+	auto lock = stats.GetLock();
+	auto &sample = stats.GetTableSampleRef(*lock);
+	if (!sample.destroyed) {
+		D_ASSERT(sample.type == SampleType::RESERVOIR_SAMPLE);
+		auto ret = sample.Copy();
+		ret->Cast<ReservoirSample>().EvictOverBudgetSamples();
+		return ret;
+	}
+	return nullptr;
+}
+
 void RowGroupCollection::SetDistinct(column_t column_id, unique_ptr<DistinctStatistics> distinct_stats) {
 	D_ASSERT(column_id != COLUMN_IDENTIFIER_ROW_ID);
 	auto stats_lock = stats.GetLock();
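Delete, Update, AddColumn, and RemoveColumn all destroy the sample instead of patching it: a reservoir cannot remove rows it may or may not have sampled. The invariant as a toy model:

    // Toy model of the destroy-on-mutation rule: the sample stays valid only for
    // append-only workloads; any delete/update/schema change invalidates it.
    struct ToySample {
        bool destroyed = false;

        void AddRow() {
            // appends keep the sample alive (AddToReservoir in the real code)
        }
        void Destroy() {
            destroyed = true;
        }
    };

    struct ToyTable {
        ToySample sample;

        void Append() {
            if (!sample.destroyed) {
                sample.AddRow();
            }
        }
        void Delete() {
            sample.Destroy(); // cannot "unsample" unknown rows from a reservoir
        }
    };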
"duckdb/storage/table/persistent_table_data.hpp" -#include "duckdb/common/serializer/serializer.hpp" + #include "duckdb/common/serializer/deserializer.hpp" +#include "duckdb/common/serializer/serializer.hpp" #include "duckdb/execution/reservoir_sample.hpp" +#include "duckdb/storage/table/persistent_table_data.hpp" namespace duckdb { void TableStatistics::Initialize(const vector &types, PersistentTableData &data) { D_ASSERT(Empty()); + D_ASSERT(!table_sample); stats_lock = make_shared_ptr(); column_stats = std::move(data.table_stats.column_stats); + if (data.table_stats.table_sample) { + table_sample = std::move(data.table_stats.table_sample); + } else { + table_sample = make_uniq(static_cast(FIXED_SAMPLE_SIZE)); + } if (column_stats.size() != types.size()) { // LCOV_EXCL_START throw IOException("Table statistics column count is not aligned with table column count. Corrupt file?"); } // LCOV_EXCL_STOP @@ -18,8 +25,10 @@ void TableStatistics::Initialize(const vector &types, PersistentTab void TableStatistics::InitializeEmpty(const vector &types) { D_ASSERT(Empty()); + D_ASSERT(!table_sample); stats_lock = make_shared_ptr(); + table_sample = make_uniq(static_cast(FIXED_SAMPLE_SIZE)); for (auto &type : types) { column_stats.push_back(ColumnStatistics::CreateEmptyStats(type)); } @@ -35,6 +44,12 @@ void TableStatistics::InitializeAddColumn(TableStatistics &parent, const Logical column_stats.push_back(parent.column_stats[i]); } column_stats.push_back(ColumnStatistics::CreateEmptyStats(new_column_type)); + if (parent.table_sample) { + table_sample = std::move(parent.table_sample); + } + if (table_sample) { + table_sample->Destroy(); + } } void TableStatistics::InitializeRemoveColumn(TableStatistics &parent, idx_t removed_column) { @@ -48,6 +63,12 @@ void TableStatistics::InitializeRemoveColumn(TableStatistics &parent, idx_t remo column_stats.push_back(parent.column_stats[i]); } } + if (parent.table_sample) { + table_sample = std::move(parent.table_sample); + } + if (table_sample) { + table_sample->Destroy(); + } } void TableStatistics::InitializeAlterType(TableStatistics &parent, idx_t changed_idx, const LogicalType &new_type) { @@ -63,6 +84,12 @@ void TableStatistics::InitializeAlterType(TableStatistics &parent, idx_t changed column_stats.push_back(parent.column_stats[i]); } } + if (parent.table_sample) { + table_sample = std::move(parent.table_sample); + } + if (table_sample) { + table_sample->Destroy(); + } } void TableStatistics::InitializeAddConstraint(TableStatistics &parent) { @@ -79,6 +106,21 @@ void TableStatistics::InitializeAddConstraint(TableStatistics &parent) { void TableStatistics::MergeStats(TableStatistics &other) { auto l = GetLock(); D_ASSERT(column_stats.size() == other.column_stats.size()); + if (table_sample) { + if (other.table_sample) { + D_ASSERT(table_sample->type == SampleType::RESERVOIR_SAMPLE); + auto &this_reservoir = table_sample->Cast(); + D_ASSERT(other.table_sample->type == SampleType::RESERVOIR_SAMPLE); + this_reservoir.Merge(std::move(other.table_sample)); + } + // if no other.table sample, do nothig + } else { + if (other.table_sample) { + auto &other_reservoir = other.table_sample->Cast(); + auto other_table_sample_copy = other_reservoir.Copy(); + table_sample = std::move(other_table_sample_copy); + } + } for (idx_t i = 0; i < column_stats.size(); i++) { if (column_stats[i]) { D_ASSERT(other.column_stats[i]); @@ -100,6 +142,25 @@ ColumnStatistics &TableStatistics::GetStats(TableStatisticsLock &lock, idx_t i) return *column_stats[i]; } +BlockingSample 
@@ -100,6 +142,25 @@ ColumnStatistics &TableStatistics::GetStats(TableStatisticsLock &lock, idx_t i)
 	return *column_stats[i];
 }

+BlockingSample &TableStatistics::GetTableSampleRef(TableStatisticsLock &lock) {
+	D_ASSERT(table_sample);
+	return *table_sample;
+}
+
+unique_ptr<BlockingSample> TableStatistics::GetTableSample(TableStatisticsLock &lock) {
+	return std::move(table_sample);
+}
+
+void TableStatistics::SetTableSample(TableStatisticsLock &lock, unique_ptr<BlockingSample> sample) {
+	table_sample = std::move(sample);
+}
+
+void TableStatistics::DestroyTableSample(TableStatisticsLock &lock) const {
+	if (table_sample) {
+		table_sample->Destroy();
+	}
+}
+
 unique_ptr<BaseStatistics> TableStatistics::CopyStats(idx_t i) {
 	lock_guard<mutex> l(*stats_lock);
 	auto result = column_stats[i]->Statistics().Copy();
@@ -120,11 +181,25 @@ void TableStatistics::CopyStats(TableStatisticsLock &lock, TableStatistics &othe
 	for (auto &stats : column_stats) {
 		other.column_stats.push_back(stats->Copy());
 	}
+
+	if (table_sample) {
+		D_ASSERT(table_sample->type == SampleType::RESERVOIR_SAMPLE);
+		auto &res = table_sample->Cast<ReservoirSample>();
+		other.table_sample = res.Copy();
+	}
 }

 void TableStatistics::Serialize(Serializer &serializer) const {
 	serializer.WriteProperty(100, "column_stats", column_stats);
-	serializer.WritePropertyWithDefault<unique_ptr<BlockingSample>>(101, "table_sample", table_sample, nullptr);
+	unique_ptr<BlockingSample> to_serialize = nullptr;
+	if (table_sample) {
+		D_ASSERT(table_sample->type == SampleType::RESERVOIR_SAMPLE);
+		auto &reservoir_sample = table_sample->Cast<ReservoirSample>();
+		to_serialize = unique_ptr_cast<BlockingSample>(reservoir_sample.Copy());
+		auto &res_serialize = to_serialize->Cast<ReservoirSample>();
+		res_serialize.EvictOverBudgetSamples();
+	}
+	serializer.WritePropertyWithDefault<unique_ptr<BlockingSample>>(101, "table_sample", to_serialize, nullptr);
 }

 void TableStatistics::Deserialize(Deserializer &deserializer, ColumnList &columns) {
@@ -142,8 +217,19 @@ void TableStatistics::Deserialize(Deserializer &deserializer, ColumnList &column
 		deserializer.Unset<LogicalType>();
 	});

-	table_sample =
-	    deserializer.ReadPropertyWithExplicitDefault<unique_ptr<BlockingSample>>(101, "table_sample", nullptr);
+	table_sample = deserializer.ReadPropertyWithDefault<unique_ptr<BlockingSample>>(101, "table_sample");
+	if (table_sample) {
+		D_ASSERT(table_sample->type == SampleType::RESERVOIR_SAMPLE);
+#ifdef DEBUG
+		auto &reservoir_sample = table_sample->Cast<ReservoirSample>();
+		reservoir_sample.Verify();
+#endif
+	} else {
+		table_sample = make_uniq<ReservoirSample>(static_cast<idx_t>(FIXED_SAMPLE_SIZE));
+		table_sample->Destroy();
+	}
 }

 unique_ptr<TableStatisticsLock> TableStatistics::GetLock() {
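Serialize above evicts over-budget entries from a copy of the sample, so the on-disk form stays within budget while the live reservoir keeps its slack. Toy version:

    #include <cstddef>
    #include <vector>

    // Toy model: trim a copy down to the serialization budget; the live sample
    // (which may hold extra, over-budget entries) is left untouched.
    std::vector<int> TrimmedCopyForSerialization(const std::vector<int> &live_sample, std::size_t budget) {
        std::vector<int> copy = live_sample;
        if (copy.size() > budget) {
            copy.resize(budget); // the real code evicts by weight, not position
        }
        return copy;
    }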
diff --git a/src/duckdb/src/storage/table_index_list.cpp b/src/duckdb/src/storage/table_index_list.cpp
index 4e7ecdd3..010162e8 100644
--- a/src/duckdb/src/storage/table_index_list.cpp
+++ b/src/duckdb/src/storage/table_index_list.cpp
@@ -1,16 +1,17 @@
 #include "duckdb/storage/table/table_index_list.hpp"

+#include "duckdb/catalog/catalog_entry/duck_table_entry.hpp"
 #include "duckdb/common/types/conflict_manager.hpp"
 #include "duckdb/execution/index/index_type_set.hpp"
 #include "duckdb/execution/index/unbound_index.hpp"
 #include "duckdb/main/config.hpp"
 #include "duckdb/main/database.hpp"
+#include "duckdb/planner/expression_binder/index_binder.hpp"
 #include "duckdb/storage/data_table.hpp"
 #include "duckdb/storage/table/data_table_info.hpp"
-#include "duckdb/catalog/catalog_entry/duck_table_entry.hpp"
-#include "duckdb/planner/expression_binder/index_binder.hpp"

 namespace duckdb {
+
 void TableIndexList::AddIndex(unique_ptr<Index> index) {
 	D_ASSERT(index);
 	lock_guard<mutex> lock(indexes_lock);
@@ -20,10 +21,10 @@ void TableIndexList::AddIndex(unique_ptr<Index> index) {

 void TableIndexList::RemoveIndex(const string &name) {
 	lock_guard<mutex> lock(indexes_lock);

-	for (idx_t index_idx = 0; index_idx < indexes.size(); index_idx++) {
-		auto &index_entry = indexes[index_idx];
-		if (index_entry->GetIndexName() == name) {
-			indexes.erase_at(index_idx);
+	for (idx_t i = 0; i < indexes.size(); i++) {
+		auto &index = indexes[i];
+		if (index->GetIndexName() == name) {
+			indexes.erase_at(i);
 			break;
 		}
 	}
@@ -32,10 +33,9 @@ void TableIndexList::RemoveIndex(const string &name) {

 void TableIndexList::CommitDrop(const string &name) {
 	lock_guard<mutex> lock(indexes_lock);

-	for (idx_t index_idx = 0; index_idx < indexes.size(); index_idx++) {
-		auto &index_entry = indexes[index_idx];
-		if (index_entry->GetIndexName() == name) {
-			index_entry->CommitDrop();
+	for (auto &index : indexes) {
+		if (index->GetIndexName() == name) {
+			index->CommitDrop();
 		}
 	}
 }
@@ -43,19 +43,26 @@ void TableIndexList::CommitDrop(const string &name) {

 bool TableIndexList::NameIsUnique(const string &name) {
 	lock_guard<mutex> lock(indexes_lock);

-	// only cover PK, FK, and UNIQUE, which are not (yet) catalog entries
-	for (idx_t index_idx = 0; index_idx < indexes.size(); index_idx++) {
-		auto &index_entry = indexes[index_idx];
-		if (index_entry->IsPrimary() || index_entry->IsForeign() || index_entry->IsUnique()) {
-			if (index_entry->GetIndexName() == name) {
+	// Only covers PK, FK, and UNIQUE indexes.
+	for (const auto &index : indexes) {
+		if (index->IsPrimary() || index->IsForeign() || index->IsUnique()) {
+			if (index->GetIndexName() == name) {
 				return false;
 			}
 		}
 	}
-
 	return true;
 }

+optional_ptr<BoundIndex> TableIndexList::Find(const string &name) {
+	for (auto &index : indexes) {
+		if (index->GetIndexName() == name) {
+			return index->Cast<BoundIndex>();
+		}
+	}
+	return nullptr;
+}
+
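Find is a plain linear scan by name; tables rarely hold enough indexes to justify a map. A toy equivalent:

    #include <string>
    #include <vector>

    struct ToyIndex {
        std::string name;
    };

    // Toy TableIndexList::Find: first name match wins, nullptr when absent.
    ToyIndex *FindToyIndex(std::vector<ToyIndex> &indexes, const std::string &name) {
        for (auto &index : indexes) {
            if (index.name == name) {
                return &index;
            }
        }
        return nullptr;
    }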
 void TableIndexList::InitializeIndexes(ClientContext &context, DataTableInfo &table_info, const char *index_type) {
 	// Fast path: do we have any unbound indexes?
 	bool needs_binding = false;
@@ -72,11 +79,13 @@ void TableIndexList::InitializeIndexes(ClientContext &context, DataTableInfo &ta
 		return;
 	}

-	// Get the table from the catalog so we can add it to the binder
+	// Get the table from the catalog, so we can add it to the binder.
 	auto &catalog = table_info.GetDB().GetCatalog();
-	auto &table =
-	    catalog.GetEntry(context, CatalogType::TABLE_ENTRY, table_info.GetSchemaName(), table_info.GetTableName())
-	        .Cast<DuckTableEntry>();
+	auto schema = table_info.GetSchemaName();
+	auto table_name = table_info.GetTableName();
+	auto &table_entry = catalog.GetEntry(context, CatalogType::TABLE_ENTRY, schema, table_name);
+	auto &table = table_entry.Cast<DuckTableEntry>();
+
 	vector<LogicalType> column_types;
 	vector<string> column_names;
 	for (auto &col : table.GetColumns().Logical()) {
@@ -87,18 +96,17 @@ void TableIndexList::InitializeIndexes(ClientContext &context, DataTableInfo &ta
 	lock_guard<mutex> lock(indexes_lock);
 	for (auto &index : indexes) {
 		if (!index->IsBound() && (index_type == nullptr || index->GetIndexType() == index_type)) {
-			// Create a binder to bind this index (we cant reuse this binder for other indexes)
+			// Create a binder to bind this index.
 			auto binder = Binder::CreateBinder(context);

-			// Add the table to the binder
-			// We're not interested in the column_ids here, so just pass a dummy vector
+			// Add the table to the binder.
 			vector<ColumnIndex> dummy_column_ids;
 			binder->bind_context.AddBaseTable(0, string(), column_names, column_types, dummy_column_ids, table);

 			// Create an IndexBinder to bind the index
 			IndexBinder idx_binder(*binder, context);

-			// Replace the unbound index with a bound index
+			// Replace the unbound index with a bound index.
 			auto bound_idx = idx_binder.BindIndex(index->Cast<UnboundIndex>());
 			index = std::move(bound_idx);
 		}
@@ -120,15 +128,13 @@ void TableIndexList::Move(TableIndexList &other) {
 	indexes = std::move(other.indexes);
 }

-Index *TableIndexList::FindForeignKeyIndex(const vector<PhysicalIndex> &fk_keys, ForeignKeyType fk_type) {
-	Index *result = nullptr;
-	Scan([&](Index &index) {
-		if (DataTable::IsForeignKeyIndex(fk_keys, index, fk_type)) {
-			result = &index;
+Index *TableIndexList::FindForeignKeyIndex(const vector<PhysicalIndex> &fk_keys, const ForeignKeyType fk_type) {
+	for (auto &index : indexes) {
+		if (DataTable::IsForeignKeyIndex(fk_keys, *index, fk_type)) {
+			return &(*index);
 		}
-		return false;
-	});
-	return result;
+	}
+	return nullptr;
 }

 void TableIndexList::VerifyForeignKey(const vector<PhysicalIndex> &fk_keys, DataChunk &chunk,
@@ -137,7 +143,7 @@ void TableIndexList::VerifyForeignKey(const vector<PhysicalIndex> &fk_keys, Data
 	                      ? ForeignKeyType::FK_TYPE_PRIMARY_KEY_TABLE
 	                      : ForeignKeyType::FK_TYPE_FOREIGN_KEY_TABLE;

-	// check whether the chunk can be inserted or deleted into the referenced table storage
+	// Check whether the chunk can be inserted or deleted into the referenced table storage.
 	auto index = FindForeignKeyIndex(fk_keys, fk_type);
 	if (!index) {
 		throw InternalException("Internal Foreign Key error: could not find index to verify...");
@@ -145,42 +151,36 @@ void TableIndexList::VerifyForeignKey(const vector<PhysicalIndex> &fk_keys, Data
 	if (!index->IsBound()) {
 		throw InternalException("Internal Foreign Key error: trying to verify an unbound index...");
 	}
-	index->Cast<ART>().CheckConstraintsForChunk(chunk, conflict_manager);
+	index->Cast<ART>().VerifyConstraint(chunk, nullptr, conflict_manager);
 }

-vector<column_t> TableIndexList::GetRequiredColumns() {
+unordered_set<column_t> TableIndexList::GetRequiredColumns() {
 	lock_guard<mutex> lock(indexes_lock);
-	set<column_t> unique_indexes;
+	unordered_set<column_t> column_ids;
 	for (auto &index : indexes) {
-		for (auto col_index : index->GetColumnIds()) {
-			unique_indexes.insert(col_index);
+		for (auto col_id : index->GetColumnIds()) {
+			column_ids.insert(col_id);
 		}
 	}
-	vector<column_t> result;
-	result.reserve(unique_indexes.size());
-	for (auto column_index : unique_indexes) {
-		result.emplace_back(column_index);
-	}
-	return result;
+	return column_ids;
 }

 vector<IndexStorageInfo> TableIndexList::GetStorageInfos(const case_insensitive_map_t<Value> &options) {
-
-	vector<IndexStorageInfo> index_storage_infos;
+	vector<IndexStorageInfo> infos;
 	for (auto &index : indexes) {
 		if (index->IsBound()) {
-			auto index_storage_info = index->Cast<BoundIndex>().GetStorageInfo(options, false);
-			D_ASSERT(index_storage_info.IsValid() && !index_storage_info.name.empty());
-			index_storage_infos.push_back(index_storage_info);
+			auto info = index->Cast<BoundIndex>().GetStorageInfo(options, false);
+			D_ASSERT(info.IsValid() && !info.name.empty());
+			infos.push_back(info);
 			continue;
 		}
-		auto index_storage_info = index->Cast<UnboundIndex>().GetStorageInfo();
-		D_ASSERT(index_storage_info.IsValid() && !index_storage_info.name.empty());
-		index_storage_infos.push_back(index_storage_info);
+		auto info = index->Cast<UnboundIndex>().GetStorageInfo();
+		D_ASSERT(info.IsValid() && !info.name.empty());
+		infos.push_back(info);
 	}
-	return index_storage_infos;
+	return infos;
 }

 } // namespace duckdb
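GetRequiredColumns now returns the distinct column ids as a set instead of materializing a sorted vector, since callers only test membership. Toy equivalent:

    #include <cstdint>
    #include <unordered_set>
    #include <vector>

    // Toy GetRequiredColumns: the distinct column ids referenced by any index.
    std::unordered_set<uint64_t> ToyRequiredColumns(const std::vector<std::vector<uint64_t>> &index_columns) {
        std::unordered_set<uint64_t> column_ids;
        for (const auto &columns : index_columns) {
            column_ids.insert(columns.begin(), columns.end());
        }
        return column_ids;
    }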
diff --git a/src/duckdb/src/storage/wal_replay.cpp b/src/duckdb/src/storage/wal_replay.cpp
index bac2cf04..eefe70ef 100644
--- a/src/duckdb/src/storage/wal_replay.cpp
+++ b/src/duckdb/src/storage/wal_replay.cpp
@@ -735,10 +735,10 @@ void WriteAheadLogDeserializer::ReplayInsert() {
 		throw InternalException("Corrupt WAL: insert without table");
 	}

-	// append to the current table
-	// we don't do any constraint verification here
+	// Append to the current table without constraint verification.
 	vector<unique_ptr<BoundConstraint>> bound_constraints;
-	state.current_table->GetStorage().LocalAppend(*state.current_table, context, chunk, bound_constraints);
+	auto &storage = state.current_table->GetStorage();
+	storage.LocalAppend(*state.current_table, context, chunk, bound_constraints);
 }

 static void MarkBlocksAsUsed(BlockManager &manager, const PersistentColumnData &col_data) {
diff --git a/src/duckdb/ub_src_execution.cpp b/src/duckdb/ub_src_execution.cpp
index 85125d0d..1acd5651 100644
--- a/src/duckdb/ub_src_execution.cpp
+++ b/src/duckdb/ub_src_execution.cpp
@@ -20,5 +20,3 @@
 #include "src/execution/radix_partitioned_hashtable.cpp"

-#include "src/execution/reservoir_sample.cpp"
-
diff --git a/src/duckdb/ub_src_execution_sample.cpp b/src/duckdb/ub_src_execution_sample.cpp
new file mode 100644
index 00000000..32d9b85c
--- /dev/null
+++ b/src/duckdb/ub_src_execution_sample.cpp
@@ -0,0 +1,4 @@
+#include "src/execution/sample/base_reservoir_sample.cpp"
+
+#include "src/execution/sample/reservoir_sample.cpp"
+
diff --git a/src/duckdb/ub_src_function_table_system.cpp b/src/duckdb/ub_src_function_table_system.cpp
index a534b2bf..4ab6fdaf 100644
--- a/src/duckdb/ub_src_function_table_system.cpp
+++ b/src/duckdb/ub_src_function_table_system.cpp
@@ -48,6 +48,8 @@
 #include "src/function/table/system/pragma_table_info.cpp"

+#include "src/function/table/system/pragma_table_sample.cpp"
+
 #include "src/function/table/system/pragma_user_agent.cpp"

 #include "src/function/table/system/test_all_types.cpp"
diff --git a/src/duckdb/ub_src_storage_compression_dictionary.cpp b/src/duckdb/ub_src_storage_compression_dictionary.cpp
new file mode 100644
index 00000000..abfb3b32
--- /dev/null
+++ b/src/duckdb/ub_src_storage_compression_dictionary.cpp
@@ -0,0 +1,8 @@
+#include "src/storage/compression/dictionary/common.cpp"
+
+#include "src/storage/compression/dictionary/analyze.cpp"
+
+#include "src/storage/compression/dictionary/compression.cpp"
+
+#include "src/storage/compression/dictionary/decompression.cpp"
+