diff --git a/.github/workflows/MainDistributionPipeline.yml b/.github/workflows/MainDistributionPipeline.yml new file mode 100644 index 0000000..3019f80 --- /dev/null +++ b/.github/workflows/MainDistributionPipeline.yml @@ -0,0 +1,31 @@ +# +# This workflow calls the main distribution pipeline from DuckDB to build, test and (optionally) release the extension +# +name: Main Extension Distribution Pipeline +on: + push: + pull_request: + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || '' }}-${{ github.base_ref || '' }}-${{ github.ref != 'refs/heads/main' || github.sha }} + cancel-in-progress: true + +jobs: + duckdb-stable-build: + name: Build extension binaries + uses: duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@main + with: + duckdb_version: v1.1.2 + ci_tools_version: main + extension_name: httpfs + + duckdb-stable-deploy: + name: Deploy extension binaries + needs: duckdb-stable-build + uses: duckdb/extension-ci-tools/.github/workflows/_extension_deploy.yml@main + secrets: inherit + with: + duckdb_version: v1.1.2 + extension_name: httpfs + deploy_latest: ${{ startsWith(github.ref, 'refs/tags/v') || github.ref == 'refs/heads/main' }} diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..4558991 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,6 @@ +[submodule "duckdb"] + path = duckdb + url = https://github.com/duckdb/duckdb.git +[submodule "extension-ci-tools"] + path = extension-ci-tools + url = https://github.com/duckdb/extension-ci-tools.git diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..6f40044 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,55 @@ +cmake_minimum_required(VERSION 2.8.12...3.29) + +set(FTS_BASE_FOLDER "extension/fts") + +project(FTSExtension) + +add_extension_definitions() + +include_directories(include ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/libstemmer) +set(FTS_SOURCES + fts_extension.cpp + fts_indexing.cpp 
+ ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/libstemmer/libstemmer.cpp + ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/runtime/utilities.cpp + ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/runtime/api.cpp + ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/src_c/stem_UTF_8_arabic.cpp + ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/src_c/stem_UTF_8_basque.cpp + ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/src_c/stem_UTF_8_catalan.cpp + ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/src_c/stem_UTF_8_danish.cpp + ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/src_c/stem_UTF_8_dutch.cpp + ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/src_c/stem_UTF_8_english.cpp + ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/src_c/stem_UTF_8_finnish.cpp + ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/src_c/stem_UTF_8_french.cpp + ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/src_c/stem_UTF_8_german.cpp + ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/src_c/stem_UTF_8_german2.cpp + ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/src_c/stem_UTF_8_greek.cpp + ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/src_c/stem_UTF_8_hindi.cpp + ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/src_c/stem_UTF_8_hungarian.cpp + ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/src_c/stem_UTF_8_indonesian.cpp + ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/src_c/stem_UTF_8_irish.cpp + ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/src_c/stem_UTF_8_italian.cpp + ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/src_c/stem_UTF_8_kraaij_pohlmann.cpp + ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/src_c/stem_UTF_8_lithuanian.cpp + ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/src_c/stem_UTF_8_lovins.cpp + ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/src_c/stem_UTF_8_nepali.cpp + ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/src_c/stem_UTF_8_norwegian.cpp + ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/src_c/stem_UTF_8_porter.cpp + 
${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/src_c/stem_UTF_8_portuguese.cpp + ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/src_c/stem_UTF_8_romanian.cpp + ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/src_c/stem_UTF_8_russian.cpp + ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/src_c/stem_UTF_8_serbian.cpp + ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/src_c/stem_UTF_8_spanish.cpp + ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/src_c/stem_UTF_8_swedish.cpp + ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/src_c/stem_UTF_8_tamil.cpp + ${DUCKDB_MODULE_BASE_DIR}/third_party/snowball/src_c/stem_UTF_8_turkish.cpp) + +build_static_extension(fts ${FTS_SOURCES}) +set(PARAMETERS "-warnings") +build_loadable_extension(fts ${PARAMETERS} ${FTS_SOURCES}) + +install( + TARGETS fts_extension + EXPORT "${DUCKDB_EXPORT_SET}" + LIBRARY DESTINATION "${INSTALL_LIB_DIR}" + ARCHIVE DESTINATION "${INSTALL_LIB_DIR}") diff --git a/extension/fts/fts_config.py b/extension/fts/fts_config.py new file mode 100644 index 0000000..1f8f0eb --- /dev/null +++ b/extension/fts/fts_config.py @@ -0,0 +1,55 @@ +import os + +# list all include directories +include_directories = [ + os.path.sep.join(x.split('/')) + for x in [ + 'extension/fts/include', + 'third_party/snowball/libstemmer', + 'third_party/snowball/runtime', + 'third_party/snowball/src_c', + ] +] +# source files +source_files = [ + os.path.sep.join(x.split('/')) for x in ['extension/fts/fts_extension.cpp', 'extension/fts/fts_indexing.cpp'] +] +# snowball +source_files += [ + os.path.sep.join(x.split('/')) + for x in [ + 'third_party/snowball/libstemmer/libstemmer.cpp', + 'third_party/snowball/runtime/utilities.cpp', + 'third_party/snowball/runtime/api.cpp', + 'third_party/snowball/src_c/stem_UTF_8_arabic.cpp', + 'third_party/snowball/src_c/stem_UTF_8_basque.cpp', + 'third_party/snowball/src_c/stem_UTF_8_catalan.cpp', + 'third_party/snowball/src_c/stem_UTF_8_danish.cpp', + 'third_party/snowball/src_c/stem_UTF_8_dutch.cpp', + 
'third_party/snowball/src_c/stem_UTF_8_english.cpp', + 'third_party/snowball/src_c/stem_UTF_8_finnish.cpp', + 'third_party/snowball/src_c/stem_UTF_8_french.cpp', + 'third_party/snowball/src_c/stem_UTF_8_german.cpp', + 'third_party/snowball/src_c/stem_UTF_8_german2.cpp', + 'third_party/snowball/src_c/stem_UTF_8_greek.cpp', + 'third_party/snowball/src_c/stem_UTF_8_hindi.cpp', + 'third_party/snowball/src_c/stem_UTF_8_hungarian.cpp', + 'third_party/snowball/src_c/stem_UTF_8_indonesian.cpp', + 'third_party/snowball/src_c/stem_UTF_8_irish.cpp', + 'third_party/snowball/src_c/stem_UTF_8_italian.cpp', + 'third_party/snowball/src_c/stem_UTF_8_kraaij_pohlmann.cpp', + 'third_party/snowball/src_c/stem_UTF_8_lithuanian.cpp', + 'third_party/snowball/src_c/stem_UTF_8_lovins.cpp', + 'third_party/snowball/src_c/stem_UTF_8_nepali.cpp', + 'third_party/snowball/src_c/stem_UTF_8_norwegian.cpp', + 'third_party/snowball/src_c/stem_UTF_8_porter.cpp', + 'third_party/snowball/src_c/stem_UTF_8_portuguese.cpp', + 'third_party/snowball/src_c/stem_UTF_8_romanian.cpp', + 'third_party/snowball/src_c/stem_UTF_8_russian.cpp', + 'third_party/snowball/src_c/stem_UTF_8_serbian.cpp', + 'third_party/snowball/src_c/stem_UTF_8_spanish.cpp', + 'third_party/snowball/src_c/stem_UTF_8_swedish.cpp', + 'third_party/snowball/src_c/stem_UTF_8_tamil.cpp', + 'third_party/snowball/src_c/stem_UTF_8_turkish.cpp', + ] +] diff --git a/extension/fts/fts_extension.cpp b/extension/fts/fts_extension.cpp new file mode 100644 index 0000000..3cd6e75 --- /dev/null +++ b/extension/fts/fts_extension.cpp @@ -0,0 +1,103 @@ +#define DUCKDB_EXTENSION_MAIN +#include "fts_extension.hpp" + +#include "duckdb.hpp" +#include "duckdb/common/exception.hpp" +#include "duckdb/common/string_util.hpp" +#include "duckdb/function/pragma_function.hpp" +#include "duckdb/function/scalar_function.hpp" +#include "duckdb/main/extension_util.hpp" +#include "fts_indexing.hpp" +#include "libstemmer.h" + +namespace duckdb { + +static void 
StemFunction(DataChunk &args, ExpressionState &state, Vector &result) { + auto &input_vector = args.data[0]; + auto &stemmer_vector = args.data[1]; + + BinaryExecutor::Execute( + input_vector, stemmer_vector, result, args.size(), [&](string_t input, string_t stemmer) { + auto input_data = input.GetData(); + auto input_size = input.GetSize(); + + if (stemmer.GetString() == "none") { + auto output = StringVector::AddString(result, input_data, input_size); + return output; + } + + struct sb_stemmer *s = sb_stemmer_new(stemmer.GetString().c_str(), "UTF_8"); + if (s == 0) { + const char **stemmers = sb_stemmer_list(); + size_t n_stemmers = 27; + throw InvalidInputException( + "Unrecognized stemmer '%s'. Supported stemmers are: ['%s'], or use 'none' for no stemming", + stemmer.GetString(), + StringUtil::Join(stemmers, n_stemmers, "', '", [](const char *st) { return st; })); + } + + auto output_data = + const_char_ptr_cast(sb_stemmer_stem(s, reinterpret_cast(input_data), input_size)); + auto output_size = sb_stemmer_length(s); + auto output = StringVector::AddString(result, output_data, output_size); + + sb_stemmer_delete(s); + return output; + }); +} + +static void LoadInternal(DuckDB &db) { + auto &db_instance = *db.instance; + ScalarFunction stem_func("stem", {LogicalType::VARCHAR, LogicalType::VARCHAR}, LogicalType::VARCHAR, StemFunction); + + auto create_fts_index_func = + PragmaFunction::PragmaCall("create_fts_index", FTSIndexing::CreateFTSIndexQuery, + {LogicalType::VARCHAR, LogicalType::VARCHAR}, LogicalType::VARCHAR); + create_fts_index_func.named_parameters["stemmer"] = LogicalType::VARCHAR; + create_fts_index_func.named_parameters["stopwords"] = LogicalType::VARCHAR; + create_fts_index_func.named_parameters["ignore"] = LogicalType::VARCHAR; + create_fts_index_func.named_parameters["strip_accents"] = LogicalType::BOOLEAN; + create_fts_index_func.named_parameters["lower"] = LogicalType::BOOLEAN; + create_fts_index_func.named_parameters["overwrite"] = 
LogicalType::BOOLEAN; + + auto drop_fts_index_func = + PragmaFunction::PragmaCall("drop_fts_index", FTSIndexing::DropFTSIndexQuery, {LogicalType::VARCHAR}); + + ExtensionUtil::RegisterFunction(db_instance, stem_func); + ExtensionUtil::RegisterFunction(db_instance, create_fts_index_func); + ExtensionUtil::RegisterFunction(db_instance, drop_fts_index_func); +} + +void FtsExtension::Load(DuckDB &db) { + LoadInternal(db); +} + +std::string FtsExtension::Name() { + return "fts"; +} + +std::string FtsExtension::Version() const { +#ifdef EXT_VERSION_FTS + return EXT_VERSION_FTS; +#else + return ""; +#endif +} + +} // namespace duckdb + +extern "C" { + +DUCKDB_EXTENSION_API void fts_init(duckdb::DatabaseInstance &db) { + duckdb::DuckDB db_wrapper(db); + duckdb::LoadInternal(db_wrapper); +} + +DUCKDB_EXTENSION_API const char *fts_version() { + return duckdb::DuckDB::LibraryVersion(); +} +} + +#ifndef DUCKDB_EXTENSION_MAIN +#error DUCKDB_EXTENSION_MAIN not defined +#endif diff --git a/extension/fts/fts_indexing.cpp b/extension/fts/fts_indexing.cpp new file mode 100644 index 0000000..a74b9f5 --- /dev/null +++ b/extension/fts/fts_indexing.cpp @@ -0,0 +1,335 @@ +#include "fts_indexing.hpp" + +#include "duckdb/catalog/catalog_entry/table_catalog_entry.hpp" +#include "duckdb/catalog/catalog_search_path.hpp" +#include "duckdb/common/exception.hpp" +#include "duckdb/common/string_util.hpp" +#include "duckdb/main/client_data.hpp" +#include "duckdb/main/connection.hpp" +#include "duckdb/parser/qualified_name.hpp" + +namespace duckdb { + +static QualifiedName GetQualifiedName(ClientContext &context, const string &qname_str) { + auto qname = QualifiedName::Parse(qname_str); + if (qname.schema == INVALID_SCHEMA) { + qname.schema = ClientData::Get(context).catalog_search_path->GetDefaultSchema(qname.catalog); + } + return qname; +} + +static string GetFTSSchema(QualifiedName &qname) { + auto result = qname.catalog == INVALID_CATALOG ? 
"" : StringUtil::Format("%s.", qname.catalog); + result += StringUtil::Format("fts_%s_%s", qname.schema, qname.name); + return result; +} + +string FTSIndexing::DropFTSIndexQuery(ClientContext &context, const FunctionParameters ¶meters) { + auto qname = GetQualifiedName(context, StringValue::Get(parameters.values[0])); + string fts_schema = GetFTSSchema(qname); + + if (!Catalog::GetSchema(context, qname.catalog, fts_schema, OnEntryNotFound::RETURN_NULL)) { + throw CatalogException( + "a FTS index does not exist on table '%s.%s'. Create one with 'PRAGMA create_fts_index()'.", qname.schema, + qname.name); + } + + return StringUtil::Format("DROP SCHEMA %s CASCADE;", fts_schema); +} + +static string IndexingScript(ClientContext &context, QualifiedName &qname, const string &input_id, + const vector &input_values, const string &stemmer, const string &stopwords, + const string &ignore, bool strip_accents, bool lower) { + // clang-format off + string result = R"( + DROP SCHEMA IF EXISTS %fts_schema% CASCADE; + CREATE SCHEMA %fts_schema%; + CREATE TABLE %fts_schema%.stopwords (sw VARCHAR); + )"; + // clang-format on + + if (stopwords == "none") { + // do nothing + } else if (stopwords == "english") { + // default list of english stopwords from "The SMART system" + // clang-format off + result += R"( + INSERT INTO %fts_schema%.stopwords VALUES ('a'), ('a''s'), ('able'), ('about'), ('above'), ('according'), ('accordingly'), ('across'), ('actually'), ('after'), ('afterwards'), ('again'), ('against'), ('ain''t'), ('all'), ('allow'), ('allows'), ('almost'), ('alone'), ('along'), ('already'), ('also'), ('although'), ('always'), ('am'), ('among'), ('amongst'), ('an'), ('and'), ('another'), ('any'), ('anybody'), ('anyhow'), ('anyone'), ('anything'), ('anyway'), ('anyways'), ('anywhere'), ('apart'), ('appear'), ('appreciate'), ('appropriate'), ('are'), ('aren''t'), ('around'), ('as'), ('aside'), ('ask'), ('asking'), ('associated'), ('at'), ('available'), ('away'), ('awfully'), 
('b'), ('be'), ('became'), ('because'), ('become'), ('becomes'), ('becoming'), ('been'), ('before'), ('beforehand'), ('behind'), ('being'), ('believe'), ('below'), ('beside'), ('besides'), ('best'), ('better'), ('between'), ('beyond'), ('both'), ('brief'), ('but'), ('by'), ('c'), ('c''mon'), ('c''s'), ('came'), ('can'), ('can''t'), ('cannot'), ('cant'), ('cause'), ('causes'), ('certain'), ('certainly'), ('changes'), ('clearly'), ('co'), ('com'), ('come'), ('comes'), ('concerning'), ('consequently'), ('consider'), ('considering'), ('contain'), ('containing'), ('contains'), ('corresponding'), ('could'), ('couldn''t'), ('course'), ('currently'), ('d'), ('definitely'), ('described'), ('despite'), ('did'), ('didn''t'), ('different'), ('do'), ('does'), ('doesn''t'), ('doing'), ('don''t'), ('done'), ('down'), ('downwards'), ('during'), ('e'), ('each'), ('edu'), ('eg'), ('eight'), ('either'), ('else'), ('elsewhere'), ('enough'), ('entirely'), ('especially'), ('et'), ('etc'), ('even'), ('ever'), ('every'), ('everybody'), ('everyone'), ('everything'), ('everywhere'), ('ex'), ('exactly'), ('example'), ('except'), ('f'), ('far'), ('few'), ('fifth'), ('first'), ('five'), ('followed'), ('following'), ('follows'), ('for'), ('former'), ('formerly'), ('forth'), ('four'), ('from'), ('further'), ('furthermore'), ('g'), ('get'), ('gets'), ('getting'), ('given'), ('gives'), ('go'), ('goes'), ('going'), ('gone'), ('got'), ('gotten'), ('greetings'), ('h'), ('had'), ('hadn''t'), ('happens'), ('hardly'), ('has'), ('hasn''t'), ('have'), ('haven''t'), ('having'), ('he'), ('he''s'), ('hello'), ('help'), ('hence'), ('her'), ('here'), ('here''s'), ('hereafter'), ('hereby'), ('herein'), ('hereupon'), ('hers'), ('herself'), ('hi'), ('him'), ('himself'), ('his'), ('hither'), ('hopefully'), ('how'), ('howbeit'), ('however'), ('i'), ('i''d'), ('i''ll'), ('i''m'), ('i''ve'), ('ie'), ('if'), ('ignored'), ('immediate'), ('in'), ('inasmuch'), ('inc'), ('indeed'), ('indicate'), ('indicated'), 
('indicates'), ('inner'), ('insofar'), ('instead'), ('into'), ('inward'), ('is'), ('isn''t'), ('it'), ('it''d'), ('it''ll'), ('it''s'), ('its'), ('itself'), ('j'), ('just'), ('k'), ('keep'), ('keeps'), ('kept'), ('know'), ('knows'), ('known'), ('l'), ('last'), ('lately'), ('later'), ('latter'), ('latterly'), ('least'), ('less'), ('lest'), ('let'), ('let''s'), ('like'), ('liked'), ('likely'), ('little'), ('look'), ('looking'), ('looks'), ('ltd'), ('m'), ('mainly'), ('many'), ('may'), ('maybe'), ('me'), ('mean'), ('meanwhile'), ('merely'), ('might'), ('more'), ('moreover'), ('most'), ('mostly'), ('much'), ('must'), ('my'), ('myself'), ('n'), ('name'), ('namely'), ('nd'), ('near'), ('nearly'), ('necessary'), ('need'), ('needs'), ('neither'), ('never'), ('nevertheless'), ('new'), ('next'), ('nine'), ('no'), ('nobody'), ('non'), ('none'), ('noone'), ('nor'), ('normally'), ('not'), ('nothing'), ('novel'), ('now'), ('nowhere'), ('o'), ('obviously'), ('of'), ('off'), ('often'), ('oh'), ('ok'), ('okay'), ('old'), ('on'), ('once'), ('one'), ('ones'), ('only'), ('onto'), ('or'), ('other'), ('others'), ('otherwise'), ('ought'), ('our'), ('ours'), ('ourselves'), ('out'), ('outside'), ('over'), ('overall'), ('own'); + INSERT INTO %fts_schema%.stopwords VALUES ('p'), ('particular'), ('particularly'), ('per'), ('perhaps'), ('placed'), ('please'), ('plus'), ('possible'), ('presumably'), ('probably'), ('provides'), ('q'), ('que'), ('quite'), ('qv'), ('r'), ('rather'), ('rd'), ('re'), ('really'), ('reasonably'), ('regarding'), ('regardless'), ('regards'), ('relatively'), ('respectively'), ('right'), ('s'), ('said'), ('same'), ('saw'), ('say'), ('saying'), ('says'), ('second'), ('secondly'), ('see'), ('seeing'), ('seem'), ('seemed'), ('seeming'), ('seems'), ('seen'), ('self'), ('selves'), ('sensible'), ('sent'), ('serious'), ('seriously'), ('seven'), ('several'), ('shall'), ('she'), ('should'), ('shouldn''t'), ('since'), ('six'), ('so'), ('some'), ('somebody'), ('somehow'), 
('someone'), ('something'), ('sometime'), ('sometimes'), ('somewhat'), ('somewhere'), ('soon'), ('sorry'), ('specified'), ('specify'), ('specifying'), ('still'), ('sub'), ('such'), ('sup'), ('sure'), ('t'), ('t''s'), ('take'), ('taken'), ('tell'), ('tends'), ('th'), ('than'), ('thank'), ('thanks'), ('thanx'), ('that'), ('that''s'), ('thats'), ('the'), ('their'), ('theirs'), ('them'), ('themselves'), ('then'), ('thence'), ('there'), ('there''s'), ('thereafter'), ('thereby'), ('therefore'), ('therein'), ('theres'), ('thereupon'), ('these'), ('they'), ('they''d'), ('they''ll'), ('they''re'), ('they''ve'), ('think'), ('third'), ('this'), ('thorough'), ('thoroughly'), ('those'), ('though'), ('three'), ('through'), ('throughout'), ('thru'), ('thus'), ('to'), ('together'), ('too'), ('took'), ('toward'), ('towards'), ('tried'), ('tries'), ('truly'), ('try'), ('trying'), ('twice'), ('two'), ('u'), ('un'), ('under'), ('unfortunately'), ('unless'), ('unlikely'), ('until'), ('unto'), ('up'), ('upon'), ('us'), ('use'), ('used'), ('useful'), ('uses'), ('using'), ('usually'), ('uucp'), ('v'), ('value'), ('various'), ('very'), ('via'), ('viz'), ('vs'), ('w'), ('want'), ('wants'), ('was'), ('wasn''t'), ('way'), ('we'), ('we''d'), ('we''ll'), ('we''re'), ('we''ve'), ('welcome'), ('well'), ('went'), ('were'), ('weren''t'), ('what'), ('what''s'), ('whatever'), ('when'), ('whence'), ('whenever'), ('where'), ('where''s'), ('whereafter'), ('whereas'), ('whereby'), ('wherein'), ('whereupon'), ('wherever'), ('whether'), ('which'), ('while'), ('whither'), ('who'), ('who''s'), ('whoever'), ('whole'), ('whom'), ('whose'), ('why'), ('will'), ('willing'), ('wish'), ('with'), ('within'), ('without'), ('won''t'), ('wonder'), ('would'), ('would'), ('wouldn''t'), ('x'), ('y'), ('yes'), ('yet'), ('you'), ('you''d'), ('you''ll'), ('you''re'), ('you''ve'), ('your'), ('yours'), ('yourself'), ('yourselves'), ('z'), ('zero'); + )"; + // clang-format on + } else { + // custom stopwords + result += "INSERT 
INTO %fts_schema%.stopwords SELECT * FROM " + stopwords + ";"; + } + + // create tokenize macro based on parameters + string tokenize = "s::VARCHAR"; + vector before; + vector after; + if (strip_accents) { + tokenize = "strip_accents(" + tokenize + ")"; + } + if (lower) { + tokenize = "lower(" + tokenize + ")"; + } + tokenize = "regexp_replace(" + tokenize + ", $$" + ignore + "$$, " + "' ', 'g')"; + tokenize = "string_split_regex(" + tokenize + ", '\\s+')"; + result += "CREATE MACRO %fts_schema%.tokenize(s) AS " + tokenize + ";"; + + // parameterized definition of indexing and retrieval model + // clang-format off + result += R"( + CREATE TABLE %fts_schema%.docs AS ( + SELECT rowid AS docid, + "%input_id%" AS name + FROM %input_table% + ); + + CREATE TABLE %fts_schema%.fields (fieldid BIGINT, field VARCHAR); + INSERT INTO %fts_schema%.fields VALUES %field_values%; + + CREATE TABLE %fts_schema%.terms AS + WITH tokenized AS ( + %union_fields_query% + ), + stemmed_stopped AS ( + SELECT stem(t.w, '%stemmer%') AS term, + t.docid AS docid, + t.fieldid AS fieldid + FROM tokenized AS t + WHERE t.w NOT NULL + AND len(t.w) > 0 + AND t.w NOT IN (SELECT sw FROM %fts_schema%.stopwords) + ) + SELECT ss.term, + ss.docid, + ss.fieldid + FROM stemmed_stopped AS ss; + + ALTER TABLE %fts_schema%.docs ADD len BIGINT; + UPDATE %fts_schema%.docs d + SET len = ( + SELECT count(term) + FROM %fts_schema%.terms AS t + WHERE t.docid = d.docid + ); + + CREATE TABLE %fts_schema%.dict AS + WITH distinct_terms AS ( + SELECT DISTINCT term + FROM %fts_schema%.terms + ORDER BY docid, term + ) + SELECT row_number() OVER () - 1 AS termid, + dt.term + FROM distinct_terms AS dt; + + ALTER TABLE %fts_schema%.terms ADD termid BIGINT; + UPDATE %fts_schema%.terms t + SET termid = ( + SELECT termid + FROM %fts_schema%.dict d + WHERE t.term = d.term + ); + ALTER TABLE %fts_schema%.terms DROP term; + + ALTER TABLE %fts_schema%.dict ADD df BIGINT; + UPDATE %fts_schema%.dict d + SET df = ( + SELECT 
count(distinct docid) + FROM %fts_schema%.terms t + WHERE d.termid = t.termid + GROUP BY termid + ); + + CREATE TABLE %fts_schema%.stats AS ( + SELECT COUNT(docs.docid) AS num_docs, + SUM(docs.len) / COUNT(docs.len) AS avgdl + FROM %fts_schema%.docs AS docs + ); + + CREATE MACRO %fts_schema%.match_bm25(docname, query_string, fields := NULL, k := 1.2, b := 0.75, conjunctive := false) AS ( + WITH tokens AS ( + SELECT DISTINCT stem(unnest(%fts_schema%.tokenize(query_string)), '%stemmer%') AS t + ), + fieldids AS ( + SELECT fieldid + FROM %fts_schema%.fields + WHERE CASE WHEN fields IS NULL THEN 1 ELSE field IN (SELECT * FROM (SELECT UNNEST(string_split(fields, ','))) AS fsq) END + ), + qtermids AS ( + SELECT termid + FROM %fts_schema%.dict AS dict, + tokens + WHERE dict.term = tokens.t + ), + qterms AS ( + SELECT termid, + docid + FROM %fts_schema%.terms AS terms + WHERE CASE WHEN fields IS NULL THEN 1 ELSE fieldid IN (SELECT * FROM fieldids) END + AND termid IN (SELECT qtermids.termid FROM qtermids) + ), + term_tf AS ( + SELECT termid, + docid, + COUNT(*) AS tf + FROM qterms + GROUP BY docid, + termid + ), + cdocs AS ( + SELECT docid + FROM qterms + GROUP BY docid + HAVING CASE WHEN conjunctive THEN COUNT(DISTINCT termid) = (SELECT COUNT(*) FROM tokens) ELSE 1 END + ), + subscores AS ( + SELECT docs.docid, + len, + term_tf.termid, + tf, + df, + (log(((SELECT num_docs FROM %fts_schema%.stats) - df + 0.5) / (df + 0.5) + 1) * ((tf * (k + 1)/(tf + k * (1 - b + b * (len / (SELECT avgdl FROM %fts_schema%.stats))))))) AS subscore + FROM term_tf, + cdocs, + %fts_schema%.docs AS docs, + %fts_schema%.dict AS dict + WHERE term_tf.docid = cdocs.docid + AND term_tf.docid = docs.docid + AND term_tf.termid = dict.termid + ), + scores AS ( + SELECT docid, + sum(subscore) AS score + FROM subscores + GROUP BY docid + ) + SELECT score + FROM scores, + %fts_schema%.docs AS docs + WHERE scores.docid = docs.docid + AND docs.name = docname + ); + )"; + + // we may have more than 1 input 
field, therefore we union over the fields, retaining information which field it came from + string tokenize_field_query = R"( + SELECT unnest(%fts_schema%.tokenize(fts_ii."%input_value%")) AS w, + rowid AS docid, + (SELECT fieldid FROM %fts_schema%.fields WHERE field = '%input_value%') AS fieldid + FROM %input_table% AS fts_ii + )"; + // clang-format on + vector field_values; + vector tokenize_fields; + for (idx_t i = 0; i < input_values.size(); i++) { + field_values.push_back(StringUtil::Format("(%i, '%s')", i, input_values[i])); + tokenize_fields.push_back(StringUtil::Replace(tokenize_field_query, "%input_value%", input_values[i])); + } + result = StringUtil::Replace(result, "%field_values%", StringUtil::Join(field_values, ", ")); + result = StringUtil::Replace(result, "%union_fields_query%", StringUtil::Join(tokenize_fields, " UNION ALL ")); + + string fts_schema = GetFTSSchema(qname); + string input_table = qname.catalog == INVALID_CATALOG ? "" : StringUtil::Format("%s.", qname.catalog); + input_table += StringUtil::Format("%s.%s", qname.schema, qname.name); + + // fill in variables (inefficiently, but keeps SQL script readable) + result = StringUtil::Replace(result, "%fts_schema%", fts_schema); + result = StringUtil::Replace(result, "%input_table%", input_table); + result = StringUtil::Replace(result, "%input_id%", input_id); + result = StringUtil::Replace(result, "%stemmer%", stemmer); + + return result; +} + +static void CheckIfTableExists(ClientContext &context, QualifiedName &qname) { + Catalog::GetEntry(context, qname.catalog, qname.schema, qname.name); +} + +string FTSIndexing::CreateFTSIndexQuery(ClientContext &context, const FunctionParameters ¶meters) { + auto qname = GetQualifiedName(context, StringValue::Get(parameters.values[0])); + CheckIfTableExists(context, qname); + + // get named parameters + string stemmer = "porter"; + auto stemmer_entry = parameters.named_parameters.find("stemmer"); + if (stemmer_entry != parameters.named_parameters.end()) 
{ + stemmer = StringValue::Get(stemmer_entry->second); + } + + string stopwords = "english"; + auto stopword_entry = parameters.named_parameters.find("stopwords"); + if (stopword_entry != parameters.named_parameters.end()) { + stopwords = StringValue::Get(stopword_entry->second); + if (stopwords != "english" && stopwords != "none") { + auto stopwords_qname = GetQualifiedName(context, stopwords); + CheckIfTableExists(context, stopwords_qname); + } + } + + string ignore = "[0-9!@#$%^&*()_+={}\\[\\]:;<>,.?~\\\\/\\|''\"`-]+"; + auto ignore_entry = parameters.named_parameters.find("ignore"); + if (ignore_entry != parameters.named_parameters.end()) { + ignore = StringValue::Get(ignore_entry->second); + } + + bool strip_accents = true; + auto strip_accents_entry = parameters.named_parameters.find("strip_accents"); + if (strip_accents_entry != parameters.named_parameters.end()) { + strip_accents = BooleanValue::Get(strip_accents_entry->second); + } + + bool lower = true; + auto lower_entry = parameters.named_parameters.find("lower"); + if (lower_entry != parameters.named_parameters.end()) { + lower = BooleanValue::Get(lower_entry->second); + } + + bool overwrite = false; + auto overwrite_entry = parameters.named_parameters.find("overwrite"); + if (overwrite_entry != parameters.named_parameters.end()) { + overwrite = BooleanValue::Get(overwrite_entry->second); + } + + // throw error if an index already exists on this table + const string fts_schema = GetFTSSchema(qname); + if (Catalog::GetSchema(context, qname.catalog, fts_schema, OnEntryNotFound::RETURN_NULL) && !overwrite) { + throw CatalogException("a FTS index already exists on table '%s.%s'. 
Supply 'overwrite=1' to overwrite, or " + "drop the existing index with 'PRAGMA drop_fts_index()' before creating a new one.", + qname.schema, qname.name); + } + + // positional parameters + auto doc_id = StringValue::Get(parameters.values[1]); + // check all specified columns + auto &table = Catalog::GetEntry(context, qname.catalog, qname.schema, qname.name); + vector doc_values; + for (idx_t i = 2; i < parameters.values.size(); i++) { + string col_name = StringValue::Get(parameters.values[i]); + if (col_name == "*") { + // star found - get all columns + doc_values.clear(); + for (auto &cd : table.GetColumns().Logical()) { + if (cd.Type() == LogicalType::VARCHAR) { + doc_values.push_back(cd.Name()); + } + } + break; + } + if (!table.ColumnExists(col_name)) { + // we check this here because else we we end up with an error halfway the indexing script + throw CatalogException("Table '%s.%s' does not have a column named '%s'!", qname.schema, qname.name, + col_name); + } + doc_values.push_back(col_name); + } + if (doc_values.empty()) { + throw InvalidInputException("at least one column must be supplied for indexing!"); + } + + return IndexingScript(context, qname, doc_id, doc_values, stemmer, stopwords, ignore, strip_accents, lower); +} + +} // namespace duckdb diff --git a/extension/fts/include/fts_extension.hpp b/extension/fts/include/fts_extension.hpp new file mode 100644 index 0000000..389ffd7 --- /dev/null +++ b/extension/fts/include/fts_extension.hpp @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// DuckDB +// +// fts_extension.hpp +// +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include "duckdb.hpp" + +namespace duckdb { + +class FtsExtension : public Extension { +public: + void Load(DuckDB &db) override; + std::string Name() override; + std::string Version() const override; +}; + +} // namespace duckdb diff --git 
a/extension/fts/include/fts_indexing.hpp b/extension/fts/include/fts_indexing.hpp new file mode 100644 index 0000000..40f67e2 --- /dev/null +++ b/extension/fts/include/fts_indexing.hpp @@ -0,0 +1,20 @@ +//===----------------------------------------------------------------------===// +// DuckDB +// +// fts_indexing.hpp +// +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include "duckdb/main/client_context.hpp" + +namespace duckdb { + +struct FTSIndexing { + static string DropFTSIndexQuery(ClientContext &context, const FunctionParameters ¶meters); + static string CreateFTSIndexQuery(ClientContext &context, const FunctionParameters ¶meters); +}; + +} // namespace duckdb diff --git a/extension/fts/indexing.sql b/extension/fts/indexing.sql new file mode 100644 index 0000000..f986246 --- /dev/null +++ b/extension/fts/indexing.sql @@ -0,0 +1,101 @@ +DROP SCHEMA IF EXISTS %fts_schema% CASCADE; +CREATE SCHEMA %fts_schema%; +CREATE MACRO %fts_schema%.tokenize(s) AS stem(unnest(string_split_regex(regexp_replace(lower(strip_accents(s)), '[^a-z]', ' ', 'g'), '\s+')), '%stemmer%'); + +CREATE TABLE %fts_schema%.docs AS ( + SELECT + row_number() OVER (PARTITION BY(SELECT NULL)) AS docid, + %input_id% AS name + FROM + %input_schema%.%input_table% +); + +CREATE TABLE %fts_schema%.terms AS ( + SELECT + term, + docid, + row_number() OVER (PARTITION BY docid) AS pos + FROM ( + SELECT + %fts_schema%.tokenize(%input_val%) AS term, + row_number() OVER (PARTITION BY (SELECT NULL)) AS docid + FROM %input_schema%.%input_table% + ) AS sq + WHERE + term != '' +); + +ALTER TABLE %fts_schema%.docs ADD len INT; +UPDATE %fts_schema%.docs d +SET len = ( + SELECT count(term) + FROM %fts_schema%.terms t + WHERE t.docid = d.docid +); + +CREATE TABLE %fts_schema%.dict AS +WITH distinct_terms AS ( + SELECT DISTINCT term, docid + FROM %fts_schema%.terms + ORDER BY docid +) +SELECT + row_number() OVER (PARTITION BY (SELECT NULL)) AS termid, + term 
+FROM + distinct_terms; + +ALTER TABLE %fts_schema%.terms ADD termid INT; +UPDATE %fts_schema%.terms t +SET termid = ( + SELECT termid + FROM %fts_schema%.dict d + WHERE t.term = d.term +); +ALTER TABLE %fts_schema%.terms DROP term; + +ALTER TABLE %fts_schema%.dict ADD df INT; +UPDATE %fts_schema%.dict d +SET df = ( + SELECT count(distinct docid) + FROM %fts_schema%.terms t + WHERE d.termid = t.termid + GROUP BY termid +); + +CREATE TABLE %fts_schema%.stats AS ( + SELECT COUNT(docs.docid) AS num_docs, SUM(docs.len) / COUNT(docs.len) AS avgdl + FROM %fts_schema%.docs AS docs +); + +CREATE MACRO %fts_schema%.match_bm25(docname, query_string, k=1.2, b=0.75, conjunctive=0) AS docname IN ( + WITH tokens AS + (SELECT DISTINCT %fts_schema%.tokenize(query_string) AS t), + qtermids AS + (SELECT termid FROM %fts_schema%.dict AS dict, tokens WHERE dict.term = tokens.t), + qterms AS + (SELECT termid, docid FROM %fts_schema%.terms AS terms WHERE termid IN (SELECT qtermids.termid FROM qtermids)), + subscores AS ( + SELECT + docs.docid, len, term_tf.termid, tf, df, + (log(((SELECT num_docs FROM %fts_schema%.stats) - df + 0.5) / (df + 0.5))* ((tf * (k + 1)/(tf + k * (1 - b + b * (len / (SELECT avgdl FROM %fts_schema%.stats))))))) AS subscore + FROM + (SELECT termid, docid, COUNT(*) AS tf FROM qterms GROUP BY docid, termid) AS term_tf + JOIN + (SELECT docid FROM qterms GROUP BY docid HAVING CASE WHEN conjunctive THEN COUNT(DISTINCT termid) = (SELECT COUNT(*) FROM tokens) ELSE 1 END) AS cdocs + ON + term_tf.docid = cdocs.docid + JOIN + %fts_schema%.docs AS docs + ON + term_tf.docid = docs.docid + JOIN + %fts_schema%.dict AS dict + ON + term_tf.termid = dict.termid + ) + SELECT name + FROM (SELECT docid, sum(subscore) AS score FROM subscores GROUP BY docid) AS scores + JOIN %fts_schema%.docs AS docs + ON scores.docid = docs.docid ORDER BY score DESC LIMIT 1000 +); diff --git a/extension_config.cmake b/extension_config.cmake new file mode 100644 index 0000000..f0203c9 --- /dev/null 
+++ b/extension_config.cmake @@ -0,0 +1,9 @@ +# This file is included by DuckDB's build system. It specifies which extension to load + +################# FTS +duckdb_extension_load(fts + DONT_LINK + SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR} + INCLUDE_DIR ${CMAKE_CURRENT_LIST_DIR}/extension/fts/include + ${LOAD_FTS_TESTS} +) diff --git a/test/sql/fts/issue_12330.test b/test/sql/fts/issue_12330.test new file mode 100644 index 0000000..9a68d66 --- /dev/null +++ b/test/sql/fts/issue_12330.test @@ -0,0 +1,36 @@ +# name: test/sql/fts/issue_12330.test +# description: Issue 12330: BM25 matching scores seems to be invalid +# group: [fts] + +# issue #7384 and #8141 + +require fts + +require noalternativeverify + +statement ok +CREATE OR REPLACE TABLE documents ( + id VARCHAR, + content VARCHAR +); + +statement ok +INSERT INTO documents VALUES + ('doc1', 'DuckDB database lorem'), + ('doc2', 'DuckDB database ipsum'), + ('doc3', 'DuckDB database ipsum dolor'); + +statement ok +PRAGMA create_fts_index('documents', 'id', 'content'); + +query I +SELECT + id +FROM + documents +ORDER BY + fts_main_documents.match_bm25(id, 'DuckDB database ipsum') DESC; +---- +doc2 +doc3 +doc1 diff --git a/test/sql/fts/issue_13866.test b/test/sql/fts/issue_13866.test new file mode 100644 index 0000000..6334ebd --- /dev/null +++ b/test/sql/fts/issue_13866.test @@ -0,0 +1,13 @@ +# name: test/sql/fts/issue_13866.test +# description: Issue 13866: FTS ignore regex to include single quote +# group: [fts] + +require fts + +require noalternativeverify + +statement ok +CREATE TABLE my_table AS SELECT 1 AS CustomerId, 'hans' as CustomerName + +statement ok +PRAGMA create_fts_index(my_table, 'CustomerId', 'CustomerName', ignore='(\\.|[^a-z0-9''])+') diff --git a/test/sql/fts/test_fts_attach.test b/test/sql/fts/test_fts_attach.test new file mode 100644 index 0000000..9d90036 --- /dev/null +++ b/test/sql/fts/test_fts_attach.test @@ -0,0 +1,67 @@ +# name: test/sql/fts/test_fts_attach.test +# description: Test FTS and 
attach +# group: [fts] + +# issue #7384 and #8141 + +require fts + +require skip_reload + +require no_alternative_verify + +statement ok +ATTACH '__TEST_DIR__/tester.db' as search_con + +statement ok +CREATE TABLE search_con.main.my_table AS SELECT 1 AS CustomerId, 'hans' as CustomerName + +statement ok +PRAGMA create_fts_index(search_con.main.my_table, 'CustomerId', 'CustomerName') + +statement ok +SELECT search_con.fts_main_my_table.match_bm25(1, 'han') + +statement ok +DETACH search_con + +# test reopened #8141 +load __TEST_DIR__/index.db + +statement ok +CREATE TABLE data AS SELECT 0 __index, 0 id, 'lorem ipsum' nl, NULL code; + +statement ok +PRAGMA create_fts_index('data', '__index', '*', overwrite=1); + +# test that it works before doing the problematic stuff +query IIII +SELECT * FROM data WHERE fts_main_data.match_bm25(__index, 'lorem') IS NOT NULL; +---- +0 0 lorem ipsum NULL + +statement ok +ATTACH ':memory:' AS memory; + +statement ok +USE memory; + +statement ok +DETACH "index"; + +# now attach again +statement ok +ATTACH '__TEST_DIR__/index.db' AS db; + +statement ok +USE db; + +query T +SELECT COUNT(*) FROM data; +---- +1 + +query IIII +SELECT * FROM data WHERE fts_main_data.match_bm25(__index, 'lorem') IS NOT NULL; +---- +0 0 lorem ipsum NULL diff --git a/test/sql/fts/test_indexing.test_slow b/test/sql/fts/test_indexing.test_slow new file mode 100644 index 0000000..e633dd3 --- /dev/null +++ b/test/sql/fts/test_indexing.test_slow @@ -0,0 +1,259 @@ +# name: test/sql/fts/test_indexing.test_slow +# description: Full text search indexing +# group: [fts] + +require skip_reload + +require fts + +require no_alternative_verify + +statement ok +PRAGMA enable_verification + +statement error +PRAGMA drop_fts_index('test') +---- + +statement ok +CREATE SCHEMA fts_main_test + +statement ok +PRAGMA drop_fts_index('test') + +statement ok +CREATE TABLE documents(id VARCHAR, body VARCHAR) + +statement ok +INSERT INTO documents VALUES ('doc1', ' 
QUÁCKING+QUÁCKING+QUÁCKING'), ('doc2', ' BÁRKING+BÁRKING+BÁRKING+BÁRKING'), ('doc3', ' MÉOWING+MÉOWING+MÉOWING+MÉOWING+MÉOWING+999') + +# non-existant parameters should yield an error +statement error +PRAGMA create_fts_index('documents', 'id', 'body', nonexistant_param='dummy') +---- + +# test different stemmer +statement ok +PRAGMA create_fts_index('documents', 'id', 'body', stemmer='turkish') + +# cannot overwrite without supplying the 'overwrite' param +statement error +PRAGMA create_fts_index('documents', 'id', 'body') +---- + +statement ok +PRAGMA create_fts_index('documents', 'id', 'body', overwrite=true) + +# drop and re-create +statement ok +PRAGMA drop_fts_index('documents') + +statement error +PRAGMA create_fts_index('documents', 'id', 'body', stopwords='nonexistant_stopwords_table') +---- + +statement ok +PRAGMA create_fts_index('documents', 'id', 'body', stopwords='english') + +query III +SELECT termid, docid, fieldid FROM fts_main_documents.terms +---- +0 0 0 +0 0 0 +0 0 0 +1 1 0 +1 1 0 +1 1 0 +1 1 0 +2 2 0 +2 2 0 +2 2 0 +2 2 0 +2 2 0 + +query III +SELECT name, docid, len FROM fts_main_documents.docs +---- +doc1 0 3 +doc2 1 4 +doc3 2 5 + +query III +SELECT termid, term, df FROM fts_main_documents.dict +---- +0 quack 1 +1 bark 1 +2 meow 1 + +query T +WITH ppterms AS (SELECT stem(unnest(string_split_regex(regexp_replace(lower(strip_accents('QUÁCKED BÁRKED')), '[^a-z]', ' ', 'g'), '\s+')), 'porter') AS term), +qtermids AS (SELECT termid FROM fts_main_documents.dict AS dict, ppterms WHERE dict.term = ppterms.term) +SELECT * FROM qtermids +---- +0 +1 + +query II +WITH ppterms AS (SELECT stem(unnest(string_split_regex(regexp_replace(lower(strip_accents('QUÁCKED BÁRKED')), '[^a-z]', ' ', 'g'), '\s+')), 'porter') AS term), +qtermids AS (SELECT termid FROM fts_main_documents.dict AS dict, ppterms WHERE dict.term = ppterms.term), +qterms AS (SELECT termid, docid FROM fts_main_documents.terms AS terms WHERE termid IN (SELECT qtermids.termid FROM qtermids)) 
+SELECT * FROM qterms +---- +0 0 +0 0 +0 0 +1 1 +1 1 +1 1 +1 1 + +# log((3 - df + 0.5) / (df + 0.5)) -- number of documents = 3 +# (len / 4) -- average document length is 4 +# HAVING COUNT(DISTINCT termid) = 3 -- commented this out because there is no document with all terms present +query II +WITH ppterms AS (SELECT stem(unnest(string_split_regex(regexp_replace(lower(strip_accents('QUÁCKED BÁRKED')), '[^a-z]', ' ', 'g'), '\s+')), 'porter') AS term), +qtermids AS (SELECT termid FROM fts_main_documents.dict AS dict, ppterms WHERE dict.term = ppterms.term), +qterms AS (SELECT termid, docid FROM fts_main_documents.terms AS terms WHERE termid IN (SELECT qtermids.termid FROM qtermids)), +subscores AS ( +SELECT docs.docid, len, term_tf.termid, + tf, df, (log((3 - df + 0.5) / (df + 0.5))* ((tf * (1.2 + 1)/(tf + 1.2 * (1 - 0.75 + 0.75 * (len / 4)))))) AS subscore +FROM (SELECT termid, docid, COUNT(*) AS tf FROM qterms + GROUP BY docid, termid) AS term_tf + JOIN (SELECT docid FROM qterms + GROUP BY docid) -- HAVING COUNT(DISTINCT termid) = 3) + AS cdocs ON term_tf.docid = cdocs.docid + JOIN fts_main_documents.docs AS docs ON term_tf.docid = docs.docid + JOIN fts_main_documents.dict AS dict ON term_tf.termid = dict.termid) +SELECT name, score FROM (SELECT docid, sum(subscore) AS score + FROM subscores GROUP BY docid) AS scores JOIN fts_main_documents.docs AS docs ON + scores.docid = docs.docid ORDER BY score DESC LIMIT 1000 +---- +doc2 0.3754363455046031 +doc1 0.36835264087244074 + +# now test the actual match macro +query III +SELECT score, id, body FROM (SELECT *, fts_main_documents.match_bm25(id, 'quacked barked') AS score FROM documents) sq WHERE score IS NOT NULL ORDER BY score DESC +---- +0.7208701623069375 doc2 BÁRKING+BÁRKING+BÁRKING+BÁRKING +0.7072688384898254 doc1 QUÁCKING+QUÁCKING+QUÁCKING + +# drop and re-create, but index both the 'body' and 'author' column this time +statement ok +PRAGMA drop_fts_index('documents') + +statement ok +DROP TABLE documents + 
+statement ok +CREATE TABLE documents(id VARCHAR, body VARCHAR, author VARCHAR) + +statement ok +INSERT INTO documents VALUES ('doc1', ' QUÁCKING+QUÁCKING+QUÁCKING', 'Hannes'), ('doc2', ' BÁRKING+BÁRKING+BÁRKING+BÁRKING', 'Mark'), ('doc3', ' MÉOWING+MÉOWING+MÉOWING+MÉOWING+MÉOWING+999', 'Laurens') + +statement ok +PRAGMA create_fts_index('main.documents', 'id', 'body', 'author') + +# prepared statement for easier use +statement ok +PREPARE fts_query AS (WITH scored_docs AS (SELECT *, fts_main_documents.match_bm25(id, ?) AS score FROM documents) SELECT id, body, author FROM scored_docs WHERE score IS NOT NULL ORDER BY score DESC) + +query III +EXECUTE fts_query('hannes') +---- +doc1 QUÁCKING+QUÁCKING+QUÁCKING Hannes + +query III +EXECUTE fts_query('mark laurens') +---- +doc2 BÁRKING+BÁRKING+BÁRKING+BÁRKING Mark +doc3 MÉOWING+MÉOWING+MÉOWING+MÉOWING+MÉOWING+999 Laurens + +query III +EXECUTE fts_query(NULL) +---- + +# different order by changing the parameters +query III +SELECT id, body, author FROM (SELECT *, fts_main_documents.match_bm25(id, 'quacked barked', k := 0.6, b := 0.1) AS score FROM documents) sq WHERE score IS NOT NULL ORDER BY score DESC +---- +doc2 BÁRKING+BÁRKING+BÁRKING+BÁRKING Mark +doc1 QUÁCKING+QUÁCKING+QUÁCKING Hannes + +# no results for conjunctive query because no document contains both 'mark' and 'laurens' +query I +SELECT id FROM (SELECT *, fts_main_documents.match_bm25(id, 'mark laurens', conjunctive := 1) AS score FROM documents) sq WHERE score IS NOT NULL ORDER BY score DESC +---- + +# strings 'mark' and 'laurens' are not found in the 'body' field of the table 'documents' +query I +SELECT id FROM (SELECT *, fts_main_documents.match_bm25(id, 'mark laurens', fields := 'body') AS score FROM documents) sq WHERE score IS NOT NULL ORDER BY score DESC +---- + +# but they are found in the 'author' field! 
+query I +SELECT id FROM (SELECT *, fts_main_documents.match_bm25(id, 'mark laurens', fields := 'author') AS score FROM documents) sq WHERE score IS NOT NULL ORDER BY score DESC +---- +doc2 +doc3 + +# if we search both the 'author' and 'body' fields then we get the same behaviour as leaving the fields empty +query I +SELECT id FROM (SELECT *, fts_main_documents.match_bm25(id, 'mark laurens', fields := 'body,author') AS score FROM documents) sq WHERE score IS NOT NULL ORDER BY score DESC +---- +doc2 +doc3 + +# if we don't search any fields, we won't get any results +query I +SELECT id FROM (SELECT *, fts_main_documents.match_bm25(id, 'hannes mark laurens', fields := '') AS score FROM documents) sq WHERE score IS NOT NULL ORDER BY score DESC +---- + +# re-index with different stopwords table +statement ok +PRAGMA drop_fts_index('documents') + +statement ok +CREATE TABLE my_stopwords (word VARCHAR) + +statement ok +INSERT INTO my_stopwords VALUES ('quacking') + +statement ok +PRAGMA create_fts_index('documents', 'id', 'body', stopwords='my_stopwords') + +# the word 'quacking' is no longer indexed, therefore doc1 is no longer retrieved with this query +query III +SELECT id, body, author FROM (SELECT *, fts_main_documents.match_bm25(id, 'quacked barked') AS score FROM documents) sq WHERE score IS NOT NULL ORDER BY score DESC +---- +doc2 BÁRKING+BÁRKING+BÁRKING+BÁRKING Mark + +# re-index with a custom whitelist, so that we can retrieve documents by searching on numbers +statement ok +PRAGMA drop_fts_index('documents') + +statement ok +PRAGMA create_fts_index('documents', 'id', 'body', ignore='(\\.|[^a-z0-9])+') + +query I +SELECT body FROM (SELECT *, fts_main_documents.match_bm25(id, '999') AS score FROM documents) sq WHERE score IS NOT NULL ORDER BY score DESC +---- + MÉOWING+MÉOWING+MÉOWING+MÉOWING+MÉOWING+999 + +# re-index with '*' to index all columns +statement ok +PRAGMA drop_fts_index('documents') + +statement ok +PRAGMA create_fts_index('documents', 'id', '*', 
stopwords='english') + +# prepared statement again for easier use +statement ok +PREPARE fts_query AS (WITH scored_docs AS (SELECT *, fts_main_documents.match_bm25(id, ?) AS score FROM documents) SELECT id, body, author FROM scored_docs WHERE score IS NOT NULL ORDER BY score DESC) + +query III +EXECUTE fts_query('quacked mark laurens') +---- +doc1 QUÁCKING+QUÁCKING+QUÁCKING Hannes +doc2 BÁRKING+BÁRKING+BÁRKING+BÁRKING Mark +doc3 MÉOWING+MÉOWING+MÉOWING+MÉOWING+MÉOWING+999 Laurens diff --git a/test/sql/fts/test_indexing_and_schema.test b/test/sql/fts/test_indexing_and_schema.test new file mode 100644 index 0000000..19f9e43 --- /dev/null +++ b/test/sql/fts/test_indexing_and_schema.test @@ -0,0 +1,39 @@ +# name: test/sql/fts/test_indexing_and_schema.test +# description: Refer default schema when the table name doesn't have a qualifier. +# group: [fts] + +require fts + +require no_alternative_verify + +statement ok +CREATE SCHEMA test + +statement ok +CREATE TABLE test.documents(id VARCHAR, body VARCHAR) + +statement ok +INSERT INTO test.documents VALUES ('doc1', ' QUÁCKING+QUÁCKING+QUÁCKING'), ('doc2', ' BÁRKING+BÁRKING+BÁRKING+BÁRKING'), ('doc3', ' MÉOWING+MÉOWING+MÉOWING+MÉOWING+MÉOWING+999') + +statement error +PRAGMA create_fts_index('documents', 'id', 'body') +---- + +statement ok +SET SCHEMA='test' + +statement ok +PRAGMA create_fts_index('documents', 'id', 'body') + +statement ok +SET SCHEMA='main' + +statement error +PRAGMA drop_fts_index('documents') +---- + +statement ok +SET SCHEMA='test' + +statement ok +PRAGMA drop_fts_index('documents') diff --git a/test/sql/fts/test_issue_10254.test b/test/sql/fts/test_issue_10254.test new file mode 100644 index 0000000..c1afeaf --- /dev/null +++ b/test/sql/fts/test_issue_10254.test @@ -0,0 +1,25 @@ +# name: test/sql/fts/test_issue_10254.test +# description: Test issue #10254: FTS not working with stemmer +# group: [fts] + +require fts + +require no_alternative_verify + +statement ok +CREATE TABLE data (context VARCHAR, 
question VARCHAR, id BIGINT) + +statement ok +INSERT INTO data VALUES + ('Многоклеточный организм — внесистематическая категория живых организмов, тело которых состоит из многих клеток, большая часть которых (кроме стволовых, например, клеток камбия у растений) дифференцированы, то есть различаются по строению и выполняемым функциям. Следует отличать многоклеточность и колониальность. У колониальных организмов отсутствуют настоящие дифференцированные клетки, а следовательно, и разделение тела на ткани. Граница между многоклеточностью и колониальностью нечёткая. Например, вольвокс часто относят к колониальным организмам, хотя в его колониях есть чёткое деление клеток на генеративные и соматические. Кроме дифференциации клеток, для многоклеточных характерен и более высокий уровень интеграции, чем для колониальных форм. Многоклеточные животные, возможно, появились на Земле 2,1 миллиарда лет назад, вскоре после кислородной революции .', 'У каких организмов отсутствуют настоящие дифференцированные клетки?', 0), + ('Многоклеточный организм — внесистематическая категория живых организмов, тело которых состоит из многих клеток, большая часть которых (кроме стволовых, например, клеток камбия у растений) дифференцированы, то есть различаются по строению и выполняемым функциям. Следует отличать многоклеточность и колониальность. У колониальных организмов отсутствуют настоящие дифференцированные клетки, а следовательно, и разделение тела на ткани. Граница между многоклеточностью и колониальностью нечёткая. Например, вольвокс часто относят к колониальным организмам, хотя в его колониях есть чёткое деление клеток на генеративные и соматические. Кроме дифференциации клеток, для многоклеточных характерен и более высокий уровень интеграции, чем для колониальных форм. 
Многоклеточные животные, возможно, появились на Земле 2,1 миллиарда лет назад, вскоре после кислородной революции .', 'Какие животные появились на Земле 2,1 миллиарда лет назад?', 1), + ('Многоклеточный организм — внесистематическая категория живых организмов, тело которых состоит из многих клеток, большая часть которых (кроме стволовых, например, клеток камбия у растений) дифференцированы, то есть различаются по строению и выполняемым функциям. Следует отличать многоклеточность и колониальность. У колониальных организмов отсутствуют настоящие дифференцированные клетки, а следовательно, и разделение тела на ткани. Граница между многоклеточностью и колониальностью нечёткая. Например, вольвокс часто относят к колониальным организмам, хотя в его колониях есть чёткое деление клеток на генеративные и соматические. Кроме дифференциации клеток, для многоклеточных характерен и более высокий уровень интеграции, чем для колониальных форм. Многоклеточные животные, возможно, появились на Земле 2,1 миллиарда лет назад, вскоре после кислородной революции .', 'Когда предположительно появились многоклеточные животные?', 2) + +statement ok +PRAGMA create_fts_index('data', 'id', 'context', 'question', stemmer='russian', overwrite=1); + +query I +SELECT id FROM (SELECT *, fts_main_data.match_bm25(id, 'Какие') AS score FROM data) sq WHERE score IS NOT NULL ORDER BY score DESC; +---- +0 +1 diff --git a/test/sql/fts/test_issue_10281.test b/test/sql/fts/test_issue_10281.test new file mode 100644 index 0000000..a62e3b5 --- /dev/null +++ b/test/sql/fts/test_issue_10281.test @@ -0,0 +1,19 @@ +# name: test/sql/fts/test_issue_10281.test +# description: Test issue #10281: Error when trying to create FTS index for column with struct data +# group: [fts] + +require fts + +require no_alternative_verify + +statement ok +CREATE OR REPLACE TABLE data AS SELECT {'duck': 42} conversations, 42::bigint _id; + +statement ok +PRAGMA create_fts_index('data', '_id', 'conversations'); + +# we should be able 
to retrieve the struct col +query I +SELECT _id FROM (SELECT *, fts_main_data.match_bm25(_id, 'duck') AS score FROM data) sq WHERE score IS NOT NULL ORDER BY score DESC; +---- +42 diff --git a/test/sql/fts/test_issue_5936.test b/test/sql/fts/test_issue_5936.test new file mode 100644 index 0000000..1f08477 --- /dev/null +++ b/test/sql/fts/test_issue_5936.test @@ -0,0 +1,18 @@ +# name: test/sql/fts/test_issue_5936.test +# description: Issue #5936 - Confusing "column does not exist" error when using column named "document" with full text search +# group: [fts] + +require skip_reload + +require fts + +require no_alternative_verify + +statement ok +CREATE TABLE documents(document VARCHAR, url VARCHAR); + +statement ok +INSERT INTO documents VALUES ('hello world', 'https://example.com'), ('foobar', 'https://google.com'); + +statement ok +PRAGMA create_fts_index(documents, url, document); diff --git a/test/sql/fts/test_stemmer.test_slow b/test/sql/fts/test_stemmer.test_slow new file mode 100644 index 0000000..cf637c0 --- /dev/null +++ b/test/sql/fts/test_stemmer.test_slow @@ -0,0 +1,73 @@ +# name: test/sql/fts/test_stemmer.test_slow +# description: Full text search stemmer +# group: [fts] + +require fts + +statement ok +PRAGMA enable_verification + +query T +select stem('iiiiiiinformation', 'porter') +---- +iiiiiiinform + +query T +SELECT stem(NULL, 'porter') +---- +NULL + +query T +SELECT stem('', 'porter') +---- +(empty) + +query T +SELECT stem('connection', 'porter') +---- +connect + +query T +SELECT stem('an', 'porter') +---- +an + +query T +SELECT stem('🦆', 'porter') +---- +🦆 + +query T +SELECT stem('information information', 'porter') +---- +information inform + +query T +SELECT stem(concat(repeat('i', 64), 'nformation'), 'porter') +---- +iiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiinform + +query T +SELECT stem('information', 'none') +---- +information + +query T +SELECT stem('information ', 'porter') +---- +information + +query T +select 
stem(UNNEST(string_split(repeat('information ', 5), ' ')), 'porter') +---- +inform +inform +inform +inform +inform +(empty) + +query I +SELECT stem(UNNEST(string_split(string_agg(range, 'information '), ' ')), 'porter') AS s, mod(range, 100) xx FROM range(50000) GROUP BY xx ORDER BY s +---- +100000 values hashing to 030f4662a25fbc772e84af37e1cc8177