From c27a4d7807af87024724f957e12698def4efdef6 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Mon, 9 Dec 2024 09:05:43 +0100 Subject: [PATCH 01/25] First try of turning the index building into a free function that can be used in a library. Signed-off-by: Johannes Kalmbach --- CMakeLists.txt | 1 + src/index/IndexBuilderMain.cpp | 1 - src/libqlever/CMakeLists.txt | 0 src/libqlever/Qlever.cpp | 5 ++ src/libqlever/Qlever.h | 111 +++++++++++++++++++++++++++++++++ 5 files changed, 117 insertions(+), 1 deletion(-) create mode 100644 src/libqlever/CMakeLists.txt create mode 100644 src/libqlever/Qlever.cpp create mode 100644 src/libqlever/Qlever.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 0503cd210f..a9daa916a3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -405,6 +405,7 @@ target_precompile_headers(engine PRIVATE ${PRECOMPILED_HEADER_FILES_ENGINE}) add_subdirectory(src/index) add_subdirectory(src/util) add_subdirectory(benchmark) +add_subdirectory(src/libqlever) enable_testing() option(SINGLE_TEST_BINARY "Link all unit tests into a single binary. This is useful e.g. for code coverage tools" OFF) diff --git a/src/index/IndexBuilderMain.cpp b/src/index/IndexBuilderMain.cpp index 1b500c9dde..5cca43f77d 100644 --- a/src/index/IndexBuilderMain.cpp +++ b/src/index/IndexBuilderMain.cpp @@ -165,7 +165,6 @@ int main(int argc, char** argv) { bool onlyPsoAndPos = false; bool addWordsFromLiterals = false; std::optional stxxlMemory; - optind = 1; Index index{ad_utility::makeUnlimitedAllocator()}; diff --git a/src/libqlever/CMakeLists.txt b/src/libqlever/CMakeLists.txt new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/libqlever/Qlever.cpp b/src/libqlever/Qlever.cpp new file mode 100644 index 0000000000..2d9c407597 --- /dev/null +++ b/src/libqlever/Qlever.cpp @@ -0,0 +1,5 @@ +// +// Created by kalmbacj on 12/9/24. 
+// + +#include "Qlever.h" diff --git a/src/libqlever/Qlever.h b/src/libqlever/Qlever.h new file mode 100644 index 0000000000..bc183b3706 --- /dev/null +++ b/src/libqlever/Qlever.h @@ -0,0 +1,111 @@ +// Copyright 2024, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Author: Johannes Kalmbach + +#pragma once + +#include + +#include +#include +#include + +#include "index/Index.h" +#include "index/InputFileSpecification.h" +#include "util/AllocatorWithLimit.h" + +namespace qlever { + +struct IndexBuilderConfig { + std::string baseName; + std::string wordsfile; + std::string docsfile; + std::string textIndexName; + std::string kbIndexName; + std::string settingsFile; + std::vector inputFiles; + bool noPatterns = false; + bool onlyAddTextIndex = false; + bool keepTemporaryFiles = false; + bool onlyPsoAndPos = false; + bool addWordsFromLiterals = false; + std::optional stxxlMemory; +}; + +string getStxxlConfigFileName(const string& location) { + return absl::StrCat(location, ".stxxl"); +} + +string getStxxlDiskFileName(const string& location, const string& tail) { + return absl::StrCat(location, tail, ".stxxl-disk"); +} + +// Write a .stxxl config-file. +// All we want is sufficient space somewhere with enough space. +// We can use the location of input files and use a constant size for now. +// The required size can only be estimated anyway, since index size +// depends on the structure of words files rather than their size only, +// because of the "multiplications" performed. 
+void writeStxxlConfigFile(const string& location, const string& tail) { + string stxxlConfigFileName = getStxxlConfigFileName(location); + ad_utility::File stxxlConfig(stxxlConfigFileName, "w"); + auto configFile = ad_utility::makeOfstream(stxxlConfigFileName); + // Inform stxxl about .stxxl location + setenv("STXXLCFG", stxxlConfigFileName.c_str(), true); + configFile << "disk=" << getStxxlDiskFileName(location, tail) << "," + << STXXL_DISK_SIZE_INDEX_BUILDER << ",syscall\n"; +} + +class Qlever { + void buildIndex(IndexBuilderConfig config) { + Index index{ad_utility::makeUnlimitedAllocator()}; + + if (config.stxxlMemory.has_value()) { + index.memoryLimitIndexBuilding() = config.stxxlMemory.value(); + } + // If no text index name was specified, take the part of the wordsfile after + // the last slash. + if (config.textIndexName.empty() && !config.wordsfile.empty()) { + config.textIndexName = + ad_utility::getLastPartOfString(config.wordsfile, '/'); + } + try { + LOG(TRACE) << "Configuring STXXL..." << std::endl; + size_t posOfLastSlash = config.baseName.rfind('/'); + string location = config.baseName.substr(0, posOfLastSlash + 1); + string tail = config.baseName.substr(posOfLastSlash + 1); + writeStxxlConfigFile(location, tail); + string stxxlFileName = getStxxlDiskFileName(location, tail); + LOG(TRACE) << "done." 
<< std::endl; + + index.setKbName(config.kbIndexName); + index.setTextName(config.textIndexName); + index.usePatterns() = !config.noPatterns; + index.setOnDiskBase(config.baseName); + index.setKeepTempFiles(config.keepTemporaryFiles); + index.setSettingsFile(config.settingsFile); + index.loadAllPermutations() = !config.onlyPsoAndPos; + + if (!config.onlyAddTextIndex) { + AD_CONTRACT_CHECK(!config.inputFiles.empty()); + index.createFromFiles(config.inputFiles); + } + + if (!config.wordsfile.empty() || config.addWordsFromLiterals) { + index.addTextFromContextFile(config.wordsfile, + config.addWordsFromLiterals); + } + + if (!config.docsfile.empty()) { + index.buildDocsDB(config.docsfile); + } + ad_utility::deleteFile(stxxlFileName, false); + } catch (std::exception& e) { + LOG(ERROR) << "Creating the index for QLever failed with the following " + "exception: " + << e.what() << std::endl; + throw; + } + } +}; +} // namespace qlever From 237e93cab050b5caa43f5708618904a60f88ee71 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Mon, 9 Dec 2024 11:18:18 +0100 Subject: [PATCH 02/25] First try of turning the index building into a free function that can be used in a library. 
Signed-off-by: Johannes Kalmbach --- src/libqlever/CMakeLists.txt | 5 + src/libqlever/LibQLeverExample.cpp | 16 +++ src/libqlever/Qlever.cpp | 162 ++++++++++++++++++++++++++++- src/libqlever/Qlever.h | 152 +++++++++++++-------------- 4 files changed, 249 insertions(+), 86 deletions(-) create mode 100644 src/libqlever/LibQLeverExample.cpp diff --git a/src/libqlever/CMakeLists.txt b/src/libqlever/CMakeLists.txt index e69de29bb2..589c150393 100644 --- a/src/libqlever/CMakeLists.txt +++ b/src/libqlever/CMakeLists.txt @@ -0,0 +1,5 @@ + +add_library(qlever Qlever.cpp) +qlever_target_link_libraries(qlever parser engine util index absl::strings) +add_executable(LibQLeverExample LibQLeverExample.cpp) +qlever_target_link_libraries(LibQLeverExample parser engine util index qlever absl::strings) \ No newline at end of file diff --git a/src/libqlever/LibQLeverExample.cpp b/src/libqlever/LibQLeverExample.cpp new file mode 100644 index 0000000000..dbc0ffe2e6 --- /dev/null +++ b/src/libqlever/LibQLeverExample.cpp @@ -0,0 +1,16 @@ +// Copyright 2024, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Author: Johannes Kalmbach + +#include + +#include "libqlever/Qlever.h" + +int main() { + qlever::QleverConfig config; + config.baseName = "exampleIndex"; + config.inputFiles.emplace_back("/dev/stdin", qlever::Filetype::Turtle); + qlever::Qlever::buildIndex(config); + qlever::Qlever qlever{config}; + std::cout << qlever.query("SELECT * {?s ?p ?o}") << std::endl; +} diff --git a/src/libqlever/Qlever.cpp b/src/libqlever/Qlever.cpp index 2d9c407597..cc37562dde 100644 --- a/src/libqlever/Qlever.cpp +++ b/src/libqlever/Qlever.cpp @@ -1,5 +1,159 @@ -// -// Created by kalmbacj on 12/9/24. -// +// Copyright 2024, University of Freiburg, +// Chair of Algorithms and Data Structures. 
+// Author: Johannes Kalmbach -#include "Qlever.h" +#include "libqlever/Qlever.h" + +namespace qlever { +static std::string getStxxlConfigFileName(const string& location) { + return absl::StrCat(location, ".stxxl"); +} + +static std::string getStxxlDiskFileName(const string& location, + const string& tail) { + return absl::StrCat(location, tail, ".stxxl-disk"); +} + +// Write a .stxxl config-file. +// All we want is sufficient space somewhere with enough space. +// We can use the location of input files and use a constant size for now. +// The required size can only be estimated anyway, since index size +// depends on the structure of words files rather than their size only, +// because of the "multiplications" performed. +static void writeStxxlConfigFile(const string& location, const string& tail) { + string stxxlConfigFileName = getStxxlConfigFileName(location); + ad_utility::File stxxlConfig(stxxlConfigFileName, "w"); + auto configFile = ad_utility::makeOfstream(stxxlConfigFileName); + // Inform stxxl about .stxxl location + setenv("STXXLCFG", stxxlConfigFileName.c_str(), true); + configFile << "disk=" << getStxxlDiskFileName(location, tail) << "," + << STXXL_DISK_SIZE_INDEX_BUILDER << ",syscall\n"; +} + +// _____________________________________________________________________________ +Qlever::Qlever(const QleverConfig& config) + : allocator_{ad_utility::AllocatorWithLimit{ + ad_utility::makeAllocationMemoryLeftThreadsafeObject( + config.memoryLimit.value())}}, + index_{allocator_} { + ad_utility::setGlobalLoggingStream(&ignoreLogStream); + // This also directly triggers the update functions and propagates the + // values of the parameters to the cache. 
+ RuntimeParameters().setOnUpdateAction<"cache-max-num-entries">( + [this](size_t newValue) { cache_.setMaxNumEntries(newValue); }); + RuntimeParameters().setOnUpdateAction<"cache-max-size">( + [this](ad_utility::MemorySize newValue) { cache_.setMaxSize(newValue); }); + RuntimeParameters().setOnUpdateAction<"cache-max-size-single-entry">( + [this](ad_utility::MemorySize newValue) { + cache_.setMaxSizeSingleEntry(newValue); + }); + index_.usePatterns() = !config.noPatterns; + enablePatternTrick_ = !config.noPatterns; + index_.loadAllPermutations() = !config.onlyPsoAndPos; + + // Init the index. + index_.createFromOnDiskIndex(config.baseName); + // TODO Enable the loading of the text index via the QLever lib. + /* + if (useText) { + index_.addTextFromOnDiskIndex(); + } + */ + + sortPerformanceEstimator_.computeEstimatesExpensively( + allocator_, index_.numTriples().normalAndInternal_() * + PERCENTAGE_OF_TRIPLES_FOR_SORT_ESTIMATE / 100); +} + +// _____________________________________________________________________________ +void Qlever::buildIndex(QleverConfig config) { + ad_utility::setGlobalLoggingStream(&ignoreLogStream); + Index index{ad_utility::makeUnlimitedAllocator()}; + + if (config.memoryLimit.has_value()) { + index.memoryLimitIndexBuilding() = config.memoryLimit.value(); + } + // If no text index name was specified, take the part of the wordsfile after + // the last slash. + if (config.textIndexName.empty() && !config.wordsfile.empty()) { + config.textIndexName = + ad_utility::getLastPartOfString(config.wordsfile, '/'); + } + try { + LOG(TRACE) << "Configuring STXXL..." << std::endl; + size_t posOfLastSlash = config.baseName.rfind('/'); + string location = config.baseName.substr(0, posOfLastSlash + 1); + string tail = config.baseName.substr(posOfLastSlash + 1); + writeStxxlConfigFile(location, tail); + string stxxlFileName = getStxxlDiskFileName(location, tail); + LOG(TRACE) << "done." 
<< std::endl; + + index.setKbName(config.kbIndexName); + index.setTextName(config.textIndexName); + index.usePatterns() = !config.noPatterns; + index.setOnDiskBase(config.baseName); + index.setKeepTempFiles(config.keepTemporaryFiles); + index.setSettingsFile(config.settingsFile); + index.loadAllPermutations() = !config.onlyPsoAndPos; + + if (!config.onlyAddTextIndex) { + AD_CONTRACT_CHECK(!config.inputFiles.empty()); + index.createFromFiles(config.inputFiles); + } + + if (!config.wordsfile.empty() || config.addWordsFromLiterals) { + index.addTextFromContextFile(config.wordsfile, + config.addWordsFromLiterals); + } + + if (!config.docsfile.empty()) { + index.buildDocsDB(config.docsfile); + } + ad_utility::deleteFile(stxxlFileName, false); + } catch (std::exception& e) { + LOG(ERROR) << "Creating the index for QLever failed with the following " + "exception: " + << e.what() << std::endl; + throw; + } +} + +// ___________________________________________________________________________ +std::string Qlever::query(std::string query) { + QueryExecutionContext qec{index_, &cache_, allocator_, + sortPerformanceEstimator_}; + auto parsedQuery = SparqlParser::parseQuery(query); + auto handle = std::make_shared>(); + QueryPlanner qp{&qec, handle}; + qp.setEnablePatternTrick(enablePatternTrick_); + auto qet = qp.createExecutionTree(parsedQuery); + qet.isRoot() = true; + auto& limitOffset = parsedQuery._limitOffset; + + // TODO For cancellation we have to call + // `recursivelySetCancellationHandle` (see `Server::parseAndPlan`). + + // TODO The following interface looks fishy and should be + // incorporated directly in the query planner or somewhere else. + // (it is used identically in `Server.cpp`. + + // Make sure that the offset is not applied again when exporting the result + // (it is already applied by the root operation in the query execution + // tree). Note that we don't need this for the limit because applying a + // fixed limit is idempotent. 
+ AD_CORRECTNESS_CHECK(limitOffset._offset >= + qet.getRootOperation()->getLimit()._offset); + limitOffset._offset -= qet.getRootOperation()->getLimit()._offset; + + ad_utility::Timer timer{ad_utility::Timer::Started}; + auto responseGenerator = ExportQueryExecutionTrees::computeResult( + parsedQuery, qet, ad_utility::MediaType::sparqlJson, timer, + std::move(handle)); + std::string result; + std::cout << "Writing the result:" << std::endl; + for (const auto& batch : responseGenerator) { + result += batch; + } + return result; +} +} // namespace qlever diff --git a/src/libqlever/Qlever.h b/src/libqlever/Qlever.h index bc183b3706..b69dcf9588 100644 --- a/src/libqlever/Qlever.h +++ b/src/libqlever/Qlever.h @@ -8,104 +8,92 @@ #include #include +#include #include +#include "engine/ExportQueryExecutionTrees.h" +#include "engine/QueryExecutionContext.h" +#include "engine/QueryPlanner.h" +#include "global/RuntimeParameters.h" #include "index/Index.h" #include "index/InputFileSpecification.h" +#include "parser/SparqlParser.h" #include "util/AllocatorWithLimit.h" +#include "util/http/MediaTypes.h" namespace qlever { -struct IndexBuilderConfig { +// A configuration for a QLever instance. +struct QleverConfig { + // A basename for all files that QLever will write as part of the index + // building. std::string baseName; + + // The specification of the input files (Turtle/NT or NQuad) from which the + // index will be built. + std::vector inputFiles; + + // A memory limit that will be applied during the index building as well as + // during the query processing. + std::optional memoryLimit = + ad_utility::MemorySize::gigabytes(1); + + // If set to true, then no so-called patterns will be built. Patterns are + // useful for autocompletion and for certain statistics queries, but not for + // typical SELECT queries. + bool noPatterns = false; + + // Only build two permutations. This is sufficient if all queries have a fixed + // predicate. 
+ // TODO We haven't tested this mode in a while, it is currently + // probably broken because the UPDATE mechanism doesn't support only two + // permutations. + bool onlyPsoAndPos = false; + + // Optionally a filename to a .json file with additional settings... + // TODO Make these settings part of this struct directly + // TODO Document these additional settings. + std::string settingsFile; + + // The following members are only required if QLever's full-text search + // extension is to be used, see `IndexBuilderMain.cpp` for additional details. + bool addWordsFromLiterals = false; + std::string kbIndexName; std::string wordsfile; std::string docsfile; std::string textIndexName; - std::string kbIndexName; - std::string settingsFile; - std::vector inputFiles; - bool noPatterns = false; bool onlyAddTextIndex = false; + + // If set to true, then certain temporary files which are created while + // building the index are not deleted. This can be useful for debugging. bool keepTemporaryFiles = false; - bool onlyPsoAndPos = false; - bool addWordsFromLiterals = false; - std::optional stxxlMemory; }; -string getStxxlConfigFileName(const string& location) { - return absl::StrCat(location, ".stxxl"); -} - -string getStxxlDiskFileName(const string& location, const string& tail) { - return absl::StrCat(location, tail, ".stxxl-disk"); -} - -// Write a .stxxl config-file. -// All we want is sufficient space somewhere with enough space. -// We can use the location of input files and use a constant size for now. -// The required size can only be estimated anyway, since index size -// depends on the structure of words files rather than their size only, -// because of the "multiplications" performed. 
-void writeStxxlConfigFile(const string& location, const string& tail) { - string stxxlConfigFileName = getStxxlConfigFileName(location); - ad_utility::File stxxlConfig(stxxlConfigFileName, "w"); - auto configFile = ad_utility::makeOfstream(stxxlConfigFileName); - // Inform stxxl about .stxxl location - setenv("STXXLCFG", stxxlConfigFileName.c_str(), true); - configFile << "disk=" << getStxxlDiskFileName(location, tail) << "," - << STXXL_DISK_SIZE_INDEX_BUILDER << ",syscall\n"; -} - +// A class that can be used to use QLever without the HTTP server, e.g. as part +// of another program. class Qlever { - void buildIndex(IndexBuilderConfig config) { - Index index{ad_utility::makeUnlimitedAllocator()}; - - if (config.stxxlMemory.has_value()) { - index.memoryLimitIndexBuilding() = config.stxxlMemory.value(); - } - // If no text index name was specified, take the part of the wordsfile after - // the last slash. - if (config.textIndexName.empty() && !config.wordsfile.empty()) { - config.textIndexName = - ad_utility::getLastPartOfString(config.wordsfile, '/'); - } - try { - LOG(TRACE) << "Configuring STXXL..." << std::endl; - size_t posOfLastSlash = config.baseName.rfind('/'); - string location = config.baseName.substr(0, posOfLastSlash + 1); - string tail = config.baseName.substr(posOfLastSlash + 1); - writeStxxlConfigFile(location, tail); - string stxxlFileName = getStxxlDiskFileName(location, tail); - LOG(TRACE) << "done." 
<< std::endl; - - index.setKbName(config.kbIndexName); - index.setTextName(config.textIndexName); - index.usePatterns() = !config.noPatterns; - index.setOnDiskBase(config.baseName); - index.setKeepTempFiles(config.keepTemporaryFiles); - index.setSettingsFile(config.settingsFile); - index.loadAllPermutations() = !config.onlyPsoAndPos; - - if (!config.onlyAddTextIndex) { - AD_CONTRACT_CHECK(!config.inputFiles.empty()); - index.createFromFiles(config.inputFiles); - } - - if (!config.wordsfile.empty() || config.addWordsFromLiterals) { - index.addTextFromContextFile(config.wordsfile, - config.addWordsFromLiterals); - } - - if (!config.docsfile.empty()) { - index.buildDocsDB(config.docsfile); - } - ad_utility::deleteFile(stxxlFileName, false); - } catch (std::exception& e) { - LOG(ERROR) << "Creating the index for QLever failed with the following " - "exception: " - << e.what() << std::endl; - throw; - } - } + private: + QueryResultCache cache_; + ad_utility::AllocatorWithLimit allocator_; + SortPerformanceEstimator sortPerformanceEstimator_; + Index index_; + bool enablePatternTrick_; + static inline std::ostringstream ignoreLogStream; + + public: + // Build a persistent on disk index using the `config`. + static void buildIndex(QleverConfig config); + + // Load the qlever index from file. + explicit Qlever(const QleverConfig& config); + + // Run the given query on the index. Currently only SELECT and ASK queries are + // supported, and the result will always be in sparql-results+json format. + // TODO Support other formats + CONSTRUCT queries, support + // cancellation, time limits, and observable queries. + std::string query(std::string query); + + // TODO Give access to the RuntimeParameters() which allow for + // further tweaking of the qlever instance. }; } // namespace qlever From 10d492df9d8f53cd2b55f319d65674ea59abc7b4 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Mon, 16 Dec 2024 21:35:03 +0100 Subject: [PATCH 03/25] Optimize some includes. 
Signed-off-by: Johannes Kalmbach --- CMakeLists.txt | 2 +- src/engine/CMakeLists.txt | 4 +++- src/engine/Service.cpp | 1 - src/engine/Service.h | 4 ++++ test/CMakeLists.txt | 2 +- 5 files changed, 9 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a9daa916a3..ceb383ecd7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -420,7 +420,7 @@ add_executable(IndexBuilderMain src/index/IndexBuilderMain.cpp) qlever_target_link_libraries(IndexBuilderMain index ${CMAKE_THREAD_LIBS_INIT} Boost::program_options compilationInfo) add_executable(ServerMain src/ServerMain.cpp) -qlever_target_link_libraries(ServerMain engine ${CMAKE_THREAD_LIBS_INIT} Boost::program_options compilationInfo) +qlever_target_link_libraries(ServerMain engine server ${CMAKE_THREAD_LIBS_INIT} Boost::program_options compilationInfo) target_precompile_headers(ServerMain REUSE_FROM engine) add_executable(VocabularyMergerMain src/VocabularyMergerMain.cpp) diff --git a/src/engine/CMakeLists.txt b/src/engine/CMakeLists.txt index cbfb3344c3..7105bde2da 100644 --- a/src/engine/CMakeLists.txt +++ b/src/engine/CMakeLists.txt @@ -5,7 +5,7 @@ add_library(engine Engine.cpp QueryExecutionTree.cpp Operation.cpp Result.cpp LocalVocab.cpp IndexScan.cpp Join.cpp Sort.cpp Distinct.cpp OrderBy.cpp Filter.cpp - Server.cpp QueryPlanner.cpp QueryPlanningCostFactors.cpp + QueryPlanner.cpp QueryPlanningCostFactors.cpp OptionalJoin.cpp CountAvailablePredicates.cpp GroupBy.cpp HasPredicateScan.cpp Union.cpp MultiColumnJoin.cpp TransitivePathBase.cpp TransitivePathHashMap.cpp TransitivePathBinSearch.cpp Service.cpp @@ -14,4 +14,6 @@ add_library(engine CartesianProductJoin.cpp TextIndexScanForWord.cpp TextIndexScanForEntity.cpp TextLimit.cpp LazyGroupBy.cpp GroupByHashMapOptimization.cpp SpatialJoin.cpp CountConnectedSubgraphs.cpp SpatialJoinAlgorithms.cpp PathSearch.cpp ExecuteUpdate.cpp) +add_library(server Server.cpp) +qlever_target_link_libraries(server) qlever_target_link_libraries(engine util 
index parser sparqlExpressions http SortPerformanceEstimator Boost::iostreams s2) diff --git a/src/engine/Service.cpp b/src/engine/Service.cpp index 8c946a2fb3..cf11babfa4 100644 --- a/src/engine/Service.cpp +++ b/src/engine/Service.cpp @@ -19,7 +19,6 @@ #include "util/HashMap.h" #include "util/HashSet.h" #include "util/StringUtils.h" -#include "util/http/HttpUtils.h" // ____________________________________________________________________________ Service::Service(QueryExecutionContext* qec, diff --git a/src/engine/Service.h b/src/engine/Service.h index 8fef6f5d0e..2267928f7f 100644 --- a/src/engine/Service.h +++ b/src/engine/Service.h @@ -12,6 +12,10 @@ #include "util/LazyJsonParser.h" #include "util/http/HttpClient.h" +// Forward declarations to reduce dependencies +struct HttpOrHttpsResponse; +namespace ad_utility {} + // The SERVICE operation. Sends a query to the remote endpoint specified by the // service IRI, gets the result as JSON, parses it, and writes it into a result // table. diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 9dd3a733a9..f85aaa8306 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -433,6 +433,6 @@ addLinkAndDiscoverTest(SparqlExpressionGeneratorsTest engine) addLinkAndDiscoverTest(UrlParserTest) -addLinkAndDiscoverTest(ServerTest engine) +addLinkAndDiscoverTest(ServerTest engine server) addLinkAndDiscoverTest(ExecuteUpdateTest engine) From 0d4fa20aeb9d764ed975de62d76b5d69462286c9 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Thu, 30 Jan 2025 10:16:43 +0100 Subject: [PATCH 04/25] We have the named cache compiling, now let's use it. 
Signed-off-by: Johannes Kalmbach --- benchmark/GroupByHashMapBenchmark.cpp | 2 +- src/engine/CMakeLists.txt | 5 +++- src/engine/Describe.cpp | 2 +- src/engine/NamedQueryCache.cpp | 5 ++++ src/engine/NamedQueryCache.h | 28 +++++++++++++++++++ src/engine/QueryExecutionContext.cpp | 25 +++++++++++++++++ src/engine/QueryExecutionContext.h | 25 +++++++++++------ src/engine/Server.cpp | 8 +++--- src/engine/Server.h | 2 ++ {test => src}/engine/ValuesForTesting.h | 0 test/OperationTest.cpp | 35 +++++++++++++++++++----- test/engine/BindTest.cpp | 2 +- test/engine/CartesianProductJoinTest.cpp | 2 +- test/engine/LazyGroupByTest.cpp | 2 +- test/engine/QueryExecutionTreeTest.cpp | 2 +- test/util/IdTableHelpers.cpp | 2 +- test/util/IdTableHelpers.h | 2 +- test/util/IndexTestHelpers.cpp | 7 +++-- 18 files changed, 125 insertions(+), 31 deletions(-) create mode 100644 src/engine/NamedQueryCache.cpp create mode 100644 src/engine/NamedQueryCache.h create mode 100644 src/engine/QueryExecutionContext.cpp rename {test => src}/engine/ValuesForTesting.h (100%) diff --git a/benchmark/GroupByHashMapBenchmark.cpp b/benchmark/GroupByHashMapBenchmark.cpp index 780785e9bc..1335ebc5bd 100644 --- a/benchmark/GroupByHashMapBenchmark.cpp +++ b/benchmark/GroupByHashMapBenchmark.cpp @@ -6,12 +6,12 @@ #include #include "../benchmark/infrastructure/Benchmark.h" -#include "../test/engine/ValuesForTesting.h" #include "../test/util/IdTableHelpers.h" #include "../test/util/IndexTestHelpers.h" #include "engine/GroupBy.h" #include "engine/Sort.h" #include "engine/Values.h" +#include "engine/ValuesForTesting.h" #include "engine/sparqlExpressions/AggregateExpression.h" #include "engine/sparqlExpressions/GroupConcatExpression.h" #include "engine/sparqlExpressions/LiteralExpression.h" diff --git a/src/engine/CMakeLists.txt b/src/engine/CMakeLists.txt index e81c834303..d517c0f239 100644 --- a/src/engine/CMakeLists.txt +++ b/src/engine/CMakeLists.txt @@ -14,5 +14,8 @@ add_library(engine CartesianProductJoin.cpp 
TextIndexScanForWord.cpp TextIndexScanForEntity.cpp TextLimit.cpp LazyGroupBy.cpp GroupByHashMapOptimization.cpp SpatialJoin.cpp CountConnectedSubgraphs.cpp SpatialJoinAlgorithms.cpp PathSearch.cpp ExecuteUpdate.cpp - Describe.cpp GraphStoreProtocol.cpp) + Describe.cpp GraphStoreProtocol.cpp + NamedQueryCache.cpp + NamedQueryCache.h + QueryExecutionContext.cpp) qlever_target_link_libraries(engine util index parser sparqlExpressions http SortPerformanceEstimator Boost::iostreams s2) diff --git a/src/engine/Describe.cpp b/src/engine/Describe.cpp index a0c43222d2..61960d90b2 100644 --- a/src/engine/Describe.cpp +++ b/src/engine/Describe.cpp @@ -4,9 +4,9 @@ #include "engine/Describe.h" -#include "../../test/engine/ValuesForTesting.h" #include "engine/IndexScan.h" #include "engine/Join.h" +#include "engine/ValuesForTesting.h" // _____________________________________________________________________________ Describe::Describe(QueryExecutionContext* qec, diff --git a/src/engine/NamedQueryCache.cpp b/src/engine/NamedQueryCache.cpp new file mode 100644 index 0000000000..9d4bbb15a3 --- /dev/null +++ b/src/engine/NamedQueryCache.cpp @@ -0,0 +1,5 @@ +// Copyright 2025, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Author: Johannes Kalmbach + +#include "NamedQueryCache.h" diff --git a/src/engine/NamedQueryCache.h b/src/engine/NamedQueryCache.h new file mode 100644 index 0000000000..f9d702b4e4 --- /dev/null +++ b/src/engine/NamedQueryCache.h @@ -0,0 +1,28 @@ +// Copyright 2025, University of Freiburg, +// Chair of Algorithms and Data Structures. 
+// Author: Johannes Kalmbach +#pragma once + +#include "engine/ValuesForTesting.h" +#include "util/Cache.h" +#include "util/Synchronized.h" + +class NamedQueryCache { + using Key = std::string; + using Value = std::shared_ptr; + using Cache = + ad_utility::HashMap>; + + ad_utility::Synchronized cache_; + + void store(const Key& key, Value value) { + (*cache_.wlock())[key] = std::move(value); + } + Value get(const Key& key) { + auto l = cache_.wlock(); + auto it = l->find(key); + // TODO Proper error message. + AD_CONTRACT_CHECK(it != l->end()); + return it->second; + } +}; diff --git a/src/engine/QueryExecutionContext.cpp b/src/engine/QueryExecutionContext.cpp new file mode 100644 index 0000000000..d7c4867898 --- /dev/null +++ b/src/engine/QueryExecutionContext.cpp @@ -0,0 +1,25 @@ +// Copyright 2025, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Author: Johannes Kalmbach + +#include "engine/QueryExecutionContext.h" + +// _____________________________________________________________________________ +QueryExecutionContext::QueryExecutionContext( + const Index& index, QueryResultCache* const cache, + ad_utility::AllocatorWithLimit allocator, + SortPerformanceEstimator sortPerformanceEstimator, + NamedQueryCache* namedCache, + std::function updateCallback, const bool pinSubtrees, + const bool pinResult) + : _pinSubtrees(pinSubtrees), + _pinResult(pinResult), + _index(index), + _subtreeCache(cache), + _allocator(std::move(allocator)), + _sortPerformanceEstimator(sortPerformanceEstimator), + updateCallback_(std::move(updateCallback)), + namedQueryCache_{namedCache} {} + +// _____________________________________________________________________________ +QueryExecutionContext::~QueryExecutionContext() = default; diff --git a/src/engine/QueryExecutionContext.h b/src/engine/QueryExecutionContext.h index 1e891a398f..cd1c931952 100644 --- a/src/engine/QueryExecutionContext.h +++ b/src/engine/QueryExecutionContext.h @@ -65,6 +65,9 @@ class CacheValue 
{ }; }; +// Forward declaration because of cyclic dependencies +class NamedQueryCache; + // The key for the `QueryResultCache` below. It consists of a `string` (the // actual cache key of a `QueryExecutionTree` and the index of the // `LocatedTriplesSnapshot` that was used to create the corresponding value. @@ -89,6 +92,9 @@ struct QueryCacheKey { using QueryResultCache = ad_utility::ConcurrentCache< ad_utility::LRUCache>; +// Forward declaration because of cyclic dependency +class NamedQueryCache; + // Execution context for queries. // Holds references to index and engine, implements caching. class QueryExecutionContext { @@ -97,17 +103,11 @@ class QueryExecutionContext { const Index& index, QueryResultCache* const cache, ad_utility::AllocatorWithLimit allocator, SortPerformanceEstimator sortPerformanceEstimator, + NamedQueryCache* namedCache, std::function updateCallback = [](std::string) { /* No-op by default for testing */ }, - const bool pinSubtrees = false, const bool pinResult = false) - : _pinSubtrees(pinSubtrees), - _pinResult(pinResult), - _index(index), - _subtreeCache(cache), - _allocator(std::move(allocator)), - _costFactors(), - _sortPerformanceEstimator(sortPerformanceEstimator), - updateCallback_(std::move(updateCallback)) {} + bool pinSubtrees = false, bool pinResult = false); + ~QueryExecutionContext(); QueryResultCache& getQueryTreeCache() { return *_subtreeCache; } @@ -151,6 +151,11 @@ class QueryExecutionContext { return areWebsocketUpdatesEnabled_; } + NamedQueryCache& namedQueryCache() { + AD_CORRECTNESS_CHECK(namedQueryCache_ != nullptr); + return *namedQueryCache_; + } + private: const Index& _index; @@ -170,4 +175,6 @@ class QueryExecutionContext { // mutex. 
bool areWebsocketUpdatesEnabled_ = RuntimeParameters().get<"websocket-updates-enabled">(); + + NamedQueryCache* namedQueryCache_ = nullptr; }; diff --git a/src/engine/Server.cpp b/src/engine/Server.cpp index 8c1248203c..7c3c7e2c2e 100644 --- a/src/engine/Server.cpp +++ b/src/engine/Server.cpp @@ -814,8 +814,8 @@ Awaitable Server::processQuery( << (pinSubtrees ? " [pin subresults]" : "") << "\n" << query.query_ << std::endl; QueryExecutionContext qec(index_, &cache_, allocator_, - sortPerformanceEstimator_, std::ref(messageSender), - pinSubtrees, pinResult); + sortPerformanceEstimator_, &namedQueryCache_, + std::ref(messageSender), pinSubtrees, pinResult); // The usage of an `optional` here is required because of a limitation in // Boost::Asio which forces us to use default-constructible result types with @@ -957,8 +957,8 @@ json Server::processUpdateImpl( << (pinSubtrees ? " [pin subresults]" : "") << "\n" << update.update_ << std::endl; QueryExecutionContext qec(index_, &cache_, allocator_, - sortPerformanceEstimator_, std::ref(messageSender), - pinSubtrees, pinResult); + sortPerformanceEstimator_, &namedQueryCache_, + std::ref(messageSender), pinSubtrees, pinResult); auto plannedQuery = setupPlannedQuery(update.datasetClauses_, update.update_, qec, cancellationHandle, timeLimit, requestTimer); diff --git a/src/engine/Server.h b/src/engine/Server.h index 4e0889b48a..04f31645fa 100644 --- a/src/engine/Server.h +++ b/src/engine/Server.h @@ -12,6 +12,7 @@ #include "ExecuteUpdate.h" #include "engine/Engine.h" +#include "engine/NamedQueryCache.h" #include "engine/QueryExecutionContext.h" #include "engine/QueryExecutionTree.h" #include "engine/SortPerformanceEstimator.h" @@ -68,6 +69,7 @@ class Server { unsigned short port_; std::string accessToken_; QueryResultCache cache_; + NamedQueryCache namedQueryCache_; ad_utility::AllocatorWithLimit allocator_; SortPerformanceEstimator sortPerformanceEstimator_; Index index_; diff --git a/test/engine/ValuesForTesting.h 
b/src/engine/ValuesForTesting.h similarity index 100% rename from test/engine/ValuesForTesting.h rename to src/engine/ValuesForTesting.h diff --git a/test/OperationTest.cpp b/test/OperationTest.cpp index 2afbe38a83..5f2c2c377c 100644 --- a/test/OperationTest.cpp +++ b/test/OperationTest.cpp @@ -6,6 +6,7 @@ #include +#include "engine/NamedQueryCache.h" #include "engine/NeutralElementOperation.h" #include "engine/ValuesForTesting.h" #include "global/RuntimeParameters.h" @@ -123,8 +124,13 @@ class OperationTestFixture : public testing::Test { Index index = makeTestIndex("OperationTest", std::nullopt, true, true, true, 32_B); QueryResultCache cache; + NamedQueryCache namedCache; QueryExecutionContext qec{ - index, &cache, makeAllocator(), SortPerformanceEstimator{}, + index, + &cache, + makeAllocator(), + SortPerformanceEstimator{}, + &namedCache, [&](std::string json) { jsonHistory.emplace_back(std::move(json)); }}; IdTable table = makeIdTableFromVector({{}, {}, {}}); ValuesForTesting operation{&qec, std::move(table), {}}; @@ -404,9 +410,14 @@ TEST(Operation, ensureFailedStatusIsSetWhenGeneratorThrowsException) { "ensureFailedStatusIsSetWhenGeneratorThrowsException", std::nullopt, true, true, true, ad_utility::MemorySize::bytes(16), false); QueryResultCache cache{}; + NamedQueryCache namedCache{}; QueryExecutionContext context{ - index, &cache, makeAllocator(ad_utility::MemorySize::megabytes(100)), - SortPerformanceEstimator{}, [&](std::string) { signaledUpdate = true; }}; + index, + &cache, + makeAllocator(ad_utility::MemorySize::megabytes(100)), + SortPerformanceEstimator{}, + &namedCache, + [&](std::string) { signaledUpdate = true; }}; AlwaysFailOperation operation{&context}; ad_utility::Timer timer{ad_utility::Timer::InitialStatus::Started}; auto result = @@ -431,9 +442,14 @@ TEST(Operation, ensureSignalUpdateIsOnlyCalledEvery50msAndAtTheEnd) { "ensureSignalUpdateIsOnlyCalledEvery50msAndAtTheEnd", std::nullopt, true, true, true, ad_utility::MemorySize::bytes(16), 
false); QueryResultCache cache{}; + NamedQueryCache namedCache{}; QueryExecutionContext context{ - index, &cache, makeAllocator(ad_utility::MemorySize::megabytes(100)), - SortPerformanceEstimator{}, [&](std::string) { ++updateCallCounter; }}; + index, + &cache, + makeAllocator(ad_utility::MemorySize::megabytes(100)), + SortPerformanceEstimator{}, + &namedCache, + [&](std::string) { ++updateCallCounter; }}; CustomGeneratorOperation operation{ &context, [](const IdTable& idTable) -> Result::Generator { std::this_thread::sleep_for(50ms); @@ -474,9 +490,14 @@ TEST(Operation, ensureSignalUpdateIsCalledAtTheEndOfPartialConsumption) { "ensureSignalUpdateIsCalledAtTheEndOfPartialConsumption", std::nullopt, true, true, true, ad_utility::MemorySize::bytes(16), false); QueryResultCache cache{}; + NamedQueryCache namedCache{}; QueryExecutionContext context{ - index, &cache, makeAllocator(ad_utility::MemorySize::megabytes(100)), - SortPerformanceEstimator{}, [&](std::string) { ++updateCallCounter; }}; + index, + &cache, + makeAllocator(ad_utility::MemorySize::megabytes(100)), + SortPerformanceEstimator{}, + &namedCache, + [&](std::string) { ++updateCallCounter; }}; CustomGeneratorOperation operation{ &context, [](const IdTable& idTable) -> Result::Generator { co_yield {idTable.clone(), LocalVocab{}}; diff --git a/test/engine/BindTest.cpp b/test/engine/BindTest.cpp index 34ef0eb370..43039c47f7 100644 --- a/test/engine/BindTest.cpp +++ b/test/engine/BindTest.cpp @@ -6,8 +6,8 @@ #include "../util/IdTableHelpers.h" #include "../util/IndexTestHelpers.h" -#include "./ValuesForTesting.h" #include "engine/Bind.h" +#include "engine/ValuesForTesting.h" #include "engine/sparqlExpressions/LiteralExpression.h" using namespace sparqlExpression; diff --git a/test/engine/CartesianProductJoinTest.cpp b/test/engine/CartesianProductJoinTest.cpp index 8727aa223a..3bc01b077a 100644 --- a/test/engine/CartesianProductJoinTest.cpp +++ b/test/engine/CartesianProductJoinTest.cpp @@ -4,12 +4,12 @@ 
#include -#include "../engine/ValuesForTesting.h" #include "../util/GTestHelpers.h" #include "../util/IdTableHelpers.h" #include "../util/IndexTestHelpers.h" #include "engine/CartesianProductJoin.h" #include "engine/QueryExecutionTree.h" +#include "engine/ValuesForTesting.h" using namespace ad_utility::testing; using ad_utility::source_location; diff --git a/test/engine/LazyGroupByTest.cpp b/test/engine/LazyGroupByTest.cpp index 1b952f9f7b..23bc903618 100644 --- a/test/engine/LazyGroupByTest.cpp +++ b/test/engine/LazyGroupByTest.cpp @@ -6,9 +6,9 @@ #include "../util/IdTableHelpers.h" #include "../util/IndexTestHelpers.h" -#include "./ValuesForTesting.h" #include "engine/GroupBy.h" #include "engine/LazyGroupBy.h" +#include "engine/ValuesForTesting.h" #include "engine/sparqlExpressions/AggregateExpression.h" #include "engine/sparqlExpressions/GroupConcatExpression.h" #include "engine/sparqlExpressions/NaryExpression.h" diff --git a/test/engine/QueryExecutionTreeTest.cpp b/test/engine/QueryExecutionTreeTest.cpp index c67e17202f..d464e9b837 100644 --- a/test/engine/QueryExecutionTreeTest.cpp +++ b/test/engine/QueryExecutionTreeTest.cpp @@ -6,8 +6,8 @@ #include "../util/IdTableHelpers.h" #include "../util/IndexTestHelpers.h" -#include "./ValuesForTesting.h" #include "engine/QueryExecutionTree.h" +#include "engine/ValuesForTesting.h" using namespace ad_utility::testing; diff --git a/test/util/IdTableHelpers.cpp b/test/util/IdTableHelpers.cpp index 34ad9414e7..b4708634aa 100644 --- a/test/util/IdTableHelpers.cpp +++ b/test/util/IdTableHelpers.cpp @@ -7,7 +7,7 @@ #include #include -#include "../engine/ValuesForTesting.h" +#include "engine/ValuesForTesting.h" #include "engine/idTable/IdTable.h" #include "global/ValueId.h" #include "util/Algorithm.h" diff --git a/test/util/IdTableHelpers.h b/test/util/IdTableHelpers.h index 40e2fe8213..928b4e7b99 100644 --- a/test/util/IdTableHelpers.h +++ b/test/util/IdTableHelpers.h @@ -13,7 +13,6 @@ #include #include -#include 
"../engine/ValuesForTesting.h" #include "./AllocatorTestHelpers.h" #include "./GTestHelpers.h" #include "./IdTestHelpers.h" @@ -22,6 +21,7 @@ #include "engine/Join.h" #include "engine/OptionalJoin.h" #include "engine/QueryExecutionTree.h" +#include "engine/ValuesForTesting.h" #include "engine/idTable/IdTable.h" #include "global/ValueId.h" #include "util/Algorithm.h" diff --git a/test/util/IndexTestHelpers.cpp b/test/util/IndexTestHelpers.cpp index 8e1a693209..26c2698be0 100644 --- a/test/util/IndexTestHelpers.cpp +++ b/test/util/IndexTestHelpers.cpp @@ -6,6 +6,7 @@ #include "./GTestHelpers.h" #include "./TripleComponentTestHelpers.h" +#include "engine/NamedQueryCache.h" #include "global/SpecialIds.h" #include "index/IndexImpl.h" #include "util/ProgressBar.h" @@ -277,10 +278,11 @@ QueryExecutionContext* getQec(std::optional turtleInput, TypeErasedCleanup cleanup_; std::unique_ptr index_; std::unique_ptr cache_; + std::unique_ptr namedCache_; std::unique_ptr qec_ = std::make_unique( *index_, cache_.get(), makeAllocator(MemorySize::megabytes(100)), - SortPerformanceEstimator{}); + SortPerformanceEstimator{}, namedCache_.get()); }; using Key = std::tuple, bool, bool, bool, @@ -308,7 +310,8 @@ QueryExecutionContext* getQec(std::optional turtleInput, usePatterns, usePrefixCompression, blocksizePermutations, createTextIndex, addWordsFromLiterals, contentsOfWordsFileAndDocsFile)), - std::make_unique()}); + std::make_unique(), + std::make_unique()}); } auto* qec = contextMap.at(key).qec_.get(); qec->getIndex().getImpl().setGlobalIndexAndComparatorOnlyForTesting(); From befc33dcd9aef155415b8c453f5d09b87cdaf1b3 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Thu, 30 Jan 2025 13:12:14 +0100 Subject: [PATCH 05/25] This seems to work, but copies IdTables etc. 
Signed-off-by: Johannes Kalmbach --- src/engine/CheckUsePatternTrick.cpp | 8 +- src/engine/NamedQueryCache.h | 23 ++++- src/engine/QueryPlanner.cpp | 12 +++ src/engine/QueryPlanner.h | 1 + src/engine/Server.cpp | 34 ++++++- src/engine/ValuesForTesting.h | 93 ++++++++++++------- src/global/Constants.h | 4 + src/parser/CMakeLists.txt | 2 + src/parser/GraphPatternOperation.h | 5 +- src/parser/NamedCachedQuery.cpp | 5 + src/parser/NamedCachedQuery.h | 28 ++++++ .../sparqlParser/SparqlQleverVisitor.cpp | 38 ++++++++ src/parser/sparqlParser/SparqlQleverVisitor.h | 51 +++++----- 13 files changed, 234 insertions(+), 70 deletions(-) create mode 100644 src/parser/NamedCachedQuery.cpp create mode 100644 src/parser/NamedCachedQuery.h diff --git a/src/engine/CheckUsePatternTrick.cpp b/src/engine/CheckUsePatternTrick.cpp index e7da58ea14..583976c829 100644 --- a/src/engine/CheckUsePatternTrick.cpp +++ b/src/engine/CheckUsePatternTrick.cpp @@ -72,9 +72,11 @@ bool isVariableContainedInGraphPatternOperation( } else if constexpr (std::is_same_v) { return ad_utility::contains(arg.visibleVariables_, variable); } else { - static_assert( - std::is_same_v || std::is_same_v || - std::is_same_v || std::is_same_v); + static_assert(std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v); // The `TransPath` is set up later in the query planning, when this // function should not be called anymore. 
AD_FAIL(); diff --git a/src/engine/NamedQueryCache.h b/src/engine/NamedQueryCache.h index f9d702b4e4..03b9c3ebe8 100644 --- a/src/engine/NamedQueryCache.h +++ b/src/engine/NamedQueryCache.h @@ -8,21 +8,34 @@ #include "util/Synchronized.h" class NamedQueryCache { + public: + struct Value { + IdTable result_; + VariableToColumnMap varToColMap_; + std::vector resultSortedOn_; + }; using Key = std::string; - using Value = std::shared_ptr; - using Cache = - ad_utility::HashMap>; + using Cache = ad_utility::HashMap; + private: ad_utility::Synchronized cache_; + public: void store(const Key& key, Value value) { - (*cache_.wlock())[key] = std::move(value); + (*cache_.wlock()).insert_or_assign(key, std::move(value)); } - Value get(const Key& key) { + const Value& get(const Key& key) { auto l = cache_.wlock(); auto it = l->find(key); // TODO Proper error message. AD_CONTRACT_CHECK(it != l->end()); return it->second; } + + std::shared_ptr getOperation(const Key& key, + QueryExecutionContext* ctx) { + const auto& [table, map, sortedOn] = get(key); + return std::make_shared( + ctx, std::make_shared(table.clone()), map); + } }; diff --git a/src/engine/QueryPlanner.cpp b/src/engine/QueryPlanner.cpp index 43caa71f02..7a349045d1 100644 --- a/src/engine/QueryPlanner.cpp +++ b/src/engine/QueryPlanner.cpp @@ -13,6 +13,7 @@ #include #include +#include "NamedQueryCache.h" #include "backports/algorithm.h" #include "engine/Bind.h" #include "engine/CartesianProductJoin.h" @@ -2408,6 +2409,8 @@ void QueryPlanner::GraphPatternPlanner::graphPatternOperationVisitor(Arg& arg) { visitDescribe(arg); } else if constexpr (std::is_same_v) { visitSpatialSearch(arg); + } else if constexpr (std::is_same_v) { + visitNamedCachedQuery(arg); } else { static_assert(std::is_same_v); visitBasicGraphPattern(arg); @@ -2581,6 +2584,15 @@ void QueryPlanner::GraphPatternPlanner::visitSpatialSearch( visitGroupOptionalOrMinus(std::move(candidatesOut)); } +// 
_____________________________________________________________________________ +void QueryPlanner::GraphPatternPlanner::visitNamedCachedQuery( + parsedQuery::NamedCachedQuery& arg) { + auto candidate = SubtreePlan{ + planner_._qec, planner_._qec->namedQueryCache().getOperation( + arg.validateAndGetIdentifier(), planner_._qec)}; + visitGroupOptionalOrMinus(std::vector{std::move(candidate)}); +} + // _______________________________________________________________ void QueryPlanner::GraphPatternPlanner::visitUnion(parsedQuery::Union& arg) { // TODO here we could keep all the candidates, and create a diff --git a/src/engine/QueryPlanner.h b/src/engine/QueryPlanner.h index b51523baed..72fb009716 100644 --- a/src/engine/QueryPlanner.h +++ b/src/engine/QueryPlanner.h @@ -543,6 +543,7 @@ class QueryPlanner { void visitTransitivePath(parsedQuery::TransPath& transitivePath); void visitPathSearch(parsedQuery::PathQuery& config); void visitSpatialSearch(parsedQuery::SpatialQuery& config); + void visitNamedCachedQuery(parsedQuery::NamedCachedQuery& config); void visitUnion(parsedQuery::Union& un); void visitSubquery(parsedQuery::Subquery& subquery); void visitDescribe(parsedQuery::Describe& describe); diff --git a/src/engine/Server.cpp b/src/engine/Server.cpp index 7c3c7e2c2e..8f2abaec1c 100644 --- a/src/engine/Server.cpp +++ b/src/engine/Server.cpp @@ -361,6 +361,11 @@ Awaitable Server::process( const auto parsedHttpRequest = parseHttpRequest(request); const auto& parameters = parsedHttpRequest.parameters_; + LOG(INFO) << "Logging all the parameters" << std::endl; + for (const auto& [key, value] : parameters) { + LOG(INFO) << key << ":" << value.at(0) << std::endl; + } + // We always want to call `Server::checkParameter` with the same first // parameter. auto checkParameter = std::bind_front(&ad_utility::url_parser::checkParameter, @@ -809,9 +814,17 @@ Awaitable Server::processQuery( // Do the query planning. 
This creates a `QueryExecutionTree`, which will // then be used to process the query. auto [pinSubtrees, pinResult] = determineResultPinning(params); + for (auto [key, value] : params) { + LOG(INFO) << "key : " << key << ": " << value.at(0) << std::endl; + } + std::optional pinNamed = + ad_utility::url_parser::checkParameter(params, "pin-named-query", {}); LOG(INFO) << "Processing the following SPARQL query:" << (pinResult ? " [pin result]" : "") << (pinSubtrees ? " [pin subresults]" : "") << "\n" + << (pinNamed ? absl::StrCat(" [pin named as ]", pinNamed.value()) + : "") + << "\n" << query.query_ << std::endl; QueryExecutionContext qec(index_, &cache_, allocator_, sortPerformanceEstimator_, &namedQueryCache_, @@ -866,10 +879,23 @@ Awaitable Server::processQuery( qet.getRootOperation()->getLimit()._offset); limitOffset._offset -= qet.getRootOperation()->getLimit()._offset; - // This actually processes the query and sends the result in the requested - // format. - co_await sendStreamableResponse(request, send, mediaType, plannedQuery, qet, - requestTimer, cancellationHandle); + if (pinNamed.has_value()) { + auto result = qet.getResult(false); + auto t = + NamedQueryCache::Value(result->idTable().clone(), + qet.getVariableColumns(), result->sortedBy()); + qec.namedQueryCache().store(pinNamed.value(), std::move(t)); + + auto response = ad_utility::httpUtils::createOkResponse( + "successfully pinned the query result", request, + ad_utility::MediaType::textPlain); + co_await send(response); + } else { + // This actually processes the query and sends the result in the requested + // format. + co_await sendStreamableResponse(request, send, mediaType, plannedQuery, qet, + requestTimer, cancellationHandle); + } // Print the runtime info. This needs to be done after the query // was computed. 
diff --git a/src/engine/ValuesForTesting.h b/src/engine/ValuesForTesting.h index 097ccd9c78..c9ad456720 100644 --- a/src/engine/ValuesForTesting.h +++ b/src/engine/ValuesForTesting.h @@ -4,19 +4,24 @@ #pragma once +#include + #include "engine/Operation.h" #include "engine/QueryExecutionContext.h" #include "engine/Result.h" #include "util/Algorithm.h" #include "util/Random.h" +auto tables(auto& tables_) { + return ql::views::transform(tables_, ad_utility::dereference); +} // An operation that yields a given `IdTable` as its result. It is used for // unit testing purposes when we need to specify the subtrees of another // operation. class ValuesForTesting : public Operation { private: - std::vector tables_; - std::vector> variables_; + std::vector> tables_; + VariableToColumnMap variables_; bool supportsLimit_; // Those can be manually overwritten for testing using the respective getters. size_t sizeEstimate_; @@ -27,16 +32,14 @@ class ValuesForTesting : public Operation { // Create an operation that has as its result the given `table` and the given // `variables`. The number of variables must be equal to the number // of columns in the table. 
- explicit ValuesForTesting(QueryExecutionContext* ctx, IdTable table, - std::vector> variables, - bool supportsLimit = false, - std::vector sortedColumns = {}, - LocalVocab localVocab = LocalVocab{}, - std::optional multiplicity = std::nullopt, - bool forceFullyMaterialized = false) + explicit ValuesForTesting( + QueryExecutionContext* ctx, IdTable table, + const std::vector>& variables, + bool supportsLimit = false, std::vector sortedColumns = {}, + LocalVocab localVocab = LocalVocab{}, + std::optional multiplicity = std::nullopt, + bool forceFullyMaterialized = false) : Operation{ctx}, - tables_{}, - variables_{std::move(variables)}, supportsLimit_{supportsLimit}, sizeEstimate_{table.numRows()}, costEstimate_{table.numRows()}, @@ -45,17 +48,32 @@ class ValuesForTesting : public Operation { multiplicity_{multiplicity}, forceFullyMaterialized_{forceFullyMaterialized} { AD_CONTRACT_CHECK(variables_.size() == table.numColumns()); - tables_.push_back(std::move(table)); + tables_.push_back(std::make_shared(std::move(table))); + variables_ = computeVarMapFromVector(variables); } + + ValuesForTesting(QueryExecutionContext* ctx, + std::shared_ptr table, + VariableToColumnMap variables, + std::vector sortedColumns = {}, + LocalVocab localVocab = LocalVocab{}) + : Operation{ctx}, + tables_{std::move(table)}, + variables_{std::move(variables)}, + supportsLimit_{false}, + sizeEstimate_{tables_.at(0)->numRows()}, + costEstimate_{0}, + resultSortedColumns_{std::move(sortedColumns)}, + localVocab_{std::move(localVocab)}, + multiplicity_{}, + forceFullyMaterialized_{false} {} explicit ValuesForTesting(QueryExecutionContext* ctx, - std::vector tables, + std::vector idTables, std::vector> variables, bool unlikelyToFitInCache = false, std::vector sortedColumns = {}, LocalVocab localVocab = LocalVocab{}) : Operation{ctx}, - tables_{std::move(tables)}, - variables_{std::move(variables)}, supportsLimit_{false}, sizeEstimate_{0}, costEstimate_{0}, @@ -63,15 +81,20 @@ class 
ValuesForTesting : public Operation { resultSortedColumns_{std::move(sortedColumns)}, localVocab_{std::move(localVocab)}, multiplicity_{std::nullopt} { - AD_CONTRACT_CHECK(ql::ranges::all_of(tables_, [this](const IdTable& table) { - return variables_.size() == table.numColumns(); - })); + for (auto& table : idTables) { + tables_.push_back(std::make_shared(std::move(table))); + } + AD_CONTRACT_CHECK( + ql::ranges::all_of(tables(tables_), [this](const IdTable& table) { + return variables_.size() == table.numColumns(); + })); size_t totalRows = 0; - for (const IdTable& idTable : tables_) { + for (const IdTable& idTable : tables(tables_)) { totalRows += idTable.numRows(); } sizeEstimate_ = totalRows; costEstimate_ = totalRows; + variables_ = computeVarMapFromVector(variables); } // Accessors for the estimates for manual testing. @@ -85,7 +108,7 @@ class ValuesForTesting : public Operation { AD_CORRECTNESS_CHECK(!supportsLimit_); std::vector clones; clones.reserve(tables_.size()); - for (const IdTable& idTable : tables_) { + for (const IdTable& idTable : tables(tables_)) { clones.push_back(idTable.clone()); } auto generator = [](auto idTables, @@ -98,15 +121,15 @@ class ValuesForTesting : public Operation { } std::optional optionalTable; if (tables_.size() > 1) { - IdTable aggregateTable{tables_.at(0).numColumns(), - tables_.at(0).getAllocator()}; - for (const IdTable& idTable : tables_) { + IdTable aggregateTable{tables(tables_)[0].numColumns(), + tables(tables_)[0].getAllocator()}; + for (const IdTable& idTable : tables(tables_)) { aggregateTable.insertAtEnd(idTable); } optionalTable = std::move(aggregateTable); } auto table = optionalTable.has_value() ? 
std::move(optionalTable).value() - : tables_.at(0).clone(); + : tables(tables_)[0].clone(); if (supportsLimit_) { table.erase(table.begin() + getLimit().upperBound(table.size()), table.end()); @@ -128,13 +151,13 @@ class ValuesForTesting : public Operation { std::stringstream str; auto numRowsView = tables_ | ql::views::transform(&IdTable::numRows); auto totalNumRows = std::reduce(numRowsView.begin(), numRowsView.end(), 0); - auto numCols = tables_.empty() ? 0 : tables_.at(0).numColumns(); + auto numCols = tables_.empty() ? 0 : tables_.at(0)->numColumns(); str << "Values for testing with " << numCols << " columns and " << totalNumRows << " rows. "; if (totalNumRows > 1000) { str << ad_utility::FastRandomIntGenerator{}(); } else { - for (const IdTable& idTable : tables_) { + for (const IdTable& idTable : tables(tables_)) { for (size_t i = 0; i < idTable.numColumns(); ++i) { for (Id entry : idTable.getColumn(i)) { str << entry << ' '; @@ -154,7 +177,7 @@ class ValuesForTesting : public Operation { size_t getResultWidth() const override { // Assume a width of 1 if we have no tables and no other information to base // it on because 0 would otherwise cause stuff to break. - return tables_.empty() ? 1 : tables_.at(0).numColumns(); + return tables_.empty() ? 
1 : tables_.at(0)->numColumns(); } vector resultSortedOn() const override { @@ -179,27 +202,31 @@ class ValuesForTesting : public Operation { bool knownEmptyResult() override { return ql::ranges::all_of( - tables_, [](const IdTable& table) { return table.empty(); }); + tables(tables_), [](const IdTable& table) { return table.empty(); }); } private: - VariableToColumnMap computeVariableToColumnMap() const override { + VariableToColumnMap computeVarMapFromVector( + const std::vector>& vars) const { VariableToColumnMap m; - for (auto i = ColumnIndex{0}; i < variables_.size(); ++i) { - if (!variables_.at(i).has_value()) { + for (auto i = ColumnIndex{0}; i < vars.size(); ++i) { + if (!vars.at(i).has_value()) { continue; } bool containsUndef = - ql::ranges::any_of(tables_, [&i](const IdTable& table) { + ql::ranges::any_of(tables(tables_), [&i](const IdTable& table) { return ql::ranges::any_of(table.getColumn(i), [](Id id) { return id.isUndefined(); }); }); using enum ColumnIndexAndTypeInfo::UndefStatus; - m[variables_.at(i).value()] = ColumnIndexAndTypeInfo{ + m[vars.at(i).value()] = ColumnIndexAndTypeInfo{ i, containsUndef ? 
PossiblyUndefined : AlwaysDefined}; } return m; } + VariableToColumnMap computeVariableToColumnMap() const override { + return variables_; + } std::vector resultSortedColumns_; LocalVocab localVocab_; diff --git a/src/global/Constants.h b/src/global/Constants.h index 5a79575d82..dde13b65ee 100644 --- a/src/global/Constants.h +++ b/src/global/Constants.h @@ -74,6 +74,10 @@ constexpr inline std::string_view DEFAULT_GRAPH_IRI = constexpr inline std::string_view QLEVER_INTERNAL_GRAPH_IRI = makeQleverInternalIriConst<"internal-graph">(); +constexpr inline std::string_view NAMED_CACHED_QUERY_PREFIX = + ad_utility::constexprStrCat<"<", QLEVER_INTERNAL_PREFIX_URL, + "named-cached-query-">(); + constexpr inline std::pair GEOF_PREFIX = { "geof:", "http://www.opengis.net/def/function/geosparql/"}; constexpr inline std::pair MATH_PREFIX = { diff --git a/src/parser/CMakeLists.txt b/src/parser/CMakeLists.txt index 6fa123a793..57f519f6cf 100644 --- a/src/parser/CMakeLists.txt +++ b/src/parser/CMakeLists.txt @@ -31,6 +31,8 @@ add_library(parser Literal.cpp LiteralOrIri.cpp DatasetClauses.cpp + NamedCachedQuery.cpp + NamedCachedQuery.h ) qlever_target_link_libraries(parser sparqlParser parserData sparqlExpressions rdfEscaping re2::re2 util engine index) diff --git a/src/parser/GraphPatternOperation.h b/src/parser/GraphPatternOperation.h index 8f7d4a8505..9c454302ec 100644 --- a/src/parser/GraphPatternOperation.h +++ b/src/parser/GraphPatternOperation.h @@ -13,6 +13,7 @@ #include "engine/sparqlExpressions/SparqlExpressionPimpl.h" #include "parser/DatasetClauses.h" #include "parser/GraphPattern.h" +#include "parser/NamedCachedQuery.h" #include "parser/PathQuery.h" #include "parser/SpatialQuery.h" #include "parser/TripleComponent.h" @@ -178,8 +179,8 @@ struct Bind { // class actually becomes `using GraphPatternOperation = std::variant<...>` using GraphPatternOperationVariant = std::variant; + Values, Service, PathQuery, SpatialQuery, NamedCachedQuery, + Minus, GroupGraphPattern, 
Describe>; struct GraphPatternOperation : public GraphPatternOperationVariant, public VisitMixin { diff --git a/src/parser/NamedCachedQuery.cpp b/src/parser/NamedCachedQuery.cpp new file mode 100644 index 0000000000..a1db7e95c3 --- /dev/null +++ b/src/parser/NamedCachedQuery.cpp @@ -0,0 +1,5 @@ +// +// Created by kalmbacj on 1/30/25. +// + +#include "NamedCachedQuery.h" diff --git a/src/parser/NamedCachedQuery.h b/src/parser/NamedCachedQuery.h new file mode 100644 index 0000000000..40a9604d12 --- /dev/null +++ b/src/parser/NamedCachedQuery.h @@ -0,0 +1,28 @@ +// Copyright 2025, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Author: Johannes Kalmbach + +#pragma once + +#include "parser/MagicServiceQuery.h" + +namespace parsedQuery { +class NamedCachedQuery : public MagicServiceQuery { + std::string identifier_; + + public: + NamedCachedQuery(std::string identifier) + : identifier_{std::move(identifier)} {} + + void addParameter([[maybe_unused]] const SparqlTriple& triple) override { + throw std::runtime_error{ + "The body of a named cache query request must be empty"}; + } + + const std::string& validateAndGetIdentifier() const { + // TODO Better error messages. 
+ AD_CORRECTNESS_CHECK(!childGraphPattern_.has_value()); + return identifier_; + } +}; +} // namespace parsedQuery diff --git a/src/parser/sparqlParser/SparqlQleverVisitor.cpp b/src/parser/sparqlParser/SparqlQleverVisitor.cpp index e22454cfd7..34cd48a25a 100644 --- a/src/parser/sparqlParser/SparqlQleverVisitor.cpp +++ b/src/parser/sparqlParser/SparqlQleverVisitor.cpp @@ -30,6 +30,7 @@ #include "parser/GraphPatternOperation.h" #include "parser/MagicServiceIriConstants.h" #include "parser/MagicServiceQuery.h" +#include "parser/NamedCachedQuery.h" #include "parser/RdfParser.h" #include "parser/SparqlParser.h" #include "parser/SpatialQuery.h" @@ -891,6 +892,40 @@ GraphPatternOperation Visitor::visitPathQuery( return pathQuery; } +// _____________________________________________________________________________ +GraphPatternOperation Visitor::visitNamedCachedQuery( + Parser::ServiceGraphPatternContext* ctx) { + auto parseContent = [ctx](parsedQuery::NamedCachedQuery& namedQuery, + const parsedQuery::GraphPatternOperation& op) { + if (std::holds_alternative(op)) { + namedQuery.addBasicPattern(std::get(op)); + } else if (std::holds_alternative(op)) { + namedQuery.addGraph(op); + } else { + reportError(ctx, + "Unsupported element in named cached query." 
+ "A named cached query currently must have an empty body"); + } + }; + + auto iri = std::get(visit(ctx->varOrIri())); + auto s = iri.toSparql(); + AD_CORRECTNESS_CHECK(s.starts_with(NAMED_CACHED_QUERY_PREFIX)); + auto view = std::string_view{s}; + // Remove the prefix and the trailing ">" + view.remove_prefix(NAMED_CACHED_QUERY_PREFIX.size()); + view.remove_suffix(1); + + parsedQuery::GraphPattern graphPattern = visit(ctx->groupGraphPattern()); + parsedQuery::NamedCachedQuery namedQuery{std::string{view}}; + for (const auto& op : graphPattern._graphPatterns) { + parseContent(namedQuery, op); + } + [[maybe_unused]] const auto& validated = + namedQuery.validateAndGetIdentifier(); + return namedQuery; +} + GraphPatternOperation Visitor::visitSpatialQuery( Parser::ServiceGraphPatternContext* ctx) { auto parseSpatialQuery = [ctx](parsedQuery::SpatialQuery& spatialQuery, @@ -951,6 +986,9 @@ GraphPatternOperation Visitor::visit(Parser::ServiceGraphPatternContext* ctx) { return visitPathQuery(ctx); } else if (serviceIri.toStringRepresentation() == SPATIAL_SEARCH_IRI) { return visitSpatialQuery(ctx); + } else if (serviceIri.toStringRepresentation().starts_with( + NAMED_CACHED_QUERY_PREFIX)) { + return visitNamedCachedQuery(ctx); } // Parse the body of the SERVICE query. 
Add the visible variables from the // SERVICE clause to the visible variables so far, but also remember them diff --git a/src/parser/sparqlParser/SparqlQleverVisitor.h b/src/parser/sparqlParser/SparqlQleverVisitor.h index fb1cb9c05c..697d353f65 100644 --- a/src/parser/sparqlParser/SparqlQleverVisitor.h +++ b/src/parser/sparqlParser/SparqlQleverVisitor.h @@ -264,6 +264,9 @@ class SparqlQleverVisitor { GraphPatternOperation visitSpatialQuery( Parser::ServiceGraphPatternContext* ctx); + GraphPatternOperation visitNamedCachedQuery( + Parser::ServiceGraphPatternContext* ctx); + parsedQuery::GraphPatternOperation visit(Parser::BindContext* ctx); parsedQuery::GraphPatternOperation visit(Parser::InlineDataContext* ctx); @@ -343,10 +346,10 @@ class SparqlQleverVisitor { PropertyPath visit(Parser::PathEltOrInverseContext* ctx); - // NOTE: The `visit` overloads marked `[[noreturn]]` always throw an exception - // because the corresponding feature is not (yet) supported by QLever. Most - // of them have a return type of `void`. Some of the don't, in order to make - // the usage of abstractions like `visitAlternative` easier. + // NOTE: The `visit` overloads marked `[[noreturn]]` always throw an + // exception because the corresponding feature is not (yet) supported by + // QLever. Most of them have a return type of `void`. Some of the don't, in + // order to make the usage of abstractions like `visitAlternative` easier. [[noreturn]] static void visit(Parser::PathModContext* ctx); @@ -485,8 +488,8 @@ class SparqlQleverVisitor { static std::string currentTimeAsXsdString(); // Member starTime_ is needed for the NOW expression. All calls within - // the query execution reference it. The underlying date time format is e.g.: - // 2011-01-10T14:45:13.815-05:00 + // the query execution reference it. 
The underlying date time format is + // e.g.: 2011-01-10T14:45:13.815-05:00 std::string startTime_ = currentTimeAsXsdString(); template @@ -503,15 +506,16 @@ class SparqlQleverVisitor { // Get the part of the original input string that pertains to the given // context. This is necessary because ANTLR's `getText()` only provides that - // part with *all* whitespace removed. Preserving the whitespace is important - // for readability (for example, in an error message), and even more so when - // using such parts for further processing (like the body of a SERVICE query, - // which is not valid SPARQL anymore when you remove all whitespace). + // part with *all* whitespace removed. Preserving the whitespace is + // important for readability (for example, in an error message), and even + // more so when using such parts for further processing (like the body of a + // SERVICE query, which is not valid SPARQL anymore when you remove all + // whitespace). static std::string getOriginalInputForContext( const antlr4::ParserRuleContext* context); - // Process an IRI function call. This is used in both `visitFunctionCall` and - // `visitIriOrFunction`. + // Process an IRI function call. This is used in both `visitFunctionCall` + // and `visitIriOrFunction`. static ExpressionPtr processIriFunctionCall( const TripleComponent::Iri& iri, std::vector argList, const antlr4::ParserRuleContext*); @@ -555,13 +559,14 @@ class SparqlQleverVisitor { template auto visitOptional(Ctx* ctx) -> std::optional; - /// If `ctx` is not `nullptr`, visit it, convert the result to `Intermediate` - /// and assign it to `*target`. The case where `Intermediate!=Target` is - /// useful, when the result of `visit(ctx)` cannot be converted to `Target`, - /// but the conversion chain `VisitResult -> Intermediate -> Target` is valid. 
- /// For example when `visit(ctx)` yields `A`, `A` is explicitly convertible to - /// `B` and `Target` is `optional`, then `B` has to be specified as - /// `Intermediate` (see for example the implementation of `visitAlternative`). + /// If `ctx` is not `nullptr`, visit it, convert the result to + /// `Intermediate` and assign it to `*target`. The case where + /// `Intermediate!=Target` is useful, when the result of `visit(ctx)` cannot + /// be converted to `Target`, but the conversion chain `VisitResult -> + /// Intermediate -> Target` is valid. For example when `visit(ctx)` yields + /// `A`, `A` is explicitly convertible to `B` and `Target` is `optional`, + /// then `B` has to be specified as `Intermediate` (see for example the + /// implementation of `visitAlternative`). template void visitIf(Target* target, Ctx* ctx); @@ -581,8 +586,8 @@ class SparqlQleverVisitor { template Triples parseTriplesConstruction(Context* ctx); - // If the triple is a special triple for the text index (i.e. its predicate is - // either `ql:contains-word` or `ql:contains-entity`, register the magic + // If the triple is a special triple for the text index (i.e. its predicate + // is either `ql:contains-word` or `ql:contains-entity`, register the magic // variables for the matching word and the score that will be created when // processing those triples in the query body, s.t. they can be selected as // part of the query result. @@ -593,8 +598,8 @@ class SparqlQleverVisitor { static TripleComponent visitGraphTerm(const GraphTerm& graphTerm); // If any of the variables used in `expression` did not appear previously in - // the query, add a warning or throw an exception (depending on the setting of - // the corresponding `RuntimeParameter`). + // the query, add a warning or throw an exception (depending on the setting + // of the corresponding `RuntimeParameter`). 
void warnOrThrowIfUnboundVariables(auto* ctx, const SparqlExpressionPimpl& expression, std::string_view clauseName); From 7f30e170def42200ff1211339d2830173a76c668 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Thu, 30 Jan 2025 15:20:37 +0100 Subject: [PATCH 06/25] It still works and is a little bit cleaner. TODO: 1. Unit tests 2. correct cache-clearing (or updates etc.) Signed-off-by: Johannes Kalmbach --- src/engine/CMakeLists.txt | 1 - src/engine/NamedQueryCache.cpp | 31 ++++++++++- src/engine/NamedQueryCache.h | 39 ++++++++------ src/engine/QueryExecutionContext.cpp | 3 -- src/engine/QueryExecutionContext.h | 1 - src/engine/Server.cpp | 16 ++---- src/engine/ValuesForTesting.h | 53 ++++++++----------- src/global/Constants.h | 5 +- src/parser/CMakeLists.txt | 2 - src/parser/NamedCachedQuery.cpp | 5 -- src/parser/NamedCachedQuery.h | 6 +++ .../sparqlParser/SparqlQleverVisitor.cpp | 16 +++--- src/parser/sparqlParser/SparqlQleverVisitor.h | 49 +++++++++-------- 13 files changed, 121 insertions(+), 106 deletions(-) delete mode 100644 src/parser/NamedCachedQuery.cpp diff --git a/src/engine/CMakeLists.txt b/src/engine/CMakeLists.txt index d517c0f239..91587521fd 100644 --- a/src/engine/CMakeLists.txt +++ b/src/engine/CMakeLists.txt @@ -16,6 +16,5 @@ add_library(engine CountConnectedSubgraphs.cpp SpatialJoinAlgorithms.cpp PathSearch.cpp ExecuteUpdate.cpp Describe.cpp GraphStoreProtocol.cpp NamedQueryCache.cpp - NamedQueryCache.h QueryExecutionContext.cpp) qlever_target_link_libraries(engine util index parser sparqlExpressions http SortPerformanceEstimator Boost::iostreams s2) diff --git a/src/engine/NamedQueryCache.cpp b/src/engine/NamedQueryCache.cpp index 9d4bbb15a3..6c333fcbb0 100644 --- a/src/engine/NamedQueryCache.cpp +++ b/src/engine/NamedQueryCache.cpp @@ -2,4 +2,33 @@ // Chair of Algorithms and Data Structures. 
// Author: Johannes Kalmbach -#include "NamedQueryCache.h" +#include "engine/NamedQueryCache.h" + +// _____________________________________________________________________________ +std::shared_ptr NamedQueryCache ::getOperation( + const Key& key, QueryExecutionContext* ctx) const { + const auto& [table, map, sortedOn] = get(key); + // TODO we should get rid of the copies for the IdTable (and + // probably the other members) especially for larger results). + return std::make_shared(ctx, table.clone(), map); +} + +// _____________________________________________________________________________ +auto NamedQueryCache::get(const Key& key) const -> const Value& { + auto l = cache_.wlock(); + auto it = l->find(key); + if (it == l->end()) { + throw std::runtime_error{ + absl::StrCat("The named query with the name \"", key, + "\" was not pinned to the named query cache")}; + } + return it->second; +} + +// _____________________________________________________________________________ +void NamedQueryCache::store(const Key& key, Value value) { + (*cache_.wlock()).insert_or_assign(key, std::move(value)); +} + +// _____________________________________________________________________________ +void NamedQueryCache::clear() { cache_.wlock()->clear(); } diff --git a/src/engine/NamedQueryCache.h b/src/engine/NamedQueryCache.h index 03b9c3ebe8..4d9775b1c5 100644 --- a/src/engine/NamedQueryCache.h +++ b/src/engine/NamedQueryCache.h @@ -4,11 +4,14 @@ #pragma once #include "engine/ValuesForTesting.h" -#include "util/Cache.h" #include "util/Synchronized.h" +// A simple threadsafe cache that associates query results with an explicit +// name. class NamedQueryCache { public: + // The cache value. It stores all the information required to construct a + // proper `QueryExecutionTree` later on. 
struct Value { IdTable result_; VariableToColumnMap varToColMap_; @@ -21,21 +24,23 @@ class NamedQueryCache { ad_utility::Synchronized cache_; public: - void store(const Key& key, Value value) { - (*cache_.wlock()).insert_or_assign(key, std::move(value)); - } - const Value& get(const Key& key) { - auto l = cache_.wlock(); - auto it = l->find(key); - // TODO Proper error message. - AD_CONTRACT_CHECK(it != l->end()); - return it->second; - } + // Store an explicit query result with a given `key`. Previously stored + // `value`s with the same `key` are overwritten. + void store(const Key& key, Value value); - std::shared_ptr getOperation(const Key& key, - QueryExecutionContext* ctx) { - const auto& [table, map, sortedOn] = get(key); - return std::make_shared( - ctx, std::make_shared(table.clone()), map); - } + // Clear the cache. + void clear(); + + // Retrieve the query result that is associated with the `key`. + // Throw an exception if the `key` doesn't exist. + const Value& get(const Key& key) const; + + // Retrieve the query result with the given `key` and convert it into an + // explicit `ValuesForTesting` operation that can be used as part of a + // `QueryExecutionTree`. + // TODO This can be done more efficiently if we implement a dedicated + // operation for this use case, `ValuesForTesting` currently incurs one + // (unneeded) copy per query execution. 
+ std::shared_ptr getOperation( + const Key& key, QueryExecutionContext* ctx) const; }; diff --git a/src/engine/QueryExecutionContext.cpp b/src/engine/QueryExecutionContext.cpp index d7c4867898..b1676e834a 100644 --- a/src/engine/QueryExecutionContext.cpp +++ b/src/engine/QueryExecutionContext.cpp @@ -20,6 +20,3 @@ QueryExecutionContext::QueryExecutionContext( _sortPerformanceEstimator(sortPerformanceEstimator), updateCallback_(std::move(updateCallback)), namedQueryCache_{namedCache} {} - -// _____________________________________________________________________________ -QueryExecutionContext::~QueryExecutionContext() = default; diff --git a/src/engine/QueryExecutionContext.h b/src/engine/QueryExecutionContext.h index cd1c931952..9eb632b48a 100644 --- a/src/engine/QueryExecutionContext.h +++ b/src/engine/QueryExecutionContext.h @@ -107,7 +107,6 @@ class QueryExecutionContext { std::function updateCallback = [](std::string) { /* No-op by default for testing */ }, bool pinSubtrees = false, bool pinResult = false); - ~QueryExecutionContext(); QueryResultCache& getQueryTreeCache() { return *_subtreeCache; } diff --git a/src/engine/Server.cpp b/src/engine/Server.cpp index 8f2abaec1c..d9297ea841 100644 --- a/src/engine/Server.cpp +++ b/src/engine/Server.cpp @@ -361,11 +361,6 @@ Awaitable Server::process( const auto parsedHttpRequest = parseHttpRequest(request); const auto& parameters = parsedHttpRequest.parameters_; - LOG(INFO) << "Logging all the parameters" << std::endl; - for (const auto& [key, value] : parameters) { - LOG(INFO) << key << ":" << value.at(0) << std::endl; - } - // We always want to call `Server::checkParameter` with the same first // parameter. auto checkParameter = std::bind_front(&ad_utility::url_parser::checkParameter, @@ -811,12 +806,9 @@ Awaitable Server::processQuery( auto [cancellationHandle, cancelTimeoutOnDestruction] = setupCancellationHandle(messageSender.getQueryId(), timeLimit); - // Do the query planning. 
This creates a `QueryExecutionTree`, which will - // then be used to process the query. + // Figure out, whether the query is to be pinned in the cache (either + // implicitly, or explicitly as a named query). auto [pinSubtrees, pinResult] = determineResultPinning(params); - for (auto [key, value] : params) { - LOG(INFO) << "key : " << key << ": " << value.at(0) << std::endl; - } std::optional pinNamed = ad_utility::url_parser::checkParameter(params, "pin-named-query", {}); LOG(INFO) << "Processing the following SPARQL query:" @@ -880,6 +872,8 @@ Awaitable Server::processQuery( limitOffset._offset -= qet.getRootOperation()->getLimit()._offset; if (pinNamed.has_value()) { + // The query is to be pinned in the named cache. In this case we don't + // return the result, but only pin it. auto result = qet.getResult(false); auto t = NamedQueryCache::Value(result->idTable().clone(), @@ -887,7 +881,7 @@ Awaitable Server::processQuery( qec.namedQueryCache().store(pinNamed.value(), std::move(t)); auto response = ad_utility::httpUtils::createOkResponse( - "successfully pinned the query result", request, + "Successfully pinned the query result", request, ad_utility::MediaType::textPlain); co_await send(response); } else { diff --git a/src/engine/ValuesForTesting.h b/src/engine/ValuesForTesting.h index c9ad456720..f7f78945b1 100644 --- a/src/engine/ValuesForTesting.h +++ b/src/engine/ValuesForTesting.h @@ -4,23 +4,18 @@ #pragma once -#include - #include "engine/Operation.h" #include "engine/QueryExecutionContext.h" #include "engine/Result.h" #include "util/Algorithm.h" #include "util/Random.h" -auto tables(auto& tables_) { - return ql::views::transform(tables_, ad_utility::dereference); -} // An operation that yields a given `IdTable` as its result. It is used for // unit testing purposes when we need to specify the subtrees of another // operation. 
class ValuesForTesting : public Operation { private: - std::vector> tables_; + std::vector tables_; VariableToColumnMap variables_; bool supportsLimit_; // Those can be manually overwritten for testing using the respective getters. @@ -47,33 +42,34 @@ class ValuesForTesting : public Operation { localVocab_{std::move(localVocab)}, multiplicity_{multiplicity}, forceFullyMaterialized_{forceFullyMaterialized} { - AD_CONTRACT_CHECK(variables_.size() == table.numColumns()); - tables_.push_back(std::make_shared(std::move(table))); + AD_CONTRACT_CHECK(variables.size() == table.numColumns()); + tables_.push_back(std::move(table)); variables_ = computeVarMapFromVector(variables); } - ValuesForTesting(QueryExecutionContext* ctx, - std::shared_ptr table, + ValuesForTesting(QueryExecutionContext* ctx, IdTable table, VariableToColumnMap variables, std::vector sortedColumns = {}, LocalVocab localVocab = LocalVocab{}) : Operation{ctx}, - tables_{std::move(table)}, variables_{std::move(variables)}, supportsLimit_{false}, - sizeEstimate_{tables_.at(0)->numRows()}, + sizeEstimate_{table.numRows()}, costEstimate_{0}, resultSortedColumns_{std::move(sortedColumns)}, localVocab_{std::move(localVocab)}, multiplicity_{}, - forceFullyMaterialized_{false} {} + forceFullyMaterialized_{false} { + tables_.push_back(std::move(table)); + } explicit ValuesForTesting(QueryExecutionContext* ctx, - std::vector idTables, + std::vector tables, std::vector> variables, bool unlikelyToFitInCache = false, std::vector sortedColumns = {}, LocalVocab localVocab = LocalVocab{}) : Operation{ctx}, + tables_{std::move(tables)}, supportsLimit_{false}, sizeEstimate_{0}, costEstimate_{0}, @@ -81,15 +77,12 @@ class ValuesForTesting : public Operation { resultSortedColumns_{std::move(sortedColumns)}, localVocab_{std::move(localVocab)}, multiplicity_{std::nullopt} { - for (auto& table : idTables) { - tables_.push_back(std::make_shared(std::move(table))); - } AD_CONTRACT_CHECK( - ql::ranges::all_of(tables(tables_), 
[this](const IdTable& table) { - return variables_.size() == table.numColumns(); + ql::ranges::all_of(tables_, [&variables](const IdTable& table) { + return variables.size() == table.numColumns(); })); size_t totalRows = 0; - for (const IdTable& idTable : tables(tables_)) { + for (const IdTable& idTable : tables_) { totalRows += idTable.numRows(); } sizeEstimate_ = totalRows; @@ -108,7 +101,7 @@ class ValuesForTesting : public Operation { AD_CORRECTNESS_CHECK(!supportsLimit_); std::vector clones; clones.reserve(tables_.size()); - for (const IdTable& idTable : tables(tables_)) { + for (const IdTable& idTable : tables_) { clones.push_back(idTable.clone()); } auto generator = [](auto idTables, @@ -121,15 +114,15 @@ class ValuesForTesting : public Operation { } std::optional optionalTable; if (tables_.size() > 1) { - IdTable aggregateTable{tables(tables_)[0].numColumns(), - tables(tables_)[0].getAllocator()}; - for (const IdTable& idTable : tables(tables_)) { + IdTable aggregateTable{tables_.at(0).numColumns(), + tables_.at(0).getAllocator()}; + for (const IdTable& idTable : tables_) { aggregateTable.insertAtEnd(idTable); } optionalTable = std::move(aggregateTable); } auto table = optionalTable.has_value() ? std::move(optionalTable).value() - : tables(tables_)[0].clone(); + : tables_.at(0).clone(); if (supportsLimit_) { table.erase(table.begin() + getLimit().upperBound(table.size()), table.end()); @@ -151,13 +144,13 @@ class ValuesForTesting : public Operation { std::stringstream str; auto numRowsView = tables_ | ql::views::transform(&IdTable::numRows); auto totalNumRows = std::reduce(numRowsView.begin(), numRowsView.end(), 0); - auto numCols = tables_.empty() ? 0 : tables_.at(0)->numColumns(); + auto numCols = tables_.empty() ? 0 : tables_.at(0).numColumns(); str << "Values for testing with " << numCols << " columns and " << totalNumRows << " rows. 
"; if (totalNumRows > 1000) { str << ad_utility::FastRandomIntGenerator{}(); } else { - for (const IdTable& idTable : tables(tables_)) { + for (const IdTable& idTable : tables_) { for (size_t i = 0; i < idTable.numColumns(); ++i) { for (Id entry : idTable.getColumn(i)) { str << entry << ' '; @@ -177,7 +170,7 @@ class ValuesForTesting : public Operation { size_t getResultWidth() const override { // Assume a width of 1 if we have no tables and no other information to base // it on because 0 would otherwise cause stuff to break. - return tables_.empty() ? 1 : tables_.at(0)->numColumns(); + return tables_.empty() ? 1 : tables_.at(0).numColumns(); } vector resultSortedOn() const override { @@ -202,7 +195,7 @@ class ValuesForTesting : public Operation { bool knownEmptyResult() override { return ql::ranges::all_of( - tables(tables_), [](const IdTable& table) { return table.empty(); }); + tables_, [](const IdTable& table) { return table.empty(); }); } private: @@ -214,7 +207,7 @@ class ValuesForTesting : public Operation { continue; } bool containsUndef = - ql::ranges::any_of(tables(tables_), [&i](const IdTable& table) { + ql::ranges::any_of(tables_, [&i](const IdTable& table) { return ql::ranges::any_of(table.getColumn(i), [](Id id) { return id.isUndefined(); }); }); diff --git a/src/global/Constants.h b/src/global/Constants.h index dde13b65ee..39f930b951 100644 --- a/src/global/Constants.h +++ b/src/global/Constants.h @@ -74,8 +74,11 @@ constexpr inline std::string_view DEFAULT_GRAPH_IRI = constexpr inline std::string_view QLEVER_INTERNAL_GRAPH_IRI = makeQleverInternalIriConst<"internal-graph">(); +// The prefix of a SERVICE IRI that refers to a query that has been pinned with +// an explicit name. The format currently is `ql:named-cached-query-$query-id$`. +// NOTE: This constant does not include the leading '<'. 
constexpr inline std::string_view NAMED_CACHED_QUERY_PREFIX = - ad_utility::constexprStrCat<"<", QLEVER_INTERNAL_PREFIX_URL, + ad_utility::constexprStrCat(); constexpr inline std::pair GEOF_PREFIX = { diff --git a/src/parser/CMakeLists.txt b/src/parser/CMakeLists.txt index 57f519f6cf..6fa123a793 100644 --- a/src/parser/CMakeLists.txt +++ b/src/parser/CMakeLists.txt @@ -31,8 +31,6 @@ add_library(parser Literal.cpp LiteralOrIri.cpp DatasetClauses.cpp - NamedCachedQuery.cpp - NamedCachedQuery.h ) qlever_target_link_libraries(parser sparqlParser parserData sparqlExpressions rdfEscaping re2::re2 util engine index) diff --git a/src/parser/NamedCachedQuery.cpp b/src/parser/NamedCachedQuery.cpp deleted file mode 100644 index a1db7e95c3..0000000000 --- a/src/parser/NamedCachedQuery.cpp +++ /dev/null @@ -1,5 +0,0 @@ -// -// Created by kalmbacj on 1/30/25. -// - -#include "NamedCachedQuery.h" diff --git a/src/parser/NamedCachedQuery.h b/src/parser/NamedCachedQuery.h index 40a9604d12..72e9adc64b 100644 --- a/src/parser/NamedCachedQuery.h +++ b/src/parser/NamedCachedQuery.h @@ -7,18 +7,24 @@ #include "parser/MagicServiceQuery.h" namespace parsedQuery { +// A magic service for queries that are pinned with an explicit query name. class NamedCachedQuery : public MagicServiceQuery { std::string identifier_; public: + // Construct with the name of the named query. NamedCachedQuery(std::string identifier) : identifier_{std::move(identifier)} {} + // Currently the body of the SERVICE clause must be empty. void addParameter([[maybe_unused]] const SparqlTriple& triple) override { throw std::runtime_error{ "The body of a named cache query request must be empty"}; } + // Return the name of the named query, and check, that the configuration is + // valid (which currently means, that the body of the SERVICE clause was + // empty. const std::string& validateAndGetIdentifier() const { // TODO Better error messages. 
AD_CORRECTNESS_CHECK(!childGraphPattern_.has_value()); diff --git a/src/parser/sparqlParser/SparqlQleverVisitor.cpp b/src/parser/sparqlParser/SparqlQleverVisitor.cpp index 34cd48a25a..fd09eeb34a 100644 --- a/src/parser/sparqlParser/SparqlQleverVisitor.cpp +++ b/src/parser/sparqlParser/SparqlQleverVisitor.cpp @@ -894,6 +894,7 @@ GraphPatternOperation Visitor::visitPathQuery( // _____________________________________________________________________________ GraphPatternOperation Visitor::visitNamedCachedQuery( + const TripleComponent::Iri& target, Parser::ServiceGraphPatternContext* ctx) { auto parseContent = [ctx](parsedQuery::NamedCachedQuery& namedQuery, const parsedQuery::GraphPatternOperation& op) { @@ -908,13 +909,10 @@ GraphPatternOperation Visitor::visitNamedCachedQuery( } }; - auto iri = std::get(visit(ctx->varOrIri())); - auto s = iri.toSparql(); - AD_CORRECTNESS_CHECK(s.starts_with(NAMED_CACHED_QUERY_PREFIX)); - auto view = std::string_view{s}; - // Remove the prefix and the trailing ">" + auto view = asStringViewUnsafe(target.getContent()); + AD_CORRECTNESS_CHECK(view.starts_with(NAMED_CACHED_QUERY_PREFIX)); + // Remove the prefix view.remove_prefix(NAMED_CACHED_QUERY_PREFIX.size()); - view.remove_suffix(1); parsedQuery::GraphPattern graphPattern = visit(ctx->groupGraphPattern()); parsedQuery::NamedCachedQuery namedQuery{std::string{view}}; @@ -986,9 +984,9 @@ GraphPatternOperation Visitor::visit(Parser::ServiceGraphPatternContext* ctx) { return visitPathQuery(ctx); } else if (serviceIri.toStringRepresentation() == SPATIAL_SEARCH_IRI) { return visitSpatialQuery(ctx); - } else if (serviceIri.toStringRepresentation().starts_with( - NAMED_CACHED_QUERY_PREFIX)) { - return visitNamedCachedQuery(ctx); + } else if (asStringViewUnsafe(serviceIri.getContent()) + .starts_with(NAMED_CACHED_QUERY_PREFIX)) { + return visitNamedCachedQuery(serviceIri, ctx); } // Parse the body of the SERVICE query. 
Add the visible variables from the // SERVICE clause to the visible variables so far, but also remember them diff --git a/src/parser/sparqlParser/SparqlQleverVisitor.h b/src/parser/sparqlParser/SparqlQleverVisitor.h index 697d353f65..6282994784 100644 --- a/src/parser/sparqlParser/SparqlQleverVisitor.h +++ b/src/parser/sparqlParser/SparqlQleverVisitor.h @@ -265,6 +265,7 @@ class SparqlQleverVisitor { Parser::ServiceGraphPatternContext* ctx); GraphPatternOperation visitNamedCachedQuery( + const TripleComponent::Iri& target, Parser::ServiceGraphPatternContext* ctx); parsedQuery::GraphPatternOperation visit(Parser::BindContext* ctx); @@ -346,10 +347,10 @@ class SparqlQleverVisitor { PropertyPath visit(Parser::PathEltOrInverseContext* ctx); - // NOTE: The `visit` overloads marked `[[noreturn]]` always throw an - // exception because the corresponding feature is not (yet) supported by - // QLever. Most of them have a return type of `void`. Some of the don't, in - // order to make the usage of abstractions like `visitAlternative` easier. + // NOTE: The `visit` overloads marked `[[noreturn]]` always throw an exception + // because the corresponding feature is not (yet) supported by QLever. Most + // of them have a return type of `void`. Some of the don't, in order to make + // the usage of abstractions like `visitAlternative` easier. [[noreturn]] static void visit(Parser::PathModContext* ctx); @@ -488,8 +489,8 @@ class SparqlQleverVisitor { static std::string currentTimeAsXsdString(); // Member starTime_ is needed for the NOW expression. All calls within - // the query execution reference it. The underlying date time format is - // e.g.: 2011-01-10T14:45:13.815-05:00 + // the query execution reference it. The underlying date time format is e.g.: + // 2011-01-10T14:45:13.815-05:00 std::string startTime_ = currentTimeAsXsdString(); template @@ -506,16 +507,15 @@ class SparqlQleverVisitor { // Get the part of the original input string that pertains to the given // context. 
This is necessary because ANTLR's `getText()` only provides that - // part with *all* whitespace removed. Preserving the whitespace is - // important for readability (for example, in an error message), and even - // more so when using such parts for further processing (like the body of a - // SERVICE query, which is not valid SPARQL anymore when you remove all - // whitespace). + // part with *all* whitespace removed. Preserving the whitespace is important + // for readability (for example, in an error message), and even more so when + // using such parts for further processing (like the body of a SERVICE query, + // which is not valid SPARQL anymore when you remove all whitespace). static std::string getOriginalInputForContext( const antlr4::ParserRuleContext* context); - // Process an IRI function call. This is used in both `visitFunctionCall` - // and `visitIriOrFunction`. + // Process an IRI function call. This is used in both `visitFunctionCall` and + // `visitIriOrFunction`. static ExpressionPtr processIriFunctionCall( const TripleComponent::Iri& iri, std::vector argList, const antlr4::ParserRuleContext*); @@ -559,14 +559,13 @@ class SparqlQleverVisitor { template auto visitOptional(Ctx* ctx) -> std::optional; - /// If `ctx` is not `nullptr`, visit it, convert the result to - /// `Intermediate` and assign it to `*target`. The case where - /// `Intermediate!=Target` is useful, when the result of `visit(ctx)` cannot - /// be converted to `Target`, but the conversion chain `VisitResult -> - /// Intermediate -> Target` is valid. For example when `visit(ctx)` yields - /// `A`, `A` is explicitly convertible to `B` and `Target` is `optional`, - /// then `B` has to be specified as `Intermediate` (see for example the - /// implementation of `visitAlternative`). + /// If `ctx` is not `nullptr`, visit it, convert the result to `Intermediate` + /// and assign it to `*target`. 
The case where `Intermediate!=Target` is + /// useful, when the result of `visit(ctx)` cannot be converted to `Target`, + /// but the conversion chain `VisitResult -> Intermediate -> Target` is valid. + /// For example when `visit(ctx)` yields `A`, `A` is explicitly convertible to + /// `B` and `Target` is `optional`, then `B` has to be specified as + /// `Intermediate` (see for example the implementation of `visitAlternative`). template void visitIf(Target* target, Ctx* ctx); @@ -586,8 +585,8 @@ class SparqlQleverVisitor { template Triples parseTriplesConstruction(Context* ctx); - // If the triple is a special triple for the text index (i.e. its predicate - // is either `ql:contains-word` or `ql:contains-entity`, register the magic + // If the triple is a special triple for the text index (i.e. its predicate is + // either `ql:contains-word` or `ql:contains-entity`, register the magic // variables for the matching word and the score that will be created when // processing those triples in the query body, s.t. they can be selected as // part of the query result. @@ -598,8 +597,8 @@ class SparqlQleverVisitor { static TripleComponent visitGraphTerm(const GraphTerm& graphTerm); // If any of the variables used in `expression` did not appear previously in - // the query, add a warning or throw an exception (depending on the setting - // of the corresponding `RuntimeParameter`). + // the query, add a warning or throw an exception (depending on the setting of + // the corresponding `RuntimeParameter`). void warnOrThrowIfUnboundVariables(auto* ctx, const SparqlExpressionPimpl& expression, std::string_view clauseName); From 65caf947e38d31d3e64442b724328d3c6f15d25d Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Thu, 30 Jan 2025 18:16:21 +0100 Subject: [PATCH 07/25] Fix the compilation of the tests again. 
Signed-off-by: Johannes Kalmbach --- src/engine/ValuesForTesting.h | 23 ++++++++++++----------- test/OperationTest.cpp | 5 +++-- test/ValuesForTestingTest.cpp | 7 ++++--- 3 files changed, 19 insertions(+), 16 deletions(-) diff --git a/src/engine/ValuesForTesting.h b/src/engine/ValuesForTesting.h index f7f78945b1..389eefd493 100644 --- a/src/engine/ValuesForTesting.h +++ b/src/engine/ValuesForTesting.h @@ -14,6 +14,9 @@ // unit testing purposes when we need to specify the subtrees of another // operation. class ValuesForTesting : public Operation { + public: + using VarVector = std::vector>; + private: std::vector tables_; VariableToColumnMap variables_; @@ -27,13 +30,13 @@ class ValuesForTesting : public Operation { // Create an operation that has as its result the given `table` and the given // `variables`. The number of variables must be equal to the number // of columns in the table. - explicit ValuesForTesting( - QueryExecutionContext* ctx, IdTable table, - const std::vector>& variables, - bool supportsLimit = false, std::vector sortedColumns = {}, - LocalVocab localVocab = LocalVocab{}, - std::optional multiplicity = std::nullopt, - bool forceFullyMaterialized = false) + explicit ValuesForTesting(QueryExecutionContext* ctx, IdTable table, + const VarVector& variables, + bool supportsLimit = false, + std::vector sortedColumns = {}, + LocalVocab localVocab = LocalVocab{}, + std::optional multiplicity = std::nullopt, + bool forceFullyMaterialized = false) : Operation{ctx}, supportsLimit_{supportsLimit}, sizeEstimate_{table.numRows()}, @@ -63,8 +66,7 @@ class ValuesForTesting : public Operation { tables_.push_back(std::move(table)); } explicit ValuesForTesting(QueryExecutionContext* ctx, - std::vector tables, - std::vector> variables, + std::vector tables, VarVector variables, bool unlikelyToFitInCache = false, std::vector sortedColumns = {}, LocalVocab localVocab = LocalVocab{}) @@ -199,8 +201,7 @@ class ValuesForTesting : public Operation { } private: - 
VariableToColumnMap computeVarMapFromVector( - const std::vector>& vars) const { + VariableToColumnMap computeVarMapFromVector(const VarVector& vars) const { VariableToColumnMap m; for (auto i = ColumnIndex{0}; i < vars.size(); ++i) { if (!vars.at(i).has_value()) { diff --git a/test/OperationTest.cpp b/test/OperationTest.cpp index 5f2c2c377c..d0daeaaed2 100644 --- a/test/OperationTest.cpp +++ b/test/OperationTest.cpp @@ -133,7 +133,7 @@ class OperationTestFixture : public testing::Test { &namedCache, [&](std::string json) { jsonHistory.emplace_back(std::move(json)); }}; IdTable table = makeIdTableFromVector({{}, {}, {}}); - ValuesForTesting operation{&qec, std::move(table), {}}; + ValuesForTesting operation{&qec, std::move(table), VariableToColumnMap{}}; }; // _____________________________________________________________________________ @@ -288,7 +288,8 @@ TEST(Operation, updateRuntimeStatsWorksCorrectly) { auto qec = getQec(); auto idTable = makeIdTableFromVector({{3, 4}, {7, 8}, {9, 123}}); ValuesForTesting valuesForTesting{ - qec, std::move(idTable), {Variable{"?x"}, Variable{"?y"}}}; + qec, std::move(idTable), + ValuesForTesting::VarVector{Variable{"?x"}, Variable{"?y"}}}; auto& rti = valuesForTesting.runtimeInfo(); diff --git a/test/ValuesForTestingTest.cpp b/test/ValuesForTestingTest.cpp index 8c4b86d019..c8108f4d98 100644 --- a/test/ValuesForTestingTest.cpp +++ b/test/ValuesForTestingTest.cpp @@ -16,7 +16,8 @@ TEST(ValuesForTesting, valuesForTesting) { (ValuesForTesting{getQec(), table.clone(), {Variable{"?x"}}})); ValuesForTesting v{ - getQec(), table.clone(), {Variable{"?x"}, {Variable{"?y"}}}}; + getQec(), table.clone(), + ValuesForTesting::VarVector{Variable{"?x"}, {Variable{"?y"}}}}; // The following line has no effect. TODO provide default // implementations for such boilerplate methods in the `Operation` base class. 
ASSERT_EQ(v.getResultWidth(), 2u); @@ -42,7 +43,7 @@ TEST(ValuesForTesting, cornerCasesCacheKey) { auto empty = makeIdTableFromVector({}); auto neutral = makeIdTableFromVector({{}}); - ValuesForTesting vEmpty{getQec(), empty.clone(), {}}; - ValuesForTesting vNeutral{getQec(), neutral.clone(), {}}; + ValuesForTesting vEmpty{getQec(), empty.clone(), VariableToColumnMap{}}; + ValuesForTesting vNeutral{getQec(), neutral.clone(), VariableToColumnMap{}}; EXPECT_NE(vEmpty.getCacheKey(), vNeutral.getCacheKey()); } From e9e8dfd18978a96bf5d61926e828200b18d4a2ff Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Fri, 31 Jan 2025 09:03:12 +0100 Subject: [PATCH 08/25] Make the In-Memory-Vocabulary compatible with the RDFVocabulary Signed-off-by: Johannes Kalmbach --- src/index/IndexImpl.Text.cpp | 2 +- src/index/IndexImpl.cpp | 7 +++++-- src/index/IndexImpl.h | 2 +- src/index/StringSortComparator.h | 7 +++++++ src/index/Vocabulary.cpp | 2 +- src/index/Vocabulary.h | 7 ++++++- src/index/vocabulary/VocabularyInMemory.h | 5 ++++- 7 files changed, 25 insertions(+), 7 deletions(-) diff --git a/src/index/IndexImpl.Text.cpp b/src/index/IndexImpl.Text.cpp index 3b872eb39c..2f15be7e5c 100644 --- a/src/index/IndexImpl.Text.cpp +++ b/src/index/IndexImpl.Text.cpp @@ -48,7 +48,7 @@ cppcoro::generator IndexImpl::wordsInTextRecords( if (!isLiteral(text)) { continue; } - WordsFileLine entityLine{text, true, contextId, 1, true}; + WordsFileLine entityLine{std::string{text}, true, contextId, 1, true}; co_yield entityLine; std::string_view textView = text; textView = textView.substr(0, textView.rfind('"')); diff --git a/src/index/IndexImpl.cpp b/src/index/IndexImpl.cpp index 72efec5307..6205d08f6b 100644 --- a/src/index/IndexImpl.cpp +++ b/src/index/IndexImpl.cpp @@ -974,7 +974,7 @@ size_t IndexImpl::getNumDistinctSubjectPredicatePairs() const { } // _____________________________________________________________________________ -bool IndexImpl::isLiteral(const string& object) const { +bool 
IndexImpl::isLiteral(std::string_view object) const { return decltype(vocab_)::stringIsLiteral(object); } @@ -1522,7 +1522,10 @@ size_t IndexImpl::getCardinality( } // ___________________________________________________________________________ -std::string IndexImpl::indexToString(VocabIndex id) const { return vocab_[id]; } +// TODO Make this the return type of the vocabulary. +std::string IndexImpl::indexToString(VocabIndex id) const { + return std::string{vocab_[id]}; +} // ___________________________________________________________________________ std::string_view IndexImpl::indexToString(WordVocabIndex id) const { diff --git a/src/index/IndexImpl.h b/src/index/IndexImpl.h index d284cdb415..a698a96c6f 100644 --- a/src/index/IndexImpl.h +++ b/src/index/IndexImpl.h @@ -634,7 +634,7 @@ class IndexImpl { friend class IndexTest_createFromOnDiskIndexTest_Test; friend class CreatePatternsFixture_createPatterns_Test; - bool isLiteral(const string& object) const; + bool isLiteral(std::string_view object) const; public: LangtagAndTriple tripleToInternalRepresentation(TurtleTriple&& triple) const; diff --git a/src/index/StringSortComparator.h b/src/index/StringSortComparator.h index 81829f226e..33f1f2077c 100644 --- a/src/index/StringSortComparator.h +++ b/src/index/StringSortComparator.h @@ -619,6 +619,13 @@ class TripleComponentComparator { return compare(spA, spB, level) < 0; } + // TODO Unify these three functions. 
+ bool operator()(const SplitVal& spA, std::string_view b, + const Level level) const { + auto spB = extractAndTransformComparable(b, level, false); + return compare(spA, spB, level) < 0; + } + template bool operator()(const SplitValBase& a, const SplitValBase& b, const Level level) const { diff --git a/src/index/Vocabulary.cpp b/src/index/Vocabulary.cpp index ab2cb52505..cd3b25b490 100644 --- a/src/index/Vocabulary.cpp +++ b/src/index/Vocabulary.cpp @@ -43,7 +43,7 @@ void Vocabulary::readFromFile(const string& fileName) { << std::endl; vocabulary_.close(); vocabulary_.open(fileName); - if constexpr (isCompressed_) { + if constexpr (isCompressed_ && false) { const auto& internalExternalVocab = vocabulary_.getUnderlyingVocabulary().getUnderlyingVocabulary(); LOG(INFO) << "Done, number of words: " diff --git a/src/index/Vocabulary.h b/src/index/Vocabulary.h index 6775a13217..fc9c118b87 100644 --- a/src/index/Vocabulary.h +++ b/src/index/Vocabulary.h @@ -37,9 +37,11 @@ using std::string; using std::vector; template -using AccessReturnType_t = +using AccessReturnType_t = std::string_view; +/* std::conditional_t, std::string, std::string_view>; + */ template class IdRange { @@ -114,10 +116,13 @@ class Vocabulary { vector internalizedLangs_; vector externalizedPrefixes_{""}; + using UnderlyingVocabulary = VocabularyInMemory; + /* using UnderlyingVocabulary = std::conditional_t, VocabularyInMemory>; + */ using VocabularyWithUnicodeComparator = UnicodeVocabulary; diff --git a/src/index/vocabulary/VocabularyInMemory.h b/src/index/vocabulary/VocabularyInMemory.h index 5ce18fe721..efe9a9c7e7 100644 --- a/src/index/vocabulary/VocabularyInMemory.h +++ b/src/index/vocabulary/VocabularyInMemory.h @@ -68,11 +68,14 @@ class VocabularyInMemory struct WordWriter { typename Words::Writer writer_; explicit WordWriter(const std::string& filename) : writer_{filename} {} - void operator()(std::string_view str) { + void operator()(std::string_view str, + [[maybe_unused]] bool 
isExternalDummy = false) { writer_.push(str.data(), str.size()); } void finish() { writer_.finish(); } + std::string readableNameDummy_; + std::string& readableName() { return readableNameDummy_; } }; // Return a `WordWriter` that directly writes the words to the given From 79a11b662ad4a0e93db03b5d32512a689afef90d Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Fri, 31 Jan 2025 11:22:20 +0100 Subject: [PATCH 09/25] Refactor things. TODO: Make the vocabulary implementation be choosable from CMake Signed-off-by: Johannes Kalmbach --- src/engine/ExportQueryExecutionTrees.cpp | 11 +++- src/index/Index.cpp | 5 +- src/index/Index.h | 10 ++-- src/index/IndexImpl.cpp | 8 +-- src/index/IndexImpl.h | 4 +- src/index/StringSortComparator.h | 1 - src/index/Vocabulary.cpp | 42 ++++---------- src/index/Vocabulary.h | 57 ++++++++----------- src/index/vocabulary/VocabularyInMemory.cpp | 3 + src/index/vocabulary/VocabularyInMemory.h | 6 ++ .../vocabulary/VocabularyInternalExternal.cpp | 12 ++++ .../vocabulary/VocabularyInternalExternal.h | 5 +- test/engine/TextIndexScanTestHelpers.h | 8 +-- 13 files changed, 85 insertions(+), 87 deletions(-) diff --git a/src/engine/ExportQueryExecutionTrees.cpp b/src/engine/ExportQueryExecutionTrees.cpp index 3375e82924..351dc9b28d 100644 --- a/src/engine/ExportQueryExecutionTrees.cpp +++ b/src/engine/ExportQueryExecutionTrees.cpp @@ -356,8 +356,15 @@ ExportQueryExecutionTrees::getLiteralOrIriFromVocabIndex( case Datatype::LocalVocabIndex: return localVocab.getWord(id.getLocalVocabIndex()).asLiteralOrIri(); case Datatype::VocabIndex: { - auto entity = index.indexToString(id.getVocabIndex()); - return LiteralOrIri::fromStringRepresentation(entity); + auto getEntity = [&index, id]() { + return index.indexToString(id.getVocabIndex()); + }; + // The type of entity might be `string_view` (If the vocabulary is stored + // uncompressed in RAM) or `string` (if it is on-disk, or compressed or + // both). 
The following code works and is efficient in all cases. In + // particular, the `std::string` constructor is compiled out because of + // RVO if `getEntity()` already returns a `string`. + return LiteralOrIri::fromStringRepresentation(std::string(getEntity())); } default: AD_FAIL(); diff --git a/src/index/Index.cpp b/src/index/Index.cpp index f66914bfca..06350e1e26 100644 --- a/src/index/Index.cpp +++ b/src/index/Index.cpp @@ -71,12 +71,13 @@ size_t Index::getCardinality( } // ____________________________________________________________________________ -std::string Index::indexToString(VocabIndex id) const { +auto Index::indexToString(VocabIndex id) const -> Vocab::AccessReturnType { return pimpl_->indexToString(id); } // ____________________________________________________________________________ -std::string_view Index::indexToString(WordVocabIndex id) const { +auto Index::indexToString(WordVocabIndex id) const + -> TextVocabulary::AccessReturnType { return pimpl_->indexToString(id); } diff --git a/src/index/Index.h b/src/index/Index.h index 8c6dd1cd40..101908ab7e 100644 --- a/src/index/Index.h +++ b/src/index/Index.h @@ -104,13 +104,11 @@ class Index { // Read necessary metadata into memory and open file handles. void addTextFromOnDiskIndex(); - using Vocab = - Vocabulary; + using Vocab = RdfsVocabulary; [[nodiscard]] const Vocab& getVocab() const; Vocab& getNonConstVocabForTesting(); - using TextVocab = - Vocabulary; + using TextVocab = TextVocabulary; [[nodiscard]] const TextVocab& getTextVocab() const; // Get a (non-owning) pointer to the BlankNodeManager of this Index. @@ -132,8 +130,8 @@ class Index { // TODO Once we have an overview over the folding this logic should // probably not be in the index class. 
- std::string indexToString(VocabIndex id) const; - std::string_view indexToString(WordVocabIndex id) const; + Vocab::AccessReturnType indexToString(VocabIndex id) const; + TextVocab::AccessReturnType indexToString(WordVocabIndex id) const; [[nodiscard]] Vocab::PrefixRanges prefixRanges(std::string_view prefix) const; diff --git a/src/index/IndexImpl.cpp b/src/index/IndexImpl.cpp index 6205d08f6b..d5781bb297 100644 --- a/src/index/IndexImpl.cpp +++ b/src/index/IndexImpl.cpp @@ -1522,13 +1522,13 @@ size_t IndexImpl::getCardinality( } // ___________________________________________________________________________ -// TODO Make this the return type of the vocabulary. -std::string IndexImpl::indexToString(VocabIndex id) const { - return std::string{vocab_[id]}; +RdfsVocabulary::AccessReturnType IndexImpl::indexToString(VocabIndex id) const { + return vocab_[id]; } // ___________________________________________________________________________ -std::string_view IndexImpl::indexToString(WordVocabIndex id) const { +TextVocabulary::AccessReturnType IndexImpl::indexToString( + WordVocabIndex id) const { return textVocab_[id]; } diff --git a/src/index/IndexImpl.h b/src/index/IndexImpl.h index a698a96c6f..8478943c92 100644 --- a/src/index/IndexImpl.h +++ b/src/index/IndexImpl.h @@ -305,10 +305,10 @@ class IndexImpl { const LocatedTriplesSnapshot& locatedTriplesSnapshot) const; // ___________________________________________________________________________ - std::string indexToString(VocabIndex id) const; + RdfsVocabulary::AccessReturnType indexToString(VocabIndex id) const; // ___________________________________________________________________________ - std::string_view indexToString(WordVocabIndex id) const; + TextVocabulary::AccessReturnType indexToString(WordVocabIndex id) const; public: // ___________________________________________________________________________ diff --git a/src/index/StringSortComparator.h b/src/index/StringSortComparator.h index 33f1f2077c..d77e616a76 
100644 --- a/src/index/StringSortComparator.h +++ b/src/index/StringSortComparator.h @@ -619,7 +619,6 @@ class TripleComponentComparator { return compare(spA, spB, level) < 0; } - // TODO Unify these three functions. bool operator()(const SplitVal& spA, std::string_view b, const Level level) const { auto spB = extractAndTransformComparable(b, level, false); diff --git a/src/index/Vocabulary.cpp b/src/index/Vocabulary.cpp index cd3b25b490..70e9f0c50e 100644 --- a/src/index/Vocabulary.cpp +++ b/src/index/Vocabulary.cpp @@ -39,20 +39,8 @@ bool Vocabulary::PrefixRanges::contain( // _____________________________________________________________________________ template void Vocabulary::readFromFile(const string& fileName) { - LOG(INFO) << "Reading vocabulary from file " << fileName << " ..." - << std::endl; vocabulary_.close(); vocabulary_.open(fileName); - if constexpr (isCompressed_ && false) { - const auto& internalExternalVocab = - vocabulary_.getUnderlyingVocabulary().getUnderlyingVocabulary(); - LOG(INFO) << "Done, number of words: " - << internalExternalVocab.internalVocab().size() << std::endl; - LOG(INFO) << "Number of words in external vocabulary: " - << internalExternalVocab.externalVocab().size() << std::endl; - } else { - LOG(INFO) << "Done, number of words: " << vocabulary_.size() << std::endl; - } // Precomputing ranges for IRIs, blank nodes, and literals, for faster // processing of the `isIrI` and `isLiteral` functions. @@ -88,19 +76,12 @@ bool Vocabulary::stringIsLiteral(std::string_view s) { // _____________________________________________________________________________ template bool Vocabulary::shouldBeExternalized(string_view s) const { - // TODO Completely refactor the Vocabulary on the different - // Types, it is a mess. - - // If the string is not compressed, this means that this is a text vocabulary - // and thus doesn't support externalization. 
- if constexpr (std::is_same_v) { - if (!stringIsLiteral(s)) { - return shouldEntityBeExternalized(s); - } else { - return shouldLiteralBeExternalized(s); - } + // TODO We should have a completely separate layer that handles the + // externalization, not the Vocab. + if (!stringIsLiteral(s)) { + return shouldEntityBeExternalized(s); } else { - return false; + return shouldLiteralBeExternalized(s); } } @@ -264,17 +245,18 @@ auto Vocabulary::prefixRanges(std::string_view prefix) const } // _____________________________________________________________________________ -template -auto Vocabulary::operator[](IndexType idx) const - -> AccessReturnType_t { +template +auto Vocabulary::operator[](IndexType idx) const + -> AccessReturnType { AD_CONTRACT_CHECK(idx.get() < size()); return vocabulary_[idx.get()]; } // Explicit template instantiations -template class Vocabulary; -template class Vocabulary; +template class Vocabulary; +template class Vocabulary; template void RdfsVocabulary::initializeInternalizedLangs( const nlohmann::json&); diff --git a/src/index/Vocabulary.h b/src/index/Vocabulary.h index fc9c118b87..c7a8454a4a 100644 --- a/src/index/Vocabulary.h +++ b/src/index/Vocabulary.h @@ -36,13 +36,6 @@ using std::string; using std::vector; -template -using AccessReturnType_t = std::string_view; -/* - std::conditional_t, - std::string, std::string_view>; - */ - template class IdRange { public: @@ -69,9 +62,15 @@ inline std::ostream& operator<<(std::ostream& stream, // retrieval. Template parameters that are supported are: // std::string -> no compression is applied // CompressedString -> prefix compression is applied -template +template class Vocabulary { public: + // The type that is returned by the `operator[]` of this vocabulary. Typically + // either `std::string` or `std::string_view`. + using AccessReturnType = + decltype(std::declval()[0]); + // The index ranges for a prefix + a function to check whether a given index // is contained in one of them. 
// @@ -96,17 +95,6 @@ class Vocabulary { // The different type of data that is stored in the vocabulary enum class Datatypes { Literal, Iri, Float, Date }; - template - using enable_if_compressed = - std::enable_if_t>; - - template - using enable_if_uncompressed = - std::enable_if_t>; - - static constexpr bool isCompressed_ = - std::is_same_v; - // If a literal uses one of these language tags or starts with one of these // prefixes, it will be externalized. By default, everything is externalized. // Both of these settings can be overridden using the `settings.json` file. @@ -116,13 +104,19 @@ class Vocabulary { vector internalizedLangs_; vector externalizedPrefixes_{""}; - using UnderlyingVocabulary = VocabularyInMemory; + // using UnderlyingVocabulary = VocabularyInMemory; /* using UnderlyingVocabulary = std::conditional_t, VocabularyInMemory>; */ + /* + using UnderlyingVocabulary = + std::conditional_t, + VocabularyInMemory>; + */ using VocabularyWithUnicodeComparator = UnicodeVocabulary; @@ -137,10 +131,7 @@ class Vocabulary { using SortLevel = typename ComparatorType::Level; using IndexType = IndexT; - template < - typename = std::enable_if_t || - std::is_same_v>> - Vocabulary() {} + Vocabulary() = default; Vocabulary& operator=(Vocabulary&&) noexcept = default; Vocabulary(Vocabulary&&) noexcept = default; @@ -151,10 +142,7 @@ class Vocabulary { // Get the word with the given `idx`. Throw if the `idx` is not contained // in the vocabulary. - AccessReturnType_t operator[](IndexType idx) const; - - // AccessReturnType_t at(IndexType idx) const { return - // operator[](id); } + AccessReturnType operator[](IndexType idx) const; //! Get the number of words in the vocabulary. 
[[nodiscard]] size_t size() const { return vocabulary_.size(); } @@ -247,7 +235,12 @@ class Vocabulary { } }; -using RdfsVocabulary = - Vocabulary; -using TextVocabulary = - Vocabulary; +namespace detail { +using UnderlyingVocabRdfsVocabulary = VocabularyInMemory; +using UnderlyingVocabTextVocabulary = VocabularyInMemory; +} // namespace detail + +using RdfsVocabulary = Vocabulary; +using TextVocabulary = Vocabulary; diff --git a/src/index/vocabulary/VocabularyInMemory.cpp b/src/index/vocabulary/VocabularyInMemory.cpp index f3db258d59..a1c82231d3 100644 --- a/src/index/vocabulary/VocabularyInMemory.cpp +++ b/src/index/vocabulary/VocabularyInMemory.cpp @@ -8,9 +8,12 @@ using std::string; // _____________________________________________________________________________ void VocabularyInMemory::open(const string& fileName) { + LOG(INFO) << "Reading vocabulary from file " << fileName << " ..." + << std::endl; _words.clear(); ad_utility::serialization::FileReadSerializer file(fileName); file >> _words; + LOG(INFO) << "Done, number of words: " << size() << std::endl; } // _____________________________________________________________________________ diff --git a/src/index/vocabulary/VocabularyInMemory.h b/src/index/vocabulary/VocabularyInMemory.h index efe9a9c7e7..ed498d1702 100644 --- a/src/index/vocabulary/VocabularyInMemory.h +++ b/src/index/vocabulary/VocabularyInMemory.h @@ -68,12 +68,18 @@ class VocabularyInMemory struct WordWriter { typename Words::Writer writer_; explicit WordWriter(const std::string& filename) : writer_{filename} {} + + // Write a word. The `isExternalDummy` is only there to have a consistent + // interface with the `VocabularyInternalExternal`. void operator()(std::string_view str, [[maybe_unused]] bool isExternalDummy = false) { writer_.push(str.data(), str.size()); } void finish() { writer_.finish(); } + + // The `readableName()` function is only there to have a consistent + // interface with the `VocabularyInternalExternal`. 
std::string readableNameDummy_; std::string& readableName() { return readableNameDummy_; } }; diff --git a/src/index/vocabulary/VocabularyInternalExternal.cpp b/src/index/vocabulary/VocabularyInternalExternal.cpp index 62c5e29455..3d3d5fffb5 100644 --- a/src/index/vocabulary/VocabularyInternalExternal.cpp +++ b/src/index/vocabulary/VocabularyInternalExternal.cpp @@ -37,3 +37,15 @@ void VocabularyInternalExternal::WordWriter::finish() { internalWriter_.finish(); externalWriter_.finish(); } + +// _____________________________________________________________________________ +void VocabularyInternalExternal::open(const string& filename) { + LOG(INFO) << "Reading vocabulary from file " << filename << " ..." + << std::endl; + internalVocab_.open(filename + ".internal"); + externalVocab_.open(filename + ".external"); + LOG(INFO) << "Done, number of words: " << size() << std::endl; + LOG(INFO) << "Number of words in internal vocabulary (these are also part " + "of the external vocabulary): " + << internalVocab_.size() << std::endl; +} diff --git a/src/index/vocabulary/VocabularyInternalExternal.h b/src/index/vocabulary/VocabularyInternalExternal.h index f9024369bd..d92510a49f 100644 --- a/src/index/vocabulary/VocabularyInternalExternal.h +++ b/src/index/vocabulary/VocabularyInternalExternal.h @@ -40,10 +40,7 @@ class VocabularyInternalExternal { // Read the vocabulary from a file. The file must have been created using a // `WordWriter`. 
- void open(const string& filename) { - internalVocab_.open(filename + ".internal"); - externalVocab_.open(filename + ".external"); - } + void open(const string& filename); // Return the total number of words [[nodiscard]] size_t size() const { return externalVocab_.size(); } diff --git a/test/engine/TextIndexScanTestHelpers.h b/test/engine/TextIndexScanTestHelpers.h index 6ba1b8c6de..0cc6ae74c6 100644 --- a/test/engine/TextIndexScanTestHelpers.h +++ b/test/engine/TextIndexScanTestHelpers.h @@ -22,8 +22,8 @@ inline string getTextRecordFromResultTable(const QueryExecutionContext* qec, result.idTable().getColumn(0)[rowIndex].getTextRecordIndex().get(); if (nofNonLiterals <= textRecordIdFromTable) { // Return when from Literals - return qec->getIndex().indexToString( - VocabIndex::make(textRecordIdFromTable - nofNonLiterals)); + return std::string{qec->getIndex().indexToString( + VocabIndex::make(textRecordIdFromTable - nofNonLiterals))}; } else { // Return when from DocsDB return qec->getIndex().getTextExcerpt( @@ -41,8 +41,8 @@ inline const TextRecordIndex getTextRecordIdFromResultTable( inline string getEntityFromResultTable(const QueryExecutionContext* qec, const ProtoResult& result, const size_t& rowIndex) { - return qec->getIndex().indexToString( - result.idTable().getColumn(1)[rowIndex].getVocabIndex()); + return std::string{qec->getIndex().indexToString( + result.idTable().getColumn(1)[rowIndex].getVocabIndex())}; } // Only use on prefix search results From e406fa429a3b666f73391e2f4b7a7586a593cfc1 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Fri, 31 Jan 2025 11:44:39 +0100 Subject: [PATCH 10/25] Making the vocab configuration configurable at runtime. 
Signed-off-by: Johannes Kalmbach --- CMakeLists.txt | 10 ++++++++++ src/index/Vocabulary.h | 18 +++++++++++++++++- 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3679de4c51..67b2feb62b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -203,6 +203,16 @@ if (${USE_CPP_17_BACKPORTS}) add_definitions("-DQLEVER_CPP_17 -DCPP_CXX_CONCEPTS=0") endif() +set(VOCAB_IN_MEMORY OFF CACHE BOOL "Store QLever's vocabulary completely in RAM") +if (${VOCAB_IN_MEMORY}) + add_definitions("-D_QLEVER_VOCAB_IN_MEMORY") +endif () + +set(ENABLE_VOCAB_COMPRESSION ON CACHE BOOL "Compress the vocabulary") +if (${ENABLE_VOCAB_COMPRESSION}) + add_definitions("-D_QLEVER_ENABLE_VOCAB_COMPRESSION") +endif () + # Enable the specification of additional linker flags manually from the commandline set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${ADDITIONAL_LINKER_FLAGS}") set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${ADDITIONAL_LINKER_FLAGS}") diff --git a/src/index/Vocabulary.h b/src/index/Vocabulary.h index c7a8454a4a..e3513c39d4 100644 --- a/src/index/Vocabulary.h +++ b/src/index/Vocabulary.h @@ -236,7 +236,23 @@ class Vocabulary { }; namespace detail { -using UnderlyingVocabRdfsVocabulary = VocabularyInMemory; +// The two macros `_QLEVER_VOCAB_IN_MEMORY` and +// `_QLEVER_ENABLE_VOCAB_COMPRESSION` can be used to disable the external vocab +// and the compression of the vocab at compile time. NOTE: These change the +// binary format of QLever's index, so changing them requires rebuilding of the +// indices. 
+#ifdef _QLEVER_VOCAB_IN_MEMORY +using VocabStorage = VocabularyInMemory; +#else +using VocabStorage = VocabularyInternalExternal; +#endif + +#ifndef _QLEVER_ENABLE_VOCAB_COMPRESSION +using UnderlyingVocabRdfsVocabulary = VocabStorage; +#else +using UnderlyingVocabRdfsVocabulary = CompressedVocabulary; +#endif + using UnderlyingVocabTextVocabulary = VocabularyInMemory; } // namespace detail From 53dc7411cd0547d0b77f316080844dd9c8f65f8c Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Fri, 31 Jan 2025 17:19:02 +0100 Subject: [PATCH 11/25] Do not move IdTables (we will later try this out on the dat dataset). Signed-off-by: Johannes Kalmbach --- src/engine/NamedQueryCache.cpp | 2 +- src/engine/Result.cpp | 39 +++++++++++++++++++---- src/engine/Result.h | 4 ++- src/engine/ValuesForTesting.h | 58 ++++++++++++++++++++++------------ 4 files changed, 74 insertions(+), 29 deletions(-) diff --git a/src/engine/NamedQueryCache.cpp b/src/engine/NamedQueryCache.cpp index 6c333fcbb0..7acc9515e5 100644 --- a/src/engine/NamedQueryCache.cpp +++ b/src/engine/NamedQueryCache.cpp @@ -10,7 +10,7 @@ std::shared_ptr NamedQueryCache ::getOperation( const auto& [table, map, sortedOn] = get(key); // TODO we should get rid of the copies for the IdTable (and // probably the other members) especially for larger results). 
- return std::make_shared(ctx, table.clone(), map); + return std::make_shared(ctx, table.clone(), map, sortedOn); } // _____________________________________________________________________________ diff --git a/src/engine/Result.cpp b/src/engine/Result.cpp index 3b476777bb..5671d92676 100644 --- a/src/engine/Result.cpp +++ b/src/engine/Result.cpp @@ -8,6 +8,7 @@ #include +#include "../../cmake-build-clang-16-debug-backports/_deps/range-v3-src/include/range/v3/experimental/view/shared.hpp" #include "util/Exception.h" #include "util/Generators.h" #include "util/Log.h" @@ -59,6 +60,18 @@ Result::Result(IdTable idTable, std::vector sortedBy, assertSortOrderIsRespected(this->idTable(), sortedBy_); } +// _____________________________________________________________________________ +Result::Result(std::shared_ptr idTablePtr, + std::vector sortedBy, LocalVocab&& localVocab) + : data_{IdTableSharedLocalVocabPair{ + std::move(idTablePtr), + std::make_shared(std::move(localVocab))}}, + sortedBy_{std::move(sortedBy)} { + AD_CONTRACT_CHECK(std::get(data_).localVocab_ != + nullptr); + assertSortOrderIsRespected(this->idTable(), sortedBy_); +} + // _____________________________________________________________________________ Result::Result(IdTable idTable, std::vector sortedBy, LocalVocab&& localVocab) @@ -120,8 +133,13 @@ void Result::applyLimitOffset( } if (isFullyMaterialized()) { ad_utility::timer::Timer limitTimer{ad_utility::timer::Timer::Started}; - resizeIdTable(std::get(data_).idTable_, - limitOffset); + + auto& tableOrPtr = std::get(data_).idTable_; + if (auto sharedTable = + std::get_if>(&tableOrPtr)) { + tableOrPtr = (**sharedTable).clone(); + } + resizeIdTable(std::get(tableOrPtr), limitOffset); limitTimeCallback(limitTimer.msecs(), idTable()); } else { auto generator = [](LazyResult original, LimitOffsetClause limitOffset, @@ -177,7 +195,7 @@ void Result::assertThatLimitWasRespected(const LimitOffsetClause& limitOffset) { // 
_____________________________________________________________________________ void Result::checkDefinedness(const VariableToColumnMap& varColMap) { - auto performCheck = [](const auto& map, IdTable& idTable) { + auto performCheck = [](const auto& map, const IdTable& idTable) { return ql::ranges::all_of(map, [&](const auto& varAndCol) { const auto& [columnIndex, mightContainUndef] = varAndCol.second; if (mightContainUndef == ColumnIndexAndTypeInfo::AlwaysDefined) { @@ -189,8 +207,7 @@ void Result::checkDefinedness(const VariableToColumnMap& varColMap) { }); }; if (isFullyMaterialized()) { - AD_EXPENSIVE_CHECK(performCheck( - varColMap, std::get(data_).idTable_)); + AD_EXPENSIVE_CHECK(performCheck(varColMap, idTable())); } else { auto generator = [](LazyResult original, [[maybe_unused]] VariableToColumnMap varColMap, @@ -250,7 +267,17 @@ void Result::assertSortOrderIsRespected( // _____________________________________________________________________________ const IdTable& Result::idTable() const { AD_CONTRACT_CHECK(isFullyMaterialized()); - return std::get(data_).idTable_; + auto visitor = [](const T& arg) -> const IdTable& { + if constexpr (std::is_same_v) { + return arg; + } else { + static_assert(std::is_same_v>); + AD_CORRECTNESS_CHECK(arg != nullptr); + return *arg; + } + }; + return std::visit(visitor, + std::get(data_).idTable_); } // _____________________________________________________________________________ diff --git a/src/engine/Result.h b/src/engine/Result.h index c372cf7102..1fe7dbcdac 100644 --- a/src/engine/Result.h +++ b/src/engine/Result.h @@ -55,7 +55,7 @@ class Result { using LocalVocabPtr = std::shared_ptr; struct IdTableSharedLocalVocabPair { - IdTable idTable_; + std::variant, IdTable> idTable_; // The local vocabulary of the result. 
LocalVocabPtr localVocab_; }; @@ -115,6 +115,8 @@ class Result { SharedLocalVocabWrapper localVocab); Result(IdTable idTable, std::vector sortedBy, LocalVocab&& localVocab); + Result(std::shared_ptr idTablePtr, + std::vector sortedBy, LocalVocab&& localVocab); Result(IdTableVocabPair pair, std::vector sortedBy); Result(Generator idTables, std::vector sortedBy); // Prevent accidental copying of a result table. diff --git a/src/engine/ValuesForTesting.h b/src/engine/ValuesForTesting.h index 389eefd493..a3b28eca36 100644 --- a/src/engine/ValuesForTesting.h +++ b/src/engine/ValuesForTesting.h @@ -9,16 +9,22 @@ #include "engine/Result.h" #include "util/Algorithm.h" #include "util/Random.h" +#include "util/TransparentFunctors.h" // An operation that yields a given `IdTable` as its result. It is used for // unit testing purposes when we need to specify the subtrees of another // operation. +namespace detail { +auto getTables(const auto& tables) { + return ql::views::transform(tables, ad_utility::dereference); +} +} // namespace detail class ValuesForTesting : public Operation { public: using VarVector = std::vector>; private: - std::vector tables_; + std::vector> tables_; VariableToColumnMap variables_; bool supportsLimit_; // Those can be manually overwritten for testing using the respective getters. 
@@ -46,7 +52,7 @@ class ValuesForTesting : public Operation { multiplicity_{multiplicity}, forceFullyMaterialized_{forceFullyMaterialized} { AD_CONTRACT_CHECK(variables.size() == table.numColumns()); - tables_.push_back(std::move(table)); + tables_.push_back(std::make_shared(std::move(table))); variables_ = computeVarMapFromVector(variables); } @@ -63,7 +69,7 @@ class ValuesForTesting : public Operation { localVocab_{std::move(localVocab)}, multiplicity_{}, forceFullyMaterialized_{false} { - tables_.push_back(std::move(table)); + tables_.push_back(std::make_shared(std::move(table))); } explicit ValuesForTesting(QueryExecutionContext* ctx, std::vector tables, VarVector variables, @@ -71,7 +77,7 @@ class ValuesForTesting : public Operation { std::vector sortedColumns = {}, LocalVocab localVocab = LocalVocab{}) : Operation{ctx}, - tables_{std::move(tables)}, + tables_{}, supportsLimit_{false}, sizeEstimate_{0}, costEstimate_{0}, @@ -79,12 +85,16 @@ class ValuesForTesting : public Operation { resultSortedColumns_{std::move(sortedColumns)}, localVocab_{std::move(localVocab)}, multiplicity_{std::nullopt} { - AD_CONTRACT_CHECK( - ql::ranges::all_of(tables_, [&variables](const IdTable& table) { + tables_.reserve(tables.size()); + for (auto& table : tables) { + tables_.push_back(std::make_shared(std::move(table))); + } + AD_CONTRACT_CHECK(ql::ranges::all_of( + detail::getTables(tables_), [&variables](const IdTable& table) { return variables.size() == table.numColumns(); })); size_t totalRows = 0; - for (const IdTable& idTable : tables_) { + for (const IdTable& idTable : detail::getTables(tables_)) { totalRows += idTable.numRows(); } sizeEstimate_ = totalRows; @@ -98,12 +108,12 @@ class ValuesForTesting : public Operation { // ___________________________________________________________________________ ProtoResult computeResult(bool requestLaziness) override { - if (requestLaziness && !forceFullyMaterialized_) { + if (requestLaziness && !forceFullyMaterialized_ && 
tables_.size() != 1) { // Not implemented yet AD_CORRECTNESS_CHECK(!supportsLimit_); std::vector clones; clones.reserve(tables_.size()); - for (const IdTable& idTable : tables_) { + for (const IdTable& idTable : detail::getTables(tables_)) { clones.push_back(idTable.clone()); } auto generator = [](auto idTables, @@ -114,17 +124,21 @@ class ValuesForTesting : public Operation { }(std::move(clones), localVocab_.clone()); return {std::move(generator), resultSortedOn()}; } + + if (tables_.size() == 1 && getLimit().isUnconstrained()) { + return {tables_.at(0), resultSortedOn(), localVocab_.clone()}; + } std::optional optionalTable; - if (tables_.size() > 1) { - IdTable aggregateTable{tables_.at(0).numColumns(), - tables_.at(0).getAllocator()}; - for (const IdTable& idTable : tables_) { + if (detail::getTables(tables_).size() > 1) { + IdTable aggregateTable{tables_.at(0)->numColumns(), + tables_.at(0)->getAllocator()}; + for (const IdTable& idTable : detail::getTables(tables_)) { aggregateTable.insertAtEnd(idTable); } optionalTable = std::move(aggregateTable); } auto table = optionalTable.has_value() ? std::move(optionalTable).value() - : tables_.at(0).clone(); + : tables_.at(0)->clone(); if (supportsLimit_) { table.erase(table.begin() + getLimit().upperBound(table.size()), table.end()); @@ -144,15 +158,16 @@ class ValuesForTesting : public Operation { // ___________________________________________________________________________ string getCacheKeyImpl() const override { std::stringstream str; - auto numRowsView = tables_ | ql::views::transform(&IdTable::numRows); + auto numRowsView = + detail::getTables(tables_) | ql::views::transform(&IdTable::numRows); auto totalNumRows = std::reduce(numRowsView.begin(), numRowsView.end(), 0); - auto numCols = tables_.empty() ? 0 : tables_.at(0).numColumns(); + auto numCols = tables_.empty() ? 0 : tables_.at(0)->numColumns(); str << "Values for testing with " << numCols << " columns and " << totalNumRows << " rows. 
"; if (totalNumRows > 1000) { str << ad_utility::FastRandomIntGenerator{}(); } else { - for (const IdTable& idTable : tables_) { + for (const IdTable& idTable : detail::getTables(tables_)) { for (size_t i = 0; i < idTable.numColumns(); ++i) { for (Id entry : idTable.getColumn(i)) { str << entry << ' '; @@ -172,7 +187,7 @@ class ValuesForTesting : public Operation { size_t getResultWidth() const override { // Assume a width of 1 if we have no tables and no other information to base // it on because 0 would otherwise cause stuff to break. - return tables_.empty() ? 1 : tables_.at(0).numColumns(); + return tables_.empty() ? 1 : tables_.at(0)->numColumns(); } vector resultSortedOn() const override { @@ -197,7 +212,8 @@ class ValuesForTesting : public Operation { bool knownEmptyResult() override { return ql::ranges::all_of( - tables_, [](const IdTable& table) { return table.empty(); }); + detail::getTables(tables_), + [](const IdTable& table) { return table.empty(); }); } private: @@ -207,8 +223,8 @@ class ValuesForTesting : public Operation { if (!vars.at(i).has_value()) { continue; } - bool containsUndef = - ql::ranges::any_of(tables_, [&i](const IdTable& table) { + bool containsUndef = ql::ranges::any_of( + detail::getTables(tables_), [&i](const IdTable& table) { return ql::ranges::any_of(table.getColumn(i), [](Id id) { return id.isUndefined(); }); }); From 5e52784cb565fb80f3d7c24db8445e818b2d5dc6 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Fri, 31 Jan 2025 18:45:44 +0100 Subject: [PATCH 12/25] Remove rogue include. 
Signed-off-by: Johannes Kalmbach --- src/engine/Result.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/engine/Result.cpp b/src/engine/Result.cpp index 5671d92676..be25eef786 100644 --- a/src/engine/Result.cpp +++ b/src/engine/Result.cpp @@ -8,7 +8,6 @@ #include -#include "../../cmake-build-clang-16-debug-backports/_deps/range-v3-src/include/range/v3/experimental/view/shared.hpp" #include "util/Exception.h" #include "util/Generators.h" #include "util/Log.h" From 49445e52badc92b2eae35a41a4b3e9cba80ce2a0 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Wed, 5 Feb 2025 08:48:05 +0100 Subject: [PATCH 13/25] An intermediate commit before switching branches. Signed-off-by: Johannes Kalmbach --- src/global/Pattern.h | 3 + src/index/ConstantsIndexBuilding.h | 3 +- src/index/IndexBuilderMain.cpp | 11 ++ src/index/IndexImpl.cpp | 10 +- src/index/IndexImpl.h | 8 + src/index/Vocabulary.h | 14 ++ src/index/VocabularyOnDisk.h | 2 + src/index/vocabulary/CMakeLists.txt | 3 +- src/index/vocabulary/CompressedVocabulary.h | 16 +- src/index/vocabulary/UnicodeVocabulary.h | 8 +- src/index/vocabulary/VocabularyInMemory.h | 10 +- .../vocabulary/VocabularyInMemoryBinSearch.h | 3 + .../vocabulary/VocabularyInternalExternal.h | 12 ++ src/index/vocabulary/VocabularyVariant.cpp | 76 ++++++++++ src/index/vocabulary/VocabularyVariant.h | 143 ++++++++++++++++++ src/util/ProgramOptionsHelpers.h | 52 +++++-- src/util/Serializer/SerializeVector.h | 3 + 17 files changed, 355 insertions(+), 22 deletions(-) create mode 100644 src/index/vocabulary/VocabularyVariant.cpp create mode 100644 src/index/vocabulary/VocabularyVariant.h diff --git a/src/global/Pattern.h b/src/global/Pattern.h index 1005add22d..9c37eb39ce 100644 --- a/src/global/Pattern.h +++ b/src/global/Pattern.h @@ -194,6 +194,9 @@ struct CompactStringVectorWriter { commonInitialization(); } + CompactStringVectorWriter(CompactStringVectorWriter&&) = default; + CompactStringVectorWriter& operator=(CompactStringVectorWriter&&) = 
default; + void push(const data_type* data, size_t elementSize) { AD_CONTRACT_CHECK(!_finished); _offsets.push_back(_nextOffset); diff --git a/src/index/ConstantsIndexBuilding.h b/src/index/ConstantsIndexBuilding.h index d7c1802969..4ca58f3e80 100644 --- a/src/index/ConstantsIndexBuilding.h +++ b/src/index/ConstantsIndexBuilding.h @@ -99,7 +99,8 @@ constinit inline std::atomic BUFFER_SIZE_PARTIAL_TO_GLOBAL_ID_MAPPINGS = // the overhead of the metadata that has to be stored per block becomes // infeasible. 250K seems to be a reasonable tradeoff here. constexpr inline ad_utility::MemorySize - UNCOMPRESSED_BLOCKSIZE_COMPRESSED_METADATA_PER_COLUMN = 250_kB; + UNCOMPRESSED_BLOCKSIZE_COMPRESSED_METADATA_PER_COLUMN = + ad_utility::MemorySize::kilobytes(250); constexpr inline size_t NumColumnsIndexBuilding = 4; diff --git a/src/index/IndexBuilderMain.cpp b/src/index/IndexBuilderMain.cpp index cfc121a2d1..8877c2d01a 100644 --- a/src/index/IndexBuilderMain.cpp +++ b/src/index/IndexBuilderMain.cpp @@ -11,6 +11,7 @@ #include #include "CompilationInfo.h" +#include "IndexImpl.h" #include "global/Constants.h" #include "index/ConstantsIndexBuilding.h" #include "index/Index.h" @@ -166,6 +167,8 @@ int main(int argc, char** argv) { bool addWordsFromLiterals = false; std::optional stxxlMemory; std::optional parserBufferSize; + std::optional vocabType; + // VocabularyEnum vocabType; optind = 1; Index index{ad_utility::makeUnlimitedAllocator()}; @@ -224,6 +227,9 @@ int main(int argc, char** argv) { add("only-pso-and-pos-permutations,o", po::bool_switch(&onlyPsoAndPos), "Only build the PSO and POS permutations. This is faster, but then " "queries with predicate variables are not supported"); + add("vocabulary-type", po::value(&vocabType), + "The vocabulary implementation for strings in qlever, can be any of ... " + "(TODO joka)"); // Options for the index building process. 
add("stxxl-memory,m", po::value(&stxxlMemory), @@ -256,6 +262,11 @@ int main(int argc, char** argv) { if (parserBufferSize.has_value()) { index.parserBufferSize() = parserBufferSize.value(); } + /* + if (vocabType.has_value()) { + index.getImpl().setVocabularyTypeForIndexBuilding(vocabType.value()); + } + */ // If no text index name was specified, take the part of the wordsfile after // the last slash. diff --git a/src/index/IndexImpl.cpp b/src/index/IndexImpl.cpp index d5781bb297..40ffeb1115 100644 --- a/src/index/IndexImpl.cpp +++ b/src/index/IndexImpl.cpp @@ -341,6 +341,8 @@ void IndexImpl::createFromFiles( "The patterns can only be built when all 6 permutations are created"}; } + vocab_.resetToType(vocabularyTypeForIndexBuilding_); + readIndexBuilderSettingsFromFile(); updateInputFileSpecificationsAndLog(files, useParallelParser_); @@ -560,7 +562,7 @@ IndexBuilderDataAsStxxlVector IndexImpl::passFileForVocabulary( return (*cmp)(a, b, decltype(vocab_)::SortLevel::TOTAL); }; auto wordCallback = vocab_.makeWordWriter(onDiskBase_ + VOCAB_SUFFIX); - wordCallback.readableName() = "internal vocabulary"; + // wordCallback.readableName() = "internal vocabulary"; return ad_utility::vocabulary_merger::mergeVocabulary( onDiskBase_, numFiles, sortPred, wordCallback, memoryLimitIndexBuilding()); @@ -1132,6 +1134,12 @@ void IndexImpl::readConfiguration() { loadDataMember("num-triples", numTriples_, NumNormalAndInternal{}); loadDataMember("num-non-literals-text-index", nofNonLiteralsInTextIndex_, 0); + // TODO Comment and also write the configuration. + // The default value is the one the used to be the only. 
+ VocabularyEnum vocabType(VocabularyEnum::Enum::CompressedOnDisk); + loadDataMember("vocabulary-type", vocabType, vocabType); + vocab_.resetToType(vocabType); + // Initialize BlankNodeManager uint64_t numBlankNodesTotal; loadDataMember("num-blank-nodes-total", numBlankNodesTotal); diff --git a/src/index/IndexImpl.h b/src/index/IndexImpl.h index 8478943c92..ca35b52d86 100644 --- a/src/index/IndexImpl.h +++ b/src/index/IndexImpl.h @@ -192,6 +192,9 @@ class IndexImpl { std::optional idOfHasPatternDuringIndexBuilding_; std::optional idOfInternalGraphDuringIndexBuilding_; + VocabularyEnum vocabularyTypeForIndexBuilding_{ + VocabularyEnum::Enum::CompressedOnDisk}; + // BlankNodeManager, initialized during `readConfiguration` std::unique_ptr blankNodeManager_{nullptr}; @@ -275,6 +278,11 @@ class IndexImpl { return deltaTriples_.value(); } + void setVocabularyTypeForIndexBuilding(VocabularyEnum type) { + vocabularyTypeForIndexBuilding_ = type; + configurationJson_["vocabulary-type"] = type; + } + // -------------------------------------------------------------------------- // -- RETRIEVAL --- // -------------------------------------------------------------------------- diff --git a/src/index/Vocabulary.h b/src/index/Vocabulary.h index e3513c39d4..0f566cc138 100644 --- a/src/index/Vocabulary.h +++ b/src/index/Vocabulary.h @@ -27,6 +27,7 @@ #include "index/vocabulary/UnicodeVocabulary.h" #include "index/vocabulary/VocabularyInMemory.h" #include "index/vocabulary/VocabularyInternalExternal.h" +#include "index/vocabulary/VocabularyVariant.h" #include "util/Exception.h" #include "util/HashMap.h" #include "util/HashSet.h" @@ -233,6 +234,13 @@ class Vocabulary { const std::string& filename) const { return vocabulary_.getUnderlyingVocabulary().makeDiskWriter(filename); } + + // TODO Comment. 
+ void resetToType(VocabularyEnum type) { + if constexpr (std::is_same_v) { + vocabulary_.getUnderlyingVocabulary().resetToType(type); + } + } }; namespace detail { @@ -241,18 +249,24 @@ namespace detail { // and the compression of the vocab at compile time. NOTE: These change the // binary format of QLever's index, so changing them requires rebuilding of the // indices. +/* #ifdef _QLEVER_VOCAB_IN_MEMORY using VocabStorage = VocabularyInMemory; #else using VocabStorage = VocabularyInternalExternal; #endif +*/ +/* #ifndef _QLEVER_ENABLE_VOCAB_COMPRESSION using UnderlyingVocabRdfsVocabulary = VocabStorage; #else using UnderlyingVocabRdfsVocabulary = CompressedVocabulary; #endif +*/ +// TODO Change this place. +using UnderlyingVocabRdfsVocabulary = VocabularyVariant; using UnderlyingVocabTextVocabulary = VocabularyInMemory; } // namespace detail diff --git a/src/index/VocabularyOnDisk.h b/src/index/VocabularyOnDisk.h index f677ac3e7a..2b6455cda3 100644 --- a/src/index/VocabularyOnDisk.h +++ b/src/index/VocabularyOnDisk.h @@ -58,6 +58,8 @@ class VocabularyOnDisk : public VocabularyBinarySearchMixin { void finish(); // Destructor. Implicitly calls `finish` if it hasn't been called before. ~WordWriter(); + WordWriter(WordWriter&&) = default; + WordWriter& operator=(WordWriter&&) = default; }; /// Build from a vector of pairs of `(string, id)`. 
This requires the IDs to diff --git a/src/index/vocabulary/CMakeLists.txt b/src/index/vocabulary/CMakeLists.txt index bb2dfdd4a3..ff3138601e 100644 --- a/src/index/vocabulary/CMakeLists.txt +++ b/src/index/vocabulary/CMakeLists.txt @@ -1,2 +1,3 @@ -add_library(vocabulary VocabularyInMemory.h VocabularyInMemory.cpp VocabularyInMemoryBinSearch.cpp VocabularyInternalExternal.cpp) +add_library(vocabulary VocabularyInMemory.h VocabularyInMemory.cpp VocabularyInMemoryBinSearch.cpp VocabularyInternalExternal.cpp + VocabularyVariant.cpp) qlever_target_link_libraries(vocabulary) diff --git a/src/index/vocabulary/CompressedVocabulary.h b/src/index/vocabulary/CompressedVocabulary.h index dad9e84457..1eeda3599c 100644 --- a/src/index/vocabulary/CompressedVocabulary.h +++ b/src/index/vocabulary/CompressedVocabulary.h @@ -193,6 +193,10 @@ class CompressedVocabulary { delete; DiskWriterFromUncompressedWords& operator=( const DiskWriterFromUncompressedWords&) = delete; + DiskWriterFromUncompressedWords(DiskWriterFromUncompressedWords&&) = + default; + DiskWriterFromUncompressedWords& operator=( + DiskWriterFromUncompressedWords&&) = default; private: // Compress a complete block and write it to the underlying vocabulary. @@ -243,12 +247,20 @@ class CompressedVocabulary { using WordWriter = DiskWriterFromUncompressedWords; // Return a `DiskWriter` that can be used to create the vocabulary. - DiskWriterFromUncompressedWords makeDiskWriter( - const std::string& filename) const { + static DiskWriterFromUncompressedWords makeDiskWriter( + const std::string& filename) { return DiskWriterFromUncompressedWords{ absl::StrCat(filename, wordsSuffix), absl::StrCat(filename, decodersSuffix)}; } + + static std::unique_ptr makeDiskWriterPtr( + const std::string& filename) { + return std::make_unique( + absl::StrCat(filename, wordsSuffix), + absl::StrCat(filename, decodersSuffix)); + } + /// Initialize the vocabulary from the given `words`. // TODO This can be a generic Mixin... 
void build(const std::vector& words, diff --git a/src/index/vocabulary/UnicodeVocabulary.h b/src/index/vocabulary/UnicodeVocabulary.h index c215843c0f..73dc85556c 100644 --- a/src/index/vocabulary/UnicodeVocabulary.h +++ b/src/index/vocabulary/UnicodeVocabulary.h @@ -102,6 +102,12 @@ class UnicodeVocabulary { void close() { _underlyingVocabulary.close(); } void build(const std::vector& v, const std::string& filename) { - _underlyingVocabulary.build(v, filename); + // TODO This is really hacky, we should get rid of it and make the + // building consistent for all the vocabularies. + if constexpr (requires { _underlyingVocabulary.build(v, filename); }) { + _underlyingVocabulary.build(v, filename); + } else { + AD_FAIL(); + } } }; diff --git a/src/index/vocabulary/VocabularyInMemory.h b/src/index/vocabulary/VocabularyInMemory.h index ed498d1702..a2504ad265 100644 --- a/src/index/vocabulary/VocabularyInMemory.h +++ b/src/index/vocabulary/VocabularyInMemory.h @@ -82,14 +82,20 @@ class VocabularyInMemory // interface with the `VocabularyInternalExternal`. std::string readableNameDummy_; std::string& readableName() { return readableNameDummy_; } + WordWriter(WordWriter&&) = default; + WordWriter& operator=(WordWriter&&) = default; }; // Return a `WordWriter` that directly writes the words to the given // `filename`. The words are not materialized in RAM, but the vocabulary later - // has to be explicitly initizlied via `open(filename)`. - WordWriter makeDiskWriter(const std::string& filename) const { + // has to be explicitly initialized via `open(filename)`. + static WordWriter makeDiskWriter(const std::string& filename) { return WordWriter{filename}; } + static std::unique_ptr makeDiskWriterPtr( + const std::string& filename) { + return std::make_unique(filename); + } /// Clear the vocabulary. 
void close() { _words.clear(); } diff --git a/src/index/vocabulary/VocabularyInMemoryBinSearch.h b/src/index/vocabulary/VocabularyInMemoryBinSearch.h index 8367c1e965..df2314eb81 100644 --- a/src/index/vocabulary/VocabularyInMemoryBinSearch.h +++ b/src/index/vocabulary/VocabularyInMemoryBinSearch.h @@ -79,6 +79,9 @@ class VocabularyInMemoryBinSearch // Finish writing and dump all contents that still reside in buffers to // disk. void finish(); + + WordWriter(WordWriter&&) = default; + WordWriter& operator=(WordWriter&&) = default; }; // Clear the vocabulary. diff --git a/src/index/vocabulary/VocabularyInternalExternal.h b/src/index/vocabulary/VocabularyInternalExternal.h index d92510a49f..491381a88e 100644 --- a/src/index/vocabulary/VocabularyInternalExternal.h +++ b/src/index/vocabulary/VocabularyInternalExternal.h @@ -112,6 +112,9 @@ class VocabularyInternalExternal { // Finish writing. void finish(); + + WordWriter(WordWriter&&) = default; + WordWriter& operator=(WordWriter&&) = default; }; /// Clear the vocabulary. @@ -148,4 +151,13 @@ class VocabularyInternalExternal { return boundFunction(externalVocab_, word, comparator, boundFromInternalVocab.previousIndex(), upperBound); } + + public: + // TODO Clean up positions + static WordWriter makeDiskWriter(const std::string& filename) { + return WordWriter{filename}; + } + static auto makeDiskWriterPtr(const std::string& filename) { + return std::make_unique(filename); + } }; diff --git a/src/index/vocabulary/VocabularyVariant.cpp b/src/index/vocabulary/VocabularyVariant.cpp new file mode 100644 index 0000000000..f8dca2b45d --- /dev/null +++ b/src/index/vocabulary/VocabularyVariant.cpp @@ -0,0 +1,76 @@ +// Copyright 2025, University of Freiburg, +// Chair of Algorithms and Data Structures. 
+// Author: Johannes Kalmbach + +#include "index/vocabulary/VocabularyVariant.h" + +#include + +void VocabularyVariant::open(const std::string& filename) { + std::visit([&filename](auto& vocab) { vocab.open(filename); }, vocab_); +} + +void VocabularyVariant::open(const std::string& filename, VocabularyEnum type) { + resetToType(type); + open(filename); +} + +void VocabularyVariant::close() { + return std::visit([](auto& vocab) { return vocab.close(); }, vocab_); +} +size_t VocabularyVariant::size() const { + return std::visit([](auto& vocab) { return vocab.size(); }, vocab_); +} +std::string VocabularyVariant::operator[](uint64_t i) const { + return std::visit([i](auto& vocab) { return std::string{vocab[i]}; }, vocab_); +} + +VocabularyVariant::WordWriter::WordWriter(WordWriters writer) + : writer_(std::move(writer)) {} + +void VocabularyVariant::WordWriter::finish() { + std::visit([](auto& writer) { return writer->finish(); }, writer_); +} + +void VocabularyVariant::WordWriter::operator()(std::string_view word, + bool isExternal) { + std::visit( + [&word, isExternal](auto& writer) { return (*writer)(word, isExternal); }, + writer_); +} + +auto VocabularyVariant::makeDiskWriter(const std::string& filename) const + -> WordWriter { + return WordWriter{std::visit( + [&filename](auto& vocab) -> WordWriters { + return vocab.makeDiskWriterPtr(filename); + }, + vocab_)}; +} + +VocabularyVariant::WordWriter VocabularyVariant::makeDiskWriter( + const std::string& filename, VocabularyEnum type) { + VocabularyVariant dummyVocab; + dummyVocab.resetToType(type); + return dummyVocab.makeDiskWriter(filename); +} + +void VocabularyVariant::resetToType(VocabularyEnum type) { + close(); + switch (type.value()) { + case VocabularyEnum::Enum::InMemory: + vocab_.emplace(); + break; + case VocabularyEnum::Enum::OnDisk: + vocab_.emplace(); + break; + case VocabularyEnum::Enum::CompressedInMemory: + vocab_.emplace(); + break; + case VocabularyEnum::Enum::CompressedOnDisk: + 
vocab_.emplace(); + break; + default: + AD_FAIL(); + } +} diff --git a/src/index/vocabulary/VocabularyVariant.h b/src/index/vocabulary/VocabularyVariant.h new file mode 100644 index 0000000000..355fd58abd --- /dev/null +++ b/src/index/vocabulary/VocabularyVariant.h @@ -0,0 +1,143 @@ +// Copyright 2025, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Author: Johannes Kalmbach + +#pragma once +#include +#include + +#include + +#include "index/vocabulary/CompressedVocabulary.h" +#include "index/vocabulary/VocabularyInMemory.h" +#include "index/vocabulary/VocabularyInternalExternal.h" +#include "util/json.h" + +template +static constexpr auto getWordWriterTypes(const Variant& var) { + return std::apply( + [](const Vocab&...) { + return std::type_identity< + std::variant...>>{}; + }, + var); +} + +class VocabularyEnum { + public: + enum struct Enum { InMemory, OnDisk, CompressedInMemory, CompressedOnDisk }; + + private: + Enum value_ = Enum::InMemory; + + static constexpr std::array descriptions{ + "in-memory-uncompressed", "on-disk-uncompressed", "in-memory-compressed", + "on-disk-compressed"}; + + public: + VocabularyEnum() = default; + explicit VocabularyEnum(Enum value) : value_{value} {} + + static VocabularyEnum fromString(std::string_view description) { + auto it = ql::ranges::find(descriptions, description); + if (it == descriptions.end()) { + throw std::runtime_error{ + absl::StrCat("\"", description, + "\" is not a valid vocabulary type. The currently " + "supported vocabulary types are ", + absl::StrJoin(descriptions, ", "))}; + ; + } + return VocabularyEnum{static_cast(it - descriptions.begin())}; + } + std::string_view toString() const { + return descriptions.at(static_cast(value_)); + } + + Enum value() const { return value_; } + + // Conversion To JSON. + friend void to_json(nlohmann::json& j, const VocabularyEnum& vocabEnum) { + j = vocabEnum.toString(); + } + + // Conversion from JSON. 
+ friend void from_json(const nlohmann::json& j, VocabularyEnum& vocabEnum) { + vocabEnum = VocabularyEnum::fromString(static_cast(j)); + } +}; + +class VocabularyVariant { + private: + using InMemory = VocabularyInMemory; + using External = VocabularyInternalExternal; + using CompressedInMemory = CompressedVocabulary; + using CompressedExternal = CompressedVocabulary; + using Variant = + std::variant; + using Tuple = + std::tuple; + + Variant vocab_; + + public: + void resetToType(VocabularyEnum); + void open(const std::string& filename); + void open(const std::string& filename, VocabularyEnum type); + void close(); + size_t size() const; + std::string operator[](uint64_t i) const; + + template + WordAndIndex lower_bound(const String& word, Comp comp) const { + return std::visit( + [&word, &comp](auto& vocab) { + return vocab.lower_bound(word, std::move(comp)); + }, + vocab_); + } + + template + WordAndIndex lower_bound_iterator(const String& word, Comp comp) const { + return std::visit( + [&word, &comp](auto& vocab) { + return vocab.lower_bound_iterator(word, std::move(comp)); + }, + vocab_); + } + + template + WordAndIndex upper_bound(const String& word, Comp comp) const { + return std::visit( + [&word, &comp](auto& vocab) { + return vocab.upper_bound(word, std::move(comp)); + }, + vocab_); + } + + template + WordAndIndex upper_bound_iterator(const String& word, Comp comp) const { + return std::visit( + [&word, &comp](auto& vocab) { + return vocab.upper_bound_iterator(word, std::move(comp)); + }, + vocab_); + } + + using WordWriters = decltype(getWordWriterTypes(std::declval()))::type; + + class WordWriter { + WordWriters writer_; + + public: + explicit WordWriter(WordWriters); + + void finish(); + + void operator()(std::string_view word, bool isExternal); + }; + + WordWriter makeDiskWriter(const std::string& filename) const; + static WordWriter makeDiskWriter(const std::string& filename, + VocabularyEnum type); +}; diff --git a/src/util/ProgramOptionsHelpers.h 
b/src/util/ProgramOptionsHelpers.h index bd804504d3..6c25565287 100644 --- a/src/util/ProgramOptionsHelpers.h +++ b/src/util/ProgramOptionsHelpers.h @@ -11,6 +11,8 @@ #include "util/Concepts.h" #include "util/MemorySize/MemorySize.h" #include "util/Parameters.h" +// TODO only include the enum. +#include "index/vocabulary/VocabularyVariant.h" namespace ad_utility { // An implicit wrapper that can be implicitly converted to and from `size_t`. @@ -47,20 +49,6 @@ inline void validate(boost::any& v, const std::vector& values, v = NonNegative{boost::lexical_cast(s)}; } -// This function is required to use `std::optional` in -// `boost::program_options`. -template -void validate(boost::any& v, const std::vector& values, - std::optional*, int) { - // First parse as a T - T* dummy = nullptr; - validate(v, values, dummy, 0); - - // Wrap the T inside std::optional - AD_CONTRACT_CHECK(!v.empty()); - v = std::optional(boost::any_cast(v)); -} - // This function is required to use `MemorySize` in `boost::program_options`. inline void validate(boost::any& v, const std::vector& values, MemorySize*, int) { @@ -120,4 +108,40 @@ class ParameterToProgramOptionFactory { } // namespace ad_utility +// This function is required to use `VocabularyEnum` in +// `boost::program_options`. +inline void validate(boost::any& v, const std::vector& values, + VocabularyEnum*, int) { + using namespace boost::program_options; + + // Make sure no previous assignment to 'v' was made. + validators::check_first_occurrence(v); + // Extract the first string from 'values'. If there is more than + // one string, it's an error, and exception will be thrown. + const string& s = validators::get_single_string(values); + + // Convert the string to `MemorySize` and put it into the option. + v = VocabularyEnum::fromString(s); +} + +// This function is required to use `std::optional` in +// `boost::program_options`. 
+// TODO We should find a solution that doesn't require opening +// namespace `std`, for example we could put all types + this function into the +// `ad_utility`namespace. +namespace std { +template +void validate(boost::any& v, const std::vector& values, + std::optional*, int) { + // First parse as a T + T* dummy = nullptr; + // using namespace boost::program_options; + validate(v, values, dummy, 0); + + // Wrap the T inside std::optional + AD_CONTRACT_CHECK(!v.empty()); + v = std::optional(boost::any_cast(v)); +} +} // namespace std + #endif // QLEVER_PROGRAMOPTIONSHELPERS_H diff --git a/src/util/Serializer/SerializeVector.h b/src/util/Serializer/SerializeVector.h index 982e43e2ff..d093f4c02f 100644 --- a/src/util/Serializer/SerializeVector.h +++ b/src/util/Serializer/SerializeVector.h @@ -75,6 +75,9 @@ class VectorIncrementalSerializer { } ~VectorIncrementalSerializer() { finish(); } + VectorIncrementalSerializer(VectorIncrementalSerializer&&) = default; + VectorIncrementalSerializer& operator=(VectorIncrementalSerializer&&) = + default; }; } // namespace ad_utility::serialization From 6d11c3ba8d03532a6eba846599ec7696a614bdea Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Wed, 5 Feb 2025 11:41:29 +0100 Subject: [PATCH 14/25] This seems to work, but the IDE has crashed, so we just restart:) Signed-off-by: Johannes Kalmbach --- src/index/IndexBuilderMain.cpp | 2 +- src/index/IndexImpl.cpp | 4 +- src/index/IndexImpl.h | 6 +- src/index/Vocabulary.cpp | 12 +++- src/index/Vocabulary.h | 2 +- src/index/VocabularyOnDisk.h | 2 - src/index/vocabulary/CMakeLists.txt | 3 +- src/index/vocabulary/CompressedVocabulary.h | 17 +---- src/index/vocabulary/UnicodeVocabulary.h | 10 --- src/index/vocabulary/VocabularyInMemory.h | 21 +----- .../vocabulary/VocabularyInMemoryBinSearch.h | 3 - .../vocabulary/VocabularyInternalExternal.h | 21 +++--- src/index/vocabulary/VocabularyType.h | 56 +++++++++++++++ src/index/vocabulary/VocabularyVariant.cpp | 14 ++-- 
src/index/vocabulary/VocabularyVariant.h | 70 +++++-------------- src/util/ProgramOptionsHelpers.h | 37 +++++----- src/util/Serializer/SerializeVector.h | 3 - 17 files changed, 128 insertions(+), 155 deletions(-) create mode 100644 src/index/vocabulary/VocabularyType.h diff --git a/src/index/IndexBuilderMain.cpp b/src/index/IndexBuilderMain.cpp index 8877c2d01a..1583a9a14f 100644 --- a/src/index/IndexBuilderMain.cpp +++ b/src/index/IndexBuilderMain.cpp @@ -167,7 +167,7 @@ int main(int argc, char** argv) { bool addWordsFromLiterals = false; std::optional stxxlMemory; std::optional parserBufferSize; - std::optional vocabType; + std::optional vocabType; // VocabularyEnum vocabType; optind = 1; diff --git a/src/index/IndexImpl.cpp b/src/index/IndexImpl.cpp index 40ffeb1115..dd29e6d57a 100644 --- a/src/index/IndexImpl.cpp +++ b/src/index/IndexImpl.cpp @@ -562,7 +562,6 @@ IndexBuilderDataAsStxxlVector IndexImpl::passFileForVocabulary( return (*cmp)(a, b, decltype(vocab_)::SortLevel::TOTAL); }; auto wordCallback = vocab_.makeWordWriter(onDiskBase_ + VOCAB_SUFFIX); - // wordCallback.readableName() = "internal vocabulary"; return ad_utility::vocabulary_merger::mergeVocabulary( onDiskBase_, numFiles, sortPred, wordCallback, memoryLimitIndexBuilding()); @@ -1136,7 +1135,8 @@ void IndexImpl::readConfiguration() { // TODO Comment and also write the configuration. // The default value is the one the used to be the only. 
- VocabularyEnum vocabType(VocabularyEnum::Enum::CompressedOnDisk); + ad_utility::VocabularyEnum vocabType( + ad_utility::VocabularyEnum::Enum::CompressedOnDisk); loadDataMember("vocabulary-type", vocabType, vocabType); vocab_.resetToType(vocabType); diff --git a/src/index/IndexImpl.h b/src/index/IndexImpl.h index ca35b52d86..7c4a937fdb 100644 --- a/src/index/IndexImpl.h +++ b/src/index/IndexImpl.h @@ -192,8 +192,8 @@ class IndexImpl { std::optional idOfHasPatternDuringIndexBuilding_; std::optional idOfInternalGraphDuringIndexBuilding_; - VocabularyEnum vocabularyTypeForIndexBuilding_{ - VocabularyEnum::Enum::CompressedOnDisk}; + ad_utility::VocabularyEnum vocabularyTypeForIndexBuilding_{ + ad_utility::VocabularyEnum::Enum::CompressedOnDisk}; // BlankNodeManager, initialized during `readConfiguration` std::unique_ptr blankNodeManager_{nullptr}; @@ -278,7 +278,7 @@ class IndexImpl { return deltaTriples_.value(); } - void setVocabularyTypeForIndexBuilding(VocabularyEnum type) { + void setVocabularyTypeForIndexBuilding(ad_utility::VocabularyEnum type) { vocabularyTypeForIndexBuilding_ = type; configurationJson_["vocabulary-type"] = type; } diff --git a/src/index/Vocabulary.cpp b/src/index/Vocabulary.cpp index 70e9f0c50e..80c61cc0ea 100644 --- a/src/index/Vocabulary.cpp +++ b/src/index/Vocabulary.cpp @@ -63,7 +63,17 @@ void Vocabulary::createFromSet( return getCaseComparator()(a, b, SortLevel::TOTAL); }; std::sort(begin(words), end(words), totalComparison); - vocabulary_.build(words, filename); + auto writer = makeWordWriter(filename); + auto writeWords = [&writer](std::string_view word) { + // All words are stored in the internal vocab (this is consistent with the + // previous behavior). NOTE: This function is currently only used for the + // text index and for few unit tests, where we don't have an external + // vocabulary anyway. 
+ writer(word, false); + }; + ql::ranges::for_each(words, writeWords); + writer.finish(); + vocabulary_.open(filename); LOG(DEBUG) << "END Vocabulary::createFromSet" << std::endl; } diff --git a/src/index/Vocabulary.h b/src/index/Vocabulary.h index 0f566cc138..898233e284 100644 --- a/src/index/Vocabulary.h +++ b/src/index/Vocabulary.h @@ -236,7 +236,7 @@ class Vocabulary { } // TODO Comment. - void resetToType(VocabularyEnum type) { + void resetToType(ad_utility::VocabularyEnum type) { if constexpr (std::is_same_v) { vocabulary_.getUnderlyingVocabulary().resetToType(type); } diff --git a/src/index/VocabularyOnDisk.h b/src/index/VocabularyOnDisk.h index 2b6455cda3..f677ac3e7a 100644 --- a/src/index/VocabularyOnDisk.h +++ b/src/index/VocabularyOnDisk.h @@ -58,8 +58,6 @@ class VocabularyOnDisk : public VocabularyBinarySearchMixin { void finish(); // Destructor. Implicitly calls `finish` if it hasn't been called before. ~WordWriter(); - WordWriter(WordWriter&&) = default; - WordWriter& operator=(WordWriter&&) = default; }; /// Build from a vector of pairs of `(string, id)`. 
This requires the IDs to diff --git a/src/index/vocabulary/CMakeLists.txt b/src/index/vocabulary/CMakeLists.txt index ff3138601e..151f8ec18c 100644 --- a/src/index/vocabulary/CMakeLists.txt +++ b/src/index/vocabulary/CMakeLists.txt @@ -1,3 +1,4 @@ add_library(vocabulary VocabularyInMemory.h VocabularyInMemory.cpp VocabularyInMemoryBinSearch.cpp VocabularyInternalExternal.cpp - VocabularyVariant.cpp) + VocabularyVariant.cpp + VocabularyType.h) qlever_target_link_libraries(vocabulary) diff --git a/src/index/vocabulary/CompressedVocabulary.h b/src/index/vocabulary/CompressedVocabulary.h index 1eeda3599c..7f685750d4 100644 --- a/src/index/vocabulary/CompressedVocabulary.h +++ b/src/index/vocabulary/CompressedVocabulary.h @@ -193,10 +193,6 @@ class CompressedVocabulary { delete; DiskWriterFromUncompressedWords& operator=( const DiskWriterFromUncompressedWords&) = delete; - DiskWriterFromUncompressedWords(DiskWriterFromUncompressedWords&&) = - default; - DiskWriterFromUncompressedWords& operator=( - DiskWriterFromUncompressedWords&&) = default; private: // Compress a complete block and write it to the underlying vocabulary. @@ -254,6 +250,7 @@ class CompressedVocabulary { absl::StrCat(filename, decodersSuffix)}; } + // Return a `unique_ptr`. static std::unique_ptr makeDiskWriterPtr( const std::string& filename) { return std::make_unique( @@ -261,18 +258,6 @@ class CompressedVocabulary { absl::StrCat(filename, decodersSuffix)); } - /// Initialize the vocabulary from the given `words`. - // TODO This can be a generic Mixin... - void build(const std::vector& words, - const std::string& filename) { - WordWriter writer = makeDiskWriter(filename); - for (const auto& word : words) { - writer(word); - } - writer.finish(); - open(filename); - } - // Access to the underlying vocabulary. 
UnderlyingVocabulary& getUnderlyingVocabulary() { return underlyingVocabulary_; diff --git a/src/index/vocabulary/UnicodeVocabulary.h b/src/index/vocabulary/UnicodeVocabulary.h index 73dc85556c..66aaaf0d67 100644 --- a/src/index/vocabulary/UnicodeVocabulary.h +++ b/src/index/vocabulary/UnicodeVocabulary.h @@ -100,14 +100,4 @@ class UnicodeVocabulary { const UnicodeComparator& getComparator() const { return _comparator; } void close() { _underlyingVocabulary.close(); } - - void build(const std::vector& v, const std::string& filename) { - // TODO This is really hacky, we should get rid of it and make the - // building consistent for all the vocabularies. - if constexpr (requires { _underlyingVocabulary.build(v, filename); }) { - _underlyingVocabulary.build(v, filename); - } else { - AD_FAIL(); - } - } }; diff --git a/src/index/vocabulary/VocabularyInMemory.h b/src/index/vocabulary/VocabularyInMemory.h index a2504ad265..6d68e2a6f6 100644 --- a/src/index/vocabulary/VocabularyInMemory.h +++ b/src/index/vocabulary/VocabularyInMemory.h @@ -77,13 +77,6 @@ class VocabularyInMemory } void finish() { writer_.finish(); } - - // The `readableName()` function is only there to have a consistent - // interface with the `VocabularyInternalExternal`. - std::string readableNameDummy_; - std::string& readableName() { return readableNameDummy_; } - WordWriter(WordWriter&&) = default; - WordWriter& operator=(WordWriter&&) = default; }; // Return a `WordWriter` that directly writes the words to the given @@ -92,6 +85,9 @@ class VocabularyInMemory static WordWriter makeDiskWriter(const std::string& filename) { return WordWriter{filename}; } + + // Same as `makeDiskWriter` above, but the result is returned via + // `unique_ptr`. static std::unique_ptr makeDiskWriterPtr( const std::string& filename) { return std::make_unique(filename); @@ -100,17 +96,6 @@ class VocabularyInMemory /// Clear the vocabulary. 
void close() { _words.clear(); } - /// Initialize the vocabulary from the given `words`. - void build(const std::vector& words, - const std::string& filename) { - WordWriter writer = makeDiskWriter(filename); - for (const auto& word : words) { - writer(word); - } - writer.finish(); - open(filename); - } - // Const access to the underlying words. auto begin() const { return _words.begin(); } auto end() const { return _words.end(); } diff --git a/src/index/vocabulary/VocabularyInMemoryBinSearch.h b/src/index/vocabulary/VocabularyInMemoryBinSearch.h index df2314eb81..8367c1e965 100644 --- a/src/index/vocabulary/VocabularyInMemoryBinSearch.h +++ b/src/index/vocabulary/VocabularyInMemoryBinSearch.h @@ -79,9 +79,6 @@ class VocabularyInMemoryBinSearch // Finish writing and dump all contents that still reside in buffers to // disk. void finish(); - - WordWriter(WordWriter&&) = default; - WordWriter& operator=(WordWriter&&) = default; }; // Clear the vocabulary. diff --git a/src/index/vocabulary/VocabularyInternalExternal.h b/src/index/vocabulary/VocabularyInternalExternal.h index 491381a88e..897b29258d 100644 --- a/src/index/vocabulary/VocabularyInternalExternal.h +++ b/src/index/vocabulary/VocabularyInternalExternal.h @@ -112,11 +112,17 @@ class VocabularyInternalExternal { // Finish writing. void finish(); - - WordWriter(WordWriter&&) = default; - WordWriter& operator=(WordWriter&&) = default; }; + // Return a `WordWriter` or (in the second function) a + // `unique_ptr` for the given filename. + static WordWriter makeDiskWriter(const std::string& filename) { + return WordWriter{filename}; + } + static auto makeDiskWriterPtr(const std::string& filename) { + return std::make_unique(filename); + } + /// Clear the vocabulary. 
void close() { internalVocab_.close(); } @@ -151,13 +157,4 @@ class VocabularyInternalExternal { return boundFunction(externalVocab_, word, comparator, boundFromInternalVocab.previousIndex(), upperBound); } - - public: - // TODO Clean up positions - static WordWriter makeDiskWriter(const std::string& filename) { - return WordWriter{filename}; - } - static auto makeDiskWriterPtr(const std::string& filename) { - return std::make_unique(filename); - } }; diff --git a/src/index/vocabulary/VocabularyType.h b/src/index/vocabulary/VocabularyType.h new file mode 100644 index 0000000000..21474023a3 --- /dev/null +++ b/src/index/vocabulary/VocabularyType.h @@ -0,0 +1,56 @@ +// Copyright 2025, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Author: Johannes Kalmbach + +#pragma once + +#include +#include + +#include "util/json.h" + +namespace ad_utility { +class VocabularyEnum { + public: + enum struct Enum { InMemory, OnDisk, CompressedInMemory, CompressedOnDisk }; + + private: + Enum value_ = Enum::InMemory; + + static constexpr std::array descriptions{ + "in-memory-uncompressed", "on-disk-uncompressed", "in-memory-compressed", + "on-disk-compressed"}; + + public: + VocabularyEnum() = default; + explicit VocabularyEnum(Enum value) : value_{value} {} + + static VocabularyEnum fromString(std::string_view description) { + auto it = ql::ranges::find(descriptions, description); + if (it == descriptions.end()) { + throw std::runtime_error{ + absl::StrCat("\"", description, + "\" is not a valid vocabulary type. The currently " + "supported vocabulary types are ", + absl::StrJoin(descriptions, ", "))}; + ; + } + return VocabularyEnum{static_cast(it - descriptions.begin())}; + } + std::string_view toString() const { + return descriptions.at(static_cast(value_)); + } + + Enum value() const { return value_; } + + // Conversion To JSON. 
+ friend void to_json(nlohmann::json& j, const VocabularyEnum& vocabEnum) { + j = vocabEnum.toString(); + } + + // Conversion from JSON. + friend void from_json(const nlohmann::json& j, VocabularyEnum& vocabEnum) { + vocabEnum = VocabularyEnum::fromString(static_cast(j)); + } +}; +} // namespace ad_utility diff --git a/src/index/vocabulary/VocabularyVariant.cpp b/src/index/vocabulary/VocabularyVariant.cpp index f8dca2b45d..504591116e 100644 --- a/src/index/vocabulary/VocabularyVariant.cpp +++ b/src/index/vocabulary/VocabularyVariant.cpp @@ -10,7 +10,7 @@ void VocabularyVariant::open(const std::string& filename) { std::visit([&filename](auto& vocab) { vocab.open(filename); }, vocab_); } -void VocabularyVariant::open(const std::string& filename, VocabularyEnum type) { +void VocabularyVariant::open(const std::string& filename, VocabularyType type) { resetToType(type); open(filename); } @@ -49,25 +49,25 @@ auto VocabularyVariant::makeDiskWriter(const std::string& filename) const } VocabularyVariant::WordWriter VocabularyVariant::makeDiskWriter( - const std::string& filename, VocabularyEnum type) { + const std::string& filename, VocabularyType type) { VocabularyVariant dummyVocab; dummyVocab.resetToType(type); return dummyVocab.makeDiskWriter(filename); } -void VocabularyVariant::resetToType(VocabularyEnum type) { +void VocabularyVariant::resetToType(VocabularyType type) { close(); switch (type.value()) { - case VocabularyEnum::Enum::InMemory: + case VocabularyType::Enum::InMemory: vocab_.emplace(); break; - case VocabularyEnum::Enum::OnDisk: + case VocabularyType::Enum::OnDisk: vocab_.emplace(); break; - case VocabularyEnum::Enum::CompressedInMemory: + case VocabularyType::Enum::CompressedInMemory: vocab_.emplace(); break; - case VocabularyEnum::Enum::CompressedOnDisk: + case VocabularyType::Enum::CompressedOnDisk: vocab_.emplace(); break; default: diff --git a/src/index/vocabulary/VocabularyVariant.h b/src/index/vocabulary/VocabularyVariant.h index 
355fd58abd..7ec162890d 100644 --- a/src/index/vocabulary/VocabularyVariant.h +++ b/src/index/vocabulary/VocabularyVariant.h @@ -11,63 +11,24 @@ #include "index/vocabulary/CompressedVocabulary.h" #include "index/vocabulary/VocabularyInMemory.h" #include "index/vocabulary/VocabularyInternalExternal.h" +#include "index/vocabulary/VocabularyType.h" #include "util/json.h" -template -static constexpr auto getWordWriterTypes(const Variant& var) { - return std::apply( - [](const Vocab&...) { - return std::type_identity< - std::variant...>>{}; - }, - var); -} - -class VocabularyEnum { - public: - enum struct Enum { InMemory, OnDisk, CompressedInMemory, CompressedOnDisk }; - - private: - Enum value_ = Enum::InMemory; - - static constexpr std::array descriptions{ - "in-memory-uncompressed", "on-disk-uncompressed", "in-memory-compressed", - "on-disk-compressed"}; +namespace polymorphic_vocabulary::detail { - public: - VocabularyEnum() = default; - explicit VocabularyEnum(Enum value) : value_{value} {} - - static VocabularyEnum fromString(std::string_view description) { - auto it = ql::ranges::find(descriptions, description); - if (it == descriptions.end()) { - throw std::runtime_error{ - absl::StrCat("\"", description, - "\" is not a valid vocabulary type. The currently " - "supported vocabulary types are ", - absl::StrJoin(descriptions, ", "))}; - ; - } - return VocabularyEnum{static_cast(it - descriptions.begin())}; - } - std::string_view toString() const { - return descriptions.at(static_cast(value_)); - } +template +struct WriterPointers {}; - Enum value() const { return value_; } - - // Conversion To JSON. - friend void to_json(nlohmann::json& j, const VocabularyEnum& vocabEnum) { - j = vocabEnum.toString(); - } - - // Conversion from JSON. 
- friend void from_json(const nlohmann::json& j, VocabularyEnum& vocabEnum) { - vocabEnum = VocabularyEnum::fromString(static_cast(j)); - } +template +struct WriterPointers> { + using type = std::variant...>; }; +} // namespace polymorphic_vocabulary::detail class VocabularyVariant { + public: + using VocabularyType = ad_utility::VocabularyEnum; + private: using InMemory = VocabularyInMemory; using External = VocabularyInternalExternal; @@ -81,9 +42,9 @@ class VocabularyVariant { Variant vocab_; public: - void resetToType(VocabularyEnum); + void resetToType(VocabularyType); void open(const std::string& filename); - void open(const std::string& filename, VocabularyEnum type); + void open(const std::string& filename, VocabularyType type); void close(); size_t size() const; std::string operator[](uint64_t i) const; @@ -124,7 +85,8 @@ class VocabularyVariant { vocab_); } - using WordWriters = decltype(getWordWriterTypes(std::declval()))::type; + using WordWriters = + polymorphic_vocabulary::detail::WriterPointers::type; class WordWriter { WordWriters writer_; @@ -139,5 +101,5 @@ class VocabularyVariant { WordWriter makeDiskWriter(const std::string& filename) const; static WordWriter makeDiskWriter(const std::string& filename, - VocabularyEnum type); + VocabularyType type); }; diff --git a/src/util/ProgramOptionsHelpers.h b/src/util/ProgramOptionsHelpers.h index 6c25565287..0d3ede6a1a 100644 --- a/src/util/ProgramOptionsHelpers.h +++ b/src/util/ProgramOptionsHelpers.h @@ -49,6 +49,21 @@ inline void validate(boost::any& v, const std::vector& values, v = NonNegative{boost::lexical_cast(s)}; } +// This function is required to use `std::optional` in +// `boost::program_options`. 
+template +void validate(boost::any& v, const std::vector& values, + std::optional*, int) { + // First parse as a T + T* dummy = nullptr; + // using namespace boost::program_options; + validate(v, values, dummy, 0); + + // Wrap the T inside std::optional + AD_CONTRACT_CHECK(!v.empty()); + v = std::optional(boost::any_cast(v)); +} + // This function is required to use `MemorySize` in `boost::program_options`. inline void validate(boost::any& v, const std::vector& values, MemorySize*, int) { @@ -106,8 +121,6 @@ class ParameterToProgramOptionFactory { } }; -} // namespace ad_utility - // This function is required to use `VocabularyEnum` in // `boost::program_options`. inline void validate(boost::any& v, const std::vector& values, @@ -124,24 +137,6 @@ inline void validate(boost::any& v, const std::vector& values, v = VocabularyEnum::fromString(s); } -// This function is required to use `std::optional` in -// `boost::program_options`. -// TODO We should find a solution that doesn't require opening -// namespace `std`, for example we could put all types + this function into the -// `ad_utility`namespace. 
-namespace std { -template -void validate(boost::any& v, const std::vector& values, - std::optional*, int) { - // First parse as a T - T* dummy = nullptr; - // using namespace boost::program_options; - validate(v, values, dummy, 0); - - // Wrap the T inside std::optional - AD_CONTRACT_CHECK(!v.empty()); - v = std::optional(boost::any_cast(v)); -} -} // namespace std +} // namespace ad_utility #endif // QLEVER_PROGRAMOPTIONSHELPERS_H diff --git a/src/util/Serializer/SerializeVector.h b/src/util/Serializer/SerializeVector.h index d093f4c02f..982e43e2ff 100644 --- a/src/util/Serializer/SerializeVector.h +++ b/src/util/Serializer/SerializeVector.h @@ -75,9 +75,6 @@ class VectorIncrementalSerializer { } ~VectorIncrementalSerializer() { finish(); } - VectorIncrementalSerializer(VectorIncrementalSerializer&&) = default; - VectorIncrementalSerializer& operator=(VectorIncrementalSerializer&&) = - default; }; } // namespace ad_utility::serialization From 3e7f49476c5f344bd44a5a447a8ee20653c4adcf Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Wed, 5 Feb 2025 18:53:41 +0100 Subject: [PATCH 15/25] Several refactorings. 
Signed-off-by: Johannes Kalmbach --- src/index/IndexBuilderMain.cpp | 2 +- src/index/IndexImpl.cpp | 4 +- src/index/IndexImpl.h | 6 +- src/index/Vocabulary.h | 8 +- src/index/vocabulary/CMakeLists.txt | 2 +- .../vocabulary/PolymorphicVocabulary.cpp | 90 +++++++++++++++++++ ...ularyVariant.h => PolymorphicVocabulary.h} | 53 +++++++++-- src/index/vocabulary/VocabularyType.h | 16 ++-- src/index/vocabulary/VocabularyVariant.cpp | 76 ---------------- src/util/ProgramOptionsHelpers.h | 7 +- 10 files changed, 158 insertions(+), 106 deletions(-) create mode 100644 src/index/vocabulary/PolymorphicVocabulary.cpp rename src/index/vocabulary/{VocabularyVariant.h => PolymorphicVocabulary.h} (56%) delete mode 100644 src/index/vocabulary/VocabularyVariant.cpp diff --git a/src/index/IndexBuilderMain.cpp b/src/index/IndexBuilderMain.cpp index 1583a9a14f..c75fd5d427 100644 --- a/src/index/IndexBuilderMain.cpp +++ b/src/index/IndexBuilderMain.cpp @@ -167,7 +167,7 @@ int main(int argc, char** argv) { bool addWordsFromLiterals = false; std::optional stxxlMemory; std::optional parserBufferSize; - std::optional vocabType; + std::optional vocabType; // VocabularyEnum vocabType; optind = 1; diff --git a/src/index/IndexImpl.cpp b/src/index/IndexImpl.cpp index dd29e6d57a..3ad2e997ec 100644 --- a/src/index/IndexImpl.cpp +++ b/src/index/IndexImpl.cpp @@ -1135,8 +1135,8 @@ void IndexImpl::readConfiguration() { // TODO Comment and also write the configuration. // The default value is the one the used to be the only. 
- ad_utility::VocabularyEnum vocabType( - ad_utility::VocabularyEnum::Enum::CompressedOnDisk); + ad_utility::VocabularyType vocabType( + ad_utility::VocabularyType::Enum::CompressedOnDisk); loadDataMember("vocabulary-type", vocabType, vocabType); vocab_.resetToType(vocabType); diff --git a/src/index/IndexImpl.h b/src/index/IndexImpl.h index 7c4a937fdb..aaa6d0a1f1 100644 --- a/src/index/IndexImpl.h +++ b/src/index/IndexImpl.h @@ -192,8 +192,8 @@ class IndexImpl { std::optional idOfHasPatternDuringIndexBuilding_; std::optional idOfInternalGraphDuringIndexBuilding_; - ad_utility::VocabularyEnum vocabularyTypeForIndexBuilding_{ - ad_utility::VocabularyEnum::Enum::CompressedOnDisk}; + ad_utility::VocabularyType vocabularyTypeForIndexBuilding_{ + ad_utility::VocabularyType::Enum::CompressedOnDisk}; // BlankNodeManager, initialized during `readConfiguration` std::unique_ptr blankNodeManager_{nullptr}; @@ -278,7 +278,7 @@ class IndexImpl { return deltaTriples_.value(); } - void setVocabularyTypeForIndexBuilding(ad_utility::VocabularyEnum type) { + void setVocabularyTypeForIndexBuilding(ad_utility::VocabularyType type) { vocabularyTypeForIndexBuilding_ = type; configurationJson_["vocabulary-type"] = type; } diff --git a/src/index/Vocabulary.h b/src/index/Vocabulary.h index 898233e284..46af9c8c56 100644 --- a/src/index/Vocabulary.h +++ b/src/index/Vocabulary.h @@ -24,10 +24,10 @@ #include "index/StringSortComparator.h" #include "index/VocabularyOnDisk.h" #include "index/vocabulary/CompressedVocabulary.h" +#include "index/vocabulary/PolymorphicVocabulary.h" #include "index/vocabulary/UnicodeVocabulary.h" #include "index/vocabulary/VocabularyInMemory.h" #include "index/vocabulary/VocabularyInternalExternal.h" -#include "index/vocabulary/VocabularyVariant.h" #include "util/Exception.h" #include "util/HashMap.h" #include "util/HashSet.h" @@ -236,8 +236,8 @@ class Vocabulary { } // TODO Comment. 
- void resetToType(ad_utility::VocabularyEnum type) { - if constexpr (std::is_same_v) { + void resetToType(ad_utility::VocabularyType type) { + if constexpr (std::is_same_v) { vocabulary_.getUnderlyingVocabulary().resetToType(type); } } @@ -266,7 +266,7 @@ using UnderlyingVocabRdfsVocabulary = CompressedVocabulary; */ // TODO Change this place. -using UnderlyingVocabRdfsVocabulary = VocabularyVariant; +using UnderlyingVocabRdfsVocabulary = PolymorphicVocabulary; using UnderlyingVocabTextVocabulary = VocabularyInMemory; } // namespace detail diff --git a/src/index/vocabulary/CMakeLists.txt b/src/index/vocabulary/CMakeLists.txt index 151f8ec18c..910ad61c3a 100644 --- a/src/index/vocabulary/CMakeLists.txt +++ b/src/index/vocabulary/CMakeLists.txt @@ -1,4 +1,4 @@ add_library(vocabulary VocabularyInMemory.h VocabularyInMemory.cpp VocabularyInMemoryBinSearch.cpp VocabularyInternalExternal.cpp - VocabularyVariant.cpp + PolymorphicVocabulary.cpp VocabularyType.h) qlever_target_link_libraries(vocabulary) diff --git a/src/index/vocabulary/PolymorphicVocabulary.cpp b/src/index/vocabulary/PolymorphicVocabulary.cpp new file mode 100644 index 0000000000..1b9936afee --- /dev/null +++ b/src/index/vocabulary/PolymorphicVocabulary.cpp @@ -0,0 +1,90 @@ +// Copyright 2025, University of Freiburg, +// Chair of Algorithms and Data Structures. 
+// Author: Johannes Kalmbach + +#include "index/vocabulary/PolymorphicVocabulary.h" + +#include + +// _____________________________________________________________________________ +void PolymorphicVocabulary::open(const std::string& filename) { + std::visit([&filename](auto& vocab) { vocab.open(filename); }, vocab_); +} + +// _____________________________________________________________________________ +void PolymorphicVocabulary::open(const std::string& filename, + VocabularyType type) { + resetToType(type); + open(filename); +} + +// _____________________________________________________________________________ +void PolymorphicVocabulary::close() { + std::visit([](auto& vocab) { return vocab.close(); }, vocab_); +} + +// _____________________________________________________________________________ +size_t PolymorphicVocabulary::size() const { + return std::visit([](auto& vocab) { return vocab.size(); }, vocab_); +} + +// _____________________________________________________________________________ +std::string PolymorphicVocabulary::operator[](uint64_t i) const { + return std::visit([i](auto& vocab) { return std::string{vocab[i]}; }, vocab_); +} + +// _____________________________________________________________________________ +PolymorphicVocabulary::WordWriter::WordWriter(WordWriters writer) + : writer_(std::move(writer)) {} + +// _____________________________________________________________________________ +void PolymorphicVocabulary::WordWriter::finish() { + std::visit([](auto& writer) { return writer->finish(); }, writer_); +} + +// _____________________________________________________________________________ +void PolymorphicVocabulary::WordWriter::operator()(std::string_view word, + bool isExternal) { + std::visit( + [&word, isExternal](auto& writer) { return (*writer)(word, isExternal); }, + writer_); +} + +// _____________________________________________________________________________ +auto PolymorphicVocabulary::makeDiskWriter(const std::string& 
filename) const + -> WordWriter { + return WordWriter{std::visit( + [&filename](auto& vocab) -> WordWriters { + return vocab.makeDiskWriterPtr(filename); + }, + vocab_)}; +} + +// _____________________________________________________________________________ +PolymorphicVocabulary::WordWriter PolymorphicVocabulary::makeDiskWriter( + const std::string& filename, VocabularyType type) { + PolymorphicVocabulary dummyVocab; + dummyVocab.resetToType(type); + return dummyVocab.makeDiskWriter(filename); +} + +// _____________________________________________________________________________ +void PolymorphicVocabulary::resetToType(VocabularyType type) { + close(); + switch (type.value()) { + case VocabularyType::Enum::InMemory: + vocab_.emplace(); + break; + case VocabularyType::Enum::OnDisk: + vocab_.emplace(); + break; + case VocabularyType::Enum::CompressedInMemory: + vocab_.emplace(); + break; + case VocabularyType::Enum::CompressedOnDisk: + vocab_.emplace(); + break; + default: + AD_FAIL(); + } +} diff --git a/src/index/vocabulary/VocabularyVariant.h b/src/index/vocabulary/PolymorphicVocabulary.h similarity index 56% rename from src/index/vocabulary/VocabularyVariant.h rename to src/index/vocabulary/PolymorphicVocabulary.h index 7ec162890d..4a18e57465 100644 --- a/src/index/vocabulary/VocabularyVariant.h +++ b/src/index/vocabulary/PolymorphicVocabulary.h @@ -16,6 +16,10 @@ namespace polymorphic_vocabulary::detail { +// For `T = std::variant = +// std::variant, +// unique_ptr, ...>`. This is used in the implementation +// of the `PolymorphicVocabulary` below. template struct WriterPointers {}; @@ -25,30 +29,51 @@ struct WriterPointers> { }; } // namespace polymorphic_vocabulary::detail -class VocabularyVariant { +// A vocabulary that can at runtime choose between different vocabulary +// implementations. The only restriction is, that a vocabulary can only be read +// from disk with the same implementation that it was written to. 
+class PolymorphicVocabulary { public: - using VocabularyType = ad_utility::VocabularyEnum; + using VocabularyType = ad_utility::VocabularyType; private: + // Type aliases for all the currently supported vocabularies. If another + // vocabulary is added, don't forget to also register it in the + // `VocabularyType` enum. using InMemory = VocabularyInMemory; using External = VocabularyInternalExternal; using CompressedInMemory = CompressedVocabulary; using CompressedExternal = CompressedVocabulary; using Variant = std::variant; - using Tuple = - std::tuple; + // In this variant we store the actual vocabulary. Variant vocab_; public: - void resetToType(VocabularyType); - void open(const std::string& filename); + // Read a vocabulary with the given `type` from the file with the `filename`. + // A vocabulary with the corresponding `type` must have been previously + // written to that file. void open(const std::string& filename, VocabularyType type); + + // Close the vocabulary if it is open, and set the underlying vocabulary + // implementation according to the `type` without opening the vocabulary. + void resetToType(VocabularyType type); + + // Same as the overload of `open` above, but expects that the correct + // `VocabularyType` has already been set via `resetToType` above. + void open(const std::string& filename); + + // Close the vocabulary s.t. it consumes no more RAM. void close(); + + // Return the total number of words in the vocabulary. size_t size() const; + + // Return the `i`-the word, throw of `i` is out of bounds. std::string operator[](uint64_t i) const; + // Same as `std::lower_bound`, return the smallest entry >= `word`. template WordAndIndex lower_bound(const String& word, Comp comp) const { return std::visit( @@ -58,6 +83,8 @@ class VocabularyVariant { vocab_); } + // Same as `lower_bound` above, but the comparator compares a `word` and an + // `iterator` instead of two words. 
template WordAndIndex lower_bound_iterator(const String& word, Comp comp) const { return std::visit( @@ -67,6 +94,7 @@ class VocabularyVariant { vocab_); } + // Analogous to `lower_bound` (see above). template WordAndIndex upper_bound(const String& word, Comp comp) const { return std::visit( @@ -76,6 +104,7 @@ class VocabularyVariant { vocab_); } + // Analogous to `lower_bound_iterator` (see above). template WordAndIndex upper_bound_iterator(const String& word, Comp comp) const { return std::visit( @@ -88,18 +117,28 @@ class VocabularyVariant { using WordWriters = polymorphic_vocabulary::detail::WriterPointers::type; + // The `WordWriter` is used to write a vocabulary to disk word by word (in + // sorted order). class WordWriter { WordWriters writer_; public: + // Constructor, used by the `makeDiskWriter` functions below. explicit WordWriter(WordWriters); + // This function has to be called after the last word has been written. void finish(); + // Write the next word to the vocabulary. void operator()(std::string_view word, bool isExternal); }; - WordWriter makeDiskWriter(const std::string& filename) const; + // Create a `WordWriter` that will create a vocabulary with the given `type` + // at the given `filename`. static WordWriter makeDiskWriter(const std::string& filename, VocabularyType type); + + // Same as above, but the `VocabularyType` is the currently active type of + // `this`. 
+ WordWriter makeDiskWriter(const std::string& filename) const; }; diff --git a/src/index/vocabulary/VocabularyType.h b/src/index/vocabulary/VocabularyType.h index 21474023a3..4e65a481df 100644 --- a/src/index/vocabulary/VocabularyType.h +++ b/src/index/vocabulary/VocabularyType.h @@ -10,7 +10,7 @@ #include "util/json.h" namespace ad_utility { -class VocabularyEnum { +class VocabularyType { public: enum struct Enum { InMemory, OnDisk, CompressedInMemory, CompressedOnDisk }; @@ -22,10 +22,10 @@ class VocabularyEnum { "on-disk-compressed"}; public: - VocabularyEnum() = default; - explicit VocabularyEnum(Enum value) : value_{value} {} + VocabularyType() = default; + explicit VocabularyType(Enum value) : value_{value} {} - static VocabularyEnum fromString(std::string_view description) { + static VocabularyType fromString(std::string_view description) { auto it = ql::ranges::find(descriptions, description); if (it == descriptions.end()) { throw std::runtime_error{ @@ -35,7 +35,7 @@ class VocabularyEnum { absl::StrJoin(descriptions, ", "))}; ; } - return VocabularyEnum{static_cast(it - descriptions.begin())}; + return VocabularyType{static_cast(it - descriptions.begin())}; } std::string_view toString() const { return descriptions.at(static_cast(value_)); @@ -44,13 +44,13 @@ class VocabularyEnum { Enum value() const { return value_; } // Conversion To JSON. - friend void to_json(nlohmann::json& j, const VocabularyEnum& vocabEnum) { + friend void to_json(nlohmann::json& j, const VocabularyType& vocabEnum) { j = vocabEnum.toString(); } // Conversion from JSON. 
- friend void from_json(const nlohmann::json& j, VocabularyEnum& vocabEnum) { - vocabEnum = VocabularyEnum::fromString(static_cast(j)); + friend void from_json(const nlohmann::json& j, VocabularyType& vocabEnum) { + vocabEnum = VocabularyType::fromString(static_cast(j)); } }; } // namespace ad_utility diff --git a/src/index/vocabulary/VocabularyVariant.cpp b/src/index/vocabulary/VocabularyVariant.cpp deleted file mode 100644 index 504591116e..0000000000 --- a/src/index/vocabulary/VocabularyVariant.cpp +++ /dev/null @@ -1,76 +0,0 @@ -// Copyright 2025, University of Freiburg, -// Chair of Algorithms and Data Structures. -// Author: Johannes Kalmbach - -#include "index/vocabulary/VocabularyVariant.h" - -#include - -void VocabularyVariant::open(const std::string& filename) { - std::visit([&filename](auto& vocab) { vocab.open(filename); }, vocab_); -} - -void VocabularyVariant::open(const std::string& filename, VocabularyType type) { - resetToType(type); - open(filename); -} - -void VocabularyVariant::close() { - return std::visit([](auto& vocab) { return vocab.close(); }, vocab_); -} -size_t VocabularyVariant::size() const { - return std::visit([](auto& vocab) { return vocab.size(); }, vocab_); -} -std::string VocabularyVariant::operator[](uint64_t i) const { - return std::visit([i](auto& vocab) { return std::string{vocab[i]}; }, vocab_); -} - -VocabularyVariant::WordWriter::WordWriter(WordWriters writer) - : writer_(std::move(writer)) {} - -void VocabularyVariant::WordWriter::finish() { - std::visit([](auto& writer) { return writer->finish(); }, writer_); -} - -void VocabularyVariant::WordWriter::operator()(std::string_view word, - bool isExternal) { - std::visit( - [&word, isExternal](auto& writer) { return (*writer)(word, isExternal); }, - writer_); -} - -auto VocabularyVariant::makeDiskWriter(const std::string& filename) const - -> WordWriter { - return WordWriter{std::visit( - [&filename](auto& vocab) -> WordWriters { - return vocab.makeDiskWriterPtr(filename); - 
}, - vocab_)}; -} - -VocabularyVariant::WordWriter VocabularyVariant::makeDiskWriter( - const std::string& filename, VocabularyType type) { - VocabularyVariant dummyVocab; - dummyVocab.resetToType(type); - return dummyVocab.makeDiskWriter(filename); -} - -void VocabularyVariant::resetToType(VocabularyType type) { - close(); - switch (type.value()) { - case VocabularyType::Enum::InMemory: - vocab_.emplace(); - break; - case VocabularyType::Enum::OnDisk: - vocab_.emplace(); - break; - case VocabularyType::Enum::CompressedInMemory: - vocab_.emplace(); - break; - case VocabularyType::Enum::CompressedOnDisk: - vocab_.emplace(); - break; - default: - AD_FAIL(); - } -} diff --git a/src/util/ProgramOptionsHelpers.h b/src/util/ProgramOptionsHelpers.h index 0d3ede6a1a..a86a850c35 100644 --- a/src/util/ProgramOptionsHelpers.h +++ b/src/util/ProgramOptionsHelpers.h @@ -8,11 +8,10 @@ #include #include +#include "index/vocabulary/VocabularyType.h" #include "util/Concepts.h" #include "util/MemorySize/MemorySize.h" #include "util/Parameters.h" -// TODO only include the enum. -#include "index/vocabulary/VocabularyVariant.h" namespace ad_utility { // An implicit wrapper that can be implicitly converted to and from `size_t`. @@ -124,7 +123,7 @@ class ParameterToProgramOptionFactory { // This function is required to use `VocabularyEnum` in // `boost::program_options`. inline void validate(boost::any& v, const std::vector& values, - VocabularyEnum*, int) { + VocabularyType*, int) { using namespace boost::program_options; // Make sure no previous assignment to 'v' was made. @@ -134,7 +133,7 @@ inline void validate(boost::any& v, const std::vector& values, const string& s = validators::get_single_string(values); // Convert the string to `MemorySize` and put it into the option. 
- v = VocabularyEnum::fromString(s); + v = VocabularyType::fromString(s); } } // namespace ad_utility From 825f8bfb754ef2e83e1f6aed374207aa9b331d35 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Wed, 5 Feb 2025 19:09:04 +0100 Subject: [PATCH 16/25] Some additional fixes and comments. Signed-off-by: Johannes Kalmbach --- src/global/Pattern.h | 3 --- src/index/IndexBuilderMain.cpp | 9 ++++----- src/index/IndexImpl.cpp | 2 -- src/index/IndexImpl.h | 2 ++ src/index/vocabulary/VocabularyType.h | 7 +++++-- 5 files changed, 11 insertions(+), 12 deletions(-) diff --git a/src/global/Pattern.h b/src/global/Pattern.h index 9c37eb39ce..1005add22d 100644 --- a/src/global/Pattern.h +++ b/src/global/Pattern.h @@ -194,9 +194,6 @@ struct CompactStringVectorWriter { commonInitialization(); } - CompactStringVectorWriter(CompactStringVectorWriter&&) = default; - CompactStringVectorWriter& operator=(CompactStringVectorWriter&&) = default; - void push(const data_type* data, size_t elementSize) { AD_CONTRACT_CHECK(!_finished); _offsets.push_back(_nextOffset); diff --git a/src/index/IndexBuilderMain.cpp b/src/index/IndexBuilderMain.cpp index c75fd5d427..29b11eae9b 100644 --- a/src/index/IndexBuilderMain.cpp +++ b/src/index/IndexBuilderMain.cpp @@ -168,7 +168,6 @@ int main(int argc, char** argv) { std::optional stxxlMemory; std::optional parserBufferSize; std::optional vocabType; - // VocabularyEnum vocabType; optind = 1; Index index{ad_utility::makeUnlimitedAllocator()}; @@ -228,8 +227,9 @@ int main(int argc, char** argv) { "Only build the PSO and POS permutations. This is faster, but then " "queries with predicate variables are not supported"); add("vocabulary-type", po::value(&vocabType), - "The vocabulary implementation for strings in qlever, can be any of ... " - "(TODO joka)"); + absl::StrCat( + "The vocabulary implementation for strings in qlever, can be any of ", + ad_utility::VocabularyType::getListOfSupportedValues())); // Options for the index building process. 
add("stxxl-memory,m", po::value(&stxxlMemory), @@ -262,11 +262,10 @@ int main(int argc, char** argv) { if (parserBufferSize.has_value()) { index.parserBufferSize() = parserBufferSize.value(); } - /* + if (vocabType.has_value()) { index.getImpl().setVocabularyTypeForIndexBuilding(vocabType.value()); } - */ // If no text index name was specified, take the part of the wordsfile after // the last slash. diff --git a/src/index/IndexImpl.cpp b/src/index/IndexImpl.cpp index 3ad2e997ec..9d8f89c19f 100644 --- a/src/index/IndexImpl.cpp +++ b/src/index/IndexImpl.cpp @@ -1133,8 +1133,6 @@ void IndexImpl::readConfiguration() { loadDataMember("num-triples", numTriples_, NumNormalAndInternal{}); loadDataMember("num-non-literals-text-index", nofNonLiteralsInTextIndex_, 0); - // TODO Comment and also write the configuration. - // The default value is the one the used to be the only. ad_utility::VocabularyType vocabType( ad_utility::VocabularyType::Enum::CompressedOnDisk); loadDataMember("vocabulary-type", vocabType, vocabType); diff --git a/src/index/IndexImpl.h b/src/index/IndexImpl.h index aaa6d0a1f1..a8828f2236 100644 --- a/src/index/IndexImpl.h +++ b/src/index/IndexImpl.h @@ -192,6 +192,8 @@ class IndexImpl { std::optional idOfHasPatternDuringIndexBuilding_; std::optional idOfInternalGraphDuringIndexBuilding_; + // The vocabulary type that is used (only relevant during index building). + // The default is chosen s.t. the compatibility to old index builds. ad_utility::VocabularyType vocabularyTypeForIndexBuilding_{ ad_utility::VocabularyType::Enum::CompressedOnDisk}; diff --git a/src/index/vocabulary/VocabularyType.h b/src/index/vocabulary/VocabularyType.h index 4e65a481df..a6b0eacfb4 100644 --- a/src/index/vocabulary/VocabularyType.h +++ b/src/index/vocabulary/VocabularyType.h @@ -32,11 +32,14 @@ class VocabularyType { absl::StrCat("\"", description, "\" is not a valid vocabulary type. 
The currently " "supported vocabulary types are ", - absl::StrJoin(descriptions, ", "))}; - ; + getListOfSupportedValues())}; } return VocabularyType{static_cast(it - descriptions.begin())}; } + + static std::string getListOfSupportedValues() { + return absl::StrJoin(descriptions, ", "); + } std::string_view toString() const { return descriptions.at(static_cast(value_)); } From 066ddf62c50de6add776499d08f1e643750d3a71 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Wed, 5 Feb 2025 19:29:04 +0100 Subject: [PATCH 17/25] Refactoring there and back again. Signed-off-by: Johannes Kalmbach --- src/index/IndexImpl.h | 3 ++- src/index/Vocabulary.h | 18 +----------------- 2 files changed, 3 insertions(+), 18 deletions(-) diff --git a/src/index/IndexImpl.h b/src/index/IndexImpl.h index a8828f2236..f3aba12cbb 100644 --- a/src/index/IndexImpl.h +++ b/src/index/IndexImpl.h @@ -193,7 +193,6 @@ class IndexImpl { std::optional idOfInternalGraphDuringIndexBuilding_; // The vocabulary type that is used (only relevant during index building). - // The default is chosen s.t. the compatibility to old index builds. ad_utility::VocabularyType vocabularyTypeForIndexBuilding_{ ad_utility::VocabularyType::Enum::CompressedOnDisk}; @@ -280,6 +279,8 @@ class IndexImpl { return deltaTriples_.value(); } + // See the documentation of the `vocabularyTypeForIndexBuilding_` member for + // details. 
void setVocabularyTypeForIndexBuilding(ad_utility::VocabularyType type) { vocabularyTypeForIndexBuilding_ = type; configurationJson_["vocabulary-type"] = type; diff --git a/src/index/Vocabulary.h b/src/index/Vocabulary.h index 46af9c8c56..7587275118 100644 --- a/src/index/Vocabulary.h +++ b/src/index/Vocabulary.h @@ -20,7 +20,6 @@ #include "global/Constants.h" #include "global/Id.h" #include "global/Pattern.h" -#include "index/CompressedString.h" #include "index/StringSortComparator.h" #include "index/VocabularyOnDisk.h" #include "index/vocabulary/CompressedVocabulary.h" @@ -60,9 +59,7 @@ inline std::ostream& operator<<(std::ostream& stream, } // A vocabulary. Wraps a vector of strings and provides additional methods for -// retrieval. Template parameters that are supported are: -// std::string -> no compression is applied -// CompressedString -> prefix compression is applied +// retrieval. template class Vocabulary { @@ -105,19 +102,6 @@ class Vocabulary { vector internalizedLangs_; vector externalizedPrefixes_{""}; - // using UnderlyingVocabulary = VocabularyInMemory; - /* - using UnderlyingVocabulary = - std::conditional_t, - VocabularyInMemory>; - */ - /* - using UnderlyingVocabulary = - std::conditional_t, - VocabularyInMemory>; - */ using VocabularyWithUnicodeComparator = UnicodeVocabulary; From b9948ff68f580224c79bfe1cb8c590da3cbce99e Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Wed, 5 Feb 2025 19:37:16 +0100 Subject: [PATCH 18/25] Fix compilation. 
Signed-off-by: Johannes Kalmbach --- src/global/Pattern.h | 3 +++ src/index/IndexBuilderMain.cpp | 8 ++++---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/global/Pattern.h b/src/global/Pattern.h index 1005add22d..28ca2a9c0e 100644 --- a/src/global/Pattern.h +++ b/src/global/Pattern.h @@ -227,6 +227,9 @@ struct CompactStringVectorWriter { } } + CompactStringVectorWriter(CompactStringVectorWriter&&) = default; + CompactStringVectorWriter& operator=(CompactStringVectorWriter&&) = default; + private: // Has to be run by all the constructors void commonInitialization() { diff --git a/src/index/IndexBuilderMain.cpp b/src/index/IndexBuilderMain.cpp index 29b11eae9b..034e76050d 100644 --- a/src/index/IndexBuilderMain.cpp +++ b/src/index/IndexBuilderMain.cpp @@ -226,10 +226,10 @@ int main(int argc, char** argv) { add("only-pso-and-pos-permutations,o", po::bool_switch(&onlyPsoAndPos), "Only build the PSO and POS permutations. This is faster, but then " "queries with predicate variables are not supported"); - add("vocabulary-type", po::value(&vocabType), - absl::StrCat( - "The vocabulary implementation for strings in qlever, can be any of ", - ad_utility::VocabularyType::getListOfSupportedValues())); + auto msg = absl::StrCat( + "The vocabulary implementation for strings in qlever, can be any of ", + ad_utility::VocabularyType::getListOfSupportedValues()); + add("vocabulary-type", po::value(&vocabType), msg.c_str()); // Options for the index building process. add("stxxl-memory,m", po::value(&stxxlMemory), From b1b884e4fb482b030b6b9b24ed600f332412dbfd Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Thu, 6 Feb 2025 12:15:00 +0100 Subject: [PATCH 19/25] Feed this to the tools... 
Signed-off-by: Johannes Kalmbach --- CMakeLists.txt | 11 ++--- src/global/Pattern.h | 5 ++- src/index/CMakeLists.txt | 2 +- src/index/Vocabulary.h | 37 ++++++--------- src/index/vocabulary/CMakeLists.txt | 4 +- src/index/vocabulary/PolymorphicVocabulary.h | 21 --------- .../vocabulary/VocabularyInternalExternal.h | 2 +- .../{ => vocabulary}/VocabularyOnDisk.cpp | 2 +- src/index/{ => vocabulary}/VocabularyOnDisk.h | 0 src/index/vocabulary/VocabularyType.h | 45 ++++++++++++++++--- src/util/File.h | 6 ++- src/util/ProgramOptionsHelpers.h | 2 +- test/CMakeLists.txt | 2 +- test/StringSortComparatorTest.cpp | 5 +++ test/index/vocabulary/CMakeLists.txt | 14 +++--- .../vocabulary/CompressedVocabularyTest.cpp | 2 +- .../vocabulary/PolymorphicVocabularyTest.cpp | 42 +++++++++++++++++ .../VocabularyInternalExternalTest.cpp | 2 +- .../index/vocabulary/VocabularyOnDiskTest.cpp | 2 +- test/index/vocabulary/VocabularyTypeTest.cpp | 36 +++++++++++++++ test/util/IndexTestHelpers.cpp | 2 + 21 files changed, 169 insertions(+), 75 deletions(-) rename src/index/{ => vocabulary}/VocabularyOnDisk.cpp (98%) rename src/index/{ => vocabulary}/VocabularyOnDisk.h (100%) create mode 100644 test/index/vocabulary/PolymorphicVocabularyTest.cpp create mode 100644 test/index/vocabulary/VocabularyTypeTest.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 67b2feb62b..9402201159 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -203,14 +203,9 @@ if (${USE_CPP_17_BACKPORTS}) add_definitions("-DQLEVER_CPP_17 -DCPP_CXX_CONCEPTS=0") endif() -set(VOCAB_IN_MEMORY OFF CACHE BOOL "Store QLever's vocabulary completely in RAM") -if (${VOCAB_IN_MEMORY}) - add_definitions("-D_QLEVER_VOCAB_IN_MEMORY") -endif () - -set(ENABLE_VOCAB_COMPRESSION ON CACHE BOOL "Compress the vocabulary") -if (${ENABLE_VOCAB_COMPRESSION}) - add_definitions("-D_QLEVER_ENABLE_VOCAB_COMPRESSION") +set(VOCAB_UNCOMPRESSED_IN_MEMORY OFF CACHE BOOL "Store QLever's vocabulary uncompressed and completely in RAM") +if 
(${VOCAB_UNCOMPRESSED_IN_MEMORY}) + add_definitions("-D_QLEVER_VOCAB_UNCOMPRESSED_IN_MEMORY") endif () # Enable the specification of additional linker flags manually from the commandline diff --git a/src/global/Pattern.h b/src/global/Pattern.h index 28ca2a9c0e..9178e5d640 100644 --- a/src/global/Pattern.h +++ b/src/global/Pattern.h @@ -17,6 +17,7 @@ #include "util/File.h" #include "util/Generator.h" #include "util/Iterators.h" +#include "util/ResetWhenMoved.h" #include "util/Serializer/FileSerializer.h" #include "util/Serializer/SerializeVector.h" #include "util/TypeTraits.h" @@ -181,7 +182,9 @@ struct CompactStringVectorWriter { off_t _startOfFile; using offset_type = typename CompactVectorOfStrings::offset_type; std::vector _offsets; - bool _finished = false; + // A `CompactStringVectorWriter` that has been moved from may not call + // `finish()` any more in its destructor. + ad_utility::ResetWhenMoved _finished = false; offset_type _nextOffset = 0; explicit CompactStringVectorWriter(const std::string& filename) diff --git a/src/index/CMakeLists.txt b/src/index/CMakeLists.txt index 4a226bdfdd..e421a03e55 100644 --- a/src/index/CMakeLists.txt +++ b/src/index/CMakeLists.txt @@ -1,7 +1,7 @@ add_subdirectory(vocabulary) add_library(index Index.cpp IndexImpl.cpp IndexImpl.Text.cpp - Vocabulary.cpp VocabularyOnDisk.cpp + Vocabulary.cpp LocatedTriples.cpp Permutation.cpp TextMetaData.cpp DocsDB.cpp FTSAlgorithms.cpp PrefixHeuristic.cpp CompressedRelation.cpp diff --git a/src/index/Vocabulary.h b/src/index/Vocabulary.h index 7587275118..eecf3b832a 100644 --- a/src/index/Vocabulary.h +++ b/src/index/Vocabulary.h @@ -21,12 +21,9 @@ #include "global/Id.h" #include "global/Pattern.h" #include "index/StringSortComparator.h" -#include "index/VocabularyOnDisk.h" -#include "index/vocabulary/CompressedVocabulary.h" #include "index/vocabulary/PolymorphicVocabulary.h" #include "index/vocabulary/UnicodeVocabulary.h" #include "index/vocabulary/VocabularyInMemory.h" -#include 
"index/vocabulary/VocabularyInternalExternal.h" #include "util/Exception.h" #include "util/HashMap.h" #include "util/HashSet.h" @@ -216,10 +213,16 @@ class Vocabulary { // vocabulary. UnderlyingVocabulary::WordWriter makeWordWriter( const std::string& filename) const { + // Note: In GCC this triggers a move construction of the created + // `DiskWriter`, although mandatory copy elision should kick in here + // according to our understanding (and does in clang). We could investigate + // whether this is a bug in GCC or whether we are missing something. return vocabulary_.getUnderlyingVocabulary().makeDiskWriter(filename); } - // TODO Comment. + // If the `UnderlyingVocabulary` is a `PolymorphicVocabulary`, close the + // vocabulary and set the type of the vocabulary according to the `type` + // argument (see the `PolymorphicVocabulary` class for details). void resetToType(ad_utility::VocabularyType type) { if constexpr (std::is_same_v) { vocabulary_.getUnderlyingVocabulary().resetToType(type); @@ -228,29 +231,17 @@ class Vocabulary { }; namespace detail { -// The two mactors `_QLEVER_VOCAB_IN_MEMORY` and -// `_QLEVER_ENABLE_VOCAB_COMPRESSION` can be used to disable the external vocab -// and the compression of the vocab at compile time. NOTE: These change the -// binary format of QLever's index, so changing them requires rebuilding of the -// indices. -/* -#ifdef _QLEVER_VOCAB_IN_MEMORY -using VocabStorage = VocabularyInMemory; -#else -using VocabStorage = VocabularyInternalExternal; -#endif -*/ +// Thecompile-time definitions `_QLEVER_VOCAB_UNCOMPRESSED_IN_MEMORY` can be +// used to disable the external vocab and the compression of the vocab at +// compile time. NOTE: These change the binary format of QLever's index, so +// changing them requires rebuilding of the indices. 
-/* -#ifndef _QLEVER_ENABLE_VOCAB_COMPRESSION -using UnderlyingVocabRdfsVocabulary = VocabStorage; +#ifdef _QLEVER_VOCAB_UNCOMPRESSED_IN_MEMORY +using UnderlyingVocabRdfsVocabulary = VocabularyInMemory; #else -using UnderlyingVocabRdfsVocabulary = CompressedVocabulary; +using UnderlyingVocabRdfsVocabulary = PolymorphicVocabulary; #endif -*/ -// TODO Change this place. -using UnderlyingVocabRdfsVocabulary = PolymorphicVocabulary; using UnderlyingVocabTextVocabulary = VocabularyInMemory; } // namespace detail diff --git a/src/index/vocabulary/CMakeLists.txt b/src/index/vocabulary/CMakeLists.txt index 910ad61c3a..ce746097da 100644 --- a/src/index/vocabulary/CMakeLists.txt +++ b/src/index/vocabulary/CMakeLists.txt @@ -1,4 +1,4 @@ add_library(vocabulary VocabularyInMemory.h VocabularyInMemory.cpp VocabularyInMemoryBinSearch.cpp VocabularyInternalExternal.cpp - PolymorphicVocabulary.cpp - VocabularyType.h) + PolymorphicVocabulary.cpp VocabularyOnDisk.cpp + ) qlever_target_link_libraries(vocabulary) diff --git a/src/index/vocabulary/PolymorphicVocabulary.h b/src/index/vocabulary/PolymorphicVocabulary.h index 4a18e57465..02fa12b962 100644 --- a/src/index/vocabulary/PolymorphicVocabulary.h +++ b/src/index/vocabulary/PolymorphicVocabulary.h @@ -83,17 +83,6 @@ class PolymorphicVocabulary { vocab_); } - // Same as `lower_bound` above, but the comparator compares a `word` and an - // `iterator` instead of two words. - template - WordAndIndex lower_bound_iterator(const String& word, Comp comp) const { - return std::visit( - [&word, &comp](auto& vocab) { - return vocab.lower_bound_iterator(word, std::move(comp)); - }, - vocab_); - } - // Analogous to `lower_bound` (see above). template WordAndIndex upper_bound(const String& word, Comp comp) const { @@ -104,16 +93,6 @@ class PolymorphicVocabulary { vocab_); } - // Analogous to `lower_bound_iterator` (see above). 
- template - WordAndIndex upper_bound_iterator(const String& word, Comp comp) const { - return std::visit( - [&word, &comp](auto& vocab) { - return vocab.upper_bound_iterator(word, std::move(comp)); - }, - vocab_); - } - using WordWriters = polymorphic_vocabulary::detail::WriterPointers::type; diff --git a/src/index/vocabulary/VocabularyInternalExternal.h b/src/index/vocabulary/VocabularyInternalExternal.h index 897b29258d..209820c604 100644 --- a/src/index/vocabulary/VocabularyInternalExternal.h +++ b/src/index/vocabulary/VocabularyInternalExternal.h @@ -8,8 +8,8 @@ #include #include -#include "index/VocabularyOnDisk.h" #include "index/vocabulary/VocabularyInMemoryBinSearch.h" +#include "index/vocabulary/VocabularyOnDisk.h" #include "index/vocabulary/VocabularyTypes.h" #include "util/Exception.h" diff --git a/src/index/VocabularyOnDisk.cpp b/src/index/vocabulary/VocabularyOnDisk.cpp similarity index 98% rename from src/index/VocabularyOnDisk.cpp rename to src/index/vocabulary/VocabularyOnDisk.cpp index 251130be26..1dc53e8453 100644 --- a/src/index/VocabularyOnDisk.cpp +++ b/src/index/vocabulary/VocabularyOnDisk.cpp @@ -2,7 +2,7 @@ // Chair of Algorithms and Data Structures. // Author: Johannes Kalmbach -#include "index/VocabularyOnDisk.h" +#include "index/vocabulary/VocabularyOnDisk.h" #include diff --git a/src/index/VocabularyOnDisk.h b/src/index/vocabulary/VocabularyOnDisk.h similarity index 100% rename from src/index/VocabularyOnDisk.h rename to src/index/vocabulary/VocabularyOnDisk.h diff --git a/src/index/vocabulary/VocabularyType.h b/src/index/vocabulary/VocabularyType.h index a6b0eacfb4..62036a495e 100644 --- a/src/index/vocabulary/VocabularyType.h +++ b/src/index/vocabulary/VocabularyType.h @@ -7,45 +7,72 @@ #include #include +#include "util/Random.h" #include "util/json.h" namespace ad_utility { + +// A lightweight enum for the different implementation strategies of the +// `PolymorphicVocabulary`. 
Also includes operations for conversion to and from +// string. +// TODO Implement a generic mixin that can also be used for other +// enums, especially such used in command-line interfaces. class VocabularyType { public: + // The different vocabulary implementations; enum struct Enum { InMemory, OnDisk, CompressedInMemory, CompressedOnDisk }; private: Enum value_ = Enum::InMemory; - static constexpr std::array descriptions{ + static constexpr size_t numValues_ = 4; + // All possible values. + static constexpr std::array all_{ + Enum::InMemory, Enum::OnDisk, Enum::CompressedInMemory, + Enum::CompressedOnDisk}; + + // The string representations of the enum values. + static constexpr std::array descriptions_{ "in-memory-uncompressed", "on-disk-uncompressed", "in-memory-compressed", "on-disk-compressed"}; + static_assert(all_.size() == descriptions_.size()); + public: + // Constructors VocabularyType() = default; explicit VocabularyType(Enum value) : value_{value} {} + // Create from a string. The string must be one of the `descriptions_`, + // otherwise a `runtime_error_` is thrown. static VocabularyType fromString(std::string_view description) { - auto it = ql::ranges::find(descriptions, description); - if (it == descriptions.end()) { + auto it = ql::ranges::find(descriptions_, description); + if (it == descriptions_.end()) { throw std::runtime_error{ absl::StrCat("\"", description, "\" is not a valid vocabulary type. The currently " "supported vocabulary types are ", getListOfSupportedValues())}; } - return VocabularyType{static_cast(it - descriptions.begin())}; + return VocabularyType{all().at(it - descriptions_.begin())}; } + // Return all the possible enum values as a comma-separated single string. static std::string getListOfSupportedValues() { - return absl::StrJoin(descriptions, ", "); + return absl::StrJoin(descriptions_, ", "); } + + // Convert the enum to the corresponding string. 
std::string_view toString() const { - return descriptions.at(static_cast(value_)); + return descriptions_.at(static_cast(value_)); } + // Return the actual enum value. Enum value() const { return value_; } + // Return a list of all the enum values. + static constexpr const std::array& all() { return all_; } + // Conversion To JSON. friend void to_json(nlohmann::json& j, const VocabularyType& vocabEnum) { j = vocabEnum.toString(); @@ -55,5 +82,11 @@ class VocabularyType { friend void from_json(const nlohmann::json& j, VocabularyType& vocabEnum) { vocabEnum = VocabularyType::fromString(static_cast(j)); } + + // Get a random value, useful for fuzz testing. + static VocabularyType random() { + ad_utility::FastRandomIntGenerator r; + return VocabularyType{static_cast(r() % numValues_)}; + } }; } // namespace ad_utility diff --git a/src/util/File.h b/src/util/File.h index cde77a4aaf..782e266380 100644 --- a/src/util/File.h +++ b/src/util/File.h @@ -52,6 +52,10 @@ class File { open(filename, mode); } + // Files are move-only types. 
+ File(const File&) = delete; + File& operator=(const File&) = delete; + File& operator=(File&& rhs) noexcept { if (isOpen()) { close(); @@ -63,7 +67,7 @@ class File { return *this; } - File(File&& rhs) : name_{std::move(rhs.name_)}, file_{rhs.file_} { + File(File&& rhs) noexcept : name_{std::move(rhs.name_)}, file_{rhs.file_} { rhs.file_ = nullptr; } diff --git a/src/util/ProgramOptionsHelpers.h b/src/util/ProgramOptionsHelpers.h index a86a850c35..b395768f50 100644 --- a/src/util/ProgramOptionsHelpers.h +++ b/src/util/ProgramOptionsHelpers.h @@ -55,7 +55,7 @@ void validate(boost::any& v, const std::vector& values, std::optional*, int) { // First parse as a T T* dummy = nullptr; - // using namespace boost::program_options; + using namespace boost::program_options; validate(v, values, dummy, 0); // Wrap the T inside std::optional diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 3a04b9d201..994b4ea9ae 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -201,7 +201,7 @@ addLinkAndDiscoverTest(BatchedPipelineTest) addLinkAndDiscoverTest(TupleHelpersTest) -addLinkAndDiscoverTest(StringSortComparatorTest) +addLinkAndDiscoverTestNoLibs(StringSortComparatorTest) addLinkAndDiscoverTest(PriorityQueueTest) diff --git a/test/StringSortComparatorTest.cpp b/test/StringSortComparatorTest.cpp index ade2178ae0..b6143ec70f 100644 --- a/test/StringSortComparatorTest.cpp +++ b/test/StringSortComparatorTest.cpp @@ -125,6 +125,11 @@ TEST(StringSortComparatorTest, TripleComponentComparatorTotal) { auto bSplit = comparator.extractAndTransformComparable( b, TripleComponentComparator::Level::TOTAL); EXPECT_EQ(ab, comp(aSplit, bSplit)); + EXPECT_EQ(ab, comp(a, bSplit)); + EXPECT_EQ(ab, comp(aSplit, b)); + + EXPECT_EQ(ba, comp(b, aSplit)); + EXPECT_EQ(ba, comp(bSplit, a)); EXPECT_EQ(ba, comp(bSplit, aSplit)); }; diff --git a/test/index/vocabulary/CMakeLists.txt b/test/index/vocabulary/CMakeLists.txt index 3b4499a751..2db01bd594 100644 --- 
a/test/index/vocabulary/CMakeLists.txt +++ b/test/index/vocabulary/CMakeLists.txt @@ -1,11 +1,15 @@ -addLinkAndDiscoverTest(VocabularyInMemoryTest vocabulary) +addLinkAndDiscoverTestNoLibs(VocabularyInMemoryTest vocabulary) -addLinkAndDiscoverTest(VocabularyOnDiskTest index) +addLinkAndDiscoverTestNoLibs(VocabularyOnDiskTest index) addLinkAndDiscoverTest(CompressedVocabularyTest vocabulary) -addLinkAndDiscoverTest(UnicodeVocabularyTest vocabulary) +addLinkAndDiscoverTestNoLibs(UnicodeVocabularyTest vocabulary) -addLinkAndDiscoverTest(VocabularyInternalExternalTest vocabulary) +addLinkAndDiscoverTestNoLibs(VocabularyInternalExternalTest vocabulary) -addLinkAndDiscoverTest(VocabularyInMemoryBinSearchTest vocabulary) +addLinkAndDiscoverTestNoLibs(VocabularyInMemoryBinSearchTest vocabulary) + +addLinkAndDiscoverTestNoLibs(PolymorphicVocabularyTest vocabulary) + +addLinkAndDiscoverTestNoLibs(VocabularyTypeTest) diff --git a/test/index/vocabulary/CompressedVocabularyTest.cpp b/test/index/vocabulary/CompressedVocabularyTest.cpp index a1a445e213..8a6f39d2bb 100644 --- a/test/index/vocabulary/CompressedVocabularyTest.cpp +++ b/test/index/vocabulary/CompressedVocabularyTest.cpp @@ -6,10 +6,10 @@ #include "VocabularyTestHelpers.h" #include "backports/algorithm.h" -#include "index/VocabularyOnDisk.h" #include "index/vocabulary/CompressedVocabulary.h" #include "index/vocabulary/PrefixCompressor.h" #include "index/vocabulary/VocabularyInMemory.h" +#include "index/vocabulary/VocabularyOnDisk.h" namespace { diff --git a/test/index/vocabulary/PolymorphicVocabularyTest.cpp b/test/index/vocabulary/PolymorphicVocabularyTest.cpp new file mode 100644 index 0000000000..fc01104d4c --- /dev/null +++ b/test/index/vocabulary/PolymorphicVocabularyTest.cpp @@ -0,0 +1,42 @@ +// Copyright 2025, University of Freiburg, +// Chair of Algorithms and Data Structures. 
+// Author: Johannes Kalmbach + +#include + +#include "index/vocabulary/PolymorphicVocabulary.h" + +using ad_utility::VocabularyType; + +namespace { +void testForVocabType(VocabularyType::Enum vocabType) { + VocabularyType type{vocabType}; + std::string filename = + absl::StrCat("polymorphicVocabularyTest.", type.toString(), ".vocab"); + + auto writer = PolymorphicVocabulary::makeDiskWriter(filename, type); + writer("alpha", false); + writer("beta", true); + writer("gamma", false); + writer.finish(); + + PolymorphicVocabulary vocab; + vocab.open(filename, type); + EXPECT_EQ(vocab.size(), 3); + + EXPECT_EQ(vocab[0], "alpha"); + EXPECT_EQ(vocab[1], "beta"); + EXPECT_EQ(vocab[2], "gamma"); + + auto wI = vocab.lower_bound("alx", ql::ranges::less{}); + EXPECT_EQ(wI.index(), 1); + EXPECT_EQ(wI.word(), "beta"); + + wI = vocab.upper_bound("gamma", ql::ranges::less{}); + EXPECT_TRUE(wI.isEnd()); +} +} // namespace + +TEST(PolymorphicVocabulary, basicTests) { + ql::ranges::for_each(VocabularyType::all(), &testForVocabType); +} diff --git a/test/index/vocabulary/VocabularyInternalExternalTest.cpp b/test/index/vocabulary/VocabularyInternalExternalTest.cpp index 6c41dc415a..08ef9164dc 100644 --- a/test/index/vocabulary/VocabularyInternalExternalTest.cpp +++ b/test/index/vocabulary/VocabularyInternalExternalTest.cpp @@ -34,7 +34,7 @@ class VocabularyCreator { auto createVocabularyImpl(const std::vector& words) { VocabularyInternalExternal vocabulary; { - auto writer = VocabularyInternalExternal::WordWriter(vocabFilename_); + auto writer = VocabularyInternalExternal::makeDiskWriter(vocabFilename_); size_t i = 0; for (auto& word : words) { writer(word, i % 2 == 0); diff --git a/test/index/vocabulary/VocabularyOnDiskTest.cpp b/test/index/vocabulary/VocabularyOnDiskTest.cpp index 54fc934f24..ee9090125e 100644 --- a/test/index/vocabulary/VocabularyOnDiskTest.cpp +++ b/test/index/vocabulary/VocabularyOnDiskTest.cpp @@ -5,7 +5,7 @@ #include #include "./VocabularyTestHelpers.h" 
-#include "index/VocabularyOnDisk.h" +#include "index/vocabulary/VocabularyOnDisk.h" #include "util/Forward.h" namespace { diff --git a/test/index/vocabulary/VocabularyTypeTest.cpp b/test/index/vocabulary/VocabularyTypeTest.cpp new file mode 100644 index 0000000000..2a8281dd80 --- /dev/null +++ b/test/index/vocabulary/VocabularyTypeTest.cpp @@ -0,0 +1,36 @@ +// Copyright 2025, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Author: Johannes Kalmbach + +#include + +#include "index/vocabulary/VocabularyType.h" + +using namespace ad_utility; +TEST(VocabularyType, allTests) { + using E = VocabularyType::Enum; + using T = VocabularyType; + T t{}; + EXPECT_EQ(t.value(), E::InMemory); + for (auto e : T::all()) { + EXPECT_EQ(T{e}.value(), e); + } + + t = T::fromString("on-disk-compressed"); + EXPECT_EQ(t.value(), E::CompressedOnDisk); + + EXPECT_ANY_THROW(T::fromString("kartoffelsalat")); + + EXPECT_EQ(T{E::OnDisk}.toString(), "on-disk-uncompressed"); + + using namespace ::testing; + EXPECT_THAT(T::getListOfSupportedValues(), + AllOf(HasSubstr("in-memory-uncompressed"), + HasSubstr(", on-disk-uncompressed"))); + + for (auto e : T::all()) { + nlohmann::json j = T{e}; + t = j.get(); + EXPECT_EQ(t.value(), e); + } +} diff --git a/test/util/IndexTestHelpers.cpp b/test/util/IndexTestHelpers.cpp index 8e1a693209..6cc5724690 100644 --- a/test/util/IndexTestHelpers.cpp +++ b/test/util/IndexTestHelpers.cpp @@ -186,6 +186,8 @@ Index makeTestIndex(const std::string& indexBasename, index.loadAllPermutations() = loadAllPermutations; qlever::InputFileSpecification spec{inputFilename, qlever::Filetype::Turtle, std::nullopt}; + // randomly choose one of the vocabulary implementations + index.getImpl().setVocabularyTypeForIndexBuilding(VocabularyType::random()); index.createFromFiles({spec}); if (createTextIndex) { if (contentsOfWordsFileAndDocsFile.has_value()) { From 5f2ec6c2ca2850691bd66a80ae0f1d2db97f5ba7 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: 
Thu, 6 Feb 2025 12:51:15 +0100 Subject: [PATCH 20/25] Fix for MacOS... Signed-off-by: Johannes Kalmbach --- src/index/vocabulary/PolymorphicVocabulary.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/index/vocabulary/PolymorphicVocabulary.cpp b/src/index/vocabulary/PolymorphicVocabulary.cpp index 1b9936afee..1c328dc6aa 100644 --- a/src/index/vocabulary/PolymorphicVocabulary.cpp +++ b/src/index/vocabulary/PolymorphicVocabulary.cpp @@ -25,7 +25,7 @@ void PolymorphicVocabulary::close() { // _____________________________________________________________________________ size_t PolymorphicVocabulary::size() const { - return std::visit([](auto& vocab) { return vocab.size(); }, vocab_); + return std::visit([](auto& vocab) -> size_t { return vocab.size(); }, vocab_); } // _____________________________________________________________________________ From b30861a3029300862d8b3fad6dd39a540d71b619 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Thu, 6 Feb 2025 14:45:53 +0100 Subject: [PATCH 21/25] Move the actually used code into the `Operation class.` Signed-off-by: Johannes Kalmbach --- src/engine/Operation.cpp | 23 +++++++++++++++++++++-- src/engine/QueryExecutionContext.h | 5 +++++ src/engine/Server.cpp | 26 +++++++------------------- test/engine/NamedQueryCacheTest.cpp | 3 +++ 4 files changed, 36 insertions(+), 21 deletions(-) create mode 100644 test/engine/NamedQueryCacheTest.cpp diff --git a/src/engine/Operation.cpp b/src/engine/Operation.cpp index ed5d9d3cc6..4147d24ca8 100644 --- a/src/engine/Operation.cpp +++ b/src/engine/Operation.cpp @@ -4,8 +4,7 @@ #include "engine/Operation.h" -#include - +#include "engine/NamedQueryCache.h" #include "engine/QueryExecutionTree.h" #include "global/RuntimeParameters.h" #include "util/OnDestructionDontThrowDuringStackUnwinding.h" @@ -292,6 +291,12 @@ std::shared_ptr Operation::getResult( _executionContext->_pinResult && isRoot; const bool pinResult = _executionContext->_pinSubtrees || 
pinFinalResultButNotSubtrees; + const bool pinWithName = + _executionContext->pinWithExplicitName().has_value() && isRoot; + + if (pinWithName) { + computationMode = ComputationMode::FULLY_MATERIALIZED; + } try { // In case of an exception, create the correct runtime info, no matter which @@ -337,6 +342,20 @@ std::shared_ptr Operation::getResult( updateRuntimeInformationOnSuccess(result, timer.msecs()); } + if (pinWithName) { + const auto& name = _executionContext->pinWithExplicitName().value(); + // The query is to be pinned in the named cache. In this case we don't + // return the result, but only pin it. + const auto& actualResult = result._resultPointer->resultTable(); + AD_CORRECTNESS_CHECK(actualResult.isFullyMaterialized()); + auto t = NamedQueryCache::Value(actualResult.idTable().clone(), + getExternallyVisibleVariableColumns(), + actualResult.sortedBy()); + _executionContext->namedQueryCache().store(name, std::move(t)); + + runtimeInfo().addDetail("pinned-with-explicit-name", name); + } + return result._resultPointer->resultTablePtr(); } catch (ad_utility::CancellationException& e) { e.setOperation(getDescriptor()); diff --git a/src/engine/QueryExecutionContext.h b/src/engine/QueryExecutionContext.h index 9eb632b48a..fc8d1b2ac3 100644 --- a/src/engine/QueryExecutionContext.h +++ b/src/engine/QueryExecutionContext.h @@ -155,6 +155,9 @@ class QueryExecutionContext { return *namedQueryCache_; } + auto& pinWithExplicitName() { return pinWithExplicitName_; } + const auto& pinWithExplicitName() const { return pinWithExplicitName_; } + private: const Index& _index; @@ -176,4 +179,6 @@ class QueryExecutionContext { RuntimeParameters().get<"websocket-updates-enabled">(); NamedQueryCache* namedQueryCache_ = nullptr; + + std::optional pinWithExplicitName_ = std::nullopt; }; diff --git a/src/engine/Server.cpp b/src/engine/Server.cpp index d9297ea841..ad1b75a61b 100644 --- a/src/engine/Server.cpp +++ b/src/engine/Server.cpp @@ -19,7 +19,6 @@ #include 
"index/IndexImpl.h" #include "util/AsioHelpers.h" #include "util/MemorySize/MemorySize.h" -#include "util/OnDestructionDontThrowDuringStackUnwinding.h" #include "util/ParseableDuration.h" #include "util/TypeIdentity.h" #include "util/TypeTraits.h" @@ -872,25 +871,14 @@ Awaitable Server::processQuery( limitOffset._offset -= qet.getRootOperation()->getLimit()._offset; if (pinNamed.has_value()) { - // The query is to be pinned in the named cache. In this case we don't - // return the result, but only pin it. - auto result = qet.getResult(false); - auto t = - NamedQueryCache::Value(result->idTable().clone(), - qet.getVariableColumns(), result->sortedBy()); - qec.namedQueryCache().store(pinNamed.value(), std::move(t)); - - auto response = ad_utility::httpUtils::createOkResponse( - "Successfully pinned the query result", request, - ad_utility::MediaType::textPlain); - co_await send(response); - } else { - // This actually processes the query and sends the result in the requested - // format. - co_await sendStreamableResponse(request, send, mediaType, plannedQuery, qet, - requestTimer, cancellationHandle); + // TODO 1. Make this require a valid access token. 2. also allow + // for clearing the cache. + qec.pinWithExplicitName() = pinNamed.value(); } - + // This actually processes the query and sends the result in the requested + // format. + co_await sendStreamableResponse(request, send, mediaType, plannedQuery, qet, + requestTimer, cancellationHandle); // Print the runtime info. This needs to be done after the query // was computed. LOG(INFO) << "Done processing query and sending result" diff --git a/test/engine/NamedQueryCacheTest.cpp b/test/engine/NamedQueryCacheTest.cpp new file mode 100644 index 0000000000..669758e208 --- /dev/null +++ b/test/engine/NamedQueryCacheTest.cpp @@ -0,0 +1,3 @@ +// +// Created by kalmbacj on 2/6/25. 
+// From d8080b30f9914a89e3ed3dcda9c9ccf85a880795 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Thu, 6 Feb 2025 17:33:33 +0100 Subject: [PATCH 22/25] Many more improvements for the tests and for the tools. Signed-off-by: Johannes Kalmbach --- src/global/Pattern.h | 17 ++++++++++++++++- src/index/vocabulary/VocabularyOnDisk.cpp | 6 +++--- src/index/vocabulary/VocabularyOnDisk.h | 4 ++-- .../vocabulary/PolymorphicVocabularyTest.cpp | 11 +++++++++++ 4 files changed, 32 insertions(+), 6 deletions(-) diff --git a/src/global/Pattern.h b/src/global/Pattern.h index 9178e5d640..c98487e772 100644 --- a/src/global/Pattern.h +++ b/src/global/Pattern.h @@ -182,6 +182,7 @@ struct CompactStringVectorWriter { off_t _startOfFile; using offset_type = typename CompactVectorOfStrings::offset_type; std::vector _offsets; + // A `CompactStringVectorWriter` that has been moved from may not call // `finish()` any more in its destructor. ad_utility::ResetWhenMoved _finished = false; @@ -230,6 +231,16 @@ struct CompactStringVectorWriter { } } + // The copy operations would be deleted implicitly (because `File` is not + // copyable. + CompactStringVectorWriter(const CompactStringVectorWriter&) = delete; + CompactStringVectorWriter& operator=(const CompactStringVectorWriter&) = + delete; + + // The move operations have to be explicitly defaulted, because we have a + // manually defined destructor. + // Note: The defaulted move operations behave correctly because of the usage + // of `ResetWhenMoved` with the `_finished` member. CompactStringVectorWriter(CompactStringVectorWriter&&) = default; CompactStringVectorWriter& operator=(CompactStringVectorWriter&&) = default; @@ -237,12 +248,16 @@ struct CompactStringVectorWriter { // Has to be run by all the constructors void commonInitialization() { AD_CONTRACT_CHECK(_file.isOpen()); - // We don't known the data size yet. + // We don't know the data size yet. 
_startOfFile = _file.tell(); size_t dataSizeDummy = 0; _file.write(&dataSizeDummy, sizeof(dataSizeDummy)); } }; +static_assert( + std::is_nothrow_move_assignable_v>); +static_assert( + std::is_nothrow_move_constructible_v>); } // namespace detail // Forward iterator for a `CompactVectorOfStrings` that reads directly from diff --git a/src/index/vocabulary/VocabularyOnDisk.cpp b/src/index/vocabulary/VocabularyOnDisk.cpp index 1dc53e8453..8f23170300 100644 --- a/src/index/vocabulary/VocabularyOnDisk.cpp +++ b/src/index/vocabulary/VocabularyOnDisk.cpp @@ -23,8 +23,8 @@ OffsetAndSize VocabularyOnDisk::getOffsetAndSize(uint64_t i) const { std::string VocabularyOnDisk::operator[](uint64_t idx) const { AD_CONTRACT_CHECK(idx < size()); auto offsetAndSize = getOffsetAndSize(idx); - string result(offsetAndSize._size, '\0'); - file_.read(result.data(), offsetAndSize._size, offsetAndSize._offset); + string result(offsetAndSize.size_, '\0'); + file_.read(result.data(), offsetAndSize.size_, offsetAndSize.offset_); return result; } @@ -88,7 +88,7 @@ VocabularyOnDisk::WordWriter::~WordWriter() { void VocabularyOnDisk::buildFromStringsAndIds( const std::vector>& wordsAndIds, const std::string& fileName) { - return buildFromIterable(wordsAndIds, fileName); + buildFromIterable(wordsAndIds, fileName); } // _____________________________________________________________________________ diff --git a/src/index/vocabulary/VocabularyOnDisk.h b/src/index/vocabulary/VocabularyOnDisk.h index f677ac3e7a..87506a4ed5 100644 --- a/src/index/vocabulary/VocabularyOnDisk.h +++ b/src/index/vocabulary/VocabularyOnDisk.h @@ -86,8 +86,8 @@ class VocabularyOnDisk : public VocabularyBinarySearchMixin { // The offset of a word in `file_` and its size in number of bytes. struct OffsetAndSize { - uint64_t _offset; - uint64_t _size; + uint64_t offset_; + uint64_t size_; }; // Helper function for implementing a random access iterator. 
diff --git a/test/index/vocabulary/PolymorphicVocabularyTest.cpp b/test/index/vocabulary/PolymorphicVocabularyTest.cpp index fc01104d4c..c5c91ed686 100644 --- a/test/index/vocabulary/PolymorphicVocabularyTest.cpp +++ b/test/index/vocabulary/PolymorphicVocabularyTest.cpp @@ -9,6 +9,8 @@ using ad_utility::VocabularyType; namespace { + +// Test a `PolymorphicVocabulary` with a given `vocabType`. void testForVocabType(VocabularyType::Enum vocabType) { VocabularyType type{vocabType}; std::string filename = @@ -37,6 +39,15 @@ void testForVocabType(VocabularyType::Enum vocabType) { } } // namespace +// Test the general functionality of the `PolymorphicVocabulary` for all the +// possible `VocabularyType`s. TEST(PolymorphicVocabulary, basicTests) { ql::ranges::for_each(VocabularyType::all(), &testForVocabType); } + +// Test a corner case in a `switch` statement. +TEST(PolymorphicVocabulary, invalidVocabularyType) { + PolymorphicVocabulary vocab; + auto invalidType = VocabularyType{static_cast(23401)}; + EXPECT_ANY_THROW(vocab.resetToType(invalidType)); +} From ea477275ca0924f220cffc47070cbf5b7837054f Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Fri, 7 Feb 2025 13:54:16 +0100 Subject: [PATCH 23/25] Merge in the vocab branch. 
Signed-off-by: Johannes Kalmbach --- src/engine/CMakeLists.txt | 1 - src/libqlever/Qlever.h | 11 +++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/src/engine/CMakeLists.txt b/src/engine/CMakeLists.txt index 542f2c3822..fece7772bf 100644 --- a/src/engine/CMakeLists.txt +++ b/src/engine/CMakeLists.txt @@ -15,7 +15,6 @@ add_library(engine TextLimit.cpp LazyGroupBy.cpp GroupByHashMapOptimization.cpp SpatialJoin.cpp CountConnectedSubgraphs.cpp SpatialJoinAlgorithms.cpp PathSearch.cpp ExecuteUpdate.cpp Describe.cpp GraphStoreProtocol.cpp) - CountConnectedSubgraphs.cpp SpatialJoinAlgorithms.cpp PathSearch.cpp ExecuteUpdate.cpp) add_library(server Server.cpp) qlever_target_link_libraries(server) qlever_target_link_libraries(engine util index parser sparqlExpressions http SortPerformanceEstimator Boost::iostreams s2) diff --git a/src/libqlever/Qlever.h b/src/libqlever/Qlever.h index b69dcf9588..88ac4fc042 100644 --- a/src/libqlever/Qlever.h +++ b/src/libqlever/Qlever.h @@ -4,6 +4,7 @@ #pragma once +#include #include #include @@ -17,6 +18,7 @@ #include "global/RuntimeParameters.h" #include "index/Index.h" #include "index/InputFileSpecification.h" +#include "index/vocabulary/VocabularyType.h" #include "parser/SparqlParser.h" #include "util/AllocatorWithLimit.h" #include "util/http/MediaTypes.h" @@ -55,6 +57,11 @@ struct QleverConfig { // TODO Document these additional settings. std::string settingsFile; + // Specify whether the vocabulary is stored on disk or in RAM, compressed or + // uncompressed. + ad_utility::VocabularyType vocabularyType_{ + ad_utility::VocabularyType::Enum::CompressedOnDisk}; + // The following members are only required if QLever's full-text search // extension is to be used, see `IndexBuilderMain.cpp` for additional details. bool addWordsFromLiterals = false; @@ -93,6 +100,10 @@ class Qlever { // cancellation, time limits, and observable queries. std::string query(std::string query); + // Pin a query to the named query cache. 
In a subsequent query, this cache can + // be accessed via `SERVICE ql: + void pinNamed(std::string query, std::string name); + // TODO Give access to the RuntimeParameters() which allow for // further tweaking of the qlever instance. }; From 81529de5616ae64300de3662a50e421ce86d7262 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Fri, 7 Feb 2025 14:30:41 +0100 Subject: [PATCH 24/25] Update the example with a warmup etc. Signed-off-by: Johannes Kalmbach --- src/libqlever/LibQLeverExample.cpp | 30 +++++++++++++++++++- src/libqlever/Qlever.cpp | 45 +++++++++++++++++++++++++++++- src/libqlever/Qlever.h | 2 ++ 3 files changed, 75 insertions(+), 2 deletions(-) diff --git a/src/libqlever/LibQLeverExample.cpp b/src/libqlever/LibQLeverExample.cpp index dbc0ffe2e6..ccf5ef869c 100644 --- a/src/libqlever/LibQLeverExample.cpp +++ b/src/libqlever/LibQLeverExample.cpp @@ -2,15 +2,43 @@ // Chair of Algorithms and Data Structures. // Author: Johannes Kalmbach +#include + #include #include "libqlever/Qlever.h" +#include "util/Timer.h" + +static const std::string warmup1 = ""; +static const std::string warmup2 = ""; + +static const std::string queryTemplate = R"( +SELECT * { + #INPUTS# + SERVICE ql:named-cached-query-warmup1 {} + SERVICE ql:named-cached-query-warmup2 {} +} +)"; + +std::vector inputs{""}; int main() { qlever::QleverConfig config; config.baseName = "exampleIndex"; config.inputFiles.emplace_back("/dev/stdin", qlever::Filetype::Turtle); + config.vocabularyType_ = + ad_utility::VocabularyType{ad_utility::VocabularyType::Enum::InMemory}; qlever::Qlever::buildIndex(config); qlever::Qlever qlever{config}; - std::cout << qlever.query("SELECT * {?s ?p ?o}") << std::endl; + qlever.pinNamed(warmup1, "warmup1"); + qlever.pinNamed(warmup2, "warmup2"); + + for (std::string_view input : inputs) { + auto query = absl::StrReplaceAll(queryTemplate, + {{std::string_view{"#INPUTS#"}, input}}); + ad_utility::Timer t{ad_utility::Timer::Started}; + auto result = 
qlever.query(std::move(query)); + std::cout << "retrieved a query result of size " << result.size() << " in " + << t.msecs().count() << "ms\n"; + } } diff --git a/src/libqlever/Qlever.cpp b/src/libqlever/Qlever.cpp index cc37562dde..fb4c77eaf6 100644 --- a/src/libqlever/Qlever.cpp +++ b/src/libqlever/Qlever.cpp @@ -4,6 +4,8 @@ #include "libqlever/Qlever.h" +#include "index/IndexImpl.h" + namespace qlever { static std::string getStxxlConfigFileName(const string& location) { return absl::StrCat(location, ".stxxl"); @@ -51,6 +53,8 @@ Qlever::Qlever(const QleverConfig& config) enablePatternTrick_ = !config.noPatterns; index_.loadAllPermutations() = !config.onlyPsoAndPos; + index_.getImpl().setVocabularyTypeForIndexBuilding(config.vocabularyType_); + // Init the index. index_.createFromOnDiskIndex(config.baseName); // TODO Enable the loading of the text index via the QLever lib. @@ -121,7 +125,7 @@ void Qlever::buildIndex(QleverConfig config) { // ___________________________________________________________________________ std::string Qlever::query(std::string query) { QueryExecutionContext qec{index_, &cache_, allocator_, - sortPerformanceEstimator_}; + sortPerformanceEstimator_, &namedQueryCache_}; auto parsedQuery = SparqlParser::parseQuery(query); auto handle = std::make_shared>(); QueryPlanner qp{&qec, handle}; @@ -156,4 +160,43 @@ std::string Qlever::query(std::string query) { } return result; } +// ___________________________________________________________________________ +// TODO A lot of code duplication here. 
+void Qlever::pinNamed(std::string query, std::string name) { + QueryExecutionContext qec{index_, &cache_, allocator_, + sortPerformanceEstimator_, &namedQueryCache_}; + qec.pinWithExplicitName() = std::move(name); + auto parsedQuery = SparqlParser::parseQuery(query); + auto handle = std::make_shared>(); + QueryPlanner qp{&qec, handle}; + qp.setEnablePatternTrick(enablePatternTrick_); + auto qet = qp.createExecutionTree(parsedQuery); + qet.isRoot() = true; + auto& limitOffset = parsedQuery._limitOffset; + + // TODO For cancellation we have to call + // `recursivelySetCancellationHandle` (see `Server::parseAndPlan`). + + // TODO The following interface looks fishy and should be + // incorporated directly in the query planner or somewhere else. + // (it is used identically in `Server.cpp`). + + // Make sure that the offset is not applied again when exporting the result + // (it is already applied by the root operation in the query execution + // tree). Note that we don't need this for the limit because applying a + // fixed limit is idempotent.
+ AD_CORRECTNESS_CHECK(limitOffset._offset >= + qet.getRootOperation()->getLimit()._offset); + limitOffset._offset -= qet.getRootOperation()->getLimit()._offset; + + ad_utility::Timer timer{ad_utility::Timer::Started}; + auto responseGenerator = ExportQueryExecutionTrees::computeResult( + parsedQuery, qet, ad_utility::MediaType::sparqlJson, timer, + std::move(handle)); + std::string result; + std::cout << "Writing the result:" << std::endl; + for (const auto& batch : responseGenerator) { + result += batch; + } +} } // namespace qlever diff --git a/src/libqlever/Qlever.h b/src/libqlever/Qlever.h index 88ac4fc042..0a50620670 100644 --- a/src/libqlever/Qlever.h +++ b/src/libqlever/Qlever.h @@ -13,6 +13,7 @@ #include #include "engine/ExportQueryExecutionTrees.h" +#include "engine/NamedQueryCache.h" #include "engine/QueryExecutionContext.h" #include "engine/QueryPlanner.h" #include "global/RuntimeParameters.h" @@ -84,6 +85,7 @@ class Qlever { ad_utility::AllocatorWithLimit allocator_; SortPerformanceEstimator sortPerformanceEstimator_; Index index_; + NamedQueryCache namedQueryCache_; bool enablePatternTrick_; static inline std::ostringstream ignoreLogStream; From c0b7a44502a89ed269df0d3e475952b54a11a194 Mon Sep 17 00:00:00 2001 From: Hannah Bast Date: Mon, 10 Feb 2025 00:46:23 +0100 Subject: [PATCH 25/25] Add argument for media type to `Qlever::query` and `Qlever::pinNamed` Also: remove some debug output --- src/libqlever/Qlever.cpp | 14 ++++++-------- src/libqlever/Qlever.h | 7 +++++-- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/src/libqlever/Qlever.cpp b/src/libqlever/Qlever.cpp index fb4c77eaf6..109c906f63 100644 --- a/src/libqlever/Qlever.cpp +++ b/src/libqlever/Qlever.cpp @@ -123,7 +123,7 @@ void Qlever::buildIndex(QleverConfig config) { } // ___________________________________________________________________________ -std::string Qlever::query(std::string query) { +std::string Qlever::query(std::string query, ad_utility::MediaType mediaType) { 
QueryExecutionContext qec{index_, &cache_, allocator_, sortPerformanceEstimator_, &namedQueryCache_}; auto parsedQuery = SparqlParser::parseQuery(query); @@ -151,10 +151,8 @@ std::string Qlever::query(std::string query) { ad_utility::Timer timer{ad_utility::Timer::Started}; auto responseGenerator = ExportQueryExecutionTrees::computeResult( - parsedQuery, qet, ad_utility::MediaType::sparqlJson, timer, - std::move(handle)); + parsedQuery, qet, mediaType, timer, std::move(handle)); std::string result; - std::cout << "Writing the result:" << std::endl; for (const auto& batch : responseGenerator) { result += batch; } @@ -162,7 +160,8 @@ std::string Qlever::query(std::string query) { } // ___________________________________________________________________________ // TODO A lot of code duplication here. -void Qlever::pinNamed(std::string query, std::string name) { +std::string Qlever::pinNamed(std::string query, std::string name, + ad_utility::MediaType mediaType) { QueryExecutionContext qec{index_, &cache_, allocator_, sortPerformanceEstimator_, &namedQueryCache_}; qec.pinWithExplicitName() = std::move(name); @@ -191,12 +190,11 @@ void Qlever::pinNamed(std::string query, std::string name) { ad_utility::Timer timer{ad_utility::Timer::Started}; auto responseGenerator = ExportQueryExecutionTrees::computeResult( - parsedQuery, qet, ad_utility::MediaType::sparqlJson, timer, - std::move(handle)); + parsedQuery, qet, mediaType, timer, std::move(handle)); std::string result; - std::cout << "Writing the result:" << std::endl; for (const auto& batch : responseGenerator) { result += batch; } + return result; } } // namespace qlever diff --git a/src/libqlever/Qlever.h b/src/libqlever/Qlever.h index 0a50620670..a57c405b7d 100644 --- a/src/libqlever/Qlever.h +++ b/src/libqlever/Qlever.h @@ -100,11 +100,14 @@ class Qlever { // supported, and the result will always be in sparql-results+json format. 
// TODO Support other formats + CONSTRUCT queries, support // cancellation, time limits, and observable queries. - std::string query(std::string query); + std::string query(std::string query, ad_utility::MediaType mediaType = + ad_utility::MediaType::sparqlJson); // Pin a query to the named query cache. In a subsequent query, this cache can // be accessed via `SERVICE ql: - void pinNamed(std::string query, std::string name); + [[maybe_unused]] std::string pinNamed( + std::string query, std::string name, + ad_utility::MediaType mediaType = ad_utility::MediaType::sparqlJson); // TODO Give access to the RuntimeParameters() which allow for // further tweaking of the qlever instance.