ad-freiburg · joka921 · Jan 31, 2025 · Jan 31, 2025 · Jan 31, 2025 · Feb 5, 2025
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -203,6 +203,11 @@ if (${USE_CPP_17_BACKPORTS})
     add_definitions("-DQLEVER_CPP_17 -DCPP_CXX_CONCEPTS=0")
 endif()
 
+set(VOCAB_UNCOMPRESSED_IN_MEMORY OFF CACHE BOOL "Store QLever's vocabulary uncompressed and completely in RAM")
+if (${VOCAB_UNCOMPRESSED_IN_MEMORY})
+    add_definitions("-D_QLEVER_VOCAB_UNCOMPRESSED_IN_MEMORY")
+endif ()
+
 # Enable the specification of additional linker flags manually from the commandline
 set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${ADDITIONAL_LINKER_FLAGS}")
 set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${ADDITIONAL_LINKER_FLAGS}")

diff --git a/src/engine/ExportQueryExecutionTrees.cpp b/src/engine/ExportQueryExecutionTrees.cpp
@@ -356,8 +356,15 @@ ExportQueryExecutionTrees::getLiteralOrIriFromVocabIndex(
     case Datatype::LocalVocabIndex:
       return localVocab.getWord(id.getLocalVocabIndex()).asLiteralOrIri();
     case Datatype::VocabIndex: {
-      auto entity = index.indexToString(id.getVocabIndex());
-      return LiteralOrIri::fromStringRepresentation(entity);
+      auto getEntity = [&index, id]() {
+        return index.indexToString(id.getVocabIndex());
+      };
+      // The type of entity might be `string_view` (If the vocabulary is stored
+      // uncompressed in RAM) or `string` (if it is on-disk, or compressed or
+      // both). The following code works and is efficient in all cases. In
+      // particular, the `std::string` constructor is compiled out because of
+      // RVO if `getEntity()` already returns a `string`.
+      return LiteralOrIri::fromStringRepresentation(std::string(getEntity()));
     }
     default:
       AD_FAIL();

diff --git a/src/global/Pattern.h b/src/global/Pattern.h
@@ -17,6 +17,7 @@
 #include "util/File.h"
 #include "util/Generator.h"
 #include "util/Iterators.h"
+#include "util/ResetWhenMoved.h"
 #include "util/Serializer/FileSerializer.h"
 #include "util/Serializer/SerializeVector.h"
 #include "util/TypeTraits.h"
@@ -181,7 +182,10 @@ struct CompactStringVectorWriter {
   off_t _startOfFile;
   using offset_type = typename CompactVectorOfStrings<data_type>::offset_type;
   std::vector<offset_type> _offsets;
-  bool _finished = false;
+
+  // A `CompactStringVectorWriter` that has been moved from may not call
+  // `finish()` any more in its destructor.
+  ad_utility::ResetWhenMoved<bool, true> _finished = false;
   offset_type _nextOffset = 0;
 
   explicit CompactStringVectorWriter(const std::string& filename)
@@ -227,16 +231,33 @@ struct CompactStringVectorWriter {
     }
   }
 
+  // The copy operations would be deleted implicitly (because `File` is not
+  // copyable.
+  CompactStringVectorWriter(const CompactStringVectorWriter&) = delete;
+  CompactStringVectorWriter& operator=(const CompactStringVectorWriter&) =
+      delete;
+
+  // The move operations have to be explicitly defaulted, because we have a
+  // manually defined destructor.
+  // Note: The defaulted move operations behave correctly because of the usage
+  // of `ResetWhenMoved` with the `_finished` member.
+  CompactStringVectorWriter(CompactStringVectorWriter&&) = default;
+  CompactStringVectorWriter& operator=(CompactStringVectorWriter&&) = default;
+
  private:
   // Has to be run by all the constructors
   void commonInitialization() {
     AD_CONTRACT_CHECK(_file.isOpen());
-    // We don't known the data size yet.
+    // We don't know the data size yet.
     _startOfFile = _file.tell();
     size_t dataSizeDummy = 0;
     _file.write(&dataSizeDummy, sizeof(dataSizeDummy));
   }
 };
+static_assert(
+    std::is_nothrow_move_assignable_v<CompactStringVectorWriter<char>>);
+static_assert(
+    std::is_nothrow_move_constructible_v<CompactStringVectorWriter<char>>);
 }  // namespace detail
 
 // Forward iterator for a `CompactVectorOfStrings` that reads directly from

diff --git a/src/index/CMakeLists.txt b/src/index/CMakeLists.txt
@@ -1,7 +1,7 @@
 add_subdirectory(vocabulary)
 add_library(index
         Index.cpp IndexImpl.cpp IndexImpl.Text.cpp
-        Vocabulary.cpp VocabularyOnDisk.cpp
+        Vocabulary.cpp
         LocatedTriples.cpp Permutation.cpp TextMetaData.cpp
         DocsDB.cpp FTSAlgorithms.cpp
         PrefixHeuristic.cpp CompressedRelation.cpp

diff --git a/src/index/ConstantsIndexBuilding.h b/src/index/ConstantsIndexBuilding.h
@@ -99,7 +99,8 @@ constinit inline std::atomic<size_t> BUFFER_SIZE_PARTIAL_TO_GLOBAL_ID_MAPPINGS =
 // the overhead of the metadata that has to be stored per block becomes
 // infeasible. 250K seems to be a reasonable tradeoff here.
 constexpr inline ad_utility::MemorySize
-    UNCOMPRESSED_BLOCKSIZE_COMPRESSED_METADATA_PER_COLUMN = 250_kB;
+    UNCOMPRESSED_BLOCKSIZE_COMPRESSED_METADATA_PER_COLUMN =
+        ad_utility::MemorySize::kilobytes(250);
 
 constexpr inline size_t NumColumnsIndexBuilding = 4;
 

diff --git a/src/index/Index.cpp b/src/index/Index.cpp
@@ -71,12 +71,13 @@ size_t Index::getCardinality(
 }
 
 // ____________________________________________________________________________
-std::string Index::indexToString(VocabIndex id) const {
+auto Index::indexToString(VocabIndex id) const -> Vocab::AccessReturnType {
   return pimpl_->indexToString(id);
 }
 
 // ____________________________________________________________________________
-std::string_view Index::indexToString(WordVocabIndex id) const {
+auto Index::indexToString(WordVocabIndex id) const
+    -> TextVocabulary::AccessReturnType {
   return pimpl_->indexToString(id);
 }
 

diff --git a/src/index/Index.h b/src/index/Index.h
@@ -104,13 +104,11 @@ class Index {
   // Read necessary metadata into memory and open file handles.
   void addTextFromOnDiskIndex();
 
-  using Vocab =
-      Vocabulary<CompressedString, TripleComponentComparator, VocabIndex>;
+  using Vocab = RdfsVocabulary;
   [[nodiscard]] const Vocab& getVocab() const;
   Vocab& getNonConstVocabForTesting();
 
-  using TextVocab =
-      Vocabulary<std::string, SimpleStringComparator, WordVocabIndex>;
+  using TextVocab = TextVocabulary;
   [[nodiscard]] const TextVocab& getTextVocab() const;
 
   // Get a (non-owning) pointer to the BlankNodeManager of this Index.
@@ -132,8 +130,8 @@ class Index {
 
   // TODO<joka921> Once we have an overview over the folding this logic should
   // probably not be in the index class.
-  std::string indexToString(VocabIndex id) const;
-  std::string_view indexToString(WordVocabIndex id) const;
+  Vocab::AccessReturnType indexToString(VocabIndex id) const;
+  TextVocab::AccessReturnType indexToString(WordVocabIndex id) const;
 
   [[nodiscard]] Vocab::PrefixRanges prefixRanges(std::string_view prefix) const;
 

diff --git a/src/index/IndexBuilderMain.cpp b/src/index/IndexBuilderMain.cpp
@@ -11,6 +11,7 @@
 #include <string>
 
 #include "CompilationInfo.h"
+#include "IndexImpl.h"
 #include "global/Constants.h"
 #include "index/ConstantsIndexBuilding.h"
 #include "index/Index.h"
@@ -166,6 +167,7 @@ int main(int argc, char** argv) {
   bool addWordsFromLiterals = false;
   std::optional<ad_utility::MemorySize> stxxlMemory;
   std::optional<ad_utility::MemorySize> parserBufferSize;
+  std::optional<ad_utility::VocabularyType> vocabType;
   optind = 1;
 
   Index index{ad_utility::makeUnlimitedAllocator<Id>()};
@@ -224,6 +226,10 @@ int main(int argc, char** argv) {
   add("only-pso-and-pos-permutations,o", po::bool_switch(&onlyPsoAndPos),
       "Only build the PSO and POS permutations. This is faster, but then "
       "queries with predicate variables are not supported");
+  auto msg = absl::StrCat(
+      "The vocabulary implementation for strings in qlever, can be any of ",
+      ad_utility::VocabularyType::getListOfSupportedValues());
+  add("vocabulary-type", po::value(&vocabType), msg.c_str());
 
   // Options for the index building process.
   add("stxxl-memory,m", po::value(&stxxlMemory),
@@ -257,6 +263,10 @@ int main(int argc, char** argv) {
     index.parserBufferSize() = parserBufferSize.value();
   }
 
+  if (vocabType.has_value()) {
+    index.getImpl().setVocabularyTypeForIndexBuilding(vocabType.value());
+  }
+
   // If no text index name was specified, take the part of the wordsfile after
   // the last slash.
   if (textIndexName.empty() && !wordsfile.empty()) {

diff --git a/src/index/IndexImpl.Text.cpp b/src/index/IndexImpl.Text.cpp
@@ -48,7 +48,7 @@ cppcoro::generator<WordsFileLine> IndexImpl::wordsInTextRecords(
       if (!isLiteral(text)) {
         continue;
       }
-      WordsFileLine entityLine{text, true, contextId, 1, true};
+      WordsFileLine entityLine{std::string{text}, true, contextId, 1, true};
       co_yield entityLine;
       std::string_view textView = text;
       textView = textView.substr(0, textView.rfind('"'));

diff --git a/src/index/IndexImpl.cpp b/src/index/IndexImpl.cpp
@@ -341,6 +341,8 @@ void IndexImpl::createFromFiles(
         "The patterns can only be built when all 6 permutations are created"};
   }
 
+  vocab_.resetToType(vocabularyTypeForIndexBuilding_);
+
   readIndexBuilderSettingsFromFile();
 
   updateInputFileSpecificationsAndLog(files, useParallelParser_);
@@ -560,7 +562,6 @@ IndexBuilderDataAsStxxlVector IndexImpl::passFileForVocabulary(
       return (*cmp)(a, b, decltype(vocab_)::SortLevel::TOTAL);
     };
     auto wordCallback = vocab_.makeWordWriter(onDiskBase_ + VOCAB_SUFFIX);
-    wordCallback.readableName() = "internal vocabulary";
     return ad_utility::vocabulary_merger::mergeVocabulary(
         onDiskBase_, numFiles, sortPred, wordCallback,
         memoryLimitIndexBuilding());
@@ -974,7 +975,7 @@ size_t IndexImpl::getNumDistinctSubjectPredicatePairs() const {
 }
 
 // _____________________________________________________________________________
-bool IndexImpl::isLiteral(const string& object) const {
+bool IndexImpl::isLiteral(std::string_view object) const {
   return decltype(vocab_)::stringIsLiteral(object);
 }
 
@@ -1132,6 +1133,11 @@ void IndexImpl::readConfiguration() {
   loadDataMember("num-triples", numTriples_, NumNormalAndInternal{});
   loadDataMember("num-non-literals-text-index", nofNonLiteralsInTextIndex_, 0);
 
+  ad_utility::VocabularyType vocabType(
+      ad_utility::VocabularyType::Enum::CompressedOnDisk);
+  loadDataMember("vocabulary-type", vocabType, vocabType);
+  vocab_.resetToType(vocabType);
+
   // Initialize BlankNodeManager
   uint64_t numBlankNodesTotal;
   loadDataMember("num-blank-nodes-total", numBlankNodesTotal);
@@ -1522,10 +1528,13 @@ size_t IndexImpl::getCardinality(
 }
 
 // ___________________________________________________________________________
-std::string IndexImpl::indexToString(VocabIndex id) const { return vocab_[id]; }
+RdfsVocabulary::AccessReturnType IndexImpl::indexToString(VocabIndex id) const {
+  return vocab_[id];
+}
 
 // ___________________________________________________________________________
-std::string_view IndexImpl::indexToString(WordVocabIndex id) const {
+TextVocabulary::AccessReturnType IndexImpl::indexToString(
+    WordVocabIndex id) const {
   return textVocab_[id];
 }
 

diff --git a/src/index/IndexImpl.h b/src/index/IndexImpl.h
@@ -193,6 +193,10 @@ class IndexImpl {
   std::optional<Id> idOfHasPatternDuringIndexBuilding_;
   std::optional<Id> idOfInternalGraphDuringIndexBuilding_;
 
+  // The vocabulary type that is used (only relevant during index building).
+  ad_utility::VocabularyType vocabularyTypeForIndexBuilding_{
+      ad_utility::VocabularyType::Enum::CompressedOnDisk};
+
   // BlankNodeManager, initialized during `readConfiguration`
   std::unique_ptr<ad_utility::BlankNodeManager> blankNodeManager_{nullptr};
 
@@ -276,6 +280,13 @@ class IndexImpl {
     return deltaTriples_.value();
   }
 
+  // See the documentation of the `vocabularyTypeForIndexBuilding_` member for
+  // details.
+  void setVocabularyTypeForIndexBuilding(ad_utility::VocabularyType type) {
+    vocabularyTypeForIndexBuilding_ = type;
+    configurationJson_["vocabulary-type"] = type;
+  }
+
   // --------------------------------------------------------------------------
   //  -- RETRIEVAL ---
   // --------------------------------------------------------------------------
@@ -306,10 +317,10 @@ class IndexImpl {
       const LocatedTriplesSnapshot& locatedTriplesSnapshot) const;
 
   // ___________________________________________________________________________
-  std::string indexToString(VocabIndex id) const;
+  RdfsVocabulary::AccessReturnType indexToString(VocabIndex id) const;
 
   // ___________________________________________________________________________
-  std::string_view indexToString(WordVocabIndex id) const;
+  TextVocabulary::AccessReturnType indexToString(WordVocabIndex id) const;
 
  public:
   // ___________________________________________________________________________
@@ -635,7 +646,7 @@ class IndexImpl {
   friend class IndexTest_createFromOnDiskIndexTest_Test;
   friend class CreatePatternsFixture_createPatterns_Test;
 
-  bool isLiteral(const string& object) const;
+  bool isLiteral(std::string_view object) const;
 
  public:
   LangtagAndTriple tripleToInternalRepresentation(TurtleTriple&& triple) const;

diff --git a/src/index/StringSortComparator.h b/src/index/StringSortComparator.h
@@ -623,6 +623,12 @@ class TripleComponentComparator {
     return compare(spA, spB, level) < 0;
   }
 
+  bool operator()(const SplitVal& spA, std::string_view b,
+                  const Level level) const {
+    auto spB = extractAndTransformComparable(b, level, false);
+    return compare(spA, spB, level) < 0;
+  }
+
   template <typename A, typename B, typename C>
   bool operator()(const SplitValBase<A, B, C>& a,
                   const SplitValBase<A, B, C>& b, const Level level) const {

diff --git a/src/index/Vocabulary.cpp b/src/index/Vocabulary.cpp
@@ -39,20 +39,8 @@ bool Vocabulary<StringType, ComparatorType, IndexT>::PrefixRanges::contain(
 // _____________________________________________________________________________
 template <class S, class C, typename I>
 void Vocabulary<S, C, I>::readFromFile(const string& fileName) {
-  LOG(INFO) << "Reading vocabulary from file " << fileName << " ..."
-            << std::endl;
   vocabulary_.close();
   vocabulary_.open(fileName);
-  if constexpr (isCompressed_) {
-    const auto& internalExternalVocab =
-        vocabulary_.getUnderlyingVocabulary().getUnderlyingVocabulary();
-    LOG(INFO) << "Done, number of words: "
-              << internalExternalVocab.internalVocab().size() << std::endl;
-    LOG(INFO) << "Number of words in external vocabulary: "
-              << internalExternalVocab.externalVocab().size() << std::endl;
-  } else {
-    LOG(INFO) << "Done, number of words: " << vocabulary_.size() << std::endl;
-  }
 
   // Precomputing ranges for IRIs, blank nodes, and literals, for faster
   // processing of the `isIrI` and `isLiteral` functions.
@@ -75,7 +63,17 @@ void Vocabulary<S, C, I>::createFromSet(
     return getCaseComparator()(a, b, SortLevel::TOTAL);
   };
   std::sort(begin(words), end(words), totalComparison);
-  vocabulary_.build(words, filename);
+  auto writer = makeWordWriter(filename);
+  auto writeWords = [&writer](std::string_view word) {
+    // All words are stored in the internal vocab (this is consistent with the
+    // previous behavior). NOTE: This function is currently only used for the
+    // text index and for few unit tests, where we don't have an external
+    // vocabulary anyway.
+    writer(word, false);
+  };
+  ql::ranges::for_each(words, writeWords);
+  writer.finish();
+  vocabulary_.open(filename);
   LOG(DEBUG) << "END Vocabulary::createFromSet" << std::endl;
 }
 
@@ -88,19 +86,12 @@ bool Vocabulary<S, C, I>::stringIsLiteral(std::string_view s) {
 // _____________________________________________________________________________
 template <class S, class C, class I>
 bool Vocabulary<S, C, I>::shouldBeExternalized(string_view s) const {
-  // TODO<joka921> Completely refactor the Vocabulary on the different
-  // Types, it is a mess.
-
-  // If the string is not compressed, this means that this is a text vocabulary
-  // and thus doesn't support externalization.
-  if constexpr (std::is_same_v<S, CompressedString>) {
-    if (!stringIsLiteral(s)) {
-      return shouldEntityBeExternalized(s);
-    } else {
-      return shouldLiteralBeExternalized(s);
-    }
+  // TODO<joka921> We should have a completely separate layer that handles the
+  // externalization, not the Vocab.
+  if (!stringIsLiteral(s)) {
+    return shouldEntityBeExternalized(s);
   } else {
-    return false;
+    return shouldLiteralBeExternalized(s);
   }
 }
 
@@ -264,17 +255,18 @@ auto Vocabulary<S, C, I>::prefixRanges(std::string_view prefix) const
 }
 
 // _____________________________________________________________________________
-template <typename S, typename C, typename I>
-auto Vocabulary<S, C, I>::operator[](IndexType idx) const
-    -> AccessReturnType_t<S> {
+template <typename UnderlyingVocabulary, typename C, typename I>
+auto Vocabulary<UnderlyingVocabulary, C, I>::operator[](IndexType idx) const
+    -> AccessReturnType {
   AD_CONTRACT_CHECK(idx.get() < size());
   return vocabulary_[idx.get()];
 }
 
 // Explicit template instantiations
-template class Vocabulary<CompressedString, TripleComponentComparator,
-                          VocabIndex>;
-template class Vocabulary<std::string, SimpleStringComparator, WordVocabIndex>;
+template class Vocabulary<detail::UnderlyingVocabRdfsVocabulary,
+                          TripleComponentComparator, VocabIndex>;
+template class Vocabulary<detail::UnderlyingVocabTextVocabulary,
+                          SimpleStringComparator, WordVocabIndex>;
 
 template void RdfsVocabulary::initializeInternalizedLangs<nlohmann::json>(
     const nlohmann::json&);