Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support uncompressed or fully in-memory vocabularies #1740

Open
wants to merge 14 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,11 @@ if (${USE_CPP_17_BACKPORTS})
add_definitions("-DQLEVER_CPP_17 -DCPP_CXX_CONCEPTS=0")
endif()

# Optional build switch (default: OFF). When it is enabled, the preprocessor
# macro `_QLEVER_VOCAB_UNCOMPRESSED_IN_MEMORY` is passed to the compiler via
# `add_definitions`.
set(VOCAB_UNCOMPRESSED_IN_MEMORY OFF CACHE BOOL "Store QLever's vocabulary uncompressed and completely in RAM")
if (${VOCAB_UNCOMPRESSED_IN_MEMORY})
add_definitions("-D_QLEVER_VOCAB_UNCOMPRESSED_IN_MEMORY")
endif ()

# Enable the specification of additional linker flags manually from the commandline
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${ADDITIONAL_LINKER_FLAGS}")
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${ADDITIONAL_LINKER_FLAGS}")
Expand Down
11 changes: 9 additions & 2 deletions src/engine/ExportQueryExecutionTrees.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -356,8 +356,15 @@ ExportQueryExecutionTrees::getLiteralOrIriFromVocabIndex(
case Datatype::LocalVocabIndex:
return localVocab.getWord(id.getLocalVocabIndex()).asLiteralOrIri();
case Datatype::VocabIndex: {
auto entity = index.indexToString(id.getVocabIndex());
return LiteralOrIri::fromStringRepresentation(entity);
auto getEntity = [&index, id]() {
return index.indexToString(id.getVocabIndex());
};
// The type of the entity might be `string_view` (if the vocabulary is stored
// uncompressed in RAM) or `string` (if it is on-disk, or compressed, or
// both). The following code works and is efficient in all cases. In
// particular, no additional copy is made if `getEntity()` already returns a
// `string`, because constructing a `std::string` from a `std::string`
// prvalue is subject to copy elision.
return LiteralOrIri::fromStringRepresentation(std::string(getEntity()));
}
default:
AD_FAIL();
Expand Down
25 changes: 23 additions & 2 deletions src/global/Pattern.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include "util/File.h"
#include "util/Generator.h"
#include "util/Iterators.h"
#include "util/ResetWhenMoved.h"
#include "util/Serializer/FileSerializer.h"
#include "util/Serializer/SerializeVector.h"
#include "util/TypeTraits.h"
Expand Down Expand Up @@ -181,7 +182,10 @@ struct CompactStringVectorWriter {
off_t _startOfFile;
using offset_type = typename CompactVectorOfStrings<data_type>::offset_type;
std::vector<offset_type> _offsets;
bool _finished = false;

// A `CompactStringVectorWriter` that has been moved from may not call
// `finish()` any more in its destructor.
ad_utility::ResetWhenMoved<bool, true> _finished = false;
offset_type _nextOffset = 0;

explicit CompactStringVectorWriter(const std::string& filename)
Expand Down Expand Up @@ -227,16 +231,33 @@ struct CompactStringVectorWriter {
}
}

// The copy operations would be deleted implicitly anyway (because `File` is
// not copyable); they are deleted explicitly here.
CompactStringVectorWriter(const CompactStringVectorWriter&) = delete;
CompactStringVectorWriter& operator=(const CompactStringVectorWriter&) =
delete;

// The move operations have to be explicitly defaulted, because we have a
// manually defined destructor.
// Note: The defaulted move operations behave correctly because of the usage
// of `ResetWhenMoved` with the `_finished` member.
CompactStringVectorWriter(CompactStringVectorWriter&&) = default;
CompactStringVectorWriter& operator=(CompactStringVectorWriter&&) = default;

private:
// Has to be run by all the constructors
void commonInitialization() {
AD_CONTRACT_CHECK(_file.isOpen());
// We don't known the data size yet.
// We don't know the data size yet.
_startOfFile = _file.tell();
size_t dataSizeDummy = 0;
_file.write(&dataSizeDummy, sizeof(dataSizeDummy));
}
};
// Compile-time checks that the move assignment and the move construction of
// `CompactStringVectorWriter<char>` are both `noexcept`.
static_assert(
std::is_nothrow_move_assignable_v<CompactStringVectorWriter<char>>);
static_assert(
std::is_nothrow_move_constructible_v<CompactStringVectorWriter<char>>);
} // namespace detail

// Forward iterator for a `CompactVectorOfStrings` that reads directly from
Expand Down
2 changes: 1 addition & 1 deletion src/index/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
add_subdirectory(vocabulary)
add_library(index
Index.cpp IndexImpl.cpp IndexImpl.Text.cpp
Vocabulary.cpp VocabularyOnDisk.cpp
Vocabulary.cpp
LocatedTriples.cpp Permutation.cpp TextMetaData.cpp
DocsDB.cpp FTSAlgorithms.cpp
PrefixHeuristic.cpp CompressedRelation.cpp
Expand Down
3 changes: 2 additions & 1 deletion src/index/ConstantsIndexBuilding.h
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,8 @@ constinit inline std::atomic<size_t> BUFFER_SIZE_PARTIAL_TO_GLOBAL_ID_MAPPINGS =
// the overhead of the metadata that has to be stored per block becomes
// infeasible. 250K seems to be a reasonable tradeoff here.
constexpr inline ad_utility::MemorySize
UNCOMPRESSED_BLOCKSIZE_COMPRESSED_METADATA_PER_COLUMN = 250_kB;
UNCOMPRESSED_BLOCKSIZE_COMPRESSED_METADATA_PER_COLUMN =
ad_utility::MemorySize::kilobytes(250);

constexpr inline size_t NumColumnsIndexBuilding = 4;

Expand Down
5 changes: 3 additions & 2 deletions src/index/Index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,12 +71,13 @@ size_t Index::getCardinality(
}

// ____________________________________________________________________________
std::string Index::indexToString(VocabIndex id) const {
auto Index::indexToString(VocabIndex id) const -> Vocab::AccessReturnType {
return pimpl_->indexToString(id);
}

// ____________________________________________________________________________
std::string_view Index::indexToString(WordVocabIndex id) const {
auto Index::indexToString(WordVocabIndex id) const
-> TextVocabulary::AccessReturnType {
return pimpl_->indexToString(id);
}

Expand Down
10 changes: 4 additions & 6 deletions src/index/Index.h
Original file line number Diff line number Diff line change
Expand Up @@ -104,13 +104,11 @@ class Index {
// Read necessary metadata into memory and open file handles.
void addTextFromOnDiskIndex();

using Vocab =
Vocabulary<CompressedString, TripleComponentComparator, VocabIndex>;
using Vocab = RdfsVocabulary;
[[nodiscard]] const Vocab& getVocab() const;
Vocab& getNonConstVocabForTesting();

using TextVocab =
Vocabulary<std::string, SimpleStringComparator, WordVocabIndex>;
using TextVocab = TextVocabulary;
[[nodiscard]] const TextVocab& getTextVocab() const;

// Get a (non-owning) pointer to the BlankNodeManager of this Index.
Expand All @@ -132,8 +130,8 @@ class Index {

// TODO<joka921> Once we have an overview over the folding this logic should
// probably not be in the index class.
std::string indexToString(VocabIndex id) const;
std::string_view indexToString(WordVocabIndex id) const;
Vocab::AccessReturnType indexToString(VocabIndex id) const;
TextVocab::AccessReturnType indexToString(WordVocabIndex id) const;

[[nodiscard]] Vocab::PrefixRanges prefixRanges(std::string_view prefix) const;

Expand Down
10 changes: 10 additions & 0 deletions src/index/IndexBuilderMain.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include <string>

#include "CompilationInfo.h"
#include "IndexImpl.h"
#include "global/Constants.h"
#include "index/ConstantsIndexBuilding.h"
#include "index/Index.h"
Expand Down Expand Up @@ -166,6 +167,7 @@ int main(int argc, char** argv) {
bool addWordsFromLiterals = false;
std::optional<ad_utility::MemorySize> stxxlMemory;
std::optional<ad_utility::MemorySize> parserBufferSize;
std::optional<ad_utility::VocabularyType> vocabType;
optind = 1;

Index index{ad_utility::makeUnlimitedAllocator<Id>()};
Expand Down Expand Up @@ -224,6 +226,10 @@ int main(int argc, char** argv) {
add("only-pso-and-pos-permutations,o", po::bool_switch(&onlyPsoAndPos),
"Only build the PSO and POS permutations. This is faster, but then "
"queries with predicate variables are not supported");
auto msg = absl::StrCat(
"The vocabulary implementation for strings in qlever, can be any of ",
ad_utility::VocabularyType::getListOfSupportedValues());
add("vocabulary-type", po::value(&vocabType), msg.c_str());

// Options for the index building process.
add("stxxl-memory,m", po::value(&stxxlMemory),
Expand Down Expand Up @@ -257,6 +263,10 @@ int main(int argc, char** argv) {
index.parserBufferSize() = parserBufferSize.value();
}

if (vocabType.has_value()) {
index.getImpl().setVocabularyTypeForIndexBuilding(vocabType.value());
}

// If no text index name was specified, take the part of the wordsfile after
// the last slash.
if (textIndexName.empty() && !wordsfile.empty()) {
Expand Down
2 changes: 1 addition & 1 deletion src/index/IndexImpl.Text.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ cppcoro::generator<WordsFileLine> IndexImpl::wordsInTextRecords(
if (!isLiteral(text)) {
continue;
}
WordsFileLine entityLine{text, true, contextId, 1, true};
WordsFileLine entityLine{std::string{text}, true, contextId, 1, true};
co_yield entityLine;
std::string_view textView = text;
textView = textView.substr(0, textView.rfind('"'));
Expand Down
17 changes: 13 additions & 4 deletions src/index/IndexImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -341,6 +341,8 @@ void IndexImpl::createFromFiles(
"The patterns can only be built when all 6 permutations are created"};
}

vocab_.resetToType(vocabularyTypeForIndexBuilding_);

readIndexBuilderSettingsFromFile();

updateInputFileSpecificationsAndLog(files, useParallelParser_);
Expand Down Expand Up @@ -560,7 +562,6 @@ IndexBuilderDataAsStxxlVector IndexImpl::passFileForVocabulary(
return (*cmp)(a, b, decltype(vocab_)::SortLevel::TOTAL);
};
auto wordCallback = vocab_.makeWordWriter(onDiskBase_ + VOCAB_SUFFIX);
wordCallback.readableName() = "internal vocabulary";
return ad_utility::vocabulary_merger::mergeVocabulary(
onDiskBase_, numFiles, sortPred, wordCallback,
memoryLimitIndexBuilding());
Expand Down Expand Up @@ -974,7 +975,7 @@ size_t IndexImpl::getNumDistinctSubjectPredicatePairs() const {
}

// _____________________________________________________________________________
bool IndexImpl::isLiteral(const string& object) const {
bool IndexImpl::isLiteral(std::string_view object) const {
return decltype(vocab_)::stringIsLiteral(object);
}

Expand Down Expand Up @@ -1132,6 +1133,11 @@ void IndexImpl::readConfiguration() {
loadDataMember("num-triples", numTriples_, NumNormalAndInternal{});
loadDataMember("num-non-literals-text-index", nofNonLiteralsInTextIndex_, 0);

ad_utility::VocabularyType vocabType(
ad_utility::VocabularyType::Enum::CompressedOnDisk);
loadDataMember("vocabulary-type", vocabType, vocabType);
vocab_.resetToType(vocabType);

// Initialize BlankNodeManager
uint64_t numBlankNodesTotal;
loadDataMember("num-blank-nodes-total", numBlankNodesTotal);
Expand Down Expand Up @@ -1522,10 +1528,13 @@ size_t IndexImpl::getCardinality(
}

// ___________________________________________________________________________
std::string IndexImpl::indexToString(VocabIndex id) const { return vocab_[id]; }
RdfsVocabulary::AccessReturnType IndexImpl::indexToString(VocabIndex id) const {
return vocab_[id];
}

// ___________________________________________________________________________
std::string_view IndexImpl::indexToString(WordVocabIndex id) const {
TextVocabulary::AccessReturnType IndexImpl::indexToString(
WordVocabIndex id) const {
return textVocab_[id];
}

Expand Down
17 changes: 14 additions & 3 deletions src/index/IndexImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,10 @@ class IndexImpl {
std::optional<Id> idOfHasPatternDuringIndexBuilding_;
std::optional<Id> idOfInternalGraphDuringIndexBuilding_;

// The vocabulary type that is used (only relevant during index building).
ad_utility::VocabularyType vocabularyTypeForIndexBuilding_{
ad_utility::VocabularyType::Enum::CompressedOnDisk};

// BlankNodeManager, initialized during `readConfiguration`
std::unique_ptr<ad_utility::BlankNodeManager> blankNodeManager_{nullptr};

Expand Down Expand Up @@ -276,6 +280,13 @@ class IndexImpl {
return deltaTriples_.value();
}

// See the documentation of the `vocabularyTypeForIndexBuilding_` member for
// details.
void setVocabularyTypeForIndexBuilding(ad_utility::VocabularyType type) {
vocabularyTypeForIndexBuilding_ = type;
// Also record the chosen type under the "vocabulary-type" key of the
// configuration JSON.
// NOTE(review): presumably this is what `readConfiguration` later reads
// back via `loadDataMember("vocabulary-type", ...)` -- confirm.
configurationJson_["vocabulary-type"] = type;
}

// --------------------------------------------------------------------------
// -- RETRIEVAL ---
// --------------------------------------------------------------------------
Expand Down Expand Up @@ -306,10 +317,10 @@ class IndexImpl {
const LocatedTriplesSnapshot& locatedTriplesSnapshot) const;

// ___________________________________________________________________________
std::string indexToString(VocabIndex id) const;
RdfsVocabulary::AccessReturnType indexToString(VocabIndex id) const;

// ___________________________________________________________________________
std::string_view indexToString(WordVocabIndex id) const;
TextVocabulary::AccessReturnType indexToString(WordVocabIndex id) const;

public:
// ___________________________________________________________________________
Expand Down Expand Up @@ -635,7 +646,7 @@ class IndexImpl {
friend class IndexTest_createFromOnDiskIndexTest_Test;
friend class CreatePatternsFixture_createPatterns_Test;

bool isLiteral(const string& object) const;
bool isLiteral(std::string_view object) const;

public:
LangtagAndTriple tripleToInternalRepresentation(TurtleTriple&& triple) const;
Expand Down
6 changes: 6 additions & 0 deletions src/index/StringSortComparator.h
Original file line number Diff line number Diff line change
Expand Up @@ -623,6 +623,12 @@ class TripleComponentComparator {
return compare(spA, spB, level) < 0;
}

// Overload for comparing an already split value `spA` against a plain
// string `b` on the given `level`. The string is first converted via
// `extractAndTransformComparable`; the result is `true` iff `compare`
// yields a negative value for the pair.
bool operator()(const SplitVal& spA, std::string_view b,
                const Level level) const {
  auto transformedB = extractAndTransformComparable(b, level, false);
  const auto order = compare(spA, transformedB, level);
  return order < 0;
}

template <typename A, typename B, typename C>
bool operator()(const SplitValBase<A, B, C>& a,
const SplitValBase<A, B, C>& b, const Level level) const {
Expand Down
54 changes: 23 additions & 31 deletions src/index/Vocabulary.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,20 +39,8 @@ bool Vocabulary<StringType, ComparatorType, IndexT>::PrefixRanges::contain(
// _____________________________________________________________________________
template <class S, class C, typename I>
void Vocabulary<S, C, I>::readFromFile(const string& fileName) {
LOG(INFO) << "Reading vocabulary from file " << fileName << " ..."
<< std::endl;
vocabulary_.close();
vocabulary_.open(fileName);
if constexpr (isCompressed_) {
const auto& internalExternalVocab =
vocabulary_.getUnderlyingVocabulary().getUnderlyingVocabulary();
LOG(INFO) << "Done, number of words: "
<< internalExternalVocab.internalVocab().size() << std::endl;
LOG(INFO) << "Number of words in external vocabulary: "
<< internalExternalVocab.externalVocab().size() << std::endl;
} else {
LOG(INFO) << "Done, number of words: " << vocabulary_.size() << std::endl;
}

// Precomputing ranges for IRIs, blank nodes, and literals, for faster
// processing of the `isIrI` and `isLiteral` functions.
Expand All @@ -75,7 +63,17 @@ void Vocabulary<S, C, I>::createFromSet(
return getCaseComparator()(a, b, SortLevel::TOTAL);
};
std::sort(begin(words), end(words), totalComparison);
vocabulary_.build(words, filename);
auto writer = makeWordWriter(filename);
auto writeWords = [&writer](std::string_view word) {
// All words are stored in the internal vocab (this is consistent with the
// previous behavior). NOTE: This function is currently only used for the
// text index and for a few unit tests, where we don't have an external
// vocabulary anyway.
writer(word, false);
};
ql::ranges::for_each(words, writeWords);
writer.finish();
vocabulary_.open(filename);
LOG(DEBUG) << "END Vocabulary::createFromSet" << std::endl;
}

Expand All @@ -88,19 +86,12 @@ bool Vocabulary<S, C, I>::stringIsLiteral(std::string_view s) {
// _____________________________________________________________________________
template <class S, class C, class I>
bool Vocabulary<S, C, I>::shouldBeExternalized(string_view s) const {
// TODO<joka921> Completely refactor the Vocabulary on the different
// Types, it is a mess.

// If the string is not compressed, this means that this is a text vocabulary
// and thus doesn't support externalization.
if constexpr (std::is_same_v<S, CompressedString>) {
if (!stringIsLiteral(s)) {
return shouldEntityBeExternalized(s);
} else {
return shouldLiteralBeExternalized(s);
}
// TODO<joka921> We should have a completely separate layer that handles the
// externalization, not the Vocab.
if (!stringIsLiteral(s)) {
return shouldEntityBeExternalized(s);
} else {
return false;
return shouldLiteralBeExternalized(s);
}
}

Expand Down Expand Up @@ -264,17 +255,18 @@ auto Vocabulary<S, C, I>::prefixRanges(std::string_view prefix) const
}

// _____________________________________________________________________________
template <typename S, typename C, typename I>
auto Vocabulary<S, C, I>::operator[](IndexType idx) const
-> AccessReturnType_t<S> {
template <typename UnderlyingVocabulary, typename C, typename I>
auto Vocabulary<UnderlyingVocabulary, C, I>::operator[](IndexType idx) const
-> AccessReturnType {
AD_CONTRACT_CHECK(idx.get() < size());
return vocabulary_[idx.get()];
}

// Explicit template instantiations
template class Vocabulary<CompressedString, TripleComponentComparator,
VocabIndex>;
template class Vocabulary<std::string, SimpleStringComparator, WordVocabIndex>;
template class Vocabulary<detail::UnderlyingVocabRdfsVocabulary,
TripleComponentComparator, VocabIndex>;
template class Vocabulary<detail::UnderlyingVocabTextVocabulary,
SimpleStringComparator, WordVocabIndex>;

template void RdfsVocabulary::initializeInternalizedLangs<nlohmann::json>(
const nlohmann::json&);
Expand Down
Loading
Loading