From c1ba8f8b1b1f568c8fe084a97bca6016556daa5e Mon Sep 17 00:00:00 2001
From: Nimesh Ghelani
Date: Wed, 30 Oct 2019 23:43:37 +0000
Subject: [PATCH] Fix incorrect paragraph ordering

---
 CALEngine/src/dataset.cc | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/CALEngine/src/dataset.cc b/CALEngine/src/dataset.cc
index 40eba9d..136de39 100644
--- a/CALEngine/src/dataset.cc
+++ b/CALEngine/src/dataset.cc
@@ -1,4 +1,5 @@
 #include
+#include
 #include "dataset.h"
 #include "utils/utils.h"
 
@@ -14,12 +15,14 @@ unordered_map generate_inverted_index(const SparseVectors &spars
     return inverted_index;
 }
 
+string get_doc_id(const string &para_id){
+    return para_id.substr(0, para_id.find("."));
+}
+
 vector generate_parent_documents(const Dataset &parent_dataset, const SparseVectors &sparse_vectors){
     vector parent_documents(sparse_vectors->size());
     for(int i = 0; i < parent_documents.size(); i++){
-        auto &para_id = sparse_vectors->at(i)->doc_id;
-        string doc_id = para_id.substr(0, para_id.find("."));
-        parent_documents[i] = parent_dataset.get_index(doc_id);
+        parent_documents[i] = parent_dataset.get_index(get_doc_id(sparse_vectors->at(i)->doc_id));
 
         if(i > 0 && parent_documents[i] < parent_documents[i-1]){
             fail("Paragraphs must be in increasing order of their parent document ids", -1);
@@ -202,5 +205,12 @@ ParagraphDataset::ParagraphDataset(const Dataset &_parent_dataset,
         std::unordered_map _dictionary):
 Dataset(move(sparse_vectors), _dictionary),
 parent_dataset(_parent_dataset){
+    sort(
+        doc_features->begin(),
+        doc_features->end(),
+        [&](const unique_ptr &a, const unique_ptr &b) -> bool {
+            return parent_dataset.get_index(get_doc_id(a->doc_id)) < parent_dataset.get_index(get_doc_id(b->doc_id));
+        }
+    );
     parent_documents = generate_parent_documents(_parent_dataset, doc_features);
 }