Skip to content

Commit

Permalink
Merge pull request #14 from hical/fix-para-ordering
Browse files Browse the repository at this point in the history
Fix incorrect paragraph ordering
  • Loading branch information
nims11 authored Jan 21, 2020
2 parents 9c91557 + c1ba8f8 commit 8d6dfab
Showing 1 changed file with 13 additions and 3 deletions.
16 changes: 13 additions & 3 deletions CALEngine/src/dataset.cc
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include <thread>
#include <algorithm>
#include "dataset.h"
#include "utils/utils.h"

Expand All @@ -14,12 +15,14 @@ unordered_map<string, size_t> generate_inverted_index(const SparseVectors &spars
return inverted_index;
}

// Derives the parent document id from a paragraph id by keeping only the
// characters that precede the first '.'.
// If the id contains no '.', find() yields npos and the whole id is returned.
string get_doc_id(const string &para_id){
    const auto separator_pos = para_id.find(".");
    return para_id.substr(0, separator_pos);
}

vector<int> generate_parent_documents(const Dataset &parent_dataset, const SparseVectors &sparse_vectors){
vector<int> parent_documents(sparse_vectors->size());
for(int i = 0; i < parent_documents.size(); i++){
auto &para_id = sparse_vectors->at(i)->doc_id;
string doc_id = para_id.substr(0, para_id.find("."));
parent_documents[i] = parent_dataset.get_index(doc_id);
parent_documents[i] = parent_dataset.get_index(get_doc_id(sparse_vectors->at(i)->doc_id));

if(i > 0 && parent_documents[i] < parent_documents[i-1]){
fail("Paragraphs must be in increasing order of their parent document ids", -1);
Expand Down Expand Up @@ -202,5 +205,12 @@ ParagraphDataset::ParagraphDataset(const Dataset &_parent_dataset,
std::unordered_map<std::string, TermInfo> _dictionary):
Dataset(move(sparse_vectors), _dictionary),
parent_dataset(_parent_dataset){
sort(
doc_features->begin(),
doc_features->end(),
[&](const unique_ptr<SfSparseVector> &a, const unique_ptr<SfSparseVector> &b) -> bool {
return parent_dataset.get_index(get_doc_id(a->doc_id)) < parent_dataset.get_index(get_doc_id(b->doc_id));
}
);
parent_documents = generate_parent_documents(_parent_dataset, doc_features);
}

0 comments on commit 8d6dfab

Please sign in to comment.