Skip to content

Commit

Permalink
use compressed file ans better default parameters
Browse files Browse the repository at this point in the history
  • Loading branch information
Malfoy committed Jan 18, 2019
1 parent 8accf1a commit b0e17a8
Show file tree
Hide file tree
Showing 3 changed files with 76 additions and 83 deletions.
82 changes: 43 additions & 39 deletions blight.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,12 @@ using namespace chrono;
int main(int argc, char ** argv){
char ch;
string input,query;
uint k(0);
uint m1(7);
uint m2(5);
uint k(31);
uint m1(9);
uint m2(7);
uint m3(3);
uint c(1);
uint bit(6);
uint bit(5);
while ((ch = getopt (argc, argv, "g:q:k:m:n:s:t:b:")) != -1){
switch(ch){
case 'q':
Expand Down Expand Up @@ -77,46 +77,50 @@ int main(int argc, char ** argv){
<<"Mandatory arguments"<<endl
<<"-g graph file"<<endl
<<"-q query file"<<endl
<<"-k k value used for graph "<<endl<<endl
<<"-m minimizer size (7)"<<endl
<<"-n to create 4^n mphf (5). More mean slower construction but better index, must be <=m"<<endl
<<"-k k value used for graph (31) "<<endl<<endl
<<"-m minimizer size (9)"<<endl
<<"-n to create 4^n mphf (7). More mean slower construction but better index, must be <=m"<<endl
<<"-s to use 4^s files (3). More reduce memory usage and use more files, must be <=n"<<endl
<<"-t core used (1)"<<endl
<<"-b bit saved to encode positions (6). Will reduce the memory usage of b bit per kmer but query have to check 2^b kmers"<<endl;
<<"-b bit saved to encode positions (5). Will reduce the memory usage of b bit per kmer but query have to check 2^b kmers"<<endl;
return 0;
}
kmer_Set_Light ksl(k,m1,m2,m3,c,bit);

high_resolution_clock::time_point t1 = high_resolution_clock::now();

ksl.abundance_minimizer_construct(input);
cout<<"Abundance minimizer computed"<<endl;

ksl.create_super_buckets(input);
high_resolution_clock::time_point t12 = high_resolution_clock::now();
duration<double> time_span12 = duration_cast<duration<double>>(t12 - t1);
cout<<"Super bucket created: "<< time_span12.count() << " seconds."<<endl;

ksl.read_super_buckets("_out");
high_resolution_clock::time_point t13 = high_resolution_clock::now();
duration<double> time_span13 = duration_cast<duration<double>>(t13 - t12);

high_resolution_clock::time_point t2 = high_resolution_clock::now();
duration<double> time_span = duration_cast<duration<double>>(t2 - t1);
cout << "The whole indexing took me " << time_span.count() << " seconds."<< endl;
{
kmer_Set_Light ksl(k,m1,m2,m3,c,bit);

high_resolution_clock::time_point t1 = high_resolution_clock::now();

//~ ksl.abundance_minimizer_construct(input);
//~ cout<<"Abundance minimizer computed"<<endl;

ksl.create_super_buckets(input);
high_resolution_clock::time_point t12 = high_resolution_clock::now();
duration<double> time_span12 = duration_cast<duration<double>>(t12 - t1);
cout<<"Super bucket created: "<< time_span12.count() << " seconds."<<endl;

ksl.read_super_buckets("_out");
high_resolution_clock::time_point t13 = high_resolution_clock::now();
duration<double> time_span13 = duration_cast<duration<double>>(t13 - t12);

high_resolution_clock::time_point t2 = high_resolution_clock::now();
duration<double> time_span = duration_cast<duration<double>>(t2 - t1);
cout << "The whole indexing took me " << time_span.count() << " seconds."<< endl;
//~ cin.get();
ksl.file_query(query,false);
high_resolution_clock::time_point t3 = high_resolution_clock::now();
duration<double> time_span2 = duration_cast<duration<double>>(t3 - t2);
cout << "The serial query took me " << time_span2.count() << " seconds."<<endl;

ksl.file_query(query,true);
high_resolution_clock::time_point t4 = high_resolution_clock::now();
duration<double> time_span3 = duration_cast<duration<double>>(t4 - t3);
cout << "The optimized query took me " << time_span3.count() << " seconds."<<endl;

cout<<"I am glad you are here with me. Here at the end of all things."<<endl;
cout<<ksl.bucketSeq.size()<<endl;
cout<<ksl.positions.size()<<endl;
}
//~ cin.get();
ksl.file_query(query,false);
high_resolution_clock::time_point t3 = high_resolution_clock::now();
duration<double> time_span2 = duration_cast<duration<double>>(t3 - t2);
cout << "The serial query took me " << time_span2.count() << " seconds."<<endl;

ksl.file_query(query,true);
high_resolution_clock::time_point t4 = high_resolution_clock::now();
duration<double> time_span3 = duration_cast<duration<double>>(t4 - t3);
cout << "The optimized query took me " << time_span3.count() << " seconds."<<endl;

cout<<"I am glad you are here with me. Here at the end of all things."<<endl;

return 0;
}

56 changes: 25 additions & 31 deletions ksl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ kmer nuc2int(char c){
}



kmer nuc2intrc(char c){
switch(c){
/*
Expand All @@ -66,6 +67,8 @@ kmer nuc2intrc(char c){
return 0;
}



string intToString(uint64_t n){
if(n<1000){
return to_string(n);
Expand Down Expand Up @@ -108,6 +111,7 @@ string getCanonical(const string& str){
}



kmer str2num(const string& str){
kmer res(0);
for(uint i(0);i<str.size();i++){
Expand Down Expand Up @@ -231,16 +235,16 @@ uint32_t kmer_Set_Light::minimizer_according_xs(kmer seq){
seq>>=2;
mmer=seq%minimizer_number;
mmer=min(mmer,(uint32_t)rcb(mmer,m1));
uint value_min(abundance_minimizer[mini]),value_mmer(abundance_minimizer[mini]);
//~ uint value_min(abundance_minimizer[mini]),value_mmer(abundance_minimizer[mini]);

if(value_min>value_mmer){
mini=mmer;
}else{
if(value_min==value_mmer){
mini=((xs(mini)<=xs(mmer))?mini:mmer);
}
}
//~ mini=((xs(mini)<=xs(mmer))?mini:mmer);
//~ if(value_min>value_mmer){
//~ mini=mmer;
//~ }else{
//~ if(value_min==value_mmer){
//~ mini=((xs(mini)<=xs(mmer))?mini:mmer);
//~ }
//~ }
mini=((xs(mini)<=xs(mmer))?mini:mmer);
}
return mini;
}
Expand Down Expand Up @@ -313,9 +317,11 @@ void kmer_Set_Light::create_super_buckets(const string& input_file){
exit(1);
}

vector<ofstream*> out_files;
//~ vector<ofstream*> out_files;
vector<ostream*> out_files;
for(uint i(0);i<number_superbuckets;++i){
auto out =new ofstream("_out"+to_string(i));
//~ auto out =new ofstream("_out"+to_string(i));
auto out =new zstr::ofstream("_out"+to_string(i));
out_files.push_back(out);
}
//~ cout<<"file"<<endl;
Expand Down Expand Up @@ -385,6 +391,7 @@ void kmer_Set_Light::create_super_buckets(const string& input_file){
}
}
}
delete inUnitigs;
for(uint i(0);i<number_superbuckets;++i){
*out_files[i]<<flush;
delete(out_files[i]);
Expand Down Expand Up @@ -461,8 +468,9 @@ void kmer_Set_Light::read_super_buckets(const string& input_file){
//~ total_size+=all_buckets[ii]->bucketSeq.capacity()/2;
//~ }
//~ }
//~ ifstream in(input_file+to_string(SBC));
auto in=new ifstream(input_file+to_string(SBC));

//~ auto in=new ifstream(input_file+to_string(SBC));
auto in=new zstr::ifstream((input_file+to_string(SBC)));

while(not in->eof() and in-> good()){
useless="";
Expand All @@ -481,7 +489,7 @@ void kmer_Set_Light::read_super_buckets(const string& input_file){
}
}
delete in;
//~ remove((input_file+to_string(SBC)).c_str());
remove((input_file+to_string(SBC)).c_str());
create_mphf(BC,BC+bucket_per_superBuckets);
fill_positions(BC,BC+bucket_per_superBuckets);
BC+=bucket_per_superBuckets;
Expand Down Expand Up @@ -622,13 +630,13 @@ void kmer_Set_Light::print_kmer(kmer num){


void kmer_Set_Light::create_mphf(uint begin_BC,uint end_BC){
#pragma omp parallel num_threads(coreNumber)
//~ #pragma omp parallel num_threads(coreNumber)
{
uint64_t anchors_number(0);
vector<kmer> anchors;
uint largest_bucket_anchor(0);
uint largest_bucket_nuc(0);
#pragma omp for schedule(dynamic, number_bucket_per_mphf)
//~ #pragma omp for schedule(dynamic, number_bucket_per_mphf)
for(uint BC=(begin_BC);BC<end_BC;++BC){
if(all_buckets[BC].nuc_minimizer!=0){
largest_bucket_nuc=max(largest_bucket_nuc,all_buckets[BC].nuc_minimizer);
Expand Down Expand Up @@ -762,20 +770,6 @@ int64_t kmer_Set_Light::correct_pos(uint32_t mini, uint64_t p){

















uint32_t kmer_Set_Light::get_anchors(const string& query,uint& minimizer, vector<kmer>& kmerV,uint pos){
//~ #pragma omp atomic
//~ number_query++;
Expand Down Expand Up @@ -902,7 +896,7 @@ void kmer_Set_Light::file_query(const string& query_file,bool optimized){
auto in=new zstr::ifstream(query_file);
uint64_t TP(0),FP(0);
//~ cout<<"go"<<endl;
#pragma omp parallel num_threads(1)
#pragma omp parallel num_threads(coreNumber)
{
vector<kmer> kmerV;
uint32_t minimizer;
Expand Down
21 changes: 8 additions & 13 deletions ksl.h
Original file line number Diff line number Diff line change
Expand Up @@ -122,10 +122,10 @@ class kmer_Set_Light{
}
//~ abundance_minimizer.resize(minimizer_number);
all_buckets=new bucket_minimizer[minimizer_number];
abundance_minimizer_temp=new uint32_t[minimizer_number];
abundance_minimizer=new uint8_t[minimizer_number];
memset(abundance_minimizer, 0, minimizer_number*sizeof(uint8_t));
memset(abundance_minimizer_temp, 0, minimizer_number*sizeof(uint32_t));
//~ abundance_minimizer_temp=new uint32_t[minimizer_number];
//~ abundance_minimizer=new uint8_t[minimizer_number];
//~ memset(abundance_minimizer, 0, minimizer_number*sizeof(uint8_t));
//~ memset(abundance_minimizer_temp, 0, minimizer_number*sizeof(uint32_t));
for(uint i(0);i<minimizer_number;++i){
all_buckets[i]={0,0,0};
}
Expand All @@ -144,9 +144,12 @@ class kmer_Set_Light{

~kmer_Set_Light () {
delete[] all_buckets;
for(uint i(0);i<minimizer_number/number_bucket_per_mphf;++i){
delete all_mphf[i].kmer_MPHF;
}
delete[] all_mphf;
delete[] Valid_kmer;
delete[] abundance_minimizer;;
//~ delete[] abundance_minimizer;;
}

bool exists(const kmer& query);
Expand Down Expand Up @@ -176,14 +179,6 @@ class kmer_Set_Light{
void int_to_bool(uint n_bits_to_encode,uint32_t X, uint32_t pos,uint64_t start);
kmer update_kmer_local(uint32_t pos,const vector<bool>& V,kmer input);
vector<bool> get_seq(uint32_t mini,uint32_t pos,uint32_t n);








};


Expand Down

0 comments on commit b0e17a8

Please sign in to comment.