Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Change createdb.cpp so that it takes in a ".txt" file containing paths of different fasta files #879

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion src/MMseqsBase.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@ std::vector<Command> baseCommands = {
"Sensitive homology search",
"# Search multiple FASTA against FASTA (like BLASTP, TBLASTN, BLASTX, BLASTN --search-type 3, TBLASTX --search-type 2)\n"
"mmseqs easy-search examples/QUERY.fasta examples/QUERY.fasta examples/DB.fasta result.m8 tmp\n\n"
"# Search multiple query fasta files against target fasta files using a tsv file containing filepaths\n"
"echo -e \"dir1/QUERY1.fasta\\ndir2/QUERY2.fasta\" > examples/queries.tsv\n"
"echo -e \"dir3/TARGET1.fasta\\ndir4/TARGET2.fasta\" > examples/targets.tsv\n"
"mmseqs easy-search examples/queries.tsv examples/targets.tsv result.m8 tmp\n\n"
"# Iterative profile search from stdin (like PSI-BLAST)\n"
"cat examples/QUERY.fasta | mmseqs easy-search stdin examples/DB.fasta result.m8 tmp --num-iterations 2\n\n"
"# Profile search against small databases (e.g. PFAM, eggNOG)\n"
Expand Down Expand Up @@ -125,7 +129,10 @@ std::vector<Command> baseCommands = {
"# Create a seqDB from stdin\n"
"cat seq.fasta | mmseqs createdb stdin sequenceDB\n\n"
"# Create a seqDB by indexing existing FASTA/Q (for single line fasta entries only)\n"
"mmseqs createdb seq.fasta sequenceDB --createdb-mode 1\n",
"mmseqs createdb seq.fasta sequenceDB --createdb-mode 1\n\n"
"# Create a seqDB from a tsv file containing filepaths of multiple FASTA files in each line\n"
"echo -e \"dir1/bacteria.fasta\\ndir2/archea.fasta.gz\" > filepaths.tsv\n"
"mmseqs createdb filepaths.tsv sequenceDB\n",
"Martin Steinegger <[email protected]>",
"<i:fastaFile1[.gz|.bz2]> ... <i:fastaFileN[.gz|.bz2]>|<i:stdin> <o:sequenceDB>",
CITATION_MMSEQS2, {{"fast[a|q]File[.gz|bz2]|stdin", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::flatfileStdinAndGeneric },
Expand Down
22 changes: 22 additions & 0 deletions src/util/createdb.cpp
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,28 @@ int createdb(int argc, const char **argv, const Command& command) {
std::vector<std::string> filenames(par.filenames);
std::string dataFile = filenames.back();
filenames.pop_back();
if (Util::endsWith(".tsv", filenames[0])) {
if (filenames.size() > 1) {
Debug(Debug::ERROR) << "Only one tsv file can be given\n";
EXIT(EXIT_FAILURE);
}
std::string tsv = filenames.back();
filenames.pop_back();

FILE* file = FileUtil::openFileOrDie(tsv.c_str(), "r", true);
char* line = NULL;
size_t len = 0;
ssize_t read;
while ((read = getline(&line, &len, file)) != -1) {
if (line[read - 1] == '\n') {
line[read - 1] = '\0';
read--;
}
filenames.push_back(line);
}
free(line);
fclose(file);
}

for (size_t i = 0; i < filenames.size(); i++) {
if (FileUtil::directoryExists(filenames[i].c_str()) == true) {
Expand Down