Skip to content

Commit

Permalink
added contigs_to_fastg from MEGAHIT,
Browse files Browse the repository at this point in the history
adapted to new gatb-core bugfixes in GraphUnitigs
added a simple 1seq.fa test (not scripted, manually ran yet)
that version doesn't have the previous misassemblies, it starts to look good
  • Loading branch information
rchikhi committed Aug 9, 2016
1 parent 31da63e commit 61d616a
Show file tree
Hide file tree
Showing 11 changed files with 498 additions and 6 deletions.
14 changes: 10 additions & 4 deletions src/Minia.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -169,8 +169,8 @@ void Minia::assembleFrom(Node startingNode, TraversalTemplate<Node,Edge,Graph_ty
if (typeid(graph) == typeid(GraphUnitigsTemplate<span>))
{
bool isolatedLeft, isolatedRight;
string sequence = graph.simplePathLongest(startingNode, isolatedLeft, isolatedRight, true);
float coverage = 0; // FIXME
float coverage = 0;
string sequence = graph.simplePathBothDirections(startingNode, isolatedLeft, isolatedRight, true, coverage);

Sequence seq (Data::ASCII);
seq.getData().setRef ((char*)sequence.c_str(), sequence.size());
Expand Down Expand Up @@ -317,8 +317,14 @@ void Minia::assemble (/*const, removed because Simplifications isn't const anymo
{
Node node = itNode.item();

// in this setting it's very simple, we don't even need NodeSelector anymore. Just assemble from any non-deleted unmarked node
if ((!hasUnitigs) && terminator->is_marked (node)) { continue; }
if (hasUnitigs)
{
if (graph.unitigIsMarked(node)) { continue; }
}
else
{
if (terminator->is_marked (node)) { continue; }
}
if (graph.isNodeDeleted(node)) { continue; }

DEBUG ((cout << endl << "-------------------------- " << graph.toString (node) << " -------------------------" << endl));
Expand Down
2 changes: 2 additions & 0 deletions test/1seq.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
>1
AAAACCCCTTTTGGGGCCCCCTTTTTTTAAAAA
2 changes: 2 additions & 0 deletions test/1seq_90bp.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
>Seq0
CTGTCGCGGGGAATTGTGGGGCGGACCACGCTCTGGCTAACGAGCTACCGTTTCCTTTAACCTGCCAGACGGTGACCAGGGCCGTTCGGC
40 changes: 40 additions & 0 deletions test/1seq_90bp.reads.fq
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
@Seq0_21_90_2:0:0_0:0:0_0/1
GCGGACCACGCTCTGTCTAACGAGCTACCGTTTCCTTTAACCTGCCATACGGTGACCAGGGCCGTTCGGC
+
2222222222222222222222222222222222222222222222222222222222222222222222
@Seq0_18_87_0:0:0_0:0:0_1/1
GAACGGCCCTGGTCACCGTCTGGCAGGTTAAAGGAAACGGTAGCTCGTTAGCCAGAGCGTGGTCCGCCCC
+
2222222222222222222222222222222222222222222222222222222222222222222222
@Seq0_10_79_1:0:0_0:0:0_2/1
GGAATTGTGGGGCGGACCACGCTCTGGCTAACTAGCTACCGTTTCCTTTAACCTGCCAGACGGTGACCAG
+
2222222222222222222222222222222222222222222222222222222222222222222222
@Seq0_16_85_0:0:0_2:0:0_3/1
ACGGCCCTGGTCACCGTCTGGCAGGTTAAAGGAAACGGTAGCTCGTTAGCCAGAGGGTGGTCCTCCCCAC
+
2222222222222222222222222222222222222222222222222222222222222222222222
@Seq0_17_86_1:0:0_0:0:0_4/1
AGGGGCGGACCACGCTCTGGCTAACGAGCTACCGTTTCCTTTAACCTGCCAGACGGTGACCAGGGCCGTT
+
2222222222222222222222222222222222222222222222222222222222222222222222
@Seq0_13_82_2:0:0_1:0:0_5/1
ATTGTGGGGGGGACCACGCTCTGTCTAACGAGCTACCGTTTCCTTTAACCTGCCAGACGGTGACCAGGGC
+
2222222222222222222222222222222222222222222222222222222222222222222222
@Seq0_7_76_0:0:0_1:0:0_6/1
CGGGGAATTGTGGGGCGGACCACGCTCTGGCTAACGAGCTACCGTTTCCTTTAACCTGCCAGACGGTGAC
+
2222222222222222222222222222222222222222222222222222222222222222222222
@Seq0_10_79_1:0:0_1:0:0_7/1
CTGGTCACCGTCTGGCAGGTTAAAGGAAACGGTAGCTGGTTAGCCAGAGCGTGGTCCGCCCCACAATTCC
+
2222222222222222222222222222222222222222222222222222222222222222222222
@Seq0_4_73_0:0:0_3:0:0_8/1
ACCGTCTGGCAGGTAAAAGGACACGGTAGCTCGATAGCCAGAGCGTGGTCCGCCCCACAATTCCCCGCGA
+
2222222222222222222222222222222222222222222222222222222222222222222222
@Seq0_2_71_1:0:0_0:0:0_9/1
TGTCGCGGGGAATTGTGGGGCGGACCACGCTCTGGCTAACGAGCTAGCGTTTCCTTTAACCTGCCAGACG
+
2222222222222222222222222222222222222222222222222222222222222222222222
3 changes: 3 additions & 0 deletions test/1seq_90bp_circ.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
>same as 1seq_90 but made it circular by duplicating it
CTGTCGCGGGGAATTGTGGGGCGGACCACGCTCTGGCTAACGAGCTACCGTTTCCTTTAACCTGCCAGACGGTGACCAGGGCCGTTCGGC
CTGTCGCGGGGAATTGTGGGGCGGACCACGCTCTGGCTAACGAGCTACCGTTTCCTTTAACCTGCCAGACGGTGACCAGGGCCGTTCGGC
3 changes: 3 additions & 0 deletions test/1seq_90bp_simulate_reads.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
wgsim -N 10 -h -r 0 -R 0 -X 0 -1 70 -2 10 -d 0 -s 0 1seq_90bp.fa 1seq_90bp.reads.fq /dev/null
wgsim -N 200 -h -r 0 -R 0 -X 0 -1 70 -2 10 -d 0 -s 0 1seq_90bp_circ.fa 1seq_90bp_circ.reads.fq /dev/null

2 changes: 1 addition & 1 deletion test/buchnera_simulate_reads.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,6 @@ rm -f buchnera.reads*.fq
N=15000
for i in $(seq 1 15)
do
~/tools/samtools-1.3/misc/wgsim -N $N -r 0.008 buchnera.fasta buchnera.reads$i.fq /dev/null > /dev/null
wgsim -N $N -r 0.008 buchnera.fasta buchnera.reads$i.fq /dev/null > /dev/null
done
ls -1 buchnera.reads*.fq > buchnera
178 changes: 178 additions & 0 deletions thirdparty/contig2fastg/contigs_to_fastg.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
/*
*
* taken from..
*
* MEGAHIT
* Copyright (C) 2014 - 2015 The University of Hong Kong & L3 Bioinformatics Limited
*
* ..cheers!
* it's GPL but we're GPL too so we're good, right? guys.. right?
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

/* contact: Dinghua Li <[email protected]> */

#include <string>
#include <unordered_map>
#include <vector>
#include <cstdio>
#include <cassert>
#include <zlib.h>

#include "kseq.h"

using namespace std;

#ifndef KSEQ_INITED
#define KSEQ_INITED
KSEQ_INIT(gzFile, gzread)
#endif

char Comp(char c) {
switch (c) {
case 'A':
case 'a':
return 'T';

case 'C':
case 'c':
return 'G';

case 'G':
case 'g':
return 'C';

case 'T':
case 't':
return 'A';

default:
assert(false);
}
}

string RevComp(const string &s) {
string ret;

for (unsigned i = 0; i < s.length(); ++i) {
ret.push_back(Comp(s[s.length() - 1 - i]));
}

return ret;
}

string NodeName(int i, int len, double mul, bool is_rc) {
static char buf[10240];
sprintf(buf, "NODE_%d_length_%d_cov_%0.2f_ID_%d", i, len, mul, i * 2 - 1);

if (is_rc) return string(buf) + "'";
else return string(buf);
}

int main(int argc, char **argv) {
if (argc < 3) {
fprintf(stderr, "Usage: %s <contigs.fasta> <kmer_size>\n", argv[0]);
exit(1);
}

unsigned k = atoi(argv[2]) - 1; // megahit uses a different de bruijn definition as us, so it's k-1 for us
gzFile fp = gzopen(argv[1], "r");
if (fp == NULL)
{
fprintf(stderr,"cannot open file: %s ", argv[1]);
exit(1);
}
kseq_t *seq = kseq_init(fp); // kseq to read files

vector<string> ctgs;
vector<double> muls;
vector<string> node_names;
vector<string> rev_node_names;
unordered_map<string, vector<int> > start_kmer_to_id;
bool has_muls = true;

while (kseq_read(seq) >= 0) {
if (seq->seq.l < k) {
continue;
}

float mul;

// Minia has a special output, actually similar to the one this tool outputs. Let's parse it in a convenient way
for (int i = 0; i < seq->name.l; i++)
if (seq->name.s[i] == '_') seq->name.s[i] = ' ';

int osef;
static char osef_s[100];
int ret = sscanf(seq->name.s, "%s %d %s %d %s %f %s %d", &osef_s, &osef, &osef_s, &osef, &osef_s, &mul, &osef_s, &osef);

if (ret != 8 && has_muls)
{
fprintf(stderr, "unexpected format string, disabling recording of coverage. Is this a Minia contig? Header of first seq: '%s %s'\n", seq->name.s, seq->comment.s);
has_muls = false;

}

if (has_muls)
muls.push_back(mul);

ctgs.push_back(string(seq->seq.s));
}

for (int i = 0; i < (int)ctgs.size(); ++i) {
start_kmer_to_id[ctgs[i].substr(0, k)].push_back(i + 1);
start_kmer_to_id[RevComp(ctgs[i].substr(ctgs[i].length() - k))].push_back(-i - 1);
}

for (int i = 0; i < (int)ctgs.size(); ++i) {
float mul = 0;
if (has_muls)
mul = muls[i];
node_names.push_back(NodeName(i + 1, ctgs[i].length(), mul, false));
rev_node_names.push_back(NodeName(i + 1, ctgs[i].length(), mul, true));
}

for (int i = 0; i < (int)ctgs.size(); ++i) {
for (int dir = 0; dir < 2; ++dir) {
string header = dir == 0 ? node_names[i] : rev_node_names[i];
header = ">" + header;
string s = dir == 0 ? ctgs[i] : RevComp(ctgs[i]);
auto mit = start_kmer_to_id.find(s.substr(s.length() - k));

if (mit != start_kmer_to_id.end()) {
for (unsigned j = 0; j < mit->second.size(); ++j) {
if (j == 0) {
header += ":";
}
else {
header += ",";
}

if (mit->second[j] > 0) {
header += node_names[mit->second[j] - 1];
}
else {
header += rev_node_names[-mit->second[j] - 1];
}
}
}

header += ";";
printf("%s\n%s\n", header.c_str(), s.c_str());
}
}

return 0;
}
Loading

0 comments on commit 61d616a

Please sign in to comment.