Skip to content

Commit

Permalink
Add a --drop-n / -N option to drop any sequence containing an N
Browse files Browse the repository at this point in the history
This is a stricter version of --truncate-n, which may still use the sequence
fragment up to the first N provided it passes the length filter.
  • Loading branch information
tsibley committed Aug 15, 2014
1 parent 705ae6c commit c657534
Show file tree
Hide file tree
Showing 4 changed files with 39 additions and 12 deletions.
2 changes: 1 addition & 1 deletion src/sickle.h
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,6 @@ typedef struct __cutsites_ {
/* Function Prototypes */
int single_main (int argc, char *argv[]);
int paired_main (int argc, char *argv[]);
cutsites* sliding_window (kseq_t *fqrec, int qualtype, int length_threshold, int qual_threshold, int no_fiveprime, int trunc_n, int debug);
cutsites* sliding_window (kseq_t *fqrec, int qualtype, int length_threshold, int qual_threshold, int no_fiveprime, int trunc_n, int drop_n, int debug);

#endif /*SICKLE_H*/
14 changes: 9 additions & 5 deletions src/sliding.c
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ int get_quality_num (char qualchar, int qualtype, kseq_t *fqrec, int pos) {
}


cutsites* sliding_window (kseq_t *fqrec, int qualtype, int length_threshold, int qual_threshold, int no_fiveprime, int trunc_n, int debug) {
cutsites* sliding_window (kseq_t *fqrec, int qualtype, int length_threshold, int qual_threshold, int no_fiveprime, int trunc_n, int drop_n, int debug) {

int window_size = (int) (0.1 * fqrec->seq.l);
int i,j;
Expand Down Expand Up @@ -112,10 +112,14 @@ cutsites* sliding_window (kseq_t *fqrec, int qualtype, int length_threshold, int
}


/* If truncate N option is selected, and sequence has Ns, then */
/* change 3' cut site to be the base before the first N */
if (trunc_n && ((npos = strstr(fqrec->seq.s, "N")) || (npos = strstr(fqrec->seq.s, "n")))) {
three_prime_cut = npos - fqrec->seq.s;
/* If truncate N option is selected, and sequence has Ns, then
* change 3' cut site to be the base before the first N.
* If drop N option is selected, omit the sequence. */
if ((npos = strstr(fqrec->seq.s, "N")) || (npos = strstr(fqrec->seq.s, "n"))) {
if (trunc_n)
three_prime_cut = npos - fqrec->seq.s;
else if (drop_n)
three_prime_cut = five_prime_cut = -1;
}

/* if cutting length is less than threshold then return -1 for both */
Expand Down
20 changes: 16 additions & 4 deletions src/trim_paired.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ __KSEQ_READ
int paired_qual_threshold = 20;
int paired_length_threshold = 20;

static const char *paired_short_options = "df:r:c:t:o:p:m:M:s:q:l:xng";
static const char *paired_short_options = "df:r:c:t:o:p:m:M:s:q:l:xnNg";
static struct option paired_long_options[] = {
{ "qual-type", required_argument, NULL, 't' },
{ "pe-file1", required_argument, NULL, 'f' },
Expand All @@ -30,6 +30,7 @@ static struct option paired_long_options[] = {
{ "length-threshold", required_argument, NULL, 'l' },
{ "no-fiveprime", no_argument, NULL, 'x' },
{ "truncate-n", no_argument, NULL, 'n' },
{ "drop-n", no_argument, NULL, 'N' },
{ "gzip-output", no_argument, NULL, 'g' },
{ "output-combo-all", required_argument, NULL, 'M' },
{ "quiet", no_argument, NULL, 'z' },
Expand Down Expand Up @@ -95,6 +96,7 @@ void paired_usage (int status, char *msg) {
" trimming. Default %4$d.\n"
"-x, --no-fiveprime Don't do five prime trimming.\n"
"-n, --truncate-n Truncate sequences at position of first N.\n"
"-N, --drop-n Discard sequences containing an N.\n"
"-g, --gzip-output Output gzipped files.\n"
"--quiet Do not output trimming info\n"
"--help Display this help and exit\n"
Expand Down Expand Up @@ -154,6 +156,7 @@ int paired_main(int argc, char *argv[]) {
int quiet = 0;
int no_fiveprime = 0;
int trunc_n = 0;
int drop_n = 0;
int gzip_output = 0;
int combo_all=0;
int combo_s=0;
Expand Down Expand Up @@ -246,6 +249,10 @@ int paired_main(int argc, char *argv[]) {
trunc_n = 1;
break;

case 'N':
drop_n = 1;
break;

case 'g':
gzip_output = 1;
break;
Expand Down Expand Up @@ -276,6 +283,11 @@ int paired_main(int argc, char *argv[]) {
paired_usage(EXIT_FAILURE, "****Error: Quality type is required.");
}

if (trunc_n && drop_n) {
fprintf(stderr, "****Error: cannot specify both --truncate-n and --drop-n\n\n");
return EXIT_FAILURE;
}

/* make sure minimum input filenames are specified */
if (!infn1 && !infnc) {
paired_usage(EXIT_FAILURE, "****Error: Must have either -f OR -c argument.");
Expand Down Expand Up @@ -414,8 +426,8 @@ int paired_main(int argc, char *argv[]) {
break;
}

p1cut = sliding_window(fqrec1, qualtype, paired_length_threshold, paired_qual_threshold, no_fiveprime, trunc_n, debug);
p2cut = sliding_window(fqrec2, qualtype, paired_length_threshold, paired_qual_threshold, no_fiveprime, trunc_n, debug);
p1cut = sliding_window(fqrec1, qualtype, paired_length_threshold, paired_qual_threshold, no_fiveprime, trunc_n, drop_n, debug);
p2cut = sliding_window(fqrec2, qualtype, paired_length_threshold, paired_qual_threshold, no_fiveprime, trunc_n, drop_n, debug);
total += 2;

if (debug) printf("p1cut: %d,%d\n", p1cut->five_prime_cut, p1cut->three_prime_cut);
Expand Down Expand Up @@ -520,7 +532,7 @@ int paired_main(int argc, char *argv[]) {
}

if (!quiet) {
if (infn1 && infn2) fprintf(stdout, "\nPE forwrd file: %s\nPE reverse file: %s\n", infn1, infn2);
if (infn1 && infn2) fprintf(stdout, "\nPE forward file: %s\nPE reverse file: %s\n", infn1, infn2);
if (infnc) fprintf(stdout, "\nPE interleaved file: %s\n", infnc);
fprintf(stdout, "\nTotal input FastQ records: %d (%d pairs)\n", total, (total / 2));
fprintf(stdout, "\nFastQ paired records kept: %d (%d pairs)\n", kept_p, (kept_p / 2));
Expand Down
15 changes: 13 additions & 2 deletions src/trim_single.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ __KSEQ_READ
int single_qual_threshold = 20;
int single_length_threshold = 20;

static const char *single_short_options = "df:t:o:q:l:zxng";
static const char *single_short_options = "df:t:o:q:l:zxnNg";
static struct option single_long_options[] = {
{ "fastq-file", required_argument, NULL, 'f' },
{ "output-file", required_argument, NULL, 'o' },
Expand Down Expand Up @@ -53,6 +53,7 @@ void single_usage(int status, char *msg) {
" trimming. Default %4$d.\n"
"-x, --no-fiveprime Don't do five prime trimming.\n"
"-n, --truncate-n Truncate sequences at position of first N.\n"
"-N, --drop-n Discard sequences containing an N.\n"
"-g, --gzip-output Output gzipped files.\n"
"--quiet Do not output trimming info\n"
"--help Display this help and exit\n"
Expand Down Expand Up @@ -92,6 +93,7 @@ int single_main(int argc, char *argv[]) {
int quiet = 0;
int no_fiveprime = 0;
int trunc_n = 0;
int drop_n = 0;
int gzip_output = 0;
int total=0;

Expand Down Expand Up @@ -153,6 +155,10 @@ int single_main(int argc, char *argv[]) {
trunc_n = 1;
break;

case 'N':
drop_n = 1;
break;

case 'g':
gzip_output = 1;
break;
Expand Down Expand Up @@ -208,12 +214,17 @@ int single_main(int argc, char *argv[]) {
}
}

if (trunc_n && drop_n) {
fprintf(stderr, "****Error: cannot specify both --truncate-n and --drop-n\n\n");
return EXIT_FAILURE;
}


fqrec = kseq_init(se);

while ((l = kseq_read(fqrec)) >= 0) {

p1cut = sliding_window(fqrec, qualtype, single_length_threshold, single_qual_threshold, no_fiveprime, trunc_n, debug);
p1cut = sliding_window(fqrec, qualtype, single_length_threshold, single_qual_threshold, no_fiveprime, trunc_n, drop_n, debug);
total++;

if (debug) printf("P1cut: %d,%d\n", p1cut->five_prime_cut, p1cut->three_prime_cut);
Expand Down

0 comments on commit c657534

Please sign in to comment.