Finish Release-2.3.3

- Fixed a bug which was causing a seg fault when using the comp tool with bin sizes larger than the default value. - Tweaked header of SECT stats output to make it clearer. - SECT now allows includes lowercase bases in analysis. - Added stack trace if a seg fault occurs
TGAC · Apr 7, 2017 · 7181ee6 · 7181ee6
2 parents 7666a08 + 17a8ec1
commit 7181ee6
Show file tree

Hide file tree

Showing 10 changed files with 155 additions and 23 deletions.
diff --git a/configure.ac b/configure.ac
@@ -4,7 +4,7 @@
 
 # Autoconf initialistion.  Sets package name version and contact details
 AC_PREREQ([2.68])
-AC_INIT([kat],[2.3.2],[https://github.com/TGAC/KAT/issues],[kat],[https://github.com/TGAC/KAT])
+AC_INIT([kat],[2.3.3],[https://github.com/TGAC/KAT/issues],[kat],[https://github.com/TGAC/KAT])
 AC_CONFIG_SRCDIR([src/kat.cc])
 AC_CONFIG_AUX_DIR([build-aux])
 AC_CONFIG_MACRO_DIR([m4])

diff --git a/doc/source/conf.py b/doc/source/conf.py
@@ -52,9 +52,9 @@
 # built documents.
 #
 # The short X.Y version.
-version = '2.3.2'
+version = '2.3.3'
 # The full version, including alpha/beta/rc tags.
-release = '2.3.2'
+release = '2.3.3'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.

diff --git a/doc/source/using.rst b/doc/source/using.rst
@@ -27,6 +27,33 @@ Basic usage::
     kat hist [options] (<input>)+
 
 
+Output:
+
+Produces a histogram file and associated `Spectra hist`_ plot.
+The histogram file starts with a header describing settings used to generate the
+histogram.  Each header line starts with a '#' character.  The histogram that follows
+describes each K-mer frequency followed by the number of distinct K-mers found
+at that frequency separated by a space.  Each frequency / count pair is line separated.
+For example::
+
+    # Title:27-mer spectra for: SRR519624_1.1M.fastq
+    # XLabel:27-mer frequency
+    # YLabel:# distinct 27-mers
+    # Kmer value:27
+    # Input 1:SRR519624_1.1M.fastq
+    ###
+    1 47573743
+    2 4737789
+    3 732453
+    4 184505
+    5 100293
+    6 78699
+    7 68553
+    8 59589
+    9 50926
+    ...
+
+
 Applications:
 
  * Assess data quality: estimates of kmers deriving from errors; sequencing bias
@@ -51,6 +78,28 @@ Basic usage::
 
     kat gcp (<input>)+
 
+Output:
+
+Produces a matrix file and associated `Density`_ plot. The matrix file starts with 
+a header describing settings used to generate the matrix.  Each header line starts 
+with a '#' character.  The matrix that follows contains a space separated list of
+distinct k-mer counts for the GC-count indexed by the row.  Each column index represents
+the K-mer Frequency. For example::
+
+    # Title:K-mer coverage vs GC count plot for: ERR409722_1.fastq ERR409722_2.fastq 
+    # XLabel:K-mer multiplicity
+    # YLabel:GC count
+    # ZLabel:Distinct K-mers per bin
+    # Columns:1001
+    # Rows:27
+    # MaxVal:7705834
+    # Transpose:0
+    ###
+    0 65392 10715 6038 4615 3769 3140 2690 2133 1748 1519 1370 1098 840 ...
+    0 189337 30772 20040 16630 15579 13673 12809 11890 10380 9605 8403 7370 6302 ...
+    0 428150 66753 41453 37478 34599 34622 34572 32740 31487 30356 28369 26880 ...
+    ...
+
 Applications:
 
  * Assess data quality: estimates of kmers deriving from errors; sequencing bias
@@ -88,24 +137,50 @@ an assembly::
 
     kat comp -t 8 -o pe_v_asm_test 'PE1.R?.fq' asm.fa
 
+... or if the reads are gzipped::
+    
+    kat comp -t 8 -o pe_v_asm_test <(gunzip -c 'PE1.R?.fq') asm.fa
+
+Output:
+
+Produces a matrix file and by default a `Spectra CN`_ plot, although can also produce
+a `Density` plot if requested. The matrix file is structured in a similar way to the
+GCP tool with a header describing settings used to generate the matrix.  Each header line starts 
+with a '#' character.  The matrix that follows contains a space separated list of
+distinct k-mer counts for the frequency in each input file represented by the row 
+and column index. For example::
+
+    # Title:K-mer comparison plot
+    # XLabel:K-mer multiplicity for: ERR409722_1.fastq
+    # YLabel:K-mer multiplicity for: ERR409722_2.fastq
+    # ZLabel:Distinct K-mers per bin
+    # Columns:1001
+    # Rows:1001
+    # MaxVal:57106148
+    # Transpose:1
+    ###
+    0 57106148 2133673 428934 134189 45267 16399 6603 3374 2066 1371 930 752 490 ...
+    50919938 10364720 1613532 607932 239439 89985 36398 16589 8811 5469 3369 ...
+    1990321 1605550 1061952 561999 271443 125163 61769 34379 22459 15647 11171 ...
+    ...
 
 Applications:
 
  * Determine sequencing bias between left and right read pairs.
- * Compare the kmer spectrum of input reads against an assembly to gauge assembly completeness
+ * Compare the kmer spectrum of input reads against an assembly to gauge assembly completeness.
 
 
 
 SECT
 ----
 
 Estimates coverage levels across sequences in the provided input sequence file.
-This tool will produce a fasta style representation of the input sequence file 
-containing K-mer coverage counts mapped across each sequence.  K-mer coverage is 
+This tool will produce a FastA style representation of the input sequence file 
+containing K-mer coverage counts mapped across each sequence separated by spaces.  
+K-mer coverage is 
 determined from the provided counts input file, which can be either one jellyfish 
-hash, or one or more FastA / FastQ files.  In addition, a space separated table 
-file containing the mean coverage score and GC of each sequence is produced.  The 
-row order is identical to the original sequence file.
+hash, or one or more FastA / FastQ files.  In addition, a file containing statistics
+about each target sequence is produced.
 
 NOTE: K-mers containing any Ns derived from sequences in the sequence file not be 
 included.
@@ -114,12 +189,39 @@ Basic usage::
 
     kat sect [options] <sequence_file> (<input>)+
 
+Output:
+
+Produces a FastA-style representation of the K-mer frequency across each target sequence
+as well as a file describing statistics for each target sequence.  The FastA-style
+output might look like this::
+
+    >Chr4
+    31 31 31 29 29 29 28 27 27 28 28 28 30 30 30 29 29 29 29 31 32 32 33 33 33 31 29 30 ...
+
+With an associated tab separated stats file which looks like this::
+
+    seq_name median       mean      gc%  seq_length  kmers_in_seq  invalid_kmers  %_invalid  non_zero_kmers  %_non_zero  %_non_zero_corrected
+    Chr4         26  362.45141  0.36204    18585056      18585036           3214    0.01729        18549840    99.81065              99.82792
+
+The column headers have the following meaning:
+ * seq_name - The name of the FastA/Q sequence
+ * median - The median K-mer coverage across the sequence
+ * mean - The mean K-mer coverage across the sequence
+ * gc% - The GC% of the sequence
+ * seq_length - The length of the sequence
+ * kmers_in_seq - The number of K-mers in the sequence.  i.e. seq_length - K + 1
+ * invalid_kmers - The number of K-mers in the sequence that cannot be counted, most likely due to being non-canonical.  i.e. non A,T,G,C
+ * %_invalid - The percentage of the sequence which contains invalid K-mers
+ * non_zero_kmers - The number of K-mers that have a coverage of 1 or greater
+ * %_non_zero - The percentage of the sequence which has a K-mer coverage greater than 1
+ * %_non_zero_corrected - The percentage of the sequence which has a K-mer coverage greater than 1 but ignoring any parts of the sequence represented by invalid K-mers.
+
 
 Applications:
 
- * Analyse kmer coverage across assembled sequences
- * Compare assemblies using kmers, helpful to levels of contamination of a specific organism.
- * Contamination detection - Compare kmer spectrum against assembly providing average coverage and GC values for each contig, which can be 2D binned and plot as a heatmap
+ * Analyse K-mer coverage across assembled sequences
+ * Compare assemblies using K-mers, helpful to levels of contamination of a specific organism.
+ * Contamination detection - Compare K-mer spectrum against assembly providing average coverage and GC values for each contig, which can be 2D binned and plot as a heatmap
 
 
 Filtering tools
@@ -173,7 +275,7 @@ modify axis, titles, limits, size, resolution, etc, although they will all try t
 intelligent defaults directly from the data provided.  
 
 
-Spectra_hist
+Spectra hist
 ~~~~~~~~~~~~
 
 Visualises the K-mer spectra from ``kat hist`` or ``jellyfish histo`` output.  
@@ -234,7 +336,7 @@ Applications:
     :scale: 66%
 
 
-Spectra_CN
+Spectra CN
 ~~~~~~~~~~
 
 Shows K-mer duplication levels, which correspond to copy number variation within 
@@ -254,7 +356,7 @@ Applications:
 
 
 
-Spectra_mx
+Spectra MX
 ~~~~~~~~~~
 
 Produces K-mer spectras from rows or columns in a matrix generated by ``kat comp``.  

diff --git a/lib/include/kat/comp_counters.hpp b/lib/include/kat/comp_counters.hpp
@@ -57,6 +57,8 @@ class CompCounters {
     path hash3_path;
 
     CompCounters();
+
+	CompCounters(const size_t _dm_size);
 
     CompCounters(const path& _hash1_path, const path& _hash2_path, const path& _hash3_path, const size_t _dm_size);
 
@@ -92,6 +94,8 @@ class ThreadedCompCounters {
 public:
 
     ThreadedCompCounters();
+
+	ThreadedCompCounters(const size_t _dm_size);
 
     ThreadedCompCounters(const path& _hash1_path, const path& _hash2_path, const path& _hash3_path, const size_t _dm_size);
 

diff --git a/lib/include/kat/sparse_matrix.hpp b/lib/include/kat/sparse_matrix.hpp
@@ -323,7 +323,7 @@ class ThreadedSparseMatrix {
     }
 
     const SM64& mergeThreadedMatricies() {
-        // Merge matrix
+		// Merge matrix
         for (int i = 0; i < width; i++) {
             for (int j = 0; j < height; j++) {
                 for (int k = 0; k < threads; k++) {

diff --git a/lib/include/kat/str_utils.hpp b/lib/include/kat/str_utils.hpp
@@ -184,9 +184,13 @@ namespace kat {
         for (const auto& c : merstr) {
             switch (c) {
                 case 'A':
+				case 'a':
                 case 'T':
+				case 't':
                 case 'G':
+				case 'g':
                 case 'C':
+				case 'c':
                     break;
                 default:
                     return false;

diff --git a/lib/src/comp_counters.cc b/lib/src/comp_counters.cc
@@ -39,6 +39,8 @@ using kat::DistanceMetric;
 
 kat::CompCounters::CompCounters() : CompCounters("", "", "", DEFAULT_NB_BINS) {}
 
+kat::CompCounters::CompCounters(const size_t _dm_size) : CompCounters("", "", "", _dm_size) {}
+
 kat::CompCounters::CompCounters(const path& _hash1_path, const path& _hash2_path, const path& _hash3_path, const size_t _dm_size) :
         hash1_path(_hash1_path), hash2_path(_hash2_path), hash3_path(_hash3_path) {
 
@@ -208,6 +210,8 @@ void kat::CompCounters::printCounts(ostream &out) {
 
 kat::ThreadedCompCounters::ThreadedCompCounters() : ThreadedCompCounters("", "", "", DEFAULT_NB_BINS) {}
 
+kat::ThreadedCompCounters::ThreadedCompCounters(const size_t _dm_size) : ThreadedCompCounters("", "", "", _dm_size) {}
+
 kat::ThreadedCompCounters::ThreadedCompCounters(const path& _hash1_path, const path& _hash2_path, const path& _hash3_path, const size_t _dm_size) {
     final_matrix = CompCounters(_hash1_path, _hash2_path, _hash3_path, _dm_size);            
 }

diff --git a/src/comp.cc b/src/comp.cc
@@ -253,7 +253,6 @@ void kat::Comp::merge() {
 
     cout << "Merging results ...";
     cout.flush();
-
     // Merge results from the threads
     main_matrix.mergeThreadedMatricies();
     if (doThirdHash()) {
@@ -390,7 +389,7 @@ void kat::Comp::compare() {
 
 void kat::Comp::compareSlice(int th_id) {
 
-    shared_ptr<CompCounters> cc = make_shared<CompCounters>();
+    shared_ptr<CompCounters> cc = make_shared<CompCounters>(std::min(this->d1Bins, this->d2Bins));
 
     // Setup iterator for this thread's chunk of hash1
     LargeHashArray::eager_iterator hash1Iterator = input[0].hash->eager_slice(th_id, threads);

diff --git a/src/kat.cc b/src/kat.cc
@@ -19,6 +19,8 @@
 #include <config.h>
 #endif
 
+#include <execinfo.h>
+#include <signal.h>
 #include <sys/ioctl.h>
 #include <string.h>
 #include <exception>
@@ -114,13 +116,29 @@ const string helpMessage() {
 }
 
 
+void handler(int sig) {
+    void *array[10];
+    size_t size;
+
+    // get void*'s for all entries on the stack
+    size = backtrace(array, 10);
+
+    // print out all the frames to stderr
+    fprintf(stderr, "Error: signal %d:\n", sig);
+    fprintf(stderr, "Stack trace:\n");
+    backtrace_symbols_fd(array, size, STDERR_FILENO);
+    exit(1);
+}
+
+
 
 /**
  * Start point for KAT.  Processes the start of the command line and then
  * delegates the rest of the command line to the child tool.
  */
 int main(int argc, char *argv[])
 {
+    signal(SIGSEGV, handler);
     try {
         // KAT args
         string modeStr;
@@ -224,11 +242,11 @@ int main(int argc, char *argv[])
         }
 
     } catch(po::error& e) { 
-        cerr << "Error: Parsing Command Line: " << e.what() << endl; 
-        return 1; 
+        cerr << "Error: Parsing Command Line: " << e.what() << endl;
+                return 1;
     } 
     catch (boost::exception &e) { 
-        cerr << boost::diagnostic_information(e); 
+        cerr << boost::diagnostic_information(e);
         return 4;
     } catch (exception& e) {
         cerr << "Error: " << e.what() << endl;

diff --git a/src/sect.cc b/src/sect.cc
@@ -186,7 +186,7 @@ void kat::Sect::processSeqFile() {
 
     // Average sequence coverage and GC% scores output stream
     ofstream cvg_gc_stream(string(outputPrefix.string() + "-stats.tsv").c_str());
-    cvg_gc_stream << "seq_name\tmedian\tmean\tgc%\tseq_length\tinvalid_bases\t%_invalid\tnon_zero_bases\t%_non_zero\t%_non_zero_corrected" << endl;
+    cvg_gc_stream << "seq_name\tmedian\tmean\tgc%\tseq_length\tkmers_in_seq\tinvalid_kmers\t%_invalid\tnon_zero_kmers\t%_non_zero\t%_non_zero_corrected" << endl;
 
     // Processes sequences in batches of records to reduce memory requirements
     while (!seqan::atEnd(reader)) {
@@ -425,7 +425,8 @@ void kat::Sect::printStatTable(std::ostream &out) {
             << (*medians)[i] << "\t" 
             << (*means)[i] << "\t" 
             << (*gcs)[i] << "\t" 
-            << (*lengths)[i] << "\t" 
+            << (*lengths)[i] << "\t"
+            << (*lengths)[i] - this->input.merLen + 1 << "\t"
             << (*invalid)[i] << "\t"
             << (*percentInvalid)[i] << "\t"
             << (*nonZero)[i] << "\t"