diff --git a/.gitignore b/.gitignore index 61c931eb..c9aca920 100644 --- a/.gitignore +++ b/.gitignore @@ -40,3 +40,4 @@ Makefile.in *.pc *.la *.out +/tags diff --git a/configure.ac b/configure.ac index 0c31b0db..36f4416b 100644 --- a/configure.ac +++ b/configure.ac @@ -4,7 +4,7 @@ # Autoconf initialistion. Sets package name version and contact details AC_PREREQ([2.68]) -AC_INIT([kat],[2.3.1],[https://github.com/TGAC/KAT/issues],[kat],[https://github.com/TGAC/KAT]) +AC_INIT([kat],[2.3.2],[https://github.com/TGAC/KAT/issues],[kat],[https://github.com/TGAC/KAT]) AC_CONFIG_SRCDIR([src/kat.cc]) AC_CONFIG_AUX_DIR([build-aux]) AC_CONFIG_MACRO_DIR([m4]) @@ -176,6 +176,7 @@ else if [[ -z "${BOOST_TIMER_STATIC_LIB}" ]] || [[ -z "${BOOST_CHRONO_STATIC_LIB}" ]] || [[ -z "${BOOST_FILESYSTEM_STATIC_LIB}" ]] || [[ -z "${BOOST_PROGRAM_OPTIONS_STATIC_LIB}" ]] || [[ -z "${BOOST_SYSTEM_STATIC_LIB}" ]]; then AC_MSG_WARN([Not all static boost libraries could be found. Will use dynamic libraries instead.]) BOOST_LIBS="${BOOST_DYN_LIBS}" + dynboost="yes" else BOOST_LIBS="${BOOST_STATIC_LIBS}" fi diff --git a/doc/source/conf.py b/doc/source/conf.py index 0ec42d0f..0816bdf2 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -52,9 +52,9 @@ # built documents. # # The short X.Y version. -version = '2.3.1' +version = '2.3.2' # The full version, including alpha/beta/rc tags. -release = '2.3.1' +release = '2.3.2' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/doc/source/faq.rst b/doc/source/faq.rst index 4f005d10..07ffdceb 100644 --- a/doc/source/faq.rst +++ b/doc/source/faq.rst @@ -4,10 +4,11 @@ Frequently Asked Questions ========================== -Can KAT handle gzipped sequence files? --------------------------------------- +Can KAT handle compressed sequence files? +----------------------------------------- -Yes, but only via named pipes. 
For example, say we wanted to run ``kat hist`` using +Yes, via named pipes. Anonymous named pipes (process substitution) are also supported. +For example, say we wanted to run ``kat hist`` using a gzipped paired end dataset, we can use a named pipe to do this as follows:: mkfifo pe_dataset.fq && gunzip -c pe_dataset_?.fq.gz > pe_dataset.fq & @@ -21,7 +22,17 @@ consuming from the named pipe will take data that has been gunzipped first. To clear this means you do not have to decompress the gzipped files to disk, this happens on the fly as consumed by KAT. -Thanks to John Davey for suggesting this. +Alternatively, using process substitution we could write the previous example more +concisely in a single line like this:: + + kat hist -o pe_dataset.hist <(gunzip -c pe_dataset_?.fq.gz) + +As a more complex example, the KAT comp tool can be driven in spectra-cn mode using +both compressed paired end reads and a compressed assembly as follows:: + + kat comp -o pe_spectra_cn <(gunzip -c pe_dataset_?.fq.gz) <(gunzip -c asm.fa.gz) + +Thanks to John Davey and Torsten Seemann for suggesting this. Why is jellyfish bundled with KAT? diff --git a/doc/source/images/distanalysis_console.png b/doc/source/images/distanalysis_console.png new file mode 100644 index 00000000..2c06df3c Binary files /dev/null and b/doc/source/images/distanalysis_console.png differ diff --git a/doc/source/images/distanalysis_plot.png b/doc/source/images/distanalysis_plot.png new file mode 100644 index 00000000..482ce30c Binary files /dev/null and b/doc/source/images/distanalysis_plot.png differ diff --git a/doc/source/walkthrough.rst b/doc/source/walkthrough.rst index 9b7732f4..1ea93bd5 100644 --- a/doc/source/walkthrough.rst +++ b/doc/source/walkthrough.rst @@ -308,8 +308,9 @@ Genome assembly analysis using k-mer spectra -------------------------------------------- One of the most frequently used tools in KAT are the so called "assembly spectra -copy number plots" or spectra-cn. 
We use these as a fast first analysis for assembly coherence -to the data in the reads they are representing. Basically we represent how many elements +copy number plots" or spectra-cn. We use these as a fast first analysis to check +assembly coherence against +the content within reads that were used to produce the assembly. Basically we represent how many elements of each frequency on the read’s spectrum ended up not included in the assembly, included once, included twice etc. @@ -374,6 +375,36 @@ duplications, inclusion of extra variation, etc: :scale: 33% +Distribution decomposition analysis +----------------------------------- + +It's useful to be able to fit distributions to each peak in a k-mer histogram or spectra-cn matrix +in order to work out how many distinct k-mers can be associated with those distributions. By counting +k-mers in this way we can make predictions around genome size, heterozygous rates (if diploid) and +assembly completeness. To this end we bundle a script with kat called kat_distanalysis.py. It takes +in either a spectra-cn matrix file, or kat histogram file as input, then proceeds to identify peaks +and fit distributions to each one. In the case of spectra-cn matrix files it also identifies peaks +for each copy number for an assembly. + +The user can help to get correct predictions out of the tool by providing an approximate frequency for +the homozygous part of the distribution. By default, this is assumed to be the last peak. For example, +this command:: + + kat_distanalysis.py --plot spectra-cn.mx + +might produce the following output for this tetraploid genome: + +.. image:: images/distanalysis_console.png + :scale: 100% + +.. 
image:: images/distanalysis_plot.png + :scale: 100% + + + + + + Finding repetitive regions in assemblies ---------------------------------------- diff --git a/lib/.gitignore b/lib/.gitignore index e1833e65..851e16e8 100644 --- a/lib/.gitignore +++ b/lib/.gitignore @@ -4,3 +4,4 @@ *.lo *.o *.kat-2.1.pc.swp +/tags diff --git a/lib/include/.gitignore b/lib/include/.gitignore new file mode 100644 index 00000000..61ffc7cb --- /dev/null +++ b/lib/include/.gitignore @@ -0,0 +1 @@ +/tags diff --git a/lib/include/kat/.gitignore b/lib/include/kat/.gitignore new file mode 100644 index 00000000..61ffc7cb --- /dev/null +++ b/lib/include/kat/.gitignore @@ -0,0 +1 @@ +/tags diff --git a/lib/include/kat/jellyfish_helper.hpp b/lib/include/kat/jellyfish_helper.hpp index d749f15f..905f28f7 100644 --- a/lib/include/kat/jellyfish_helper.hpp +++ b/lib/include/kat/jellyfish_helper.hpp @@ -169,6 +169,12 @@ namespace kat { */ static void printHeader(const file_header& header, ostream& out); + /** + * Checks if path refers to a pipe rather than a real file + * @param filename Path to input file + * @return Whether or not the path refers to a pipe + */ + static bool isPipe(const path& filename); /** * Returns whether or not the specified file path looks like it belongs to diff --git a/lib/src/.gitignore b/lib/src/.gitignore index a987ceb7..f7e69b07 100644 --- a/lib/src/.gitignore +++ b/lib/src/.gitignore @@ -3,3 +3,4 @@ *.o /.deps/ /.libs/ +/tags diff --git a/lib/src/input_handler.cc b/lib/src/input_handler.cc index ab06bed0..a613729c 100644 --- a/lib/src/input_handler.cc +++ b/lib/src/input_handler.cc @@ -64,13 +64,12 @@ void kat::InputHandler::validateInput() { } } - if (!bfs::exists(p)) { + if (!JellyfishHelper::isPipe(p) && !bfs::exists(p)) { BOOST_THROW_EXCEPTION(InputFileException() << InputFileErrorInfo(string( "Could not find input file at: ") + p.string() + "; please check the path and try again.")); } InputMode m = JellyfishHelper::isSequenceFile(p) ? 
InputMode::COUNT : InputMode::LOAD; - if (start) { mode = m; } @@ -109,8 +108,10 @@ void kat::InputHandler::validateMerLen(const uint16_t merLen) { string kat::InputHandler::pathString() { string s; + uint16_t index = 1; for(auto& p : input) { - s += p.string() + " "; + string msg = JellyfishHelper::isPipe(p) ? string("") : p.string(); + s += msg + " "; } return boost::trim_right_copy(s); } @@ -131,7 +132,7 @@ void kat::InputHandler::count(const uint16_t threads) { hashCounter = make_shared(hashSize, merLen * 2, 7, threads); hashCounter->do_size_doubling(!disableHashGrow); - cout << "Input is a sequence file. Counting kmers for " << pathString() << "..."; + cout << "Input " << index << " is a sequence file. Counting kmers for input " << index << " (" << pathString() << ") ..."; cout.flush(); hash = JellyfishHelper::countSeqFile(input, *hashCounter, canonical, threads); diff --git a/lib/src/jellyfish_helper.cc b/lib/src/jellyfish_helper.cc index 47cbdb6d..343b46c8 100644 --- a/lib/src/jellyfish_helper.cc +++ b/lib/src/jellyfish_helper.cc @@ -50,6 +50,7 @@ using jellyfish::Offsets; using jellyfish::quadratic_reprobes; #include +#include using kat::JellyfishHelper; /** @@ -255,6 +256,11 @@ void kat::JellyfishHelper::dumpHash(LargeHashArrayPtr ary, file_header& header, dumper.dump(ary); } +bool kat::JellyfishHelper::isPipe(const path& filename) { + return boost::starts_with(filename.string(), "/proc") || boost::starts_with(filename.string(), "/dev"); +} + + /** * Returns whether or not the specified file path looks like it belongs to * a sequence file (either FastA or FastQ). 
Gzipped sequence files are @@ -264,6 +270,9 @@ void kat::JellyfishHelper::dumpHash(LargeHashArrayPtr ary, file_header& header, */ bool kat::JellyfishHelper::isSequenceFile(const path& filename) { + // If we have a pipe as input then assume we are working with a sequence file + if (JellyfishHelper::isPipe(filename)) return true; + string ext = filename.extension().string(); // Actually we can't handle gz files directly, so turning this off for now diff --git a/m4/ax_boost_chrono.m4 b/m4/ax_boost_chrono.m4 index 18cd5a76..8f2bb46b 100644 --- a/m4/ax_boost_chrono.m4 +++ b/m4/ax_boost_chrono.m4 @@ -82,6 +82,13 @@ AC_DEFUN([AX_BOOST_CHRONO], AC_DEFINE(HAVE_BOOST_CHRONO,,[define if the Boost::Chrono library is available]) BOOSTLIBDIR=`echo $BOOST_LDFLAGS | sed -e 's/@<:@^\/@:>@*//'` + ax_lib="" + ax_static_lib="" + no_find="yes" + no_link="yes" + link_chrono="no" + link_chrono_static="no" + LDFLAGS_SAVE=$LDFLAGS if test "x$ax_boost_user_chrono_lib" = "x"; then for libextension in `ls $BOOSTLIBDIR/libboost_chrono*.so* $BOOSTLIBDIR/libboost_chrono*.dylib* 2>/dev/null | sed 's,.*/,,' | sed -e 's;^lib\(boost_chrono.*\)\.so.*$;\1;' -e 's;^lib\(boost_chrono.*\)\.dylib.*$;\1;'` ; do @@ -133,11 +140,11 @@ AC_DEFUN([AX_BOOST_CHRONO], AC_MSG_ERROR(Could not find any version boost_chrono to link to) fi - if [[ "$link_chrono" = "no" ]]; then - AC_MSG_WARN(Could not dynamic link against $ax_lib) + if [[ "$link_chrono" == "no" ]]; then + AC_MSG_WARN(Could not dynamic link against boost_chrono) fi if [[ "$link_chrono_static" == "no" ]]; then - AC_MSG_WARN(Could not static link against $ax_static_lib) + AC_MSG_WARN(Could not static link against boost_chrono) fi if [[ "$no_link" == "yes" ]]; then AC_MSG_ERROR(Could not link against any boost_chrono lib) diff --git a/m4/ax_boost_filesystem.m4 b/m4/ax_boost_filesystem.m4 index d6db5618..19504910 100644 --- a/m4/ax_boost_filesystem.m4 +++ b/m4/ax_boost_filesystem.m4 @@ -84,6 +84,13 @@ AC_DEFUN([AX_BOOST_FILESYSTEM], 
AC_DEFINE(HAVE_BOOST_FILESYSTEM,,[define if the Boost::Filesystem library is available]) BOOSTLIBDIR=`echo $BOOST_LDFLAGS | sed -e 's/@<:@^\/@:>@*//'` + ax_lib="" + ax_static_lib="" + no_find="yes" + no_link="yes" + link_filesystem="no" + link_filesystem_static="no" + LDFLAGS_SAVE=$LDFLAGS if test "x$ax_boost_user_filesystem_lib" = "x"; then for libextension in `ls $BOOSTLIBDIR/libboost_filesystem*.so* $BOOSTLIBDIR/libboost_filesystem*.dylib* 2>/dev/null | sed 's,.*/,,' | sed -e 's;^lib\(boost_filesystem.*\)\.so.*$;\1;' -e 's;^lib\(boost_filesystem.*\)\.dylib.*$;\1;'` ; do @@ -135,11 +142,11 @@ AC_DEFUN([AX_BOOST_FILESYSTEM], AC_MSG_ERROR(Could not find any version boost_filesystem to link to) fi - if [[ "$link_filesystem" = "no" ]]; then - AC_MSG_WARN(Could not dynamic link against $ax_lib) + if [[ "$link_filesystem" == "no" ]]; then + AC_MSG_WARN(Could not dynamic link against boost_filesystem) fi if [[ "$link_filesystem_static" == "no" ]]; then - AC_MSG_WARN(Could not static link against $ax_static_lib) + AC_MSG_WARN(Could not static link against boost_filesystem) fi if [[ "$no_link" == "yes" ]]; then AC_MSG_ERROR(Could not link against any boost_filesystem lib) diff --git a/m4/ax_boost_program_options.m4 b/m4/ax_boost_program_options.m4 index 8fa99107..670102e9 100644 --- a/m4/ax_boost_program_options.m4 +++ b/m4/ax_boost_program_options.m4 @@ -76,6 +76,13 @@ AC_DEFUN([AX_BOOST_PROGRAM_OPTIONS], AC_DEFINE(HAVE_BOOST_PROGRAM_OPTIONS,,[define if the Boost::Program_Options library is available]) BOOSTLIBDIR=`echo $BOOST_LDFLAGS | sed -e 's/@<:@^\/@:>@*//'` + ax_lib="" + ax_static_lib="" + no_find="yes" + no_link="yes" + link_program_options="no" + link_program_options_static="no" + LDFLAGS_SAVE=$LDFLAGS if test "x$ax_boost_user_program_options_lib" = "x"; then for libextension in `ls $BOOSTLIBDIR/libboost_program_options*.so* $BOOSTLIBDIR/libboost_program_options*.dylib* 2>/dev/null | sed 's,.*/,,' | sed -e 's;^lib\(boost_program_options.*\)\.so.*$;\1;' -e 
's;^lib\(boost_program_options.*\)\.dylib.*$;\1;'` ; do @@ -127,11 +134,11 @@ AC_DEFUN([AX_BOOST_PROGRAM_OPTIONS], AC_MSG_ERROR(Could not find any version boost_program_options to link to) fi - if [[ "$link_program_options" = "no" ]]; then - AC_MSG_WARN(Could not dynamic link against $ax_lib) + if [[ "$link_program_options" == "no" ]]; then + AC_MSG_WARN(Could not dynamic link against boost_program_options) fi if [[ "$link_program_options_static" == "no" ]]; then - AC_MSG_WARN(Could not static link against $ax_static_lib) + AC_MSG_WARN(Could not static link against boost_program_options) fi if [[ "$no_link" == "yes" ]]; then AC_MSG_ERROR(Could not link against any boost_program_options lib) diff --git a/m4/ax_boost_system.m4 b/m4/ax_boost_system.m4 index 77f72260..6975434a 100644 --- a/m4/ax_boost_system.m4 +++ b/m4/ax_boost_system.m4 @@ -82,6 +82,13 @@ AC_DEFUN([AX_BOOST_SYSTEM], AC_DEFINE(HAVE_BOOST_SYSTEM,,[define if the Boost::System library is available]) BOOSTLIBDIR=`echo $BOOST_LDFLAGS | sed -e 's/@<:@^\/@:>@*//'` + ax_lib="" + ax_static_lib="" + no_find="yes" + no_link="yes" + link_system="no" + link_system_static="no" + LDFLAGS_SAVE=$LDFLAGS if test "x$ax_boost_user_system_lib" = "x"; then for libextension in `ls $BOOSTLIBDIR/libboost_system*.so* $BOOSTLIBDIR/libboost_system*.dylib* 2>/dev/null | sed 's,.*/,,' | sed -e 's;^lib\(boost_system.*\)\.so.*$;\1;' -e 's;^lib\(boost_system.*\)\.dylib.*$;\1;'` ; do @@ -133,11 +140,11 @@ AC_DEFUN([AX_BOOST_SYSTEM], AC_MSG_ERROR(Could not find any version boost_system to link to) fi - if [[ "$link_system" = "no" ]]; then - AC_MSG_WARN(Could not dynamic link against $ax_lib) + if [[ "$link_system" == "no" ]]; then + AC_MSG_WARN(Could not dynamic link against boost_system) fi if [[ "$link_system_static" == "no" ]]; then - AC_MSG_WARN(Could not static link against $ax_static_lib) + AC_MSG_WARN(Could not static link against boost_system) fi if [[ "$no_link" == "yes" ]]; then AC_MSG_ERROR(Could not link against any 
boost_system lib) @@ -145,7 +152,7 @@ AC_DEFUN([AX_BOOST_SYSTEM], fi - CPPFLAGS="$CPPFLAGS_SAVED" + CPPFLAGS="$CPPFLAGS_SAVED" LDFLAGS="$LDFLAGS_SAVED" - fi + fi ]) diff --git a/m4/ax_boost_timer.m4 b/m4/ax_boost_timer.m4 index 28477e7a..fab1515e 100644 --- a/m4/ax_boost_timer.m4 +++ b/m4/ax_boost_timer.m4 @@ -83,6 +83,13 @@ AC_DEFUN([AX_BOOST_TIMER], AC_DEFINE(HAVE_BOOST_TIMER,,[define if the Boost::Timer library is available]) BOOSTLIBDIR=`echo $BOOST_LDFLAGS | sed -e 's/@<:@^\/@:>@*//'` + ax_lib="" + ax_static_lib="" + no_find="yes" + no_link="yes" + link_timer="no" + link_timer_static="no" + LDFLAGS_SAVE=$LDFLAGS if test "x$ax_boost_user_timer_lib" = "x"; then for libextension in `ls $BOOSTLIBDIR/libboost_timer*.so* $BOOSTLIBDIR/libboost_timer*.dylib* 2>/dev/null | sed 's,.*/,,' | sed -e 's;^lib\(boost_timer.*\)\.so.*$;\1;' -e 's;^lib\(boost_timer.*\)\.dylib.*$;\1;'` ; do @@ -134,11 +141,11 @@ AC_DEFUN([AX_BOOST_TIMER], AC_MSG_ERROR(Could not find any version boost_timer to link to) fi - if [[ "$link_timer" = "no" ]]; then - AC_MSG_WARN(Could not dynamic link against $ax_lib) + if [[ "$link_timer" == "no" ]]; then + AC_MSG_WARN(Could not dynamic link against boost_timer) fi if [[ "$link_timer_static" == "no" ]]; then - AC_MSG_WARN(Could not static link against $ax_static_lib) + AC_MSG_WARN(Could not static link against boost_timer) fi if [[ "$no_link" == "yes" ]]; then AC_MSG_ERROR(Could not link against any boost_timer lib) diff --git a/scripts/kat_distanalysis.py b/scripts/kat_distanalysis.py index 6a7dacfe..99266c86 100755 --- a/scripts/kat_distanalysis.py +++ b/scripts/kat_distanalysis.py @@ -1,415 +1,685 @@ #!/usr/bin/env python3 +import argparse +import abc import sys -from math import * -from scipy.stats import gamma,norm -from scipy import mean,optimize +import traceback +import time + import numpy as np -import argparse +from scipy import mean, optimize +import matplotlib.pyplot as plt + -def plot_hist(h,points,cap,label=""): - plot([min(cap,x) for x 
in h[:points]],label=label) +R2PI = np.sqrt(2.0 * np.pi) class KmerSpectra(object): - """A kmer spectra, comprised of different peaks. - Contains the general fitting method""" - ###----------------- INIT, LOAD, DUMP, ETC -------------------- - def __init__(self,histo_file=None,points=10000,column=1,cumulative=False): - """Init with the objective histogram""" - self.histogram=[] - self.peaks=[] - if histo_file: - self.read_hist(histo_file,points,column,cumulative=cumulative) - - def read_hist(self,name,points=10000,column=1,cumulative=False): - f=open(name) - if cumulative: - self.histogram=[sum([int(y) for y in x.split()[column:]]) for x in f.readlines() if x and x[0]!='#'][:points][1:] - else: - self.histogram=[int(x.split()[column]) for x in f.readlines() if x and x[0]!='#'][:points][1:] - f.close() - - - def total_values(self,start=1,end=10000): - return list(map(sum,list(zip(*[x.points(start,end) for x in self.peaks])))) - - ###----------------- FIND AND CREATE PEAKS -------------------- - - def deriv(self,histogram): - return [histogram[i+2]-histogram[i] for i in range(len(histogram)-2)] - - def progsmoothderiv(self, histogram): - #always return mean - return [mean(histogram[i+1:int(i+1+i/10)])-mean(histogram[int(i-i/10):i+1]) for i in range(len(histogram)-2)] - - def find_maxima(self, center, radius, min_perc, min_elem, histogram=None): - if histogram==None: - hs=self.histogram[center-radius:center+radius] - #Smoothed df/dx (dx=2 units) - deriv=self.progsmoothderiv(self.histogram)[center-radius:center+radius] - else: - hs=histogram[center-radius:center+radius] - #Smoothed df/dx (dx=2 units) - deriv=self.progsmoothderiv(histogram)[center-radius:center+radius] - - fmax=hs.index(max(hs)) - if fmax==0 or fmax==len(hs)-1: return 0 - - #find a single inflection point-> that is the maxima - #Reject by voting to avoid oscillations. 
- failpoints=0 - for i in range(fmax): - if deriv[i]<0: failpoints+=1 - for i in range(fmax+1,len(deriv)): - if deriv[i]>0: failpoints+=1 - if float(failpoints)/(2*radius+1)>.1: - return 0 - - #TODO: discard maxima if not 1% of the already contained elements on peaks are there - - if sum(hs) < min( (float(min_perc)/100)*sum([x.elements for x in self.peaks]),min_elem): - print("Distribution on %d too small to be considered (%d elements)" % (center-radius+fmax, sum(hs))) - return 0 - #TODO: do further validation - #print "maxima found on %d, for %s" % (fmax,hs) - - return center-radius+fmax - - - def add_peak_and_update_cuts(self,lm,reset_opt=False): - #Receives a local maxima, adds a peak there if there was none, updates the cuts. - #If reset_opt is set, it will reset all the peak counts/sd and only keep the means - fdists=[x.mean for x in self.peaks] - for f in fdists: - if lm>=f-f/5 and lm<=f+f/5: - print("WARNING!!! Dist on %d is not to be added, because existing dist on %d is too close to it." 
% (lm,f)) - return False - fdists.append(lm) - fdists.sort() - self.cuts=[self.fmin] - for i in range(len(fdists)-1): - self.cuts.append(int(fdists[i]*(1+float(fdists[i])/(fdists[i]+fdists[i+1])))) - self.cuts.append(int(min(len(self.histogram)-1,fdists[-1]*1.5))) - #print "cuts on %s" % str(self.cuts) - if reset_opt: - self.peaks=[] - for i in range(len(fdists)): - if reset_opt or i==fdists.index(lm): - self.peaks.append(KmerPeak(lm,fdists[i], fdists[i]/6, sum([self.histogram[j] for j in range(self.cuts[i],self.cuts[i+1])]))) - self.peaks.sort(key=lambda x: x.mean) - return True - - def create_peaks(self,min_perc,min_elem): - fmin=0 - #walk till first local minimum (d(f)>0) - for i in range(len(self.histogram)): - if self.histogram[i] .01 * sum([p.elements for p in self.peaks]): - lm=self.find_maxima(int(f), int(f/5), min_perc, min_elem,base) - if lm: - if self.add_peak_and_update_cuts(lm,reset_opt=False): - updated=True - #else: - # print "No new maxima found on %d +/- %d" % (f,f/5) - if updated: self.optimize_peaks(min_perc,min_elem) - - def residues(self,p): - """p has a triplet for each distribution""" - #print p - for i in range(len(self.peaks)): - self.peaks[i].mean=p[i*3] - self.peaks[i].shape=p[i*3+1] - self.peaks[i].elements=p[i*3+2] - tv=self.total_values() - r=[tv[i]-self.histogram[i] for i in range(self.cuts[0],self.cuts[-1])] - #a lot more penalty for being UP: - for i in range(len(r)): - if r[i]>0: - r[i]=2*r[i] - #for j in xrange(len(self.peaks)): - # if p[0]>self.peaks[j].target_max*1.1 or p[0] 0 else 1 + + r1_start = i + 1 + r1_end = i + 1 + delta + m1 = mean(histo[r1_start:r1_end]) + + r2_start = max(i - delta, 0) + r2_end = i+1 + m2 = mean(histo[r2_start:r2_end]) + + smoothed.append(m1 - m2) + return smoothed + + def find_maxima(self, center, radius, min_perc, min_elem, histo=None): + + start=center-radius + end=center+radius + + hs = histo[start:end] if histo else self.histogram[start:end] + + if len(hs) == 0: + raise ValueError("Histogram has 
length of 0: Center:", center, "; Radius:", radius) + + # Smoothed df/dx (dx=2 units) + deriv = self.progsmoothderiv(hs) + fmax = hs.index(max(hs)) + if fmax == 0 or fmax == len(hs) - 1: return 0 + + # find a single inflection point-> that is the maxima + # Reject by voting to avoid oscillations. + failpoints = 0 + for i in range(fmax): + if deriv[i] < 0: failpoints += 1 + for i in range(fmax + 1, len(deriv)): + if deriv[i] > 0: failpoints += 1 + if float(failpoints) / (2 * radius + 1) > .1: + return 0 + + # TODO: discard maxima if not 1% of the already contained elements on peaks are there + if sum(hs) < min((float(min_perc) / 100) * sum([x.elements for x in self.peaks]), min_elem): + print("Distribution at %d too small to be considered (%d elements)" % (start + fmax, sum(hs))) + return 0 + # TODO: do further validation + # print "maxima found on %d, for %s" % (fmax,hs) + return start + fmax + + def add_peak_and_update_cuts(self, lm, reset_opt=False): + # Receives a local maxima, adds a peak there if there was none, updates the cuts. + # If reset_opt is set, it will reset all the peak counts/sd and only keep the means + + # Check if peak is below fmin. If so skip it. + if lm < self.fmin: + return False + + # Validation. Extract means for all peaks and ensure they are not too close to one another + fdists = [x.mean for x in self.peaks] + for f in fdists: + if lm >= f - f / 4 and lm <= f + f / 4: + #print("WARNING!!! Dist at %d is not to be added, because existing dist at %d is too close to it." 
% (lm, f)) + return False + + # Add the new peak and sort + fdists.append(lm) + fdists.sort() + + # Use what is hopefully the cutoff frequency as the start of the first peak + # Then try to segment the spectra into different sections based on the peak frequencies + self.cuts = [self.fmin] + for i in range(len(fdists) - 1): + self.cuts.append(int(fdists[i] * (1 + float(fdists[i]) / (fdists[i] + fdists[i + 1])))) + self.cuts.append(int(min(len(self.histogram) - 1, fdists[-1] * 1.5))) + self.cuts.sort() + #print("Cuts: %s" % str(self.cuts)) + if reset_opt: + self.peaks = [] + + for i in range(len(fdists)): + if reset_opt or i == fdists.index(lm): + self.peaks.append(KmerPeak( + fdists[i], # Mean + fdists[i] / 6, # Shape + sum([self.histogram[j] for j in range(self.cuts[i], self.cuts[i+1])]), # Elements + self.histogram[fdists[i]], #Peak + self.cuts[i], # Left + self.cuts[i+1])) # Right + self.peaks.sort(key=lambda x: x.mean) + return True + + def create_peaks(self, min_perc, min_elem, verbose=False): + + # walk till first local minimum (d(f)>0) + # Double check the following two steps, rather than just the next one. + # Sometimes we can get a strange laddering affect in alternate frequencies which prevent + # us from correct detecting the minima + fmin = 0 + for i in range(len(self.histogram) - 2): + if self.histogram[i] < self.histogram[i+1] and self.histogram[i] < self.histogram[i+2]: + fmin = i + break + + # Sometimes we might not find a local minima, it depends on what sort of content is in the spectra. + # In this case just reset all spectra measures + if not fmin: + self.fmax = 0 + self.maxval = 0 + + # Find the global fm|p(fm)=max(p) + fmax = self.histogram.index(max(self.histogram[fmin:])) + + if fmax < 10: + #print("Kmer count at max frequency is < 10. 
Not enough data to perform analysis.") + return + + # Set measures to class variables + self.fmax = fmax + self.fmin = fmin + self.maxval = self.histogram[fmax] + + # Explore fm/x and fm*x for x in [1,4] + for f in [fmax / 4.0, fmax / 3.0, fmax / 2.0, fmax, fmax * 2, fmax * 3, fmax * 4]: + if int(f / 5.0) > 0 and f + f / 5.0 < len(self.histogram): + lm = self.find_maxima(int(f), int(f / 5.0), min_perc, min_elem) + if lm: + self.add_peak_and_update_cuts(lm, reset_opt=True) + + # ----------------- OPTIMIZATION -------------------- + + def optimize_peaks(self, min_perc, min_elem, verbose=False): + sortedpeaks = [x for x in self.peaks] + sortedpeaks.sort(key=lambda x: -x.elements) + + # create a base as the histogram and start from there + base = [x for x in self.histogram] + for p_i, p in enumerate(sortedpeaks): + # locally optimize the preak + p.constrained_opt(p.left, base[p.left:p.right]) + # substract the peak effect from the baseline + for i in range(len(self.histogram)): + base[i] = base[i] - p.point(i) + + updated = False + for f in [self.fmax / 4, self.fmax / 3, self.fmax / 2, self.fmax, self.fmax * 2, self.fmax * 3, self.fmax * 4]: + if int(f + f / 5) < len(self.histogram) and sum( + [base[x] for x in range(int(f - f / 5), int(f + f / 5))]) > .01 * sum( + [p.elements for p in self.peaks]): + lm = self.find_maxima(int(f), int(f / 5), min_perc, min_elem, base) + if lm: + if self.add_peak_and_update_cuts(lm, reset_opt=False): + updated = True + if updated: self.optimize_peaks(min_perc, min_elem) + + def residues(self, p): + """p has a triplet for each distribution""" + # print p + for i in range(len(self.peaks)): + self.peaks[i].mean = p[i * 5] + self.peaks[i].shape = p[i * 5 + 1] + self.peaks[i].elements = p[i * 5 + 2] + self.peaks[i].left = p[i * 5 + 3] + self.peaks[i].right = p[i * 5 + 4] + + self.total_values() + r = [self.fitted_histogram[i] - self.histogram[i] for i in range(self.cuts[0], self.cuts[-1])] + return r + + def optimize_overall(self): + p = [] 
+ for pk in self.peaks: + p.append(pk.mean) + p.append(pk.shape) + p.append(pk.elements) + p.append(pk.left) + p.append(pk.right) + + # Optimise + res = optimize.leastsq(self.residues, p, full_output=True) + if res[-1] < 1 or res[-1] > 4: + raise RuntimeError("It is likely that the spectra is too complex to analyse properly. Stopping analysis.\nOptimisation results:\n" + str(res[-2])) + + # once the better fit is found, check if by taking the unfitted elements new distributions arise. + return + + + def getHomozygousPeakIndex(self, approx_freq=0): + # Work out which peak is the homozygous peak + peak_index = len(self.peaks) + if approx_freq > 0: + min_peak_index = peak_index + delta_peak_freq = 1000000 + for p_i, p in enumerate(self.peaks, start=1): + delta = abs(p.mean - approx_freq) + if delta_peak_freq > delta: + delta_peak_freq = delta + min_peak_index = p_i + peak_index = min_peak_index + + return peak_index + + + def calcGenomeSize(self, hom_peak=0): + + hom_peak_index = len(self.peaks) if hom_peak == 0 else hom_peak + + # Just use first spectra for this calculation + sum = 0 + for p_i, p in enumerate(self.peaks, start=1): + sum += p_i * p.elements + + return int(sum / hom_peak_index) + + + def calcHetRate(self, genome_size=0, hom_peak=0): + + genomesize = genome_size if genome_size > 0 else self.calcGenomeSize() + hom_peak_index = len(self.peaks) if hom_peak == 0 else hom_peak + sum = 0 + if hom_peak_index < 2: + return 0.0 + + for p_i, p in enumerate(self.peaks, start=1): + #Skip the last peak + if p_i >= hom_peak_index: + break + sum += p.elements / self.k + + return (sum / genomesize) * 100.0 + + + def calcKmerCoverage(self): + tot_vol = sum([x.elements for x in self.peaks]) + weighted = sum([x.mean * x.elements for x in self.peaks]) + return int(weighted / tot_vol) + + def printPeaks(self): + if len(self.peaks) > 0: + print("Index\t" + KmerPeak.getTabHeader()) + for p_i, p in enumerate(self.peaks, start=1): + print(str(p_i) + "\t" + p.toTabString()) + 
else: + print("No peaks detected") + + def printGenomeStats(self, hom_peak_freq): + # Step 4, genome stats + print("K-value used:", self.k) + print("Peaks in analysis:", len(self.peaks)) + self.printPeaks() + + print("Mean k-mer frequency:", str(self.calcKmerCoverage()) + "x") + + hp = self.getHomozygousPeakIndex(hom_peak_freq) + print("Homozygous peak index:", hp, ("" if hom_peak_freq > 0 else "(assumed)")) + gs = self.calcGenomeSize(hom_peak=hp) + print("Estimated genome size:", '{0:.2f}'.format(float(gs) / 1000000.0), "Mbp") + if (hp > 1): + hr = self.calcHetRate(gs) + print("Estimated heterozygous rate:", "{0:.2f}".format(hr) + "%") + + class KmerPeak(object): - """A distribution representing kmers covered a certain number of times. - Contains methods for fitting to an interval""" - - def __init__(self,target_max,mean,shape,elements): - self.target_max=target_max - self.mean=mean - self.shape=shape - - - self.elements=elements - - def __str__(self): - return "" % (self.mean,self.elements) - - def point(self,x): - """Normalized Gaussian""" - return float(self.elements) / np.sqrt(2 * np.pi) / self.shape * np.exp(-(x - self.mean) ** 2 / 2. / self.shape ** 2) - - def points(self,start,end): - return [self.point(x) for x in range(start,end)] - - def residues(self,p,offset,histogram): - """residues of fitting self with parameters p to offset obj[0] with values obj[1]""" - #TODO: enforce a constrain? 
- self.mean=p[0] - self.shape=p[1] - self.elements=p[2] - r=[self.point(offset+i)-histogram[i] for i in range(len(histogram))] - #a lot more penalty for being UP: - for i in range(len(r)): - if r[i]>0: - r[i]=2*r[i] - #if p[0]>self.target_max*1.1 or p[0]%d, x->%d" % (limy,limx)) - self.plot_all(limx,limy) - show() - self.spectra.optimize_peaks() - figure() - self.plot_all(limx,limy) - show() - self.spectra.optimize_overall() - figure() - self.plot_all(limx,limy) - figure() - self.plot_peaks(limx,limy) - show() - for p in self.spectra.peaks: print(p) - -class MXKmerSpectraAnalysis(object): - def __init__(self,filename,columns=3,points=10000): - self.spectras=[KmerSpectra(filename,points,column=0,cumulative=True)] - for i in range(columns): - self.spectras.append(KmerSpectra(filename,points,column=i,cumulative=(i==columns-1))) - - - def plot_hist(self,h,points,cap,label=""): - plot([min(cap,x) for x in h[:points]],label=label) - xlabel('Kmer Frecuency') - ylabel('Kmer Count') - legend() - - def plot_hist_df(self,h,points,cap): - plot([max(-cap,min(cap,x)) for x in [h[i+1]-h[i] for i in range(points)]]) - - def plot_all(self,points=0,cap=0,spectra=True,fit=True,dists=True): - if 0==points: points=self.limx - if 0==cap: cap=self.limy - for s in self.spectras: - if self.spectras[0]==s: - slabel="General Spectra" - else: - slabel="%d x present" % (self.spectras.index(s)-1) - figure() - if spectra: self.plot_hist(s.histogram,points,cap,label=slabel) - if fit: self.plot_hist(s.total_values(1,points+1),points,cap,label=slabel+" fit") - if dists: - for p in s.peaks: - self.plot_hist(p.points(1,points+1),points,cap,label="fit dist %d" % s.peaks.index(p)) - show() - for p in s.peaks: - print(p) - - def analyse(self,min_perc=1,min_elem=100000,verbose=False): - self.limx=0 - self.limy=0 - for s in self.spectras: - print("analysing spectra... 
", end=' ') - sys.stdout.flush() - s.create_peaks(min_perc=min_perc,min_elem=min_elem) - - sys.stdout.flush() - if s.peaks: - self.limy=max(int(s.maxval*1.1/1000)*1000, self.limy ) - self.limx=max(min(s.peaks[-1].mean*2,len(s.histogram)) , self.limx) - print("peaks created ... ", end=' ') - sys.stdout.flush() - s.optimize_peaks(min_perc=min_perc,min_elem=min_elem) - print("locally optimised ... ", end=' ') - for p in s.peaks: print(p) - sys.stdout.flush() - s.optimize_overall() - print("overall optimised ... DONE") - sys.stdout.flush() - print("Plot limits: y->%d, x->%d" % (self.limy,self.limx)) - - def peak_stats(self): - """TODO: Runs analyse (TODO:include general spectra) - Takes enough peaks as to cover a given % of the elements: - - Find the peak across all distributions - - Reports peak stats - If multiple peaks have been analyzed, tries to find the "main unique" and explains the results based on that freq. - """ - #step 1, try to find a reasonable mean for kmer frequency. - #weighted means by number of elements? 
- general_dists=[(x.mean,x.elements) for x in self.spectras[0].peaks] - general_dists.sort(key=lambda x: x[0]) - pkc=general_dists[0][0] - kcov=sum([ (x[0]/round(x[0]/pkc) )*x[1] for x in general_dists])/sum([x[1] for x in general_dists]) - print(pkc, kcov) - #step 2, selects frequencies for peaks from bigger to smaller till X% of the elements are covered or no more peaks - goal=0.99*sum([x[1] for x in general_dists]) - maxpeaks=10 - general_dists.sort(key=lambda x: -x[1]) - af=[] - peaks=0 - covered=0 - for x in general_dists: - af.append(kcov*round(x[0]/kcov)) - peaks+=1 - covered+=x[1] - if peaks==maxpeaks or covered>goal: - break - #step 3, report for each peak - #get the candidate peak on each spectra - for f in af: - total=0 - pd={} - for i in range(len(self.spectras)-1): - m=[(x.mean,x.elements) for x in self.spectras[1+i].peaks if x.mean>0.8*f and x.mean<1.2*f] - if len(m)==1: - pd[i]=m[0] - total+=m[0][1] - if len(m)>1: - print("WARNING, MORE THAT 1 PEAK FOR f=%.3f FOUND ON THE %dx SPECTRA!!!" % (f,i)) - print("\n---- Report for f=%.3f (total elements %d)----" % (f,total)) - for i in range(len(self.spectras)-1): - if i in list(pd.keys()): - print(" %dx: %.2f%% (%d elements on f=%.2f)" % (i,float(pd[i][1])*100/total,pd[i][1],pd[i][0])) - else: - print(" %dx: No significant content" % i) - - #step 4, general report - return + """A distribution representing kmers covered a certain number of times. + Contains methods for fitting to an interval""" + + def __init__(self, mean, shape, elements, peak, left, right): + self.mean = mean + self.shape = shape + self.elements = elements + self.peak = peak + self.left = left + self.right = right + + def __str__(self): + return "Peak of " + str(self.peak) + " at frequency " + str(int(self.mean)) + ", with volume of " + \ + str(int(self.elements)) + " elements between frequencies of " + str(self.left) + " and " + str(self.right) + ". 
Shape: " + str(self.shape) + + def toTabString(self): + return "\t".join([str(int(self.left)), str(int(self.mean)), str(int(self.right)), str(int(self.peak)), str(int(self.elements))]) + + @staticmethod + def getTabHeader(): + return "Left\tMean\tRight\tMax\tVolume" + + def point(self, x): + """Normalized Gaussian""" + #return float(self.elements) / R2PI / self.shape * np.exp(-(x - self.mean) ** 2 / 2. / self.shape ** 2) + return self.__p1 * np.exp(-(x - self.mean) ** 2 / 2. / self.shape ** 2) + + def points(self, start, end): + self.__p1 = float(self.elements) / R2PI / self.shape + return [self.point(x) for x in range(start, end)] + + def residues(self, p, offset, histogram): + """residues of fitting self to offset obj[0] with values obj[1]""" + self.__p1 = float(self.elements) / R2PI / self.shape + r = [self.point(offset + i) - histogram[i] for i in range(len(histogram))] + return r + + def constrained_opt(self, offset, histogram): + """Does a constrained optimization of fitting to match the histogram""" + if len(histogram) == 0: + raise RuntimeError("Encountered peak with no range") + optimize.leastsq(self.residues, (self.mean, self.shape, self.elements), (offset, histogram)) + pass + +def plot_hist(h, points, cap, label="", to_screen=False, to_file=None): + plt.plot([min(cap, x) for x in h[:points]], label=label) + plt.xlabel('Kmer Frequency') + plt.ylabel('# Distinct Kmers') + plt.legend() + + + +def plot_hist_df(self, h, points, cap): + plt.plot([max(-cap, min(cap, x)) for x in [h[i + 1] - h[i] for i in range(points)]]) + plt.xlabel('Kmer Frequency') + plt.ylabel('# Distinct Kmers') + plt.legend() + +class SpectraAnalysis(object): + __metaclass__ = abc.ABCMeta + + def __init__(self, freq_cutoff=10000, hom_peak_freq=0, k=27): + self.k = k + self.freq_cutoff = freq_cutoff + self.hom_peak = hom_peak_freq + self.limx = 0 + self.limy = 0 + + @abc.abstractmethod + def analyse(self, min_perc=1, min_elem=100000, verbose=False): + pass + + @abc.abstractmethod + def 
plot(self, points=0, cap=0, to_screen=False, to_files=None): + pass + + @abc.abstractmethod + def peak_stats(self): + pass + +class HistKmerSpectraAnalysis(SpectraAnalysis): + def __init__(self, filename, freq_cutoff=10000, hom_peak_freq=0, k=27): + SpectraAnalysis.__init__(self, freq_cutoff=freq_cutoff, hom_peak_freq=hom_peak_freq, k=k) + self.spectra = KmerSpectra(self.read_hist(filename, freq_cutoff), k=k) + + + def read_hist(self, name, freq_cutoff=10000): + f = open(name) + histogram = [int(x.split()[1]) for x in f.readlines() if x and x[0] != '#'][:freq_cutoff] + f.close() + return histogram + + def plot(self, points=0, cap=0, to_screen=False, to_files=None): + if 0 == points: points = self.limx + if 0 == cap: cap = self.limy + print() + print("Creating plot") + print("-------------") + print() + self.spectra.printPeaks() + + fig = plt.figure() + plot_hist(self.spectra.histogram, points, cap, label="Histogram") + plot_hist(self.spectra.total_values(1, points + 1), points, cap, label="Fitted distribution") + + for p_i, p in enumerate(self.spectra.peaks, start=1): + plot_hist(p.points(1, points + 1), points, cap, label="fit dist %d" % p_i) + + if to_screen: + plt.show() + + if to_files: + filename = to_files + ".dists.png" + fig.savefig(filename) + print("Saved plot to:", filename) + + print() + + def analyse(self, min_perc=1, min_elem=100000, verbose=False): + + # Create the peaks + print("\nAnalysing spectra") + self.spectra.analyse(min_perc=min_perc, min_elem=min_elem, verbose=verbose) + if self.spectra.peaks: + self.limy = int(max(int(self.spectra.maxval * 1.1 / 1000) * 1000, self.limy)) + self.limx = int(max(min(self.spectra.peaks[-1].mean * 2, len(self.spectra.histogram)), self.limx)) + + if verbose: + print("\nPlot limits: y->%d, x->%d" % (self.limy, self.limx)) + + def peak_stats(self): + print() + print("Spectra statistics") + print("------------------") + self.spectra.printGenomeStats(self.hom_peak) + +class MXKmerSpectraAnalysis(SpectraAnalysis): + 
def __init__(self, filename, cns_cutoff=3, freq_cutoff=10000, hom_peak_freq=0, k=27): + SpectraAnalysis.__init__(self, freq_cutoff=freq_cutoff, hom_peak_freq=hom_peak_freq, k=k) + self.spectras = [KmerSpectra(self.read_mx(filename, freq_cutoff=freq_cutoff, column=0, cumulative=True), k=k)] + for i in range(cns_cutoff): + self.spectras.append(KmerSpectra(self.read_mx(filename, freq_cutoff=freq_cutoff, column=i, cumulative=False), k=k)) + + def read_mx(self, name, freq_cutoff=10000, column=1, cumulative=False): + f = open(name) + + histogram = [] + if cumulative: + histogram = [sum([int(y) for y in x.split()[column:]]) for x in f.readlines() if x and x[0] != '#'][ + :freq_cutoff][1:] + else: + histogram = [int(x.split()[column]) for x in f.readlines() if x and x[0] != '#'][:freq_cutoff][1:] + f.close() + return histogram + + def plot(self, points=0, cap=0, to_screen=False, to_files=None): + if 0 == points: points = self.limx + if 0 == cap: cap = self.limy + print() + print("Creating plots") + print("--------------") + print() + for s_i, s in enumerate(self.spectras): + if self.spectras[0] == s: + slabel = "General Spectra" + else: + slabel = "%dx present" % (s_i - 1) + fig = plt.figure() + + print("Plotting:",slabel) + s.printPeaks() + + plot_hist(s.histogram, points, cap, label=slabel) + plot_hist(s.total_values(1, points + 1), points, cap, label=slabel + " fit") + for p_i, p in enumerate(s.peaks, start=1): + plot_hist(p.points(1, points + 1), points, cap, label="fit dist %d" % p_i) + + if to_screen: + plt.show() + + if to_files: + suffix = ".spectra-cn" + str(s_i-1) + ".png" + if self.spectras[0] == s: + suffix = ".general.png" + filename = to_files + suffix + fig.savefig(filename) + print("Saved plot to:", filename) + + print() + + + def analyse(self, min_perc=1, min_elem=100000, verbose=False): + + for s_i, s in enumerate(self.spectras): + if s_i == 0: + print("\nAnalysing full spectra") + else: + print("\nAnalysing spectra with copy number", s_i-1) + 
s.analyse(min_perc=min_perc, min_elem=min_elem, verbose=verbose) + if s.peaks: + self.limy = int(max(int(s.maxval * 1.1 / 1000) * 1000, self.limy)) + self.limx = int(max(min(s.peaks[-1].mean * 2, len(s.histogram)), self.limx)) + elif s_i == 0: + raise RuntimeError("No peaks detected for full spectra. Can't continue.") + + print("\nAnalysed spectra for all requested copy numbers.") + + if verbose: + print("\nPlot limits: y->%d, x->%d" % (self.limy, self.limx)) + + def peak_stats(self): + """TODO: Runs analyse (TODO:include general spectra) + Takes enough peaks as to cover a given % of the elements: + - Find the peak across all distributions + - Reports peak stats + If multiple peaks have been analyzed, tries to find the "main unique" and explains the results based on that freq. + """ + # First check to see if we have anything to work with + if len(self.spectras[0].peaks) == 0: + raise ValueError("Main spectra distribution does not contain any peaks.") + + # step 1, try to find a reasonable mean for kmer frequency. + # weighted means by number of elements? 
+ print("\nMain spectra statistics") + print("-----------------------") + self.spectras[0].printGenomeStats(self.hom_peak) + + # step 2, try to estimate the assembly completeness + completeness = self.calcAssemblyCompleteness() + print("Estimated assembly completeness:", ("{0:.2f}".format(completeness) + "%") if completeness > 0.0 else "Unknown") + + # step 3, selects frequencies for peaks from bigger to smaller till X% of the elements are covered or no more peaks + print("\nBreakdown of copy number composition for each peak") + print("--------------------------------------------------") + + general_dists = self.spectras[0].peaks + goal = 0.99 * sum([x.elements for x in general_dists]) + maxpeaks = 10 + general_dists.sort(key=lambda x: -x.elements) + af = [] + peaks = 0 + covered = 0 + for x in general_dists: + af.append(x.mean) + peaks += 1 + covered += x.elements + if peaks == maxpeaks or covered > goal: + break + + # step 3, report for each peak + # get the candidate peak on each spectra + for f in af: + total = 0 + pd = {} + for i in range(len(self.spectras) - 1): + m = [(x.mean, x.elements) for x in self.spectras[1 + i].peaks if x.mean > 0.8 * f and x.mean < 1.2 * f] + if len(m) == 1: + pd[i] = m[0] + total += m[0][1] + if len(m) > 1: + print("WARNING, MORE THAT 1 PEAK FOR f=%.3f FOUND ON THE %dx SPECTRA!!!" 
% (f, i)) + print("\n---- Report for f=%.3f (total elements %d)----" % (f, total)) + for i in range(len(self.spectras) - 1): + if i in list(pd.keys()): + print( + " %dx: %.2f%% (%d elements at f=%.2f)" % (i, float(pd[i][1]) * 100 / total, pd[i][1], pd[i][0])) + else: + print(" %dx: No significant content" % i) + + + def calcAssemblyCompleteness(self): + + if self.spectras[0].peaks: + hpi = self.spectras[0].getHomozygousPeakIndex(self.hom_peak) + opt_freq = int(self.spectras[0].peaks[hpi-1].mean) + absent_count = self.spectras[1].histogram[opt_freq] + present_count = self.spectras[2].histogram[opt_freq] + return (present_count / (absent_count + present_count)) * 100.0 + else: + return 0.0 + +def get_properties_from_file(input_file): + k = 27 + mx = False + + f = open(input_file) + i = 0 + for l in f.readlines(): + if i > 10: + break + line = l.strip() + if line.startswith("#"): + if line.startswith("# Kmer value:"): + k = int(line.split(":")[1]) + elif line.startswith("# Rows:"): + mx = True + i+=1 + f.close() + + return k, mx + +def main(): + # ----- command line parsing ----- + parser = argparse.ArgumentParser( + description="""Analyse a comp matrix file with respect to the distributions and copy numbers seen within.""") + + parser.add_argument("input", type=str, + help="The input should be either a KAT spectra-cn matrix file or a KAT histogram file.") + + parser.add_argument("-o", "--output_prefix", + help="If present then plots are sent to files starting with this prefix.") + parser.add_argument("-c", "--cns", type=int, default=4, + help="The number of copy numbers to consider in the analysis. Only applicable if input is a matrix file.") + parser.add_argument("-f", "--freq_cutoff", type=int, default=500, + help="The maximum frequency cutoff point to consider. 
Analysis will be done up to this frequency.") + parser.add_argument("-p", "--min_perc", type=float, default=1.0, + help="Any new distribution that adds less to min perc kmers on the iterative analysis will not be added.") + parser.add_argument("-e", "--min_elem", type=int, default=100000, + help="Any new distribution that adds less to min elem kmers on the iterative analysis will not be added.") + parser.add_argument("--plot", action='store_true', + help="Plot fitted distributions to each peak to the screen.") + parser.add_argument("-z", "--homozygous_peak", type=int, default=0, + help="The approximate kmer frequency for the homozygous peak. Allows us to calculate a more accurate genome size estimate.") + parser.add_argument("-v", "--verbose", action='store_true', + help="Print additional information.") + + args = parser.parse_args() + + k, mx = get_properties_from_file(args.input) + print("Input file generated using K", k) + print("Input is a", "matrix" if mx else "histogram", "file") + + a = None + if mx: + print("Processing", args.cns, "spectra") + a = MXKmerSpectraAnalysis(args.input, cns_cutoff=args.cns, freq_cutoff=args.freq_cutoff, hom_peak_freq=args.homozygous_peak, k=k) + else: + a = HistKmerSpectraAnalysis(args.input, freq_cutoff=args.freq_cutoff, hom_peak_freq=args.homozygous_peak, k=k) + + if not a: + raise RuntimeError("Couldn't generate a valid spectra analysis") + + try: + start = time.time() + a.analyse(min_perc=args.min_perc, min_elem=args.min_elem, verbose=args.verbose) + end = time.time() + print("\nTime taken to analyse data: ", end - start, "seconds") + a.peak_stats() + if args.plot or args.output_prefix: + a.plot(to_screen=args.plot, to_files=args.output_prefix) + except Exception: + print("\nERROR\n-----", file=sys.stderr) + traceback.print_exc(file=sys.stderr) if __name__ == '__main__': - # ----- command line parsing ----- - parser = argparse.ArgumentParser( - description="""Analyse a comp matrix file with respect to the distributions and 
copy numbers seen within.""") - - parser.add_argument("matrix_file", type=str, - help="The input matrix file from KAT comp") - - parser.add_argument("-c", "--cns", type=int, default=4, - help="The number of copy numbers to consider in the analysis") - parser.add_argument("-f", "--freq_cutoff", type=int, default=200, - help="The maximum frequency cutoff point to consider. Analysis will be done up to this frequency.") - parser.add_argument("-p", "--min_perc", type=int, default=1, - help="Any new distribution that adds less to min perc kmers on the iterative analysis will not be added.") - parser.add_argument("-e", "--min_elem", type=int, default=100000, - help="Any new distribution that adds less to min elem kmers on the iterative analysis will not be added.") - - args = parser.parse_args() - - a=MXKmerSpectraAnalysis(args.matrix_file, args.cns, args.freq_cutoff) - a.analyse(min_perc=args.min_perc, min_elem=args.min_elem) - a.peak_stats() + main() \ No newline at end of file diff --git a/scripts/kat_plot_spectra-hist.py b/scripts/kat_plot_spectra-hist.py index 722de2da..786398f7 100755 --- a/scripts/kat_plot_spectra-hist.py +++ b/scripts/kat_plot_spectra-hist.py @@ -85,7 +85,7 @@ elif "Title" in header: title = headers[0]["Title"] else: - title = "Spectra Copy Number Plot" + title = "Spectra Histogram Plot" if args.x_label is not None: x_label = args.x_label diff --git a/src/.gitignore b/src/.gitignore index 54d19abc..df4f7234 100644 --- a/src/.gitignore +++ b/src/.gitignore @@ -12,3 +12,4 @@ /plot/ /sect/ /inc/ +/tags diff --git a/src/comp.cc b/src/comp.cc index 9d900b65..8664fe6f 100644 --- a/src/comp.cc +++ b/src/comp.cc @@ -667,7 +667,7 @@ int kat::Comp::main(int argc, char *argv[]) { ("disable_hash_grow,g", po::bool_switch(&disable_hash_grow)->default_value(false), "By default jellyfish will double the size of the hash if it gets filled, and then attempt to recount. Setting this option to true, disables automatic hash growing. 
If the hash gets filled an error is thrown. This option is useful if you are working with large genomes, or have strict memory limits on your system.") ("density_plot,n", po::bool_switch(&density_plot)->default_value(false), - "Makes a spectra_mx plot. By default we create a spectra_cn plot.") + "Makes a density plot. By default we create a spectra_cn plot.") ("output_type,p", po::value(&plot_output_type)->default_value(DEFAULT_COMP_PLOT_OUTPUT_TYPE), "The plot file type to create: png, ps, pdf. Warning... if pdf is selected please ensure your gnuplot installation can export pdf files.") ("output_hists,h", po::bool_switch(&output_hists)->default_value(false),