Minor changes

snayfach · Nov 1, 2016 · 51e31ef · 51e31ef
1 parent 09aaa4d
commit 51e31ef
Show file tree

Hide file tree

Showing 6 changed files with 26 additions and 24 deletions.
diff --git a/midas/merge/annotate.py b/midas/merge/annotate.py
@@ -98,7 +98,6 @@ def annotate_site(site, genes, gene_index, contigs):
 	# gene_index: current position in list of genes; global variable
 	site.snp_types = {}
 	site.amino_acids = {}
-
 	while True:
 		# 1. fetch next gene
 		#    if there are no more genes, snp must be non-coding so break
@@ -120,7 +119,7 @@ def annotate_site(site, genes, gene_index, contigs):
 		# 4. otherwise, snp must be in gene
 		#    annotate site (1D-4D) and snp (SYN, NS)
 		else:
-			site.gene_id = gene['gene_id'].split('|')[-1]
+			site.gene_id = gene['gene_id']
 			site.ref_codon, site.codon_pos = fetch_ref_codon(site, gene)
 			if not all([_ in ['A','T','C','G'] for _ in site.ref_codon]): # check for invalid bases in codon
 				site.site_type = 'NA'; site.gene_id = ''

diff --git a/midas/merge/merge_snps.py b/midas/merge/merge_snps.py
@@ -160,7 +160,7 @@ def filter_snp_matrix(species_id, samples, args):
 			write_matrices(site, matrices)
 
 def write_readme(args, sp):
-	outfile = open('%s/%s/README' % (args['outdir'], sp.id), 'w')
+	outfile = open('%s/%s/readme.txt' % (args['outdir'], sp.id), 'w')
 	outfile.write("""
 Description of output files and file formats from 'merge_midas.py snps'
 

diff --git a/midas/merge/merge_species.py b/midas/merge/merge_species.py
@@ -87,7 +87,7 @@ def identify_samples(args):
 	return samples
 
 def write_readme(args):
-	outfile = open('%s/README' % args['outdir'], 'w')
+	outfile = open('%s/readme.txt' % args['outdir'], 'w')
 	outfile.write("""
 Description of output files and file formats from 'merge_midas.py species'
 

diff --git a/midas/merge/snp_matrix.py b/midas/merge/snp_matrix.py
@@ -15,18 +15,13 @@ def __init__(self, files, samples, info=None):
 			self.id, self.depth = next(files['depth'])
 			self.id, self.alt_allele = next(files['alt_allele'])
 			self.info = next(info) if info else None
-			self.ref_id, self.ref_pos, self.ref_allele = self.parse_id()
+			self.ref_id, self.ref_pos, self.ref_allele = self.id.rsplit('|', 2)
+			self.ref_pos = int(self.ref_pos)
 			self.samples = samples
 
 		except StopIteration:
 			self.id = None
 
-	def parse_id(self):
-		ref_id = self.id.rsplit('|')[0]
-		ref_pos = int(self.id.split('|')[1])
-		ref_allele = self.id.split('|')[2]
-		return ref_id, ref_pos, ref_allele
-
 	def sample_values(self): # return a dic mapping sample id to site values
 		d = {}
 		for index, sample in enumerate(self.samples):

diff --git a/scripts/merge_midas.py b/scripts/merge_midas.py
@@ -134,13 +134,18 @@ def genes_arguments():
 	species.add_argument('--species_id', dest='species_id', type=str, metavar='CHAR',
 		help="""Comma-separated list of species ids""")
 	species.add_argument('--max_species', type=int, metavar='INT',
-		help="""Maximum number of species to merge. useful for testing (use all)""")
+		help="""Maximum number of species to merge. Useful for testing (use all)""")
 	sample = parser.add_argument_group('Sample filters (select subset of samples from INPUT)')
 	sample.add_argument('--sample_depth', type=float, default=1.0, metavar='FLOAT',
 		help="""Minimum read-depth across all genes with non-zero coverage (1.0)""")
 	sample.add_argument('--max_samples', type=int, metavar='INT',
-		help="""Maximum number of samples to process. useful for testing (use all)""")
-	gene = parser.add_argument_group('Presence/Absence')
+		help="""Maximum number of samples to process. Useful for testing (use all)""")
+	gene = parser.add_argument_group('Quantification')
+	gene.add_argument('--cluster_pid', type=str, dest='cluster_pid', default='95', choices=['75', '80', '85', '90', '95', '99'],
+		help="""In the database, pan-genomes are defined at 6 different %% identity clustering cutoffs
+CLUSTER_PID allows you to quantify gene content for any of these sets of gene clusters
+By default, gene content is reported for genes clustered at 95%% identity (95)
+""")
 	gene.add_argument('--min_copy', type=float, default=0.35, metavar='FLOAT',
 		help="""Genes >= MIN_COPY are classified as present
 Genes < MIN_COPY are classified as absent (0.35)""")
@@ -295,6 +300,7 @@ def print_genes_arguments(args):
 	print ("  keep samples with >=%s mean coverage across genes with non-zero coverage" % args['sample_depth'])
 	if args['max_samples']: print ("  keep <= %s samples" % args['max_samples'])
 	print ("Gene quantification criterea:")
+	print ("  quantify genes clustered at %s%% identity" % args['cluster_pid'])
 	print ("  present (1): genes with copy number >= %s" % args['min_copy'])
 	print ("  absent (0): genes with copy number < %s" % args['min_copy'])
 	print ("===============================")

diff --git a/scripts/run_midas.py b/scripts/run_midas.py
@@ -148,10 +148,12 @@ def print_species_arguments(args):
 	lines.append("Input reads (2nd mate): %s" % args['m2'])
 	lines.append("Remove temporary files: %s" % args['remove_temp'])
 	lines.append("Word size for database search: %s" % args['word_size'])
-	if args['mapid']: lines.append("Minimum mapping identity: %s" % args['mapid'])
+	if args['mapid']:
+		lines.append("Minimum mapping identity: %s" % args['mapid'])
 	lines.append("Minimum mapping alignment coverage: %s" % args['aln_cov'])
 	lines.append("Number of reads to use from input: %s" % (args['max_reads'] if args['max_reads'] else 'use all'))
-	if args['read_length']: lines.append("Trim reads to %s-bp and discard reads with length < %s-bp" % (args['read_length'], args['read_length']))
+	if args['read_length']:
+		lines.append("Trim reads from 3'/right end to %s-bp and discard reads with length < %s-bp" % (args['read_length'], args['read_length']))
 	lines.append("Number of threads for database search: %s" % args['threads'])
 	lines.append("================================")
 	args['log'].write('\n'.join(lines)+'\n')
@@ -258,11 +260,11 @@ def gene_arguments():
 	map.add_argument('--mapid', type=float, metavar='FLOAT',
 		default=94.0, help='Discard reads with alignment identity < MAPID (94.0)')
 	map.add_argument('--mapq', type=int, metavar='INT',
-		default=20, help='Discard reads with mapping quality < MAPQ (10)')
+		default=0, help=argparse.SUPPRESS) # help='Discard reads with mapping quality < MAPQ (0)'
 	map.add_argument('--aln_cov', type=float, metavar='FLOAT',
 		default=0.75, help='Discard reads with alignment coverage < ALN_COV (0.75)')
 	map.add_argument('--trim', type=int, default=0, metavar='INT',
-		help='Trim N base-pairs from read-tails (0)')
+		help='Trim N base-pairs from 3\'/right end of read (0)')
 	args = vars(parser.parse_args())
 	if args['species_id']: args['species_id'] = args['species_id'].split(',')
 	return args
@@ -303,7 +305,7 @@ def print_gene_arguments(args):
 		lines.append("  minimum alignment coverage of reads: %s" % args['aln_cov'])
 		lines.append("  minimum read quality score: %s" % args['readq'])
 		lines.append("  minimum mapping quality score: %s" % args['mapq'])
-		lines.append("  trim %s base-pairs from read-tails" % args['trim'])
+		lines.append("  trim %s base-pairs from 3'/right end of read" % args['trim'])
 	lines.append("================================")
 	args['log'].write('\n'.join(lines)+'\n')
 	sys.stdout.write('\n'.join(lines)+'\n')
@@ -377,7 +379,7 @@ def snp_arguments():
 	snps.add_argument('--readq', type=int, metavar='INT',
 		default=20, help='Discard reads with mean quality < READQ (20)')
 	snps.add_argument('--trim', metavar='INT', type=int, default=0,
-		help='Trim N base-pairs from read-tails (0)')
+		help='Trim N base-pairs from 3\'/right end of read')
 	snps.add_argument('--discard', default=False, action='store_true',
 		help='Discard discordant read-pairs')
 	snps.add_argument('--baq', default=False, action='store_true',
@@ -424,7 +426,7 @@ def print_snp_arguments(args):
 		lines.append("  minimum mapping quality score: %s" % args['mapq'])
 		lines.append("  minimum base quality score: %s" % args['baseq'])
 		lines.append("  minimum read quality score: %s" % args['readq'])
-		lines.append("  trim %s base-pairs from read-tails" % args['trim'])
+		lines.append("  trim %s base-pairs from 3'/right end of read" % args['trim'])
 		if args['discard']: lines.append("  discard discordant read-pairs")
 		if args['baq']: lines.append("  enable BAQ (per-base alignment quality)")
 		if args['adjust_mq']: lines.append("  adjust MAPQ")
@@ -574,7 +576,7 @@ def check_snps(args):
 		sys.exit("\nError: BASEQ must be between 0 and 100\n")
 
 def write_readme(program, args):
-	outfile = open('%s/%s/README' % (args['outdir'], program), 'w')
+	outfile = open('%s/%s/readme.txt' % (args['outdir'], program), 'w')
 	if program == 'species':
 		outfile.write("""
 Description of output files and file formats from 'run_midas.py species'
@@ -599,7 +601,7 @@ def write_readme(program, args):
   coverage: estimated genome-coverage (i.e. read-depth) of species in metagenome
   relative_abundance: estimated relative abundance of species in metagenome
 
-**Additional information for each species can be found in the reference database:
+Additional information for each species can be found in the reference database:
  %s/marker_genes
 """ % args['db'])
 	elif program == 'genes':
@@ -638,7 +640,7 @@ def write_readme(program, args):
   mean_coverage: average read-depth across genes with at least 1 mapped read
   marker_coverage: median read-depth across 15 universal single copy genes
 
-**Additional information for each species can be found in the reference database:
+Additional information for each species can be found in the reference database:
  %s/pan_genomes
 """ % args['db'])
 	elif program == 'snps':