Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

added validation code for narrowPeak, broadPeak files #23

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion idr/idr.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def mean(items):

import idr.optimization
from idr.optimization import estimate_model_params, old_estimator
from idr.utility import calc_post_membership_prbs, compute_pseudo_values
from idr.utility import calc_post_membership_prbs, compute_pseudo_values, is_valid_narrowPeak, is_valid_broadPeak

Peak = namedtuple(
'Peak', ['chrm', 'strand', 'start', 'stop', 'signal', 'summit', 'signalValue', 'pValue', 'qValue'])
Expand Down Expand Up @@ -696,9 +696,15 @@ def load_samples(args):
else:
peak_merge_fn = min
if args.input_file_type == 'narrowPeak':
if not all([is_valid_narrowPeak(fp) for fp in args.samples]):
raise ValueError(
"Input file(s) are not narrowPeak files")
summit_index = 9
else:
summit_index = None
if not all([is_valid_broadPeak(fp) for fp in args.samples]):
raise ValueError(
"Input file(s) are not broadPeak files")
f1, f2 = [load_bed(fp, signal_index, summit_index)
for fp in args.samples]
oracle_pks = (
Expand Down
46 changes: 46 additions & 0 deletions idr/utility.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,53 @@

import math


DEFAULT_PV_COVERGE_EPS = 1e-8
def validate_peak_line(line, include_summit=False):
    """Check that one whitespace-delimited peak line has well-formed fields.

    The expected columns are, in order: chrom, start, end, name, score,
    strand, signal, pvalue, qvalue and, when ``include_summit`` is true
    (narrowPeak), a tenth summit column.

    Args:
        line: one text line of the peak file.
        include_summit: True for narrowPeak (10 columns), False for
            broadPeak (9 columns).

    Returns:
        True when every field parses and passes its range check; False
        otherwise. When a field cannot be converted to its numeric type, or
        the column count is wrong, a diagnostic is printed as well.
    """
    n_expected = 10 if include_summit else 9
    try:
        parts = line.split()
        if len(parts) != n_expected:
            raise ValueError(
                "expected %d fields, found %d" % (n_expected, len(parts)))

        # chrom (parts[0]) and name (parts[3]) are free-form strings, so
        # there is nothing to check beyond their presence.
        # start / end only need to be integers; int() raises ValueError
        # if they are not.
        int(parts[1])
        int(parts[2])

        score_ok = 0 <= int(parts[4]) <= 1000
        strand_ok = parts[5] in ('.', '-', '+')

        # signal only needs to be a number.
        float(parts[6])

        # pvalue / qvalue are -log10 values: any non-negative number is
        # legal, and -1 is the "not assigned" sentinel. The fields are
        # strings, so they must be converted before comparing to -1.
        pvalue = float(parts[7])
        qvalue = float(parts[8])
        pvalue_ok = pvalue == -1 or pvalue >= 0
        qvalue_ok = qvalue == -1 or qvalue >= 0

        # summit is a 0-based offset from start, so 0 is valid; -1 means
        # that no summit was called.
        summit_ok = int(parts[9]) >= -1 if include_summit else True

        return (score_ok and strand_ok and pvalue_ok and qvalue_ok
                and summit_ok)
    except ValueError:
        print("Validation error: one or more of the elements could not be interpreted as the proper type, or there were insufficient elements to unpack: ", line)
        return False


def is_valid_narrowPeak(fp):
    """Report whether every line of ``fp`` passes narrowPeak validation.

    ``fp`` is an open, seekable, line-iterable file object. It is read to
    the end and then rewound to the start so it can be read again.
    Returns True only if all lines validate (an empty file yields True).
    """
    every_line_ok = True
    for record in fp:
        # No early exit: every line is validated, not just the first bad one.
        if not validate_peak_line(record, True):
            every_line_ok = False
    fp.seek(0)
    return every_line_ok

def is_valid_broadPeak(fp):
    """Report whether every line of ``fp`` passes broadPeak validation.

    ``fp`` is an open, seekable, line-iterable file object. It is read to
    the end and then rewound to the start so it can be read again.
    Returns True only if all lines validate (an empty file yields True).
    """
    every_line_ok = True
    for record in fp:
        # No early exit: every line is validated, not just the first bad one.
        if not validate_peak_line(record, False):
            every_line_ok = False
    fp.seek(0)
    return every_line_ok

def simulate_values(N, params):
"""Simulate ranks and values from a mixture of gaussians
Expand Down