Skip to content

Commit

Permalink
Merge pull request #4 from sp00nman/annotation
Browse files Browse the repository at this point in the history
Genome browser tracks
  • Loading branch information
sp00nman committed Jun 22, 2015
2 parents a529a4d + 700935a commit 9a22b64
Showing 1 changed file with 138 additions and 24 deletions.
162 changes: 138 additions & 24 deletions src/gt_screen_workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -370,6 +370,20 @@ def fix_end_position(project_name,
return msg_fix


def parse_intersectfile(annotation_bed):

try:
file_handle = open(annotation_bed)
bed_files = [line.rstrip('\r\n').split('\t') \
for line in file_handle if line.rstrip('\r\n')]
file_handle.close()

except IOError:
print('Annotation file is missing.')

return bed_files


def intersectbed(project_name,
project_dir,
sample_file,
Expand Down Expand Up @@ -440,6 +454,48 @@ def plot_results(project_name,
output_file)
return msg_plot, cmd_plot


def browser_track(project_name,
project_dir,
annotation_name,
file_ext):

input_file = project_dir + "/" + project_name + "." + "insertions" + "." \
+ annotation_name + ".bed"
output_file = project_dir + "/" + project_name + "." \
+ annotation_name + "." + file_ext

msg_track = "Create browser tracks. "

if annotation_name=="exon" or annotation_name=="intron":
df = pd.read_csv(input_file, sep="\t",
names=["chr", "start", "end",
"id", "mapq", "strand",
"cigar", "ens_chr", "ens_start",
"ens_end", "ens_annotation",
"ens_strand", "ens_ensid",
"ens_gsymbol", "ens_transid",
"ens_num"])
track_name = df['id'] + "~" + df['ens_gsymbol'] + "~" + annotation_name \
+ df['strand'] + "/" + df['ens_strand']
reshape = df.loc[:,['chr', 'start', 'end']]
reshape['track_name'] = track_name
reshape['score'] = 0
reshape['strand'] = df['strand']

else:
df = pd.read_csv(input_file, sep="\t",
names=["chr", "start", "end",
"id", "mapq", "strand", "cigar",
"e_chr", "e_start", "e_end",
"e_peak_id", "e_score", "e_strand", "e_num"])
reshape = df.loc[:,['chr', 'start', 'end','e_peak_id','e_score','strand']]

reshape.to_csv(output_file, mode='w', sep="\t", index=0, header=0)

return msg_track


#def report_statistics():
# number of mapped reads...duplicates,..insertion count..

Expand All @@ -455,7 +511,7 @@ def plot_results(project_name,
help='Limit job submission to a particular '
'analysis stage. '
'[all,(bam2fastq),alignment,filter,sort,duplicates,index,'
'insertions,annotate,count,fisher,plot]')
'insertions,annotate,count,fisher,plot,browser]')
parser.add_argument('--project_name', dest='project_name', required=False,
help='Name of project directory.')
parser.add_argument('--output_dir', dest='output_dir', required=False,
Expand All @@ -470,10 +526,8 @@ def plot_results(project_name,
help='Genome version')
parser.add_argument('--bowtie2', dest='bowtie2', required=False,
help='Path to bowtie2 indices')
parser.add_argument('--annotation_exon', dest='annotation_exon', required=False,
help='Exon annotation file.')
parser.add_argument('--annotation_intron', dest='annotation_intron', required=False,
help='Intron annotation file')
parser.add_argument('--annotation', dest='annotation', required=False,
help='annotation file. Format (tab-separated: exon /path/to/annotation_1.txt \n intron /path/to/annotation_2.txt')
parser.add_argument('--control_file', dest='control_file', required=False,
help='Control file with insertions for fisher-test.')
parser.add_argument('--refseq_file', dest='refseq_file', required=False,
Expand All @@ -499,10 +553,8 @@ def plot_results(project_name,
args.genome_version = "hg19"
if not args.bowtie2:
args.bowtie2 = "forBowtie2"
if not args.annotation_exon:
args.annotation_exon = home_dir + "hg19_exons.gtf"
if not args.annotation_intron:
args.annotation_intron = home_dir + "hg19_introns.gtf"
if not args.annotation:
args.annotation = home_dir + "annotation_file.txt"
if not args.control_file:
args.control_file = home_dir + "control_file.txt"
if not args.refseq_file:
Expand All @@ -514,8 +566,9 @@ def plot_results(project_name,
# set project directory
project_dir = args.output_dir + "/" + args.project_name

# creat project directory
# create directory structure
create_output_dir(args.output_dir, args.project_name)
#TODO: create /img and /statistics

# create log file
logfile_name = args.output_dir + "/" + args.project_name + "/" + args.project_name + ".log"
Expand Down Expand Up @@ -553,20 +606,30 @@ def plot_results(project_name,
'fix_pos': 'rm2bp_insertions_header_fix.bed',
'count': 'count_table.txt',
'fisher': 'fisher_test.txt',
'bubble': 'bubble_plot.pdf'}
'bubble': 'bubble_plot.pdf',
'browser': 'browser_track'}

# only necessary if --stage is not [all]
sample_file = args.sequences_dir + "/" + args.sample_file

# start workflow
if re.search(r"bam2fastq", args.stage):

####################################
print "Convert bam to fastq file." #
####################################

(msg, cmd) = bam2fastq(project_dir, sample_file,
args.project_name, file_ext="fastq")
status = run_cmd(msg, cmd)
sample_file = project_dir + "/" + args.project_name + "." \
+ file_ext['bam2fastq']

if re.search(r"all|alignment", args.stage):

#########################
print "Read alignment." #
#########################

(msg, cmd) = alignment(args.genome_version, args.genomes,
args.project_name, sample_file,
Expand All @@ -577,6 +640,11 @@ def plot_results(project_name,
+ file_ext['alignment']

if re.search(r"all|filter", args.stage):

#######################
print "Filter reads." #
#######################

(msg, cmd) = sam2bam(args.project_name, project_dir,
sample_file, file_ext=file_ext['sam2bam'])
status = run_cmd(msg,cmd)
Expand All @@ -588,6 +656,10 @@ def plot_results(project_name,
sample_file = project_dir + "/" + args.project_name + "." + file_ext['filter']

if re.search(r"all|duplicates", args.stage):

############################
print "Remove duplicates." #
############################

(msg, cmd) = sort_bam(args.project_name, project_dir,
sample_file, file_ext=file_ext['sort_bam'])
Expand All @@ -607,24 +679,31 @@ def plot_results(project_name,
+ file_ext['duplicates']

if re.search(r"all|index", args.stage):

(msg, cmd) = bam2bai(args.project_name, project_dir,
sample_file, file_ext=file_ext['index'])
status = run_cmd(msg, cmd)

if re.search(r"all|insertions", args.stage):

(msg, cmd) = bam2sam(args.project_name, project_dir,
sample_file, file_ext=file_ext['bam2sam'])
status = run_cmd(msg, cmd)
sample_file = project_dir + "/" + args.project_name + "." + file_ext['bam2sam']

print "Remove insertions 1 or 2 bp away."

###########################################
print "Remove insertions 1 or 2 bp away." #
###########################################
remove2bpinsertions(args.project_name, project_dir,
sample_file, file_ext=file_ext['insertions'])
sample_file = project_dir + "/" + args.project_name + "." \
+ file_ext['insertions']

if re.search(r"all|annotate", args.stage):
msg_rm2bpins = "Annotate insertions."

##############################
print "Annotate insertions." #
##############################
(msg, cmd) = getheader(args.project_name, project_dir,
file_ext=file_ext['get_header'])
status = run_cmd(msg, cmd)
Expand All @@ -644,34 +723,69 @@ def plot_results(project_name,
sample_file = project_dir + "/" + args.project_name + "." \
+ file_ext['sam2bed']

print "Fix end position to be start+1 "
##############################################################
print "Add +1 to end position in bedfile. chr start star+1 " #
##############################################################
fix_end_position(args.project_name, project_dir,
sample_file, file_ext=file_ext['fix_pos'])

sample_file = project_dir + "/" + args.project_name + "." + file_ext['fix_pos']

(msg, cmd) = intersectbed(args.project_name, project_dir,
sample_file, args.annotation_exon,
annotation_name="exon", file_ext="insertions" )
status = run_cmd(msg, cmd)
(msg, cmd) = intersectbed(args.project_name, project_dir,
sample_file, args.annotation_intron,
annotation_name="intron", file_ext="insertions")
status = run_cmd(msg, cmd)

######################################
print "Intersect annotation files. " #
######################################

bed_files = parse_intersectfile(args.annotation)

for bed in bed_files:
annotation_name = bed[0]
annotation_file = bed[1]

(msg, cmd) = intersectbed(args.project_name, project_dir,
sample_file, annotation_file,
annotation_name=annotation_name, file_ext="insertions" )
status = run_cmd(msg, cmd)

if re.search(r"all|count", args.stage):

#####################################
print "Count number of insertions." #
#####################################
(msg, cmd) = count_insertions(args.project_name, project_dir,
file_ext=file_ext['count'])
status = run_cmd(msg, cmd)
sample_file = project_dir + "/" + args.project_name + "." + file_ext['count']

if re.search(r"all|fisher", args.stage):

######################
print "Fisher-test." #
######################
(msg, cmd) = fisher_test(args.project_name, project_dir, sample_file,
args.control_file, file_ext=file_ext['fisher'])
status = run_cmd(msg, cmd)
sample_file = project_dir + "/" + args.project_name + "." + file_ext['fisher']

if re.search(r"all|plot", args.stage):

#########################################
print "Plot insertions in bubble plot." #
#########################################
(msg, cmd) = plot_results(args.project_name, project_dir, args.refseq_file,
sample_file, file_ext=file_ext['bubble'])
status = run_cmd(msg, cmd)

if re.search(r"all|browser", args.stage):

##########################################################
print "Create browser tracks to view insertions sites. " #
##########################################################
bed_files = parse_intersectfile(args.annotation)

for bed in bed_files:
annotation_name = bed[0]
browser_track(args.project_name, project_dir, annotation_name=annotation_name, file_ext=file_ext['browser'])




0 comments on commit 9a22b64

Please sign in to comment.