'''
Converts BED files containing UniProtKB genome annotation tracks from genome build hg38 to hg19
using UCSC's 'liftOver' tool.
See comments in the script for complications produced in liftOver's output and how they are handled.
Author: Manavalan Gajapathy
Version changes in UniProt's format:
    1. In the Feb 2017 release, amino acid details next to the UniProt ID in bed files were removed
       (e.g. in column 4, 'O95671:p.E44' is now just 'O95671').
'''
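# Illustration of the column-4 change noted above (coordinates are placeholder values, not real data):
#   pre-Feb 2017 : chr1  1000  1003  O95671:p.E44  0  +  ...
#   post-Feb 2017: chr1  1000  1003  O95671        0  +  ...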
import os
from shutil import copyfile, copyfileobj
from glob import glob
from subprocess import call
import csv
from settings_files.fn_replace_acronyms import replace_acronyms_in_natural_variant
from pybedtools import BedTool
import shutil
# function to parse settings file
def parse_settings(settings_filename):
settings_dict = {}
annotations_of_interest = []
with open(settings_filename, 'Ur') as settings_handle:
for line in settings_handle:
line = line.replace('\n', '')
line_list = line.split(',')
            if line[0:2] != '##' and line.replace(',', ''):  # excludes commented and empty lines
settings_dict[line_list[0]] = [ line_list[1], line_list[2] ]
if line_list[2] == '1':
annotations_of_interest.append(line_list[0])
return settings_dict, annotations_of_interest
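# Sketch of the expected settings-file rows, inferred from the parser above (the actual annotation
# names and labels depend on the settings file shipped with this repository):
#   ## lines starting with '##' are comments and are skipped, as are empty lines
#   <annotation_name>,<annotation_label>,<1 or 0>
# Rows whose third field is '1' are collected into annotations_of_interest.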
# Function to fix bed file format during conversion from build hg38 to hg19.
# Also converts rows with multiple non-contiguous amino acid annotations into separate lines.
def edit_bedfile(input_bed, settings_dict, annotation_type, disulfide_bedfile, output_bed_handle):
csv_input_bed = csv.reader(input_bed, delimiter='\t')
for row in csv_input_bed:
# replace score column in bed files with annotation type; score was set to zero by default by UniProt.
if row[4] == '0':
row[4] = settings_dict[annotation_type][0]
else:
            print 'Error: column 5 (score column) must have value 0, but this file contains a different value. ' \
                  'Figure out why. Script terminates now.'
            raise ValueError('unexpected value in score column (column 5)')
# merges column 14 back together
if len(row) > 13:
merged_col14 = ' '.join(row[13:])
else:
merged_col14 = ''
# following part is to use details of block columns in bed file to split multiple annotations into separate rows
name = row[3].split(':')
uniprot_id = name[0]
# if disulfide_bedfile: # no more needed due to format change in Feb 2017 release
# amino_acids = ( name[1].replace('p.', '') ).split('_')
# else:
# amino_acids = ( name[1].replace('p.', '') ).split(',')
block_size_list = [ int(x) for x in row[10].split(',') ]
block_start_list = [int(x) for x in row[11].split(',')]
if disulfide_bedfile and merged_col14.startswith('disulfide bond_'):
block_size_list = [3, 3] # to account for uniprot mistakenly treating disulfide bond as continuous region
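            # (presumably 3 bp = the single codon encoding each bonded cysteine, so the two
            # blocks become the two ends of the disulfide bond)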
start_site, end_site, strand = int(row[1]), int(row[2]), row[5]
# note that in bed format, start sites are 0-based and end sites are 1-based.
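        # e.g. a feature covering genomic bases 11-13 (1-based, inclusive) is stored in BED as start=10, end=13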
for block_no in range(0, len(block_size_list)):
if disulfide_bedfile and merged_col14.startswith('disulfide bond_'): # disulfide bonds forming with other polypeptide chain are ignored here and treated properly in the 'else' section
if block_no == 0:
new_start = start_site
new_end = new_start + 3
elif block_no == 1:
new_start = end_site - 3
new_end = new_start + 3
else:
new_start = start_site + block_start_list[block_no]
new_end = new_start + block_size_list[block_no]
# # properly connects amino acids to coordinates based on strand type
# if strand == '+': # no more needed due to format change in Feb 2017 release
# new_name = "%s:p.%s" %(uniprot_id, amino_acids[block_no])
# else:
# new_name = "%s:p.%s" %(uniprot_id, amino_acids[-(block_no+1)])
new_name = "%s" % uniprot_id
revised_row = "%s\t%i\t%i\t%s\t%s\t%i\t%i\t%s\t1\t%i\t0\t%s\t%s\n" \
%(row[0], new_start, new_end, new_name, '\t'.join(row[4:6]), new_start, new_end, row[8],
block_size_list[block_no], row[12], merged_col14)
output_bed_handle.write(revised_row)
output_bed_handle.close()
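# Illustrative example of the reshaping done by edit_bedfile() (placeholder values, not real data;
# fields are tab-separated in the actual files, and 'label' stands for the annotation label taken
# from the settings file):
#   input row  : chr1  100  220  P12345  0      +  100  220  255,0,0  2  10,10  0,110  FT_ID  desc
#   output rows: chr1  100  110  P12345  label  +  100  110  255,0,0  1  10     0      FT_ID  desc
#                chr1  210  220  P12345  label  +  210  220  255,0,0  1  10     0      FT_ID  desc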
# function to create a directory if it doesn't already exist
def mkdir_if_not_exists(dir_path):
if not os.path.exists(dir_path):
os.makedirs(dir_path)
# function to create a new file, deleting any existing file of the same name first
def create_newfile(directory, filename):
# creates a bed file that will have merged data of bedfiles of interest
merged_outbedfile = os.path.join(directory, filename)
if os.path.exists(merged_outbedfile): # deletes file if it exists. This is done to create and append to a new file.
os.remove(merged_outbedfile)
# Commented out since use of bedtools' sorting feature removes the title line while sorting.
# with open(merged_outbedfile, 'w') as out_handle: # write title
# out_handle.write('#chrom\tg_start\tg_end\tuniprot_id\tannotation_type\tstrand\tthick_start\tthick_end\trgb\tno_of_blocks\tsize_of_blocks\tstart_of_blocks\tannotation_identifier\tdescription\n')
return merged_outbedfile
# function to append data to existing file based on annotation type
def append_to_mergefile(merged_bedfilename, fixed_bed19_outfile, settings_dict, annotation_type, annotation_value):
with open(merged_bedfilename, 'ab') as combined_handle:
if settings_dict[annotation_type][1] == annotation_value:
with open(fixed_bed19_outfile, 'rb') as edited_bed:
copyfileobj(edited_bed, combined_handle, 1024 * 1024 * 10) # 10MB per writing chunk to avoid reading big file into memory.
# release version info
release_ver = 'Mar2017'  # change this variable to match the directory name of the most recent release
dir_new_release = 'hg38_uniprot_bedfiles/%s' % release_ver
# mkdir_if_not_exists(dir_new_release)
# read settings file
settings_filename = 'settings_files/Settings_UniProt_compare.csv'
settings_dict, annotations_of_interest = parse_settings(settings_filename)
# get filepaths of UniProt's genome annotation track containing bed files
input_bedfiles_dir = '%s/UP000005640_9606_beds' % dir_new_release
input_file_list = glob(input_bedfiles_dir + '/*.bed')
build19_output_dir = os.path.join(input_bedfiles_dir, 'convert2hg19') # dir to store liftOver output data in hg-build19 format
mkdir_if_not_exists(build19_output_dir)
fixed_bed19_outdir = os.path.join(build19_output_dir, 'hg19_format_fixed') # dir to have format-corrected build19 bed files
mkdir_if_not_exists(fixed_bed19_outdir)
# creates a bed file that will have merged data of bedfiles with annotation types labelled as type 1 in settings file
download_dir = 'Download_data_%s' % release_ver
mkdir_if_not_exists(download_dir)
merged_type1_filename = '%s/merged_select_UniProt%s_hg19_restructured_type1.bed' % (download_dir, release_ver)
merged_bedfile_type1 = create_newfile(directory='.', filename=merged_type1_filename)
# creates a bed file that will have merged data of bedfiles with annotation types labelled as type 0 in settings file
merged_type0_filename = '%s/merged_select_UniProt%s_hg19_restructured_type0.bed' % (download_dir, release_ver)
merged_bedfile_type0 = create_newfile(directory='.', filename=merged_type0_filename)
# the liftOver tool requires a chain file for proper mapping between genome assemblies
liftover_chainfile = 'settings_files/hg38ToHg19.over.chain.gz'
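# (this is the standard hg38-to-hg19 chain file distributed by UCSC; if absent, it can be obtained
#  from http://hgdownload.soe.ucsc.edu/goldenPath/hg38/liftOver/)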
print "Chainfile used for liftOver: '%s'\n" % liftover_chainfile
# merged_file = 'merged_UniProt_genome_annotations_of_interest.bed'
count = 1
for bed_file in input_file_list:
base_filename = os.path.basename(bed_file)
build19_outfile = os.path.join(build19_output_dir, base_filename.replace('.bed', '_build19.bed'))
error_filename = os.path.join(build19_output_dir, base_filename.replace('.bed', '_unmapped.log')) # if coordinates could not be mapped to target assembly, this file logs them
# uses 'liftOver' command line tool to convert coordinates to target assembly coordinates
try:
        # If the -bedPlus parameter is not provided, liftOver seems to convert even columns that are not coordinates;
        # for example, it converts the RGB column (column #9) to new coordinates. With -bedPlus, only the columns within the given limit are treated as coordinates.
print "%i/%i - Executing liftOver for file: '%s'" % (count, len(input_file_list), bed_file)
call(['liftOver', bed_file, liftover_chainfile, build19_outfile, error_filename, "-bedPlus=8", "-minMatch=1.0"])
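        # the call above is roughly equivalent to running, on the command line (angle brackets mark placeholders):
        #   liftOver <input.bed> settings_files/hg38ToHg19.over.chain.gz <output_build19.bed> <unmapped.log> -bedPlus=8 -minMatch=1.0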
except Exception as e:
print 'liftOver execution resulted in error:\n%s' %e
raise
count += 1
    # liftOver doesn't recognize the BED detailed format. Although the -bedPlus parameter circumvents this,
    # liftOver still has trouble with space characters in column 14 of the input bed file, and the output
    # bed file ends up replacing spaces with tab characters in that column. I haven't found any solution for this from web-search.
    # As a result, the output files would have a varying number of columns within a file, which makes bedtools error out.
    # The following part fixes this issue and merges those columns back into a single column.
    # Tried using pandas and the csv module, but neither could properly handle the multi-column data without complicating
    # the scripting. Hence, straightforward scripting was done; it is a lot simpler and is not taxing on time/memory either.
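    # Illustrative example of the problem (placeholder text): a column-14 description such as
    # 'Helix Note=example' may come back from liftOver as two tab-separated fields
    # ('Helix' and 'Note=example'); edit_bedfile() re-joins row[13:] with spaces to restore a
    # single 14th column.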
fixed_bed19_outfile = os.path.join(fixed_bed19_outdir, '_'.join( ['hg19_final', base_filename]) )
    if not base_filename.endswith('_proteome.bed'):  # the proteome bed file doesn't have the 14-column structure of the other bed files, so it can be copied to a new file without modification
annotation_type = base_filename.replace('UP000005640_9606_', '')
annotation_type = annotation_type.replace('.bed', '')
        disulfide_bed = base_filename.endswith('_disulfid.bed')
with open(fixed_bed19_outfile, 'w') as edited_bed_handle:
# edited_bed19_data = []
with open(build19_outfile, 'Ur') as bed19_handle:
edit_bedfile(bed19_handle, settings_dict, annotation_type, disulfide_bed, edited_bed_handle)
        # for the 'natural variant' bed file, disease acronyms in column 14 need to be replaced with full names
if base_filename.endswith('_variant.bed'):
natural_variant_bed = True
fixed_bed19_outfile = replace_acronyms_in_natural_variant(fixed_bed19_outfile, diseases_acronym_file='settings_files/diseases-all.tab') # replaces acronym with full name and writes them in a temporary file
else:
natural_variant_bed = False
# for annotations that are labelled as type 1 in settings file
# append into a single bed file for annotations of interest
append_to_mergefile(merged_bedfile_type1, fixed_bed19_outfile, settings_dict, annotation_type, annotation_value='1')
        # for annotations that are labelled as type 0 in settings file,
        # append them into a separate merged bed file
append_to_mergefile(merged_bedfile_type0, fixed_bed19_outfile, settings_dict, annotation_type, annotation_value='0')
# for 'natural variant' bed file, delete the temporary bed file which had disease acronym replaced with full name
if natural_variant_bed:
os.remove(fixed_bed19_outfile)
    else:  # the proteome bed file doesn't have the 14-column structure of the other bed files, so it can be copied to a new file without modification
copyfile(build19_outfile, fixed_bed19_outfile)
# sorts the contents of merged bed file
title_line = '#chrom\tg_start\tg_end\tuniprot_id\tannotation_type\tstrand\tthick_start\tthick_end\trgb\tno_of_blocks\tsize_of_blocks\tstart_of_blocks\tannotation_identifier\tdescription\n'
for filename in [merged_type0_filename, merged_type1_filename]:
bed_data = BedTool(filename)
bed_data.sort().saveas(filename)
    # Bedtools' sorting removes the title line; the following writes the title back as the first line of the file
    with open(filename, 'r') as original:
        data = original.read()
    with open(filename, 'w') as modified:
        modified.write(title_line + data)
# Zips the hg19-converted bed files and puts the archive into the download directory
dir_name = os.path.join(input_bedfiles_dir, 'convert2hg19/hg19_format_fixed')
zipped_filename = '%s/hg19_UniProt_genome_annotations_%s' % (download_dir, release_ver)
shutil.make_archive(zipped_filename, 'zip', dir_name)
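# Assumed usage (not stated in the original script): run under Python 2 from the repository root,
# with the UCSC 'liftOver' binary available on PATH and pybedtools installed, e.g.:
#   python 1_liftover_hg38_to_hg19.py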