-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathrename_orthogroup_genes.py
executable file
·111 lines (95 loc) · 3.59 KB
/
rename_orthogroup_genes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
#!/usr/bin/python
'''
Orthology analysis was performed on Vd gene models prior to being submitted to ncbi
and subsequently renamed. This script renames the genes present in each
orthogroup based upon a conversion table generated as part of the genome submission
script.
'''
#-----------------------------------------------------
# Step 1
# Import variables & load input files
#-----------------------------------------------------
import sys
import argparse
import re
from sets import Set
from collections import defaultdict
from operator import itemgetter
ap = argparse.ArgumentParser()
ap.add_argument('--orthogroups',required=True,type=str,help='A fasta file of the assembled contigs')
ap.add_argument('--tsv_12008',required=True,type=str,help='A table of gene conversions')
ap.add_argument('--tsv_51',required=True,type=str,help='A table of gene conversions')
ap.add_argument('--tsv_53',required=True,type=str,help='A table of gene conversions')
ap.add_argument('--tsv_58',required=True,type=str,help='A table of gene conversions')
ap.add_argument('--tsv_61',required=True,type=str,help='A table of gene conversions')
conf = ap.parse_args()
with open(conf.orthogroups) as f:
orthogroup_lines = f.readlines()
with open(conf.tsv_12008) as f:
tsv_12008 = f.readlines()
with open(conf.tsv_51) as f:
tsv_51 = f.readlines()
with open(conf.tsv_53) as f:
tsv_53 = f.readlines()
with open(conf.tsv_58) as f:
tsv_58 = f.readlines()
with open(conf.tsv_61) as f:
tsv_61 = f.readlines()
#-----------------------------------------------------
# Step 2
# build dictionaries for renaming genes
#-----------------------------------------------------
conversion_dict = defaultdict(list)
for line in tsv_12008:
line = line.rstrip()
split_line = line.split()
old_name = split_line[0]
old_name = re.sub('^.+?_', '', old_name, 1) # replaces first occurence
new_name = split_line[1]
new_name = re.sub('^.+?_', '', new_name, 1)
new_name = re.sub('^vAg', 'g', new_name)
conversion_dict["12008|" + old_name] = "12008|" + new_name
for line in tsv_51:
line = line.rstrip()
split_line = line.split()
old_name = split_line[0]
old_name = re.sub('^.+?_', '', old_name, 1)
new_name = split_line[1]
new_name = re.sub('^.+?_', '', new_name, 1)
conversion_dict["51|" + old_name] = "12251|" + new_name
for line in tsv_53:
line = line.rstrip()
split_line = line.split()
old_name = split_line[0]
old_name = re.sub('^.+?_', '', old_name, 1)
new_name = split_line[1]
new_name = re.sub('^.+?_', '', new_name, 1)
conversion_dict["53|" + old_name] = "12253|" + new_name
for line in tsv_58:
line = line.rstrip()
split_line = line.split()
old_name = split_line[0]
old_name = re.sub('^.+?_', '', old_name, 1)
new_name = split_line[1]
new_name = re.sub('^.+?_', '', new_name, 1)
conversion_dict["58|" + old_name] = "12158|" + new_name
for line in tsv_61:
line = line.rstrip()
split_line = line.split()
old_name = split_line[0]
old_name = re.sub('^.+?_', '', old_name, 1)
new_name = split_line[1]
new_name = re.sub('^.+?_', '', new_name, 1)
conversion_dict["61|" + old_name] = "12161|" + new_name
for line in orthogroup_lines:
line = line.rstrip()
split_line = line.split()
parsed_line = []
for gene in split_line:
split_gene = gene.split('.') # transcript id not in dictionary g1.t1
if conversion_dict[split_gene[0]]:
split_gene[0] = conversion_dict[split_gene[0]]
gene = ".".join(split_gene)
# print gene
parsed_line.append(gene)
print "\t".join(parsed_line)