-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwhich_genome_2.py
executable file
·114 lines (103 loc) · 4.05 KB
/
which_genome_2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#!/bin/env python
import os
import sys
import gzip
ref_dir = os.path.realpath(os.path.dirname(__file__))
def read_genes(f, idx=0, attribute="gene_id"):
result = []
sep = "\t"
if f.endswith(".tsv"):
with open(f, "rt") as csv_file:
for line in csv_file:
value = line.strip().split(sep)[idx]
if value and value[0] == '"' and value[-1] == '"':
value = value[1:-1].replace('""', '"')
if value:
result.append(value)
elif f.endswith(".gz"):
with gzip.open(f, "rt") as gz_file:
for line in gz_file:
if line.startswith("#"):
continue # Skip comment lines
fields = line.strip().split(sep)
if fields[2] == "gene": # Check if 'Feature' is 'gene'
gene_attribute = None
for attr in fields[8].split(';'):
key, val = attr.strip().split(" ")
if key == attribute:
gene_attribute = val.strip('"')
break
if gene_attribute:
result.append(gene_attribute)
else:
with open(f, "rt") as txt_file:
for line in txt_file:
if line.startswith("#"):
continue # Skip comment lines
fields = line.strip().split(sep)
if fields[2] == "gene": # Check if 'Feature' is 'gene'
gene_attribute = None
for attr in fields[8].split(';'):
key, val = attr.strip().split(" ")
if key == attribute:
gene_attribute = val.strip('"')
break
if gene_attribute:
result.append(gene_attribute)
return result
def which_build(features, verbose=False):
result = {}
min_genes = None
min_answer = None
given_genes = read_genes(features)
is_ensembl_ids = given_genes[0].startswith("ENS")
ref_idx = 0
if not is_ensembl_ids:
ref_idx = 1
attribute = 'gene_name'
for d in os.listdir(ref_dir):
dir_path = os.path.join(ref_dir, d)
if not os.path.isdir(dir_path):
continue
ref_genes_path = os.path.join(dir_path, "genes.gtf.gz")
if not os.path.exists(ref_genes_path):
continue
ref_genes = read_genes(ref_genes_path, idx=ref_idx,attribute=attribute)
result[d] = set(given_genes) - set(ref_genes)
if min_genes is None:
min_genes = len(result[d])
min_answer = [d]
elif len(result[d]) < min_genes:
min_genes = len(result[d])
min_answer = [d]
elif len(result[d]) == min_genes:
min_answer.append(d)
if min_genes == 0:
if len(min_answer) == 1:
print(f"All genes in {features} come from build {min_answer[0]}")
else:
print(f"All genes in {features} come from builds {', '.join(min_answer)}")
else:
print(f"No build provides full set of genes for {features}. Here's the number of missing genes per build:")
for build, genes in sorted(result.items(), key=lambda x: len(x[1])):
print(f" {build}: {len(genes)}")
if verbose:
for build, genes in sorted(result.items(), key=lambda x: len(x[1])):
print(f"Genes absent in build {build}:")
genes = list(genes)
for i in range(-(len(genes) // -10)): # this is math.ceil equivalent
print(f" {' '.join(genes[i * 10:(i + 1) * 10])}")
break
if __name__ == "__main__":
if len(sys.argv) < 2:
print("Usage: which_genome.py [-v] path/to/cellbrowser/features.tsv.gz")
sys.exit(1)
args = []
verbose = False
for i in sys.argv[1:]:
if i == "-v":
verbose = True
else:
args.append(i)
print(ref_dir)
which_build(*args, verbose=verbose)