-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy path01_get_locus_ortholog_part1.py
executable file
·140 lines (100 loc) · 3.63 KB
/
01_get_locus_ortholog_part1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
#!/usr/bin/python
#MINIMUM_LENGTH = 100
############################
##### DEF1 : Get Pairs #####
############################
def get_pairs(fasta_file_path):
F2 = open(fasta_file_path, "r")
list_pairwises = []
while 1:
next2 = F2.readline()
if not next2:
break
if next2[0] == ">":
fasta_name_query = next2[:-1]
Sn = string.split(fasta_name_query, "||")
fasta_name_query = Sn[0]
next3 = F2.readline()
fasta_seq_query = next3[:-1]
next3 = F2.readline() ## jump one empty line (if any after the sequence)
#next3 = F2.readline()
fasta_name_match = next3[:-1]
Sn = string.split(fasta_name_match, "||")
fasta_name_match = Sn[0]
next3 = F2.readline()
fasta_seq_match = next3[:-1]
pairwise = [fasta_name_query,fasta_seq_query,fasta_name_match,fasta_seq_match]
## ADD pairwise with condition
list_pairwises.append(pairwise)
F2.close()
#print list_pairwises
return(list_pairwises)
##############################################
#################################
##### DEF2 : Extract length #####
#################################
def extract_length(length_string): # format length string = 57...902
l3 = string.split(length_string, "...")
n1 = string.atoi(l3[0])
n2 = string.atoi(l3[1])
length = n2-n1
return(length)
##############################################
#######################
##### RUN RUN RUN #####
#######################
import string, os, time, re, sys, pickle
PATH_IN = "../../tmp/02_ParwiseRBH_tblastx/"
L1 = os.listdir(PATH_IN)
print L1
PATH_OUT = "04_LOCUS"
## 1 ## Get list locus
list_LOCUS=[]
for subfolder in L1:
print subfolder
fileIN = "%s/%s/19_ReciprocalBestHits.fasta" %(PATH_IN, subfolder)
list_pairwises = get_pairs(fileIN) ### DEF1 ###
print "\t%s" %(list_pairwises[0][0])
print "\t%s" %(list_pairwises[0][2])
jj=0
for pair in list_pairwises:
name1 = pair[0]
name2 = pair[2]
m=0
## If one of the 2 names yet present ==> complete the locus
for locus in list_LOCUS:
if name1 in locus or name2 in locus:
locus.append(name1)
locus.append(name2)
m=1
## If no names was yet present in locus ==> create the locus
if m==0:
new_locus = [name1, name2]
list_LOCUS.append(new_locus)
## Compteur
jj = jj+1
if jj%1000 == 0:
print "\t%d" %jj
## 2 ## Remove redondancy in list_LOCUS : Several locus in the list_locus, should be the same (but not in the same order), depending in which order they were created, we should remove them
list_short_LOCUS = []
for locus in list_LOCUS:
index = list_LOCUS.index(locus)
short_locus = []
## Remove redondant sequence name
for seq in locus:
if seq not in short_locus:
short_locus.append(seq)
## Sort the list of sequence name (by alphabetical order)
short_locus.sort()
## Add the locus to the new list (list_short_LOCUS) if not yet present (avoid redondant locus)
if short_locus not in list_short_LOCUS:
list_short_LOCUS.append(short_locus)
list_LOCUS = list_short_LOCUS
## 3 ## Control list locus length
ln = len(list_LOCUS)
print "\nNumber of locus = %d\n" %ln
## 4 ## Backup list locus with PICKLE
backup_list_LOCUS = open("02_backup_list_LOCUS", "w")
pickle.dump(list_LOCUS, backup_list_LOCUS)
backup_list_LOCUS.close()
print "\n\nLIST LOCI = %s" %list_LOCUS