-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy path06_1_unInterleave_BigFastaFile_and_ShorteningFastaName.py
executable file
·90 lines (64 loc) · 2.44 KB
/
06_1_unInterleave_BigFastaFile_and_ShorteningFastaName.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
#!/usr/bin/python
## AUTHOR: Eric Fontanillas
## LAST VERSION: 18.05.2011
## DESCRIPTION: Uninterleave the transcript file ("transcripts.fa", i.e. the Oases output): create the file "transcripts.fasta" and shorten each fasta name
###############################
### DEF : FORMAT FASTA NAME ###
###############################
def format_fastaName(fastaName, species=None):
    """Shorten a fasta name by keeping only its value fields.

    The name is split on "_" and the fields at positions 1, 3, 5 and 7 are
    kept, joined with "_" and prefixed with the species tag.

    Example, with the prefix "Ac_":
        Locus_25599_Transcript_1/1_Confidence_1.000_Length_215
        -> Ac_25599_1/1_1.000_215

    Args:
        fastaName: fasta name without the leading ">" and without the
            trailing newline.
        species: prefix to prepend. When None (the default), the
            module-level SPECIES value is used.

    Returns:
        The shortened fasta name.

    Raises:
        IndexError: if the name has fewer than 8 "_"-separated fields.
    """
    # str.split works on Python 2 and 3; the string.split module function
    # was removed in Python 3.
    fields = fastaName.split("_")
    if len(fields) < 8:
        raise IndexError(
            "unexpected fasta name format "
            "(need at least 8 '_'-separated fields): %r" % (fastaName,)
        )
    if species is None:
        species = SPECIES
    # Odd positions 1, 3, 5, 7 hold the values; even positions hold labels
    # such as "Locus" or "Transcript" in the example above.
    return species + "_".join(fields[1:8:2])
###########
## DEF 1 ##
###########
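## Rewrites the interleaved fasta file "path_fileIN" into "path_fileOUT": one shortened fasta name line (see format_fastaName) followed by the whole sequence on a single line. Nothing is stored in a dictionary and gaps are NOT removed in this function.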
def uninterleave_BigFastaFile(path_fileIN, path_fileOUT):
    """Write an un-interleaved copy of a fasta file with shortened names.

    Each record of ``path_fileIN`` (a ">" header line followed by any number
    of sequence lines) is written to ``path_fileOUT`` as exactly two lines:
    ">" plus the name returned by format_fastaName, then the sequence lines
    joined into a single line.

    Progress is reported on stdout: the line count every 10000 input lines,
    and every shortened fasta name as it is read.

    Edge cases:
        * an empty input file produces an empty output file;
        * lines found before the first ">" header belong to no record and
          are ignored;
        * a record without any sequence line is written with an empty
          sequence line.

    Args:
        path_fileIN: path of the interleaved fasta file to read.
        path_fileOUT: path of the fasta file to create (overwritten).

    Returns:
        An empty tuple, kept for backward compatibility.
    """
    record_format = ">%s\n%s\n"
    fasta_name = None   # shortened name of the record being accumulated
    seq_chunks = []     # its sequence lines, newline-stripped

    # "with" guarantees both files are closed even if a name is malformed.
    with open(path_fileIN, "r") as F1, open(path_fileOUT, "w") as F2:
        for k, nextline in enumerate(F1, 1):
            if k % 10000 == 0:
                print("\t%d" % k)

            if nextline.startswith(">"):
                ## 1 ## record the previous sequence, if there is one
                if fasta_name is not None:
                    F2.write(record_format % (fasta_name, "".join(seq_chunks)))
                ## 2 ## start the new record
                seq_chunks = []
                # rstrip (rather than dropping the last character) keeps the
                # full name when the final line has no trailing newline.
                fasta_name = format_fastaName(nextline[1:].rstrip("\r\n"))
                print(fasta_name)
            elif fasta_name is not None:
                # Collect the pieces and join once: avoids rebuilding a
                # growing string for every line of a long sequence.
                seq_chunks.append(nextline.rstrip("\r\n"))

        ## record the last entry
        if fasta_name is not None:
            F2.write(record_format % (fasta_name, "".join(seq_chunks)))

    return ()
#####################################
import string, os, sys
#SPECIES = "Ac_"
# The species prefix is the first command-line argument; format_fastaName
# reads it through the module-level name SPECIES.
if len(sys.argv) < 2:
    # Exit with a usage message instead of a bare IndexError traceback.
    sys.exit("Usage: %s SPECIES_PREFIX   (e.g. \"Ac_\")" % sys.argv[0])
SPECIES = sys.argv[1]  ## format "Ac_"

pathIN = "../tmp"

# Every sub-directory of pathIN is processed, in sorted order
# (presumably one directory per k-mer run -- confirm against the pipeline).
List_onlydirectories = sorted(
    name for name in os.listdir(pathIN)
    if os.path.isdir(os.path.join(pathIN, name))
)

for kmerRun in List_onlydirectories:
    print(kmerRun)
    path_fileIN = "%s/%s/transcripts.fa" % (pathIN, kmerRun)
    path_fileOUT = "%s/%s/transcripts.fasta" % (pathIN, kmerRun)
    uninterleave_BigFastaFile(path_fileIN, path_fileOUT)  ### DEF1 ###