-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathFASTQ_Pairkey_Matcher.py
123 lines (106 loc) · 4.54 KB
/
FASTQ_Pairkey_Matcher.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/usr/bin/python3
from os import listdir
from datetime import datetime
import getpass
import re
import textwrap
import itertools
import argparse
# we allow you to specify the filename for the shell script
# and provides -h and --help messages for free!
parser = argparse.ArgumentParser(description='build shell script')
parser.add_argument('--shellFilename', help='filename for the shell script',
default="junk.sh"
)
parser.add_argument('--debug', help='print debug output',
action="store_true"
)
args = parser.parse_args()
# after the above line args.filename will equal either the specified filename
# or junk.sh if the user didn't specify a filename
# in production we will find the filenames in the filesystem
# files = listdir(".")
# example San_Diego_Honeybee_015_S30_L008_R1_001.fastq
# we are currently working with just a text file of the filenames
files = open("./names.txt").read().split()
# regex to parse the parts of the filename
filePat = re.compile('san_diego_honeybee_(.*)_(.*)_(.*)_(.*)_(.*)', re.IGNORECASE)
# apply the parsing regex and return a dict with info from the parsed filename
def getEntry(f) :
m = filePat.match(f)
# the numbered groups from a regex match are 0 (the whole matched string)
# and then 1 (the 1st paren match), 2 (the 2nd paren match), etc
return {
'filename': m.group(0),
'beeNumber': m.group(1),
'run': m.group(2),
'lane': m.group(3),
'read': m.group(4),
# our pairkey is beenNumber:run:lane, eg "015:S30:L008"
'pairKey': m.group(1) + ":" + m.group(2) + ":" + m.group(3)
}
# get a list of the filenames converted to dicts from getEntry()
elist = list(map(getEntry, files))
# get a list of the keys from the elist
pairKeys = list(map(lambda e: e.get("pairKey"), elist))
print("Creating Pairkey Dictionary")
# this will have one value for each pairkey which will be another dict
# which has an R1 and and R2 key
# eg pkDictionary.get("015:S30:L008") will return a dict with R1 and R2
# values representing the R1 and R2 files for that pairkey
pkDictionary = {}
#get the unique pairKeys
pairKeySet = set(pairKeys)
# we are going to create pair dicts for each pair of R1 and R2 filenames
# with the keys R1 and R2. We will then loop over the getEntry dicts
# putting each file into the appropriate pair dict in the overall pkDictionary
for pk in pairKeySet:
# each entry will be an empty dictionary to start. Later we will
# add R1 and R2 to these dictionaries
# for now pkDictionary.get("015:S30:L008") will return an empty dict
pkDictionary[pk] = {}
# place the entries in the Pairkey dictionary.
for fileInfo in elist:
pk = fileInfo["pairKey"] # get the pairkey for this file
keyDict = pkDictionary[pk] # find the matching pair dict
fileRead = fileInfo["read"] # find R1 or R2 from filename
keyDict[fileRead] = fileInfo # add to pair dict as R1 or R2
# print out the items for debugging/logging if debug flag is used
if ( args.debug ):
for k, v in pkDictionary.items():
print(textwrap.dedent(f"""
-----
R1 is {v.get("R1")}
R2 is {v.get("R2")} """))
# lets validate!
# look for dicts in pkDictionary missing one of the filenames
# and print a warning message
for k, v in pkDictionary.items():
if ( (v.get("R1") == None) or (v.get("R2") == None) ):
print("\n++++++\n")
print("Invalid file pair found:")
print(v)
# output a shell script
# FIXME -- what if junk.sh already exists?
print(f"outputting shell script to {args.shellFilename}")
with open(args.shellFilename, "w") as output:
header = textwrap.dedent(f"""
#/bin/bash
# this script generated by newGen.py
# creation time {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
# created by user {getpass.getuser()}
#
""")
# deleted print header section to facilitate downstream analysis
print(file=output)
for k, v in pkDictionary.items():
r1 = v.get("R1")
r2 = v.get("R2")
if ( (r1 is not None) and (r2 is not None) ):
# the "combine" below is the example command for the shell script
cmd = textwrap.dedent(f"""
# command generated by newGen.py
combine {r1["filename"]} {r2["filename"]}
""")
# deleted the print cmd option in order to facilitate downstream analysis
print( file=output)