-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathBCL2FASTQCleanup.py
executable file
·197 lines (160 loc) · 7.61 KB
/
BCL2FASTQCleanup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
#!/usr/bin/env python3
import os, sys, re
from glob import glob
from shutil import rmtree
import time
# Regex fragments naming the top-level directories that are treated as projects.
# The shape of a project number is baked in here, so if the numbering scheme
# changes then this list has to change with it.
# main() also appends every entry found in 'projects_ready.txt', but a project
# that needs cleaning may well be absent from that file, hence these defaults.
PPATTERNS = [
    '[0-9]+',
    'ControlLane',
]
def main(output_dir, *lanes):
    """Usage: BCL2FASTQCleanup.py <output_dir> <lanes_list>

    Given an output folder, clean up old FASTQ files ready for re-demultiplexing.
    See design criteria at:
      https://www.wiki.ed.ac.uk/pages/viewpage.action?pageId=319660973
    Also see the unit tests (as always)

    Steps, all recorded by appending to <output_dir>/cleanup.log:
      1. Validate the lanes (each must be a single digit 1-8).
      2. Add the names listed in projects_ready.txt (if readable) to PPATTERNS.
      3. Remove demultiplexing/lane<N> via delete_d_dirs().
      4. Remove post-processed FASTQ via delete_p_fastq().
      5. Remove QC/lane<N> and QC/multiqc_report_lane<N>* for the given lanes.
      6. Append every affected project to projects_pending.txt.

    Args:
        output_dir: Output folder to clean. It must already exist, since the
            log file is opened inside it.
        *lanes: Lane numbers as strings, e.g. '1', '2'.

    Raises:
        SystemExit: if no lanes are given or a lane is not valid.
        OSError: if the log cannot be opened or a deletion fails.
    """
    # Local import: the module-level 'from glob import glob' only exposes glob().
    from glob import escape as glob_escape

    output_dir = os.path.abspath(output_dir)

    # Open the log. This will barf on non-existent output_dir.
    with open(os.path.join(output_dir, 'cleanup.log'), 'a') as log_fh:
        def log(m):
            print(m, file=log_fh)

        def die(m):
            # sys.exit rather than the bare exit() helper, which is only
            # injected by the site module and is not guaranteed to exist.
            log("# ERROR: %s" % m)
            sys.exit(m)

        log("# %s" % sys.argv[0])
        log("# cleaning lanes %s in %s on %s" % (
            lanes,
            output_dir,
            time.strftime('%Y-%m-%d %H:%M', time.localtime())))

        # Sanity checking...
        if not lanes:
            die("No lanes specified to process.")
        valid_lanes = tuple("12345678")
        for lane in lanes:
            if lane not in valid_lanes:
                die("%s is not a valid lane." % lane)

        # See if we have projects_ready.txt. This is best-effort: the file is
        # allowed to be missing or unreadable.
        try:
            with open(os.path.join(output_dir, 'projects_ready.txt')) as fh:
                old_pr = [line.rstrip('\n') for line in fh]
        except (OSError, UnicodeError):
            log("# Failed to read projects_ready.txt")
        else:
            log("# Lines in projects_ready.txt: %s" % repr(old_pr))
            # PPATTERNS is module-global, so skip blank lines and anything
            # already present to stop it growing on repeated calls.
            for name in old_pr:
                escaped = re.escape(name)
                if name and escaped not in PPATTERNS:
                    PPATTERNS.append(escaped)

        # Collector for projects removed
        projects = set()
        lanes = set(lanes)

        try:
            # Deleting the FASTQ that hasn't been renamed
            projects.update(delete_d_dirs(os.path.join(output_dir, 'demultiplexing'), lanes, log=log))

            # Deleting the FASTQ that is already post-processed.
            # md5sums and counts are removed by the 'otherdirs' option passed to delete_fastq.
            projects.update(delete_p_fastq(output_dir, lanes, log=log))

            # Scrub the QC too, as stale info in the MultiQC reports would be bad.
            # This is done in-process rather than through a shell, so an
            # output_dir containing spaces or shell metacharacters is handled
            # correctly and a failed deletion raises instead of being ignored.
            lane_chars = ''.join(sorted(lanes))
            qc_base = os.path.join(glob_escape(output_dir), 'QC')
            qc_patterns = ('lane[%s]' % lane_chars,
                           'multiqc_report_lane[%s]*' % lane_chars)
            for qc_pattern in qc_patterns:
                for victim in glob(os.path.join(qc_base, qc_pattern)):
                    # rmtree refuses symlinks, so unlink those like plain files.
                    if os.path.isdir(victim) and not os.path.islink(victim):
                        rmtree(victim)
                    else:
                        os.remove(victim)
            # Log the equivalent shell command, in the same form as before.
            log("rm -rf {od}/QC/lane[{l}] {od}/QC/multiqc_report_lane[{l}]*".format(
                l=lane_chars, od=output_dir))

            # Put anything I deleted into projects_pending.txt
            with open(os.path.join(output_dir, 'projects_pending.txt'), 'a') as pp_fh:
                for p in projects:
                    print(p, file=pp_fh)

            log("# DONE: %s projects added to projects_pending.txt" % len(projects))
        except BaseException as e:
            # Trap BaseException so we log death-by-SIGINT
            log("# EXCEPTION: %s" % e)
            raise
def delete_p_fastq(path, lanes, **kwargs):
    r"""Remove post-processed FASTQ files for the given lanes.

    File names are matched against
      ^[0-9]{6}_[^_]+_[0-9]+_[^_]+_(.)_[^_]+_(?:[0-9]|UMI)\.fastq\.gz
    where the single captured character is the lane number. The 'md5sums'
    and 'counts' directories are passed to delete_fastq() as otherdirs.

    Any extra keyword arguments (e.g. log) are forwarded to delete_fastq(),
    and its return value (the projects impacted) is returned unchanged.
    """
    top_level_fastq = re.compile(
        r'^[0-9]{6}_[^_]+_[0-9]+_[^_]+_(.)_[^_]+_(?:[0-9]|UMI)\.fastq\.gz')
    ancillary_dirs = ('md5sums', 'counts')

    return delete_fastq(path, lanes, top_level_fastq,
                        otherdirs=ancillary_dirs,
                        **kwargs)
def delete_d_fastq(path, lanes, **kwargs):
    r"""Remove FASTQ files from the demultiplexing area for the given lanes.

    File names are matched against _L00(.)_.._\d\d\d\.fastq\.gz$ where the
    single captured character is the lane number.

    Any extra keyword arguments (e.g. log) are forwarded to delete_fastq(),
    and its return value (the projects impacted) is returned unchanged.
    """
    demux_fastq = re.compile(r'_L00(.)_.._\d\d\d\.fastq\.gz$')

    return delete_fastq(path, lanes, demux_fastq, **kwargs)
def delete_d_dirs(path, lanes, log=lambda x: None):
    """Remove whole per-lane directories (path/lane<N>) rather than pruning
       individual FASTQ files as delete_d_fastq does.

       Entries directly inside a lane directory whose names start with a digit
       are taken to be projects. The set of those names is returned.
       Lanes with no directory under path are skipped.
    """
    affected = set()
    removed = []

    for lane_id in lanes:
        target = os.path.join(path, "lane%s" % lane_id)

        if os.path.exists(target):
            # Note the projects before the directory disappears.
            for entry in glob(os.path.join(target, '[0-9]*')):
                affected.add(os.path.basename(entry))

            # Delete whole directory. If bcl2fastq completed this is just the logs.
            log("rm -r '%s'" % target)
            rmtree(target)
            removed.append(target)

    summary = "Deleted %i directories complete with files relating to %i projects." % (
        len(removed), len(affected))
    log('# ' + summary)

    return affected
def delete_fastq(path, lanes, match_pattern, log=lambda x: None, otherdirs=()):
    """Generic file deleter given a path and a pattern.

    Walks path and deletes every file whose name matches match_pattern and
    whose first capture group (the lane) is in lanes. At the top level only
    directories whose whole name matches one of PPATTERNS are entered.
    Then, for each name in otherdirs, files with the same relative path and
    stem as a deleted file (but any extension) are deleted too. Finally any
    directories left empty under the affected projects are removed.

    Args:
        path: Directory to walk.
        lanes: Container of lane identifiers, tested with 'in'.
        match_pattern: Regex (string or compiled) applied with re.search to
            each file name; group 1 must capture the lane.
        log: Callable taking one message string. Defaults to a no-op.
        otherdirs: Names of directories under path holding supplementary files.

    Returns:
        The set of project names (first path component below path) for which
        at least one file was deleted.

    Raises:
        OSError: if a matched file cannot be removed.
    """
    # Local import: the module-level 'from glob import glob' only exposes glob().
    from glob import escape as glob_escape

    # Compile once up front instead of rebuilding '^...$' for every directory.
    # fullmatch() anchors the whole name.
    project_res = [re.compile(p) for p in PPATTERNS]

    projects = set()
    deletions = []
    od_deletions = 0
    emptydirs = 0

    for root, dirs, files in os.walk(path):
        # At the top level, only descend into directories that are numbers (ie. projects),
        # or 'ControlLane' as a special case.
        # We expect to see the unassigned reads at this level
        if root == path:
            dirs[:] = [d for d in dirs
                       if any(p.fullmatch(d) for p in project_res)]

        rel_root = root[len(path):]
        for f in files:
            mo = re.search(match_pattern, f)
            if not (mo and mo.group(1) in lanes):
                continue

            # Pull out the project from the path (unassigned files have no project, of course!)
            proj = rel_root.strip(os.path.sep).split(os.path.sep)[0]
            if proj:
                projects.add(proj)

            victim = os.path.join(root, f)
            os.remove(victim)
            log("rm '{}'".format(victim))
            deletions.append(os.path.join(rel_root, f))

    # Deal with otherdirs - ie places where supplementary files lurk.
    # We're looking for files with a matching name, but a different extension.
    for od in otherdirs:
        for rel in deletions:
            # Take the stem from the file name alone, so a '.' in a directory
            # name cannot truncate the prefix we glob for.
            rel_dir, fname = os.path.split(rel.lstrip(os.path.sep))
            stem = fname.split('.')[0]
            # Escape the literal part so glob metacharacters in real paths are
            # matched verbatim.
            od_pattern = glob_escape(os.path.join(path, od, rel_dir, stem)) + '.*'
            for odf in glob(od_pattern):
                os.remove(odf)
                log("rm '{}'".format(odf))
                od_deletions += 1

    # Now remove empty directories. We only want to look at those in projects, or in project
    # dirs within otherdirs. Walking bottom-up lets nested empty directories collapse.
    for od in ['.'] + list(otherdirs):
        for proj in projects:
            for root, _dirs, _files in os.walk(os.path.join(path, od, proj), topdown=False):
                try:
                    os.rmdir(root)
                except OSError:
                    # Assume it was non-empty; leave it in place.
                    continue
                log("rmdir '%s'" % root)
                emptydirs += 1

    msg = "Deleted {} fastq files and {} ancillary files and {} directories from {} relating to {} projects.".format(
        len(deletions), od_deletions, emptydirs, os.path.basename(path), len(projects))
    log('# ' + msg)

    return projects
if __name__ == '__main__':
    # Echo the full invocation to stdout, then hand everything after the
    # script name to main() as <output_dir> followed by the lanes.
    cli_args = sys.argv
    print("Running " + ' '.join(cli_args))
    main(*cli_args[1:])