-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathunassigned_to_table.py
executable file
·137 lines (110 loc) · 5.22 KB
/
unassigned_to_table.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#!/usr/bin/env python3
import os, sys, re
import json
import logging as L
# Take the Stats.json and produce a basic text table of the unassigned
# barcodes. This code was originally in multiqc_edgen/modules/edgen_unassigned/edgen_unassigned.py
# but it got a little complex so I'm breaking it out into the pipeline proper.
# Note that if there are no barcodes in the run or the Stats.json contains multiple lanes the
# output will be empty and a warning will be logged on STDERR.
def format_lines(json_data, maxlines=None, commentor=lambda *ub: []):
"""Given the content of a Stats.json file as a Python object,
return a list of lines that we can embed into the MultiQC report.
"""
ub = json_data.get("UnknownBarcodes")
if not ub:
L.warning("No unknown barcodes in this JSON data.")
return []
if len(ub) != 1:
# TODO - check I got this right
L.warning("JSON data contains info for more than one lane.")
return []
# There is one lane. Good.
ub_codes = ub[0]["Barcodes"]
# Now we have a dict. In the original files the list is sorted by count but this will
# be lost, so re-sort. Also we can lose the overspill.
ub_sorted = sorted(ub_codes.items(), key=lambda i: int(i[1]), reverse=True)[:maxlines]
# Add comment to the end of each line. This incolves converting the tuples to lists.
ub_sorted = [ list(ub) + commentor(*ub) for ub in ub_sorted ]
# I had this calculate the right widths and format into columns, but Matt says he
# copy-pastes the list into Excel and needs literal tab characters. OK.
return [ "\t".join(map(str,i)) for i in ub_sorted ]
def make_revcomp_commentor(sdict):
"""Return a commentor function that looks for likely reverse-complement
issues. Note we don't look for reversed or swapped codes or anything
other than revcomp.
"""
# Sanitize all the names in sdict, while also ensuring that changes to
# the original dict cannot alter the function output.
newdict = { k: v.split('__')[-1] for k, v in sdict.items() }
# We only care about the barcode not the count, so define the fuction like so:
def comm_func(bc, *_):
# Simple case
if bc in newdict:
return ["is " + newdict[bc]]
# The bc may or may not have a '+'
bc_split = bc.split('+')
if len(bc_split) > 2:
# I could make this work for any number of indices but it would just
# make the code ugly and would never do anything useful.
return []
elif len(bc_split) == 1:
if revcomp(bc) in newdict:
return ["revcomp of " + newdict[revcomp(bc)]]
else:
# No match, no message
return []
else:
# Here we need a list of possible matches - see the unit tests
poss_matches = [ ('idx1',
newdict.get(revcomp(bc_split[0]) + "+" + bc_split[1])),
('idx2',
newdict.get(bc_split[0] + "+" + revcomp(bc_split[1]))),
('idx1+2',
newdict.get(revcomp(bc_split[0]) + "+" + revcomp(bc_split[1]))) ]
# String-ify. For the no-match case we need to return an empty list, not an empty
# string.
res = "; ".join([ "revcomp {} of {}".format(*pm) for pm in poss_matches
if pm[1] ])
return [res] if res else []
return comm_func
def revcomp(seq, rep_table=str.maketrans('ATCGatcg', 'TAGCtagc')):
"""The classic!
"""
return seq.translate(rep_table)[::-1]
def get_samples_list(json_data):
"""This is implemented elsewhere, but I'll re-implement it here. List
the samples from the JSON, in the form of as dict of {barcode: sample}
"""
con_res = json_data.get('ConversionResults')
# We expect one of these, and then there should be a DemuxResults list that is one
# per sample.
if not con_res or len(con_res) != 1:
L.warning("Cannot get table of sample barcodes from JSON data")
return {}
dem_res = con_res[0]["DemuxResults"]
if not dem_res:
L.warning("Table of sample barcodes from JSON data is empty")
res = {}
for sample in dem_res:
sample_name = sample.get("SampleId") or sample.get("SampleName") or 'unnamed'
# In Stats.json a sample may have multiple index sequences. Not sure if bcl2fastq actually supports
# this but we have a list in any case.
for sample_index in sample.get("IndexMetrics", []):
res[sample_index["IndexSequence"]] = sample_name
return res
def main(args):
if not len(args) == 1:
exit("Usage: unassigned_to_table.py <Stats.json>")
with open(args[0]) as sfh:
json_data = json.load(sfh)
# Commentor needs to know the expected samples
expected_samples = get_samples_list(json_data)
commentor = make_revcomp_commentor(expected_samples)
out_lines = format_lines(json_data, maxlines=200, commentor=commentor)
for l in out_lines:
print(l)
if __name__ == '__main__':
# Logging in use for the benefit of unit tests
L.basicConfig(format='{message:s}', level=L.INFO, style='{')
main(sys.argv[1:])