-
Notifications
You must be signed in to change notification settings - Fork 35
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Introducing sample rate from minknow and catching sub_read issue.
- Loading branch information
Showing
4 changed files
with
34 additions
and
8 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -14,6 +14,7 @@ | |
import numpy as np | ||
import numpy.typing as npt | ||
from minknow_api.protocol_pb2 import ProtocolRunInfo | ||
from minknow_api.device_pb2 import GetSampleRateResponse | ||
|
||
try: | ||
from pybasecall_client_lib.helper_functions import package_read | ||
|
@@ -55,12 +56,20 @@ def __getitem__(self, _): | |
|
||
class Caller(CallerABC): | ||
def __init__( | ||
self, run_information: ProtocolRunInfo = None, debug_log=None, **kwargs | ||
self, | ||
run_information: ProtocolRunInfo = None, | ||
sample_rate: GetSampleRateResponse = None, | ||
debug_log=None, | ||
**kwargs, | ||
): | ||
self.logger = setup_logger("readfish_dorado_logger", log_file=debug_log) | ||
self.supported_barcode_kits = None | ||
self.supported_basecall_models = None | ||
self.run_information = run_information | ||
if sample_rate: | ||
self.sample_rate = float(sample_rate) | ||
else: | ||
self.sample_rate = float(5000) | ||
|
||
# Set our own priority | ||
self.dorado_params = kwargs | ||
|
@@ -198,21 +207,18 @@ def basecall( | |
cache, skipped = {}, {} | ||
reads_received, reads_sent = 0, 0 | ||
daq_values = _DefaultDAQValues if daq_values is None else daq_values | ||
|
||
for channel, read in reads: | ||
# Attach the "RF-" prefix | ||
read_id = f"RF-{read.id}" | ||
t0 = time.time() | ||
cache[read_id] = (channel, read.number, t0) | ||
|
||
success = self.caller.pass_read( | ||
package_read( | ||
read_id=read_id, | ||
raw_data=np.frombuffer(read.raw_data, signal_dtype), | ||
daq_offset=daq_values[channel].offset, | ||
daq_scaling=daq_values[channel].scaling, | ||
# ToDo: Sample rate needs to be derived from the MinKNOW API | ||
sampling_rate=float(5000), | ||
sampling_rate=self.sample_rate, | ||
start_time=int(read.start_sample), | ||
) | ||
) | ||
|
@@ -237,13 +243,28 @@ def basecall( | |
time.sleep(self.caller.throttle) | ||
continue | ||
|
||
for res_batch in results: | ||
for res_batch in results: | ||
for res in res_batch: | ||
read_id = res["metadata"]["read_id"] | ||
''' | ||
Dorado sometimes returns two sub_tags for a read. | ||
An example result: | ||
RF-17774e44-8d3c-4bdd-8730-aaf8addd3e6c | ||
{'read_tag': 3605, 'sub_tag': 0, 'priority': <read_priority.high_priority: 0>, 'metadata': {'adapter_front_begin_index': -1, 'adapter_front_foundseq': 'NA', 'adapter_front_foundseq_length': 2, 'adapter_front_id': 'NA', 'adapter_front_refseq': 'NA', 'adapter_front_score': 0.0, 'adapter_mid_end_index': -1, 'adapter_mid_id': 'NA', 'adapter_mid_score': 0.0, 'adapter_rear_end_index': -1, 'adapter_rear_foundseq': 'NA', 'adapter_rear_foundseq_length': 2, 'adapter_rear_id': 'NA', 'adapter_rear_refseq': 'NA', 'adapter_rear_score': 0.0, 'barcode_arrangement': '', 'barcode_front_begin_index': 0, 'barcode_front_foundseq': 'NA', 'barcode_front_foundseq_length': 2, 'barcode_front_id': 'NA', 'barcode_front_id_inner': 'NA', 'barcode_front_refseq': 'NA', 'barcode_front_score': 0.0, 'barcode_front_score_inner': 0.0, 'barcode_full_arrangement': '', 'barcode_kit': '', 'barcode_mid_front_end_index': 0, 'barcode_mid_front_id': 'NA', 'barcode_mid_front_score': 0.0, 'barcode_mid_rear_end_index': 0, 'barcode_mid_rear_id': 'NA', 'barcode_mid_rear_score': 0.0, 'barcode_rear_end_index': 0, 'barcode_rear_foundseq': 'NA', 'barcode_rear_foundseq_length': 2, 'barcode_rear_id': 'NA', 'barcode_rear_id_inner': 'NA', 'barcode_rear_refseq': 'NA', 'barcode_rear_score': 0.0, 'barcode_rear_score_inner': 0.0, 'barcode_score': 0.0, 'barcode_variant': '', 'basecall_type': 'beam search', 'daq_offset': 0.0, 'daq_scaling': 0.25, 'duration': 8592, 'lamp_barcode_id': 'NA', 'lamp_barcode_score': 0.0, 'lamp_target_id': 'NA', 'lamp_target_score': 0.0, 'mean_qscore': 10.297822952270508, 'med_abs_dev': 0.01067463681101799, 'median': -373.5199890136719, 'model_stride': 6, 'model_version_id': '[email protected]', 'num_events': 1432, 'num_minknow_events': 0, 'primer_front_begin_index': -1, 'primer_front_foundseq': 'NA', 'primer_front_foundseq_length': 2, 'primer_front_id': 'NA', 'primer_front_refseq': 'NA', 'primer_front_score': 0.0, 'primer_rear_end_index': -1, 'primer_rear_foundseq': 'NA', 'primer_rear_foundseq_length': 2, 'primer_rear_id': 'NA', 'primer_rear_refseq': 'NA', 'primer_rear_score': 0.0, 'read_id': 'RF-17774e44-8d3c-4bdd-8730-aaf8addd3e6c', 'sampling_rate': 5000.0, 'scaling_med_abs_dev': 0.01067463681101799, 'scaling_median': -373.5199890136719, 'scaling_version': '', 'sequence_length': 340, 'start_time': 12462912, 'state_size': 0, 'strand_id': '441174ae-e18f-4612-bc46-0c86d301cb4d', 'trim_front': 0, 'trim_rear': 0, 'trimmed_duration': 8592, 'trimmed_events': 1432, 'trimmed_samples': 0}, 'datasets': {'raw_data': array([306, 289, 300, ..., 526, 537, 540], dtype=int16), 'sequence': 'CTACAGTGGTCGTCAATACTATTGCAACTCCAGCCTGGGCAACAGAGTGAGACCCTGTCTCAAAAAAAAACAGAAGGAAGCCAAAGCCCACAAAGACGCTGAGTATTTTACATTCCTTTTTTGTACGAATGCTTCAAAATCTGGTGTGTATCTTACAGTTATAAAATCTCTCAATTTAGCCACCAAATTTTCACCAGAAATGCATGAACTGTGTTCAGATTTCATAAACTTTCAGTTGATAAAGTAGATTCACATACTGAAGGTGTTCCAAACGTACTGGAAGACTTTCTAAAATGGATTTGAACACCAGTTTCCAGGTTTACATTTAAGTTAATTCAAA', 'qstring': '#(&&%(\'\'%&(#"""$\'%%$%$%*,)+()\'(\'\'*+AEHHH.-0410)200BGC4HKJJGGB77<DC3-\'.,,+\'&&%%+*(*1-778:C76+,=A.,-./*)+##9:JSGGGHD6432,5>H(\'(+,1-(,//:3640==6504C><357>4448FAFCIGI64-/9660=12HISOBGKLA@?1/59EGM>:::DEJSOM1D?>=GI11775:930,;;>H9/7122)*0:**-===>B55878&#+"#&\',,?@EIJBCLSM964\'(376528:5/-*)21\'##+-&$"#%+&#&%\'-./49?B=A960@7;ABB@6CIEFDD>H7)&45:DC5158B'}} | ||
RF-17774e44-8d3c-4bdd-8730-aaf8addd3e6c | ||
{'read_tag': 3605, 'sub_tag': 1, 'priority': <read_priority.high_priority: 0>, 'metadata': {'adapter_front_begin_index': -1, 'adapter_front_foundseq': 'NA', 'adapter_front_foundseq_length': 2, 'adapter_front_id': 'NA', 'adapter_front_refseq': 'NA', 'adapter_front_score': 0.0, 'adapter_mid_end_index': -1, 'adapter_mid_id': 'NA', 'adapter_mid_score': 0.0, 'adapter_rear_end_index': -1, 'adapter_rear_foundseq': 'NA', 'adapter_rear_foundseq_length': 2, 'adapter_rear_id': 'NA', 'adapter_rear_refseq': 'NA', 'adapter_rear_score': 0.0, 'barcode_arrangement': '', 'barcode_front_begin_index': 0, 'barcode_front_foundseq': 'NA', 'barcode_front_foundseq_length': 2, 'barcode_front_id': 'NA', 'barcode_front_id_inner': 'NA', 'barcode_front_refseq': 'NA', 'barcode_front_score': 0.0, 'barcode_front_score_inner': 0.0, 'barcode_full_arrangement': '', 'barcode_kit': '', 'barcode_mid_front_end_index': 0, 'barcode_mid_front_id': 'NA', 'barcode_mid_front_score': 0.0, 'barcode_mid_rear_end_index': 0, 'barcode_mid_rear_id': 'NA', 'barcode_mid_rear_score': 0.0, 'barcode_rear_end_index': 0, 'barcode_rear_foundseq': 'NA', 'barcode_rear_foundseq_length': 2, 'barcode_rear_id': 'NA', 'barcode_rear_id_inner': 'NA', 'barcode_rear_refseq': 'NA', 'barcode_rear_score': 0.0, 'barcode_rear_score_inner': 0.0, 'barcode_score': 0.0, 'barcode_variant': '', 'basecall_type': 'beam search', 'daq_offset': 0.0, 'daq_scaling': 0.25, 'duration': 321, 'lamp_barcode_id': 'NA', 'lamp_barcode_score': 0.0, 'lamp_target_id': 'NA', 'lamp_target_score': 0.0, 'mean_qscore': 12.145367622375488, 'med_abs_dev': 0.01067463681101799, 'median': -373.5199890136719, 'model_stride': 6, 'model_version_id': '[email protected]', 'num_events': 53, 'num_minknow_events': 0, 'primer_front_begin_index': -1, 'primer_front_foundseq': 'NA', 'primer_front_foundseq_length': 2, 'primer_front_id': 'NA', 'primer_front_refseq': 'NA', 'primer_front_score': 0.0, 'primer_rear_end_index': -1, 'primer_rear_foundseq': 'NA', 'primer_rear_foundseq_length': 2, 'primer_rear_id': 'NA', 'primer_rear_refseq': 'NA', 'primer_rear_score': 0.0, 'read_id': 'RF-17774e44-8d3c-4bdd-8730-aaf8addd3e6c', 'sampling_rate': 5000.0, 'scaling_med_abs_dev': 0.01067463681101799, 'scaling_median': -373.5199890136719, 'scaling_version': '', 'sequence_length': 16, 'start_time': 12477610, 'state_size': 0, 'strand_id': '3b536c09-3f73-4c59-a332-2a0f6cad4ed9', 'trim_front': 0, 'trim_rear': 0, 'trimmed_duration': 321, 'trimmed_events': 53, 'trimmed_samples': 0}, 'datasets': {'raw_data': array([306, 289, 300, ..., 526, 537, 540], dtype=int16), 'sequence': 'TTTTTGTGGGCTTTTA', 'qstring': "21;;B98'-213,*2&"}} | ||
The second sub_tag will cause the code to gail as it is not expected. | ||
Therefore we need to check for the sub_tag and if it is not 0, we need to skip the read. | ||
''' | ||
if res["sub_tag"] > 0: | ||
continue | ||
|
||
try: | ||
channel, read_number, time_sent = cache.pop(read_id) | ||
except KeyError: | ||
# FIXME: This is resolved in later versions of dorado. | ||
channel, read_number, time_sent = skipped.pop(read_id) | ||
reads_sent += 1 | ||
res["metadata"]["read_id"] = read_id[3:] | ||
|