Introducing sample rate from minknow and catching sub_read issue.

LooseLab · Apr 20, 2024 · a4d8b7e · a4d8b7e
1 parent 6bb34e4
commit a4d8b7e
Show file tree

Hide file tree

Showing 4 changed files with 34 additions and 8 deletions.
diff --git a/src/readfish/entry_points/stats.py b/src/readfish/entry_points/stats.py
@@ -73,6 +73,7 @@
     readfish stats --toml tests/static/stats_test/yeast_summary_test.toml --fastq-directory tests/static/stats_test/ --html summary_adaptive
 
 """
+
 from __future__ import annotations
 import argparse
 from pathlib import Path

diff --git a/src/readfish/entry_points/targets.py b/src/readfish/entry_points/targets.py
@@ -66,6 +66,7 @@
 
 
 """
+
 # Core imports
 from __future__ import annotations
 import argparse
@@ -171,16 +172,18 @@ def __init__(
         self.break_reads_after_seconds = (
             self.client.connection.analysis_configuration.get_analysis_configuration().read_detection.break_reads_after_seconds.value
         )
+        self.sample_rate = self.client.connection.device.get_sample_rate().sample_rate
         self.logger.info("Run Configuration Received")
         self.logger.info(f"run_id={self.run_information.run_id}")
         self.logger.info(f"break_reads_after_seconds={self.break_reads_after_seconds}")
+        self.logger.info(f"sample_rate={self.sample_rate}")
         # Create our statistics tracker
         self.loop_statistics = ReadfishStatistics(
             read_log_name, self.break_reads_after_seconds
         )
         logger.info("Initialising Caller")
         self.caller: CallerABC = self.conf.caller_settings.load_object(
-            "Caller", run_information=self.run_information
+            "Caller", run_information=self.run_information, sample_rate=self.sample_rate
         )
         logger.info("Caller initialised")
         caller_description = self.caller.describe()

diff --git a/src/readfish/entry_points/unblock_all.py b/src/readfish/entry_points/unblock_all.py
@@ -12,6 +12,7 @@
 
    readfish unblock-all --device X3 --experiment-name "test unblock all"
 """
+
 from tempfile import NamedTemporaryFile
 
 from readfish._cli_args import DEVICE_BASE_ARGS

diff --git a/src/readfish/plugins/dorado.py b/src/readfish/plugins/dorado.py
@@ -14,6 +14,7 @@
 import numpy as np
 import numpy.typing as npt
 from minknow_api.protocol_pb2 import ProtocolRunInfo
+from minknow_api.device_pb2 import GetSampleRateResponse
 
 try:
     from pybasecall_client_lib.helper_functions import package_read
@@ -55,12 +56,20 @@ def __getitem__(self, _):
 
 class Caller(CallerABC):
     def __init__(
-        self, run_information: ProtocolRunInfo = None, debug_log=None, **kwargs
+        self,
+        run_information: ProtocolRunInfo = None,
+        sample_rate: GetSampleRateResponse = None,
+        debug_log=None,
+        **kwargs,
     ):
         self.logger = setup_logger("readfish_dorado_logger", log_file=debug_log)
         self.supported_barcode_kits = None
         self.supported_basecall_models = None
         self.run_information = run_information
+        if sample_rate:
+            self.sample_rate = float(sample_rate)
+        else:
+            self.sample_rate = float(5000)
 
         # Set our own priority
         self.dorado_params = kwargs
@@ -198,21 +207,18 @@ def basecall(
         cache, skipped = {}, {}
         reads_received, reads_sent = 0, 0
         daq_values = _DefaultDAQValues if daq_values is None else daq_values
-
         for channel, read in reads:
             # Attach the "RF-" prefix
             read_id = f"RF-{read.id}"
             t0 = time.time()
             cache[read_id] = (channel, read.number, t0)
-
             success = self.caller.pass_read(
                 package_read(
                     read_id=read_id,
                     raw_data=np.frombuffer(read.raw_data, signal_dtype),
                     daq_offset=daq_values[channel].offset,
                     daq_scaling=daq_values[channel].scaling,
-                    # ToDo: Sample rate needs to be derived from the MinKNOW API
-                    sampling_rate=float(5000),
+                    sampling_rate=self.sample_rate,
                     start_time=int(read.start_sample),
                 )
             )
@@ -237,13 +243,28 @@ def basecall(
                 time.sleep(self.caller.throttle)
                 continue
 
-            for res_batch in results:
+            for res_batch in results: 
                 for res in res_batch:
                     read_id = res["metadata"]["read_id"]
+                    '''
+                    Dorado sometimes returns two sub_tags for a read.
+                    
+                    An example result:
+                    RF-17774e44-8d3c-4bdd-8730-aaf8addd3e6c
+                    {'read_tag': 3605, 'sub_tag': 0, 'priority': <read_priority.high_priority: 0>, 'metadata': {'adapter_front_begin_index': -1, 'adapter_front_foundseq': 'NA', 'adapter_front_foundseq_length': 2, 'adapter_front_id': 'NA', 'adapter_front_refseq': 'NA', 'adapter_front_score': 0.0, 'adapter_mid_end_index': -1, 'adapter_mid_id': 'NA', 'adapter_mid_score': 0.0, 'adapter_rear_end_index': -1, 'adapter_rear_foundseq': 'NA', 'adapter_rear_foundseq_length': 2, 'adapter_rear_id': 'NA', 'adapter_rear_refseq': 'NA', 'adapter_rear_score': 0.0, 'barcode_arrangement': '', 'barcode_front_begin_index': 0, 'barcode_front_foundseq': 'NA', 'barcode_front_foundseq_length': 2, 'barcode_front_id': 'NA', 'barcode_front_id_inner': 'NA', 'barcode_front_refseq': 'NA', 'barcode_front_score': 0.0, 'barcode_front_score_inner': 0.0, 'barcode_full_arrangement': '', 'barcode_kit': '', 'barcode_mid_front_end_index': 0, 'barcode_mid_front_id': 'NA', 'barcode_mid_front_score': 0.0, 'barcode_mid_rear_end_index': 0, 'barcode_mid_rear_id': 'NA', 'barcode_mid_rear_score': 0.0, 'barcode_rear_end_index': 0, 'barcode_rear_foundseq': 'NA', 'barcode_rear_foundseq_length': 2, 'barcode_rear_id': 'NA', 'barcode_rear_id_inner': 'NA', 'barcode_rear_refseq': 'NA', 'barcode_rear_score': 0.0, 'barcode_rear_score_inner': 0.0, 'barcode_score': 0.0, 'barcode_variant': '', 'basecall_type': 'beam search', 'daq_offset': 0.0, 'daq_scaling': 0.25, 'duration': 8592, 'lamp_barcode_id': 'NA', 'lamp_barcode_score': 0.0, 'lamp_target_id': 'NA', 'lamp_target_score': 0.0, 'mean_qscore': 10.297822952270508, 'med_abs_dev': 0.01067463681101799, 'median': -373.5199890136719, 'model_stride': 6, 'model_version_id': '[email protected]', 'num_events': 1432, 'num_minknow_events': 0, 'primer_front_begin_index': -1, 'primer_front_foundseq': 'NA', 'primer_front_foundseq_length': 2, 'primer_front_id': 'NA', 'primer_front_refseq': 'NA', 'primer_front_score': 0.0, 'primer_rear_end_index': -1, 'primer_rear_foundseq': 'NA', 'primer_rear_foundseq_length': 2, 'primer_rear_id': 'NA', 'primer_rear_refseq': 'NA', 'primer_rear_score': 0.0, 'read_id': 'RF-17774e44-8d3c-4bdd-8730-aaf8addd3e6c', 'sampling_rate': 5000.0, 'scaling_med_abs_dev': 0.01067463681101799, 'scaling_median': -373.5199890136719, 'scaling_version': '', 'sequence_length': 340, 'start_time': 12462912, 'state_size': 0, 'strand_id': '441174ae-e18f-4612-bc46-0c86d301cb4d', 'trim_front': 0, 'trim_rear': 0, 'trimmed_duration': 8592, 'trimmed_events': 1432, 'trimmed_samples': 0}, 'datasets': {'raw_data': array([306, 289, 300, ..., 526, 537, 540], dtype=int16), 'sequence': 'CTACAGTGGTCGTCAATACTATTGCAACTCCAGCCTGGGCAACAGAGTGAGACCCTGTCTCAAAAAAAAACAGAAGGAAGCCAAAGCCCACAAAGACGCTGAGTATTTTACATTCCTTTTTTGTACGAATGCTTCAAAATCTGGTGTGTATCTTACAGTTATAAAATCTCTCAATTTAGCCACCAAATTTTCACCAGAAATGCATGAACTGTGTTCAGATTTCATAAACTTTCAGTTGATAAAGTAGATTCACATACTGAAGGTGTTCCAAACGTACTGGAAGACTTTCTAAAATGGATTTGAACACCAGTTTCCAGGTTTACATTTAAGTTAATTCAAA', 'qstring': '#(&&%(\'\'%&(#"""$\'%%$%$%*,)+()\'(\'\'*+AEHHH.-0410)200BGC4HKJJGGB77<DC3-\'.,,+\'&&%%+*(*1-778:C76+,=A.,-./*)+##9:JSGGGHD6432,5>H(\'(+,1-(,//:3640==6504C><357>4448FAFCIGI64-/9660=12HISOBGKLA@?1/59EGM>:::DEJSOM1D?>=GI11775:930,;;>H9/7122)*0:**-===>B55878&#+"#&\',,?@EIJBCLSM964\'(376528:5/-*)21\'##+-&$"#%+&#&%\'-./49?B=A960@7;ABB@6CIEFDD>H7)&45:DC5158B'}}
+                    RF-17774e44-8d3c-4bdd-8730-aaf8addd3e6c
+                    {'read_tag': 3605, 'sub_tag': 1, 'priority': <read_priority.high_priority: 0>, 'metadata': {'adapter_front_begin_index': -1, 'adapter_front_foundseq': 'NA', 'adapter_front_foundseq_length': 2, 'adapter_front_id': 'NA', 'adapter_front_refseq': 'NA', 'adapter_front_score': 0.0, 'adapter_mid_end_index': -1, 'adapter_mid_id': 'NA', 'adapter_mid_score': 0.0, 'adapter_rear_end_index': -1, 'adapter_rear_foundseq': 'NA', 'adapter_rear_foundseq_length': 2, 'adapter_rear_id': 'NA', 'adapter_rear_refseq': 'NA', 'adapter_rear_score': 0.0, 'barcode_arrangement': '', 'barcode_front_begin_index': 0, 'barcode_front_foundseq': 'NA', 'barcode_front_foundseq_length': 2, 'barcode_front_id': 'NA', 'barcode_front_id_inner': 'NA', 'barcode_front_refseq': 'NA', 'barcode_front_score': 0.0, 'barcode_front_score_inner': 0.0, 'barcode_full_arrangement': '', 'barcode_kit': '', 'barcode_mid_front_end_index': 0, 'barcode_mid_front_id': 'NA', 'barcode_mid_front_score': 0.0, 'barcode_mid_rear_end_index': 0, 'barcode_mid_rear_id': 'NA', 'barcode_mid_rear_score': 0.0, 'barcode_rear_end_index': 0, 'barcode_rear_foundseq': 'NA', 'barcode_rear_foundseq_length': 2, 'barcode_rear_id': 'NA', 'barcode_rear_id_inner': 'NA', 'barcode_rear_refseq': 'NA', 'barcode_rear_score': 0.0, 'barcode_rear_score_inner': 0.0, 'barcode_score': 0.0, 'barcode_variant': '', 'basecall_type': 'beam search', 'daq_offset': 0.0, 'daq_scaling': 0.25, 'duration': 321, 'lamp_barcode_id': 'NA', 'lamp_barcode_score': 0.0, 'lamp_target_id': 'NA', 'lamp_target_score': 0.0, 'mean_qscore': 12.145367622375488, 'med_abs_dev': 0.01067463681101799, 'median': -373.5199890136719, 'model_stride': 6, 'model_version_id': '[email protected]', 'num_events': 53, 'num_minknow_events': 0, 'primer_front_begin_index': -1, 'primer_front_foundseq': 'NA', 'primer_front_foundseq_length': 2, 'primer_front_id': 'NA', 'primer_front_refseq': 'NA', 'primer_front_score': 0.0, 'primer_rear_end_index': -1, 'primer_rear_foundseq': 'NA', 'primer_rear_foundseq_length': 2, 'primer_rear_id': 'NA', 'primer_rear_refseq': 'NA', 'primer_rear_score': 0.0, 'read_id': 'RF-17774e44-8d3c-4bdd-8730-aaf8addd3e6c', 'sampling_rate': 5000.0, 'scaling_med_abs_dev': 0.01067463681101799, 'scaling_median': -373.5199890136719, 'scaling_version': '', 'sequence_length': 16, 'start_time': 12477610, 'state_size': 0, 'strand_id': '3b536c09-3f73-4c59-a332-2a0f6cad4ed9', 'trim_front': 0, 'trim_rear': 0, 'trimmed_duration': 321, 'trimmed_events': 53, 'trimmed_samples': 0}, 'datasets': {'raw_data': array([306, 289, 300, ..., 526, 537, 540], dtype=int16), 'sequence': 'TTTTTGTGGGCTTTTA', 'qstring': "21;;B98'-213,*2&"}}
+                    
+                    The second sub_tag will cause the code to gail as it is not expected.
+                    
+                    Therefore we need to check for the sub_tag and if it is not 0, we need to skip the read.
+                    '''
+                    if res["sub_tag"] > 0:
+                        continue
+
                     try:
                         channel, read_number, time_sent = cache.pop(read_id)
                     except KeyError:
-                        # FIXME: This is resolved in later versions of dorado.
                         channel, read_number, time_sent = skipped.pop(read_id)
                         reads_sent += 1
                     res["metadata"]["read_id"] = read_id[3:]
-Original file line number
+Diff line change
@@ Expand Up / @@ -12,6 +12,7 @@ @@
        readfish unblock-all --device X3 --experiment-name "test unblock all"
     """
     from tempfile import NamedTemporaryFile
     from readfish._cli_args import DEVICE_BASE_ARGS
@@ Expand Down @@