Skip to content

Commit

Permalink
Introducing sample rate from minknow and catching sub_read issue.
Browse files Browse the repository at this point in the history
  • Loading branch information
mattloose committed Apr 20, 2024
1 parent 6bb34e4 commit a4d8b7e
Show file tree
Hide file tree
Showing 4 changed files with 34 additions and 8 deletions.
1 change: 1 addition & 0 deletions src/readfish/entry_points/stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@
readfish stats --toml tests/static/stats_test/yeast_summary_test.toml --fastq-directory tests/static/stats_test/ --html summary_adaptive
"""

from __future__ import annotations
import argparse
from pathlib import Path
Expand Down
5 changes: 4 additions & 1 deletion src/readfish/entry_points/targets.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@
"""

# Core imports
from __future__ import annotations
import argparse
Expand Down Expand Up @@ -171,16 +172,18 @@ def __init__(
self.break_reads_after_seconds = (
self.client.connection.analysis_configuration.get_analysis_configuration().read_detection.break_reads_after_seconds.value
)
self.sample_rate = self.client.connection.device.get_sample_rate().sample_rate
self.logger.info("Run Configuration Received")
self.logger.info(f"run_id={self.run_information.run_id}")
self.logger.info(f"break_reads_after_seconds={self.break_reads_after_seconds}")
self.logger.info(f"sample_rate={self.sample_rate}")
# Create our statistics tracker
self.loop_statistics = ReadfishStatistics(
read_log_name, self.break_reads_after_seconds
)
logger.info("Initialising Caller")
self.caller: CallerABC = self.conf.caller_settings.load_object(
"Caller", run_information=self.run_information
"Caller", run_information=self.run_information, sample_rate=self.sample_rate
)
logger.info("Caller initialised")
caller_description = self.caller.describe()
Expand Down
1 change: 1 addition & 0 deletions src/readfish/entry_points/unblock_all.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
readfish unblock-all --device X3 --experiment-name "test unblock all"
"""

from tempfile import NamedTemporaryFile

from readfish._cli_args import DEVICE_BASE_ARGS
Expand Down
35 changes: 28 additions & 7 deletions src/readfish/plugins/dorado.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import numpy as np
import numpy.typing as npt
from minknow_api.protocol_pb2 import ProtocolRunInfo
from minknow_api.device_pb2 import GetSampleRateResponse

try:
from pybasecall_client_lib.helper_functions import package_read
Expand Down Expand Up @@ -55,12 +56,20 @@ def __getitem__(self, _):

class Caller(CallerABC):
def __init__(
self, run_information: ProtocolRunInfo = None, debug_log=None, **kwargs
self,
run_information: ProtocolRunInfo = None,
sample_rate: GetSampleRateResponse = None,
debug_log=None,
**kwargs,
):
self.logger = setup_logger("readfish_dorado_logger", log_file=debug_log)
self.supported_barcode_kits = None
self.supported_basecall_models = None
self.run_information = run_information
if sample_rate:
self.sample_rate = float(sample_rate)
else:
self.sample_rate = float(5000)

# Set our own priority
self.dorado_params = kwargs
Expand Down Expand Up @@ -198,21 +207,18 @@ def basecall(
cache, skipped = {}, {}
reads_received, reads_sent = 0, 0
daq_values = _DefaultDAQValues if daq_values is None else daq_values

for channel, read in reads:
# Attach the "RF-" prefix
read_id = f"RF-{read.id}"
t0 = time.time()
cache[read_id] = (channel, read.number, t0)

success = self.caller.pass_read(
package_read(
read_id=read_id,
raw_data=np.frombuffer(read.raw_data, signal_dtype),
daq_offset=daq_values[channel].offset,
daq_scaling=daq_values[channel].scaling,
# ToDo: Sample rate needs to be derived from the MinKNOW API
sampling_rate=float(5000),
sampling_rate=self.sample_rate,
start_time=int(read.start_sample),
)
)
Expand All @@ -237,13 +243,28 @@ def basecall(
time.sleep(self.caller.throttle)
continue

for res_batch in results:
for res_batch in results:
for res in res_batch:
read_id = res["metadata"]["read_id"]
'''
Dorado sometimes returns two sub_tags for a read.
An example result:
RF-17774e44-8d3c-4bdd-8730-aaf8addd3e6c
{'read_tag': 3605, 'sub_tag': 0, 'priority': <read_priority.high_priority: 0>, 'metadata': {'adapter_front_begin_index': -1, 'adapter_front_foundseq': 'NA', 'adapter_front_foundseq_length': 2, 'adapter_front_id': 'NA', 'adapter_front_refseq': 'NA', 'adapter_front_score': 0.0, 'adapter_mid_end_index': -1, 'adapter_mid_id': 'NA', 'adapter_mid_score': 0.0, 'adapter_rear_end_index': -1, 'adapter_rear_foundseq': 'NA', 'adapter_rear_foundseq_length': 2, 'adapter_rear_id': 'NA', 'adapter_rear_refseq': 'NA', 'adapter_rear_score': 0.0, 'barcode_arrangement': '', 'barcode_front_begin_index': 0, 'barcode_front_foundseq': 'NA', 'barcode_front_foundseq_length': 2, 'barcode_front_id': 'NA', 'barcode_front_id_inner': 'NA', 'barcode_front_refseq': 'NA', 'barcode_front_score': 0.0, 'barcode_front_score_inner': 0.0, 'barcode_full_arrangement': '', 'barcode_kit': '', 'barcode_mid_front_end_index': 0, 'barcode_mid_front_id': 'NA', 'barcode_mid_front_score': 0.0, 'barcode_mid_rear_end_index': 0, 'barcode_mid_rear_id': 'NA', 'barcode_mid_rear_score': 0.0, 'barcode_rear_end_index': 0, 'barcode_rear_foundseq': 'NA', 'barcode_rear_foundseq_length': 2, 'barcode_rear_id': 'NA', 'barcode_rear_id_inner': 'NA', 'barcode_rear_refseq': 'NA', 'barcode_rear_score': 0.0, 'barcode_rear_score_inner': 0.0, 'barcode_score': 0.0, 'barcode_variant': '', 'basecall_type': 'beam search', 'daq_offset': 0.0, 'daq_scaling': 0.25, 'duration': 8592, 'lamp_barcode_id': 'NA', 'lamp_barcode_score': 0.0, 'lamp_target_id': 'NA', 'lamp_target_score': 0.0, 'mean_qscore': 10.297822952270508, 'med_abs_dev': 0.01067463681101799, 'median': -373.5199890136719, 'model_stride': 6, 'model_version_id': '[email protected]', 'num_events': 1432, 'num_minknow_events': 0, 'primer_front_begin_index': -1, 'primer_front_foundseq': 'NA', 'primer_front_foundseq_length': 2, 'primer_front_id': 'NA', 'primer_front_refseq': 'NA', 'primer_front_score': 0.0, 'primer_rear_end_index': -1, 'primer_rear_foundseq': 'NA', 'primer_rear_foundseq_length': 2, 'primer_rear_id': 'NA', 'primer_rear_refseq': 'NA', 'primer_rear_score': 0.0, 'read_id': 'RF-17774e44-8d3c-4bdd-8730-aaf8addd3e6c', 'sampling_rate': 5000.0, 'scaling_med_abs_dev': 0.01067463681101799, 'scaling_median': -373.5199890136719, 'scaling_version': '', 'sequence_length': 340, 'start_time': 12462912, 'state_size': 0, 'strand_id': '441174ae-e18f-4612-bc46-0c86d301cb4d', 'trim_front': 0, 'trim_rear': 0, 'trimmed_duration': 8592, 'trimmed_events': 1432, 'trimmed_samples': 0}, 'datasets': {'raw_data': array([306, 289, 300, ..., 526, 537, 540], dtype=int16), 'sequence': 'CTACAGTGGTCGTCAATACTATTGCAACTCCAGCCTGGGCAACAGAGTGAGACCCTGTCTCAAAAAAAAACAGAAGGAAGCCAAAGCCCACAAAGACGCTGAGTATTTTACATTCCTTTTTTGTACGAATGCTTCAAAATCTGGTGTGTATCTTACAGTTATAAAATCTCTCAATTTAGCCACCAAATTTTCACCAGAAATGCATGAACTGTGTTCAGATTTCATAAACTTTCAGTTGATAAAGTAGATTCACATACTGAAGGTGTTCCAAACGTACTGGAAGACTTTCTAAAATGGATTTGAACACCAGTTTCCAGGTTTACATTTAAGTTAATTCAAA', 'qstring': '#(&&%(\'\'%&(#"""$\'%%$%$%*,)+()\'(\'\'*+AEHHH.-0410)200BGC4HKJJGGB77<DC3-\'.,,+\'&&%%+*(*1-778:C76+,=A.,-./*)+##9:JSGGGHD6432,5>H(\'(+,1-(,//:3640==6504C><357>4448FAFCIGI64-/9660=12HISOBGKLA@?1/59EGM>:::DEJSOM1D?>=GI11775:930,;;>H9/7122)*0:**-===>B55878&#+"#&\',,?@EIJBCLSM964\'(376528:5/-*)21\'##+-&$"#%+&#&%\'-./49?B=A960@7;ABB@6CIEFDD>H7)&45:DC5158B'}}
RF-17774e44-8d3c-4bdd-8730-aaf8addd3e6c
{'read_tag': 3605, 'sub_tag': 1, 'priority': <read_priority.high_priority: 0>, 'metadata': {'adapter_front_begin_index': -1, 'adapter_front_foundseq': 'NA', 'adapter_front_foundseq_length': 2, 'adapter_front_id': 'NA', 'adapter_front_refseq': 'NA', 'adapter_front_score': 0.0, 'adapter_mid_end_index': -1, 'adapter_mid_id': 'NA', 'adapter_mid_score': 0.0, 'adapter_rear_end_index': -1, 'adapter_rear_foundseq': 'NA', 'adapter_rear_foundseq_length': 2, 'adapter_rear_id': 'NA', 'adapter_rear_refseq': 'NA', 'adapter_rear_score': 0.0, 'barcode_arrangement': '', 'barcode_front_begin_index': 0, 'barcode_front_foundseq': 'NA', 'barcode_front_foundseq_length': 2, 'barcode_front_id': 'NA', 'barcode_front_id_inner': 'NA', 'barcode_front_refseq': 'NA', 'barcode_front_score': 0.0, 'barcode_front_score_inner': 0.0, 'barcode_full_arrangement': '', 'barcode_kit': '', 'barcode_mid_front_end_index': 0, 'barcode_mid_front_id': 'NA', 'barcode_mid_front_score': 0.0, 'barcode_mid_rear_end_index': 0, 'barcode_mid_rear_id': 'NA', 'barcode_mid_rear_score': 0.0, 'barcode_rear_end_index': 0, 'barcode_rear_foundseq': 'NA', 'barcode_rear_foundseq_length': 2, 'barcode_rear_id': 'NA', 'barcode_rear_id_inner': 'NA', 'barcode_rear_refseq': 'NA', 'barcode_rear_score': 0.0, 'barcode_rear_score_inner': 0.0, 'barcode_score': 0.0, 'barcode_variant': '', 'basecall_type': 'beam search', 'daq_offset': 0.0, 'daq_scaling': 0.25, 'duration': 321, 'lamp_barcode_id': 'NA', 'lamp_barcode_score': 0.0, 'lamp_target_id': 'NA', 'lamp_target_score': 0.0, 'mean_qscore': 12.145367622375488, 'med_abs_dev': 0.01067463681101799, 'median': -373.5199890136719, 'model_stride': 6, 'model_version_id': '[email protected]', 'num_events': 53, 'num_minknow_events': 0, 'primer_front_begin_index': -1, 'primer_front_foundseq': 'NA', 'primer_front_foundseq_length': 2, 'primer_front_id': 'NA', 'primer_front_refseq': 'NA', 'primer_front_score': 0.0, 'primer_rear_end_index': -1, 'primer_rear_foundseq': 'NA', 'primer_rear_foundseq_length': 2, 'primer_rear_id': 'NA', 'primer_rear_refseq': 'NA', 'primer_rear_score': 0.0, 'read_id': 'RF-17774e44-8d3c-4bdd-8730-aaf8addd3e6c', 'sampling_rate': 5000.0, 'scaling_med_abs_dev': 0.01067463681101799, 'scaling_median': -373.5199890136719, 'scaling_version': '', 'sequence_length': 16, 'start_time': 12477610, 'state_size': 0, 'strand_id': '3b536c09-3f73-4c59-a332-2a0f6cad4ed9', 'trim_front': 0, 'trim_rear': 0, 'trimmed_duration': 321, 'trimmed_events': 53, 'trimmed_samples': 0}, 'datasets': {'raw_data': array([306, 289, 300, ..., 526, 537, 540], dtype=int16), 'sequence': 'TTTTTGTGGGCTTTTA', 'qstring': "21;;B98'-213,*2&"}}
The second sub_tag will cause the code to gail as it is not expected.
Therefore we need to check for the sub_tag and if it is not 0, we need to skip the read.
'''
if res["sub_tag"] > 0:
continue

try:
channel, read_number, time_sent = cache.pop(read_id)
except KeyError:
# FIXME: This is resolved in later versions of dorado.
channel, read_number, time_sent = skipped.pop(read_id)
reads_sent += 1
res["metadata"]["read_id"] = read_id[3:]
Expand Down

0 comments on commit a4d8b7e

Please sign in to comment.