diff --git a/.github/workflows/build_docker.yml b/.github/workflows/build_docker.yml index bbe909d..ea3ee60 100644 --- a/.github/workflows/build_docker.yml +++ b/.github/workflows/build_docker.yml @@ -33,7 +33,7 @@ jobs: packages: write contents: read runs-on: ubuntu-latest - timeout-minutes: 45 + timeout-minutes: 60 steps: - name: Checkout @@ -83,6 +83,7 @@ jobs: - name: Test docker image in python3.10 uses: addnab/docker-run-action@v3 + timeout-minutes: 20 with: image: ${{ env.TEST_TAG }} shell: /bin/bash @@ -97,6 +98,7 @@ jobs: - name: Push docker image uses: docker/build-push-action@v4 + timeout-minutes: 20 id: Push with: context: . @@ -108,6 +110,7 @@ jobs: - name: Run full usecase (in Docker container) uses: addnab/docker-run-action@v3 + timeout-minutes: 20 with: image: ${{ env.TEST_TAG }} shell: /bin/bash @@ -133,6 +136,7 @@ jobs: - name: Archive figures uses: actions/upload-artifact@v3 + timeout-minutes: 10 with: name: usecase-figures path: figures.tar.gz diff --git a/.vscode/launch.json b/.vscode/launch.json index d3b30d6..3992111 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -15,23 +15,61 @@ { "name": "Python: Current File", - "type": "python", + "type": "debugpy", "request": "launch", - "program": "${file}", + + // "program": "${file}", + + "program": "/home/mmordig/ont_project_all/ont_project/usecases/enrich_usecase.py", + "cwd": "/home/mmordig/ont_project_all/ont_project/runs/enrich_usecase/full_genome_run_sampler_per_window", + "console": "integratedTerminal", // "justMyCode": true "justMyCode": false // to debug external library code }, { "name": "Python: Attach to python process", - "type": "python", + "type": "debugpy", "request": "attach", - "processId": "${command:pickProcess}", + "processId": "${command:pickProcess}", // ctrl+Z, fg to get pid; requires "echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope" on Linux, see https://code.visualstudio.com/docs/python/debugging, may need to launch program from within vscode + // 
"logToFile": true, // in case it fails "justMyCode": false - } + }, // { // // "host": "compute-biomed-01", // "" // } + + { + "name": "Python: enrich usecase", + "type": "debugpy", + "request": "launch", + "python": "/home/mmordig/miniforge3/envs/nanosim/bin/python", + + // (cd /home/mmordig/ont_project_all/ont_project/runs/enrich_usecase/chr202122_run && python ~/ont_project_all/ont_project/usecases/enrich_usecase.py) + + "program": "/home/mmordig/ont_project_all/ont_project/usecases/enrich_usecase.py", + // "cwd": "/home/mmordig/ont_project_all/ont_project/runs/enrich_usecase/chr202122_run", + "cwd": "/home/mmordig/ont_project_all/ont_project/runs/enrich_usecase/readfish_exp/results_readfishexp_realreads", + + "console": "integratedTerminal", + // "justMyCode": true + "justMyCode": false // to debug external library code + }, + + { + "name": "Python: debug nanosim", + "type": "debugpy", + "request": "launch", + + "python": "/home/mmordig/miniforge3/envs/nanosim/bin/python", + "program": "external/ont_nanosim/src/simulator.py", + "args": ["genome", "--model_prefix", "runs/nanosim_models/human_NA12878_DNA_FAB49712_guppy/training", "--ref_g", "runs/data/random_genome.fasta", "-dna_type", "linear", "-med", "15000", "-max", "20000", "-min", "400", "-sd", "6.9", "--output", "runs/data/nanosim_reads/human_genome_med15000/reads_seed3", "--number", "100000", "--seed", "3", "--strandness", "0.5", "--basecaller", "guppy", "--aligned_rate", "100%", "--num_threads", "1", "--no_flanking", "--no_error_profile"], + "cwd": "/home/mmordig/ont_project_all/ont_project/", + + "console": "integratedTerminal", + // "justMyCode": true + "justMyCode": false // to debug external library code + }, ] } \ No newline at end of file diff --git a/.vscode/tasks.json b/.vscode/tasks.json new file mode 100644 index 0000000..79ea931 --- /dev/null +++ b/.vscode/tasks.json @@ -0,0 +1,28 @@ +{ + "version": "2.0.0", + "tasks": [ + { + "type": "shell", + "label": "rsync to mpi", + "command": [ + // 
"rsync -avzh --exclude /ont_project/runs --exclude /ont_project/usecase_data.tar.gz --exclude /ont_project/.tox/ --exclude /ont_project/.git/ --exclude /ont_project/external/ont_nanosim --progress --delete ~/ont_project_all/ont_project mpi:/home/mmordig/ont_project_all &&", + // also syncing nanosim + "rsync -avzh --exclude /ont_project/runs --exclude /ont_project/usecase_data.tar.gz --exclude /ont_project/.tox/ --exclude /ont_project/.git/ --progress --delete ~/ont_project_all/ont_project mpi:/home/mmordig/ont_project_all &&", + + // sync to biomed + // "rsync -avzh --exclude /ont_project/runs --exclude /ont_project/usecase_data.tar.gz --exclude /ont_project/.tox/ --exclude /ont_project/.git/ --exclude /ont_project/external/ont_nanosim --progress --delete ~/ont_project_all/ont_project biomed:/cluster/home/mmordig/ont_project_all &&", + // // sync figures back from biomed + // "rsync -avzh --progress --include='*/' --include '**/figures/*.png' --include '**/configs/*' --include '**/pickled_figures/*.dill' --exclude '*' --delete biomed:/cluster/work/grlab/projects/mmordig/selseq_runs/ ~/ont_project_all/figures_biomed_cluster &&", + + "echo Current time: $(date)" + ], + "problemMatcher": [], + // in keybindings.json + // { + // "key": "cmd+m cmd+p", + // "command": "workbench.action.tasks.runTask", + // "args": "rsync to mpi" + // } + } + ] +} \ No newline at end of file diff --git a/DeveloperNotes.md b/DeveloperNotes.md index 0f05f21..95ef8e4 100644 --- a/DeveloperNotes.md +++ b/DeveloperNotes.md @@ -8,7 +8,13 @@ This is only applicable if you want to develop the package. After changing the package entrypoints, you have to reinstall the package with ```{bash} +# test it with `python -c "import ru"`. pip uninstall -y simreaduntil; pip install -e './[test,readfish,dev]' + +# need to reinstall readfish to use our modified version +# Hatch does not support installing dependencies like readfish in editable mode, so we install it manually with "-e". 
+# ReadFish imports its own files with `ru.*`, so it assumes that the ReadFish directory is in the `PYTHONPATH`. +pip uninstall -y readfish; pip install -e ./external/ont_readfish ``` This is also necessary when modifying the readfish dependency because it cannot easily be installed in editable mode with hatch: https://(github.com/pypa/hatch/issues/588). @@ -38,11 +44,6 @@ python -m pytest --cov=. tests/simulator/gap_sampling/test_gap_sampling.py::test pydoctor "./src/simreaduntil" # can also put one file to just compile it -# manually install ReadFish -# ReadFish imports its own files with `ru.*`, so it assumes that the ReadFish directory is in the `PYTHONPATH`. -pip install -e ./external/ont_readfish -# test it with `python -c "import ru"`. - git submodule add [] git config --add oh-my-zsh.hide-dirty 1 # otherwise cd into NanoSim directory is slow diff --git a/pyproject.toml b/pyproject.toml index 724d1a7..b8c0276 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -70,6 +70,9 @@ readfish = [ "read-until @ git+https://github.com/nanoporetech/read_until_api@v3.4.1", "readfish @ {root:uri}/external/ont_readfish", ] +rawsignal = [ + "pyslow5", +] [project.scripts] plot_seqsum = "simreaduntil.seqsum_tools.seqsum_plotting:main" @@ -91,7 +94,7 @@ usecase_make_html_report = "simreaduntil.usecase_helpers.cli_usecase.make_html_r [tool.hatch.version] source = "vcs" -fallback-version = "unknown_version" +fallback-version = "0.0.0.7999" # dummy version to recognize it [tool.hatch.metadata] # to install from git diff --git a/simulator_example.png b/simulator_example.png index 7478122..254ae22 100644 Binary files a/simulator_example.png and b/simulator_example.png differ diff --git a/src/simreaduntil/seqsum_tools/coverage_tracker.py b/src/simreaduntil/seqsum_tools/coverage_tracker.py index 48dccbd..4421611 100644 --- a/src/simreaduntil/seqsum_tools/coverage_tracker.py +++ b/src/simreaduntil/seqsum_tools/coverage_tracker.py @@ -189,7 +189,7 @@ def get_chrom_start_len(self, 
read_id) -> Optional[Tuple[Any, int, int]]: read_id: a NanoSim read id Returns: - Tuple (chrom, ref_start, ref_len) with respect to forward strand; None + Tuple (chrom, ref_start, ref_len) with respect to forward strand; None if could not be mapped """ raise NotImplementedError() @@ -255,6 +255,9 @@ def get_fraction_cov_atleast(self, threshold, chroms: Optional[List]=None) -> fl """ if chroms is None: chroms = list(self.coverage_per_chrom.keys()) + if len(chroms) == 0: + logger.warning("get_fraction_cov_atleast called with empty chroms, returning 1.0") + return 1.0 return sum((self.coverage_per_chrom[chrom] >= threshold).sum(dtype=np.uint64) for chrom in chroms) / sum(len(self.coverage_per_chrom[chrom]) for chrom in chroms) def get_chrom_lens(self) -> Dict[str, int]: @@ -503,6 +506,9 @@ def get_fraction_cov_atleast(self, threshold, chroms: Optional[List]=None) -> fl """ if chroms is None: chroms = list(self.coverage_per_chrom.keys()) + if len(chroms) == 0: + logger.warning("get_fraction_cov_atleast called with empty chroms, returning 1.0") + return 1.0 # last block can be shorter, so we also have to weight it differently return sum(((self._avg_cov_per_block(chrom) >= threshold) * self._block_sizes(chrom)).sum(dtype=np.uint64) for chrom in chroms) / sum(self.chrom_lens[chrom] for chrom in chroms) @@ -548,9 +554,13 @@ def plot_state(self, plot_type, target_coverage=None, **kwargs): class NanoSimCoverageTracker(CovTrackerClass): """ Track coverage by parsing the location from the NanoSim read ids + + NanoSim unaligned reads do not map. 
""" def get_chrom_start_len(self, read_id): nanosim_id = NanoSimId.from_str(read_id) + if nanosim_id.read_type == "unaligned": + return None return (nanosim_id.chrom, nanosim_id.ref_pos, nanosim_id.ref_len) class PafCoverageTracker(CovTrackerClass): diff --git a/src/simreaduntil/seqsum_tools/seqsum_plotting.py b/src/simreaduntil/seqsum_tools/seqsum_plotting.py index 8d61b4d..d4ac77b 100644 --- a/src/simreaduntil/seqsum_tools/seqsum_plotting.py +++ b/src/simreaduntil/seqsum_tools/seqsum_plotting.py @@ -139,7 +139,7 @@ def seqsum_add_cols_for_plotting_selseq_performance(seqsum_df, group_column=None seqsum_df["end_reason"] = "unknown" seqsum_df["end_reason"] = seqsum_df["end_reason"].astype("category") - seqsum_df["is_user_rejection"] = seqsum_df["end_reason"] == "data_service_unblock_mux_change" # user rejections only, rejections due to mux scan may also happen + seqsum_df["is_user_rejection"] = seqsum_df["end_reason"] == "data_service_unblock_mux_change" # user rejections only, rejections due to mux scan or simulation end may also happen seqsum_df["is_full_read"] = seqsum_df["end_reason"] == "signal_positive" # number of full reads seqsum_df["end_time"] = seqsum_df["start_time"] + seqsum_df["duration"] @@ -167,7 +167,9 @@ def seqsum_add_cols_for_plotting_selseq_performance(seqsum_df, group_column=None seqsum_df[f"cum_nb_never_requested_per_{group_column}"] = seqsum_df.groupby(group_column, observed=True)["never_requested"].cumsum() if "nb_ref_bps_full" in seqsum_df.columns: - assert all(~seqsum_df["is_user_rejection"] | seqsum_df["nb_rejectedbps"] > 0) # user rejected => nb_rejectedbps > 0 + pass + # not always true when using real data, e.g. 
if read was already over when rejected but still logging as rejected + # assert all(~seqsum_df["is_user_rejection"] | seqsum_df["nb_rejectedbps"] > 0) # user rejected => nb_rejectedbps > 0 return seqsum_df @@ -446,7 +448,7 @@ def create_plot(ax, normalize): return fig -def plot_channel_occupation_fraction_over_time(seqsum_df, timepoints=None, mux_scan_interval=None, save_dir=None): +def plot_channel_occupation_over_time(seqsum_df, timepoints=None, mux_scan_interval=None, save_dir=None): """ Plot channel occupation (active percentage) over time @@ -498,7 +500,7 @@ def plot_channel_occupation_fraction_over_time(seqsum_df, timepoints=None, mux_s make_tight_layout(fig) if save_dir is not None: - save_fig_and_pickle(fig, save_dir / f"channel_occupation_fraction_over_time.{FIGURE_EXT}") + save_fig_and_pickle(fig, save_dir / f"channel_occupation_over_time.{FIGURE_EXT}") return ax @@ -787,6 +789,29 @@ def plot_read_end_reason_hist(seqsum_df, save_dir=None): return ax +def plot_read_length_by_end_reason(seqsum_df, save_dir=None, end_reasons=None): + """ + Plot histogram of read length for different end reasons (fully read, stop_receiving, rejected, never_requested) + """ + fig, ax = plt.subplots() + if end_reasons is None: + df = seqsum_df + else: + df = seqsum_df[seqsum_df["end_reason"].isin(end_reasons)] + df["end_reason"] = df["end_reason"].cat.remove_unused_categories() # raises a SettingWithCopyWarning warning, not really clear why + if len(df) > 0: + # seaborn: error when no data + sns.histplot(df, x="sequence_length_template", hue="end_reason", multiple="dodge", ax=ax) + make_tight_layout(fig) + + if save_dir is not None: + base_filename = "read_length_by_end_reason" + if end_reasons is not None: + base_filename += "_" + "_".join(end_reasons) + save_fig_and_pickle(fig, save_dir / f"{base_filename}.{FIGURE_EXT}") + + return ax + def plot_processed_seqsum(seqsum_df, save_dir: Optional[Path]=None, group_column=None, close_figures: Optional[bool]=None): """ Plot a bunch 
of stuff from the processed seqsum_df, subsampling when sensible @@ -806,7 +831,7 @@ def plot_processed_seqsum(seqsum_df, save_dir: Optional[Path]=None, group_column def close_fig(fig): if close_figures: plt.close(fig) - + # # compute instantaneous rates # # take difference (x[i+step] - x[i-step]) while keeping array size the same by extending the array on both sides by the step # take_diff = lambda x, step=3: np.concatenate((x[step:] - x[:-step], [np.NaN]*step)) @@ -846,14 +871,18 @@ def sample_group(group): # require full seqsum_df fig = plot_number_channels_per_group_over_time(seqsum_df, save_dir=save_dir, group_column=group_column); logger.debug("Created 1 plot"); close_fig(fig) - ax = plot_channel_occupation_fraction_over_time(seqsum_df, save_dir=save_dir); logger.debug("Created 1 plot"); close_fig(ax.figure) + ax = plot_channel_occupation_over_time(seqsum_df, save_dir=save_dir); logger.debug("Created 1 plot"); close_fig(ax.figure) ax = plot_channels_over_time(seqsum_df, save_dir=save_dir); logger.debug("Created 1 plot"); close_fig(ax.figure) if "mux" in seqsum_df.columns and (seqsum_df["mux"].nunique() > 1): ax = plot_mux_over_time(seqsum_df, save_dir=save_dir); logger.debug("Created 1 plot"); close_fig(ax.figure) fig, _ = plot_read_stats_by_channel_hists(seqsum_df, save_dir=save_dir); logger.debug("Created 1 plot"); close_fig(fig) ax = plot_fraction_states_per_channel(seqsum_df, save_dir=save_dir); logger.debug("Created 1 plot"); close_fig(ax.figure) ax = plot_read_end_reason_hist(seqsum_df, save_dir=save_dir); logger.debug("Created 1 plot"); close_fig(ax.figure) - + + # ax = plot_read_length_by_end_reason(seqsum_df, save_dir=save_dir); logger.debug("Created 1 plot"); close_fig(ax.figure) + ax = plot_read_length_by_end_reason(seqsum_df, save_dir=save_dir, end_reasons=["signal_positive"]); logger.debug("Created 1 plot"); close_fig(ax.figure) + ax = plot_read_length_by_end_reason(seqsum_df, save_dir=save_dir, end_reasons=["data_service_unblock_mux_change"]); 
logger.debug("Created 1 plot"); close_fig(ax.figure) + def plot_coverage_per_group(cov_df, cov_thresholds=[1, 2, 3, 4, 5, 6], save_dir: Optional[Path]=None, group_column="group", close_figures=None): """Plot fraction covered per group for each coverage, then per coverage for each group""" if close_figures is None: @@ -888,43 +917,24 @@ def close_fig(fig): save_fig_and_pickle(fig, save_dir / f"fraction_covered_{group}.{FIGURE_EXT}") logger.debug("Created 1 plot"); close_fig(ax.figure) -def create_plots_for_seqsum(seqsum_df, nrows=None, group_to_units: Dict[str, List[Any]]=None, group_column=None, - ref_genome_path=None, paf_file=None, cov_thresholds=[1, 2, 3, 4, 5, 6], cov_every=1, - save_dir=None, close_figures=None): - """ - Create plots for a sequencing summary file - - Args: - seqsum_df: path to sequencing summary file, or dataframe - nrows: only read the first nrows reads - group_to_units: dictionary {group_name: units} where units form a subset of the unique values in group_column; if None, groups have size 1 - group_column: column in sequencing summary file to group by; if "all", use one group called "all"; if None, use GROUP_COLUMN - - ref_genome_path: path to reference genome; if None, don't plot coverage - paf_file: path to PAF file to map reads to unit; if None, unit is the chromosome extracted from NanoSim read id - cov_thresholds: coverage thresholds to plot - cov_every: coverage is calculated every cov_every reads - - save_dir: directory to save plots to, if None, plots are not saved - close_figures: close figures after saving, if None, close figures if save_dir is not None - - Returns: - seqsum_df, cov_df - """ +# for doc, see create_plots_for_seqsum +def preprocess_seqsum_df_for_plotting(seqsum_df, nrows=None, group_to_units=None, group_column=None, paf_file=None): group_column = group_column or GROUP_COLUMN + chrom_column = group_column # column to use for coverage or to compute groups - if save_dir is not None: - save_dir.mkdir(exist_ok=True) - if 
not isinstance(seqsum_df, pd.DataFrame): logger.debug(f"Reading {nrows if nrows is not None else 'all'} reads from sequencing summary file '{seqsum_df}'") seqsum_df_filename = seqsum_df - seqsum_df = pd.read_csv(seqsum_df_filename, sep="\t", nrows=nrows) + try: + seqsum_df = pd.read_csv(seqsum_df_filename, sep="\t", nrows=nrows) + except pd.errors.EmptyDataError: + logger.warning(f"Empty sequencing summary file '{seqsum_df}'") + seqsum_df = pd.DataFrame() # empty, will exit below logger.debug(f"Done reading sequencing summary file '{seqsum_df_filename}'") if len(seqsum_df) == 0: - logger.warning(f"Empty sequencing summary file '{seqsum_df}'") - return seqsum_df, None + logger.warning(f"Empty sequencing summary") + return seqsum_df, None, chrom_column logger.info(f"Sorting and cleaning seqsummary file of shape {seqsum_df.shape}") seqsum_df = sort_and_clean_seqsum_df(seqsum_df) @@ -941,18 +951,17 @@ def create_plots_for_seqsum(seqsum_df, nrows=None, group_to_units: Dict[str, Lis logger.info(f"Adding group column from NanoSim read id") add_group_and_reflen_from_nanosim_id(seqsum_df, group_column=group_column) - chrom_column = group_column if group_to_units is not None: # create column to group by, e.g. 
several chromosomes in one group - units_in_group = set.union(*[set(units) for units in group_to_units.values()]) + all_units = set.union(*[set(units) for units in group_to_units.values()]) observed_units = set(seqsum_df[group_column].unique()) - if not units_in_group.issubset(observed_units): - logger.warning(f"No reads were observed from the following groups: {units_in_group - observed_units}") - other_group = observed_units - units_in_group + if not all_units.issubset(observed_units): + logger.warning(f"No reads were observed from the following groups: {all_units - observed_units}") + other_group = observed_units - all_units if len(other_group) > 0: assert "other" not in group_to_units group_to_units["other"] = other_group - logger.info(f"Plotting according to groups {group_to_units}") + logger.info(f"Splitting according to groups {group_to_units}") group_column = "group" assert group_column not in seqsum_df.columns, f"New column '{group_column}' already in sequencing summary df with columns {seqsum_df.columns}" @@ -962,6 +971,38 @@ def create_plots_for_seqsum(seqsum_df, nrows=None, group_to_units: Dict[str, Lis logger.info("Adding extra columns for plotting") seqsum_df = seqsum_add_cols_for_plotting_selseq_performance(seqsum_df, group_column=group_column) + return seqsum_df, group_column, chrom_column + +def create_plots_for_seqsum(seqsum_df, nrows=None, group_to_units: Dict[str, List[Any]]=None, group_column=None, + ref_genome_path=None, paf_file=None, cov_thresholds=[1, 2, 3, 4, 5, 6], cov_every=1, + save_dir=None, close_figures=None): + """ + Create plots for a sequencing summary file + + Args: + seqsum_df: path to sequencing summary file, or dataframe + nrows: only read the first nrows reads + group_to_units: dictionary {group_name: units} where units form a subset of the unique values in group_column; + if None, groups have size 1; each read should belong to exactly one group + group_column: column in sequencing summary file to group by; if "all", use one 
group called "all"; if None, use GROUP_COLUMN + + ref_genome_path: path to reference genome; if None, don't plot coverage + paf_file: path to PAF file to map reads to unit; if None, unit is the chromosome extracted from NanoSim read id + cov_thresholds: coverage thresholds to plot + cov_every: coverage is calculated every cov_every reads + + save_dir: directory to save plots to, if None, plots are not saved + close_figures: close figures after saving, if None, close figures if save_dir is not None + + Returns: + seqsum_df, cov_df + """ + + if save_dir is not None: + save_dir.mkdir(exist_ok=True) + + seqsum_df, group_column, chrom_column = preprocess_seqsum_df_for_plotting(seqsum_df, nrows=nrows, group_to_units=group_to_units, group_column=group_column, paf_file=paf_file) + logger.debug("Creating plots for seqsum...") plot_processed_seqsum(seqsum_df, group_column=group_column, save_dir=save_dir, close_figures=close_figures) logger.debug("Done creating plots for seqsum...") @@ -996,7 +1037,7 @@ def main(): """ CLI entrypoint to create plots from a sequencing summary file """ - add_comprehensive_stream_handler_to_logger(None, logging.DEBUG) + add_comprehensive_stream_handler_to_logger(logger, logging.DEBUG) if is_test_mode(): args = argparse.Namespace() @@ -1027,6 +1068,8 @@ def main(): else: group_units = {"targets": args.targets.split(",")} create_plots_for_seqsum(seqsum_df=args.seqsummary_filename, nrows=args.nrows, group_to_units=group_units, ref_genome_path=args.ref_genome_path, paf_file=args.paf_file, cov_thresholds=cov_thresholds, cov_every=args.cov_every, save_dir=args.save_dir) + # # todo: enable above + # create_plots_for_seqsum(seqsum_df=args.seqsummary_filename, nrows=args.nrows, group_column="all", ref_genome_path=args.ref_genome_path, paf_file=args.paf_file, cov_thresholds=cov_thresholds, cov_every=args.cov_every, save_dir=args.save_dir) logger.debug("Done with plotting script") diff --git a/src/simreaduntil/shared_utils/debugging_helpers.py 
b/src/simreaduntil/shared_utils/debugging_helpers.py index f4773ec..9db4b80 100644 --- a/src/simreaduntil/shared_utils/debugging_helpers.py +++ b/src/simreaduntil/shared_utils/debugging_helpers.py @@ -40,14 +40,14 @@ def warn_debugging(): def helper(): # also print, in case logging is disabled [print("#"*80) for _ in range(5)] - print("Running in test mode") + print("Running in debug mode") [print("#"*80) for _ in range(5)] [logger.info("#"*80) for _ in range(5)] - logger.info("Running in test mode", stacklevel=2) + logger.info("Running in debug mode", stacklevel=2) [logger.info("#"*80) for _ in range(5)] helper() - # also print when program terminates + # also print when program terminates, but only once global __WARN_DEBUGGING_REGISTERED if not __WARN_DEBUGGING_REGISTERED: import atexit diff --git a/src/simreaduntil/shared_utils/logging_utils.py b/src/simreaduntil/shared_utils/logging_utils.py index ad55785..f19de31 100644 --- a/src/simreaduntil/shared_utils/logging_utils.py +++ b/src/simreaduntil/shared_utils/logging_utils.py @@ -66,6 +66,13 @@ def custom_emit(record): END_WITH_CARRIAGE_RETURN = {"extra": {"end": "\r"}} """use together with make_handler_support_end to print to logger with a carriage return (move to beginning of line, overwriting content)""" +def logging_output_formatter(handler): + """configures handler to use a specific formatter""" + formatter = logging.Formatter("%(asctime)s - %(message)s --- %(filename)s:%(lineno)d (%(funcName)s) %(levelname)s ##") + # "--- vscode://%(pathname)s:%(lineno)d - %(levelname)s" + make_handler_support_end(handler) + handler.setFormatter(formatter) + _STREAM_HANDLER_ATTR_NAME = "COMPREHENSIVE_STREAM_HANDLER" def add_comprehensive_stream_handler_to_logger(logger: Union[str, logging.Logger, None]=None, level=logging.NOTSET): """ @@ -91,14 +98,11 @@ def add_comprehensive_stream_handler_to_logger(logger: Union[str, logging.Logger return False handler = logging.StreamHandler() # outputs to sys.stderr - + 
logging_output_formatter(handler) handler.setLevel(level) setattr(handler, _STREAM_HANDLER_ATTR_NAME, True) - formatter = logging.Formatter("%(asctime)s - %(message)s --- %(filename)s:%(lineno)d (%(funcName)s) %(levelname)s ##") - # "--- vscode://%(pathname)s:%(lineno)d - %(levelname)s" - make_handler_support_end(handler) - handler.setFormatter(formatter) + logger.addHandler(handler) logging.captureWarnings(True) diff --git a/src/simreaduntil/shared_utils/nanosim_parsing.py b/src/simreaduntil/shared_utils/nanosim_parsing.py index dcb1265..8630b44 100644 --- a/src/simreaduntil/shared_utils/nanosim_parsing.py +++ b/src/simreaduntil/shared_utils/nanosim_parsing.py @@ -12,7 +12,7 @@ class NanoSimId: Integer-like strings will be converted. """ def __init__(self, chrom, ref_pos, read_nb, direction, ref_len, head_len=0, tail_len=0, read_type="aligned"): - assert read_type in ["aligned", "perfect"] + assert read_type in ["aligned", "unaligned", "perfect"] assert direction in ["R", "F"] ref_pos = int(ref_pos) ref_len = int(ref_len) @@ -20,8 +20,8 @@ def __init__(self, chrom, ref_pos, read_nb, direction, ref_len, head_len=0, tail head_len = int(head_len) tail_len = int(tail_len) # if read_type == "perfect": - assert head_len == 0 - assert tail_len == 0 + # assert head_len == 0 + # assert tail_len == 0 self.chrom = chrom self.ref_pos = ref_pos @@ -42,11 +42,14 @@ def from_str(read_id: str): E.g. chr2_920875_perfect_proc0:1_F_0_8346_0 chr2_649870_aligned_proc0:2_F_0_10399_0 + chr-18_12681_aligned_proc3:68_F_5_13231_10 + genome1-chr-6_236227_unaligned_proc5:16_R_0_16119_0 This is of the form {genome}-{chromosome}_{ref_pos}_{read_type}_{read_nb}_{strand}_{head_len}_{segment_lengths}_{tail_len} The ref_pos is 0-based and the read spans [ref_pos:ref_pos+ref_len] on the forward strand, independent of the the direction which is F or R (forward, reverse). We assume here that the head and tail flanking lengths are 0. 
read_nb: proc{process_nb}:{read_nb_for_process} + head and tail len are with respect to read on forward strand, so when the read is reversed, the read starts with the tail length Note (not relevant here): for chimeric reads, '{genome-chromosome}_{position}' and 'segment_lengths' are joined by ";" etc. """ diff --git a/src/simreaduntil/shared_utils/tee_stdouterr.py b/src/simreaduntil/shared_utils/tee_stdouterr.py index 164750e..90542c5 100644 --- a/src/simreaduntil/shared_utils/tee_stdouterr.py +++ b/src/simreaduntil/shared_utils/tee_stdouterr.py @@ -13,7 +13,7 @@ class TeeStdouterr: We use the logging module to ensure thread-safety when writing to it. Do not use this logger or its children for anything else. - Unlike common implementations seen on the Internet, this implementation correctly logs stdout and stderr to the respective + Unlike common implementations seen on the Internet, this implementation logs stdout and stderr to the respective streams (instead of merging them into one stream). Args: diff --git a/src/simreaduntil/shared_utils/thread_helpers.py b/src/simreaduntil/shared_utils/thread_helpers.py index 5350e00..7d3c30f 100644 --- a/src/simreaduntil/shared_utils/thread_helpers.py +++ b/src/simreaduntil/shared_utils/thread_helpers.py @@ -54,6 +54,8 @@ def inner_wrapper(self, *args, **kwargs): return inner_wrapper return wrapper +# todo: remove + class MakeThreadSafe: """ Inheriting from this class makes instance method and instance attribute access thread-safe. 
diff --git a/src/simreaduntil/shared_utils/timing.py b/src/simreaduntil/shared_utils/timing.py index 57865ba..6a38430 100644 --- a/src/simreaduntil/shared_utils/timing.py +++ b/src/simreaduntil/shared_utils/timing.py @@ -62,7 +62,7 @@ def elapsed_time_last_reset(self): _time_offset = time.time_ns() - time.perf_counter_ns() # time when this module is loaded, minus offset def cur_ns_time(): """ - Get monotonic current time with nanosecond precision + Get monotonic current time with nanosecond precision, in seconds, includes time during sleep Notes: time.time_ns() is not monotonic, so when waking up the machine again, the time might decrease by 1 second or so, so it is not monotonic diff --git a/src/simreaduntil/shared_utils/utils.py b/src/simreaduntil/shared_utils/utils.py index dc8b3cc..16a5f4b 100644 --- a/src/simreaduntil/shared_utils/utils.py +++ b/src/simreaduntil/shared_utils/utils.py @@ -2,18 +2,25 @@ General utility functions """ +from contextlib import contextmanager +import contextlib import copy import functools import gzip import os from pathlib import Path +import queue import shutil import subprocess +import sys +import threading +import time from typing import Any, Dict, Iterable, List import dill import tqdm from simreaduntil.shared_utils.logging_utils import setup_logger_simple +from simreaduntil.shared_utils.timing import cur_ns_time logger = setup_logger_simple(__name__) @@ -220,4 +227,200 @@ def _cycle_list_deep(lst): import copy while True: for x in lst: - yield copy.deepcopy(x) \ No newline at end of file + yield copy.deepcopy(x) + + +@contextmanager +def set_signal_handler(signal_type, handler): + """ + Set a signal handler temporarily + + This function can only be called from the main thread of the main interpreter. + + This is better than using KeyboardInterrupt because this immediately breaks out of the code, possibly leaving the code in an inconsistent + state, e.g. 
for the simulator updating channels when Ctrl-C is pressed causes it to immediately stop. It is better to set a flag to stop + it. Since the signal handler acquires the Python GIL, make sure the code in the signal handler can run to completion because all other threads are + stopped as well due to the Python GIL, so no locked mutexes should be required. + + See the tests for an example. + + Args: + signal_type: e.g. signal.SIGINT for KeyboardInterrupt + handler: function to handle the signal taking two arguments, should not mess with any state + """ + import signal + old_handler = signal.getsignal(signal_type) + signal.signal(signal_type, handler) + yield + signal.signal(signal_type, old_handler) + + +@contextmanager +def tee_stdouterr_to_file(filename_base, mode="a"): + """ + Try to use a file handler with the logger rather than this function! + + Tee the entire output of a Python program to a file and the console. + + # assign to an object since otherwise it will get destroyed and the file handler will become invalid! + obj = tee_stdouterr_to_file("test11", mode="w") + obj.__enter__() + # or use with a "with" statement + + Args: + filename_base: base of the filename to write to, ".out" and ".err" will be appended, directory containing the file must exist + mode: mode to open file in, "a" for append, "w" for overwrite + """ + + """ + File object that writes to multiple file objects at once, e.g.
for teeing + """ + class MultipleFilesWriter: + def __init__(self, *files): + self.files = files + + def write(self, text): + for file in self.files: + file.write(text) + + def flush(self): + for file in self.files: + file.flush() + + # to debug this function in case of exception, remove the redirection of stdout + with open(str(filename_base) + ".out", mode=mode) as out_file: + with open(str(filename_base) + ".err", mode=mode) as err_file: + old_stdout = sys.stdout + old_stderr = sys.stderr + with contextlib.redirect_stdout(MultipleFilesWriter(old_stdout, out_file)): + with contextlib.redirect_stderr(MultipleFilesWriter(old_stderr, err_file)): + yield + + +""" +Class storing a value + +Useful for passing values by reference to a function, since int, float etc are immutable, so modifications to them are not visible outside the function +""" +class MutableValue: + def __init__(self, value=None): + self.value = value +# def __repr__(self): +# return f"MutableValue({self.value})" +# def __str__(self): +# return f"MutableValue({self.value})" +# def __eq__(self, other): +# return self.value == other.value +# def __hash__(self): +# return hash(self.value) + +def record_gen_waiting_time(gen, waiting_time: MutableValue): + """ + Record how much time is spent waiting for new elements in the generator, only counting the times when requesting an element and until getting it + + The function also works if the generator is stopped early. 
+ + Args: + gen: generator to wrap + waiting_time: MutableValue to store the total waiting time in seconds, only includes the time until the generator is destroyed + + Yields: values from gen + """ + elapsed_time = 0 + try: + t_before = cur_ns_time() + for x in gen: + elapsed_time += cur_ns_time() - t_before + yield x + t_before = cur_ns_time() + finally: + # to make sure this is executed even if the generator is stopped early + waiting_time.value = elapsed_time + +def record_gen_fcn_waiting_time(gen_fcn, gen, waiting_time: MutableValue): + """ + Record how much time the function gen_fcn(gen) takes processing elements coming from generator gen, excluding the time waiting for elements from gen + + Example: + See tests + + Args: + gen_fcn: function taking a generator and returning a generator + gen: generator to wrap + waiting_time: MutableValue to store the total waiting time in seconds of gen_fcn itself only, only includes the time until the generator is destroyed + """ + waiting_time.value = 0 + time_inner_gen = MutableValue() + try: + yield from record_gen_waiting_time(gen_fcn(record_gen_waiting_time(gen, time_inner_gen)), waiting_time) + finally: + waiting_time.value -= time_inner_gen.value + + +""" +Queue with interruptible get and put methods when it is stopped, returned a QueueStoppedException +""" +class StoppableQueue(queue.Queue): + class QueueStoppedException(Exception): + pass + # workaround since otherwise Empty not found + Empty = queue.Empty + Full = queue.Full + + def __init__(self, maxsize=0): + super().__init__(maxsize) + # Create an event that can be used to signal the queue to stop its operations. 
+ self.stop_event = threading.Event() + self.POLL_INTERVAL = 0.1 + + """ + Put an item on the queue + + raises an QueueStoppedException if the stop event is set + + Args: + item: the item to put on the queue + block: if True, repeatedly try to put item (repeating if queue is full); if False, immediately raise queue.Full if full + timeout: only applies when block=True + """ + def put(self, item, block=True, timeout=None): + start_time = time.time() + while not self.stop_event.is_set(): + try: + super().put(item, block, timeout=self.POLL_INTERVAL) + return + except queue.Full: + if (not block) or (timeout is not None and time.time() - start_time > timeout): + raise # queue full exception + if self.stop_event.is_set(): + raise self.QueueStoppedException() + + """ + Get an item from the queue + + raises an QueueStoppedException if the stop event is set + + Args: + block: if True, repeatedly try to get item (repeating if queue is empty); if False, immediately raise queue.Empty if empty + timeout: only applies when block=True + + Returns: + item from the queue, or exception if None (queue empty or stop event set or timeout expired) + """ + def get(self, block=True, timeout=None): + start_time = time.time() + while not self.stop_event.is_set(): + try: + item = super().get(block, timeout=self.POLL_INTERVAL) + return item + except queue.Empty: + if (not block) or (timeout is not None and time.time() - start_time > timeout): + raise # queue empty exception + if self.stop_event.is_set(): + raise self.QueueStoppedException() + + """ + Convenience method to set the stop event. 
+ """ + def stop(self): + self.stop_event.set() \ No newline at end of file diff --git a/src/simreaduntil/simulator/README.md b/src/simreaduntil/simulator/README.md index d51f4bc..6c5dd93 100644 --- a/src/simreaduntil/simulator/README.md +++ b/src/simreaduntil/simulator/README.md @@ -41,8 +41,8 @@ mkdir protos_generated python -m grpc_tools.protoc -Iprotos/ --python_out=protos_generated/ --pyi_out=protos_generated/ --grpc_python_out=protos_generated/ protos/ont_device.proto && \ sed -i -E "s%import (.*)_pb2 as%import simreaduntil.simulator.protos_generated.\1_pb2 as%g" protos_generated/ont_device_pb2_grpc.py -# todo: check -cd src && python -m grpc_tools.protoc -Isimreaduntil/simulator/protos/ --python_out=simreaduntil/simulator/protos_generated/ --pyi_out=simreaduntil/simulator/protos_generated/ --grpc_python_out=simreaduntil/simulator/protos_generated/ simreaduntil/simulator/protos/ont_device.proto +# not working: +# cd src && python -m grpc_tools.protoc -Isimreaduntil/simulator/protos/ --python_out=simreaduntil/simulator/protos_generated/ --pyi_out=simreaduntil/simulator/protos_generated/ --grpc_python_out=simreaduntil/simulator/protos_generated/ simreaduntil/simulator/protos/ont_device.proto ``` diff --git a/src/simreaduntil/simulator/channel.py b/src/simreaduntil/simulator/channel.py index 3e5bece..1b11c34 100644 --- a/src/simreaduntil/simulator/channel.py +++ b/src/simreaduntil/simulator/channel.py @@ -4,19 +4,20 @@ #%% from __future__ import annotations -import enum # for referring in type hints of a class's method to class itself, Python 3.7, otherwise use strings; see https://stackoverflow.com/questions/55320236/does-python-evaluate-type-hinting-of-a-forward-reference +import contextlib +import enum +from threading import Lock # for referring in type hints of a class's method to class itself, Python 3.7, otherwise use strings; see https://stackoverflow.com/questions/55320236/does-python-evaluate-type-hinting-of-a-forward-reference from typing import 
Iterable, List, Union, Tuple, Any from matplotlib.layout_engine import TightLayoutEngine import numpy as np from simreaduntil.shared_utils.logging_utils import setup_logger_simple -from simreaduntil.shared_utils.thread_helpers import MakeThreadSafe from simreaduntil.simulator.channel_stats import ChannelStats from simreaduntil.simulator.gap_sampling.gap_sampling import GapSampler from simreaduntil.simulator.simulator_params import SimParams from simreaduntil.simulator.utils import in_interval -from simreaduntil.simulator.readpool import NoReadLeft, ReadPool +from simreaduntil.simulator.readpool import NoReadLeftException, ReadPool from simreaduntil.simulator.readswriter import ReadsWriter from simreaduntil.simulator.channel_element import ChannelBroken, ChannelElement, ShortGap, MuxScan, NoReadLeftGap, UnblockDelay, ChunkedRead, LongGap, ReadEndReason @@ -67,7 +68,7 @@ class UnblockResponse(int, enum.Enum): def to_str(self): return {UnblockResponse.MISSED: "missed", UnblockResponse.UNBLOCKED: "unblocked"}[self] -class Channel(MakeThreadSafe): +class Channel: """ Simulate the reads from a flow cell pore (channel) @@ -79,14 +80,16 @@ class Channel(MakeThreadSafe): The channel can be reused by calling .start() again after a .stop(). This will however not reset the states of ReadPool and ReadsWriter. - The class is thread-safe, at most one thread at a time can call its methods simultaneously. - Methods: - chan.start(t) # Start the channel at time t, channel now active - chan.forward(t) # Forward the channel to time t - - chan.get_new_chunks() # get new chunks of read-in-progress, concatenation of all chunks + - chan.get_new_samples() # get new chunks of read-in-progress, concatenation of all chunks - chan.stop() # stop the channel, write current read until current time (last call of chan.forward(t)) After this, the channel is clean and .start(t) can be called again + + A mutex protects start, stop, forward, run_mux_scan, stop_receiving, unblock. 
+ get_new_samples() can be called in parallel without a mutex, but modifies the stats, so they should not + be accessed/written at the same time. Arguments: name: channel name @@ -112,6 +115,8 @@ def __init__(self, name: str, read_pool: ReadPool, reads_writer: ReadsWriter, si self.save_elems = False self.stats = None self.cur_elem : Union[ChannelElement, None] = None + + self._cur_elem_mutex = Lock() def __repr__(self): return f"Channel({self.name}, cur_elem={self.cur_elem}, stats={self.stats})" @@ -126,11 +131,12 @@ def start(self, t_start): if self.is_running: raise ChannelAlreadyRunningException() - self.t_start = t_start - self.t = t_start - self.finished_elems = [] - self.stats = ChannelStats(n_channels=1) - self.run_mux_scan(t_duration=0, _starting_up=True) + with self._cur_elem_mutex: + self.t_start = t_start + self.t = t_start + self.finished_elems = [] + self.stats = ChannelStats(n_channels=1) + self.run_mux_scan(t_duration=0, _starting_up=True) # sets self.cur_elem def stop(self): """ @@ -147,14 +153,15 @@ def stop(self): if not self.is_running: raise ChannelNotRunningException() - if isinstance(self.cur_elem, ChunkedRead): - # reject current read - self._write_read(end_reason=ReadEndReason.SIM_STOPPED) - else: - self.cur_elem.t_end = self.t - - self._finish_element_in_stats() - self.cur_elem = None + with self._cur_elem_mutex: + if isinstance(self.cur_elem, ChunkedRead): + # reject current read + self._write_read(self.cur_elem, end_reason=ReadEndReason.SIM_STOPPED) + else: + self.cur_elem.t_end = self.t + + self._finish_elem_in_stats(self.cur_elem) + self.cur_elem = None @property def is_running(self): @@ -169,24 +176,24 @@ def is_idle(self): """ return isinstance(self.cur_elem, (NoReadLeftGap, ChannelBroken)) - def _move_to_next_element(self): + def _move_to_next_elem(self, last_elem): """ Helper function for .forward() to choose the next element in the channel + Writes the current read if last_elem is a read - Also starts the element + Not thread-safe 
""" - last_elem = self.cur_elem t_start = last_elem.t_end - # get a new read, otherwise NoReadLeft + # get a new read, otherwise NoReadLeftGap def get_new_read(): try: # new read new_read_id, new_read_seq = self.read_pool.get_new_read(channel=self.name) return ChunkedRead(new_read_id, new_read_seq, t_start, t_delay=self.sim_params.gap_samplers[self.name].sample_read_start_delay(channel_stats=self.stats, random_state=self.sim_params.random_state), - read_speed=self.sim_params.bp_per_second, chunk_size=self.sim_params.chunk_size) - except NoReadLeft: + read_speed=self.sim_params.bp_per_second, min_chunk_size=self.sim_params.min_chunk_size) + except NoReadLeftException: # insert infinite gap return NoReadLeftGap(t_start) @@ -202,27 +209,26 @@ def get_new_gap(): if isinstance(last_elem, MuxScan): if last_elem.elem_to_restore is None: - self.cur_elem = get_new_gap() + new_elem = get_new_gap() else: # restore old element which is a long gap assert isinstance(last_elem.elem_to_restore, LongGap) - next_elem = last_elem.elem_to_restore - next_elem.t_start = t_start - self.cur_elem = next_elem + new_elem = last_elem.elem_to_restore + new_elem.t_start = t_start elif isinstance(last_elem, ChunkedRead): - self._write_read(end_reason=None) - self.cur_elem = get_new_gap() + self._write_read(last_elem, end_reason=None) + new_elem = get_new_gap() elif isinstance(last_elem, UnblockDelay): - self.cur_elem = get_new_gap() + new_elem = get_new_gap() elif isinstance(last_elem, ShortGap): - self.cur_elem = get_new_read() + new_elem = get_new_read() elif isinstance(last_elem, LongGap): self.sim_params.gap_samplers[self.name].mark_long_gap_end(channel_stats=self.stats) - self.cur_elem = get_new_read() + new_elem = get_new_read() else: assert not isinstance(last_elem, (NoReadLeftGap, ChannelBroken)), "NoReadLeftGap has infinite length (sink state), so no next state" raise ValueError(f"unknown channel element type: {type(last_elem).__name__}") - self._start_cur_element_in_stats() + return 
new_elem def forward(self, t, delta=False): """ @@ -238,25 +244,27 @@ def forward(self, t, delta=False): Raises: ChannelNotRunningException: if channel is not running """ - if not self.is_running: - raise ChannelNotRunningException() - assert self.is_running, "need to call .start(t) first" - if delta: - t += self.t - assert t >= self.t, "can only forward time, not go backwards" - - while self.cur_elem.has_finished_by(t): - self._update_cur_elem_in_stats(self.t, self.cur_elem.t_end) - self.t = self.cur_elem.t_end - self._finish_element_in_stats() # takes current self.t into account + with self._cur_elem_mutex: + if not self.is_running: + raise ChannelNotRunningException() + assert self.is_running, "need to call .start(t) first" + if delta: + t += self.t + assert t >= self.t, "can only forward time, not go backwards" - # important to update stats before so the gap sampling takes the updated values into account - self._move_to_next_element() - - self._update_cur_elem_in_stats(self.t, t) - self.t = t - - self.stats.check_consistent() + while self.cur_elem.has_finished_by(t): + self._update_elem_in_stats(self.cur_elem, self.t, self.cur_elem.t_end) + self.t = self.cur_elem.t_end + self._finish_elem_in_stats(self.cur_elem) # takes current self.t into account + + # important to update stats before so the gap sampling takes the updated values into account + self.cur_elem = self._move_to_next_elem(self.cur_elem) + self._start_elem_in_stats(self.cur_elem) + + self._update_elem_in_stats(self.cur_elem, self.t, t) + self.t = t + + self.stats.check_consistent() ###################### functions that terminate the current element in the channel and replace it by another one ##################### @@ -274,7 +282,7 @@ def run_mux_scan(self, t_duration: float, _starting_up: bool=False) -> bool: Args: t_duration: duration of mux scan starting from current time - _starting_up: for internal use, used when the channel is started + _starting_up: for internal use, used when the channel is 
started, mutex should already be held Returns: whether a read was rejected @@ -285,40 +293,41 @@ def run_mux_scan(self, t_duration: float, _starting_up: bool=False) -> bool: if not self.is_running and not _starting_up: raise ChannelNotRunningException() - assert t_duration >= 0 - elem_to_restore = None - read_was_rejected = False - - if isinstance(self.cur_elem, ChunkedRead): - # stop active read immediately - self._write_read(end_reason=ReadEndReason.MUX_SCAN_STARTED) - read_was_rejected = True - elif isinstance(self.cur_elem, (UnblockDelay, ShortGap)): - # end immediately - self.cur_elem.t_end = self.t - elif isinstance(self.cur_elem, LongGap): - # split gap into two at t_split, i.e. set self to [t_start, t_split] and return a new [t_split, t_end] - # have mux scan refer to it - elem_to_restore = self.cur_elem.split(self.t) - elif isinstance(self.cur_elem, MuxScan): - # modify t_end of current mux scan, same element to restore - self.cur_elem.t_end = self.t + t_duration - return False # don't add MuxScan again - elif isinstance(self.cur_elem, (NoReadLeftGap, ChannelBroken)): - # don't do anything - return False - else: - # beginning of channel, no element yet - assert self.cur_elem is None and _starting_up, f"unknown element type {type(self.cur_elem).__name__}" + with self._cur_elem_mutex if not _starting_up else contextlib.nullcontext(): # starting up -> mutex already held + assert t_duration >= 0 + elem_to_restore = None + read_was_rejected = False - # cur_elem is None when called right after start - if self.cur_elem is not None: - self._finish_element_in_stats() - - self.cur_elem = MuxScan(self.t, t_duration=t_duration, elem_to_restore=elem_to_restore) - self._start_cur_element_in_stats() - - return read_was_rejected + if isinstance(self.cur_elem, ChunkedRead): + # stop active read immediately + self._write_read(self.cur_elem, end_reason=ReadEndReason.MUX_SCAN_STARTED) + read_was_rejected = True + elif isinstance(self.cur_elem, (UnblockDelay, ShortGap)): + # 
end immediately + self.cur_elem.t_end = self.t + elif isinstance(self.cur_elem, LongGap): + # split gap into two at t_split, i.e. set self to [t_start, t_split] and return a new [t_split, t_end] + # have mux scan refer to it + elem_to_restore = self.cur_elem.split(self.t) + elif isinstance(self.cur_elem, MuxScan): + # modify t_end of current mux scan, same element to restore + self.cur_elem.t_end = self.t + t_duration + return False # don't add MuxScan again + elif isinstance(self.cur_elem, (NoReadLeftGap, ChannelBroken)): + # don't do anything + return False + else: + # beginning of channel, no element yet + assert self.cur_elem is None and _starting_up, f"unknown element type {type(self.cur_elem).__name__}" + + # cur_elem is None when called right after start + if self.cur_elem is not None: + self._finish_elem_in_stats(self.cur_elem) + + self.cur_elem = MuxScan(self.t, t_duration=t_duration, elem_to_restore=elem_to_restore) + self._start_elem_in_stats(self.cur_elem) + + return read_was_rejected def has_active_mux_scan(self) -> bool: return isinstance(self.cur_elem, MuxScan) @@ -337,22 +346,24 @@ def unblock(self, unblock_duration=None, end_reason=ReadEndReason.UNBLOCKED, rea Returns: UnblockResponse """ - # add unblock delay - if not self._write_read(end_reason=end_reason, read_id=read_id): - # read was not finished - return UnblockResponse.MISSED - - self._finish_element_in_stats() - - if unblock_duration is None: - unblock_duration = self.sim_params.default_unblock_duration - assert isinstance(unblock_duration, (int, float)) - - self.cur_elem = UnblockDelay(self.t, unblock_duration, self.cur_elem) - self._start_cur_element_in_stats() - return UnblockResponse.UNBLOCKED + with self._cur_elem_mutex: + cur_elem = self.cur_elem # for thread-safety + # add unblock delay + if not self._write_read(self.cur_elem, end_reason=end_reason, read_id=read_id): + # read was missed + return UnblockResponse.MISSED + + self._finish_elem_in_stats(cur_elem) + + if unblock_duration 
is None: + unblock_duration = self.sim_params.default_unblock_duration + assert isinstance(unblock_duration, (int, float)) + + self.cur_elem = UnblockDelay(self.t, unblock_duration, cur_elem) + self._start_elem_in_stats(self.cur_elem) # pass in new element! + return UnblockResponse.UNBLOCKED - def _write_read(self, end_reason, read_id=None) -> bool: + def _write_read(self, elem, end_reason, read_id=None) -> bool: """ Finish the current read right now (possibly early) by writing it (without changing stats!) @@ -363,13 +374,13 @@ def _write_read(self, end_reason, read_id=None) -> bool: Returns: whether read was missed or not """ - if not isinstance(self.cur_elem, ChunkedRead) or (read_id is not None and self.cur_elem.full_read_id != read_id): + if not isinstance(elem, ChunkedRead) or (read_id is not None and elem.full_read_id != read_id): # read no longer the current read self.stats.reads.number_rejected_missed += 1 return False # write read up to current time t only (not necessarily full read) - seq_record = self.cur_elem.finish(self.t, end_reason=end_reason) + seq_record = elem.finish(self.t, end_reason=end_reason) seq_record.description += f" ch={self.name}" if DONT_WRITE_ZERO_LENGTH_READS and len(seq_record.seq) == 0: logger.debug(f"Read with id '{seq_record.id}' had length 0, not writing it") @@ -386,6 +397,8 @@ def stop_receiving(self, read_id=None) -> StoppedReceivingResponse: Stop receiving chunks from current read with read_id. If the channel is not running, it is logged as a missed action. + + This method should not be called in parallel from several threads (but can be called along with other methods). 
Args: read_id: read id of read to unblock; if None, current read @@ -393,94 +406,105 @@ def stop_receiving(self, read_id=None) -> StoppedReceivingResponse: Returns: True if read was stopped, False if read was not found (no longer current read) """ - if not isinstance(self.cur_elem, ChunkedRead) or (read_id is not None and self.cur_elem.full_read_id != read_id): - # read no longer the current read - self.stats.reads.number_stop_receiving_missed += 1 - return StoppedReceivingResponse.MISSED - - if self.cur_elem.stop_receiving(): - # only count if read was not already stopped - assert self.stats.reads.cur_number == 1 - self.stats.reads.cur_number_stop_receiving += 1 - return StoppedReceivingResponse.STOPPED_RECEIVING - else: - return StoppedReceivingResponse.ALREADY_STOPPED_RECEIVING + with self._cur_elem_mutex: # for updating stats + cur_elem = self.cur_elem # for thread-safety + if not isinstance(cur_elem, ChunkedRead) or (read_id is not None and cur_elem.full_read_id != read_id): + # read no longer the current read + self.stats.reads.number_stop_receiving_missed += 1 # only method writing to this field + return StoppedReceivingResponse.MISSED + + if cur_elem.stop_receiving(): + # only count if read was not already stopped + assert self.stats.reads.cur_number == 1 + self.stats.reads.cur_number_stop_receiving += 1 # only method writing to this field + return StoppedReceivingResponse.STOPPED_RECEIVING + else: + return StoppedReceivingResponse.ALREADY_STOPPED_RECEIVING - def get_new_chunks(self): + def get_new_samples(self): """ - Get concatenation of new chunks of the current read. + Get new samples of the current read. - If the read was not set to stop receiving, no new chunks are returned. + If the read was set to stop receiving, no new samples are returned. 
Returns: - Tuple of (concatenated_new_chunks, read_id, estimated_ref_len_so_far) - If the read was set to stop_receiving, concatenated_new_chunks is "" + Tuple of (samples, read_id, estimated_ref_len_so_far) + If the read was set to stop_receiving, samples is "" If no read is active (e.g. read gap, not running), it returns ("", None, None) """ - if not isinstance(self.cur_elem, ChunkedRead): + + # we are not acquiring a lock because this method will be called in parallel to "forward" + cur_elem = self.cur_elem # for thread-safety + if not isinstance(cur_elem, ChunkedRead): # also works if channel is not running return ("", None, None) - chunks, read_id, estimated_ref_len_so_far = self.cur_elem.get_new_chunks(self.t) - self.stats.reads.number_bps_requested += len(chunks) + chunks, read_id, estimated_ref_len_so_far = cur_elem.get_new_samples(self.t) + self.stats.reads.number_bps_requested += len(chunks) # only method writing to this field, todo: writing racing condition return (chunks, read_id, estimated_ref_len_so_far) ##################### Channel statistics ##################### + # They all take elem as an argument of elem to make clear what they are modifying. + # They also modify the stats, so a lock should be acquired. 
- def _get_cur_elem_in_stats(self): + def _get_stats_for_elem(self, elem): """ Returns object to modify in stats given current element """ - if isinstance(self.cur_elem, ShortGap): + if isinstance(elem, ShortGap): return self.stats.short_gaps - elif isinstance(self.cur_elem, LongGap): + elif isinstance(elem, LongGap): return self.stats.long_gaps - elif isinstance(self.cur_elem, ChannelBroken): + elif isinstance(elem, ChannelBroken): return self.stats.channel_broken - elif isinstance(self.cur_elem, MuxScan): + elif isinstance(elem, MuxScan): return self.stats.mux_scans - elif isinstance(self.cur_elem, UnblockDelay): + elif isinstance(elem, UnblockDelay): return self.stats.unblock_delays - elif isinstance(self.cur_elem, ChunkedRead): + elif isinstance(elem, ChunkedRead): return self.stats.reads else: - assert isinstance(self.cur_elem, NoReadLeftGap), f"Encountered unknown element of class {self.cur_elem.__class__}" + assert isinstance(elem, NoReadLeftGap), f"Encountered unknown element of class {elem.__class__}" return self.stats.no_reads_left - def _start_cur_element_in_stats(self): + def _start_elem_in_stats(self, elem): """ Start current element in stats """ - self._get_cur_elem_in_stats().start() + self._get_stats_for_elem(elem).start() - def _update_cur_elem_in_stats(self, t_from, t_to): + def _update_elem_in_stats(self, elem, t_from, t_to): """ Add time to current element in stats - + Args: t_from, t_to: time interval [t_from, t_to] to add for current element """ kwargs = {} - if isinstance(self.cur_elem, ChunkedRead): - kwargs["nb_new_bps"] = self.cur_elem.nb_basepairs(t_to) - self.cur_elem.nb_basepairs(t_from) + if isinstance(elem, ChunkedRead): + kwargs["nb_new_bps"] = elem.actual_seq_length(t_to) - elem.actual_seq_length(t_from) # not thread-safe - self._get_cur_elem_in_stats().add_time(t_to - t_from, **kwargs) + self._get_stats_for_elem(elem).add_time(t_to - t_from, **kwargs) - def _finish_element_in_stats(self): + def _finish_elem_in_stats(self, elem): 
""" Finish current element (possibly prematurely) in stats """ kwargs = {} - if isinstance(self.cur_elem, ChunkedRead): - kwargs["nb_bps_rejected"] = self.cur_elem.nb_basepairs_full() - self.cur_elem.nb_basepairs(self.t) - kwargs["stopped_receiving"] = self.cur_elem.stopped_receiving - self._get_cur_elem_in_stats().finish(**kwargs) + if isinstance(elem, ChunkedRead): + kwargs["nb_bps_rejected"] = elem.full_seq_length() - elem.actual_seq_length(self.t) + kwargs["stopped_receiving"] = elem.stopped_receiving + self._get_stats_for_elem(elem).finish(**kwargs) if self.save_elems: - self.finished_elems.append(self.cur_elem) + self.finished_elems.append(elem) def plot(self, *args, **kwargs): - """Plot channels, only plots elements recorded while save_elems was set to True""" + """ + Plot channels, only plots elements recorded while save_elems was set to True + + Not thread-safe + """ return plot_channels([self], *args, **kwargs) #%% @@ -535,12 +559,12 @@ def plot_channels(channels: List[Channel], time_interval=None, ax=None, **plot_a elif isinstance(elem, MuxScan): color = "purple" offset = -0.05 - elif isinstance(elem, NoReadLeft): + elif isinstance(elem, NoReadLeftGap): color = "grey" offset = 0.02 elif isinstance(elem, ChannelBroken): - color = "darkgrey" - offset = 0.02 + color = "blue" + offset = 0.01 else: raise TypeError(elem) t_end = elem.t_end @@ -575,7 +599,7 @@ def plot_channels(channels: List[Channel], time_interval=None, ax=None, **plot_a Line2D(existing_point, existing_point, color='red', lw=2, label='long gap'), Line2D(existing_point, existing_point, color='purple', lw=2, label='mux scan'), Line2D(existing_point, existing_point, color='grey', lw=2, label='no read left'), - Line2D(existing_point, existing_point, color='darkgrey', lw=2, label='broken channel'), + Line2D(existing_point, existing_point, color='blue', lw=2, label='broken channel'), Line2D(existing_point, existing_point, color='black', lw=2, label='current time', linestyle="dotted"), ] 
ax.legend(handles=legend_elements, loc='center right') diff --git a/src/simreaduntil/simulator/channel_element.py b/src/simreaduntil/simulator/channel_element.py index 62cb96f..39a3451 100644 --- a/src/simreaduntil/simulator/channel_element.py +++ b/src/simreaduntil/simulator/channel_element.py @@ -6,6 +6,7 @@ import enum from numbers import Number from functools import cached_property +from threading import Lock from typing import Any, List, Optional, Tuple, Union from Bio import SeqIO from Bio.Seq import Seq @@ -170,7 +171,7 @@ def estimate_ref_len(orig_ref_len, orig_seq_len, new_seq_len): # StrEnum does not exist yet in Python3.8, see PythonDoc for IntEnum for this recipe # allows printing as "field" instead of "class.field", where class is a class derived from enum class ReadEndReason(str, enum.Enum): - """Reason why a read ended""" + """Reason why a read ended, stop receiving is not part of it""" # read ended prematurely SIM_STOPPED = "sim_stopped_unblocked" # simulation was stopped while read was still in-progress @@ -191,7 +192,7 @@ class ReadTags(str, enum.Enum): """ Tags to attach to a read, multiple are possible! """ - RU_NEVER_REQUESTED = "never_requested" # never requested by ReadUntil + RU_NEVER_REQUESTED = "never_requested" # never returned data (may have been requested, but data length below chunk size) RU_STOPPED_RECEIVING = "stopped_receiving" # read was set to stop_receiving class ChunkedRead(ChannelElement): @@ -201,8 +202,8 @@ class ChunkedRead(ChannelElement): Chunks can be received from the read, it can be ended prematurely with .finish(), the sequence record or sequence summary record can be retrieved. If the read has a NanoSim read id, its id is modified to reflect the estimated reference length if it is ended prematurely. - A read with n basepairs starts at t_start + t_delay, goes to time n*dt and the ith basepair (i>=1) is read after time i*dt, where dt=1/bp_per_second. - Basepairs are returned in chunks of size chunk_size. 
+ A read with n basepairs/signals starts at t_start + t_delay, goes to time n*dt and the ith basepair (i>=1) is read after time i*dt, where dt=1/bp_per_second. + Basepairs/signals are returned in chunks of size at least min_chunk_size. t_start, t_delay, t_end, t_duration should not be modified once this class was instantiated. A read can be terminated early by calling .finish_now(). @@ -210,16 +211,16 @@ class ChunkedRead(ChannelElement): read_id: id of read seq: read sequence t_start: time at which the read starts - read_speed: speed at which the read is read, in basepairs per second, defaults to SIM_PARAMS.bp_per_second - chunk_size: size of chunks that .get_new_chunks() returns, defaults to SIM_PARAMS.chunk_size - t_delay: delay before read starts (template_start - read_start), 0 basepairs are read during this delay, end time is shifted accordingly + read_speed: speed at which the read is read, in basepairs/signals per second, defaults to SIM_PARAMS.bp_per_second + min_chunk_size: minimum size of chunks that .get_new_chunks() returns, defaults to SIM_PARAMS.min_chunk_size + t_delay: delay before read starts (template_start - read_start), 0 basepairs/signals are read during this delay, end time is shifted accordingly """ - def __init__(self, read_id: str, seq: str, t_start: Number, read_speed: Number=None, chunk_size: Number=None, t_delay:float = 0): + def __init__(self, read_id: str, seq: str, t_start: Number, read_speed: Number=None, min_chunk_size: Number=None, t_delay: float = 0): # copy params in case they change while the read is in-progress assert read_speed > 0 - assert chunk_size > 0 + assert min_chunk_size > 0 self._read_speed = read_speed - self._chunk_size = chunk_size + self._min_chunk_size = min_chunk_size super().__init__(t_start, len(seq) / self._read_speed + t_delay) self.full_read_id = read_id @@ -228,6 +229,7 @@ def __init__(self, read_id: str, seq: str, t_start: Number, read_speed: Number=N assert t_delay >= 0 self._t_delay = t_delay + # 
self._ref_len: length of sequence on reference sequence, seq can be shorter/longer due to indels, constant if NanoSimId.is_valid(read_id): # whether the read id is from NanoSim -> read id will be changed when read is terminated early self._nanosim_read_id = NanoSimId.from_str(read_id) @@ -238,65 +240,77 @@ def __init__(self, read_id: str, seq: str, t_start: Number, read_speed: Number=N self._ref_len = len(self._full_seq) self.stopped_receiving = False # whether to receive chunks from the read - self._next_chunk_idx = 0 # start idx of next chunks to return - self.end_reason = None # action used to finish read + self._num_samples_returned = 0 # number of samples returned so far + self.end_reason : Optional[ReadEndReason] = None # action used to finish read (excluding stop receiving!) + + # lock to ensure get_new_samples is not called in parallel, other methods reading from self._num_samples_returned + # can only assume the value is valid, but it may have changed when accessing it again, see + # https://docs.python.org/3/faq/library.html#what-kinds-of-global-value-mutation-are-thread-safe + # this applies to self.end_reason, self._num_samples_returned, self.stopped_receiving + self._get_new_samples_lock = Lock() def __repr__(self): - return f"ChunkedRead '{self.full_read_id}': '{self._full_seq}' between [{self.t_start}, {self.t_end}], num chunks returned: {self._next_chunk_idx}, end_reason: {self.end_reason}" + return f"ChunkedRead '{self.full_read_id}': '{self._full_seq}' between [{self.t_start}, {self.t_end}], num samples returned: {self._num_samples_returned}, end_reason: {self.end_reason}" def __eq__(self, other) -> bool: assert isinstance(other, ChunkedRead) return self.t_start == other.t_start and self.t_duration == other.t_duration \ and self._t_delay == other._t_delay \ and self.full_read_id == other.full_read_id and self._full_seq == other._full_seq and self._read_speed == other._read_speed \ - and self._chunk_size == other._chunk_size and 
self.stopped_receiving == other.stopped_receiving \ - and self._next_chunk_idx == other._next_chunk_idx and self.end_reason == other.end_reason - - @property - def _nb_chunks(self): - """Number of chunks the read is divided into""" - return (len(self._full_seq) + self._chunk_size - 1) // self._chunk_size # round up + and self._min_chunk_size == other._min_chunk_size and self.stopped_receiving == other.stopped_receiving \ + and self._num_samples_returned == other._num_samples_returned and self.end_reason == other.end_reason def full_duration(self) -> Number: return self._t_delay + len(self._full_seq) / self._read_speed + def full_seq_length(self): + """number of basepairs/signals of full read""" + return len(self._full_seq) + + def num_samples_returned(self): + """Number of basepairs/signals that were returned so far (if stopped receiving or if not getting chunks)""" + return self._num_samples_returned + @property - def _has_received_chunks(self): - """Whether at least one non-empty chunk was returned (via get_new_chunks)""" - return self._next_chunk_idx > 0 + def _has_received_data(self): + """Whether at least one new sample was returned (via get_new_samples)""" + return self._num_samples_returned > 0 - def all_chunks_consumed(self) -> bool: - """Whether all chunks have been consumed, i.e. read has finished""" - return self._next_chunk_idx >= self._nb_chunks + def all_samples_consumed(self) -> bool: + """Whether read has been fully read (so also has finished)""" + return self._num_samples_returned >= self.full_seq_length() - @cached_property # lazy - def _chunk_end_positions(self): + def _estimate_ref_len(self, read_seq_length): """ - End positions of the chunks (cumulative chunk lengths), exclusive + Estimate reference length given number of basepairs/signals read, not exact due to indels - cached because they may not be needed if the simulation passes over the read (because time forwarded a lot). 
- """ - cum_lens = [(i+1)*self._chunk_size for i in range(self._nb_chunks)] - cum_lens[-1] = len(self._full_seq) - return cum_lens - - def _get_chunks(self, fr: int, to: int): - """Get concatenated chunks [from, to)""" - return self._full_seq[fr * self._chunk_size:to * self._chunk_size] - - def _estimate_ref_len(self, nb_bps_read): - """ - Estimate reference length given number of basepairs read, not exact due to indels + If it is a NanoSim read, it also removes the head or tail length Requires correct estimation of ref length of full read (which is available for NanoSim eads) """ # round rather than round down (with int()) - assert nb_bps_read <= len(self._full_seq) - return estimate_ref_len(orig_ref_len=self._ref_len, orig_seq_len=len(self._full_seq), new_seq_len=nb_bps_read) + full_len = self.full_seq_length() + assert read_seq_length <= full_len + if self._nanosim_read_id: + # remove head and tail length + head_len, tail_len = self._nanosim_read_id.head_len, self._nanosim_read_id.tail_len + assert head_len + tail_len <= full_len + if self._nanosim_read_id.direction == "R": + # swap + head_len, tail_len = tail_len, head_len + full_len_no_flanking = full_len - (head_len + tail_len) + # constrain (read_seq_length - head_len) to interval [0, full_len_no_flanking] + read_seq_length = min( + max(0, read_seq_length - head_len), + full_len_no_flanking + ) + else: + full_len_no_flanking = full_len + return estimate_ref_len(orig_ref_len=self._ref_len, orig_seq_len=full_len_no_flanking, new_seq_len=read_seq_length) - def nb_basepairs(self, t: Number): + def actual_seq_length(self, t: Number): """ - Number of basepairs of read up to time t, first basepair emitted at time t_start + Number of basepairs/signals of read up to time t, first basepair emitted at time t_start If .has_finished_by(t) returns True and the full read was read or .finish() not yet called, it is guaranteed to return the full length of the @@ -305,22 +319,14 @@ def nb_basepairs(self, t: Number): 
real_start = self.t_start + self._t_delay if t < real_start: return 0 - if (self.has_finished_by(t) and self.end_reason in [None, ReadEndReason.READ_ENDED_NORMALLY]): - # special case due to floating point problem with addition + if self.has_finished_by(t) and self.end_reason in [None, ReadEndReason.READ_ENDED_NORMALLY]: + # special case due to floating point precision loss with addition, when read ended normally, make sure to return full length when t>=t_end return len(self._full_seq) return min( len(self._full_seq), int((min(t, self.t_end) - real_start) * self._read_speed) # round down ) - def nb_basepairs_full(self): - """number of basepairs of full read""" - return len(self._full_seq) - - def nb_basepairs_returned(self): - """Number of basepairs not yet returned (if stopped receiving or if not getting chunks)""" - return min(len(self._full_seq), self._next_chunk_idx * self._chunk_size) - def finish(self, t=None, end_reason: Optional[ReadEndReason]=None) -> SeqIO.SeqRecord: """ Finish read by time t, updating t_end @@ -329,18 +335,17 @@ def finish(self, t=None, end_reason: Optional[ReadEndReason]=None) -> SeqIO.SeqR Arguments: t: time when read ends, read is ended prematurely if less than full read end; full read if None or t exceeds full read end - end_reason: action that caused read to be written to file + end_reason: action that caused read to finish, must be 'valid' if t is not None and before end of full read Returns: SeqRecord of read that can be written to fasta file """ - assert self.end_reason is None, f"already ended read with action {self.end_reason}" + assert self.end_reason is None, f"trying to end with {end_reason}, but already ended read with action {self.end_reason}" if t is not None: if not self.has_finished_by(t): assert end_reason in [ReadEndReason.UNBLOCKED, ReadEndReason.MUX_SCAN_STARTED, ReadEndReason.SIM_STOPPED], "end reason must be set" - nb_bps_returned = self.nb_basepairs_returned() - assert t >= self.t_start + self._t_delay * 
(nb_bps_returned > 0) + nb_bps_returned / self._read_speed, "cannot finish earlier than last returned chunk" + assert t >= self.t_start + self._t_delay * (self._num_samples_returned > 0) + self._num_samples_returned / self._read_speed, "cannot finish earlier than last returned chunk" self.t_end = min(self.t_end, t) # t_end contains end time of read now @@ -366,10 +371,10 @@ def get_seq_record(self): if self.end_reason == ReadEndReason.READ_ENDED_NORMALLY: seq = self._full_seq else: - seq = self._full_seq[:self.nb_basepairs(self.t_end)] + seq = self._full_seq[:self.actual_seq_length(self.t_end)] if self._nanosim_read_id is not None and (len(seq) < len(self._full_seq)): # adapt reference length, as read was stopped early - actual_ref_len = self._estimate_ref_len(self.nb_basepairs(self.t_end)) + actual_ref_len = self._estimate_ref_len(self.actual_seq_length(self.t_end)) self._nanosim_read_id.change_ref_len(actual_ref_len) # if this method is called again, the read id will not change again because the ref len is the same adapted_read_id = str(self._nanosim_read_id) @@ -377,13 +382,17 @@ def get_seq_record(self): read_tags = [] if self.stopped_receiving: read_tags.append(ReadTags.RU_STOPPED_RECEIVING) - if not self._has_received_chunks: + if not self._has_received_data: read_tags.append(ReadTags.RU_NEVER_REQUESTED) # append full sequence length (in case read was unblocked) - description = f"full_seqlen={self.nb_basepairs_full()} t_start={self.t_start} t_end={self.t_end} t_delay={self._t_delay} ended={self.end_reason} tags={','.join(read_tags)} full_read_id={self.full_read_id}" - return SeqIO.SeqRecord(Seq(seq), id=adapted_read_id, description=description) + description = f"full_seqlen={self.full_seq_length()} t_start={self.t_start} t_end={self.t_end} t_delay={self._t_delay} ended={self.end_reason} tags={','.join(read_tags)} full_read_id={self.full_read_id}" + return SeqIO.SeqRecord(Seq(seq if self.is_nucleotide_seq(seq) else ""), id=adapted_read_id, 
description=description) + @staticmethod + def is_nucleotide_seq(seq): + """Whether the read is a nucleotide sequence""" + return isinstance(seq, str) SEQ_SUMMARY_HEADER = ["read_id", "channel", "mux", "start_time", "duration", "passes_filtering", "template_start", "template_duration", "sequence_length_template", "end_reason"] def get_seq_summary_record(self) -> Optional[List[str]]: @@ -391,19 +400,19 @@ def get_seq_summary_record(self) -> Optional[List[str]]: Get list of entries for sequence summary file, see SEQ_SUMMARY_HEADER for field names Returns: - list of string entries, or None if read has no basepairs (e.g. due to delay) + list of string entries, or None if read has no basepairs/signals (e.g. due to delay) """ # read_id, channel, mux, start_time, duration, passes_filtering, template_start, template_duration, sequence_length_template, end_reason mux = 1 passes_filtering = "True" template_duration = self.t_duration - self._t_delay - nb_bps_read = self.nb_basepairs(self.t_end) - if nb_bps_read <= 0: + read_seq_length = self.actual_seq_length(self.t_end) + if read_seq_length <= 0: return None return [ self.full_read_id, self.channel, mux, self.t_start, self.t_duration, passes_filtering, self.t_start + self._t_delay, template_duration, - nb_bps_read, self.end_reason + read_seq_length, self.end_reason ] def stop_receiving(self, value=True) -> bool: @@ -422,29 +431,33 @@ def stop_receiving(self, value=True) -> bool: self.stopped_receiving = value return True - def get_new_chunks(self, t: Number) -> Tuple[str, str, Optional[int]]: + def get_new_samples(self, t: Number) -> Tuple[str, str, Optional[int]]: """ - Get new read chunks up to time <= t, only new data since last call to this function + Get new read samples up to time <= t, only new data since last call to this function + + Implicitly forwards to time t - Implicitly forwards to time t (choosing chunk index of chunk containing t) + Still works when other methods are called in parallel (e.g. 
finish, stop_receiving), + though be careful about interpreting the results if it is finished before the time t provided here Returns: - (all chunks concatenated, read_id, estimated_ref_len_so_far). + (samples, read_id, estimated_ref_len_so_far). - Empty chunks "" if stop_receiving is True - estimated_ref_len_so_far is the estimated number of basepairs covered by chunks returned so far - """ - assert self.end_reason is None, f"already ended read with action {self.end_reason}" - - if self.stopped_receiving: - return "", self.full_read_id, self._estimate_ref_len(self.nb_basepairs_returned()) - - # choose chunk index of chunk containing t - next_chunk_idx = np.searchsorted(self._chunk_end_positions, v=self.nb_basepairs(t), side='right') # index such that a[i-1] <= v < a[i] - old_next_chunk_idx = self._next_chunk_idx - self._next_chunk_idx = next_chunk_idx - estimated_ref_len_so_far = self._estimate_ref_len(self.nb_basepairs_returned()) # takes into account _next_chunk_idx - return self._get_chunks(old_next_chunk_idx, self._next_chunk_idx), self.full_read_id, estimated_ref_len_so_far + Empty samples "" if stop_receiving is True, if the read was already finished (end_reason set), or if fewer than _min_chunk_size new samples are available + estimated_ref_len_so_far is the estimated number of basepairs/signals covered by samples returned so far + """ + if self.stopped_receiving or self.end_reason is not None: # may be set in parallel + return "", self.full_read_id, self._estimate_ref_len(self._num_samples_returned) + + with self._get_new_samples_lock: + old_num_samples_returned = self._num_samples_returned + new_num_samples_returned = self.actual_seq_length(t) # depends on self.end_reason which may be set in parallel, this is okay + if new_num_samples_returned < old_num_samples_returned + self._min_chunk_size: + return "", self.full_read_id, self._estimate_ref_len(self._num_samples_returned) + + self._num_samples_returned = new_num_samples_returned + estimated_ref_len_so_far = self._estimate_ref_len(new_num_samples_returned) # based on the updated _num_samples_returned + return
self._full_seq[old_num_samples_returned:new_num_samples_returned], self.full_read_id, estimated_ref_len_so_far class ReadDescriptionParser: diff --git a/src/simreaduntil/simulator/channel_stats.py b/src/simreaduntil/simulator/channel_stats.py index 3410590..a9e8fa2 100644 --- a/src/simreaduntil/simulator/channel_stats.py +++ b/src/simreaduntil/simulator/channel_stats.py @@ -367,6 +367,7 @@ def channel_stats_to_df(channel_stats: List[ChannelStats]): df = pd.DataFrame( [( + channel, channel.short_gaps.finished_number, channel.short_gaps.time_spent, channel.long_gaps.finished_number, channel.long_gaps.time_spent, channel.unblock_delays.finished_number, channel.unblock_delays.time_spent, @@ -377,6 +378,7 @@ def channel_stats_to_df(channel_stats: List[ChannelStats]): channel.reads.fin_number_rejected, channel.reads.number_rejected_missed, channel.reads.number_stop_receiving_missed, channel.no_reads_left.finished_number, channel.no_reads_left.time_spent ) for channel in channel_stats], columns=[ + 'channel', 'short_gaps_finished_number', 'short_gaps_time_spent', 'long_gaps_finished_number', 'long_gaps_time_spent', 'unblock_delays_finished_number', 'unblock_delays_time_spent', 'mux_scans_finished_number', 'mux_scans_time_spent', 'channel_broken_finished_number', 'channel_broken_time_spent', 'reads_finished_number', 'reads_time_spent', 'reads_number_bps_requested', 'reads_number_bps_read', 'reads_number_bps_rejected', @@ -389,7 +391,7 @@ def channel_stats_to_df(channel_stats: List[ChannelStats]): return df def plot_read_stats_per_channel(df, save_dir=None): - """Plot stats about reads per channel""" + """Plot stats about reads per channel, df coming from channel_stats_to_df""" df_reads = df[[col for col in df.columns if col.startswith("reads_")]].copy() df_reads["reads_number_bps_notrequested"] = df_reads["reads_number_bps_read"] - df_reads["reads_number_bps_requested"] # silently read, but never requested by ReadUntil diff --git 
a/src/simreaduntil/simulator/gap_sampling/constant_gaps_until_blocked.py b/src/simreaduntil/simulator/gap_sampling/constant_gaps_until_blocked.py index 567f5bb..da02153 100644 --- a/src/simreaduntil/simulator/gap_sampling/constant_gaps_until_blocked.py +++ b/src/simreaduntil/simulator/gap_sampling/constant_gaps_until_blocked.py @@ -18,6 +18,9 @@ class ConstantGapsUntilBlocked(GapSampler): """ Chooses short and long gaps of constant length, where a long gap is chosen with some probability, until the channel is blocked. + + Called constant_gaps in the paper + Args: short_gap_length: length of short gaps long_gap_length: length of long gaps diff --git a/src/simreaduntil/simulator/gap_sampling/gap_sampler_per_window_until_blocked.py b/src/simreaduntil/simulator/gap_sampling/gap_sampler_per_window_until_blocked.py index 21c1a79..b5db4e5 100644 --- a/src/simreaduntil/simulator/gap_sampling/gap_sampler_per_window_until_blocked.py +++ b/src/simreaduntil/simulator/gap_sampling/gap_sampler_per_window_until_blocked.py @@ -44,6 +44,8 @@ class GapSamplerPerWindowUntilBlocked(GapSampler): """ Gap sampler that has separate sample distributions per time window and eventually blocks. + Called window_all_channels in the paper + Args: short_gaps_per_window: array of short gaps for each time window long_gaps_per_window: array of long gaps for each time window @@ -101,7 +103,7 @@ def from_seqsum_df(cls, seqsum_df, read_delay=None, time_and_aggregation_windows read_delay: delay between read starting and first bp being read; if None, compute median read delay time_and_aggregation_windows: tuple of array of time windows (window_start, window_end) and array of data aggregation windows (window_start, window_end); - if None, use windows with 50% overlap, i.e. [t, t+4] window with data from [t-2, t+6] + if None, use 4h windows with 50% overlap, i.e. 
[t, t+4] window with data from [t-2, t+6] Returns: function to create a gap sampler, so it is flexible with respect to the number of channels diff --git a/src/simreaduntil/simulator/gap_sampling/gap_sampling.py b/src/simreaduntil/simulator/gap_sampling/gap_sampling.py index 502fbf4..3b343e1 100644 --- a/src/simreaduntil/simulator/gap_sampling/gap_sampling.py +++ b/src/simreaduntil/simulator/gap_sampling/gap_sampling.py @@ -107,6 +107,8 @@ class RandomGapSampler(GapSampler): Gap sampler with random gap lengths, for testing mostly Channel breaks with some probability + + Called random_gaps in the paper """ def __init__(self, prob_long_gap=0.5) -> None: super().__init__() diff --git a/src/simreaduntil/simulator/gap_sampling/inactive_active_gaps_replication.py b/src/simreaduntil/simulator/gap_sampling/inactive_active_gaps_replication.py index f35986d..985d728 100644 --- a/src/simreaduntil/simulator/gap_sampling/inactive_active_gaps_replication.py +++ b/src/simreaduntil/simulator/gap_sampling/inactive_active_gaps_replication.py @@ -213,6 +213,8 @@ class SingleChannelInactiveActiveReplicator(GapSampler): Whenever an inactive period finishes, you must call mark_long_gap_end. This moves to the next (active) period. Within an active period, the short gaps are recycled from the observed short gaps within the active period. It ignores the time spent in mux scans. + + Called gap_replication in the paper """ def __init__(self, inactive_active_periods_tracker: ChannelInactiveActivePeriodsTracker, read_delay) -> None: super().__init__() diff --git a/src/simreaduntil/simulator/gap_sampling/rolling_window_gap_sampler.py b/src/simreaduntil/simulator/gap_sampling/rolling_window_gap_sampler.py index f1bb184..59a6690 100644 --- a/src/simreaduntil/simulator/gap_sampling/rolling_window_gap_sampler.py +++ b/src/simreaduntil/simulator/gap_sampling/rolling_window_gap_sampler.py @@ -16,7 +16,6 @@ class RollingWindowGapSampler(GapSampler): """ - Rolling window gap sampler. 
Takes into account the whole gaps, so it mixes channels. Args: gaps_df: pd.DataFrame with columns "gap_start", "gap_end", "gap_duration", "gap_type" (long or short) @@ -56,6 +55,7 @@ def modify_args(*, gaps_df, **kwargs): @classmethod def from_seqsum_df(cls, seqsum_df, read_delay=None, long_gap_threshold=None, window_width=None): + """mixes gaps from all channels""" if read_delay is None: read_delay = compute_median_read_delay(seqsum_df) @@ -145,6 +145,8 @@ class RollingWindowGapSamplerPerChannel: For each channel, it samples the corresponding channel in the originak dataset + Called rolling_window_per_channel in the paper + """ def __init__(self): pass diff --git a/src/simreaduntil/simulator/protos/ont_device.proto b/src/simreaduntil/simulator/protos/ont_device.proto index 85d2cef..5ea795a 100644 --- a/src/simreaduntil/simulator/protos/ont_device.proto +++ b/src/simreaduntil/simulator/protos/ont_device.proto @@ -15,7 +15,7 @@ service ONTDevice { rpc GetMKRunDir(EmptyRequest) returns (MKRunDirResponse) {} // request actions to perform on channels - rpc PerformActions(ReadActionsRequest) returns (ActionResultImmediateResponse) {} + rpc PerformActions(ReadActionsRequest) returns (EmptyResponse) {} // get new chunks rpc GetBasecalledChunks(BasecalledChunksRequest) returns (stream BasecalledReadChunkResponse) {} // get action results (only those that were not yet received) @@ -25,7 +25,7 @@ service ONTDevice { rpc StartSim(StartRequest) returns (BoolResponse) {} // stop simulation, returns whether it succeeded (i.e. 
if simulation was running) rpc StopSim(EmptyRequest) returns (BoolResponse) {} - rpc RunMuxScan(RunMuxScanRequest) returns (MuxScanStartedInfo) {} + rpc RunMuxScan(RunMuxScanRequest) returns (RunMuxScanResponse) {} // whether simulation is running rpc IsRunning(EmptyRequest) returns (BoolResponse) {} @@ -69,10 +69,6 @@ message ReadActionsRequest { repeated Action actions = 1; } -message ActionResultImmediateResponse { - repeated bool succeeded = 1; -} - message ActionResultsRequest { bool clear = 1; // whether to clear the action results after getting them } @@ -100,7 +96,7 @@ message RunMuxScanRequest { double t_duration = 1; } -message MuxScanStartedInfo { +message RunMuxScanResponse { uint32 nb_reads_rejected = 1; } diff --git a/src/simreaduntil/simulator/protos_generated/ont_device_pb2.py b/src/simreaduntil/simulator/protos_generated/ont_device_pb2.py index 46bc1dc..b064db8 100644 --- a/src/simreaduntil/simulator/protos_generated/ont_device_pb2.py +++ b/src/simreaduntil/simulator/protos_generated/ont_device_pb2.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- # Generated by the protocol buffer compiler. DO NOT EDIT! 
# source: ont_device.proto +# Protobuf Python Version: 4.25.0 """Generated protocol buffer code.""" from google.protobuf import descriptor as _descriptor from google.protobuf import descriptor_pool as _descriptor_pool @@ -13,13 +14,12 @@ -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x10ont_device.proto\x12\tontdevice\"\x0e\n\x0c\x45mptyRequest\"\x0f\n\rEmptyResponse\"\'\n\x12ServerInfoResponse\x12\x11\n\tunique_id\x18\x01 \x01(\t\"&\n\x10MKRunDirResponse\x12\x12\n\nmk_run_dir\x18\x01 \x01(\t\"\xe2\x02\n\x12ReadActionsRequest\x12\x35\n\x07\x61\x63tions\x18\x01 \x03(\x0b\x32$.ontdevice.ReadActionsRequest.Action\x1a\x94\x02\n\x06\x41\x63tion\x12\x0f\n\x07\x63hannel\x18\x01 \x01(\r\x12\x0f\n\x07read_id\x18\x02 \x01(\t\x12\x45\n\x07unblock\x18\x03 \x01(\x0b\x32\x32.ontdevice.ReadActionsRequest.Action.UnblockActionH\x00\x12U\n\x11stop_further_data\x18\x04 \x01(\x0b\x32\x38.ontdevice.ReadActionsRequest.Action.StopReceivingActionH\x00\x1a\x15\n\x13StopReceivingAction\x1a)\n\rUnblockAction\x12\x18\n\x10unblock_duration\x18\x01 \x01(\x01\x42\x08\n\x06\x61\x63tion\"2\n\x1d\x41\x63tionResultImmediateResponse\x12\x11\n\tsucceeded\x18\x01 \x03(\x08\"%\n\x14\x41\x63tionResultsRequest\x12\r\n\x05\x63lear\x18\x01 \x01(\x08\"k\n\x14\x41\x63tionResultResponse\x12\x0f\n\x07read_id\x18\x01 \x01(\t\x12\x0c\n\x04time\x18\x02 \x01(\x01\x12\x0f\n\x07\x63hannel\x18\x03 \x01(\r\x12\x13\n\x0b\x61\x63tion_type\x18\x04 \x01(\r\x12\x0e\n\x06result\x18\x05 \x01(\r\"r\n\x0cStartRequest\x12\x1b\n\x13\x61\x63\x63\x65leration_factor\x18\x01 \x01(\x01\x12\x15\n\rupdate_method\x18\x02 \x01(\t\x12\x14\n\x0clog_interval\x18\x03 \x01(\r\x12\x18\n\x10stop_if_no_reads\x18\x04 \x01(\x08\"\'\n\x11RunMuxScanRequest\x12\x12\n\nt_duration\x18\x01 \x01(\x01\"/\n\x12MuxScanStartedInfo\x12\x19\n\x11nb_reads_rejected\x18\x01 \x01(\r\"\xad\x01\n\x17\x42\x61secalledChunksRequest\x12\x17\n\nbatch_size\x18\x01 \x01(\rH\x00\x88\x01\x01\x12\x42\n\x08\x63hannels\x18\x03 
\x01(\x0b\x32+.ontdevice.BasecalledChunksRequest.ChannelsH\x01\x88\x01\x01\x1a\x19\n\x08\x43hannels\x12\r\n\x05value\x18\x02 \x03(\rB\r\n\x0b_batch_sizeB\x0b\n\t_channels\"\x83\x01\n\x1b\x42\x61secalledReadChunkResponse\x12\x0f\n\x07\x63hannel\x18\x01 \x01(\r\x12\x0f\n\x07read_id\x18\x02 \x01(\t\x12\x0b\n\x03seq\x18\x03 \x01(\t\x12\x13\n\x0bquality_seq\x18\x04 \x01(\t\x12 \n\x18\x65stimated_ref_len_so_far\x18\x05 \x01(\r\"\x1d\n\x0c\x42oolResponse\x12\r\n\x05value\x18\x01 \x01(\x08\"6\n\x12\x44\x65viceInfoResponse\x12\x0c\n\x04info\x18\x01 \x01(\t\x12\x12\n\nn_channels\x18\x02 \x01(\r2\x93\x06\n\tONTDevice\x12I\n\rGetServerInfo\x12\x17.ontdevice.EmptyRequest\x1a\x1d.ontdevice.ServerInfoResponse\"\x00\x12\x45\n\x0bGetMKRunDir\x12\x17.ontdevice.EmptyRequest\x1a\x1b.ontdevice.MKRunDirResponse\"\x00\x12[\n\x0ePerformActions\x12\x1d.ontdevice.ReadActionsRequest\x1a(.ontdevice.ActionResultImmediateResponse\"\x00\x12\x65\n\x13GetBasecalledChunks\x12\".ontdevice.BasecalledChunksRequest\x1a&.ontdevice.BasecalledReadChunkResponse\"\x00\x30\x01\x12X\n\x10GetActionResults\x12\x1f.ontdevice.ActionResultsRequest\x1a\x1f.ontdevice.ActionResultResponse\"\x00\x30\x01\x12>\n\x08StartSim\x12\x17.ontdevice.StartRequest\x1a\x17.ontdevice.BoolResponse\"\x00\x12=\n\x07StopSim\x12\x17.ontdevice.EmptyRequest\x1a\x17.ontdevice.BoolResponse\"\x00\x12K\n\nRunMuxScan\x12\x1c.ontdevice.RunMuxScanRequest\x1a\x1d.ontdevice.MuxScanStartedInfo\"\x00\x12?\n\tIsRunning\x12\x17.ontdevice.EmptyRequest\x1a\x17.ontdevice.BoolResponse\"\x00\x12I\n\rGetDeviceInfo\x12\x17.ontdevice.EmptyRequest\x1a\x1d.ontdevice.DeviceInfoResponse\"\x00\x62\x06proto3') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x10ont_device.proto\x12\tontdevice\"\x0e\n\x0c\x45mptyRequest\"\x0f\n\rEmptyResponse\"\'\n\x12ServerInfoResponse\x12\x11\n\tunique_id\x18\x01 \x01(\t\"&\n\x10MKRunDirResponse\x12\x12\n\nmk_run_dir\x18\x01 \x01(\t\"\xe2\x02\n\x12ReadActionsRequest\x12\x35\n\x07\x61\x63tions\x18\x01 
\x03(\x0b\x32$.ontdevice.ReadActionsRequest.Action\x1a\x94\x02\n\x06\x41\x63tion\x12\x0f\n\x07\x63hannel\x18\x01 \x01(\r\x12\x0f\n\x07read_id\x18\x02 \x01(\t\x12\x45\n\x07unblock\x18\x03 \x01(\x0b\x32\x32.ontdevice.ReadActionsRequest.Action.UnblockActionH\x00\x12U\n\x11stop_further_data\x18\x04 \x01(\x0b\x32\x38.ontdevice.ReadActionsRequest.Action.StopReceivingActionH\x00\x1a\x15\n\x13StopReceivingAction\x1a)\n\rUnblockAction\x12\x18\n\x10unblock_duration\x18\x01 \x01(\x01\x42\x08\n\x06\x61\x63tion\"%\n\x14\x41\x63tionResultsRequest\x12\r\n\x05\x63lear\x18\x01 \x01(\x08\"k\n\x14\x41\x63tionResultResponse\x12\x0f\n\x07read_id\x18\x01 \x01(\t\x12\x0c\n\x04time\x18\x02 \x01(\x01\x12\x0f\n\x07\x63hannel\x18\x03 \x01(\r\x12\x13\n\x0b\x61\x63tion_type\x18\x04 \x01(\r\x12\x0e\n\x06result\x18\x05 \x01(\r\"r\n\x0cStartRequest\x12\x1b\n\x13\x61\x63\x63\x65leration_factor\x18\x01 \x01(\x01\x12\x15\n\rupdate_method\x18\x02 \x01(\t\x12\x14\n\x0clog_interval\x18\x03 \x01(\r\x12\x18\n\x10stop_if_no_reads\x18\x04 \x01(\x08\"\'\n\x11RunMuxScanRequest\x12\x12\n\nt_duration\x18\x01 \x01(\x01\"/\n\x12RunMuxScanResponse\x12\x19\n\x11nb_reads_rejected\x18\x01 \x01(\r\"\xad\x01\n\x17\x42\x61secalledChunksRequest\x12\x17\n\nbatch_size\x18\x01 \x01(\rH\x00\x88\x01\x01\x12\x42\n\x08\x63hannels\x18\x03 \x01(\x0b\x32+.ontdevice.BasecalledChunksRequest.ChannelsH\x01\x88\x01\x01\x1a\x19\n\x08\x43hannels\x12\r\n\x05value\x18\x02 \x03(\rB\r\n\x0b_batch_sizeB\x0b\n\t_channels\"\x83\x01\n\x1b\x42\x61secalledReadChunkResponse\x12\x0f\n\x07\x63hannel\x18\x01 \x01(\r\x12\x0f\n\x07read_id\x18\x02 \x01(\t\x12\x0b\n\x03seq\x18\x03 \x01(\t\x12\x13\n\x0bquality_seq\x18\x04 \x01(\t\x12 \n\x18\x65stimated_ref_len_so_far\x18\x05 \x01(\r\"\x1d\n\x0c\x42oolResponse\x12\r\n\x05value\x18\x01 \x01(\x08\"6\n\x12\x44\x65viceInfoResponse\x12\x0c\n\x04info\x18\x01 \x01(\t\x12\x12\n\nn_channels\x18\x02 
\x01(\r2\x83\x06\n\tONTDevice\x12I\n\rGetServerInfo\x12\x17.ontdevice.EmptyRequest\x1a\x1d.ontdevice.ServerInfoResponse\"\x00\x12\x45\n\x0bGetMKRunDir\x12\x17.ontdevice.EmptyRequest\x1a\x1b.ontdevice.MKRunDirResponse\"\x00\x12K\n\x0ePerformActions\x12\x1d.ontdevice.ReadActionsRequest\x1a\x18.ontdevice.EmptyResponse\"\x00\x12\x65\n\x13GetBasecalledChunks\x12\".ontdevice.BasecalledChunksRequest\x1a&.ontdevice.BasecalledReadChunkResponse\"\x00\x30\x01\x12X\n\x10GetActionResults\x12\x1f.ontdevice.ActionResultsRequest\x1a\x1f.ontdevice.ActionResultResponse\"\x00\x30\x01\x12>\n\x08StartSim\x12\x17.ontdevice.StartRequest\x1a\x17.ontdevice.BoolResponse\"\x00\x12=\n\x07StopSim\x12\x17.ontdevice.EmptyRequest\x1a\x17.ontdevice.BoolResponse\"\x00\x12K\n\nRunMuxScan\x12\x1c.ontdevice.RunMuxScanRequest\x1a\x1d.ontdevice.RunMuxScanResponse\"\x00\x12?\n\tIsRunning\x12\x17.ontdevice.EmptyRequest\x1a\x17.ontdevice.BoolResponse\"\x00\x12I\n\rGetDeviceInfo\x12\x17.ontdevice.EmptyRequest\x1a\x1d.ontdevice.DeviceInfoResponse\"\x00\x62\x06proto3') _globals = globals() _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'ont_device_pb2', _globals) if _descriptor._USE_C_DESCRIPTORS == False: - DESCRIPTOR._options = None _globals['_EMPTYREQUEST']._serialized_start=31 _globals['_EMPTYREQUEST']._serialized_end=45 @@ -37,28 +37,26 @@ _globals['_READACTIONSREQUEST_ACTION_STOPRECEIVINGACTION']._serialized_end=447 _globals['_READACTIONSREQUEST_ACTION_UNBLOCKACTION']._serialized_start=449 _globals['_READACTIONSREQUEST_ACTION_UNBLOCKACTION']._serialized_end=490 - _globals['_ACTIONRESULTIMMEDIATERESPONSE']._serialized_start=502 - _globals['_ACTIONRESULTIMMEDIATERESPONSE']._serialized_end=552 - _globals['_ACTIONRESULTSREQUEST']._serialized_start=554 - _globals['_ACTIONRESULTSREQUEST']._serialized_end=591 - _globals['_ACTIONRESULTRESPONSE']._serialized_start=593 - _globals['_ACTIONRESULTRESPONSE']._serialized_end=700 - 
_globals['_STARTREQUEST']._serialized_start=702 - _globals['_STARTREQUEST']._serialized_end=816 - _globals['_RUNMUXSCANREQUEST']._serialized_start=818 - _globals['_RUNMUXSCANREQUEST']._serialized_end=857 - _globals['_MUXSCANSTARTEDINFO']._serialized_start=859 - _globals['_MUXSCANSTARTEDINFO']._serialized_end=906 - _globals['_BASECALLEDCHUNKSREQUEST']._serialized_start=909 - _globals['_BASECALLEDCHUNKSREQUEST']._serialized_end=1082 - _globals['_BASECALLEDCHUNKSREQUEST_CHANNELS']._serialized_start=1029 - _globals['_BASECALLEDCHUNKSREQUEST_CHANNELS']._serialized_end=1054 - _globals['_BASECALLEDREADCHUNKRESPONSE']._serialized_start=1085 - _globals['_BASECALLEDREADCHUNKRESPONSE']._serialized_end=1216 - _globals['_BOOLRESPONSE']._serialized_start=1218 - _globals['_BOOLRESPONSE']._serialized_end=1247 - _globals['_DEVICEINFORESPONSE']._serialized_start=1249 - _globals['_DEVICEINFORESPONSE']._serialized_end=1303 - _globals['_ONTDEVICE']._serialized_start=1306 - _globals['_ONTDEVICE']._serialized_end=2093 + _globals['_ACTIONRESULTSREQUEST']._serialized_start=502 + _globals['_ACTIONRESULTSREQUEST']._serialized_end=539 + _globals['_ACTIONRESULTRESPONSE']._serialized_start=541 + _globals['_ACTIONRESULTRESPONSE']._serialized_end=648 + _globals['_STARTREQUEST']._serialized_start=650 + _globals['_STARTREQUEST']._serialized_end=764 + _globals['_RUNMUXSCANREQUEST']._serialized_start=766 + _globals['_RUNMUXSCANREQUEST']._serialized_end=805 + _globals['_RUNMUXSCANRESPONSE']._serialized_start=807 + _globals['_RUNMUXSCANRESPONSE']._serialized_end=854 + _globals['_BASECALLEDCHUNKSREQUEST']._serialized_start=857 + _globals['_BASECALLEDCHUNKSREQUEST']._serialized_end=1030 + _globals['_BASECALLEDCHUNKSREQUEST_CHANNELS']._serialized_start=977 + _globals['_BASECALLEDCHUNKSREQUEST_CHANNELS']._serialized_end=1002 + _globals['_BASECALLEDREADCHUNKRESPONSE']._serialized_start=1033 + _globals['_BASECALLEDREADCHUNKRESPONSE']._serialized_end=1164 + _globals['_BOOLRESPONSE']._serialized_start=1166 + 
_globals['_BOOLRESPONSE']._serialized_end=1195 + _globals['_DEVICEINFORESPONSE']._serialized_start=1197 + _globals['_DEVICEINFORESPONSE']._serialized_end=1251 + _globals['_ONTDEVICE']._serialized_start=1254 + _globals['_ONTDEVICE']._serialized_end=2025 # @@protoc_insertion_point(module_scope) diff --git a/src/simreaduntil/simulator/protos_generated/ont_device_pb2.pyi b/src/simreaduntil/simulator/protos_generated/ont_device_pb2.pyi index 9cb0b91..d424ba1 100644 --- a/src/simreaduntil/simulator/protos_generated/ont_device_pb2.pyi +++ b/src/simreaduntil/simulator/protos_generated/ont_device_pb2.pyi @@ -6,34 +6,34 @@ from typing import ClassVar as _ClassVar, Iterable as _Iterable, Mapping as _Map DESCRIPTOR: _descriptor.FileDescriptor class EmptyRequest(_message.Message): - __slots__ = [] + __slots__ = () def __init__(self) -> None: ... class EmptyResponse(_message.Message): - __slots__ = [] + __slots__ = () def __init__(self) -> None: ... class ServerInfoResponse(_message.Message): - __slots__ = ["unique_id"] + __slots__ = ("unique_id",) UNIQUE_ID_FIELD_NUMBER: _ClassVar[int] unique_id: str def __init__(self, unique_id: _Optional[str] = ...) -> None: ... class MKRunDirResponse(_message.Message): - __slots__ = ["mk_run_dir"] + __slots__ = ("mk_run_dir",) MK_RUN_DIR_FIELD_NUMBER: _ClassVar[int] mk_run_dir: str def __init__(self, mk_run_dir: _Optional[str] = ...) -> None: ... class ReadActionsRequest(_message.Message): - __slots__ = ["actions"] + __slots__ = ("actions",) class Action(_message.Message): - __slots__ = ["channel", "read_id", "unblock", "stop_further_data"] + __slots__ = ("channel", "read_id", "unblock", "stop_further_data") class StopReceivingAction(_message.Message): - __slots__ = [] + __slots__ = () def __init__(self) -> None: ... 
class UnblockAction(_message.Message): - __slots__ = ["unblock_duration"] + __slots__ = ("unblock_duration",) UNBLOCK_DURATION_FIELD_NUMBER: _ClassVar[int] unblock_duration: float def __init__(self, unblock_duration: _Optional[float] = ...) -> None: ... @@ -50,20 +50,14 @@ class ReadActionsRequest(_message.Message): actions: _containers.RepeatedCompositeFieldContainer[ReadActionsRequest.Action] def __init__(self, actions: _Optional[_Iterable[_Union[ReadActionsRequest.Action, _Mapping]]] = ...) -> None: ... -class ActionResultImmediateResponse(_message.Message): - __slots__ = ["succeeded"] - SUCCEEDED_FIELD_NUMBER: _ClassVar[int] - succeeded: _containers.RepeatedScalarFieldContainer[bool] - def __init__(self, succeeded: _Optional[_Iterable[bool]] = ...) -> None: ... - class ActionResultsRequest(_message.Message): - __slots__ = ["clear"] + __slots__ = ("clear",) CLEAR_FIELD_NUMBER: _ClassVar[int] clear: bool def __init__(self, clear: bool = ...) -> None: ... class ActionResultResponse(_message.Message): - __slots__ = ["read_id", "time", "channel", "action_type", "result"] + __slots__ = ("read_id", "time", "channel", "action_type", "result") READ_ID_FIELD_NUMBER: _ClassVar[int] TIME_FIELD_NUMBER: _ClassVar[int] CHANNEL_FIELD_NUMBER: _ClassVar[int] @@ -77,7 +71,7 @@ class ActionResultResponse(_message.Message): def __init__(self, read_id: _Optional[str] = ..., time: _Optional[float] = ..., channel: _Optional[int] = ..., action_type: _Optional[int] = ..., result: _Optional[int] = ...) -> None: ... 
class StartRequest(_message.Message): - __slots__ = ["acceleration_factor", "update_method", "log_interval", "stop_if_no_reads"] + __slots__ = ("acceleration_factor", "update_method", "log_interval", "stop_if_no_reads") ACCELERATION_FACTOR_FIELD_NUMBER: _ClassVar[int] UPDATE_METHOD_FIELD_NUMBER: _ClassVar[int] LOG_INTERVAL_FIELD_NUMBER: _ClassVar[int] @@ -89,21 +83,21 @@ class StartRequest(_message.Message): def __init__(self, acceleration_factor: _Optional[float] = ..., update_method: _Optional[str] = ..., log_interval: _Optional[int] = ..., stop_if_no_reads: bool = ...) -> None: ... class RunMuxScanRequest(_message.Message): - __slots__ = ["t_duration"] + __slots__ = ("t_duration",) T_DURATION_FIELD_NUMBER: _ClassVar[int] t_duration: float def __init__(self, t_duration: _Optional[float] = ...) -> None: ... -class MuxScanStartedInfo(_message.Message): - __slots__ = ["nb_reads_rejected"] +class RunMuxScanResponse(_message.Message): + __slots__ = ("nb_reads_rejected",) NB_READS_REJECTED_FIELD_NUMBER: _ClassVar[int] nb_reads_rejected: int def __init__(self, nb_reads_rejected: _Optional[int] = ...) -> None: ... class BasecalledChunksRequest(_message.Message): - __slots__ = ["batch_size", "channels"] + __slots__ = ("batch_size", "channels") class Channels(_message.Message): - __slots__ = ["value"] + __slots__ = ("value",) VALUE_FIELD_NUMBER: _ClassVar[int] value: _containers.RepeatedScalarFieldContainer[int] def __init__(self, value: _Optional[_Iterable[int]] = ...) -> None: ... @@ -114,7 +108,7 @@ class BasecalledChunksRequest(_message.Message): def __init__(self, batch_size: _Optional[int] = ..., channels: _Optional[_Union[BasecalledChunksRequest.Channels, _Mapping]] = ...) -> None: ... 
class BasecalledReadChunkResponse(_message.Message): - __slots__ = ["channel", "read_id", "seq", "quality_seq", "estimated_ref_len_so_far"] + __slots__ = ("channel", "read_id", "seq", "quality_seq", "estimated_ref_len_so_far") CHANNEL_FIELD_NUMBER: _ClassVar[int] READ_ID_FIELD_NUMBER: _ClassVar[int] SEQ_FIELD_NUMBER: _ClassVar[int] @@ -128,13 +122,13 @@ class BasecalledReadChunkResponse(_message.Message): def __init__(self, channel: _Optional[int] = ..., read_id: _Optional[str] = ..., seq: _Optional[str] = ..., quality_seq: _Optional[str] = ..., estimated_ref_len_so_far: _Optional[int] = ...) -> None: ... class BoolResponse(_message.Message): - __slots__ = ["value"] + __slots__ = ("value",) VALUE_FIELD_NUMBER: _ClassVar[int] value: bool def __init__(self, value: bool = ...) -> None: ... class DeviceInfoResponse(_message.Message): - __slots__ = ["info", "n_channels"] + __slots__ = ("info", "n_channels") INFO_FIELD_NUMBER: _ClassVar[int] N_CHANNELS_FIELD_NUMBER: _ClassVar[int] info: str diff --git a/src/simreaduntil/simulator/protos_generated/ont_device_pb2_grpc.py b/src/simreaduntil/simulator/protos_generated/ont_device_pb2_grpc.py index 063cff1..8933100 100644 --- a/src/simreaduntil/simulator/protos_generated/ont_device_pb2_grpc.py +++ b/src/simreaduntil/simulator/protos_generated/ont_device_pb2_grpc.py @@ -29,7 +29,7 @@ def __init__(self, channel): self.PerformActions = channel.unary_unary( '/ontdevice.ONTDevice/PerformActions', request_serializer=ont__device__pb2.ReadActionsRequest.SerializeToString, - response_deserializer=ont__device__pb2.ActionResultImmediateResponse.FromString, + response_deserializer=ont__device__pb2.EmptyResponse.FromString, ) self.GetBasecalledChunks = channel.unary_stream( '/ontdevice.ONTDevice/GetBasecalledChunks', @@ -54,7 +54,7 @@ def __init__(self, channel): self.RunMuxScan = channel.unary_unary( '/ontdevice.ONTDevice/RunMuxScan', request_serializer=ont__device__pb2.RunMuxScanRequest.SerializeToString, - 
response_deserializer=ont__device__pb2.MuxScanStartedInfo.FromString, + response_deserializer=ont__device__pb2.RunMuxScanResponse.FromString, ) self.IsRunning = channel.unary_unary( '/ontdevice.ONTDevice/IsRunning', @@ -157,7 +157,7 @@ def add_ONTDeviceServicer_to_server(servicer, server): 'PerformActions': grpc.unary_unary_rpc_method_handler( servicer.PerformActions, request_deserializer=ont__device__pb2.ReadActionsRequest.FromString, - response_serializer=ont__device__pb2.ActionResultImmediateResponse.SerializeToString, + response_serializer=ont__device__pb2.EmptyResponse.SerializeToString, ), 'GetBasecalledChunks': grpc.unary_stream_rpc_method_handler( servicer.GetBasecalledChunks, @@ -182,7 +182,7 @@ def add_ONTDeviceServicer_to_server(servicer, server): 'RunMuxScan': grpc.unary_unary_rpc_method_handler( servicer.RunMuxScan, request_deserializer=ont__device__pb2.RunMuxScanRequest.FromString, - response_serializer=ont__device__pb2.MuxScanStartedInfo.SerializeToString, + response_serializer=ont__device__pb2.RunMuxScanResponse.SerializeToString, ), 'IsRunning': grpc.unary_unary_rpc_method_handler( servicer.IsRunning, @@ -253,7 +253,7 @@ def PerformActions(request, metadata=None): return grpc.experimental.unary_unary(request, target, '/ontdevice.ONTDevice/PerformActions', ont__device__pb2.ReadActionsRequest.SerializeToString, - ont__device__pb2.ActionResultImmediateResponse.FromString, + ont__device__pb2.EmptyResponse.FromString, options, channel_credentials, insecure, call_credentials, compression, wait_for_ready, timeout, metadata) @@ -338,7 +338,7 @@ def RunMuxScan(request, metadata=None): return grpc.experimental.unary_unary(request, target, '/ontdevice.ONTDevice/RunMuxScan', ont__device__pb2.RunMuxScanRequest.SerializeToString, - ont__device__pb2.MuxScanStartedInfo.FromString, + ont__device__pb2.RunMuxScanResponse.FromString, options, channel_credentials, insecure, call_credentials, compression, wait_for_ready, timeout, metadata) diff --git 
a/src/simreaduntil/simulator/readpool.py b/src/simreaduntil/simulator/readpool.py index 9fbbb27..6292786 100644 --- a/src/simreaduntil/simulator/readpool.py +++ b/src/simreaduntil/simulator/readpool.py @@ -2,12 +2,21 @@ Read pool that returns reads when requested, e.g., from a generator or a file """ +import contextlib +from pathlib import Path +from queue import Empty +import queue import threading -from typing import Optional, Dict, Any, Generator +from typing import Optional, Dict, Any, Generator, Tuple import numpy as np import pysam +from simreaduntil.shared_utils.logging_utils import setup_logger_simple +from simreaduntil.shared_utils.thread_helpers import ThreadWithResultsAndExceptions -from simreaduntil.shared_utils.utils import force_eval_generator_function, get_some_value_from_dict, is_empty_file +from simreaduntil.shared_utils.utils import StoppableQueue, force_eval_generator_function, get_some_value_from_dict, is_empty_file + +logger = setup_logger_simple(__name__) +"""module logger""" @force_eval_generator_function def reads_from_file_gen(fasta_file, shuffle_rand_state: Optional[np.random.Generator]=None): @@ -29,7 +38,8 @@ def reads_from_file_gen(fasta_file, shuffle_rand_state: Optional[np.random.Gener for id in ref_names: yield (id, fasta.fetch(id)) -class NoReadLeft(Exception): +# todo9: put into readpool +class NoReadLeftException(Exception): """ When no read is left in the read pool """ @@ -38,51 +48,72 @@ class NoReadLeft(Exception): class ReadPool: """ Read pool from which reads can be obtained + + Do not forget to call finish() when done. 
+ + Args: + reads_per_channel: whether reads are channel-specific """ - def __init__(self): + def __init__(self, reads_per_channel): self.lock = threading.Lock() self.definitely_empty = False # whether the readpool is definitely empty (if False, we don't know) self.nb_reads_returned = 0 + self.reads_per_channel = reads_per_channel - def get_new_read(self, channel=None) -> str: + def get_new_read(self, channel=None) -> Tuple[str, Any]: """ Get new read (thread-safe) - Once the function returns NoReadLeft for a channel, it will always return NoReadLeft for this channel (also for channel=None). + Once the function returns NoReadLeftException for a channel, it will always return NoReadLeftException for this channel (also for channel=None). Args: channel: channel for which to get read """ with self.lock: - res = self._get_new_read(channel=channel) + res = self._get_new_read(channel=channel) if self.reads_per_channel else self._get_new_read() self.nb_reads_returned += 1 return res - def _get_new_read(self, channel=None) -> str: + def _get_new_read(self, channel=None) -> Tuple[str, Any]: """ Get new read (not thread-safe), overwrite in subclasses Args: - channel: channel for which to get read + channel: channel for which to get read, only provided if self.reads_per_channel is True + + Returns: + tuple (read, read signal) """ raise NotImplementedError() + + """ + Stop the read pool + + For example, if it is threaded, stop the thread + """ + def finish(self): + pass + def __enter__(self): + return self + def __exit__(self, exc_type, exc_value, traceback): + self.finish() class ReadPoolFromIterable(ReadPool): """ Read pool that requests reads from generator """ def __init__(self, reads_iterable): - super().__init__() + super().__init__(reads_per_channel=False) self.reads_iterable = reads_iterable - def _get_new_read(self, channel=None) -> str: + def _get_new_read(self) -> Tuple[str, Any]: # note: generators are not thread-safe !! 
try: return next(self.reads_iterable) except StopIteration as e: self.definitely_empty = True - raise NoReadLeft from e # exception chaining + raise NoReadLeftException from e # exception chaining # support pickling def __getstate__(self): @@ -98,15 +129,15 @@ class ReadPoolFromIterablePerChannel(ReadPool): Read pool that requests reads from channel-specific generator """ def __init__(self, reads_iterable_per_channel: Dict[Any, Generator[str, None, None]]): - super().__init__() + super().__init__(reads_per_channel=True) self.reads_iterable_per_channel = reads_iterable_per_channel - def _get_new_read(self, channel=None) -> str: + def _get_new_read(self, channel) -> Tuple[str, Any]: try: return next(self.reads_iterable_per_channel[channel]) except StopIteration as e: - raise NoReadLeft from e # exception chaining + raise NoReadLeftException from e # exception chaining def __repr__(self): return f"ReadPoolFromIterable({list(self.reads_iterable_per_channel.keys())})" @@ -122,15 +153,212 @@ def __getstate__(self): class ReadPoolFromFile(ReadPoolFromIterable): """ - Keep track of reads_file + Read pool that reads from a file or directory """ - def __init__(self, reads_file, shuffle_rand_state: Optional[np.random.Generator]=None): - super().__init__(reads_from_file_gen(reads_file, shuffle_rand_state=shuffle_rand_state)) + def __init__(self, reads_file_or_dir, shuffle_rand_state: Optional[np.random.Generator]=None): + reads_file_or_dir = Path(reads_file_or_dir) + def read_gen(): + for filename in (reads_file_or_dir.glob("**/*.fasta") if reads_file_or_dir.is_dir() else [reads_file_or_dir]): + logger.info(f"Starting to read file '{filename}'") + yield from reads_from_file_gen(filename, shuffle_rand_state=shuffle_rand_state) + super().__init__(read_gen()) self.shuffled = shuffle_rand_state is not None - self.reads_file = reads_file + self.reads_file_or_dir = reads_file_or_dir + + """ + Check if the read pool can open the file/directory + """ + @staticmethod + def 
can_handle(file: Path) -> bool: + file = Path(file) + return ( + (file.is_dir() and any(file.glob("**/*.fasta"))) or + (file.is_file() and (file.suffix == ".fasta" or file.suffix == ".fasta.gz")) + ) def __repr__(self): - return f"ReadPool(file = {self.reads_file}, shuffled = {self.shuffled})" + return f"ReadPool(file = {self.reads_file_or_dir}, shuffled = {self.shuffled})" + +""" +Threaded ReadPool that wraps another ReadPool and reads from it in another thread using a queue + +Note: Using a rng with ThreadedPoolWrapper is not thread-safe if rng is accessed from multiple threads +""" +class ThreadedReadPoolWrapper(ReadPool): + def __init__(self, read_pool: ReadPool, queue_size: int): + super().__init__(reads_per_channel=read_pool.reads_per_channel) + self._read_pool = read_pool + assert queue_size > 0 # otherwise will read all reads at once + self._reads_queue = StoppableQueue(queue_size) + self._reader_thread = ThreadWithResultsAndExceptions(target=self._fill_queue, name="ThreadedReadPoolWrapper") + self._reader_thread.start() + self.definitely_empty = False + + """ + Check if the read pool can open the file/directory + """ + def can_handle(self, *args, **kwargs) -> bool: + return self._read_pool.can_handle(*args, **kwargs) - \ No newline at end of file + def _fill_queue(self): + try: + while True: + read = self._read_pool.get_new_read() + self._reads_queue.put(read) + except (StoppableQueue.QueueStoppedException, NoReadLeftException): + pass + + try: + self._reads_queue.put(None) # allows get_new_read() to detect it will never return a read again + except StoppableQueue.QueueStoppedException: + # was terminated in between + pass + + logger.info("Finished read queue filler thread") + + def __repr__(self) -> str: + return f"ThreadedReadPool({self._read_pool}, queue_size: {self._reads_queue.maxsize})" + + # protected by lock/mutex + def _get_new_read(self) -> Tuple[str, Any]: + if self.definitely_empty: + raise NoReadLeftException + + try: + read = 
self._reads_queue.get() + except StoppableQueue.QueueStoppedException: + read = None + + if read is None: + self.definitely_empty = True + raise NoReadLeftException + return read + + def finish(self): + self._reads_queue.stop() + self._reader_thread.join() + self._reader_thread.raise_if_error() + self._read_pool.finish() + +def get_slow5_reads_gen(filename): + """generator returning reads in a slow5 file""" + import pyslow5 + with contextlib.closing(pyslow5.Open(str(filename), "r")) as fh: + for read in fh.seq_reads(): + yield (read["read_id"], read["signal"]) + +# """ +# Read Slow5 files in another thread and put the read data into a queue. + +# # todo: remove read_buffer, rather use ThreadedReadPoolWrapper +# # todo: subclass ReadPool +# # todo: overwrite _get_new_read +# # todo: number of threads for reading slow5 + +# Args: +# s5_dir: directory containing slow5 files +# read_buffer: number of reads to buffer in queue +# """ +# class Slow5ReadPool(ReadPool): +# def __init__(self, s5_dir, read_buffer) -> None: +# super().__init__(reads_per_channel=False) + +# raise ("NotYetImplementedError") + +# import pyslow5 # todo: add as dependency +# from pathlib import Path +# from queue import Queue +# import threading + +# s5_dir = Path(s5_dir) +# assert s5_dir.is_dir() +# self.s5_files = list(s5_dir.glob("*.[sb]low5")) +# self.queue = Queue(read_buffer) # threadsafe +# self.cur_file_idx = -1 +# self._queue_filler_thread = threading.Thread(target=self._fill_queue) +# self._queue_filler_thread.start() +# # self._queue_filler_thread = multiprocessing.Process(target=self._fill_queue) + +# """ +# Check if the read pool can open the file/directory +# """ +# @staticmethod +# def can_handle(dir: Path) -> bool: +# dir = Path(dir) +# return dir.is_dir() and any(dir.glob("**/*.[sb]low5")) + +# def _open_next_file(self): +# self.cur_file_idx += 1 +# if self.cur_file_idx >= len(self.s5_files): +# return False +# logger.info(f"Switching to file {self.cur_file_idx} of 
{len(self.s5_files)} with name {self.s5_files[self.cur_file_idx]}") +# self.cur_read_gen = get_slow5_reads_gen(str(self.s5_files[self.cur_file_idx])) +# return True + +# def _fill_queue(self): +# logger.info("Started queue filler thread") +# while self._open_next_file(): +# for read in self.cur_read_gen: +# # logger.debug(f"Putting read {read[0]} into queue") +# self.queue.put(read) +# self.queue.put(None) # sentinel + +# def get_new_read(self) -> Tuple[str, np.ndarray]: +# # overwriting get_new_read directly rather than _get_new_read because queue already threadsafe +# # res = self.queue.get() +# # try getting an element immediately, otherwise print a warning +# try: +# res = self.queue.get_nowait() +# except Empty: +# logger.warning("Slow5 read queue empty, waiting until available") +# res = self.queue.get() +# if res is None: +# self.definitely_empty = True +# raise NoReadLeftException() +# return res + +# # def stop_reading(self): +# # self.queue.put(None) +# # self._queue_filler_thread.join(), raise_if_error + +# # def reads_gen(self): +# # while True: +# # res = self.queue.get() +# # if res is None: +# # break +# # yield res + +# s5_dir = "/home/mmordig/rawhash_project/rawhash2/test/data/d2_ecoli_r94/slow5_files" +# reader = Slow5ReadPool(s5_dir, 2) + +# import time +# import tqdm + +# num_signals = 0 +# num_reads = 0 +# # read_id, read_signal = reader.get_new_read() +# start_time = time.time() +# for (read_id, read_signal) in tqdm.tqdm(reader.reads_gen()): +# num_reads += 1 +# num_signals += len(read_signal) +# if num_reads > 10000: +# break + +# elapsed_time = time.time() - start_time +# num_signals / elapsed_time +# n_channels = 512 +# min_throughput = n_channels * 4000 +# num_signals / elapsed_time / min_throughput + + +# end_time_per_channel = np.array([time.time()] * n_channels) +# for (read_id, read_signal) in tqdm.tqdm(reader.reads_gen()): +# min_i = np.argmin(end_time_per_channel) +# time_sleep = end_time_per_channel[min_i] - time.time() +# if 
time_sleep >= 0: +# time.sleep(time_sleep) +# # else: +# # print(f"Warning: missed deadline {time_sleep}") +# end_time_per_channel[min_i] = time.time() + len(read_signal) / 4000 +# # xxx = read_signal.sum() \ No newline at end of file diff --git a/src/simreaduntil/simulator/readswriter.py b/src/simreaduntil/simulator/readswriter.py index abbe17a..32ae0b5 100644 --- a/src/simreaduntil/simulator/readswriter.py +++ b/src/simreaduntil/simulator/readswriter.py @@ -6,6 +6,7 @@ import logging import os from pathlib import Path +import queue import shutil import sys import tempfile @@ -13,6 +14,7 @@ import threading from Bio import SeqIO from Bio.Seq import Seq +from simreaduntil.shared_utils.thread_helpers import ThreadWithResultsAndExceptions from simreaduntil.shared_utils.utils import is_empty_dir, setup_logger_simple logger = setup_logger_simple(__name__) @@ -45,12 +47,74 @@ def flush(self): def _flush(self): """thread-safe helper method for flush""" raise NotImplementedError + + def finish(self): + self.flush() + + def __enter__(self): + return self + def __exit__(self, exc_type, exc_value, traceback): + self.finish() +""" +Wrapper arounds ReadsWriter that writes reads in a separate thread +""" +class ThreadedReadsWriterWrapper(ReadsWriter): + def __init__(self, reads_writer: ReadsWriter): + super().__init__() + self._reads_writer = reads_writer + self._reads_queue = queue.Queue() + self._writer_thread = ThreadWithResultsAndExceptions(target=self._write_received_reads, name="ThreadedReadsWriterWrapper") + self._writer_thread.start() + + def __repr__(self): + return f"ThreadedReadsWriterWrapper(reads_writer={self._reads_writer})" + + def _write_received_reads(self): + while True: + read = self._reads_queue.get() + if read is None: + break + self._reads_writer.write_read(read) + self.flush() + + def write_read(self, read: SeqIO.SeqRecord): + self._reads_queue.put(read) + + def _flush(self): + self._reads_writer.flush() + + def finish(self): + self._reads_queue.put(None) 
+ self._writer_thread.join() + self._writer_thread.raise_if_error() + self._reads_writer.finish() + +""" +Combines several ReadsWriters into one, calling them sequentially +""" +class CompoundReadsWriter(ReadsWriter): + def __init__(self, reads_writers): + super().__init__() + self.reads_writers = reads_writers + + def __repr__(self): + return f"CompoundReadsWriter(reads_writers={self.reads_writers})" + + def write_read(self, read: SeqIO.SeqRecord): + [rw.write_read(read) for rw in self.reads_writers] + + def flush(self): + [rw.flush() for rw in self.reads_writers] + + def finish(self): + [rw.finish() for rw in self.reads_writers] + class SingleFileReadsWriter(ReadsWriter): """ Write reads to one file (by default stdout), appending reads to the file as they are written (possibly with buffering) - When pickling the file, the file is flushed. When reloading it, the filehandler is not restored and must be set directly. + When pickling this class, the file is flushed. When reloading it, the filehandler is not restored and must be set directly. This class is useful for debugging by writing to sys.stdout. Args: @@ -85,6 +149,12 @@ def __getstate__(self): state["fh"] = None return state + def finish(self): + if self.fh not in [sys.stdout, sys.stderr]: + self.fh.close() + else: + self.fh.flush() + class RotatingFileReadsWriter(ReadsWriter): """ Write reads to a file, creating a new file whenever a maximum of reads is reached. 
@@ -196,7 +266,6 @@ def __init__(self): super().__init__() self.reads = [] - self.output_dir = None # for compatibility with the ONTSimulator def __repr__(self) -> str: return f"ArrayReadsWriter(nb_reads={len(self.reads)})" diff --git a/src/simreaduntil/simulator/simfasta_to_seqsum.py b/src/simreaduntil/simulator/simfasta_to_seqsum.py index b95d894..c25fd68 100644 --- a/src/simreaduntil/simulator/simfasta_to_seqsum.py +++ b/src/simreaduntil/simulator/simfasta_to_seqsum.py @@ -7,6 +7,8 @@ import argparse import os from pathlib import Path +import sys +import typing import warnings from Bio import SeqIO import numpy as np @@ -19,6 +21,7 @@ from simreaduntil.shared_utils.utils import print_args from simreaduntil.simulator.channel_element import ReadDescriptionParser, ReadTags, end_reason_to_ont_map +from simreaduntil.simulator.readswriter import ReadsWriter logger = setup_logger_simple(__name__) """module logger""" @@ -27,6 +30,78 @@ SEQ_SUMMARY_HEADER = ["read_id", "channel", "mux", "start_time", "duration", "passes_filtering", "template_start", "template_duration", "sequence_length_template", "end_reason"] + _extra_fields """Fields in the sequencing summary file""" +"""Write the sequencing summary header""" +def write_seqsum_header(seqsummary_file): + seqsummary_file.write("\t".join(SEQ_SUMMARY_HEADER) + os.linesep) + +""" +Writes a single sequence record to a sequencing summary file + +Args: + record: sequence record + seqsummary_file: file to write to + read_id: if None, will be parsed from description (by splitting on the first whitespace) + Typically, when SeqIO.SeqRecord is constructed in the code, read_id must be set because the description does not contain the read id + If it is read from a file with SeqIO.parse, the read_id is also in the description +""" +def write_seqsum_record_line(record: SeqIO.SeqRecord, seqsummary_file: typing.IO, read_id: str = None): + if (read_id is None) or record.description.startswith(record.id): + # id is in description + 
read_id, description = record.description.split(" ", maxsplit=1) + else: + description = record.description + + parsed_desc = ReadDescriptionParser(description) + if NanoSimId.is_valid(parsed_desc.full_read_id): + full_len = NanoSimId.from_str(parsed_desc.full_read_id).ref_len # length if read had not been rejected + else: + full_len = np.NaN + + t_duration = parsed_desc.t_end - parsed_desc.t_start + template_duration = t_duration - parsed_desc.t_delay + if len(record.seq) == 0: + logger.info(f"Found read '{read_id}' that stopped after time {t_duration} before its actual content would have started (at {parsed_desc.t_delay}), skipping") # due to adapters, barcodes + return + channel = parsed_desc.ch + + mux = 1 + passes_filtering = True + print("\t".join(map(str, + [read_id, channel, mux, parsed_desc.t_start, t_duration, passes_filtering, parsed_desc.t_start + parsed_desc.t_delay, + template_duration, len(record.seq), end_reason_to_ont_map[parsed_desc.ended], full_len, ReadTags.RU_STOPPED_RECEIVING in parsed_desc.tags, ReadTags.RU_NEVER_REQUESTED in parsed_desc.tags] + )), file=seqsummary_file) + +class SequencingSummaryWriter(ReadsWriter): + """ + Write the sequencing summary to a single file on-the-fly. + + Args: + reads_out_fh: filehandler to write to + """ + def __init__(self, reads_out_fh=sys.stdout): + super().__init__() + + self.fh = reads_out_fh + + write_seqsum_header(self.fh) + + # Flush reads, e.g. 
write outstanding reads to file by flushing the file handler + def _flush(self): + self.fh.flush() + + def __repr__(self): + return f"SequencingSummaryWriter(filename='{self.fh.name}')" + + # write read to file + def _write_read(self, read: SeqIO.SeqRecord): + write_seqsum_record_line(read, self.fh, read_id=read.id) + + def finish(self): + if self.fh not in [sys.stdout, sys.stderr]: + self.fh.close() + else: + self.fh.flush() + def convert_simfasta_to_seqsum(reads_fasta, seqsummary_filename, mode="w", tqdm_outer=False): """ Convert FASTA generated by simulator to a sequencing summary file @@ -44,7 +119,7 @@ def convert_simfasta_to_seqsum(reads_fasta, seqsummary_filename, mode="w", tqdm_ assert mode in ["a", "w"] with open(seqsummary_filename, mode=mode) as seqsummary_file: if mode == "w": - seqsummary_file.write("\t".join(SEQ_SUMMARY_HEADER) + os.linesep) + write_seqsum_header(seqsummary_file) nb_seqs = get_nb_fasta_seqs(reads_fasta) if nb_seqs == 0: @@ -52,27 +127,7 @@ def convert_simfasta_to_seqsum(reads_fasta, seqsummary_filename, mode="w", tqdm_ # set leave=False since progress bar is otherwise not properly erased for record in tqdm.tqdm(SeqIO.parse(reads_fasta, "fasta"), desc="Reading fasta line", leave=not tqdm_outer, total=nb_seqs): - read_id, description = record.description.split(" ", maxsplit=1) - parsed_desc = ReadDescriptionParser(description) - if NanoSimId.is_valid(parsed_desc.full_read_id): - full_len = NanoSimId.from_str(parsed_desc.full_read_id).ref_len # length if read had not been rejected - else: - full_len = np.NaN - - t_duration = parsed_desc.t_end - parsed_desc.t_start - template_duration = t_duration - parsed_desc.t_delay - if len(record.seq) == 0: - logger.info(f"Found read '{read_id}' that stopped after time {t_duration} before its actual content would have started (at {parsed_desc.t_delay}), skipping") # due to adapters, barcodes - continue - channel = parsed_desc.ch - - mux = 1 - passes_filtering = True - print("\t".join(map(str, - 
[read_id, channel, mux, parsed_desc.t_start, t_duration, passes_filtering, parsed_desc.t_start + parsed_desc.t_delay, - template_duration, len(record.seq), end_reason_to_ont_map[parsed_desc.ended], full_len, ReadTags.RU_STOPPED_RECEIVING in parsed_desc.tags, ReadTags.RU_NEVER_REQUESTED in parsed_desc.tags] - )), file=seqsummary_file) - + write_seqsum_record_line(record, seqsummary_file) # print(record) # break diff --git a/src/simreaduntil/simulator/simulator.py b/src/simreaduntil/simulator/simulator.py index 82d0294..627331e 100644 --- a/src/simreaduntil/simulator/simulator.py +++ b/src/simreaduntil/simulator/simulator.py @@ -8,6 +8,7 @@ import enum import itertools from pathlib import Path +import queue from textwrap import dedent, indent import threading import time @@ -133,7 +134,7 @@ def is_running(self) -> bool: """ raise NotImplementedError() - def get_basecalled_read_chunks(self, batch_size=None, channel_subset=None) -> List[Any]: + def get_basecalled_read_chunks(self, batch_size=None, channel_subset=None) -> Generator[Any, None, None]: """ Get available read chunks from the selected channels, from at most 'batch_size' channels @@ -141,7 +142,7 @@ def get_basecalled_read_chunks(self, batch_size=None, channel_subset=None) -> Li """ raise NotImplementedError() - def get_action_results(self, **kwargs) -> List[Tuple[Any, float, int, str, Any]]: + def get_action_results(self, **kwargs) -> Generator[Tuple[Any, float, int, str, Any], Any, Any]: """ Get new results of actions that were performed with unblock and stop_receiving (mux scans etc not included) @@ -268,8 +269,9 @@ class ONTSimulator(ReadUntilDevice): reads_writer: reads writer, ideally of type RotatingFileReadsWriter with attribute .output_dir (used in .mk_run_dir attribute) sim_params: simulation parameters, can be modified during the simulation channel_status_filename: where to write combined channel status at regular intervals + output_dir: output dir returned by self.mk_run_dir, this is where files 
will be put """ - def __init__(self, read_pool: ReadPool, reads_writer: RotatingFileReadsWriter, sim_params: SimParams, channel_status_filename: Optional[Union[str, Path]]=None): + def __init__(self, read_pool: ReadPool, reads_writer: RotatingFileReadsWriter, sim_params: SimParams, channel_status_filename: Optional[Union[str, Path]]=None, output_dir=""): logger.debug(f"Creating ONT device simulator") self._read_pool = read_pool @@ -278,6 +280,7 @@ def __init__(self, read_pool: ReadPool, reads_writer: RotatingFileReadsWriter, s self._channels: List[Channel] = [Channel(channel_name, read_pool, reads_writer, sim_params=sim_params) for channel_name in sim_params.gap_samplers.keys()] self._channel_status_filename = channel_status_filename + self._output_dir = output_dir # thread that forwards simulation at regular time intervals, may not be alive, so call .is_running() to check if simulation is currently running self._forward_sim_thread: Optional[ThreadWithResultsAndExceptions] = None @@ -287,14 +290,12 @@ def __init__(self, read_pool: ReadPool, reads_writer: RotatingFileReadsWriter, s self.lock_sim_state = threading.RLock() # lock to hold running state of simulation fixed self.sim_state = SimulationRunningState.Stopped + self.action_queue = queue.Queue() # queue for actions to perform on the simulator to avoid lock contention, do it at the end of each forward self._action_results = [] @property def mk_run_dir(self) -> Union[Path, str]: - try: - return self._reads_writer.output_dir - except AttributeError: - return "" + return self._output_dir def device_info(self, sim_params=True, channel_states=False) -> str: """ @@ -336,7 +337,7 @@ def get_channel_stats(self, combined=False) -> Union[ChannelStats, List[ChannelS if combined: return combine_stats((channel.stats for channel in self._channels)) else: - return [channel.stats for channel in self._channels] + return [channel.stats for channel in self._channels] def _forward_channels(self, t, delta=False, 
show_progress=False): """ @@ -367,7 +368,7 @@ def _stop_channels(self): for channel in self._channels: assert channel.is_running channel.stop() - self._reads_writer.flush() + self._reads_writer.finish() def _all_channels_finished(self) -> bool: """Whether no reads are left and all channels have finished""" @@ -400,7 +401,7 @@ def start(self, *args, **kwargs): # don't set as daemon because reads in-progress need to be written to a file self._forward_sim_thread = ThreadWithResultsAndExceptions( - target=self._forward_sim_loop, name=new_thread_name(), args=args, kwargs=kwargs + target=self._forward_sim_loop, name=new_thread_name("simforw-{}"), args=args, kwargs=kwargs ) logger.info("Starting the simulation") @@ -434,6 +435,7 @@ def _forward_sim_loop(self, acceleration_factor=1.0, update_method="realtime", l self.sim_state = SimulationRunningState.Running logger.debug("Simulator forward thread started...") + logger.info(f"Device info: {self.device_info()}") assert acceleration_factor > 0, f"invalid acceleration_factor {acceleration_factor}" if self._channel_status_filename is not None: @@ -479,6 +481,7 @@ def _log(): logger.debug(f"Forwarding to time {t_sim}") t_real_last_forward = cur_ns_time() self._forward_channels(t_sim) + self._process_actions() if (log_interval != -1) and (i % log_interval == 0): _log() @@ -502,7 +505,7 @@ def _compute_delta_t_sim(self): Returns: Length of one chunk in seconds (without acceleration) """ - return self.sim_params.chunk_size / self.sim_params.bp_per_second + return self.sim_params.min_chunk_size / self.sim_params.bp_per_second def stop(self, _join_thread=True): """ @@ -515,6 +518,7 @@ def stop(self, _join_thread=True): Returns: Whether the simulation was stopped (i.e. it was running and not in the process of being stopped) + The simulation has not necessarily stopped when this method returns False, it only started the stopping process. 
""" logger.info("Stop request received, stopping simulation...") @@ -527,6 +531,7 @@ def stop(self, _join_thread=True): if _join_thread: self._forward_sim_thread.join() # block, try hard for .cancel() on stream + self._forward_sim_thread.raise_if_error() assert not self._forward_sim_thread.is_alive() self._forward_sim_thread = None @@ -552,7 +557,7 @@ def is_running(self): ############## chunk related methods ############## - def get_basecalled_read_chunks(self, batch_size=None, channel_subset=None) -> List[Tuple[Any]]: + def get_basecalled_read_chunks(self, batch_size=None, channel_subset=None) -> Generator[Tuple[Any], None, None]: """ It permutes the channels and gets at most 'batch_size' from them. Channels with no new chunks are filtered out. @@ -577,9 +582,9 @@ def get_basecalled_read_chunks(self, batch_size=None, channel_subset=None) -> Li for channel in self.sim_params.random_state.permutation(channel_subset): if nb_chunks >= batch_size: break - chunks, read_id, estimated_ref_len_so_far = self._channels[channel-1].get_new_chunks() # if simulation was already stopped (in between), just returns "" + chunks, read_id, estimated_ref_len_so_far = self._channels[channel-1].get_new_samples() # if simulation was already stopped (in between), just returns "" # ignore if no new chunks (e.g. 
if channel does not have a read currently) - if chunks != "": + if len(chunks) > 0: # chunks is either str or array of raw signals nb_chunks += 1 yield (channel, read_id, chunks, "noquality", estimated_ref_len_so_far) @@ -593,7 +598,7 @@ def get_raw_chunks(self, *args, **kwargs): for (channel, read_id, chunks, quality, estimated_ref_len_so_far) in self.get_basecalled_read_chunks(*args, **kwargs): yield (channel, read_id, self.sim_params.pore_model.to_raw(chunks), quality, estimated_ref_len_so_far) - def get_action_results(self, clear=True) -> List[Tuple[Any, float, int, str, Any]]: + def get_action_results(self, clear=True) -> Generator[Tuple[Any, float, int, str, Any], Any, Any]: """ Get action results of actions performed on simulator @@ -613,27 +618,47 @@ def get_action_results(self, clear=True) -> List[Tuple[Any, float, int, str, Any else: return self._action_results.copy() - def unblock_read(self, read_channel, read_id, unblock_duration=None) -> Optional[bool]: + def unblock_read(self, read_channel, read_id, unblock_duration=None): + self.action_queue.put((ActionType.Unblock, (read_channel, read_id, unblock_duration))) + + def stop_receiving_read(self, read_channel, read_id): + self.action_queue.put((ActionType.StopReceiving, (read_channel, read_id))) + + # process actions asynchronously after calling forward since otherwise, there is a lot of lock contention which + # means we cannot run at acceleration factor 10 + def _process_actions(self): + while True: + try: + action_type, args = self.action_queue.get_nowait() + except queue.Empty: + return + if action_type == ActionType.Unblock: + self._unblock_read(*args) + else: + assert action_type == ActionType.StopReceiving + self._stop_receiving_read(*args) + + def _unblock_read(self, read_channel, read_id, unblock_duration=None) -> Optional[bool]: """Unblock read""" self._check_channels_available([read_channel]) action_res = self._channels[read_channel-1].unblock(unblock_duration=unblock_duration, 
read_id=read_id) self._action_results.append((read_id, self._channels[read_channel-1].t, read_channel, ActionType.Unblock, action_res)) - logger.info(f"Unblocking read {read_id} on channel {read_channel}, result: {action_res.to_str()}") + # logger.info(f"Unblocking read {read_id} on channel {read_channel}, result: {action_res.to_str()}") return action_res - def stop_receiving_read(self, read_channel, read_id) -> Optional[StoppedReceivingResponse]: + def _stop_receiving_read(self, read_channel, read_id) -> Optional[StoppedReceivingResponse]: """Stop receiving from read""" self._check_channels_available([read_channel]) action_res = self._channels[read_channel-1].stop_receiving(read_id=read_id) self._action_results.append((read_id, self._channels[read_channel-1].t, read_channel, ActionType.StopReceiving, action_res)) - logger.info(f"Stopping receiving from read {read_id} on channel {read_channel}, result: {action_res.to_str()}") + # logger.info(f"Stopping receiving from read {read_id} on channel {read_channel}, result: {action_res.to_str()}") return action_res - def run_mux_scan(self, t_duration: float) -> int: + def run_mux_scan(self, t_duration: float, is_sync=False) -> int: """Pass in duration on each channel rather than end time because the channel may already have been forwarded in-between""" with self.lock_sim_state: # the lock ensures that channels are not forwarded - if self.sim_state != SimulationRunningState.Running: + if (not is_sync) and self.sim_state != SimulationRunningState.Running: logger.warning("Simulation not (or no longer) running, mux scan ignored") return 0 if self._channels[0].has_active_mux_scan(): @@ -653,9 +678,12 @@ def _check_not_async_mode(self): def sync_forward(self, t, delta=False, show_progress=False): """ - Forward all channels to time t + Process actions and forward all channels to time t + + Using (t=0, delta=True) means actions are processed """ self._check_not_async_mode() + self._process_actions() return 
self._forward_channels(t, delta=delta, show_progress=show_progress) def sync_start(self, t=None): @@ -728,7 +756,7 @@ def convert_action_results_to_df(action_results): action_results_df = pd.DataFrame.from_records(action_results, columns=["read_id", "time", "channel", "action_type", "success"]) return action_results_df -def simulator_stats_to_disk(simulators, output_dir=None): +def write_simulator_stats(simulators, output_dir=None): """ Dump action results (success or missed) and channel statistics to the run dir @@ -788,9 +816,9 @@ def plot_nb_actions_per_read(action_results_df, save_dir=None): reads_with_multiple_actions = nb_actions_per_read[nb_actions_per_read > 1].index.values reads_with_contradicting_actions = nb_unique_actions_per_read[nb_unique_actions_per_read > 1].index.values if len(reads_with_multiple_actions) > 0: - logger.warning(f"There are {len(reads_with_multiple_actions)} reads with multiple actions: {reads_with_multiple_actions}") + logger.warning(f"There are {len(reads_with_multiple_actions)} reads with multiple actions (possible same actions): {reads_with_multiple_actions}") if len(reads_with_contradicting_actions) > 0: - logger.warning(f"There are {len(reads_with_contradicting_actions)} reads with contradicting actions: {reads_with_contradicting_actions}") + logger.warning(f"There are {len(reads_with_contradicting_actions)} reads with contradicting actions (e.g. stop_receiving and unblock): {reads_with_contradicting_actions}") fig, (ax1, ax2, ax3) = plt.subplots(ncols=3, figsize=(15, 4)) @@ -854,58 +882,58 @@ def plot_action_success_rate(action_results_df, save_dir=None): return fig -class ReadUntilClientFromDevice(ReadUntilDevice): - """ - ReadUntilClient with ReadUntil actions operating on a batch of reads - - Named ReadUntilClientFromDevice to avoid nameclash with ReadUntilClient - - start, stop, device_info not implemented. They should be directly called on the device. 
- """ - def __init__(self, device : ReadUntilDevice): - self._device = device - - def __repr__(self): - res = "ReadUntilClientFromDevice of the following device:\n" - res += indent(repr(self._device), " ") - return res - - @property - def n_channels(self) -> int: - """ - Number of channels - """ - return len(self._device.n_channels) - - @property - def is_running(self) -> bool: - """ - Whether the device is sequencing - """ - return self._device.is_running - - @property - def mk_run_dir(self): - return self._device.mk_run_dir - - def get_basecalled_read_chunks(self, batch_size=None, channel_subset=None): - """ - Yield basecalled chunks from channels - - Args: - batch_size: maximum number of channels to get reads from - channel_subset: restrict to these channels (if provided) - - Yields: - basecalled chunks from channels in the form (chan_key, read_id, chunk, quality, estimated ref len of all chunks returned so far for this read) - """ - yield from self._device.get_basecalled_read_chunks(batch_size, channel_subset) - - def unblock_read(self, read_channel, read_id, unblock_duration=None) -> bool: - return self._device.unblock_read(read_channel, read_id=read_id, unblock_duration=unblock_duration) - - def stop_receiving_read(self, read_channel, read_id) -> StoppedReceivingResponse: - return self._device.stop_receiving_read(read_channel, read_id=read_id) +# class ReadUntilClientFromDevice(ReadUntilDevice): +# """ +# ReadUntilClient with ReadUntil actions operating on a batch of reads + +# Named ReadUntilClientFromDevice to avoid nameclash with ReadUntilClient + +# start, stop, device_info not implemented. They should be directly called on the device. 
+# """ +# def __init__(self, device : ReadUntilDevice): +# self._device = device + +# def __repr__(self): +# res = "ReadUntilClientFromDevice of the following device:\n" +# res += indent(repr(self._device), " ") +# return res + +# @property +# def n_channels(self) -> int: +# """ +# Number of channels +# """ +# return len(self._device.n_channels) + +# @property +# def is_running(self) -> bool: +# """ +# Whether the device is sequencing +# """ +# return self._device.is_running + +# @property +# def mk_run_dir(self): +# return self._device.mk_run_dir + +# def get_basecalled_read_chunks(self, batch_size=None, channel_subset=None): +# """ +# Yield basecalled chunks from channels + +# Args: +# batch_size: maximum number of channels to get reads from +# channel_subset: restrict to these channels (if provided) + +# Yields: +# basecalled chunks from channels in the form (chan_key, read_id, chunk, quality, estimated ref len of all chunks returned so far for this read) +# """ +# yield from self._device.get_basecalled_read_chunks(batch_size, channel_subset) + +# def unblock_read(self, read_channel, read_id, unblock_duration=None) -> bool: +# return self._device.unblock_read(read_channel, read_id=read_id, unblock_duration=unblock_duration) + +# def stop_receiving_read(self, read_channel, read_id) -> StoppedReceivingResponse: +# return self._device.stop_receiving_read(read_channel, read_id=read_id) def stop_simulation_after_time_thread(simulator: ONTSimulator, t: float): """ @@ -951,7 +979,7 @@ def run_periodic_mux_scan_thread(simulator: ONTSimulator, period: float, scan_du warnings.warn(f"Period between mux scans may be so short that mux scans happen the whole time: scan_duration={scan_duration:.2f}s, period={period:.2f}s") def _run_periodic_mux_scan(): - logger.info(f"Running periodic mux scan every {period:.2f}s (sim time), acceleration factor {acceleration_factor:.2f}") + logger.info(f"Running periodic mux scan every {period:.2f}s (sim time) with duration 
{scan_duration:.2f}s, acceleration factor {acceleration_factor:.2f}") i = 1 time_start = cur_ns_time() while True: @@ -1036,6 +1064,7 @@ def run_simulator_from_sampler_per_channel( read_pool=read_pool, reads_writer=reads_writer, sim_params=sim_params, + output_dir=reads_writer.output_dir, ) simulator.sync_start(0) @@ -1118,6 +1147,7 @@ def forward_channels(idx, sim_params, read_durations_per_channel, random_state): read_pool=read_pool, reads_writer=reads_writer, sim_params=sim_params, + output_dir="", ) simulator.sync_start(0) @@ -1185,7 +1215,7 @@ def parse_line(line): return df def plot_simulator_delay_over_time(df, n_delays=200, save_dir=None): - """Plot simulator delay over time""" + """Plot simulator delay over time for the loop updating the channels""" # df = df.sample(min(len(df), 200)) # restrict to the largest rather than sample @@ -1194,13 +1224,13 @@ def plot_simulator_delay_over_time(df, n_delays=200, save_dir=None): fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(11, 4)) sns.lineplot(df, x="time", y="delay", ax=ax1) - ax1.set_xlabel("Time (of iteration) (s)") + ax1.set_xlabel("Real time (of iteration) (s)") ax1.set_ylabel("Delay (s)") sns.lineplot(df, x="iteration", y="delay", ax=ax2) ax2.set_xlabel("Iteration") ax2.set_ylabel("Delay (s)") - fig.suptitle(f"Simulator delays (largest {n_delays})") + fig.suptitle(f"Simulator loop delays (largest {n_delays})") make_tight_layout(fig) if save_dir is not None: diff --git a/src/simreaduntil/simulator/simulator_client.py b/src/simreaduntil/simulator/simulator_client.py index 4e93eac..5d61e6b 100644 --- a/src/simreaduntil/simulator/simulator_client.py +++ b/src/simreaduntil/simulator/simulator_client.py @@ -3,7 +3,7 @@ """ import grpc -from typing import Any, List, Optional, Tuple +from typing import Any, Generator, List, Optional, Tuple from simreaduntil.simulator.channel import StoppedReceivingResponse, UnblockResponse from simreaduntil.simulator.simulator import ActionType @@ -133,11 +133,11 @@ def 
get_basecalled_read_chunks(self, batch_size=None, channel_subset=None): for chunk in self._stub.GetBasecalledChunks(ont_device_pb2.BasecalledChunksRequest(batch_size=batch_size, channels=channels)): yield (chunk.channel, chunk.read_id, chunk.seq, chunk.quality_seq, chunk.estimated_ref_len_so_far) - def get_action_results(self, clear=True) -> List[Tuple[Any, float, int, str, Any]]: + def get_action_results(self, clear=True) -> Generator[Tuple[Any, float, int, str, Any], Any, Any]: """ Get action results """ - for action_response in self._stub.GetActionResults(ont_device_pb2.ActionResultsRequest(clear=clear)).actions: + for action_response in self._stub.GetActionResults(ont_device_pb2.ActionResultsRequest(clear=clear)): action_type = ActionType(action_response.action_type) action_result = (StoppedReceivingResponse if action_type == ActionType.StopReceiving else UnblockResponse)(action_response.result) yield (action_response.read_id, action_response.time, action_response.channel, action_type, action_result) @@ -150,7 +150,7 @@ def unblock_read(self, read_channel, read_id, unblock_duration=None): # return self._stub.PerformActions(ont_device_pb2.ReadActionsRequest(actions=[ # ont_device_pb2.ReadActionsRequest.Action(channel=read_channel, read_id=read_id, unblock=ont_device_pb2.ReadActionsRequest.Action.UnblockAction(unblock_duration=unblock_duration if unblock_duration is not None else -1)) # ])).succeeded[0] - return self.unblock_read_batch([(read_channel, read_id)], unblock_duration=unblock_duration)[0] + self.unblock_read_batch([(read_channel, read_id)], unblock_duration=unblock_duration) def stop_receiving_read(self, read_channel, read_id): """ @@ -160,7 +160,7 @@ def stop_receiving_read(self, read_channel, read_id): # return self._stub.PerformActions(ont_device_pb2.ReadActionsRequest(actions=[ # ont_device_pb2.ReadActionsRequest.Action(channel=read_channel, read_id=read_id, stop_further_data=ont_device_pb2.ReadActionsRequest.Action.StopReceivingAction()), # 
])).succeeded[0] - return self.stop_receiving_read_batch([(read_channel, read_id)])[0] + self.stop_receiving_read_batch([(read_channel, read_id)]) # batch methods def unblock_read_batch(self, channel_and_ids, unblock_duration=None): @@ -168,18 +168,20 @@ def unblock_read_batch(self, channel_and_ids, unblock_duration=None): Unblock a batch of reads on channel; returns whether the actions were performed (not performed if the read was already over) """ self._check_connected() - return self._stub.PerformActions(ont_device_pb2.ReadActionsRequest(actions=[ + self._stub.PerformActions(ont_device_pb2.ReadActionsRequest(actions=[ ont_device_pb2.ReadActionsRequest.Action(channel=read_channel, read_id=read_id, unblock=ont_device_pb2.ReadActionsRequest.Action.UnblockAction(unblock_duration=unblock_duration if unblock_duration is not None else -1)) - for (read_channel, read_id) in channel_and_ids])).succeeded + for (read_channel, read_id) in channel_and_ids + ])) def stop_receiving_read_batch(self, channel_and_ids): """ Stop receiving a batch of reads on channel; returns whether the actions were performed (not performed if the read was already over) """ self._check_connected() - return self._stub.PerformActions(ont_device_pb2.ReadActionsRequest(actions=[ + self._stub.PerformActions(ont_device_pb2.ReadActionsRequest(actions=[ ont_device_pb2.ReadActionsRequest.Action(channel=read_channel, read_id=read_id, stop_further_data=ont_device_pb2.ReadActionsRequest.Action.StopReceivingAction()) - for (read_channel, read_id) in channel_and_ids])).succeeded + for (read_channel, read_id) in channel_and_ids + ])) @property def mk_run_dir(self): diff --git a/src/simreaduntil/simulator/simulator_params.py b/src/simreaduntil/simulator/simulator_params.py index fe15193..d665399 100644 --- a/src/simreaduntil/simulator/simulator_params.py +++ b/src/simreaduntil/simulator/simulator_params.py @@ -1,5 +1,5 @@ """ -Manage simulation parameters such as bps_per_second, chunk_size, gap_samplers +Manage 
simulation parameters such as bps_per_second, min_chunk_size, gap_samplers """ from simreaduntil.simulator.gap_sampling.gap_sampling import GapSampler @@ -17,21 +17,21 @@ class SimParams: gap_samplers: gap samplers for each channel bp_per_second: basepairs per second going through the pore (per channel) default_unblock_duration: extra delay to reject a read / unblock a pore, in seconds - chunk_size: chunk size for selective sequencing ReadUntil (size of chunks when sending data; chunks are concatenated; last chunk has shorter size) + min_chunk_size: minimum chunk size for selective sequencing ReadUntil seed: seed for random number generator, same state set on all channels """ - def __init__(self, gap_samplers: Dict[str, GapSampler], bp_per_second=450, default_unblock_duration=0.1, chunk_size=200, pore_model: Optional[PoreModel]=None, seed: Union[int, np.random.Generator]=0): - self.set(gap_samplers=gap_samplers, bp_per_second=bp_per_second, default_unblock_duration=default_unblock_duration, chunk_size=chunk_size, seed=seed, pore_model=pore_model) + def __init__(self, gap_samplers: Dict[str, GapSampler], bp_per_second=450, default_unblock_duration=0.1, min_chunk_size=200, pore_model: Optional[PoreModel]=None, seed: Union[int, np.random.Generator]=0): + self.set(gap_samplers=gap_samplers, bp_per_second=bp_per_second, default_unblock_duration=default_unblock_duration, min_chunk_size=min_chunk_size, seed=seed, pore_model=pore_model) def restrict_to_channels(self, channels, rand_state): """Subset SimParams to some channels""" - return SimParams(gap_samplers={channel: self.gap_samplers[channel] for channel in channels}, bp_per_second=self.bp_per_second, default_unblock_duration=self.default_unblock_duration, chunk_size=self.chunk_size, seed=rand_state) + return SimParams(gap_samplers={channel: self.gap_samplers[channel] for channel in channels}, bp_per_second=self.bp_per_second, default_unblock_duration=self.default_unblock_duration, min_chunk_size=self.min_chunk_size, 
seed=rand_state) def __repr__(self) -> str: # repr(random_state) is not very informative (does not show seed, so we store it separately and display it here) - return f"""SimParams(bp_per_second={self.bp_per_second}, default_unblock_duration={self.default_unblock_duration}, chunk_size={self.chunk_size}, initial_seed={self._initial_seed}, n_channels={len(self.gap_samplers)})""" + return f"""SimParams(bp_per_second={self.bp_per_second}, default_unblock_duration={self.default_unblock_duration}, min_chunk_size={self.min_chunk_size}, initial_seed={self._initial_seed}, n_channels={len(self.gap_samplers)})""" - def set(self, *, gap_samplers: Dict[str, GapSampler]=None, bp_per_second=None, default_unblock_duration=None, chunk_size=None, pore_model=None, seed=None): + def set(self, *, gap_samplers: Dict[str, GapSampler]=None, bp_per_second=None, default_unblock_duration=None, min_chunk_size=None, pore_model=None, seed=None): """ Set parameters, None values are ignored """ @@ -44,8 +44,8 @@ def set(self, *, gap_samplers: Dict[str, GapSampler]=None, bp_per_second=None, d self.bp_per_second = bp_per_second if default_unblock_duration is not None: self.default_unblock_duration = default_unblock_duration - if chunk_size is not None: - self.chunk_size = chunk_size + if min_chunk_size is not None: + self.min_chunk_size = min_chunk_size if pore_model is not None: self.pore_model = pore_model if seed is not None: @@ -66,8 +66,8 @@ def _check_sim_params(self): assert isinstance(self.bp_per_second, (int, float)) assert self.bp_per_second > 0 - assert isinstance(self.chunk_size, int) - assert self.chunk_size > 0 + assert isinstance(self.min_chunk_size, int) + assert self.min_chunk_size > 0 assert isinstance(self.default_unblock_duration, (int, float)) assert self.default_unblock_duration >= 0 diff --git a/src/simreaduntil/simulator/simulator_server.py b/src/simreaduntil/simulator/simulator_server.py index 764b979..1900524 100644 --- a/src/simreaduntil/simulator/simulator_server.py +++ 
b/src/simreaduntil/simulator/simulator_server.py @@ -102,7 +102,7 @@ def PerformActions(self, request, context): res.append(self.device.unblock_read(channel, read_id=read_id, unblock_duration=unblock_duration)) else: res.append(self.device.stop_receiving_read(channel, read_id=read_id)) #todo2: current conversion from enum 0,1,2 to bool is not ideal - return ont_device_pb2.ActionResultImmediateResponse(succeeded=res) + return ont_device_pb2.EmptyResponse() @print_gen_exceptions def GetActionResults(self, request, context): @@ -134,8 +134,8 @@ def StopSim(self, request, context): @print_nongen_exceptions def RunMuxScan(self, request, context): - assert request.HasField("t_duration"), "t_duration must be set" - return ont_device_pb2.MuxScanStartedInfo(value=self.device.run_mux_scan(t_duration=request.t_duration)) + # assert request.HasField("t_duration"), "t_duration must be set" # implicitly set + return ont_device_pb2.RunMuxScanResponse(nb_reads_rejected=self.device.run_mux_scan(t_duration=request.t_duration)) @print_nongen_exceptions # whether simulation is running diff --git a/src/simreaduntil/simulator/utils.py b/src/simreaduntil/simulator/utils.py index a259671..b752bad 100644 --- a/src/simreaduntil/simulator/utils.py +++ b/src/simreaduntil/simulator/utils.py @@ -32,9 +32,9 @@ def in_interval(x, interval): # pylint: disable=invalid-name _counter = _count() -def new_thread_name(template_str="ont-sim-{}"): +def new_thread_name(template_str="thread-{}"): """ - Helper to generate new thread names + Helper to generate new thread names, thread name is unlikely to exist because we use a counter Args: template_str: string with one placeholder for the counter @@ -53,3 +53,11 @@ def set_package_log_level(log_level=logging.INFO): with temp_logging_level(logging.getLogger("ru"), log_level): with temp_logging_level(logging.getLogger("simreaduntil"), log_level): yield + + # sys.stdout = Tee(old_stdout, out_file) + # sys.stderr = Tee(old_stderr, err_file) + # yield + # 
sys.stdout = old_stdout + # sys.stderr = old_stderr + + \ No newline at end of file diff --git a/src/simreaduntil/usecase_helpers/cli_usecase/simulator_client_cli.py b/src/simreaduntil/usecase_helpers/cli_usecase/simulator_client_cli.py index 6af3603..8f3479b 100644 --- a/src/simreaduntil/usecase_helpers/cli_usecase/simulator_client_cli.py +++ b/src/simreaduntil/usecase_helpers/cli_usecase/simulator_client_cli.py @@ -1,6 +1,7 @@ import argparse import logging +import signal import time import grpc @@ -11,6 +12,7 @@ from simreaduntil.shared_utils.utils import print_args from simreaduntil.simulator.simulator_client import DeviceGRPCClient +from simreaduntil.shared_utils.utils import set_signal_handler logger = setup_logger_simple(__name__) """module logger""" @@ -49,35 +51,36 @@ def main(): num_batches = 0 num_chunks = 0 - try: - with logging_redirect_tqdm(): - while True: - num_batches += 1 - for (channel, read_id, seq, quality_seq, estimated_ref_len_so_far) in tqdm(client.get_basecalled_read_chunks(), desc=f"Processing chunks in batch {num_batches}"): - num_chunks += 1 - logger.debug(f"Read chunk: channel={channel}, read_id={read_id}, seq={seq[:20]}..., quality_seq={quality_seq}, estimated_ref_len_so_far={estimated_ref_len_so_far}") - u = rng.uniform() - if u < 0.2: - logger.debug(f"Rejecting read '{read_id}'") - client.unblock_read(channel, read_id) - elif u < 0.4: - logger.debug(f"Stop receiving read '{read_id}'") - client.stop_receiving_read(channel, read_id) - else: - # no action - pass - # time.sleep(0.05) - time.sleep(0.2) # throttle - except KeyboardInterrupt: - pass - except grpc.RpcError as e: - logger.error(f"Caught gRPC error: {e}") - finally: + def stop_client(*args, **kwargs): try: if client.stop(): logger.info("Stopped simulation") except grpc.RpcError as e: pass + + with set_signal_handler(signal_type=signal.SIGINT, handler=stop_client): # catch keyboard interrupt (Ctrl+C) + try: + with logging_redirect_tqdm(): + while client.is_running: + 
num_batches += 1 + for (channel, read_id, seq, quality_seq, estimated_ref_len_so_far) in tqdm(client.get_basecalled_read_chunks(), desc=f"Processing chunks in batch {num_batches}"): + num_chunks += 1 + logger.debug(f"Read chunk: channel={channel}, read_id={read_id}, seq={seq[:20]}..., quality_seq={quality_seq}, estimated_ref_len_so_far={estimated_ref_len_so_far}") + u = rng.uniform() + if u < 0.2: + logger.debug(f"Rejecting read '{read_id}'") + client.unblock_read(channel, read_id) + elif u < 0.4: + logger.debug(f"Stop receiving read '{read_id}'") + client.stop_receiving_read(channel, read_id) + else: + # no action + pass + # time.sleep(0.05) + time.sleep(0.2) # throttle + except grpc.RpcError as e: + logger.error(f"Caught gRPC error: {e}") + logger.info(f"Done. Received {num_chunks} chunks from {num_batches} batches") diff --git a/src/simreaduntil/usecase_helpers/cli_usecase/simulator_server_cli.py b/src/simreaduntil/usecase_helpers/cli_usecase/simulator_server_cli.py index 5497408..5152fc4 100644 --- a/src/simreaduntil/usecase_helpers/cli_usecase/simulator_server_cli.py +++ b/src/simreaduntil/usecase_helpers/cli_usecase/simulator_server_cli.py @@ -15,12 +15,13 @@ from simreaduntil.shared_utils.utils import print_args from simreaduntil.simulator.gap_sampling.constant_gaps_until_blocked import ConstantGapsUntilBlocked from simreaduntil.simulator.gap_sampling.rolling_window_gap_sampler import RollingWindowGapSamplerPerChannel -from simreaduntil.simulator.readpool import ReadPoolFromFile -from simreaduntil.simulator.readswriter import RotatingFileReadsWriter -from simreaduntil.simulator.simfasta_to_seqsum import convert_simfasta_dir_to_seqsum -from simreaduntil.simulator.simulator import ONTSimulator, simulator_stats_to_disk +from simreaduntil.simulator.readpool import ReadPoolFromFile, ThreadedReadPoolWrapper +from simreaduntil.simulator.readswriter import CompoundReadsWriter, RotatingFileReadsWriter +from simreaduntil.simulator.simfasta_to_seqsum import 
SequencingSummaryWriter, convert_simfasta_dir_to_seqsum +from simreaduntil.simulator.simulator import ONTSimulator, write_simulator_stats from simreaduntil.simulator.simulator_params import SimParams from simreaduntil.simulator.simulator_server import launchable_device_grpc_server, manage_grpc_server +from simreaduntil.shared_utils.utils import set_signal_handler logger = setup_logger_simple(__name__) """module logger""" @@ -30,10 +31,10 @@ def parse_args(args=None): parser.add_argument("reads_file", type=Path, help="Path to the reads, e.g., generated by NanoSim or perfect reads") parser.add_argument("run_dir", type=Path, help="Run dir, must not exist", default="example_run") # todo: add pore model, extract sim params from an existing run: ConstantGapsUntilBlocked.from_seqsum_df, RollingWindowGapSamplerPerChannel.from_seqsum_df - parser.add_argument("--num_channels", type=int, help="Number of channels", default=512) + parser.add_argument("--n_channels", type=int, help="Number of channels", default=512) # parser.add_argument("--run_time", type=float, help="Time to run for (s)", default=2 ** 63 / 10 ** 9) # maximum value handled by time.sleep parser.add_argument("--acceleration_factor", type=float, help="Speedup factor for simulation", default=1.0) - parser.add_argument("--chunk_size", type=int, help="Number of basepairs per chunk (API returns a multiple of chunk_size per channel)", default=200) + parser.add_argument("--min_chunk_size", type=int, help="Number of basepairs per chunk (API returns a multiple of min_chunk_size per channel)", default=200) parser.add_argument("--bp_per_second", type=int, help="Pore speed (number of basepairs per second (per channel))", default=450) parser.add_argument("--seed", type=int, help="Random seed", default=None) parser.add_argument("--unblock_duration", type=float, help="Duration to unblock a read (s)", default=0.1) @@ -65,14 +66,14 @@ def main(): reads_file = args.reads_file assert reads_file.exists(), f"reads_file '{reads_file}' 
does not exist" run_dir = args.run_dir - num_channels = args.num_channels - assert num_channels > 0, f"num_channels {num_channels} must be > 0" + n_channels = args.n_channels + assert n_channels > 0, f"n_channels {n_channels} must be > 0" # run_time = args.run_time # assert run_time > 0, f"run_time {run_time} must be > 0" acceleration_factor = args.acceleration_factor assert acceleration_factor > 0, f"acceleration_factor {acceleration_factor} must be > 0" - chunk_size = args.chunk_size - assert chunk_size > 0, f"chunk_size {chunk_size} must be > 0" + min_chunk_size = args.min_chunk_size + assert min_chunk_size > 0, f"min_chunk_size {min_chunk_size} must be > 0" bp_per_second = args.bp_per_second assert bp_per_second > 0, f"bp_per_second {bp_per_second} must be > 0" seed = args.seed @@ -98,43 +99,49 @@ def main(): logger.info("Reading in reads. pysam index creation may take some time.") read_pool = ReadPoolFromFile(reads_file=reads_file) + read_pool = ThreadedReadPoolWrapper(read_pool, queue_size=2*n_channels) mk_run_dir = run_dir / "reads" logger.info(f"Writing basecalled reads to directory '{mk_run_dir}'") reads_writer = RotatingFileReadsWriter(mk_run_dir, "reads_", max_reads_per_file=4000) + seqsum_writer = SequencingSummaryWriter(open(run_dir / "live_sequencing_summary.txt", "w")) + reads_writer = CompoundReadsWriter([reads_writer, seqsum_writer]) - gap_samplers = {f"chan{channel}": ConstantGapsUntilBlocked(short_gap_length=0.4, long_gap_length=10, prob_long_gap=0.05, time_until_blocked=np.inf, read_delay=0.05) for channel in range(num_channels)} + gap_samplers = {f"chan{channel}": ConstantGapsUntilBlocked(short_gap_length=0.4, long_gap_length=10, prob_long_gap=0.05, time_until_blocked=np.inf, read_delay=0.05) for channel in range(n_channels)} logger.info("Using constant gap samplers") - sim_params = SimParams(gap_samplers=gap_samplers, bp_per_second=bp_per_second, default_unblock_duration=unblock_duration, chunk_size=chunk_size, 
seed=np.random.default_rng(seed)) - - simulator = ONTSimulator( - read_pool=read_pool, - reads_writer=reads_writer, - sim_params=sim_params, - ) - - ####### Starting the gRPC server ####### - port, server, unique_id = launchable_device_grpc_server(simulator, port=port) - assert port != 0, f"port {port} already in use" - - logger.info(f"Starting gRPC server on port {port}") - with manage_grpc_server(server): - logger.info("Started gRPC server") - try: - if not dont_start: - logger.info("Starting the simulation") - simulator.start(acceleration_factor=acceleration_factor, log_interval=100) - else: - logger.info("Not starting the simulation, must be started manually (via a gRPC call), or remove the --dont_start flag") - signal.pause() # wait for keyboard interrupt - except KeyboardInterrupt: - pass - finally: - if simulator.stop(): - logger.info("Stopped simulation") - - simulator_stats_to_disk([simulator], output_dir=run_dir) - + sim_params = SimParams(gap_samplers=gap_samplers, bp_per_second=bp_per_second, default_unblock_duration=unblock_duration, min_chunk_size=min_chunk_size, seed=np.random.default_rng(seed)) + + with reads_writer, read_pool: + simulator = ONTSimulator( + read_pool=read_pool, + reads_writer=reads_writer, + sim_params=sim_params, + output_dir=run_dir, + ) + + ####### Starting the gRPC server ####### + port, server, unique_id = launchable_device_grpc_server(simulator, port=port) + assert port != 0, f"port {port} already in use" + + logger.info(f"Starting gRPC server on port {port}") + with manage_grpc_server(server): + logger.info("Started gRPC server") + + def stop_server(*args, **kwargs): + if simulator.stop(): + logger.info("Stopped simulation") + + with set_signal_handler(signal_type=signal.SIGINT, handler=stop_server): # catch keyboard interrupt (Ctrl+C) + if not dont_start: + logger.info("Starting the simulation") + simulator.start(acceleration_factor=acceleration_factor, log_interval=100) + else: + logger.info("Not starting the simulation, 
must be started manually (via a gRPC call), or remove the --dont_start flag") + signal.pause() # wait for keyboard interrupt, only for Linux, alternatively run a while loop with time.sleep(1) + + write_simulator_stats([simulator], output_dir=simulator.mk_run_dir) + + # todo: possibly remove since the same as live sequencing summary seqsum_filename = run_dir / "sequencing_summary.txt" logger.info(f"Writing sequencing summary file '{seqsum_filename}'") convert_simfasta_dir_to_seqsum(reads_writer.output_dir, seqsummary_filename=seqsum_filename) diff --git a/src/simreaduntil/usecase_helpers/readfish_plotting.py b/src/simreaduntil/usecase_helpers/readfish_plotting.py index 1496a15..199c316 100644 --- a/src/simreaduntil/usecase_helpers/readfish_plotting.py +++ b/src/simreaduntil/usecase_helpers/readfish_plotting.py @@ -42,15 +42,19 @@ def parse_line(line): return df -def plot_extra_basecalling_delay_per_iter(df, save_dir=None): +def plot_extra_basecalling_delay_per_iter(df, save_dir=None, n_points=200): """ Plots the extra basecalling delay per iteration. - One iteration is a called to get_basecalled_read_chunks() which logs the total time spent due to basecalling. + Sort by avg_extra_wait_time, then take the n_points largest. + + One iteration is a call to get_basecalled_read_chunks() which logs the total time spent due to basecalling. The basecalling starts right when the function is called, so if the processing of the basecalled data takes longer, there is no extra delay due to basecalling. 
""" - df = df.sample(min(len(df), 200)) + # df = df.sample(min(len(df), 200)) + # restrict to the largest rather than sample + df = df.sort_values("avg_extra_wait_time", ascending=False).iloc[:n_points] fig, (ax1, ax2, ax3) = plt.subplots(ncols=3, figsize=(15, 4)) # to show all in one plot @@ -61,24 +65,80 @@ def plot_extra_basecalling_delay_per_iter(df, save_dir=None): # twin2.spines.right.set_position(("axes", 1.2)) sns.lineplot(df, x="time", y="extra_wait_time", ax=ax1) - ax1.set_xlabel("Time (of iteration) (s)") + ax1.set_xlabel("Real time (of iteration) (s)") ax1.set_ylabel("Extra wait time (whole iteration) (s)") ax1.set_title("Extra waiting time (whole iteration)") sns.lineplot(df, x="time", y="avg_extra_wait_time", ax=ax2) - ax2.set_xlabel("Time (of iteration) (s)") + ax2.set_xlabel("Real time (of iteration) (s)") ax2.set_ylabel("Average waiting time per basepair (s)") ax2.set_title("Average extra waiting time") sns.lineplot(df, x="time", y="nb_basepairs", ax=ax3) - ax3.set_xlabel("Time (of iteration) (s)") + ax3.set_xlabel("Real time (of iteration) (s)") ax3.set_ylabel("Number of called basepairs at iteration") ax3.set_title("Number of called basepairs at iteration") - fig.suptitle("Extra delay due to basecalling (delaying ReadFish)") + fig.suptitle(f"Extra delay due to basecalling (delaying ReadFish), largest {n_points})") # wrt avg_extra_wait_time + + make_tight_layout(fig) + if save_dir is not None: + save_fig_and_pickle(fig, save_dir / f"readfish_extra_basecall_delay.{FIGURE_EXT}") + + return fig + +def plot_chunk_waiting_time(df, save_dir=None, n_points=200): + """ + Plots the time spent waiting for chunks from the device per iteration and time. + + Sorts by waiting_time, then takes the n_points largest. 
+ """ + # df = df.sample(min(len(df), 200)) + # restrict to the largest rather than sample + df["iteration"] = df.index + 1 + df = df.sort_values("waiting_time", ascending=False).iloc[:n_points] + + fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(11, 4)) + + sns.lineplot(df, x="time", y="waiting_time", ax=ax1) + ax1.set_xlabel("Real time (of iteration) (s)") + ax1.set_ylabel("Time waiting for chunks (s)") + sns.lineplot(df, x="iteration", y="waiting_time", ax=ax2) + ax2.set_xlabel("ReadFish iteration") + ax2.set_ylabel("Time waiting for chunks (s)") + + fig.suptitle(f"Time waiting for chunks (largest {n_points})") make_tight_layout(fig) if save_dir is not None: - save_fig_and_pickle(fig, save_dir / f"extra_basecall_delay.{FIGURE_EXT}") + save_fig_and_pickle(fig, save_dir / f"readfish_chunks_waiting_time.{FIGURE_EXT}") + + return fig + +def plot_chunk_mapping_time(df, save_dir=None, n_points=200): + """ + Plots the time spent mapping chunks from the device per iteration and time. + Sorts by mapping_time, then takes the n_points largest. 
+ """ + # df = df.sample(min(len(df), 200)) + # restrict to the largest rather than sample + df["iteration"] = df.index + 1 + df = df.sort_values("mapping_time", ascending=False).iloc[:n_points] + + fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(11, 4)) + + sns.lineplot(df, x="time", y="mapping_time", ax=ax1) + ax1.set_xlabel("Real time (of iteration) (s)") + ax1.set_ylabel("Time mapping chunks (s)") + sns.lineplot(df, x="iteration", y="mapping_time", ax=ax2) + ax2.set_xlabel("ReadFish iteration") + ax2.set_ylabel("Time mapping chunks (s)") + + fig.suptitle(f"Time mapping chunks (largest {n_points})") + + make_tight_layout(fig) + if save_dir is not None: + save_fig_and_pickle(fig, save_dir / f"readfish_chunks_mapping_time.{FIGURE_EXT}") + return fig def get_processing_time_per_read_over_time_df(log_filename): @@ -121,16 +181,16 @@ def plot_readfish_processing_time(df, save_dir=None): # twin2.spines.right.set_position(("axes", 1.2)) sns.lineplot(df, x="time", y="elapsed_time", ax=ax1) - ax1.set_xlabel("Time (of iteration) (s)") + ax1.set_xlabel("Real time (of iteration) (s)") ax1.set_ylabel("Elapsed time (whole iteration) (s)") sns.lineplot(df, x="time", y="elapsed_time_per_read", ax=ax2) - ax2.set_xlabel("Time (of iteration) (s)") + ax2.set_xlabel("Real time (of iteration) (s)") ax2.set_ylabel("Average elapsed time per read (s)") sns.lineplot(df, x="time", y="nb_reads", ax=ax3) - ax3.set_xlabel("Time (of iteration) (s)") + ax3.set_xlabel("Real time (of iteration) (s)") ax3.set_ylabel("Number of reads at iteration") - fig.suptitle("ReadFish processing time") + fig.suptitle("ReadFish processing time (sampled)") make_tight_layout(fig) if save_dir is not None: @@ -161,18 +221,67 @@ def parse_line(line): return df -def plot_throttle_over_time(df, save_dir=None): +def get_chunk_wait_time_over_time_df(log_filename): + """cumulative time waiting for chunks from the device per iteration""" + MARKER = "ReadFish time waiting for chunks: " + def parse_line(line): + # return 
time of log entry, throttle + log_time, remaining = line.split(" - ", maxsplit=1) + # convert log_time to time + log_time = datetime.datetime.strptime(log_time, "%Y-%m-%d %H:%M:%S,%f") + remaining = remaining.split(" --- ", maxsplit=1)[0] + remaining = remaining.split(MARKER)[1] + return log_time, float(remaining[:-1]) + + # line = "2023-12-16 11:51:53,349 - ReadFish time waiting for chunks: 0.01086s --- ru_gen.py:395 (simple_analysis) INFO ##" + # parse_line(line) + + with open(log_filename) as f: + info = [parse_line(line) for line in f if MARKER in line] + # info = list(itertools.islice((parse_line(line) for line in f if MARKER in line), 100)) + df = pd.DataFrame.from_records(info, columns=["time", "waiting_time"]) + if len(df) > 0: + df["time"] = (df["time"] - df["time"].iloc[0]).dt.total_seconds() + + return df + +def get_chunk_mapping_time_over_time_df(log_filename): + """cumulative time mapping chunks from the device per iteration""" + MARKER = "ReadFish mapping time for chunks: " + def parse_line(line): + # return time of log entry, throttle + log_time, remaining = line.split(" - ", maxsplit=1) + # convert log_time to time + log_time = datetime.datetime.strptime(log_time, "%Y-%m-%d %H:%M:%S,%f") + remaining = remaining.split(" --- ", maxsplit=1)[0] + remaining = remaining.split(MARKER)[1] + return log_time, float(remaining[:-1]) + + # line = "2023-12-16 11:51:53,349 - ReadFish mapping time for chunks: 0.01086s --- ru_gen.py:395 (simple_analysis) INFO ##" + # parse_line(line) + + with open(log_filename) as f: + info = [parse_line(line) for line in f if MARKER in line] + # info = list(itertools.islice((parse_line(line) for line in f if MARKER in line), 100)) + df = pd.DataFrame.from_records(info, columns=["time", "mapping_time"]) + if len(df) > 0: + df["time"] = (df["time"] - df["time"].iloc[0]).dt.total_seconds() + + return df + +def plot_throttle_over_time(df, save_dir=None, n_points=200): """Plot ReadFish throttle over time""" - df = df.sample(min(len(df), 
200)) + # df = df.sample(min(len(df), 200)) + df = df.sort_values("throttle", ascending=False).iloc[:n_points] df.sort_values("time", inplace=True) fig, ax = plt.subplots() ax.plot(df["time"], df["throttle"]) sns.lineplot(df, x="time", y="throttle", ax=ax) - ax.set_xlabel("Time (s)") + ax.set_xlabel("Real time (s)") ax.set_ylabel("Throttle") - ax.set_title("Throttle over time") + ax.set_title(f"ReadFish Throttle over time (largest {n_points}, negative means too slow)") make_tight_layout(fig) if save_dir is not None: @@ -182,8 +291,12 @@ def plot_throttle_over_time(df, save_dir=None): if __name__ == "__main__": - log_filename = "/Volumes/mmordig/joblogs/job-13931078-0.err" - + log_filename = "/home/mmordig/ont_project_all/ont_project/runs/enrich_usecase/full_genome_run_sampler_per_window/log.txt" + # plt.get_backend() + # import matplotlib + # print(matplotlib.rcsetup.all_backends) + # plt.switch_backend('TkAgg') + proc_df = get_processing_time_per_read_over_time_df(log_filename) plot_readfish_processing_time(proc_df) @@ -192,4 +305,15 @@ def plot_throttle_over_time(df, save_dir=None): basecall_delay_df = get_extra_basecall_delay_over_time_df(log_filename) plot_extra_basecalling_delay_per_iter(basecall_delay_df) + + chunk_waiting_time_df = get_chunk_wait_time_over_time_df(log_filename) + plot_chunk_waiting_time(chunk_waiting_time_df) + + chunk_mapping_time_df = get_chunk_mapping_time_over_time_df(log_filename) + plot_chunk_mapping_time(chunk_mapping_time_df) + + # # also need X11 forwarding and XQuartz running on MacOS X, linux text mode is sufficient (graphical mode not required) + # import matplotlib.pyplot as plt + # plt.show() + \ No newline at end of file diff --git a/src/simreaduntil/usecase_helpers/readfish_wrappers.py b/src/simreaduntil/usecase_helpers/readfish_wrappers.py index 8622b07..2d5d437 100644 --- a/src/simreaduntil/usecase_helpers/readfish_wrappers.py +++ b/src/simreaduntil/usecase_helpers/readfish_wrappers.py @@ -127,7 +127,7 @@ def 
basecall_minknow(self, reads: Iterable[Tuple[int, ReadWrapper]], signal_dtyp (channel, read_number), read_id, sequence, sequence_length, quality """ - time_start = time.perf_counter_ns() # in nanoseconds, only offsets are correct + time_start = cur_ns_time() total_wait_time = 0 nb_called_bps = 0 for (channel, read_info) in reads: @@ -136,7 +136,7 @@ def basecall_minknow(self, reads: Iterable[Tuple[int, ReadWrapper]], signal_dtyp # to imitate the guppy basecaller which runs in parallel, we do not delay each time something is requested, but rather since the function was called nb_called_bps += len(read_info.seq) if self.time_per_bp > 0: - wait_time = nb_called_bps * self.time_per_bp - (time.perf_counter_ns() - time_start)/1_000_000_000 + wait_time = nb_called_bps * self.time_per_bp - (cur_ns_time() - time_start) if wait_time > 0: time.sleep(wait_time) total_wait_time += wait_time @@ -203,13 +203,15 @@ def __init__(self, index): @staticmethod def _map_seq(read_id, seq_len): - parsed = NanoSimId.from_str(read_id) + parsed_id = NanoSimId.from_str(read_id) - return NanoSimMapper.Alignment( + if parsed_id.read_type == "unaligned": + return [] + return [NanoSimMapper.Alignment( query_name=read_id, query_len=seq_len, query_start=0, query_end=seq_len, - target_strand=1 if parsed.direction == "F" else -1, target_name=parsed.chrom, target_len="*", target_start=parsed.ref_pos, target_end=parsed.ref_len, + target_strand=1 if parsed_id.direction == "F" else -1, target_name=parsed_id.chrom, target_len="*", target_start=parsed_id.ref_pos, target_end=parsed_id.ref_len, num_matches=seq_len, alignment_block_length=seq_len, mapping_quality=255 - ) + )] def map_reads_2(self, calls): """Align reads against a reference @@ -222,7 +224,7 @@ def map_reads_2(self, calls): """ for read_info, read_id, seq, seq_len, quality in calls: assert len(seq) == seq_len - yield read_info, read_id, seq_len, [self._map_seq(read_id, seq_len)] + yield read_info, read_id, seq_len, self._map_seq(read_id, 
seq_len) @contextmanager def replace_ru_mapper(replace): diff --git a/src/simreaduntil/usecase_helpers/simulator_with_readfish.py b/src/simreaduntil/usecase_helpers/simulator_with_readfish.py index cc75ed2..63bd33c 100644 --- a/src/simreaduntil/usecase_helpers/simulator_with_readfish.py +++ b/src/simreaduntil/usecase_helpers/simulator_with_readfish.py @@ -6,6 +6,8 @@ from contextlib import contextmanager import logging from pathlib import Path +import signal +import time import numpy as np import pysam from simreaduntil.shared_utils.debugging_helpers import is_test_mode @@ -13,12 +15,12 @@ from simreaduntil.shared_utils.logging_utils import add_comprehensive_stream_handler_to_logger, setup_logger_simple from simreaduntil.shared_utils.timing import cur_ns_time -from simreaduntil.shared_utils.utils import delete_dir_if_exists, dill_dump, dill_load, print_args +from simreaduntil.shared_utils.utils import delete_dir_if_exists, dill_dump, dill_load, print_args, set_signal_handler from simreaduntil.simulator.gap_sampling.constant_gaps_until_blocked import ConstantGapsUntilBlocked -from simreaduntil.simulator.readpool import ReadPoolFromFile, ReadPoolFromIterable -from simreaduntil.simulator.readswriter import ArrayReadsWriter, RotatingFileReadsWriter, SingleFileReadsWriter -from simreaduntil.simulator.simfasta_to_seqsum import convert_simfasta_dir_to_seqsum -from simreaduntil.simulator.simulator import ONTSimulator, convert_action_results_to_df, run_periodic_mux_scan_thread, simulator_stats_to_disk, stop_simulation_after_time_thread +from simreaduntil.simulator.readpool import ReadPoolFromFile, ReadPoolFromIterable, ThreadedReadPoolWrapper +from simreaduntil.simulator.readswriter import ArrayReadsWriter, CompoundReadsWriter, RotatingFileReadsWriter, SingleFileReadsWriter, ThreadedReadsWriterWrapper +from simreaduntil.simulator.simfasta_to_seqsum import SequencingSummaryWriter, convert_simfasta_dir_to_seqsum +from simreaduntil.simulator.simulator import ONTSimulator, 
convert_action_results_to_df, run_periodic_mux_scan_thread, write_simulator_stats, stop_simulation_after_time_thread from simreaduntil.simulator.simulator_client import DeviceGRPCClient from simreaduntil.simulator.simulator_params import SimParams from simreaduntil.simulator.simulator_server import launchable_device_grpc_server, manage_grpc_server @@ -48,30 +50,36 @@ def compute_nonselective_coverage(ref_genome_path, reads_file): """Compute coverage if all reads are played back without any selective sequencing, i.e. full reads""" ref_length = sum(get_ref_lengths(ref_genome_path).values()) - total_reads_length = sum(get_ref_lengths(reads_file).values()) + reads_file = Path(reads_file) + if reads_file.is_dir(): + total_reads_length = sum(sum(get_ref_lengths(x).values()) for x in reads_file.glob("*.fasta")) + else: + total_reads_length = sum(get_ref_lengths(reads_file).values()) return total_reads_length / ref_length -def get_reads_writer(run_dir: Path, rotating: bool): +def get_reads_writer(run_dir: Path, rotating_writeout: bool): # reads_writer = ArrayReadsWriter() # for debugging mostly mk_run_dir = run_dir / "reads" delete_dir_if_exists(mk_run_dir) mk_run_dir.mkdir() - if rotating: + if rotating_writeout: reads_writer = RotatingFileReadsWriter(mk_run_dir, "reads_", max_reads_per_file=4000) else: reads_writer = SingleFileReadsWriter(open(mk_run_dir / "reads.fasta", "w")) - reads_writer.output_dir = mk_run_dir + seqsum_writer = SequencingSummaryWriter(open(run_dir / "live_sequencing_summary.txt", "w")) + reads_writer = CompoundReadsWriter([reads_writer, seqsum_writer]) + reads_writer = ThreadedReadsWriterWrapper(reads_writer) return reads_writer def get_sim_params(sim_params_file, n_channels) -> SimParams: if sim_params_file is None: # take realistic params, otherwise ReadFish mapper (minimap2) will still not map after 12 (small) chunks sim_params = SimParams( - gap_samplers={f"{i+1}": ConstantGapsUntilBlocked(short_gap_length=0.4, long_gap_length=10.1, 
prob_long_gap=0, time_until_blocked=np.inf, read_delay=0) for i in range(n_channels)}, - bp_per_second=450, chunk_size=200, default_unblock_duration=0.1, seed=0, + gap_samplers={f"ch{i+1}": ConstantGapsUntilBlocked(short_gap_length=0.4, long_gap_length=10.1, prob_long_gap=0, time_until_blocked=np.inf, read_delay=0.1) for i in range(n_channels)}, + bp_per_second=450, min_chunk_size=200, default_unblock_duration=0.1, seed=0, ) else: logger.info(f"Loading simparams from '{sim_params_file}'") @@ -79,17 +87,29 @@ def get_sim_params(sim_params_file, n_channels) -> SimParams: if n_channels != sim_params.n_channels: logger.warning(f"Using sim_params.n_channels={sim_params.n_channels} instead of {n_channels} because it was saved in the sim_params_file") + assert sorted(list(sim_params.gap_samplers.keys())) == {f"ch{i+1}" for i in range(n_channels)} # assumed by downstream plotting scripts + return sim_params -def get_read_pool(reads_file, ref_genome_path): +def get_read_pool(reads_file_type, reads_file, ref_genome_path, n_channels, reads_len_range=None): """Get read pool either from reads_file or perfect reads from ref_fasta""" - if reads_file is None: + + if reads_file_type == "generate": # read_pool = ReadPoolFromIterable(random_nanosim_reads_gen(random_state=np.random.default_rng(3), length_range=(10, 50))) logger.info(f"Generating perfect reads without NanoSim using ref genome '{ref_genome_path}'") - read_pool = ReadPoolFromIterable(perfect_reads_gen(ref_genome_path, read_lens_range=(5_000, 10_000), random_state=np.random.default_rng(1))) + assert reads_len_range is not None + read_pool = ReadPoolFromIterable(perfect_reads_gen(ref_genome_path, read_lens_range=reads_len_range, random_state=np.random.default_rng(1))) + elif reads_file_type == "fasta": + logger.info("Reading in FASTA reads. 
pysam index creation may take some time.") + # read_pool = ReadPoolFromFile(reads_file=reads_file) + # read_pool = ReadPoolFromFile(reads_file_or_dir=reads_file) + read_pool = ReadPoolFromFile(reads_file_or_dir=reads_file, shuffle_rand_state=np.random.default_rng(3)) # use a different rng since not thread-safe! else: - logger.info("Reading in reads. pysam index creation may take some time.") - read_pool = ReadPoolFromFile(reads_file=reads_file) + logger.info("Creating slow5 read pool") + raise "slow5 currently unavailable" + # read_pool = Slow5ReadPool(reads_file, read_buffer=2*n_channels) # todo: remove read_buffer + + read_pool = ThreadedReadPoolWrapper(read_pool, queue_size=2*n_channels) return read_pool @contextmanager @@ -105,6 +125,35 @@ def wrap_simulator_in_grpc(simulator, use_grpc): # don't do anything yield simulator +""" +Auto-detect the reads file type + +Args: + reads_file (Path): path to the reads file or directory, possibly None + +Returns: + If reads file is None, "generate: + Else If reads file ends with fasta or is a directory containing fastas, "fasta" + Else If reads file ends with slow5/blow5 or is a directory containing slow5/blow5, "slow5" + Else raise ValueError +""" +def get_reads_file_type(reads_file: Path): + if reads_file is None: + return "generate" # generate reads + elif ReadPoolFromFile.can_handle(reads_file): + return "fasta" + # elif Slow5ReadPool.can_handle(reads_file): + # return "slow5" + else: + raise ValueError(f"Cannot determine reads file type for file '{reads_file}'") + + # if reads_file is None: + # return "generate" # generate reads + # elif reads_file.endswith(".fasta"): + # return "fasta" + # else: + # return "slow5" + def main(toml_file): """ Run ReadFish with the simulator @@ -132,9 +181,12 @@ def main(toml_file): realtime_run_duration = float(config["run_duration"]) / acceleration_factor reads_file = config.get("reads_file", None) + reads_len_range = config.get("reads_len_range", None) + if reads_len_range is not 
None: + assert isinstance(reads_len_range, list) ref_genome_path = config.get("ref_genome_path", None) sim_params_file = config.get("sim_params_file", None) - rotating = config.get("rotating", True) + rotating_writeout = config.get("rotating_writeout", True) use_grpc = config.get("use_grpc", False) if "mux_scan_period" in config: mux_scan_period = float(config["mux_scan_period"]) @@ -147,55 +199,75 @@ def main(toml_file): # readfish params readfish_config_file = Path(config["readfish_config_file"]) if "readfish_config_file" in config else None readfish_method = config.get("readfish_method", "targeted_seq") - assert readfish_method in ["unblock_all", "targeted_seq"] + assert readfish_method in ["control", "unblock_all", "targeted_seq"] logger.info(f"Running ReadFish with method '{readfish_method}'") #################################################### #################### SET UP RUN #################### #################################################### - if (ref_genome_path is not None) and (reads_file is not None): - logger.info(f"You will reach average coverage {compute_nonselective_coverage(ref_genome_path, reads_file=reads_file)}") + reads_file_type = config.get("reads_file_type", get_reads_file_type(reads_file)) + logger.info(f"Using reads file type '{reads_file_type}'") + if (ref_genome_path is not None) and reads_file_type == "fasta": + logger.info(f"(Without selseq,) You will reach average coverage {compute_nonselective_coverage(ref_genome_path, reads_file=reads_file)}") if run_dir.exists(): logger.warning(f"Run dir '{run_dir}' already exists") run_dir.mkdir(exist_ok=True) - reads_writer = get_reads_writer(run_dir, rotating=rotating) - read_pool = get_read_pool(reads_file, ref_genome_path=ref_genome_path) sim_params = get_sim_params(sim_params_file=sim_params_file, n_channels=int(config["n_channels"])) + reads_writer = get_reads_writer(run_dir, rotating_writeout=rotating_writeout) + read_pool = get_read_pool(reads_file_type=reads_file_type, 
reads_file=reads_file, ref_genome_path=ref_genome_path, n_channels=sim_params.n_channels, reads_len_range=reads_len_range) + # if isinstance(read_pool, Slow5ReadPool): + # logger.info("Detected slow5 reads, so adapting chunk size and bp_per_second") + # sim_params.bp_per_second = int(sim_params.bp_per_second * 4000/450) # todo: rename bp_per_second + # sim_params.min_chunk_size = int(sim_params.min_chunk_size * 4000/450) # When the simulation is accelerated, we decrease the batch size so that ReadFish can send out actions faster. # Higher acceleration factor means that reads will finish faster. # The throttle is decreased because there are less reads per batch and the acceleration is going faster. original_batch_size = int(config.get("readfish_batch_size", sim_params.n_channels)) - readfish_batch_size = max(1, round(original_batch_size / acceleration_factor)) - # readfish_batch_size = original_batch_size + # readfish_batch_size = max(1, round(original_batch_size / acceleration_factor)) + readfish_batch_size = original_batch_size readfish_throttle = float(config.get("readfish_throttle", 0.1)) * (readfish_batch_size / original_batch_size) / acceleration_factor logger.info(f"Running ReadFish with batch size {readfish_batch_size} and throttle {readfish_throttle}s") - simulator = ONTSimulator( + simulator : ONTSimulator = ONTSimulator( read_pool=read_pool, reads_writer=reads_writer, sim_params=sim_params, + output_dir=run_dir, ) def start_sim(): # start sim and mux scan thread logger.info(f"Starting the simulation with {simulator.n_channels} channels") simulator.start(acceleration_factor=acceleration_factor) + logger.info(f"Started the simulation") if mux_scan_period is not None: mux_scan_thread = run_periodic_mux_scan_thread(simulator, period=mux_scan_period, scan_duration=mux_scan_duration, acceleration_factor=acceleration_factor) mux_scan_thread.start() else: mux_scan_thread = None return mux_scan_thread - - try: - with wrap_simulator_in_grpc(simulator, 
use_grpc=use_grpc): - if readfish_method == "unblock_all": + + orig_simulator = simulator + with wrap_simulator_in_grpc(orig_simulator, use_grpc=use_grpc) as simulator: + # use a thread to stop it because simulator.stop() may never be able to acquire the mutex in the signal handler because the mutex may still be held by simulator.forward() or similar when the signal handler is running + with set_signal_handler(signal_type=signal.SIGINT, handler=lambda *args, **kwargs: stop_simulation_after_time_thread(simulator, t=0).start()): + if readfish_method == "control": + mux_scan_thread = start_sim() + + stop_thread = stop_simulation_after_time_thread(simulator, t=realtime_run_duration) + stop_thread.start() + + elif readfish_method == "unblock_all": mux_scan_thread = start_sim() - unblock_all(ReadUntilClientWrapper(simulator), duration=realtime_run_duration, batch_size=readfish_batch_size, throttle=readfish_throttle, unblock_duration=0.1) + stop_thread = stop_simulation_after_time_thread(simulator, t=realtime_run_duration) + stop_thread.start() + + unblock_all(ReadUntilClientWrapper(simulator), duration=realtime_run_duration, batch_size=readfish_batch_size, throttle=readfish_throttle, unblock_duration=0.1) + elif readfish_method == "targeted_seq": chunk_logger = get_chunk_logger(run_dir / "chunk_log.txt") paf_logger = get_paf_logger(run_dir / "mapping.paf") @@ -218,36 +290,37 @@ def start_sim(): stop_thread = stop_simulation_after_time_thread(simulator, t=realtime_run_duration) stop_thread.start() + targeted_seq(ReadUntilClientWrapper(simulator), batch_size=readfish_batch_size, throttle=readfish_throttle, unblock_duration=0.1, chunk_logger=chunk_logger, paf_logger=paf_logger, live_toml_path=live_toml, flowcell_size=simulator.n_channels, dry_run=False, run_info=run_info, conditions=conditions, mapper=mapper, caller=dummy_caller) - stop_thread.raise_if_error() + stop_thread.raise_if_error() + while simulator.is_running: + # stop call() returns False early, when another stop 
call is already running + time.sleep(0.5) - if mux_scan_thread is not None: - mux_scan_thread.raise_if_error() - - except KeyboardInterrupt: - pass - finally: - simulator.stop() - logger.info("Stopped simulation") + if mux_scan_thread is not None: + mux_scan_thread.raise_if_error() + simulator = orig_simulator # restore + # go outside grpc connection + write_simulator_stats([simulator], output_dir=simulator.mk_run_dir) - simulator_stats_to_disk([simulator], output_dir=run_dir) + read_pool.finish() seqsum_filename = None if isinstance(reads_writer, ArrayReadsWriter): - reads_writer.reads + # reads_writer.reads logger.info(reads_writer.extended_repr()) else: - assert isinstance(reads_writer, (SingleFileReadsWriter, RotatingFileReadsWriter)) - logger.info(reads_writer) + # logger.info(reads_writer) + # todo: seqsummary file written on the fly, remove write_seqsum_file arg, if write_seqsum_file: seqsum_filename = run_dir / "sequencing_summary.txt" logger.info(f"Writing sequencing summary file '{seqsum_filename}'") - convert_simfasta_dir_to_seqsum(reads_writer.output_dir, seqsummary_filename=seqsum_filename) + convert_simfasta_dir_to_seqsum(run_dir / "reads", seqsummary_filename=seqsum_filename) logger.info("Wrote sequencing summary file") logger.info("Done with simulation") diff --git a/src/simreaduntil/usecase_helpers/utils.py b/src/simreaduntil/usecase_helpers/utils.py index 70a40f8..9c712c4 100644 --- a/src/simreaduntil/usecase_helpers/utils.py +++ b/src/simreaduntil/usecase_helpers/utils.py @@ -39,7 +39,7 @@ from simreaduntil.simulator.simulator import get_simulator_delay_over_time_df, plot_sim_actions, plot_simulator_delay_over_time from simreaduntil.simulator.simulator_params import SimParams from simreaduntil.simulator.utils import set_package_log_level -from simreaduntil.usecase_helpers.readfish_plotting import get_extra_basecall_delay_over_time_df, get_processing_time_per_read_over_time_df, get_throttle_over_time_df, plot_extra_basecalling_delay_per_iter, 
plot_readfish_processing_time, plot_throttle_over_time +from simreaduntil.usecase_helpers.readfish_plotting import get_chunk_mapping_time_over_time_df, get_chunk_wait_time_over_time_df, get_extra_basecall_delay_over_time_df, get_processing_time_per_read_over_time_df, get_throttle_over_time_df, plot_chunk_mapping_time, plot_chunk_waiting_time, plot_extra_basecalling_delay_per_iter, plot_readfish_processing_time, plot_throttle_over_time logger = setup_logger_simple(__name__) """module logger""" @@ -74,7 +74,7 @@ def random_nanosim_reads_gen(random_state=np.random.default_rng(2), length_range # to load the FASTA file when the function is called rather than when the first read is requested (which may delay the simulation if an index has to be built first) @force_eval_generator_function -def perfect_reads_gen(fasta_filename: Path, read_lens_range, random_state=np.random.default_rng(1), nanosim_read_id=True): +def perfect_reads_gen(fasta_filename: Path, read_lens_range: tuple[int], random_state=np.random.default_rng(1), nanosim_read_id=True): """ Generate perfect reads that align to the reference genome @@ -367,7 +367,7 @@ def create_simparams_if_inexistent(sim_params_filename, seqsum_param_extr_file, random_state = np.random.default_rng(1) # todo2: one random state, also in on_sim sim_params = SimParams( gap_samplers={f"ch{i+1}": gap_sampler_maker(random_state=random_state)[1] for i in range(n_channels)}, - bp_per_second=compute_median_pore_speed(seqsum_df), chunk_size=200, default_unblock_duration=0.1, seed=0, + bp_per_second=compute_median_pore_speed(seqsum_df), min_chunk_size=200, default_unblock_duration=0.1, seed=0, ) logger.debug(f"Saving sim_params to file '{sim_params_filename}'") @@ -454,6 +454,8 @@ def create_figures(seqsum, run_dir, figure_dir=None, delete_existing_figure_dir= def plot_log_file_metrics(log_filename, save_dir=None): """Parse log file and plot metrics""" + logger.info(f"Plotting metrics from log file '{log_filename}' and saving to 
'{save_dir}'") + df = get_simulator_delay_over_time_df(log_filename) fig = plot_simulator_delay_over_time(df, save_dir=save_dir); logger.debug("Created 1 plot"); plt.close(fig) @@ -466,8 +468,14 @@ def plot_log_file_metrics(log_filename, save_dir=None): basecall_delay_df = get_extra_basecall_delay_over_time_df(log_filename) fig = plot_extra_basecalling_delay_per_iter(basecall_delay_df, save_dir=save_dir); logger.debug("Created 1 plot"); plt.close(fig) + chunk_waiting_time_df = get_chunk_wait_time_over_time_df(log_filename) + fig = plot_chunk_waiting_time(chunk_waiting_time_df, save_dir=save_dir); logger.debug("Created 1 plot"); plt.close(fig) + + chunk_mapping_time_df = get_chunk_mapping_time_over_time_df(log_filename) + fig = plot_chunk_mapping_time(chunk_mapping_time_df, save_dir=save_dir); logger.debug("Created 1 plot"); plt.close(fig) + def extract_errfile_from_condor_jobad(jobad_filename): - """Extract the file where stderr is redriected to from a condor jobad file""" + """Extract the file where stderr is redirected to from a condor jobad file""" # find first match of "Err = " with open(jobad_filename, "r") as f: for line in f: @@ -475,16 +483,16 @@ def extract_errfile_from_condor_jobad(jobad_filename): return line[6:].strip()[1:-1] # temporary hack to get rid of quotes; could use htcondor's pythonbindings instead: python package classad return None -def plot_condor_log_file_metrics(save_dir=None): - """Parse log filename from condor job add, then plot metrics from log""" - jobad_filename = os.environ.get("_CONDOR_JOB_AD", None) - if jobad_filename is not None: - # parse log filename from condor job ad, then process it - log_filename = extract_errfile_from_condor_jobad(jobad_filename) - if log_filename is not None: - logger.info(f"Plotting metrics from condor log file '{log_filename}'") - plot_log_file_metrics(log_filename, save_dir=save_dir) - else: - logger.warning(f"Did not find log file in condor job ad: {jobad_filename}") - else: - logger.warning("Did not 
find condor job ad environment variable '_CONDOR_JOB_AD', cannot plot metrics") \ No newline at end of file +# def plot_condor_log_file_metrics(save_dir=None): +# """Parse log filename from condor job add, then plot metrics from log""" +# jobad_filename = os.environ.get("_CONDOR_JOB_AD", None) +# if jobad_filename is not None: +# # parse log filename from condor job ad, then process it +# log_filename = extract_errfile_from_condor_jobad(jobad_filename) +# if log_filename is not None: +# logger.info(f"Plotting metrics from condor log file '{log_filename}'") +# plot_log_file_metrics(log_filename, save_dir=save_dir) +# else: +# logger.warning(f"Did not find log file in condor job ad: {jobad_filename}") +# else: +# logger.warning("Did not find condor job ad environment variable '_CONDOR_JOB_AD', cannot plot metrics") \ No newline at end of file diff --git a/tests/shared_utils/test_nanosim_parsing.py b/tests/shared_utils/test_nanosim_parsing.py index 9298940..5dae6ee 100644 --- a/tests/shared_utils/test_nanosim_parsing.py +++ b/tests/shared_utils/test_nanosim_parsing.py @@ -11,10 +11,10 @@ def test_parsing(): nanosim_id = NanoSimId.from_str(nanosim_id_str) assert nanosim_id.chrom == "Human-chr11-NC-000011" assert nanosim_id.ref_pos == 76599 - assert nanosim_id.ref_len == 9967 - assert nanosim_id.direction == "F" assert nanosim_id.read_nb == "proc0:0" + assert nanosim_id.direction == "F" assert nanosim_id.head_len == 0 + assert nanosim_id.ref_len == 9967 assert nanosim_id.tail_len == 0 assert nanosim_id.read_type == "aligned" assert str(nanosim_id) == nanosim_id_str @@ -34,6 +34,17 @@ def test_parsing(): # reverse strand (R): 76599 + 9967 - 1001 = 85565 assert str(NanoSimId.from_str("Human-chr11-NC-000011_76599_aligned_proc0:0_R_0_9967_0").change_ref_len(1001)) == "Human-chr11-NC-000011_85565_aligned_proc0:0m_R_0_1001_0" # adds m to read_nb +def test_parsing_unaligned(): + nanosim_id = NanoSimId.from_str("genome1-chr-6_236227_unaligned_proc5:16_R_0_16119_0") + assert 
nanosim_id.chrom == "genome1-chr-6" + assert nanosim_id.ref_pos == 236227 + assert nanosim_id.read_type == "unaligned" + assert nanosim_id.read_nb == "proc5:16" + assert nanosim_id.direction == "R" + assert nanosim_id.head_len == 0 + assert nanosim_id.ref_len == 16119 + assert nanosim_id.tail_len == 0 + def test_normalize_seq_name(): assert normalize_seq_name("chr1 extra_info more-info hello") == "chr1-extra-info-more-info-hello" assert normalize_seq_name("chr1.aa extra_info more-info hello") == "chr1" diff --git a/tests/shared_utils/test_timing.py b/tests/shared_utils/test_timing.py index bcf7a68..0935857 100644 --- a/tests/shared_utils/test_timing.py +++ b/tests/shared_utils/test_timing.py @@ -48,7 +48,7 @@ def test_step_and_total_elapsed_timer(): assert in_interval(time_since_last_call, (0.4, 0.42)) assert in_interval(time_since_last_reset, (0.4, 0.42)) -def test_cur_time_ns(): +def test_cur_ns_time(): # less than 1 microsecond deviation # print(cur_ns_time()) time_difference = time.time_ns()/1_000_000_000 - cur_ns_time() diff --git a/tests/shared_utils/test_utils.py b/tests/shared_utils/test_utils.py index ae73bd7..cb3697d 100644 --- a/tests/shared_utils/test_utils.py +++ b/tests/shared_utils/test_utils.py @@ -1,7 +1,11 @@ +import signal import subprocess +import sys +import threading import time import pytest -from simreaduntil.shared_utils.utils import dill_dump, dill_load, force_eval_generator, force_eval_generator_function, get_file_content, is_sorted, num_lines_in_file, print_cmd_and_run, subset_dict, tqdm_with_name, get_some_value_from_dict +from simreaduntil.shared_utils.utils import MutableValue, StoppableQueue, dill_dump, dill_load, force_eval_generator, force_eval_generator_function, get_file_content, is_sorted, num_lines_in_file, print_cmd_and_run, record_gen_fcn_waiting_time, set_signal_handler, subset_dict, tee_stdouterr_to_file, tqdm_with_name, get_some_value_from_dict +from simreaduntil.shared_utils.utils import record_gen_waiting_time def 
test_is_sorted(): import numpy as np @@ -104,3 +108,138 @@ def compute_data(): yield i for i in tqdm_with_name((i, i) for i in compute_data()): time.sleep(0.1) + +def test_set_signal_handler(): + print_method = lambda *args, **kwargs: None + # print_method = print + + sleep_time = 0.05 + class StoppableGen: + def __init__(self): + self.stop = False + self.state = 0 + self.invalid_state = False # to signal when state is invalid, to ensure we do not exit anywhere in the code + + def gen_numbers(self, max_val): + while self.state < max_val: + self.invalid_state = False + if self.stop: + print_method("Checked stop condition, stopping") + break + self.invalid_state = True + yield self.state + prev_state = self.state + print_method("Sleep") + time.sleep(sleep_time) + print_method("after sleep") + self.state = prev_state + 1 + self.invalid_state = False + + def on_keyboard_interrupt(signum, frame): + print_method("Keyboard interrupt, setting stop") + gen.stop = True + + def raise_sigint(): + time.sleep(5*sleep_time) + signal.raise_signal(signal.SIGINT) + threading.Thread(target=raise_sigint).start() + + gen = StoppableGen() + with set_signal_handler(signal.SIGINT, on_keyboard_interrupt): + for x in gen.gen_numbers(100): + print_method(x) + assert not gen.invalid_state + print_method("Continuing") + gen.stop = False + print_method(list(gen.gen_numbers(gen.state + 5))) + assert not gen.invalid_state + +def test_tee_stdouterr_to_file(tmp_path): + with tee_stdouterr_to_file(tmp_path / "teeing", mode="w"): + for i in range(5): + print(f"out{i}") + print(f"err{i}", file=sys.stderr) + # time.sleep(0.1) + assert (tmp_path / "teeing.out").read_text() == "".join(f"out{i}\n" for i in range(5)) + assert (tmp_path / "teeing.err").read_text() == "".join(f"err{i}\n" for i in range(5)) + +def test_record_gen_waiting_time(): + def produce_data(): + for i in range(5): + time.sleep(0.1) + yield i + time.sleep(0.5) + + wait_time = MutableValue() + for x in 
record_gen_waiting_time(produce_data(), wait_time): + print(x) + if x >= 3: + # break early to check this is handled as well when the generator is not exhausted + break + # print(wait_time.value) + assert 0.4 - 0.05 <= wait_time.value <= 0.4 + 0.05 + +def test_record_gen_fcn_waiting_time(): + def produce_data(): + for i in range(5): + time.sleep(0.1) + yield i + time.sleep(0.5) + + def transform_data(gen): + for x in gen: + time.sleep(0.2) + yield x + + transform_time = MutableValue() + for x in record_gen_fcn_waiting_time(transform_data, produce_data(), transform_time): + if x >= 3: + # break early to check this is handled as well when the generator is not exhausted + break + assert 0.2*4-0.05 <= transform_time.value <= 0.2*4+0.05 + + # as opposed to measuring the total time + wait_time = MutableValue() + for x in record_gen_waiting_time(transform_data(produce_data()), wait_time): + print(x) + if x >= 3: + # break early to check this is handled as well when the generator is not exhausted + break + # print(wait_time.value) + assert (0.2+0.1)*4-0.05 <= wait_time.value <= (0.2+0.1)*4+0.05 + +def put_item_onto_queue(queue, i, when): + # print(f"Putting {i} onto queue at {when}") + def f(): + queue.put(i) + print(f"Put item '{i}' onto queue after {when}s") + threading.Timer(when, f).start() + +def test_StoppableQueue_NonBlockingGet(): + queue = StoppableQueue(2) + queue.put(1) + + # non-blocking get + assert queue.get(block=False) == 1 + put_item_onto_queue(queue, 2, 0.1) + # item not yet on queue + with pytest.raises(queue.Empty): + queue.get(block=False) + # block until item on queue + assert queue.get() == 2 + +def test_StoppableQueue_FullQueueStopped(): + queue = StoppableQueue(2) + queue.put(3) + queue.put(4) + # stop queue, so next get() will raise exception + queue.stop() + with pytest.raises(StoppableQueue.QueueStoppedException): + queue.get() + +def test_StoppableQueue_EmptyQueueStopped(): + queue = StoppableQueue(2) + # empty queue that is stopped after some 
time + threading.Timer(0.1, queue.stop).start() + with pytest.raises(StoppableQueue.QueueStoppedException): + queue.get() \ No newline at end of file diff --git a/tests/simulator/test_channel.py b/tests/simulator/test_channel.py index 8edcd8b..cdd2641 100644 --- a/tests/simulator/test_channel.py +++ b/tests/simulator/test_channel.py @@ -21,7 +21,7 @@ def sim_params() -> SimParams: return SimParams( gap_samplers={"channel1": ConstantGapsUntilBlocked(short_gap_length=0.4, long_gap_length=10.1, prob_long_gap=0, time_until_blocked=np.inf, read_delay=0)}, - bp_per_second=10, chunk_size=4, default_unblock_duration=1.4, seed=0, + bp_per_second=10, min_chunk_size=4, default_unblock_duration=1.4, seed=0, ) @pytest.fixture() @@ -88,15 +88,16 @@ def test_channel_stats(sim_params, channel_stats: ChannelStats): chan.start(t_start) chan.forward(t_start + 1.1 + eps) + # initial read gap of 0.4 channel_stats.reads.start_and_add_time(0.7 + eps, nb_new_bps=7) assert chan.stats == channel_stats - chan.get_new_chunks() - channel_stats.reads.number_bps_requested = 4 + chan.get_new_samples() + channel_stats.reads.number_bps_requested = 7 assert chan.stats == channel_stats chan.stop_receiving() - chan.get_new_chunks() # no chunks + chan.get_new_samples() # no chunks channel_stats.reads.cur_number_stop_receiving = 1 assert chan.stats == channel_stats @@ -104,10 +105,16 @@ def test_channel_stats(sim_params, channel_stats: ChannelStats): channel_stats.reads.cur_number_stop_receiving = 1 assert chan.stats == channel_stats + chan.forward(t_start + 1.3 + eps) + chan.get_new_samples() # no chunks since less than min chunk size + channel_stats.reads.add_time(0.2, nb_new_bps=2) + assert chan.stats == channel_stats + chan.forward(t_start + 1.7 + eps) - channel_stats.reads.add_time(0.6, nb_new_bps=6) + channel_stats.reads.add_time(0.4, nb_new_bps=4) assert chan.stats == channel_stats + # read ends and gap starts chan.forward(t_start + 1.9 + eps) channel_stats.reads.add_time_and_finish(0.1, 
nb_new_bps=1, stopped_receiving=True) channel_stats.short_gaps.start_and_add_time(0.1) @@ -176,13 +183,13 @@ def test_stopreceiving_then_unblock(sim_params, channel_stats: ChannelStats): chan.start(t_start) chan.forward(t_start + 1.0 + eps) - chan.get_new_chunks() + chan.get_new_samples() assert chan.stop_receiving() == StoppedReceivingResponse.STOPPED_RECEIVING assert chan.stop_receiving() == StoppedReceivingResponse.ALREADY_STOPPED_RECEIVING channel_stats.reads.start_and_add_time(0.6 + eps, nb_new_bps=6) channel_stats.reads.cur_number_stop_receiving += 1 - channel_stats.reads.number_bps_requested += 4 + channel_stats.reads.number_bps_requested += 6 assert chan.stats == channel_stats assert chan.unblock(0.3) @@ -237,7 +244,7 @@ def test_channel_restart(sim_params, channel_stats: ChannelStats): assert channel_stats.n_channels_running == 0 assert chan.stats == channel_stats -def test_get_new_chunks(sim_params): +def test_get_new_samples(sim_params): # reads of length 14, 10, 10, 10 read_pool = ReadPoolFromIterable(gen_from_list((("read1", "AAAAGGGGCCCCTT"), ("read2", "TTTTAAAACC"), ("read3", "TTTTAAAACC"), ("read4", "TTTTAAAACC")))) reads_writer = ArrayReadsWriter() @@ -247,13 +254,13 @@ def test_get_new_chunks(sim_params): chan.start(-0.4) chan.forward(0.3 + eps) assert [x[:2] for x in reads_writer.reads] == [] - assert chan.get_new_chunks()[:2] == ("", "read1") + assert chan.get_new_samples()[:2] == ("", "read1") chan.forward(0.4 + eps) - assert chan.get_new_chunks()[:2] == ("AAAA", "read1") - assert chan.get_new_chunks()[:2] == ("", "read1") # already returned chunks + assert chan.get_new_samples()[:2] == ("AAAA", "read1") + assert chan.get_new_samples()[:2] == ("", "read1") # already returned chunks assert [x[:2] for x in reads_writer.reads] == [] chan.forward(1.3 + eps) - assert chan.get_new_chunks()[:2] == ("GGGGCCCC", "read1") + assert chan.get_new_samples()[:2] == ("GGGGCCCCT", "read1") assert [x[:2] for x in reads_writer.reads] == [] chan.forward(1.4 + eps) 
assert [x[:2] for x in reads_writer.reads] == [("read1", "AAAAGGGGCCCCTT")] @@ -262,13 +269,14 @@ def test_get_new_chunks(sim_params): chan.forward(0.4 + 2.4 + eps) # 14+10 assert [x[:2] for x in reads_writer.reads] == [("read1", "AAAAGGGGCCCCTT"), ('read2', 'TTTTAAAACC')] - assert chan.get_new_chunks()[:2] == ("", None) + assert chan.get_new_samples()[:2] == ("", None) chan.forward(2*0.4 + 3.1 + eps) # 14+10+7 - assert chan.get_new_chunks()[:2] == ("TTTT", "read3") + assert chan.get_new_samples()[:2] == ("TTTTAAA", "read3") chan.stop() - chan.get_new_chunks() # no exception + chan.get_new_samples() # no exception + # todo: remove # ax = plot_channels([chan], time_interval=[0.9, 6.2], figsize=(6, 2)) # ax.figure.show() @@ -279,16 +287,18 @@ def test_read_stop_receiving(sim_params): chan.start(2-0.4) chan.forward(2.6 + eps) - assert chan.get_new_chunks()[:2] == ("AAAA", "read1") + assert chan.get_new_samples()[:2] == ("AAAAGG", "read1") assert chan.stop_receiving() == StoppedReceivingResponse.STOPPED_RECEIVING assert chan.stop_receiving() == StoppedReceivingResponse.ALREADY_STOPPED_RECEIVING - assert chan.get_new_chunks()[:2] == ("", "read1") + assert chan.get_new_samples()[:2] == ("", "read1") + chan.forward(2.9 + eps) + assert chan.get_new_samples()[:2] == ("", "read1") assert chan.stop_receiving(read_id="inexistent") == StoppedReceivingResponse.MISSED chan.forward(3.5 + eps) assert chan.stop_receiving() == StoppedReceivingResponse.MISSED - assert chan.get_new_chunks()[:2] == ("", None), "in a gap" + assert chan.get_new_samples()[:2] == ("", None), "in a gap" def test_mux_scan(sim_params, channel_stats): read_pool = ReadPoolFromIterable(gen_from_list((("read1", "AAAAGGGGCCCCTT"), ("read2", "TTTTAAAACC")))) @@ -338,7 +348,7 @@ def test_poreblockage_continues_after_mux_scan(): sim_params = SimParams( gap_samplers={"channel1": ConstantGapsUntilBlocked(short_gap_length=0.4, long_gap_length=10.4, prob_long_gap=0.9, time_until_blocked=np.inf, read_delay=0)}, - 
bp_per_second=10, chunk_size=4, default_unblock_duration=1.4, seed=0, + bp_per_second=10, min_chunk_size=4, default_unblock_duration=1.4, seed=0, ) read_pool = ReadPoolFromIterable(gen_from_list((("read1", "AAAAGGGGCCCCTT"), ("read2", "TTTTAAAACC")))) reads_writer = ArrayReadsWriter() @@ -386,7 +396,7 @@ def test_plotting(sim_params): # print(chan.cur_elem) pass else: - chunks = chan.get_new_chunks()[0] + chunks = chan.get_new_samples()[0] # if len(chunks) > 0: # print(f"{delta_t}: {chunks}") channels.append(chan) @@ -408,7 +418,7 @@ def test_channel_normal_operation(sim_params): chan.start(t_start) for t in (t_start + 1e-8 + np.arange(0, 3, 0.05)): chan.forward(t) - chunks = chan.get_new_chunks()[0] + chunks = chan.get_new_samples()[0] if len(chunks) > 0: print(f"{t}: {chunks}")#, end=", ") @@ -451,7 +461,7 @@ def perform_random_channel_ops(chan: Channel, random_state: np.random.Generator, chan.forward(t_duration + 0.01, delta=True) if random_state.uniform() < 0.2: - chan.get_new_chunks() + chan.get_new_samples() # finish simulation nb_actions["sim_stopped_unblock"] += int(isinstance(chan.cur_elem, ChunkedRead)) @@ -495,7 +505,7 @@ def test_random_operations(sim_params, channel_write_zero_length_reads): # make all elements roughly the same length, reads have length 8-17 -> take about 1.2s sim_params = SimParams( gap_samplers={"channel1": ConstantGapsUntilBlocked(short_gap_length=1.2, long_gap_length=1.2, prob_long_gap=0.15, time_until_blocked=np.inf, read_delay=0)}, - bp_per_second=10, chunk_size=4, default_unblock_duration=1.2, seed=0, + bp_per_second=10, min_chunk_size=4, default_unblock_duration=1.2, seed=0, ) plotted_once = False diff --git a/tests/simulator/test_channel_element.py b/tests/simulator/test_channel_element.py index 7359904..44bef28 100644 --- a/tests/simulator/test_channel_element.py +++ b/tests/simulator/test_channel_element.py @@ -7,26 +7,22 @@ from simreaduntil.simulator.channel_element import ShortGap, ChunkedRead as _ChunkedRead, 
NoReadLeftGap, LongGap, ReadDescriptionParser, ReadEndReason, end_reason_to_ont_map -ChunkedRead = lambda *args, **kwargs: _ChunkedRead(*args, **kwargs, read_speed=10, chunk_size=4) +ChunkedRead = lambda *args, **kwargs: _ChunkedRead(*args, **kwargs, read_speed=10, min_chunk_size=4) eps = 1e-8 # small delay (to avoid issues with rounding errors when geting chunks up to time <= t) def test_readended_map(): assert end_reason_to_ont_map[ReadEndReason.UNBLOCKED.value] == "data_service_unblock_mux_change" -def test_nb_basepairs(): +def test_actual_seq_length(): # test basic functions chunked_read = ChunkedRead("read1", "111122223333444455", 10.1) assert chunked_read._nanosim_read_id is None - assert chunked_read._nb_chunks == 5 - assert chunked_read._chunk_end_positions == [4, 8, 12, 16, 18] - assert chunked_read._get_chunks(2, 4) == "33334444" - assert chunked_read._get_chunks(4, 10) == "55" - assert chunked_read.nb_basepairs(10.1+eps) == 0 - assert chunked_read.nb_basepairs(10.18+eps) == 0 - assert chunked_read.nb_basepairs(10.2+eps) == 1 - assert chunked_read.nb_basepairs(10.35+eps) == 2 - assert chunked_read.nb_basepairs(13) == len(chunked_read._full_seq) + assert chunked_read.actual_seq_length(10.1+eps) == 0 + assert chunked_read.actual_seq_length(10.18+eps) == 0 + assert chunked_read.actual_seq_length(10.2+eps) == 1 + assert chunked_read.actual_seq_length(10.35+eps) == 2 + assert chunked_read.actual_seq_length(13) == len(chunked_read._full_seq) assert chunked_read.t_end == approx(10.1 + 1.8) assert chunked_read.full_duration() == approx(1.8) @@ -37,8 +33,8 @@ def test_nb_basepairs(): chunked_read.finish(10.9+eps, end_reason=ReadEndReason.UNBLOCKED) assert chunked_read.t_end == 10.9+eps - assert chunked_read.nb_basepairs(10.9+eps) == 8 - assert chunked_read.nb_basepairs_full() == 18 + assert chunked_read.actual_seq_length(10.9+eps) == 8 + assert chunked_read.full_seq_length() == 18 nanosim_id = "chr11_77_aligned_proc0:0_F_0_36_0" chunked_read = 
ChunkedRead(nanosim_id, "111122223333444455", 10.1) @@ -48,68 +44,101 @@ def test_nb_basepairs(): # issue previously due to floating point chunked_read = ChunkedRead("read1", "111122223", 28.4) t_end = 29.299999999999997 - chunked_read.has_finished_by(t_end) - assert chunked_read.nb_basepairs(t_end) == 9 + assert chunked_read.has_finished_by(t_end) + assert chunked_read.actual_seq_length(t_end) == 9 -def test_chunks(): - # check get_new_chunks - chunked_read = ChunkedRead("read1", "111122223333444455", 10.1) - assert not chunked_read.all_chunks_consumed() +def test_get_samples(): + # check get_new_samples + chunked_read = ChunkedRead("read1", "111112222222222333333", 10.1) + assert not chunked_read.all_samples_consumed() # 1 bp emitted every 0.1 seconds, add small tolerance if it is just on the edge - assert chunked_read.get_new_chunks(8.9) == ("", "read1", 0) - assert chunked_read.nb_basepairs_returned() == 0 - assert chunked_read.get_new_chunks(10.1+0.3+eps) == ("", "read1", 0) - assert chunked_read.get_new_chunks(10.1+0.4+eps) == ("1111", "read1", 4) - assert chunked_read.nb_basepairs_returned() == 4 - assert chunked_read.get_new_chunks(10.1+0.4+eps) == ("", "read1", 4), "no new chunks since last time" - assert not chunked_read.all_chunks_consumed() - assert chunked_read.get_new_chunks(10.1+0.4+eps) == ("", "read1", 4), "no new chunks since last time" - assert chunked_read.get_new_chunks(10.1+0.4+1+eps) == ("22223333", "read1", 12) - assert chunked_read.nb_basepairs_returned() == 12 - assert chunked_read.get_new_chunks(10.1+0.4+1.+0.7+eps) == ("444455", "read1", 18) - assert chunked_read.get_new_chunks(130.2) == ("", "read1", 18) - assert chunked_read.nb_basepairs_returned() == 18 - assert chunked_read.all_chunks_consumed() + assert chunked_read.get_new_samples(8.9) == ("", "read1", 0) + assert chunked_read.num_samples_returned() == 0 + assert chunked_read.get_new_samples(10.1+0.3+eps) == ("", "read1", 0) + assert chunked_read.num_samples_returned() == 0 + 
assert chunked_read.get_new_samples(10.1+0.5+eps) == ("11111", "read1", 5) + assert chunked_read.num_samples_returned() == 5 + assert chunked_read.get_new_samples(10.1+0.5+eps) == ("", "read1", 5), "no new chunks since last time" + assert not chunked_read.all_samples_consumed() + assert chunked_read.get_new_samples(10.1+0.5+eps) == ("", "read1", 5), "no new chunks since last time" + assert chunked_read.get_new_samples(10.1+0.5+1+eps) == ("2222222222", "read1", 15) + assert chunked_read.num_samples_returned() == 15 + assert chunked_read.get_new_samples(10.1+0.5+1.+0.7+eps) == ("333333", "read1", 21) + assert chunked_read.get_new_samples(130.2) == ("", "read1", 21) + assert chunked_read.num_samples_returned() == 21 + assert chunked_read.all_samples_consumed() # test with NanoSim-like read, ref_len = 36, start position = 77 # 36/18 = 2 nanosim_read_id = "chr11_77_aligned_proc0:0_F_0_36_0" chunked_read = ChunkedRead(nanosim_read_id, "111122223333444455", 10.1) - assert chunked_read.get_new_chunks(10.1+0.9+eps) == ("11112222", nanosim_read_id, 16) - assert chunked_read.get_new_chunks(10.1+3.9+eps) == ("3333444455", nanosim_read_id, 36) + assert chunked_read.get_new_samples(10.1+0.9+eps) == ("111122223", nanosim_read_id, 2 * 9) + assert chunked_read.get_new_samples(10.1+3.9+eps) == ("333444455", nanosim_read_id, 2 * 18) # test stop_receiving chunked_read = ChunkedRead("read1", "111122223333444455", 10.1) chunked_read.stop_receiving() - assert chunked_read.get_new_chunks(10.1+0.9+eps) == ("", "read1", 0) + assert chunked_read.get_new_samples(10.1+0.9+eps) == ("", "read1", 0) chunked_read = ChunkedRead("read1", "111122223333444455", 10.1) - assert chunked_read.get_new_chunks(10.1+0.9+eps) == ("11112222", "read1", 8) + assert chunked_read.get_new_samples(10.1+0.9+eps) == ("111122223", "read1", 9) chunked_read.stop_receiving() - assert chunked_read.get_new_chunks(10.1+1.4+eps) == ("", "read1", 8) + assert chunked_read.get_new_samples(10.1+1.4+eps) == ("", "read1", 9) + +def 
test__estimate_ref_len(): + head_len = 4 + tail_len = 6 + ref_len = 16 + # seq_len central part: 8 + + direction = "F" + nanosim_read_id = f"chr11_77_aligned_proc0:0_{direction}_{head_len}_{ref_len}_{tail_len}" + chunked_read = ChunkedRead(nanosim_read_id, "111122222222333333", 10.1) + estimated_ref_length = (7 - head_len) * (16/8) + assert chunked_read._estimate_ref_len(7) == estimated_ref_length + assert chunked_read.get_new_samples(10.1+0.7+eps) == ("1111222", nanosim_read_id, estimated_ref_length) + + # same with reverse + direction = "R" + nanosim_read_id = f"chr11_77_aligned_proc0:0_{direction}_{head_len}_{ref_len}_{tail_len}" + chunked_read = ChunkedRead(nanosim_read_id, "111111222222223333", 10.1) + estimated_ref_length = (7 - tail_len) * (16/8) + assert chunked_read._estimate_ref_len(7) == estimated_ref_length + assert chunked_read.get_new_samples(10.1+0.7+eps) == ("1111112", nanosim_read_id, estimated_ref_length) + + # test when in head or in tail + direction = "R" + nanosim_read_id = f"chr11_77_aligned_proc0:0_{direction}_{head_len}_{ref_len}_{tail_len}" + chunked_read = ChunkedRead(nanosim_read_id, "111111222222223333", 10.1) + assert chunked_read._estimate_ref_len(5) == 0 + assert chunked_read.get_new_samples(10.1+0.5+eps) == ("11111", nanosim_read_id, 0) + + direction = "R" + nanosim_read_id = f"chr11_77_aligned_proc0:0_{direction}_{head_len}_{ref_len}_{tail_len}" + chunked_read = ChunkedRead(nanosim_read_id, "111111222222223333", 10.1) + assert chunked_read._estimate_ref_len(15) == 16 # 15 >= 6 + 8 + assert chunked_read.get_new_samples(10.1+1.5+eps) == ("111111222222223", nanosim_read_id, 16) def test_chunks_with_delay(): # extra delay before actual read starts chunked_read = ChunkedRead("read1", "111122223333444455", 10.1, t_delay=2.1) - assert chunked_read._nb_chunks == 5 - assert chunked_read._chunk_end_positions == [4, 8, 12, 16, 18] - assert chunked_read.nb_basepairs(10.4) == 0 - assert chunked_read.nb_basepairs(10.1 + 2.15) == 0 - assert 
chunked_read.nb_basepairs(10.1 + 2.25) == 1 - assert chunked_read.nb_basepairs(10.1 + 3.25) == 11 + assert chunked_read.actual_seq_length(10.4) == 0 + assert chunked_read.actual_seq_length(10.1 + 2.15) == 0 + assert chunked_read.actual_seq_length(10.1 + 2.25) == 1 + assert chunked_read.actual_seq_length(10.1 + 3.25) == 11 assert chunked_read.t_end == approx(10.1 + 2.1 + 1.8) assert chunked_read.full_duration() == approx(1.8 + 2.1) - assert chunked_read.get_new_chunks(8.9) == ("", "read1", 0) - assert chunked_read.get_new_chunks(10.5) == ("", "read1", 0) - assert chunked_read.get_new_chunks(10.1 + 2.3) == ("", "read1", 0) - assert chunked_read.get_new_chunks(10.1+2.1+0.4+eps) == ("1111", "read1", 4) - assert chunked_read.get_new_chunks(10.1+5+eps) == ("22223333444455", "read1", 18) + assert chunked_read.get_new_samples(8.9) == ("", "read1", 0) + assert chunked_read.get_new_samples(10.5) == ("", "read1", 0) + assert chunked_read.get_new_samples(10.1 + 2.3) == ("", "read1", 0) + assert chunked_read.get_new_samples(10.1+2.1+0.4+eps) == ("1111", "read1", 4) + assert chunked_read.get_new_samples(10.1+5+eps) == ("22223333444455", "read1", 18) # test finish nanosim_read_id = "chr11_77_aligned_proc0:0_F_0_36_0" chunked_read = ChunkedRead(nanosim_read_id, "111122223333444455", 0.2, t_delay=2.1) - assert chunked_read.get_new_chunks(0.2+2.1+0.9+eps) == ("11112222", nanosim_read_id, 16) # 2 * 8 = 16 + assert chunked_read.get_new_samples(0.2+2.1+0.9+eps) == ("111122223", nanosim_read_id, 18) # 2 * 9 = 18 with pytest.raises(AssertionError, match="finish earlier"): chunked_read.finish(0.2+2.1+0.5+eps, end_reason=ReadEndReason.UNBLOCKED) @@ -126,7 +155,7 @@ def test_read_finish(): # get chunks, but no action chunked_read = ChunkedRead(nanosim_read_id, "111122223333444455", 0.2) - chunked_read.get_new_chunks(0.9) + chunked_read.get_new_samples(0.9) seq_record = chunked_read.finish() check_equal_seq_records(seq_record, SeqIO.SeqRecord(Seq("111122223333444455"), id=nanosim_read_id, 
description=f"full_seqlen=18 t_start=0.2 t_end={0.2+1.8} t_delay={0} ended=read_ended_normally tags= full_read_id={nanosim_read_id}")) assert chunked_read.end_reason == ReadEndReason.READ_ENDED_NORMALLY @@ -136,12 +165,12 @@ def test_read_finish(): # negative start time, read finished chunked_read = ChunkedRead(nanosim_read_id, "111122223333444455", -0.5) - chunked_read.get_new_chunks(0.9) + chunked_read.get_new_samples(0.9) chunked_read.finish(-0.5+1.9, end_reason=ReadEndReason.READ_ENDED_NORMALLY) # get chunks, then stop receiving chunked_read = ChunkedRead(nanosim_read_id, "111122223333444455", 0.2) - chunked_read.get_new_chunks(0.9) + chunked_read.get_new_samples(0.9) chunked_read.stop_receiving() seq_record = chunked_read.finish() check_equal_seq_records(seq_record, chunked_read.get_seq_record()) @@ -164,9 +193,9 @@ def test_read_finish(): # stopped receiving, rejected afterwards chunked_read = ChunkedRead(nanosim_read_id, "111122223333444455", 0.2) - chunked_read.get_new_chunks(0.9) + chunked_read.get_new_samples(0.9) chunked_read.stop_receiving() - chunked_read.get_new_chunks(1.1) + chunked_read.get_new_samples(1.1) seq_record = chunked_read.finish(1.2+eps, end_reason=ReadEndReason.UNBLOCKED) check_equal_seq_records(seq_record, chunked_read.get_seq_record()) check_equal_seq_records(seq_record, SeqIO.SeqRecord(Seq("1111222233"), id="chr11_77_aligned_proc0:0m_F_0_20_0", description=f"full_seqlen=18 t_start=0.2 t_end={1.2+eps} t_delay={0} ended=user_unblocked tags=stopped_receiving full_read_id={nanosim_read_id}")) # 10*2 = 20 @@ -174,14 +203,14 @@ def test_read_finish(): # sim stopped chunked_read = ChunkedRead(nanosim_read_id, "111122223333444455", 0.2) - chunked_read.get_new_chunks(0.9) + chunked_read.get_new_samples(0.9) seq_record = chunked_read.finish(1.2+eps, end_reason=ReadEndReason.SIM_STOPPED) check_equal_seq_records(seq_record, SeqIO.SeqRecord(Seq("1111222233"), id="chr11_77_aligned_proc0:0m_F_0_20_0", description=f"full_seqlen=18 t_start=0.2 
t_end={1.2+eps} t_delay={0} ended=sim_stopped_unblocked tags= full_read_id={nanosim_read_id}")) # 10*2 = 20 assert chunked_read.end_reason == ReadEndReason.SIM_STOPPED # mux scan started chunked_read = ChunkedRead(nanosim_read_id, "111122223333444455", 0.2) - chunked_read.get_new_chunks(0.9) + chunked_read.get_new_samples(0.9) seq_record = chunked_read.finish(1.2+eps, end_reason=ReadEndReason.MUX_SCAN_STARTED) check_equal_seq_records(seq_record, SeqIO.SeqRecord(Seq("1111222233"), id="chr11_77_aligned_proc0:0m_F_0_20_0", description=f"full_seqlen=18 t_start=0.2 t_end={1.2+eps} t_delay={0} ended=mux_scan_unblocked tags= full_read_id={nanosim_read_id}")) # 10*2 = 20 assert chunked_read.end_reason == ReadEndReason.MUX_SCAN_STARTED @@ -189,25 +218,25 @@ def test_read_finish(): # terminate early without end reason with pytest.raises(AssertionError, match="end reason"): chunked_read = ChunkedRead(nanosim_read_id, "111122223333444455", 0.2) - chunked_read.get_new_chunks(0.9) + chunked_read.get_new_samples(0.9) chunked_read.finish(1.1) # need to indicate end reason # cannot finish earlier than last chunk received with pytest.raises(AssertionError, match="finish earlier"): chunked_read = ChunkedRead(nanosim_read_id, "111122223333444455", 0.2) - chunked_read.get_new_chunks(0.9) + chunked_read.get_new_samples(0.9) chunked_read.finish(0.5, end_reason=ReadEndReason.UNBLOCKED) - # can finish at 0.6 since last chunk returned then + # can finish at 0.4 since no chunks returned yet chunked_read = ChunkedRead(nanosim_read_id, "111122223333444455", 0.2) - chunked_read.get_new_chunks(0.9) - seq_record = chunked_read.finish(0.6+eps, end_reason=ReadEndReason.UNBLOCKED) - check_equal_seq_records(seq_record, SeqIO.SeqRecord(Seq("1111"), id="chr11_77_aligned_proc0:0m_F_0_8_0", description=f"full_seqlen=18 t_start=0.2 t_end={0.6+eps} t_delay={0} ended=user_unblocked tags= full_read_id={nanosim_read_id}")) # 4*2 = 8 + chunked_read.get_new_samples(0.5) + seq_record = 
chunked_read.finish(0.4+eps, end_reason=ReadEndReason.UNBLOCKED) + check_equal_seq_records(seq_record, SeqIO.SeqRecord(Seq("11"), id="chr11_77_aligned_proc0:0m_F_0_4_0", description=f"full_seqlen=18 t_start=0.2 t_end={0.4+eps} t_delay={0} ended=user_unblocked tags=never_requested full_read_id={nanosim_read_id}")) # 2*2 = 4 assert chunked_read.end_reason == ReadEndReason.UNBLOCKED # unblock read chunked_read = ChunkedRead(nanosim_read_id, "111122223333444455", 0.2) - chunked_read.get_new_chunks(0.9) + chunked_read.get_new_samples(0.9) seq_record = chunked_read.finish(1.2+eps, end_reason=ReadEndReason.UNBLOCKED) check_equal_seq_records(seq_record, SeqIO.SeqRecord(Seq("1111222233"), id="chr11_77_aligned_proc0:0m_F_0_20_0", description=f"full_seqlen=18 t_start=0.2 t_end={1.2+eps} t_delay={0} ended=user_unblocked tags= full_read_id={nanosim_read_id}")) # 10*2 = 20 assert chunked_read.end_reason == ReadEndReason.UNBLOCKED @@ -216,7 +245,7 @@ def test_read_finish(): # starting at position 77 and ref length 36, 36/18=2 nanosim_read_id = "chr11_77_aligned_proc0:0_R_0_36_0" chunked_read = ChunkedRead(nanosim_read_id, "111122223333444455", 0.2) - chunked_read.get_new_chunks(0.9) + chunked_read.get_new_samples(0.9) seq_record = chunked_read.finish(1.2+eps, end_reason=ReadEndReason.UNBLOCKED) check_equal_seq_records(seq_record, SeqIO.SeqRecord(Seq("1111222233"), id="chr11_93_aligned_proc0:0m_R_0_20_0", description=f"full_seqlen=18 t_start=0.2 t_end={1.2+eps} t_delay={0} ended=user_unblocked tags= full_read_id={nanosim_read_id}")) # 77 + 36 - 10*2 = 93 assert chunked_read.end_reason == ReadEndReason.UNBLOCKED diff --git a/tests/simulator/test_readpool.py b/tests/simulator/test_readpool.py index a9f8c88..501aa92 100644 --- a/tests/simulator/test_readpool.py +++ b/tests/simulator/test_readpool.py @@ -1,24 +1,32 @@ +from pathlib import Path +from unittest.mock import MagicMock import pytest import numpy as np from Bio import SeqIO from Bio.Seq import Seq from 
simreaduntil.shared_utils.dna import get_random_DNA_seq -from simreaduntil.simulator.readpool import ReadPoolFromIterable, ReadPoolFromIterablePerChannel, ReadPoolFromFile, reads_from_file_gen, NoReadLeft +from simreaduntil.simulator.readpool import ReadPoolFromIterable, ReadPoolFromIterablePerChannel, ReadPoolFromFile, ThreadedReadPoolWrapper, reads_from_file_gen, NoReadLeftException gen_from_list = pytest.helpers.gen_from_list @pytest.fixture -def dummy_reads_fasta(tmp_path): +def dummy_reads_fastas(tmp_path): """ Creates a dummy fasta file with reads """ - filename = tmp_path / "test111.fasta" - with open(filename, "w") as f: + filename1 = tmp_path / "test1.fasta" + with open(filename1, "w") as f: SeqIO.write(SeqIO.SeqRecord(Seq("AAACCTGG"), id="read1"), f, "fasta") SeqIO.write(SeqIO.SeqRecord(Seq("CCCCCGGTT"), id="read2"), f, "fasta") - return filename + + filename2 = tmp_path / "test2.fasta" + with open(filename2, "w") as f: + SeqIO.write(SeqIO.SeqRecord(Seq("AAACCTGG"), id="read3"), f, "fasta") + SeqIO.write(SeqIO.SeqRecord(Seq("CCCCCGGTT"), id="read4"), f, "fasta") + + return filename1, filename2 def test_ReadPoolFromIterable(): @@ -29,9 +37,11 @@ def test_ReadPoolFromIterable(): assert read_pool.nb_reads_returned == 1 assert read_pool.get_new_read() == ("read2", "AAACCTGGTTAGG") assert read_pool.nb_reads_returned == 2 - with pytest.raises(NoReadLeft): + with pytest.raises(NoReadLeftException): read_pool.get_new_read() assert read_pool.nb_reads_returned == 2 + + read_pool.finish() def test_ReadPoolFromIterablePerChannel(): random_state = np.random.default_rng(2) @@ -45,34 +55,89 @@ def test_ReadPoolFromIterablePerChannel(): assert len(read_pool.get_new_read(2)) == 8 assert len(read_pool.get_new_read(1)) == 3 assert len(read_pool.get_new_read(1)) == 5 - with pytest.raises(NoReadLeft): + with pytest.raises(NoReadLeftException): read_pool.get_new_read(1) # do twice - with pytest.raises(NoReadLeft): + with pytest.raises(NoReadLeftException): 
read_pool.get_new_read(1) assert len(read_pool.get_new_read(2)) == 9 - with pytest.raises(NoReadLeft): + with pytest.raises(NoReadLeftException): read_pool.get_new_read(2) -def test_reads_from_file_gen(dummy_reads_fasta): + read_pool.finish() + +def test_reads_from_file_gen(dummy_reads_fastas): # no shuffle - assert list(reads_from_file_gen(dummy_reads_fasta)) == [('read1', 'AAACCTGG'), ('read2', 'CCCCCGGTT')] + assert list(reads_from_file_gen(dummy_reads_fastas[0])) == [('read1', 'AAACCTGG'), ('read2', 'CCCCCGGTT')] random_state = np.random.default_rng(2) - assert list(reads_from_file_gen(dummy_reads_fasta, shuffle_rand_state=random_state)) == [('read1', 'AAACCTGG'), ('read2', 'CCCCCGGTT')] + assert list(reads_from_file_gen(dummy_reads_fastas[0], shuffle_rand_state=random_state)) == [('read1', 'AAACCTGG'), ('read2', 'CCCCCGGTT')] random_state = np.random.default_rng(5) - assert list(reads_from_file_gen(dummy_reads_fasta, shuffle_rand_state=random_state)) == [('read2', 'CCCCCGGTT'), ('read1', 'AAACCTGG')] + assert list(reads_from_file_gen(dummy_reads_fastas[0], shuffle_rand_state=random_state)) == [('read2', 'CCCCCGGTT'), ('read1', 'AAACCTGG')] -def test_ReadPoolFromFile(dummy_reads_fasta): - read_pool = ReadPoolFromFile(dummy_reads_fasta) +def test_ReadPoolFromFile(dummy_reads_fastas): + read_pool = ReadPoolFromFile(dummy_reads_fastas[0]) assert not read_pool.definitely_empty print(read_pool) assert read_pool.get_new_read() == ("read1", "AAACCTGG") assert read_pool.get_new_read() == ("read2", "CCCCCGGTT") - with pytest.raises(NoReadLeft): + with pytest.raises(NoReadLeftException): read_pool.get_new_read() assert read_pool.definitely_empty +def test_ReadPoolFromFileThreaded(dummy_reads_fastas): + reads_dir = Path(dummy_reads_fastas[0]).parent + + for (file_obj, num_reads) in [(dummy_reads_fastas[0], 2), (reads_dir, 4)]: + + assert ReadPoolFromFile.can_handle(file_obj) + + read_pool = ThreadedReadPoolWrapper(ReadPoolFromFile(file_obj), queue_size=3) + 
repr(read_pool) + + [read_pool.get_new_read() for _ in range(num_reads)] + with pytest.raises(NoReadLeftException): + read_pool.get_new_read() + with pytest.raises(NoReadLeftException): + read_pool.get_new_read() + + assert read_pool.definitely_empty + + read_pool.finish() + +def test_ReadPoolFromFileThreaded_NonEmpty(dummy_reads_fastas): + reads_dir = Path(dummy_reads_fastas[0]).parent + read_pool = ThreadedReadPoolWrapper(ReadPoolFromFile(reads_dir), queue_size=2) + read_pool.get_new_read() + + read_pool.finish() # joins the thread + # there should still be one read left, check that the read pool thread terminates properly + +def pyslow5_is_available(): + try: + import pyslow5 + return True + except ImportError: + return False + +# @pytest.mark.skipif(not pyslow5_is_available(), reason="pyslow5 is not installed") +# def test_Slow5ReadPool(mocker, tmp_path): +# import tempfile +# mock = MagicMock() # cannot use Mock() because it doesn't have __iter__ +# mocker.patch("simreaduntil.simulator.readpool.get_slow5_reads_gen", return_value=mock) +# mock.__iter__.return_value = gen_from_list((("read1", [1, 2, 3]), ("read2", [4, 5]))) +# slow5_dir = tmp_path / "slow5_dummy" +# slow5_dir.mkdir() +# (slow5_dir / "test.slow5").touch() # so it iterates over one file + +# assert Slow5ReadPool.can_handle(slow5_dir) + +# read_pool = Slow5ReadPool(slow5_dir, 1) +# assert read_pool.get_new_read() == ("read1", [1, 2, 3]) +# assert read_pool.get_new_read() == ("read2", [4, 5]) +# with pytest.raises(NoReadLeftException): +# read_pool.get_new_read() +# assert read_pool.definitely_empty \ No newline at end of file diff --git a/tests/simulator/test_readswriter.py b/tests/simulator/test_readswriter.py index 60c8ee7..10a8808 100644 --- a/tests/simulator/test_readswriter.py +++ b/tests/simulator/test_readswriter.py @@ -3,35 +3,54 @@ import shutil import dill from simreaduntil.shared_utils.utils import get_file_content -from simreaduntil.simulator.readswriter import ArrayReadsWriter, 
RotatingFileReadsWriter, SingleFileReadsWriter +from simreaduntil.simulator.readswriter import ArrayReadsWriter, CompoundReadsWriter, RotatingFileReadsWriter, SingleFileReadsWriter, ThreadedReadsWriterWrapper from Bio import SeqIO from Bio.Seq import Seq def test_SingleFileReadsWriter(tmp_path): - filename = tmp_path / "reads1.txt" - with open(filename, "w") as fh: - reads_writer = SingleFileReadsWriter(fh, prefix="Pref:") - reads_writer.write_read(SeqIO.SeqRecord(Seq("AACCGTT"), id="read1")) - reads_writer.write_read(SeqIO.SeqRecord(Seq("GGGGCCAA"), id="read2")) - print(reads_writer) - expected_content = ">Pref:read1 \nAACCGTT\n>Pref:read2 \nGGGGCCAA\n" - assert get_file_content(filename) == expected_content - obj = dill.loads(dill.dumps(reads_writer)) - assert obj.fh is None - # check file not overwritten - assert get_file_content(filename) == expected_content - + for (writer_wrapper, test_picklable) in [(lambda x: x, True), (ThreadedReadsWriterWrapper, False)]: + filename = tmp_path / "reads1.txt" + with open(filename, "w") as fh: + reads_writer = SingleFileReadsWriter(fh, prefix="Pref:") + reads_writer = writer_wrapper(reads_writer) + reads_writer.write_read(SeqIO.SeqRecord(Seq("AACCGTT"), id="read1")) + reads_writer.write_read(SeqIO.SeqRecord(Seq("GGGGCCAA"), id="read2")) + print(reads_writer) + + reads_writer.finish() + + expected_content = ">Pref:read1 \nAACCGTT\n>Pref:read2 \nGGGGCCAA\n" + assert get_file_content(filename) == expected_content + if test_picklable: + obj = dill.loads(dill.dumps(reads_writer)) + assert obj.fh is None + # check file not overwritten + assert get_file_content(filename) == expected_content + + def test_ArrayReadsWriter(): reads_writer = ArrayReadsWriter() reads_writer.write_read(SeqIO.SeqRecord(Seq("AACCGTT"), id="read1")) reads_writer.write_read(SeqIO.SeqRecord(Seq("GGGGCCAA"), id="read2")) - reads_writer.reads, [('read1', Seq('AACCGTT')), ('read2', Seq('GGGGCCAA'))] + assert reads_writer.reads == [("read1", Seq("AACCGTT"), ""), 
("read2", Seq("GGGGCCAA"), "")] str(reads_writer) str(reads_writer.extended_repr()) dill.loads(dill.dumps(reads_writer)) +def test_CompoundReadsWriter(): + reads_writer1 = ArrayReadsWriter() + reads_writer2 = ArrayReadsWriter() + with CompoundReadsWriter([reads_writer1, reads_writer2]) as reads_writer: + reads_writer.write_read(SeqIO.SeqRecord(Seq("AACCGTT"), id="read1")) + reads_writer.write_read(SeqIO.SeqRecord(Seq("GGGGCCAA"), id="read2")) + + repr(reads_writer) + + assert reads_writer1.reads == [("read1", Seq("AACCGTT"), ""), ("read2", Seq("GGGGCCAA"), "")] + assert reads_writer2.reads == [("read1", Seq("AACCGTT"), ""), ("read2", Seq("GGGGCCAA"), "")] + def test_RotatingFileReadsWriter(tmp_path): def nb_files_in_dir(path): return sum(1 for _ in Path(path).iterdir()) diff --git a/tests/simulator/test_sim_params.py b/tests/simulator/test_sim_params.py index bd11a30..eaa2846 100644 --- a/tests/simulator/test_sim_params.py +++ b/tests/simulator/test_sim_params.py @@ -7,14 +7,14 @@ def test_sim_params(): sim_params = SimParams( gap_samplers={f"channel_{i}": ConstantGapsUntilBlocked(short_gap_length=1.2, long_gap_length=1.2, prob_long_gap=0.15, time_until_blocked=np.inf, read_delay=0) for i in range(2)}, - bp_per_second=10, chunk_size=4, default_unblock_duration=1.2, seed=0, + bp_per_second=10, min_chunk_size=4, default_unblock_duration=1.2, seed=0, ) # set to some random values - sim_params.set(bp_per_second=1000, default_unblock_duration=0.2, chunk_size=100, seed=2) + sim_params.set(bp_per_second=1000, default_unblock_duration=0.2, min_chunk_size=100, seed=2) assert sim_params.bp_per_second == 1000 assert sim_params.default_unblock_duration == 0.2 - assert sim_params.chunk_size == 100 + assert sim_params.min_chunk_size == 100 assert sim_params._initial_seed == 2 assert sim_params.gap_samplers["channel_0"].short_gap_length == 1.2 diff --git a/tests/simulator/test_simfasta_to_seqsum.py b/tests/simulator/test_simfasta_to_seqsum.py index 67e951f..4e725a8 100644 --- 
a/tests/simulator/test_simfasta_to_seqsum.py +++ b/tests/simulator/test_simfasta_to_seqsum.py @@ -1,7 +1,65 @@ +from pathlib import Path +from textwrap import dedent import pandas as pd from simreaduntil.seqsum_tools.seqsum_preprocessing import sort_and_clean_seqsum_df -from simreaduntil.simulator.simfasta_to_seqsum import convert_simfasta_dir_to_seqsum, convert_simfasta_to_seqsum +from simreaduntil.simulator.channel_element import ChunkedRead +from simreaduntil.simulator.simfasta_to_seqsum import SequencingSummaryWriter, convert_simfasta_dir_to_seqsum, convert_simfasta_to_seqsum, write_seqsum_header, write_seqsum_record_line +from Bio import SeqIO +from Bio.Seq import Seq +def get_dummy_record(): + # >chr20_36784526_aligned_proc0:16m_R_0_2226_0 full_seqlen=13552 t_start=0.1752480000000105 t_end=6.115326881408691 t_delay=0.09375 ended=user_unblocked tags= full_read_id=chr20_36772816_aligned_proc0:16_R_0_13936_0 ch=ch138 + # GTGCAATTTATACTCATGGCCAGTGTACAGTGACTCATGCCTGTACCCCACTTTAGGAGA + description = "chr20_36784526_aligned_proc0:16m_R_0_2226_0 full_seqlen=13552 t_start=0.1752480000000105 t_end=6.115326881408691 t_delay=0.09375 ended=user_unblocked tags= full_read_id=chr20_36772816_aligned_proc0:16_R_0_13936_0 ch=ch138" + read_id = description.split(" ")[0] + seq = "GTGCAATTTATACTCATGGCCAGTGTACAGTGACTCATGCCTGTACCCCACTTTAGGAGA" + return SeqIO.SeqRecord(id=read_id, description=description, seq=Seq(seq)) + +# to match with get_dummy_record +expected_seqsum_file_content=dedent(f"""\ +read_id\tchannel\tmux\tstart_time\tduration\tpasses_filtering\ttemplate_start\ttemplate_duration\tsequence_length_template\tend_reason\tnb_ref_bps_full\tstopped_receiving\tnever_requested +chr20_36784526_aligned_proc0:16m_R_0_2226_0\tch138\t1\t0.1752480000000105\t5.940078881408681\tTrue\t0.2689980000000105\t5.846328881408681\t{len(get_dummy_record().seq)}\tdata_service_unblock_mux_change\t13936\tFalse\tFalse +""") + +def test_write_seqsum_record_line(tmp_path): + sequencing_summary_filename 
= tmp_path / "seqsummary_filename_simple1.txt" + with open(sequencing_summary_filename, mode="w") as seqsummary_fh: + write_seqsum_header(seqsummary_fh) + + write_seqsum_record_line(get_dummy_record(), seqsummary_fh) + + assert sequencing_summary_filename.read_text() == expected_seqsum_file_content + +def test_SequencingSummaryWriter(tmp_path): + sequencing_summary_filename = tmp_path / "seqsummary_filename_simple2.txt" + with open(sequencing_summary_filename, mode="w") as seqsummary_fh: + with SequencingSummaryWriter(seqsummary_fh) as seqsum_writer: + seqsum_writer.write_read(get_dummy_record()) + + assert sequencing_summary_filename.read_text() == expected_seqsum_file_content + +def test_seqsum_line_with_chunked_read(tmp_path): + chunked_read = ChunkedRead("read1", "111112222222222333333", 10.1, read_speed=10, min_chunk_size=4) + chunked_read.get_new_samples(10.1+0.5) + chunked_read.stop_receiving() + seq_record = chunked_read.finish() + seq_record.description += f" ch=ch1" # added by channel on top + + sequencing_summary_filename = tmp_path / "seqsummary_filename_simple3.txt" + with open(sequencing_summary_filename, mode="w") as seqsummary_fh: + write_seqsum_record_line(seq_record, seqsummary_fh, read_id=seq_record.id) + + # test with SequencingSummaryWriter + sequencing_summary_filename = tmp_path / "seqsummary_filename_simple4.txt" + with open(sequencing_summary_filename, mode="w") as seqsummary_fh: + with SequencingSummaryWriter(seqsummary_fh) as seqsum_writer: + seqsum_writer.write_read(seq_record) + + expected_filecontent = dedent("""\ + read_id\tchannel\tmux\tstart_time\tduration\tpasses_filtering\ttemplate_start\ttemplate_duration\tsequence_length_template\tend_reason\tnb_ref_bps_full\tstopped_receiving\tnever_requested + read1\tch1\t1\t10.1\t2.0999999999999996\tTrue\t10.1\t2.0999999999999996\t21\tsignal_positive\tnan\tTrue\tFalse + """) + assert Path(sequencing_summary_filename).read_text() == expected_filecontent def 
test_convert_simfasta_to_seqsum(shared_datadir, tmp_path): number_of_lines_in_file = lambda filename: sum(1 for _ in open(filename)) diff --git a/tests/simulator/test_simulator.py b/tests/simulator/test_simulator.py index 213ea8b..492e76e 100644 --- a/tests/simulator/test_simulator.py +++ b/tests/simulator/test_simulator.py @@ -1,5 +1,3 @@ -import itertools -import logging from typing import Dict import pytest from pytest import approx @@ -12,12 +10,11 @@ from simreaduntil.shared_utils.timing import cur_ns_time from simreaduntil.simulator import channel from simreaduntil.simulator.channel import ChannelAlreadyRunningException, StoppedReceivingResponse, UnblockResponse -from simreaduntil.simulator.channel_element import ChunkedRead from simreaduntil.simulator.gap_sampling.constant_gaps_until_blocked import ConstantGapsUntilBlocked from simreaduntil.simulator.pore_model import PoreModel from simreaduntil.simulator.readpool import ReadPoolFromIterable from simreaduntil.simulator.readswriter import ArrayReadsWriter -from simreaduntil.simulator.simulator import ActionType, InexistentChannelsException, ONTSimulator, ReadUntilClientFromDevice, ReadUntilDevice, assign_read_durations_to_channels, convert_action_results_to_df, plot_sim_actions, run_periodic_mux_scan_thread, stop_simulation_after_time_thread +from simreaduntil.simulator.simulator import ActionType, InexistentChannelsException, ONTSimulator, assign_read_durations_to_channels, convert_action_results_to_df, plot_sim_actions, run_periodic_mux_scan_thread, stop_simulation_after_time_thread from simreaduntil.simulator.simulator_params import SimParams from simreaduntil.simulator.utils import in_interval from simreaduntil.usecase_helpers.utils import random_reads_gen @@ -32,7 +29,7 @@ def sim_params() -> SimParams: # make it fast enough so something actually happens (without making tests last too long) return SimParams( gap_samplers={f"channel_{i}": ConstantGapsUntilBlocked(short_gap_length=0.04, 
long_gap_length=0.05, prob_long_gap=0.02, time_until_blocked=np.inf, read_delay=0) for i in range(2)}, - bp_per_second=100, chunk_size=20, default_unblock_duration=0.02, seed=0, + bp_per_second=100, min_chunk_size=20, default_unblock_duration=0.02, seed=0, ) @pytest.fixture @@ -40,7 +37,8 @@ def simulator(sim_params) -> ONTSimulator: return ONTSimulator( read_pool=ReadPoolFromIterable(random_reads_gen(random_state=np.random.default_rng(3), length_range=(10, 50))), reads_writer=ArrayReadsWriter(), - sim_params=sim_params + sim_params=sim_params, + output_dir="", ) def test_start_stop(simulator): @@ -105,14 +103,15 @@ def test_get_basecalled_chunks(): sim_params = SimParams( gap_samplers={f"channel_{i}": ConstantGapsUntilBlocked(short_gap_length=0.4, long_gap_length=0.5, prob_long_gap=0, time_until_blocked=np.inf, read_delay=0) for i in range(2)}, - bp_per_second=10, chunk_size=4, default_unblock_duration=0.2, seed=0, + bp_per_second=10, min_chunk_size=4, default_unblock_duration=0.2, seed=0, ) - read_pool = ReadPoolFromIterable(gen_from_list((("read1", "AAAAGGGGC"), ("read2", "TTTTAC"), ("read3", "TTTTAAAACCCAAACTTTACCA"), ("read4", "TCTTAAAACCTTA")))) + read_pool = ReadPoolFromIterable(gen_from_list((("read1", "AAAAAGGGGC"), ("read2", "TTTTAC"), ("read3", "TTTTAAAACCCAAACTTTACCA"), ("read4", "TCTTAAAACCTTA")))) simulator = ONTSimulator( read_pool=read_pool, reads_writer=ArrayReadsWriter(), - sim_params = sim_params + sim_params = sim_params, + output_dir="", ) simulator.save_elems = True eps = 1e-5 @@ -120,30 +119,33 @@ def test_get_basecalled_chunks(): simulator.sync_start(0) simulator.sync_forward(0.9) chunks = list(simulator.get_basecalled_read_chunks()) - assert sorted(chunks) == sorted([(1, 'read1', 'AAAA', 'noquality', 4), (2, 'read2', 'TTTT', 'noquality', 4)]) + assert sorted(chunks) == sorted([(1, 'read1', 'AAAAA', 'noquality', 5), (2, 'read2', 'TTTTA', 'noquality', 5)]) chunks = list(simulator.get_basecalled_read_chunks()) assert len(chunks) == 0 
simulator.sync_forward(1.2+eps) chunks = list(simulator.get_basecalled_read_chunks()) - assert chunks == [(1, 'read1', 'GGGG', 'noquality', 8)] + assert chunks == [] # below min chunk size + simulator.sync_forward(1.3+eps) + chunks = list(simulator.get_basecalled_read_chunks()) + assert chunks == [(1, 'read1', 'GGGG', 'noquality', 9)] - simulator.sync_forward(1.4+eps) # to force channel 1 to get read3 - simulator.sync_forward(2.2+eps) + simulator.sync_forward(1.5+eps) # to force channel 2 to get read3, channel 1 not yet because gap has not finished + simulator.sync_forward(2.3+eps) with pytest.raises(InexistentChannelsException): # channels are 1-based list(simulator.get_basecalled_read_chunks(channel_subset=[0])) chunks = list(simulator.get_basecalled_read_chunks(channel_subset=[1])) - assert chunks == [(1, 'read4', 'TCTT', 'noquality', 4)] + assert chunks == [(1, 'read4', 'TCTTA', 'noquality', 5)] chunks = list(simulator.get_basecalled_read_chunks(channel_subset=[2])) - assert chunks == [(2, 'read3', 'TTTTAAAA', 'noquality', 8)] + assert chunks == [(2, 'read3', 'TTTTAAAAC', 'noquality', 9)] simulator.sync_forward(2.7+eps) chunks = list(simulator.get_basecalled_read_chunks(batch_size=1)) assert len(chunks) == 1 - # simulator.plot_channels(); import matplotlib.pyplot as plt; plt.show() + # ax = simulator.plot_channels(); import matplotlib.pyplot as plt; plt.show() simulator.sync_stop() @@ -159,14 +161,15 @@ def test_get_raw_chunks(shared_datadir): sim_params = SimParams( gap_samplers={f"channel_{i}": ConstantGapsUntilBlocked(short_gap_length=0.4, long_gap_length=0.5, prob_long_gap=0, time_until_blocked=np.inf, read_delay=0) for i in range(2)}, - bp_per_second=10, chunk_size=4, default_unblock_duration=0.2, seed=0, pore_model=PoreModel(pore_filename, signals_per_bp=signals_per_bp) + bp_per_second=10, min_chunk_size=4, default_unblock_duration=0.2, seed=0, pore_model=PoreModel(pore_filename, signals_per_bp=signals_per_bp) ) read_pool = 
ReadPoolFromIterable(gen_from_list((("read1", seq), ("read2", "TTTTAC")))) simulator = ONTSimulator( read_pool=read_pool, reads_writer=ArrayReadsWriter(), - sim_params = sim_params + sim_params = sim_params, + output_dir="", ) simulator.sync_start(0) @@ -174,7 +177,7 @@ def test_get_raw_chunks(shared_datadir): simulator.sync_forward(0.4+0.9+eps) chunks = list(simulator.get_raw_chunks(channel_subset=[1])) # get 2 chunks for channel 1 raw_signal = chunks[0][2] - assert len(raw_signal) == (2*4 - k + 1) * signals_per_bp + assert len(raw_signal) == (9 - k + 1) * signals_per_bp def test_synchronous_sim_is_deterministic(sim_params, channel_write_zero_length_reads): # test that the synchronous simulator produces the same results when run twice, only for "constant" update method @@ -188,7 +191,8 @@ def run_sim(seed): simulator = ONTSimulator( read_pool=ReadPoolFromIterable(random_reads_gen(random_state=np.random.default_rng(3), length_range=(10, 50))), reads_writer=reads_writer, - sim_params=sim_params + sim_params=sim_params, + output_dir="", ) simulator.save_elems = True @@ -217,7 +221,7 @@ def test_random_ops_synchronous(simulator, async_mode, channel_write_zero_length sim_params = SimParams( gap_samplers={f"channel_{i}": ConstantGapsUntilBlocked(short_gap_length=1.2, long_gap_length=1.2, prob_long_gap=0.35, time_until_blocked=200, read_delay=0) for i in range(2)}, # gap_samplers={f"channel_{i}": ConstantGapsUntilBlocked(short_gap_length=1.2, long_gap_length=5.2, prob_long_gap=0.25, time_until_blocked=200, read_delay=0) for i in range(2)}, - bp_per_second=10, chunk_size=4, default_unblock_duration=1.2, seed=0, + bp_per_second=10, min_chunk_size=4, default_unblock_duration=1.2, seed=0, ) # apply random operations, check that reads are correct @@ -230,7 +234,8 @@ def get_read_and_save_id(): simulator = ONTSimulator( read_pool=ReadPoolFromIterable(get_read_and_save_id()), reads_writer=ArrayReadsWriter(), - sim_params=sim_params + sim_params=sim_params, + output_dir="", ) 
simulator.save_elems = True @@ -261,11 +266,14 @@ def get_read_and_save_id(): assert stats.time_active + stats.no_reads_left.time_spent + stats.channel_broken.time_spent == approx(simulator._channels[0].t - (0 if async_mode else t_start)) def test_realtime(channel_write_zero_length_reads): + # todo: this does not test real time, instead need to check that the simulator forward loop is never delayed, + # need to look at test results and look for "Simulation cannot keep up, delay" messages + # test that the simulator can run in real time (i.e. end time is at least 0.95 * real time) # can be used to determine the optimal acceleration factor that does not cause too much delay sim_params = SimParams( gap_samplers={f"channel_{i}": ConstantGapsUntilBlocked(short_gap_length=0.4, long_gap_length=1.3, prob_long_gap=0.1, time_until_blocked=np.inf, read_delay=0) for i in range(512)}, - bp_per_second=450, chunk_size=200, default_unblock_duration=1.4, seed=0, + bp_per_second=450, min_chunk_size=200, default_unblock_duration=1.4, seed=0, ) # pre-generate reads to make sure that this is not responsible for the delay @@ -276,7 +284,8 @@ def test_realtime(channel_write_zero_length_reads): # read_pool=ReadPoolFromIterable(reads_gen), read_pool=ReadPoolFromIterable(random_reads_gen(random_state=np.random.default_rng(3), length_range=(500, 5000))), reads_writer=ArrayReadsWriter(), - sim_params=sim_params + sim_params=sim_params, + output_dir="", ) acceleration_factor = 5 # depends on computer load @@ -317,59 +326,61 @@ def test_run_periodic_mux_scan_thread(simulator): assert simulator.get_channel_stats()[0].mux_scans.finished_number == 5 assert simulator.get_channel_stats()[1].mux_scans.finished_number == 5 -def test_readuntil_fromdevice(): - # tests the readuntil client +# def test_readuntil_fromdevice(): +# # tests the readuntil client - sim_params = SimParams( - gap_samplers={f"channel_{i}": ConstantGapsUntilBlocked(short_gap_length=0.4, long_gap_length=0.5, prob_long_gap=0, 
time_until_blocked=np.inf, read_delay=0) for i in range(2)}, - bp_per_second=10, chunk_size=4, default_unblock_duration=0.2, seed=0, - ) +# sim_params = SimParams( +# gap_samplers={f"channel_{i}": ConstantGapsUntilBlocked(short_gap_length=0.4, long_gap_length=0.5, prob_long_gap=0, time_until_blocked=np.inf, read_delay=0) for i in range(2)}, +# bp_per_second=10, min_chunk_size=4, default_unblock_duration=0.2, seed=0, +# ) - read_pool = ReadPoolFromIterable(gen_from_list((("read1", "AAAAGGGGC"), ("read2", "TTTTACCTTACC"), ("read3", "TTTTAAAACCCAAACTTTACCA"), ("read4", "TCTTAAAACCTTA")))) - simulator = ONTSimulator( - read_pool=read_pool, - reads_writer=ArrayReadsWriter(), - sim_params=sim_params - ) - simulator.save_elems = True - eps = 1e-5 +# read_pool = ReadPoolFromIterable(gen_from_list((("read1", "AAAAGGGGC"), ("read2", "TTTTACCTTACC"), ("read3", "TTTTAAAACCCAAACTTTACCA"), ("read4", "TCTTAAAACCTTA")))) +# simulator = ONTSimulator( +# read_pool=read_pool, +# reads_writer=ArrayReadsWriter(), +# sim_params=sim_params, +# output_dir="", +# ) +# simulator.save_elems = True +# eps = 1e-5 - ru_client = ReadUntilClientFromDevice(simulator) +# ru_client = ReadUntilClientFromDevice(simulator) - simulator.sync_start(-0.4) - simulator.sync_forward(0.5) +# simulator.sync_start(-0.4) +# simulator.sync_forward(0.5) - chunks = list(ru_client.get_basecalled_read_chunks()) - assert len(chunks) == 2 +# chunks = list(ru_client.get_basecalled_read_chunks()) +# assert len(chunks) == 2 - with pytest.raises(InexistentChannelsException): - ru_client.stop_receiving_batch([(0, "read1")]) - responses = ru_client.stop_receiving_batch([(1, "read1"), (2, "inexistent")]) - assert responses == [StoppedReceivingResponse.STOPPED_RECEIVING, StoppedReceivingResponse.MISSED] - assert ru_client.stop_receiving_batch([(1, "read1")]) == [StoppedReceivingResponse.ALREADY_STOPPED_RECEIVING] +# with pytest.raises(InexistentChannelsException): +# ru_client.stop_receiving_batch([(0, "read1")]) +# responses = 
ru_client.stop_receiving_batch([(1, "read1"), (2, "inexistent")]) +# assert responses == [StoppedReceivingResponse.STOPPED_RECEIVING, StoppedReceivingResponse.MISSED] +# assert ru_client.stop_receiving_batch([(1, "read1")]) == [StoppedReceivingResponse.ALREADY_STOPPED_RECEIVING] - simulator.sync_forward(0.8+eps) +# simulator.sync_forward(0.8+eps) - chunks = list(ru_client.get_basecalled_read_chunks()) - assert len(chunks) == 1 +# chunks = list(ru_client.get_basecalled_read_chunks()) +# assert len(chunks) == 1 - assert ru_client.stop_receiving_batch([(2, "read2")]) == [StoppedReceivingResponse.STOPPED_RECEIVING] +# assert ru_client.stop_receiving_batch([(2, "read2")]) == [StoppedReceivingResponse.STOPPED_RECEIVING] - assert ru_client.unblock_read_batch([(1, "read1"), (2, "read2"), (1, "inexistent")]) == [True, True, False] +# assert ru_client.unblock_read_batch([(1, "read1"), (2, "read2"), (1, "inexistent")]) == [True, True, False] - simulator.sync_stop() +# simulator.sync_stop() def test_get_action_results(): sim_params = SimParams( gap_samplers={f"channel_{i}": ConstantGapsUntilBlocked(short_gap_length=0.4, long_gap_length=0.5, prob_long_gap=0, time_until_blocked=np.inf, read_delay=0) for i in range(2)}, - bp_per_second=10, chunk_size=4, default_unblock_duration=0.2, seed=0, + bp_per_second=10, min_chunk_size=4, default_unblock_duration=0.2, seed=0, ) read_pool = ReadPoolFromIterable(gen_from_list((("read1", "AAAAGGGGC"), ("read2", "TTTTAC"), ("read3", "TTTTAAAACCCAAACTTTACCA"), ("read4", "TCTTAAAACCTTA")))) simulator = ONTSimulator( read_pool=read_pool, reads_writer=ArrayReadsWriter(), - sim_params = sim_params + sim_params=sim_params, + output_dir="", ) simulator.save_elems = True @@ -381,6 +392,8 @@ def test_get_action_results(): ("read1", 0.5, 1, ActionType.StopReceiving, StoppedReceivingResponse.STOPPED_RECEIVING), ("inexistent", 0.5, 2, ActionType.StopReceiving, StoppedReceivingResponse.MISSED) ] + assert simulator.get_action_results(clear=False) == [] + 
simulator.sync_forward(t=0, delta=True) # process actions assert simulator.get_action_results(clear=False) == exp_action_results plot_sim_actions(convert_action_results_to_df(exp_action_results), close_figures=True) @@ -390,6 +403,7 @@ def test_get_action_results(): simulator.sync_forward(1.3+1e-4) simulator.unblock_read(2, "read3") simulator.unblock_read(2, "inexistent") + simulator.sync_forward(t=0, delta=True) # process actions assert simulator.get_action_results() == [ ("read3", 1.3+1e-4, 2, ActionType.Unblock, UnblockResponse.UNBLOCKED), ("inexistent", 1.3+1e-4, 2, ActionType.Unblock, UnblockResponse.MISSED) diff --git a/tests/simulator/test_simulator_client.py b/tests/simulator/test_simulator_client.py index 33ab036..4fdefdf 100644 --- a/tests/simulator/test_simulator_client.py +++ b/tests/simulator/test_simulator_client.py @@ -17,7 +17,6 @@ def test_grpc_client(channel_write_zero_length_reads): reads_writer = ArrayReadsWriter() - reads_writer.output_dir = "dummy_dir" # patch attribute sim_params = SimParams( gap_samplers={f"channel_{i}": ConstantGapsUntilBlocked(short_gap_length=0.4, long_gap_length=10.1, prob_long_gap=0, time_until_blocked=np.inf, read_delay=0) for i in range(2)}, @@ -26,6 +25,7 @@ def test_grpc_client(channel_write_zero_length_reads): read_pool=ReadPoolFromIterable(random_reads_gen(random_state=np.random.default_rng(3), length_range=(10, 50))), reads_writer=reads_writer, sim_params=sim_params, + output_dir="", ) port, server, unique_id = launchable_device_grpc_server(simulator) @@ -40,7 +40,7 @@ def test_grpc_client(channel_write_zero_length_reads): assert client.unique_id == unique_id, f"mismatching unique_ids, probably connected to an existing server: {client.unique_id} != {unique_id}" - assert str(client.mk_run_dir) == "dummy_dir" + assert str(client.mk_run_dir) == "" assert not client.is_running assert client.start(acceleration_factor=acceleration_factor) diff --git a/tests/simulator/test_simulator_server.py 
b/tests/simulator/test_simulator_server.py index e68a926..b02d602 100644 --- a/tests/simulator/test_simulator_server.py +++ b/tests/simulator/test_simulator_server.py @@ -85,7 +85,8 @@ def test_launchable_device_grpc_server(): simulator = ONTSimulator( read_pool=ReadPoolFromIterable(random_reads_gen(random_state=np.random.default_rng(3), length_range=(10, 50))), reads_writer=ArrayReadsWriter(), - sim_params=sim_params + sim_params=sim_params, + output_dir="", ) # import os; os.environ["GRPC_VERBOSITY"] = "DEBUG"; os.environ["GRPC_TRACE"] = "http" @@ -106,13 +107,13 @@ def test_launchable_device_grpc_server(): assert stub.StartSim(ont_device_pb2.StartRequest(acceleration_factor=2, update_method="realtime", log_interval=10)).value # unblocking inexistent read - assert not stub.PerformActions(ont_device_pb2.ReadActionsRequest(actions=[ + stub.PerformActions(ont_device_pb2.ReadActionsRequest(actions=[ ont_device_pb2.ReadActionsRequest.Action(channel=2, read_id="inexistent", unblock=ont_device_pb2.ReadActionsRequest.Action.UnblockAction(unblock_duration=0.2)) - ])).succeeded[0] + ])) - assert not stub.PerformActions(ont_device_pb2.ReadActionsRequest(actions=[ + stub.PerformActions(ont_device_pb2.ReadActionsRequest(actions=[ ont_device_pb2.ReadActionsRequest.Action(channel=1, read_id="inexistent", stop_further_data=ont_device_pb2.ReadActionsRequest.Action.StopReceivingAction()), - ])).succeeded[0] + ])) assert stub.StopSim(ont_device_pb2.EmptyRequest()).value diff --git a/tests/simulator/test_utils.py b/tests/simulator/test_utils.py index 9309de2..5d836bf 100644 --- a/tests/simulator/test_utils.py +++ b/tests/simulator/test_utils.py @@ -1,7 +1,12 @@ +import signal +import sys +import threading +import time from matplotlib import pyplot as plt import pytest from simreaduntil.shared_utils.plotting import make_tight_layout +from simreaduntil.shared_utils.utils import set_signal_handler, tee_stdouterr_to_file from simreaduntil.simulator.utils import format_percentage, 
in_interval, new_thread_name @@ -22,4 +27,4 @@ def test_format_percentage(): def test_make_tight_layout(): fig, ax = plt.subplots() ax.plot([0, 1], [0, 1]) - make_tight_layout(fig) \ No newline at end of file + make_tight_layout(fig) diff --git a/tests/usecase_helpers/data/run_dir/configs/config.toml b/tests/usecase_helpers/data/run_dir/configs/config.toml index 8306102..b40b264 100644 --- a/tests/usecase_helpers/data/run_dir/configs/config.toml +++ b/tests/usecase_helpers/data/run_dir/configs/config.toml @@ -1,5 +1,5 @@ run_dir = "simulator_run" # where reads, logs, pafs etc. will be written to -n_channels = 200 +n_channels = 20 # n_channels = 4 acceleration_factor = 10 run_duration = 100.0 @@ -9,9 +9,11 @@ run_duration = 100.0 ################################################# # reads_file = "nanosim_reads/perfect_reads_seed1_aligned_reads.fasta" +# reads_file = "/home/mmordig/rawhash_project/rawhash2/test/data/d2_ecoli_r94/small_slow5_files" +reads_len_range = [5_000, 10_000] ref_genome_path = "data/chm13v2.0_normalized1000000firsttwo.fa.gz" # sim_params_file = "sim_params.dill" -rotating = true +rotating_writeout = true mux_scan_period = 50 # seconds, accounting for acceleration mux_scan_duration = 10 # seconds diff --git a/tests/usecase_helpers/data/run_dir/configs/readfish_enrich_chr1.toml b/tests/usecase_helpers/data/run_dir/configs/readfish_enrich_chr1.toml index d4b471b..9b2cd13 100644 --- a/tests/usecase_helpers/data/run_dir/configs/readfish_enrich_chr1.toml +++ b/tests/usecase_helpers/data/run_dir/configs/readfish_enrich_chr1.toml @@ -15,6 +15,6 @@ targets = ["chr1"] single_on = "stop_receiving" multi_on = "stop_receiving" single_off = "unblock" -multi_off = "unblock" +multi_off = "proceed" no_seq = "proceed" # unclear what it is, does not seem to be used no_map = "proceed" # if no_map happens after mux_chunks were received, the read is rejected \ No newline at end of file diff --git a/tests/usecase_helpers/test_run_simulator_with_readfish.py 
b/tests/usecase_helpers/test_run_simulator_with_readfish.py index efaa470..138d1f1 100644 --- a/tests/usecase_helpers/test_run_simulator_with_readfish.py +++ b/tests/usecase_helpers/test_run_simulator_with_readfish.py @@ -34,6 +34,9 @@ def test_simulator_with_readfish(shared_datadir, tmp_path): assert Path("simulator_run/reads").exists() assert Path("simulator_run/sequencing_summary.txt").exists() + assert Path("simulator_run/live_sequencing_summary.txt").exists() + + assert Path("simulator_run/sequencing_summary.txt").read_text() == Path("simulator_run/live_sequencing_summary.txt").read_text() action_results_df = pd.read_csv("simulator_run/action_results.csv", sep="\t") plot_sim_actions(action_results_df, close_figures=True) diff --git a/usecases/README.md b/usecases/README.md index 3fee734..b59d8ec 100644 --- a/usecases/README.md +++ b/usecases/README.md @@ -75,7 +75,7 @@ cd .. # install ReadFish git submodule update --init --depth 1 external/ont_readfish source ~/ont_project_all/ont_project_venv/bin/activate -pip install -e './[readfish]' # -e for dev version +pip uninstall -y readfish; pip install './[readfish]'; pip show readfish # optional: install NanoSim and minimap2, but the usecase also works without # git submodule update --init --depth 1 external/ont_nanosim @@ -100,8 +100,9 @@ If the read ids are NanoSim ids with ground-truth alignment information, `minima Files: - `enrich_usecase.py`: end-to-end script that runs an enrichment with ReadFish connected to the simulator, see the instructions in that file - `enrich_usecase_submission.sh`: condor submission script, can also be run locally +- `compute_absolute_enrichment.ipynb`: compute the absolute enrichment for the enrich usecase by comparing to control - `install_usecase_deps.sh`: to install `minimap2` and `NanoSim` (optional), launch it from the repo root -- `create_nanosim_reads.ipynb`: notebook to create NanoSim reads that can be fed into the simulator by modifying the config file +- 
`generate_nanosim_reads.sh`: script to create NanoSim reads that can be fed into the simulator ## Parameter Extraction from an Existing Run @@ -129,7 +130,7 @@ When running several configurations in parallel and some cache files do not exis These files are for our own reference and may not work for you out of the box: - `analyze_readfish_outputs.py`: to check whether ReadFish is mapping reads correctly by parsing the ground-truth from the read id -- `plot_existing_seqsum.py`: to plot an existing sequencing summary file, e.g., from a real run +- `plot_existing_seqsum.py`: to plot an existing sequencing summary file, e.g., from a real run; probably needs to be adapted to your setting - `remove_mux_scans.ipynb`: notebook showing how mux scans are removed (you don't need to run this, this is done automatically in the usecases) - `prepare_small_refgenome.py`: to create a small reference genome for the usecase - `results_preparation.md`: commands to create the results in the paper diff --git a/usecases/analyze_readfish_outputs.ipynb b/usecases/analyze_readfish_outputs.ipynb index 44b43c5..7b26b50 100644 --- a/usecases/analyze_readfish_outputs.ipynb +++ b/usecases/analyze_readfish_outputs.ipynb @@ -11,7 +11,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -23,10 +23,22 @@ "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", + "from pathlib import Path\n", "\n", "from simreaduntil.shared_utils.nanosim_parsing import NanoSimId\n" ] }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "# run_dir = Path(\"runs/enrich_usecase/full_run_sampler_per_window/simulator_run/\")\n", + "run_dir = Path(\"/is/cluster-test/fast/mmordig/ont_project/runs/enrich_usecase/readfish_exp/results_readfishexp_realreads/simulator_run/\")\n", + "# run_dir = 
Path(\"/home/mmordig/ont_project_all/ont_project/runs/enrich_usecase/readfish_exp/results_readfishexp_realreads_withflanking/simulator_run\")" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -36,7 +48,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -74,98 +86,99 @@ " \n", " \n", " 0\n", - " chr1_818237_aligned_2_R_0_5720_0\n", - " 600\n", + " chr1_26454210_aligned_proc0:64_F_0_1535_0\n", + " 263\n", " 0\n", - " 600\n", - " -\n", + " 263\n", + " +\n", " chr1\n", - " 1000000\n", - " 823357\n", - " 823957\n", + " 248387328\n", + " 26454210\n", + " 26454469\n", " \n", " \n", " 1\n", - " chr1_763753_aligned_21_R_0_8206_0\n", - " 800\n", - " 0\n", - " 800\n", - " -\n", - " chr1\n", - " 1000000\n", - " 771159\n", - " 771959\n", + " chr8_35080996_aligned_proc0:45_F_0_11151_0\n", + " 323\n", + " 3\n", + " 323\n", + " +\n", + " chr8\n", + " 146259331\n", + " 35081000\n", + " 35081331\n", " \n", " \n", " 2\n", - " chr1_541945_aligned_25_R_0_5739_0\n", - " 600\n", + " chr5_119262799_aligned_proc0:51_F_0_7992_0\n", + " 265\n", " 0\n", - " 600\n", - " -\n", - " chr1\n", - " 1000000\n", - " 547084\n", - " 547684\n", + " 265\n", + " +\n", + " chr5\n", + " 182045439\n", + " 119262799\n", + " 119263072\n", " \n", " \n", " 3\n", - " chr1_737931_aligned_46_F_0_8456_0\n", - " 800\n", + " chr2_26169279_aligned_proc0:26_F_0_10295_0\n", + " 343\n", " 0\n", - " 800\n", + " 334\n", " +\n", - " chr1\n", - " 1000000\n", - " 737931\n", - " 738731\n", + " chr2\n", + " 242696752\n", + " 26169279\n", + " 26169617\n", " \n", " \n", " 4\n", - " chr1_826073_aligned_166_R_0_6358_0\n", - " 600\n", + " chr3_117116174_aligned_proc0:36_F_0_7402_0\n", + " 342\n", " 0\n", - " 600\n", - " -\n", - " chr1\n", - " 1000000\n", - " 831831\n", - " 832431\n", + " 342\n", + " +\n", + " chr3\n", + " 201105948\n", + " 117116174\n", + " 117116526\n", " \n", " \n", "\n", "" ], "text/plain": [ - " read_id read_length read_start read_end \\\n", - "0 
chr1_818237_aligned_2_R_0_5720_0 600 0 600 \n", - "1 chr1_763753_aligned_21_R_0_8206_0 800 0 800 \n", - "2 chr1_541945_aligned_25_R_0_5739_0 600 0 600 \n", - "3 chr1_737931_aligned_46_F_0_8456_0 800 0 800 \n", - "4 chr1_826073_aligned_166_R_0_6358_0 600 0 600 \n", + " read_id read_length read_start \\\n", + "0 chr1_26454210_aligned_proc0:64_F_0_1535_0 263 0 \n", + "1 chr8_35080996_aligned_proc0:45_F_0_11151_0 323 3 \n", + "2 chr5_119262799_aligned_proc0:51_F_0_7992_0 265 0 \n", + "3 chr2_26169279_aligned_proc0:26_F_0_10295_0 343 0 \n", + "4 chr3_117116174_aligned_proc0:36_F_0_7402_0 342 0 \n", "\n", - " strand contig_name contig_length contig_start contig_end \n", - "0 - chr1 1000000 823357 823957 \n", - "1 - chr1 1000000 771159 771959 \n", - "2 - chr1 1000000 547084 547684 \n", - "3 + chr1 1000000 737931 738731 \n", - "4 - chr1 1000000 831831 832431 " + " read_end strand contig_name contig_length contig_start contig_end \n", + "0 263 + chr1 248387328 26454210 26454469 \n", + "1 323 + chr8 146259331 35081000 35081331 \n", + "2 265 + chr5 182045439 119262799 119263072 \n", + "3 334 + chr2 242696752 26169279 26169617 \n", + "4 342 + chr3 201105948 117116174 117116526 " ] }, - "execution_count": 3, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "mapping_paf_file = \"runs/enrich_usecase/full_run_sampler_per_window/simulator_run/mapping.paf\"\n", + "mapping_paf_file = run_dir / \"mapping.paf\"\n", + "\n", "df = pd.read_csv(mapping_paf_file, sep=\"\\t\", header=None, usecols=[0, 1, 2, 3, 4, 5, 6, 7, 8], names=[\"read_id\", \"read_length\", \"read_start\", \"read_end\", \"strand\", \"contig_name\", \"contig_length\", \"contig_start\", \"contig_end\"])\n", "df.head()" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -205,72 +218,72 @@ " \n", " \n", " 0\n", - " chr1_818237_aligned_2_R_0_5720_0\n", - " 600\n", + " chr1_26454210_aligned_proc0:64_F_0_1535_0\n", + " 
263\n", " 0\n", - " 600\n", - " -\n", + " 263\n", + " +\n", " chr1\n", - " 1000000\n", - " 823357\n", - " 823957\n", + " 248387328\n", + " 26454210\n", + " 26454469\n", " chr1\n", " True\n", " \n", " \n", " 1\n", - " chr1_763753_aligned_21_R_0_8206_0\n", - " 800\n", - " 0\n", - " 800\n", - " -\n", - " chr1\n", - " 1000000\n", - " 771159\n", - " 771959\n", - " chr1\n", + " chr8_35080996_aligned_proc0:45_F_0_11151_0\n", + " 323\n", + " 3\n", + " 323\n", + " +\n", + " chr8\n", + " 146259331\n", + " 35081000\n", + " 35081331\n", + " chr8\n", " True\n", " \n", " \n", " 2\n", - " chr1_541945_aligned_25_R_0_5739_0\n", - " 600\n", + " chr5_119262799_aligned_proc0:51_F_0_7992_0\n", + " 265\n", " 0\n", - " 600\n", - " -\n", - " chr1\n", - " 1000000\n", - " 547084\n", - " 547684\n", - " chr1\n", + " 265\n", + " +\n", + " chr5\n", + " 182045439\n", + " 119262799\n", + " 119263072\n", + " chr5\n", " True\n", " \n", " \n", " 3\n", - " chr1_737931_aligned_46_F_0_8456_0\n", - " 800\n", + " chr2_26169279_aligned_proc0:26_F_0_10295_0\n", + " 343\n", " 0\n", - " 800\n", + " 334\n", " +\n", - " chr1\n", - " 1000000\n", - " 737931\n", - " 738731\n", - " chr1\n", + " chr2\n", + " 242696752\n", + " 26169279\n", + " 26169617\n", + " chr2\n", " True\n", " \n", " \n", " 4\n", - " chr1_826073_aligned_166_R_0_6358_0\n", - " 600\n", + " chr3_117116174_aligned_proc0:36_F_0_7402_0\n", + " 342\n", " 0\n", - " 600\n", - " -\n", - " chr1\n", - " 1000000\n", - " 831831\n", - " 832431\n", - " chr1\n", + " 342\n", + " +\n", + " chr3\n", + " 201105948\n", + " 117116174\n", + " 117116526\n", + " chr3\n", " True\n", " \n", " \n", @@ -278,19 +291,19 @@ "" ], "text/plain": [ - " read_id read_length read_start read_end \\\n", - "0 chr1_818237_aligned_2_R_0_5720_0 600 0 600 \n", - "1 chr1_763753_aligned_21_R_0_8206_0 800 0 800 \n", - "2 chr1_541945_aligned_25_R_0_5739_0 600 0 600 \n", - "3 chr1_737931_aligned_46_F_0_8456_0 800 0 800 \n", - "4 chr1_826073_aligned_166_R_0_6358_0 600 0 600 \n", + " read_id 
read_length read_start \\\n", + "0 chr1_26454210_aligned_proc0:64_F_0_1535_0 263 0 \n", + "1 chr8_35080996_aligned_proc0:45_F_0_11151_0 323 3 \n", + "2 chr5_119262799_aligned_proc0:51_F_0_7992_0 265 0 \n", + "3 chr2_26169279_aligned_proc0:26_F_0_10295_0 343 0 \n", + "4 chr3_117116174_aligned_proc0:36_F_0_7402_0 342 0 \n", "\n", - " strand contig_name contig_length contig_start contig_end chrom \\\n", - "0 - chr1 1000000 823357 823957 chr1 \n", - "1 - chr1 1000000 771159 771959 chr1 \n", - "2 - chr1 1000000 547084 547684 chr1 \n", - "3 + chr1 1000000 737931 738731 chr1 \n", - "4 - chr1 1000000 831831 832431 chr1 \n", + " read_end strand contig_name contig_length contig_start contig_end chrom \\\n", + "0 263 + chr1 248387328 26454210 26454469 chr1 \n", + "1 323 + chr8 146259331 35081000 35081331 chr8 \n", + "2 265 + chr5 182045439 119262799 119263072 chr5 \n", + "3 334 + chr2 242696752 26169279 26169617 chr2 \n", + "4 342 + chr3 201105948 117116174 117116526 chr3 \n", "\n", " mapping_correct \n", "0 True \n", @@ -300,7 +313,7 @@ "4 True " ] }, - "execution_count": 4, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -313,16 +326,16 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0.999574612897737" + "0.8156957759568824" ] }, - "execution_count": 5, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -334,7 +347,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -343,13 +356,13 @@ "Text(0, 0.5, 'Cumulative mapping correct')" ] }, - "execution_count": 6, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" }, { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] @@ -359,12 +372,13 @@ } ], "source": [ + "# can be used to see if the mapping fails initially (for small chunk indices) pointing to the mapper being overwhelmed by the sheer number of channels\n", "df[\"chunk_idx\"] = np.arange(len(df))\n", "df[\"cum_mapping_correct\"] = df[\"mapping_correct\"].cumsum()\n", "fig, ax = plt.subplots()\n", "sns.lineplot(data=df, x=\"chunk_idx\", y=\"cum_mapping_correct\", ax=ax)\n", "ax.set_xlabel(\"Chunk index\")\n", - "ax.set_ylabel(\"Cumulative mapping correct\")" + "ax.set_ylabel(\"Cumulative number of correct mappings\")" ] }, { @@ -378,7 +392,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -422,135 +436,135 @@ " \n", " \n", " 0\n", - " 2\n", + " 12\n", " 1\n", - " chr1_818237_aligned_2_R_0_5720_0\n", - " 15\n", - " chr1_818237_aligned_2_R_0_5720_0\n", - " 600\n", + " chr4_79091622_aligned_proc0:17_R_0_10859_0\n", + " 126\n", + " chr4_79091622_aligned_proc0:17_R_0_10859_0\n", + " 319\n", " 1\n", - " single_on\n", - " stop_receiving\n", - " enrich_chr_1\n", + " control\n", + " True\n", + " control\n", " False\n", " False\n", " 0.000000\n", - " 0.000218\n", + " 0.000864\n", " 0.000000\n", " \n", " \n", " 1\n", + " 12\n", " 2\n", - " 2\n", - " chr1_763753_aligned_21_R_0_8206_0\n", - " 78\n", - " chr1_763753_aligned_21_R_0_8206_0\n", - " 800\n", + " chr1_51922780_aligned_proc0:2_R_0_35088_0\n", + " 15\n", + " chr1_51922780_aligned_proc0:2_R_0_35088_0\n", + " 246\n", " 1\n", - " single_on\n", - " stop_receiving\n", - " enrich_chr_1\n", + " control\n", + " True\n", + " control\n", " False\n", " False\n", - " 0.008041\n", - " 0.008245\n", - " 0.008028\n", + " 0.001536\n", + " 0.001548\n", + " 0.000683\n", " \n", " \n", " 2\n", - " 2\n", + " 12\n", " 3\n", - " chr1_541945_aligned_25_R_0_5739_0\n", - " 86\n", - " chr1_541945_aligned_25_R_0_5739_0\n", - " 600\n", + " chr1_26454210_aligned_proc0:64_F_0_1535_0\n", + " 482\n", + " 
chr1_26454210_aligned_proc0:64_F_0_1535_0\n", + " 263\n", " 1\n", - " single_on\n", - " stop_receiving\n", - " enrich_chr_1\n", + " single_off\n", + " unblock\n", + " enrich_chr_16_20\n", " False\n", " False\n", - " 0.050249\n", - " 0.050441\n", - " 0.050222\n", + " 0.001865\n", + " 0.004453\n", + " 0.003588\n", " \n", " \n", " 3\n", - " 2\n", + " 12\n", " 4\n", - " chr1_737931_aligned_46_F_0_8456_0\n", - " 191\n", - " chr1_737931_aligned_46_F_0_8456_0\n", - " 800\n", + " chr8_35080996_aligned_proc0:45_F_0_11151_0\n", + " 314\n", + " chr8_35080996_aligned_proc0:45_F_0_11151_0\n", + " 323\n", " 1\n", - " single_on\n", - " stop_receiving\n", - " enrich_chr_1\n", + " single_off\n", + " unblock\n", + " enrich_chr_9_14\n", " False\n", " False\n", - " 0.282108\n", - " 0.282311\n", - " 0.282090\n", + " 0.004892\n", + " 0.005007\n", + " 0.004142\n", " \n", " \n", " 4\n", - " 2\n", + " 12\n", " 5\n", - " chr1_826073_aligned_166_R_0_6358_0\n", - " 165\n", - " chr1_826073_aligned_166_R_0_6358_0\n", - " 600\n", + " chr5_119262799_aligned_proc0:51_F_0_7992_0\n", + " 375\n", + " chr5_119262799_aligned_proc0:51_F_0_7992_0\n", + " 265\n", " 1\n", - " single_on\n", - " stop_receiving\n", - " enrich_chr_1\n", + " single_off\n", + " unblock\n", + " enrich_chr_9_14\n", " False\n", " False\n", - " 0.332250\n", - " 0.332455\n", - " 0.332234\n", + " 0.005365\n", + " 0.005459\n", + " 0.004594\n", " \n", " \n", "\n", "" ], "text/plain": [ - " client_iteration read_in_loop read_id \\\n", - "0 2 1 chr1_818237_aligned_2_R_0_5720_0 \n", - "1 2 2 chr1_763753_aligned_21_R_0_8206_0 \n", - "2 2 3 chr1_541945_aligned_25_R_0_5739_0 \n", - "3 2 4 chr1_737931_aligned_46_F_0_8456_0 \n", - "4 2 5 chr1_826073_aligned_166_R_0_6358_0 \n", + " client_iteration read_in_loop read_id \\\n", + "0 12 1 chr4_79091622_aligned_proc0:17_R_0_10859_0 \n", + "1 12 2 chr1_51922780_aligned_proc0:2_R_0_35088_0 \n", + "2 12 3 chr1_26454210_aligned_proc0:64_F_0_1535_0 \n", + "3 12 4 
chr8_35080996_aligned_proc0:45_F_0_11151_0 \n", + "4 12 5 chr5_119262799_aligned_proc0:51_F_0_7992_0 \n", "\n", - " channel read_number seq_len counter mode \\\n", - "0 15 chr1_818237_aligned_2_R_0_5720_0 600 1 single_on \n", - "1 78 chr1_763753_aligned_21_R_0_8206_0 800 1 single_on \n", - "2 86 chr1_541945_aligned_25_R_0_5739_0 600 1 single_on \n", - "3 191 chr1_737931_aligned_46_F_0_8456_0 800 1 single_on \n", - "4 165 chr1_826073_aligned_166_R_0_6358_0 600 1 single_on \n", + " channel read_number seq_len counter \\\n", + "0 126 chr4_79091622_aligned_proc0:17_R_0_10859_0 319 1 \n", + "1 15 chr1_51922780_aligned_proc0:2_R_0_35088_0 246 1 \n", + "2 482 chr1_26454210_aligned_proc0:64_F_0_1535_0 263 1 \n", + "3 314 chr8_35080996_aligned_proc0:45_F_0_11151_0 323 1 \n", + "4 375 chr5_119262799_aligned_proc0:51_F_0_7992_0 265 1 \n", "\n", - " decision condition min_threshold count_threshold \\\n", - "0 stop_receiving enrich_chr_1 False False \n", - "1 stop_receiving enrich_chr_1 False False \n", - "2 stop_receiving enrich_chr_1 False False \n", - "3 stop_receiving enrich_chr_1 False False \n", - "4 stop_receiving enrich_chr_1 False False \n", + " mode decision condition min_threshold count_threshold \\\n", + "0 control True control False False \n", + "1 control True control False False \n", + "2 single_off unblock enrich_chr_16_20 False False \n", + "3 single_off unblock enrich_chr_9_14 False False \n", + "4 single_off unblock enrich_chr_9_14 False False \n", "\n", " start_analysis end_analysis timestamp \n", - "0 0.000000 0.000218 0.000000 \n", - "1 0.008041 0.008245 0.008028 \n", - "2 0.050249 0.050441 0.050222 \n", - "3 0.282108 0.282311 0.282090 \n", - "4 0.332250 0.332455 0.332234 " + "0 0.000000 0.000864 0.000000 \n", + "1 0.001536 0.001548 0.000683 \n", + "2 0.001865 0.004453 0.003588 \n", + "3 0.004892 0.005007 0.004142 \n", + "4 0.005365 0.005459 0.004594 " ] }, - "execution_count": 12, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } 
], "source": [ - "chunk_log = \"runs/enrich_usecase/full_run_sampler_per_window/simulator_run/chunk_log.txt\"\n", + "chunk_log = run_dir / \"chunk_log.txt\"\n", "chunk_df = pd.read_csv(chunk_log, sep=\"\\t\")\n", "first_time = chunk_df[\"start_analysis\"].min()\n", "chunk_df[\"start_analysis\"] -= first_time\n", @@ -563,9 +577,160 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 21, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The following reads have contradicting decisions: ['chr10_46560475_aligned_proc1:8557_R_0_10144_0'\n", + " 'chr10_51022679_aligned_proc0:15129_R_0_4742_0'\n", + " 'chr10_57355202_aligned_proc1:2861_R_0_24476_0'\n", + " 'chr10_95690626_aligned_proc0:22338_F_0_13966_0'\n", + " 'chr11_13489989_aligned_proc1:5601_F_0_3642_0'\n", + " 'chr11_3443736_aligned_proc0:17762_F_0_1555_0'\n", + " 'chr11_65986646_aligned_proc0:44259_R_0_8924_0'\n", + " 'chr11_67699121_aligned_proc1:32911_F_0_18565_0'\n", + " 'chr11_67773251_aligned_proc0:46991_F_0_1263_0'\n", + " 'chr11_79041662_aligned_proc1:11941_R_0_30699_0'\n", + " 'chr11_83978157_aligned_proc0:45408_F_0_988_0'\n", + " 'chr11_93901830_aligned_proc0:1066_R_0_30342_0'\n", + " 'chr12_100988723_aligned_proc1:9500_R_0_6199_0'\n", + " 'chr12_102809304_aligned_proc0:43933_R_0_9229_0'\n", + " 'chr12_18717659_aligned_proc1:22380_F_0_4309_0'\n", + " 'chr12_35213329_aligned_proc1:37488_F_0_10051_0'\n", + " 'chr12_50869782_aligned_proc1:10980_F_0_7198_0'\n", + " 'chr12_69368901_aligned_proc0:18483_R_0_7777_0'\n", + " 'chr13_48384073_aligned_proc1:42729_F_0_9275_0'\n", + " 'chr13_89341620_aligned_proc1:35990_R_0_7831_0'\n", + " 'chr14_2593612_aligned_proc0:31067_R_0_4084_0'\n", + " 'chr14_39765478_aligned_proc1:3662_R_0_6925_0'\n", + " 'chr14_7252598_aligned_proc1:4960_R_0_18754_0'\n", + " 'chr14_8281639_aligned_proc0:46007_R_0_14063_0'\n", + " 'chr14_8492702_aligned_proc1:16602_R_0_6974_0'\n", + " 
'chr15_19557707_aligned_proc1:37344_F_0_3725_0'\n", + " 'chr15_25967950_aligned_proc1:46938_R_0_14443_0'\n", + " 'chr15_26063686_aligned_proc1:47323_F_0_3605_0'\n", + " 'chr15_26993023_aligned_proc0:37907_F_0_12110_0'\n", + " 'chr15_38387169_aligned_proc1:33753_F_0_13189_0'\n", + " 'chr15_3870910_aligned_proc0:26542_R_0_3762_0'\n", + " 'chr15_44915189_aligned_proc0:48803_F_0_10344_0'\n", + " 'chr16_52149344_aligned_proc0:25461_R_0_7199_0'\n", + " 'chr16_71069423_aligned_proc0:6274_F_0_871_0'\n", + " 'chr16_76408721_aligned_proc0:37110_R_0_20934_0'\n", + " 'chr17_16177646_aligned_proc1:32996_R_0_957_0'\n", + " 'chr18_14193416_aligned_proc1:34833_R_0_13101_0'\n", + " 'chr18_17587817_aligned_proc0:10739_F_0_7250_0'\n", + " 'chr18_2990210_aligned_proc0:23579_F_0_1180_0'\n", + " 'chr19_18144550_aligned_proc0:30723_R_0_22472_0'\n", + " 'chr19_25360455_aligned_proc1:18421_R_0_7774_0'\n", + " 'chr19_26242398_aligned_proc1:18836_F_0_9834_0'\n", + " 'chr19_39464877_aligned_proc1:34998_R_0_14918_0'\n", + " 'chr1_124032594_aligned_proc1:39712_R_0_4410_0'\n", + " 'chr1_128420334_aligned_proc1:8631_F_0_19655_0'\n", + " 'chr1_128653974_aligned_proc0:44830_R_0_883_0'\n", + " 'chr1_135842843_aligned_proc1:19491_F_0_12556_0'\n", + " 'chr1_162080599_aligned_proc1:2487_R_0_15422_0'\n", + " 'chr1_96214307_aligned_proc0:39567_R_0_15558_0'\n", + " 'chr1_9968920_aligned_proc1:37847_F_0_7081_0'\n", + " 'chr20_15197783_aligned_proc1:46479_F_0_13466_0'\n", + " 'chr20_25158745_aligned_proc1:34813_R_0_12951_0'\n", + " 'chr20_26089332_aligned_proc1:33096_F_0_4777_0'\n", + " 'chr20_30217908_aligned_proc0:18726_R_0_11330_0'\n", + " 'chr20_30418603_aligned_proc0:2936_F_0_5432_0'\n", + " 'chr20_30594460_aligned_proc0:44865_F_0_6238_0'\n", + " 'chr20_31623226_aligned_proc0:42643_R_0_7972_0'\n", + " 'chr20_35870999_aligned_proc0:38245_R_0_12794_0'\n", + " 'chr20_40792048_aligned_proc0:33537_F_0_2262_0'\n", + " 'chr20_61317321_aligned_proc1:49070_F_0_7010_0'\n", + " 
'chr21_11161655_aligned_proc1:17388_R_0_7114_0'\n", + " 'chr21_23612599_aligned_proc0:21285_R_0_12620_0'\n", + " 'chr21_41489503_aligned_proc1:26980_R_0_17814_0'\n", + " 'chr21_5248173_aligned_proc0:30336_F_0_7183_0'\n", + " 'chr21_6169884_aligned_proc1:29955_R_0_6273_0'\n", + " 'chr22_11310596_aligned_proc0:21221_F_0_9981_0'\n", + " 'chr22_16107138_aligned_proc0:13091_F_0_3196_0'\n", + " 'chr22_21681969_aligned_proc0:45221_F_0_12802_0'\n", + " 'chr22_25111818_aligned_proc0:20384_R_0_4431_0'\n", + " 'chr22_31832726_aligned_proc1:29276_R_0_20265_0'\n", + " 'chr22_31867035_aligned_proc1:34519_R_0_952_0'\n", + " 'chr22_5446207_aligned_proc0:24705_R_0_6289_0'\n", + " 'chr2_140751766_aligned_proc1:28348_F_0_10596_0'\n", + " 'chr2_146853335_aligned_proc1:2200_F_0_8993_0'\n", + " 'chr2_168253672_aligned_proc1:30454_R_0_2784_0'\n", + " 'chr2_190400513_aligned_proc1:6854_R_0_12491_0'\n", + " 'chr2_198206518_aligned_proc0:41877_R_0_7323_0'\n", + " 'chr2_54474365_aligned_proc0:43024_R_0_8114_0'\n", + " 'chr2_74800789_aligned_proc0:7329_F_0_3593_0'\n", + " 'chr3_101089418_aligned_proc0:35024_R_0_11684_0'\n", + " 'chr3_132923394_aligned_proc1:24600_F_0_947_0'\n", + " 'chr3_143371346_aligned_proc1:43167_R_0_1204_0'\n", + " 'chr3_173897725_aligned_proc0:49640_R_0_5091_0'\n", + " 'chr4_108001666_aligned_proc0:41302_R_0_1142_0'\n", + " 'chr4_129670423_aligned_proc1:12925_R_0_12606_0'\n", + " 'chr4_158225635_aligned_proc0:34109_R_0_26976_0'\n", + " 'chr4_161631671_aligned_proc0:19883_F_0_1768_0'\n", + " 'chr4_162001002_aligned_proc1:17077_R_0_13868_0'\n", + " 'chr4_16635_aligned_proc0:1491_F_0_5297_0'\n", + " 'chr4_193069225_aligned_proc1:30482_R_0_3536_0'\n", + " 'chr4_193469036_aligned_proc0:42397_R_0_13000_0'\n", + " 'chr4_32721384_aligned_proc0:12985_R_0_12866_0'\n", + " 'chr4_46251507_aligned_proc0:8453_R_0_11417_0'\n", + " 'chr4_52459859_aligned_proc1:49498_F_0_7552_0'\n", + " 'chr4_57204653_aligned_proc1:25149_R_0_11764_0'\n", + " 
'chr4_67388664_aligned_proc1:13243_F_0_12873_0'\n", + " 'chr4_9583325_aligned_proc1:14902_F_0_630_0'\n", + " 'chr5_146003101_aligned_proc0:14935_R_0_944_0'\n", + " 'chr5_42989255_aligned_proc1:23663_F_0_10686_0'\n", + " 'chr5_56545386_aligned_proc1:46505_F_0_11313_0'\n", + " 'chr5_79513949_aligned_proc0:26723_R_0_6100_0'\n", + " 'chr6_19388917_aligned_proc0:5578_R_0_9180_0'\n", + " 'chr6_75878370_aligned_proc0:4691_R_0_720_0'\n", + " 'chr7_116477133_aligned_proc1:44199_R_0_1709_0'\n", + " 'chr7_151809288_aligned_proc0:6781_R_0_14919_0'\n", + " 'chr7_58029291_aligned_proc0:29200_F_0_20787_0'\n", + " 'chr7_58099971_aligned_proc1:44551_F_0_7750_0'\n", + " 'chr7_66727152_aligned_proc1:40767_R_0_3805_0'\n", + " 'chr8_99847749_aligned_proc0:760_R_0_16778_0'\n", + " 'chr9_105972060_aligned_proc1:42632_F_0_9583_0'\n", + " 'chr9_129852539_aligned_proc1:8824_F_0_3056_0'\n", + " 'chr9_30102964_aligned_proc1:398_R_0_23344_0'\n", + " 'chr9_33661732_aligned_proc0:45082_R_0_18589_0'\n", + " 'chr9_40309392_aligned_proc1:49611_F_0_19854_0'\n", + " 'chr9_40454857_aligned_proc0:33702_F_0_1068_0'\n", + " 'chr9_42868514_aligned_proc0:31014_F_0_6221_0'\n", + " 'chr9_45813582_aligned_proc1:36286_F_0_10326_0'\n", + " 'chr9_75329071_aligned_proc1:30586_F_0_11208_0'\n", + " 'chr9_77335760_aligned_proc1:5947_F_0_2326_0'\n", + " 'chrX_104593026_aligned_proc0:45220_R_0_22565_0'\n", + " 'chrX_110913597_aligned_proc1:5015_R_0_3464_0'\n", + " 'chrX_111331_aligned_proc1:34711_F_0_10062_0'\n", + " 'chrX_113112332_aligned_proc1:515_R_0_1292_0'\n", + " 'chrX_115736885_aligned_proc0:4644_F_0_18129_0'\n", + " 'chrX_119653213_aligned_proc0:21547_R_0_12532_0'\n", + " 'chrX_124386630_aligned_proc0:19718_R_0_23355_0'\n", + " 'chrX_129152314_aligned_proc1:7330_R_0_8222_0'\n", + " 'chrX_140736248_aligned_proc1:24214_F_0_1410_0'\n", + " 'chrX_153106503_aligned_proc0:45685_F_0_1234_0'\n", + " 'chrX_153558546_aligned_proc0:32738_F_0_5713_0'\n", + " 'chrX_20729273_aligned_proc0:26535_R_0_11241_0'\n", + " 
'chrX_26075894_aligned_proc1:45825_R_0_3721_0'\n", + " 'chrX_5037558_aligned_proc1:3909_F_0_19167_0'\n", + " 'chrX_61668685_aligned_proc1:18226_R_0_12795_0'\n", + " 'chrX_63309112_aligned_proc0:49759_F_0_15355_0'\n", + " 'chrX_64775308_aligned_proc1:35335_F_0_23726_0'\n", + " 'chrX_6488382_aligned_proc0:6136_R_0_3175_0'\n", + " 'chrX_65679376_aligned_proc0:42394_F_0_2851_0'\n", + " 'chrY_11353602_aligned_proc0:6949_F_0_22804_0'\n", + " 'chrY_24179543_aligned_proc1:8323_F_0_3643_0'\n", + " 'chrY_3830730_aligned_proc1:37676_R_0_6535_0'\n", + " 'chrY_40962526_aligned_proc1:6606_F_0_24037_0'\n", + " 'chrY_52876394_aligned_proc0:8058_F_0_13033_0'\n", + " 'chrY_8103282_aligned_proc0:36625_R_0_11099_0']\n" + ] + } + ], "source": [ "# check whether some reads have contradicting decisions (except for \"proceed\")\n", "df1 = chunk_df[chunk_df[\"decision\"] != \"proceed\"]\n", @@ -577,7 +742,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -625,119 +790,119 @@ " \n", " \n", " \n", - " 4555\n", - " chr1_818237_aligned_2_R_0_5720_0\n", - " 2\n", + " 1524713\n", + " chr4_79091622_aligned_proc0:17_R_0_10859_0\n", + " 12\n", " 1\n", - " 15\n", - " chr1_818237_aligned_2_R_0_5720_0\n", - " 600\n", + " 126\n", + " chr4_79091622_aligned_proc0:17_R_0_10859_0\n", + " 319\n", " 1\n", - " single_on\n", - " stop_receiving\n", - " enrich_chr_1\n", + " control\n", + " True\n", + " control\n", " False\n", " False\n", " 0.000000\n", - " 0.000218\n", + " 0.000864\n", " 0.000000\n", - " chr1\n", + " chr4\n", + " False\n", " False\n", - " True\n", " 0\n", - " 1\n", + " 0\n", " \n", " \n", - " 4196\n", - " chr1_763753_aligned_21_R_0_8206_0\n", + " 912219\n", + " chr1_51922780_aligned_proc0:2_R_0_35088_0\n", + " 12\n", " 2\n", - " 2\n", - " 78\n", - " chr1_763753_aligned_21_R_0_8206_0\n", - " 800\n", + " 15\n", + " chr1_51922780_aligned_proc0:2_R_0_35088_0\n", + " 246\n", " 1\n", - " single_on\n", - " stop_receiving\n", - " 
enrich_chr_1\n", + " control\n", + " True\n", + " control\n", " False\n", " False\n", - " 0.008041\n", - " 0.008245\n", - " 0.008028\n", + " 0.001536\n", + " 0.001548\n", + " 0.000683\n", " chr1\n", " False\n", - " True\n", + " False\n", + " 0\n", " 0\n", - " 2\n", " \n", " \n", - " 2810\n", - " chr1_541945_aligned_25_R_0_5739_0\n", - " 2\n", + " 891103\n", + " chr1_26454210_aligned_proc0:64_F_0_1535_0\n", + " 12\n", " 3\n", - " 86\n", - " chr1_541945_aligned_25_R_0_5739_0\n", - " 600\n", + " 482\n", + " chr1_26454210_aligned_proc0:64_F_0_1535_0\n", + " 263\n", " 1\n", - " single_on\n", - " stop_receiving\n", - " enrich_chr_1\n", + " single_off\n", + " unblock\n", + " enrich_chr_16_20\n", " False\n", " False\n", - " 0.050249\n", - " 0.050441\n", - " 0.050222\n", + " 0.001865\n", + " 0.004453\n", + " 0.003588\n", " chr1\n", - " False\n", " True\n", + " False\n", + " 1\n", " 0\n", - " 3\n", " \n", " \n", - " 4014\n", - " chr1_737931_aligned_46_F_0_8456_0\n", - " 2\n", + " 1976384\n", + " chr8_35080996_aligned_proc0:45_F_0_11151_0\n", + " 12\n", " 4\n", - " 191\n", - " chr1_737931_aligned_46_F_0_8456_0\n", - " 800\n", + " 314\n", + " chr8_35080996_aligned_proc0:45_F_0_11151_0\n", + " 323\n", " 1\n", - " single_on\n", - " stop_receiving\n", - " enrich_chr_1\n", - " False\n", + " single_off\n", + " unblock\n", + " enrich_chr_9_14\n", " False\n", - " 0.282108\n", - " 0.282311\n", - " 0.282090\n", - " chr1\n", " False\n", + " 0.004892\n", + " 0.005007\n", + " 0.004142\n", + " chr8\n", " True\n", + " False\n", + " 1\n", " 0\n", - " 4\n", " \n", " \n", - " 4608\n", - " chr1_826073_aligned_166_R_0_6358_0\n", - " 2\n", + " 1557842\n", + " chr5_119262799_aligned_proc0:51_F_0_7992_0\n", + " 12\n", " 5\n", - " 165\n", - " chr1_826073_aligned_166_R_0_6358_0\n", - " 600\n", + " 375\n", + " chr5_119262799_aligned_proc0:51_F_0_7992_0\n", + " 265\n", " 1\n", - " single_on\n", - " stop_receiving\n", - " enrich_chr_1\n", - " False\n", + " single_off\n", + " unblock\n", + " 
enrich_chr_9_14\n", " False\n", - " 0.332250\n", - " 0.332455\n", - " 0.332234\n", - " chr1\n", " False\n", + " 0.005365\n", + " 0.005459\n", + " 0.004594\n", + " chr5\n", " True\n", + " False\n", + " 1\n", " 0\n", - " 5\n", " \n", " \n", " ...\n", @@ -763,119 +928,119 @@ " ...\n", " \n", " \n", - " 10904\n", - " chr2_948429_aligned_11335_R_0_7314_0\n", - " 258\n", - " 57\n", - " 331\n", - " chr2_948429_aligned_11335_R_0_7314_0\n", - " 5800\n", - " 1\n", - " single_off\n", - " unblock\n", - " enrich_chr_1\n", - " False\n", - " False\n", - " 53.131931\n", - " 53.132076\n", - " 53.131196\n", - " chr2\n", - " True\n", - " False\n", - " 4938\n", - " 3\n", - " \n", - " \n", - " 6312\n", - " chr2_208090_aligned_11673_F_0_5703_0\n", - " 258\n", - " 58\n", - " 476\n", - " chr2_208090_aligned_11673_F_0_5703_0\n", + " 2115686\n", + " chr9_68107336_aligned_proc0:9822_R_0_6730_0\n", + " 2449\n", + " 22\n", + " 254\n", + " chr9_68107336_aligned_proc0:9822_R_0_6730_0\n", " 200\n", - " 1\n", - " single_off\n", - " unblock\n", - " enrich_chr_1\n", - " False\n", + " 12\n", + " exceeded_max_chunks_unblocked\n", + " exceeded_max_chunks_unblocked\n", + " enrich_chr_1_8\n", " False\n", - " 53.132271\n", - " 53.132340\n", - " 53.131459\n", - " chr2\n", " True\n", + " 128.434243\n", + " 128.434250\n", + " 128.433385\n", + " chr9\n", " False\n", - " 4939\n", - " 3\n", + " False\n", + " 263\n", + " 133\n", " \n", " \n", - " 9514\n", - " chr2_72326_aligned_11545_R_0_6764_0\n", - " 258\n", - " 59\n", - " 461\n", - " chr2_72326_aligned_11545_R_0_6764_0\n", - " 2000\n", + " 282185\n", + " chr12_82202404_aligned_proc0:10025_R_0_11764_0\n", + " 2449\n", + " 24\n", + " 269\n", + " chr12_82202404_aligned_proc0:10025_R_0_11764_0\n", + " 342\n", " 1\n", - " single_off\n", - " unblock\n", - " enrich_chr_1\n", + " single_on\n", + " stop_receiving\n", + " enrich_chr_9_14\n", " False\n", " False\n", - " 53.133177\n", - " 53.133263\n", - " 53.132383\n", - " chr2\n", - " True\n", + " 128.434833\n", + " 
128.434892\n", + " 128.434027\n", + " chr12\n", " False\n", - " 4940\n", - " 3\n", + " True\n", + " 262\n", + " 131\n", " \n", " \n", - " 1240\n", - " chr1_294523_aligned_11554_F_0_9141_0\n", - " 258\n", - " 60\n", - " 144\n", - " chr1_294523_aligned_11554_F_0_9141_0\n", - " 1800\n", + " 321565\n", + " chr13_28038874_aligned_proc0:10008_R_0_1007_0\n", + " 2449\n", + " 25\n", + " 352\n", + " chr13_28038874_aligned_proc0:10008_R_0_1007_0\n", + " 384\n", " 1\n", " single_on\n", " stop_receiving\n", - " enrich_chr_1\n", + " enrich_chr_9_14\n", " False\n", " False\n", - " 53.134323\n", - " 53.134421\n", - " 53.133541\n", - " chr1\n", + " 128.435300\n", + " 128.435361\n", + " 128.434495\n", + " chr13\n", " False\n", " True\n", - " 0\n", - " 5056\n", + " 204\n", + " 106\n", " \n", " \n", - " 9906\n", - " chr2_79073_aligned_11329_F_0_9603_0\n", - " 258\n", - " 61\n", - " 416\n", - " chr2_79073_aligned_11329_F_0_9603_0\n", - " 5800\n", + " 1604331\n", + " chr5_175553425_aligned_proc0:10034_F_0_7706_0\n", + " 2449\n", + " 26\n", + " 464\n", + " chr5_175553425_aligned_proc0:10034_F_0_7706_0\n", + " 225\n", " 1\n", " single_off\n", " unblock\n", - " enrich_chr_1\n", + " enrich_chr_16_20\n", " False\n", " False\n", - " 53.136868\n", - " 53.137215\n", - " 53.136335\n", - " chr2\n", + " 128.435569\n", + " 128.435622\n", + " 128.434756\n", + " chr5\n", " True\n", " False\n", - " 4941\n", - " 3\n", + " 372\n", + " 126\n", + " \n", + " \n", + " 2063327\n", + " chr9_139979310_aligned_proc0:10028_R_0_6990_0\n", + " 2449\n", + " 27\n", + " 300\n", + " chr9_139979310_aligned_proc0:10028_R_0_6990_0\n", + " 354\n", + " 1\n", + " single_on\n", + " stop_receiving\n", + " enrich_chr_9_14\n", + " False\n", + " False\n", + " 128.435911\n", + " 128.435969\n", + " 128.435104\n", + " chr9\n", + " False\n", + " True\n", + " 263\n", + " 134\n", " \n", " \n", "\n", @@ -883,88 +1048,101 @@ "" ], "text/plain": [ - " read_id client_iteration read_in_loop \\\n", - "4555 chr1_818237_aligned_2_R_0_5720_0 
2 1 \n", - "4196 chr1_763753_aligned_21_R_0_8206_0 2 2 \n", - "2810 chr1_541945_aligned_25_R_0_5739_0 2 3 \n", - "4014 chr1_737931_aligned_46_F_0_8456_0 2 4 \n", - "4608 chr1_826073_aligned_166_R_0_6358_0 2 5 \n", - "... ... ... ... \n", - "10904 chr2_948429_aligned_11335_R_0_7314_0 258 57 \n", - "6312 chr2_208090_aligned_11673_F_0_5703_0 258 58 \n", - "9514 chr2_72326_aligned_11545_R_0_6764_0 258 59 \n", - "1240 chr1_294523_aligned_11554_F_0_9141_0 258 60 \n", - "9906 chr2_79073_aligned_11329_F_0_9603_0 258 61 \n", + " read_id client_iteration \\\n", + "1524713 chr4_79091622_aligned_proc0:17_R_0_10859_0 12 \n", + "912219 chr1_51922780_aligned_proc0:2_R_0_35088_0 12 \n", + "891103 chr1_26454210_aligned_proc0:64_F_0_1535_0 12 \n", + "1976384 chr8_35080996_aligned_proc0:45_F_0_11151_0 12 \n", + "1557842 chr5_119262799_aligned_proc0:51_F_0_7992_0 12 \n", + "... ... ... \n", + "2115686 chr9_68107336_aligned_proc0:9822_R_0_6730_0 2449 \n", + "282185 chr12_82202404_aligned_proc0:10025_R_0_11764_0 2449 \n", + "321565 chr13_28038874_aligned_proc0:10008_R_0_1007_0 2449 \n", + "1604331 chr5_175553425_aligned_proc0:10034_F_0_7706_0 2449 \n", + "2063327 chr9_139979310_aligned_proc0:10028_R_0_6990_0 2449 \n", "\n", - " channel read_number seq_len counter \\\n", - "4555 15 chr1_818237_aligned_2_R_0_5720_0 600 1 \n", - "4196 78 chr1_763753_aligned_21_R_0_8206_0 800 1 \n", - "2810 86 chr1_541945_aligned_25_R_0_5739_0 600 1 \n", - "4014 191 chr1_737931_aligned_46_F_0_8456_0 800 1 \n", - "4608 165 chr1_826073_aligned_166_R_0_6358_0 600 1 \n", - "... ... ... ... ... 
\n", - "10904 331 chr2_948429_aligned_11335_R_0_7314_0 5800 1 \n", - "6312 476 chr2_208090_aligned_11673_F_0_5703_0 200 1 \n", - "9514 461 chr2_72326_aligned_11545_R_0_6764_0 2000 1 \n", - "1240 144 chr1_294523_aligned_11554_F_0_9141_0 1800 1 \n", - "9906 416 chr2_79073_aligned_11329_F_0_9603_0 5800 1 \n", + " read_in_loop channel \\\n", + "1524713 1 126 \n", + "912219 2 15 \n", + "891103 3 482 \n", + "1976384 4 314 \n", + "1557842 5 375 \n", + "... ... ... \n", + "2115686 22 254 \n", + "282185 24 269 \n", + "321565 25 352 \n", + "1604331 26 464 \n", + "2063327 27 300 \n", "\n", - " mode decision condition min_threshold \\\n", - "4555 single_on stop_receiving enrich_chr_1 False \n", - "4196 single_on stop_receiving enrich_chr_1 False \n", - "2810 single_on stop_receiving enrich_chr_1 False \n", - "4014 single_on stop_receiving enrich_chr_1 False \n", - "4608 single_on stop_receiving enrich_chr_1 False \n", - "... ... ... ... ... \n", - "10904 single_off unblock enrich_chr_1 False \n", - "6312 single_off unblock enrich_chr_1 False \n", - "9514 single_off unblock enrich_chr_1 False \n", - "1240 single_on stop_receiving enrich_chr_1 False \n", - "9906 single_off unblock enrich_chr_1 False \n", + " read_number seq_len counter \\\n", + "1524713 chr4_79091622_aligned_proc0:17_R_0_10859_0 319 1 \n", + "912219 chr1_51922780_aligned_proc0:2_R_0_35088_0 246 1 \n", + "891103 chr1_26454210_aligned_proc0:64_F_0_1535_0 263 1 \n", + "1976384 chr8_35080996_aligned_proc0:45_F_0_11151_0 323 1 \n", + "1557842 chr5_119262799_aligned_proc0:51_F_0_7992_0 265 1 \n", + "... ... ... ... 
\n", + "2115686 chr9_68107336_aligned_proc0:9822_R_0_6730_0 200 12 \n", + "282185 chr12_82202404_aligned_proc0:10025_R_0_11764_0 342 1 \n", + "321565 chr13_28038874_aligned_proc0:10008_R_0_1007_0 384 1 \n", + "1604331 chr5_175553425_aligned_proc0:10034_F_0_7706_0 225 1 \n", + "2063327 chr9_139979310_aligned_proc0:10028_R_0_6990_0 354 1 \n", "\n", - " count_threshold start_analysis end_analysis timestamp chrom \\\n", - "4555 False 0.000000 0.000218 0.000000 chr1 \n", - "4196 False 0.008041 0.008245 0.008028 chr1 \n", - "2810 False 0.050249 0.050441 0.050222 chr1 \n", - "4014 False 0.282108 0.282311 0.282090 chr1 \n", - "4608 False 0.332250 0.332455 0.332234 chr1 \n", - "... ... ... ... ... ... \n", - "10904 False 53.131931 53.132076 53.131196 chr2 \n", - "6312 False 53.132271 53.132340 53.131459 chr2 \n", - "9514 False 53.133177 53.133263 53.132383 chr2 \n", - "1240 False 53.134323 53.134421 53.133541 chr1 \n", - "9906 False 53.136868 53.137215 53.136335 chr2 \n", + " mode decision \\\n", + "1524713 control True \n", + "912219 control True \n", + "891103 single_off unblock \n", + "1976384 single_off unblock \n", + "1557842 single_off unblock \n", + "... ... ... \n", + "2115686 exceeded_max_chunks_unblocked exceeded_max_chunks_unblocked \n", + "282185 single_on stop_receiving \n", + "321565 single_on stop_receiving \n", + "1604331 single_off unblock \n", + "2063327 single_on stop_receiving \n", "\n", - " is_rejection is_stopreceiving cum_nb_rejections_per_chrom \\\n", - "4555 False True 0 \n", - "4196 False True 0 \n", - "2810 False True 0 \n", - "4014 False True 0 \n", - "4608 False True 0 \n", - "... ... ... ... 
\n", - "10904 True False 4938 \n", - "6312 True False 4939 \n", - "9514 True False 4940 \n", - "1240 False True 0 \n", - "9906 True False 4941 \n", + " condition min_threshold count_threshold start_analysis \\\n", + "1524713 control False False 0.000000 \n", + "912219 control False False 0.001536 \n", + "891103 enrich_chr_16_20 False False 0.001865 \n", + "1976384 enrich_chr_9_14 False False 0.004892 \n", + "1557842 enrich_chr_9_14 False False 0.005365 \n", + "... ... ... ... ... \n", + "2115686 enrich_chr_1_8 False True 128.434243 \n", + "282185 enrich_chr_9_14 False False 128.434833 \n", + "321565 enrich_chr_9_14 False False 128.435300 \n", + "1604331 enrich_chr_16_20 False False 128.435569 \n", + "2063327 enrich_chr_9_14 False False 128.435911 \n", "\n", - " cum_nb_stopreceiving_per_chrom \n", - "4555 1 \n", - "4196 2 \n", - "2810 3 \n", - "4014 4 \n", - "4608 5 \n", - "... ... \n", - "10904 3 \n", - "6312 3 \n", - "9514 3 \n", - "1240 5056 \n", - "9906 3 \n", + " end_analysis timestamp chrom is_rejection is_stopreceiving \\\n", + "1524713 0.000864 0.000000 chr4 False False \n", + "912219 0.001548 0.000683 chr1 False False \n", + "891103 0.004453 0.003588 chr1 True False \n", + "1976384 0.005007 0.004142 chr8 True False \n", + "1557842 0.005459 0.004594 chr5 True False \n", + "... ... ... ... ... ... \n", + "2115686 128.434250 128.433385 chr9 False False \n", + "282185 128.434892 128.434027 chr12 False True \n", + "321565 128.435361 128.434495 chr13 False True \n", + "1604331 128.435622 128.434756 chr5 True False \n", + "2063327 128.435969 128.435104 chr9 False True \n", + "\n", + " cum_nb_rejections_per_chrom cum_nb_stopreceiving_per_chrom \n", + "1524713 0 0 \n", + "912219 0 0 \n", + "891103 1 0 \n", + "1976384 1 0 \n", + "1557842 1 0 \n", + "... ... ... 
\n", + "2115686 263 133 \n", + "282185 262 131 \n", + "321565 204 106 \n", + "1604331 372 126 \n", + "2063327 263 134 \n", "\n", "[10000 rows x 20 columns]" ] }, - "execution_count": 14, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -979,12 +1157,12 @@ "chunk_df[\"is_stopreceiving\"] = chunk_df[\"decision\"].apply(lambda x: x == \"stop_receiving\")\n", "chunk_df[\"cum_nb_rejections_per_chrom\"] = chunk_df.groupby(\"chrom\", observed=True)[\"is_rejection\"].cumsum()\n", "chunk_df[\"cum_nb_stopreceiving_per_chrom\"] = chunk_df.groupby(\"chrom\", observed=True)[\"is_stopreceiving\"].cumsum()\n", - "chunk_df.head(10000)" + "chunk_df.head(10)" ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -993,13 +1171,13 @@ "Text(0.5, 1.0, 'Cumulative number of stopreceiving per chromosome')" ] }, - "execution_count": 21, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" }, { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] diff --git a/usecases/compare_replication_methods.ipynb b/usecases/compare_replication_methods.ipynb index c765837..db492ad 100644 --- a/usecases/compare_replication_methods.ipynb +++ b/usecases/compare_replication_methods.ipynb @@ -31,7 +31,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -53,7 +53,137 @@ " ax.autoscale()\n", " make_tight_layout(ax.figure)\n", "\n", - " return fig\n", + " return fig\n" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": {}, + "outputs": [], + "source": [ + "# # merging the plots\n", + "# fig = figs[\"1\"]\n", + "# fig\n", + "\n", + "# for (name, fig) in figs.items():\n", + "# print(name)\n", + "# ax = fig.axes[0]\n", + "# data = ax.get_children()[0].get_offsets().data.T\n", + "# plt.plot(*data, ls=\"None\", marker=\".\", markersize=1)\n", + " \n", + "# # test for merging two plots\n", + "\n", + "# n_points = 50\n", + "# xvals = np.arange(n_points)\n", + "# df1 = pd.DataFrame({\"x\": xvals, \"y\": np.random.randn(n_points), \"group\": np.random.choice([\"a\", \"b\"], n_points)})\n", + "# sns.lmplot(df1, x=\"x\", y=\"y\", hue=\"group\", scatter_kws={\"s\": 1})\n", + "# fig1 = plt.gcf()\n", + "\n", + "# xvals = np.arange(n_points) + 0.5\n", + "# df2 = pd.DataFrame({\"x\": xvals, \"y\": np.random.randn(n_points) * 20, \"group\": np.random.choice([\"a\", \"b\"], n_points)})\n", + "# sns.lmplot(df2, x=\"x\", y=\"y\", hue=\"group\", scatter_kws={\"s\": 1})\n", + "# fig2 = plt.gcf()\n", + "\n", + "# figs = {\"1\": fig1, \"2\": fig2}\n", + "\n", + "# for (name, fig) in figs.items():\n", + "# print(name)\n", + "# ax = fig.axes[0]\n", + "# data = ax.get_children()[0].get_offsets().data.T\n", + "# plt.plot(*data, ls=\"None\", marker=\".\", markersize=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "metadata": {}, + 
"outputs": [ + { + "data": { + "text/plain": [ + "Text(0.5, 0.98, 'Read length of rejected reads over time for different acceleration factors')" + ] + }, + "execution_count": 107, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "\n", + "# base_dir = Path(\"/Volumes/mmordig/ont_project/runs/run_replication/runs/run_replication\")\n", + "base_dir = Path(\"/home/mmordig/ont_project_all/ont_project/runs/enrich_usecase/\")\n", + "figure_dirs = {\n", + " \"1\": base_dir / \"results_accel1/simulator_run/figures/pickled_figures\",\n", + " \"3\": base_dir / \"results_accel3/simulator_run/figures/pickled_figures\",\n", + " \"5\": base_dir / \"results_accel5/simulator_run/figures/pickled_figures\",\n", + " \"7.5\": base_dir / \"results_accel7.5/simulator_run/figures/pickled_figures\",\n", + " \"10\": base_dir / \"results_accel10/simulator_run/figures/pickled_figures\",\n", + "}\n", + "\n", + "figure_basename = \"read_length_rejected.dill\"\n", + "\n", + "figs = {name: dill_load(figure_dir / figure_basename) for name, figure_dir in figure_dirs.items()}\n", + "[plt.close(fig) for fig in figs.values()]\n", + "named_axes = {name: fig.axes[0] for name, fig in figs.items()}\n", + "\n", + "# # plot number rather than fraction of active channels\n", + "# for original_ax in named_axes.values():\n", + "# # parse title of the form:\n", + "# # f\"Fraction of active channels over time ({n_channels} active channels)\"\n", + "# n_channels = int(original_ax.get_title().split(\"(\")[-1].split(\" \")[0])\n", + "# line = original_ax.lines[0]\n", + "# print(n_channels)\n", + "# line.set_ydata(np.array(line.get_ydata()) / 100 * n_channels)\n", + "# # ax.autoscale()\n", + "\n", + "fig = merge_axes_into_one(named_axes)\n", + "ax = fig.axes[0]\n", + "\n", + "fig.suptitle(\"Read length of rejected reads over time for different acceleration factors\")\n", + "# ax.set_ylabel(\"Number of reading channels\")\n", + "# ax.set_title(f\"Number of reading channels over time\") # number of active channels varies between sequencing runs\n", + "# fig.savefig(base_dir / \"combined_channel_occupation_over_time.png\", dpi=300, bbox_inches=\"tight\")" + ] 
+ }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ "\n", "# base_dir = Path(\"/Volumes/mmordig/ont_project/runs/run_replication/runs/run_replication\")\n", "base_dir = Path(\"/Users/maximilianmordig/ont_project_all/figures_cluster/runs/run_replication\")\n", @@ -98,7 +228,7 @@ } ], "source": [ - "figure_basename = \"channel_occupation_fraction_over_time.dill\"\n", + "figure_basename = \"channel_occupation_over_time.dill\"\n", "\n", "figs = {name: dill_load(figure_dir / figure_basename) for name, figure_dir in figure_dirs.items()}\n", "[plt.close(fig) for fig in figs.values()]\n", @@ -119,7 +249,7 @@ "\n", "ax.set_ylabel(\"Number of reading channels\")\n", "ax.set_title(f\"Number of reading channels over time\") # number of active channels varies between sequencing runs\n", - "fig.savefig(base_dir / \"combined_channel_occupation_fraction_over_time.png\", dpi=300, bbox_inches=\"tight\")" + "fig.savefig(base_dir / \"combined_channel_occupation_over_time.png\", dpi=300, bbox_inches=\"tight\")" ] }, { @@ -229,7 +359,7 @@ "cp \"${base_dir}sampler_per_window/simulator_run/figures/read_stats_by_channel.png\" \"${target_base_dir}read_stats_by_channel_sampler_per_window.png\"\n", "cp \"${base_dir}constant_gaps/simulator_run/figures/read_stats_by_channel.png\" \"${target_base_dir}read_stats_by_channel_constantgaps.png\"\n", "\n", - "cp \"${base_dir}/combined_channel_occupation_fraction_over_time.png\" \"${target_base_dir}\"\n", + "cp \"${base_dir}/combined_channel_occupation_over_time.png\" \"${target_base_dir}\"\n", "cp \"${base_dir}/combined_cum_nb_reads_per_all.png\" \"${target_base_dir}\"\n", "cp \"${base_dir}/combined_cum_nb_seq_bps_per_all.png\" \"${target_base_dir}\"\n", "\n", diff --git 
a/usecases/compute_absolute_enrichment.ipynb b/usecases/compute_absolute_enrichment.ipynb new file mode 100644 index 0000000..19a0d24 --- /dev/null +++ b/usecases/compute_absolute_enrichment.ipynb @@ -0,0 +1,447 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "import numpy as np\n", + "import pandas as pd\n", + "from pathlib import Path\n", + "import logging\n", + "import toml\n", + "import copy\n", + "\n", + "from simreaduntil.shared_utils.logging_utils import add_comprehensive_stream_handler_to_logger, setup_logger_simple\n", + "from simreaduntil.seqsum_tools.seqsum_plotting import preprocess_seqsum_df_for_plotting\n", + "\n", + "\n", + "add_comprehensive_stream_handler_to_logger(None)\n", + "logging.getLogger(__name__).setLevel(logging.DEBUG)\n", + "logging.getLogger(\"simreaduntil\").setLevel(logging.DEBUG)\n", + "\n", + "logger = logging\n", + "\n", + "# logging.getLogger(None).setLevel(logging.ERROR)\n", + "# logging.getLogger(\"simreaduntil\").setLevel(logging.ERROR)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "# run_dir = Path(\"/home/mmordig/ont_project_all/ont_project/runs/enrich_usecase/readfish_exp/results_readfishexp_realreads_withflanking/simulator_run/\")\n", + "# run_dir = Path(\"/is/cluster-test/fast/mmordig/ont_project/runs/enrich_usecase/readfish_exp/results_readfishexp_realreads/simulator_run/\")\n", + "# run_dir = Path(\"/is/cluster-test/fast/mmordig/ont_project/runs/enrich_usecase/readfish_exp/results_readfishexp_realreads_realmapper_accel5/simulator_run/\")\n", + "# run_dir = Path(\"/is/cluster-test/fast/mmordig/ont_project/runs/enrich_usecase/readfish_exp/results_readfishexp_realreads_fakemapper_accel10/simulator_run/\")\n", + "# run_dir = 
Path(\"/is/cluster-test/fast/mmordig/ont_project/runs/enrich_usecase/readfish_exp/results_readfishexp_realreads_realmapper_withunaligned_constantgapsampler_accel5/simulator_run/\")\n", + "# run_dir = Path(\"/is/cluster-test/fast/mmordig/ont_project/runs/enrich_usecase/readfish_exp/results_readfishexp_realreads_realmapper_withunaligned_accel10/simulator_run/\")\n", + "# run_dir = Path(\"/is/cluster-test/fast/mmordig/ont_project/runs/enrich_usecase/readfish_exp/results_readfishexp_realreads_realmapper_withunaligned_accel5/simulator_run/\")\n", + "# run_dir = Path(\"/is/cluster-test/fast/mmordig/ont_project/runs/enrich_usecase/readfish_exp/results_readfishexp_realreads_realmapper_withunaligned_accel2/simulator_run/\")\n", + "# run_dir = Path(\"/is/cluster-test/fast/mmordig/ont_project/runs/enrich_usecase/readfish_exp/results_readfishexp_realreads_realmapper_withunaligned_accel3_longer/simulator_run/\")\n", + "run_dir = Path(\"/is/cluster-test/fast/mmordig/ont_project/runs/enrich_usecase/readfish_exp/results_readfishexp_realreads_realmapper_withunaligned_accel5_longer/simulator_run/\")\n", + "\n", + "seqsum_filename = run_dir / \"sequencing_summary.txt\"\n", + "sim_config = {\"readfish_config_file\": run_dir / \"..\" / \"configs/readfish_enrich_per_quadrant.toml\" }\n", + "\n", + "readfish_conditions = [v for v in toml.load(sim_config[\"readfish_config_file\"])[\"conditions\"].values() if isinstance(v, dict)]\n", + "channel_assignments_toml = run_dir / \"channels.toml\"\n", + "channel_assignments_per_cond = toml.load(channel_assignments_toml)\n", + "channels_per_condition = {condition_dict[\"name\"]: condition_dict[\"channels\"] for condition_dict in channel_assignments_per_cond[\"conditions\"].values()}\n", + "\n", + "logger.debug(f\"Reading sequencing summary file '{seqsum_filename}'\")\n", + "full_seqsum_df = pd.read_csv(seqsum_filename, sep=\"\\t\")#, nrows=100) # todo\n", + "logger.debug(f\"Done reading sequencing summary file '{seqsum_filename}'\")\n" + ] + }, + 
{ + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "# partial_seqsum_df.columns\n", + "# full_seqsum_df[\"channel\"].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-03-01 10:33:16,409 - Sorting and cleaning seqsummary file of shape (184996, 13) --- seqsum_plotting.py:939 (preprocess_seqsum_df_for_plotting) INFO ##\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Processing condition control\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-03-01 10:33:16,677 - Adding previous gap duration to seqsummary --- seqsum_plotting.py:941 (preprocess_seqsum_df_for_plotting) INFO ##\n", + "2024-03-01 10:33:16,835 - Adding group column from NanoSim read id --- seqsum_plotting.py:951 (preprocess_seqsum_df_for_plotting) INFO ##\n", + "2024-03-01 10:33:17,431 - Splitting according to groups {'enrich_chr_1_8': ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8'], 'enrich_chr_9_14': ['chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14'], 'enrich_chr_16_20': ['chr16', 'chr17', 'chr18', 'chr19', 'chr20'], 'other': {'chr21', 'chrY', 'chr15', 'chr22', 'chrX'}} --- seqsum_plotting.py:964 (preprocess_seqsum_df_for_plotting) INFO ##\n", + "2024-03-01 10:33:17,453 - Adding extra columns for plotting --- seqsum_plotting.py:971 (preprocess_seqsum_df_for_plotting) INFO ##\n", + "2024-03-01 10:33:17,543 - /tmp/ipykernel_1011868/1748468773.py:16: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. 
Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.\n", + " num_sequenced_bps_per_group = dict(partial_seqsum_df.groupby(\"group\")[\"cum_nb_seq_bps_per_group\"].max())\n", + " --- warnings.py:109 (_showwarnmsg) WARNING ##\n", + "2024-03-01 10:33:17,651 - Sorting and cleaning seqsummary file of shape (355109, 13) --- seqsum_plotting.py:939 (preprocess_seqsum_df_for_plotting) INFO ##\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Processing condition enrich_chr_1_8\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-03-01 10:33:18,095 - Adding previous gap duration to seqsummary --- seqsum_plotting.py:941 (preprocess_seqsum_df_for_plotting) INFO ##\n", + "2024-03-01 10:33:18,389 - Adding group column from NanoSim read id --- seqsum_plotting.py:951 (preprocess_seqsum_df_for_plotting) INFO ##\n", + "2024-03-01 10:33:19,486 - Splitting according to groups {'enrich_chr_1_8': ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8'], 'enrich_chr_9_14': ['chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14'], 'enrich_chr_16_20': ['chr16', 'chr17', 'chr18', 'chr19', 'chr20'], 'other': {'chr21', 'chrY', 'chr15', 'chr22', 'chrX'}} --- seqsum_plotting.py:964 (preprocess_seqsum_df_for_plotting) INFO ##\n", + "2024-03-01 10:33:19,527 - Adding extra columns for plotting --- seqsum_plotting.py:971 (preprocess_seqsum_df_for_plotting) INFO ##\n", + "2024-03-01 10:33:19,697 - /tmp/ipykernel_1011868/1748468773.py:16: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. 
Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.\n", + " num_sequenced_bps_per_group = dict(partial_seqsum_df.groupby(\"group\")[\"cum_nb_seq_bps_per_group\"].max())\n", + " --- warnings.py:109 (_showwarnmsg) WARNING ##\n", + "2024-03-01 10:33:19,821 - Sorting and cleaning seqsummary file of shape (526237, 13) --- seqsum_plotting.py:939 (preprocess_seqsum_df_for_plotting) INFO ##\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Processing condition enrich_chr_9_14\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-03-01 10:33:20,519 - Adding previous gap duration to seqsummary --- seqsum_plotting.py:941 (preprocess_seqsum_df_for_plotting) INFO ##\n", + "2024-03-01 10:33:20,981 - Adding group column from NanoSim read id --- seqsum_plotting.py:951 (preprocess_seqsum_df_for_plotting) INFO ##\n", + "2024-03-01 10:33:22,599 - Splitting according to groups {'enrich_chr_1_8': ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8'], 'enrich_chr_9_14': ['chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14'], 'enrich_chr_16_20': ['chr16', 'chr17', 'chr18', 'chr19', 'chr20'], 'other': {'chr21', 'chrY', 'chr15', 'chr22', 'chrX'}} --- seqsum_plotting.py:964 (preprocess_seqsum_df_for_plotting) INFO ##\n", + "2024-03-01 10:33:22,658 - Adding extra columns for plotting --- seqsum_plotting.py:971 (preprocess_seqsum_df_for_plotting) INFO ##\n", + "2024-03-01 10:33:22,903 - /tmp/ipykernel_1011868/1748468773.py:16: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. 
Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.\n", + " num_sequenced_bps_per_group = dict(partial_seqsum_df.groupby(\"group\")[\"cum_nb_seq_bps_per_group\"].max())\n", + " --- warnings.py:109 (_showwarnmsg) WARNING ##\n", + "2024-03-01 10:33:23,035 - Sorting and cleaning seqsummary file of shape (700614, 13) --- seqsum_plotting.py:939 (preprocess_seqsum_df_for_plotting) INFO ##\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Processing condition enrich_chr_16_20\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-03-01 10:33:23,805 - Adding previous gap duration to seqsummary --- seqsum_plotting.py:941 (preprocess_seqsum_df_for_plotting) INFO ##\n", + "2024-03-01 10:33:24,419 - Adding group column from NanoSim read id --- seqsum_plotting.py:951 (preprocess_seqsum_df_for_plotting) INFO ##\n", + "2024-03-01 10:33:26,596 - Splitting according to groups {'enrich_chr_1_8': ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8'], 'enrich_chr_9_14': ['chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14'], 'enrich_chr_16_20': ['chr16', 'chr17', 'chr18', 'chr19', 'chr20'], 'other': {'chr21', 'chrY', 'chr15', 'chrM', 'chr22', 'chrX'}} --- seqsum_plotting.py:964 (preprocess_seqsum_df_for_plotting) INFO ##\n", + "2024-03-01 10:33:26,675 - Adding extra columns for plotting --- seqsum_plotting.py:971 (preprocess_seqsum_df_for_plotting) INFO ##\n", + "2024-03-01 10:33:27,002 - /tmp/ipykernel_1011868/1748468773.py:16: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. 
Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.\n", + " num_sequenced_bps_per_group = dict(partial_seqsum_df.groupby(\"group\")[\"cum_nb_seq_bps_per_group\"].max())\n", + " --- warnings.py:109 (_showwarnmsg) WARNING ##\n" + ] + } + ], + "source": [ + "\n", + "\n", + "num_sequenced_bps_per_group_per_condition = {}\n", + "\n", + "# targets of conditions are disjoint, so we can group by each of them for each condition (a condition is a selseq strategy applied to a subset of channels)\n", + "group_to_units = {cond[\"name\"]: cond[\"targets\"] for cond in readfish_conditions if cond[\"name\"] != \"control\"}\n", + "\n", + "for condition in readfish_conditions:\n", + " condition_name = condition[\"name\"]\n", + " print(f\"Processing condition {condition_name}\")\n", + " subchannels = channels_per_condition[condition_name]\n", + " \n", + " partial_seqsum_df = full_seqsum_df[full_seqsum_df[\"channel\"].isin([f\"ch{i}\" for i in subchannels])]\n", + " # partial_seqsum_df = full_seqsum_df[full_seqsum_df[\"channel\"].isin([i for i in subchannels])]\n", + " \n", + " partial_seqsum_df, group_column, chrom_column = preprocess_seqsum_df_for_plotting(partial_seqsum_df, group_to_units=copy.deepcopy(group_to_units))\n", + " \n", + " num_sequenced_bps_per_group = dict(partial_seqsum_df.groupby(\"group\")[\"cum_nb_seq_bps_per_group\"].max())\n", + " num_sequenced_bps_per_group_per_condition[condition_name] = num_sequenced_bps_per_group\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "# when having one condition/target per simulation run\n", + "\n", + "# group_to_units = {\n", + "# 'enrich_chr_1_8': ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8'],\n", + "# 'enrich_chr_9_14': ['chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14'],\n", + "# 'enrich_chr_16_20': ['chr16', 'chr17', 'chr18', 'chr19', 'chr20'],\n", + "# }\n", + "# 
seqsum_filenames_per_cond = {\n", + "# \"control\": Path(\"/is/cluster-test/fast/mmordig/ont_project/runs/enrich_usecase/readfish_exp/results_readfishexp_fakemapper_control/simulator_run/sequencing_summary.txt\"),\n", + "# \"enrich_chr_1_8\": Path(\"/is/cluster-test/fast/mmordig/ont_project/runs/enrich_usecase/readfish_exp/results_readfishexp_fakemapper_chr1to8/simulator_run/sequencing_summary.txt\"),\n", + "# \"enrich_chr_9_14\": Path(\"/is/cluster-test/fast/mmordig/ont_project/runs/enrich_usecase/readfish_exp/results_readfishexp_fakemapper_chr9to14/simulator_run/sequencing_summary.txt\"),\n", + "# \"enrich_chr_16_20\": Path(\"/is/cluster-test/fast/mmordig/ont_project/runs/enrich_usecase/readfish_exp/results_readfishexp_fakemapper_chr16to20/simulator_run/sequencing_summary.txt\"),\n", + "# }\n", + "\n", + "# assert(all(x.exists() for x in seqsum_filenames_per_cond.values()))\n", + "\n", + "# num_sequenced_bps_per_group_per_condition = {}\n", + "# for (condition_name, seqsum_filename) in seqsum_filenames_per_cond.items():\n", + "# partial_seqsum_df, group_column = preprocess_seqsum_df_for_plotting(seqsum_filename, group_to_units=copy.deepcopy(group_to_units))\n", + " \n", + "# num_sequenced_bps_per_group = dict(partial_seqsum_df.groupby(\"group\")[\"cum_nb_seq_bps_per_group\"].max())\n", + "# num_sequenced_bps_per_group_per_condition[condition_name] = num_sequenced_bps_per_group" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "control : total: 2.67E+09 (1.0), enrich_chr_16_20: 3.35E+08, enrich_chr_1_8: 1.32E+09, enrich_chr_9_14: 6.60E+08, other: 3.54E+08\n", + "enrich_chr_1_8 : total: 2.12E+09 (1.26), enrich_chr_16_20: 6.32E+07, enrich_chr_1_8: 1.84E+09, enrich_chr_9_14: 1.33E+08, other: 8.11E+07\n", + "enrich_chr_9_14 : total: 1.94E+09 (1.38), enrich_chr_16_20: 9.16E+07, enrich_chr_1_8: 3.60E+08, enrich_chr_9_14: 1.35E+09, other: 1.36E+08\n", + 
"enrich_chr_16_20 : total: 1.75E+09 (1.53), enrich_chr_16_20: 9.05E+08, enrich_chr_1_8: 4.61E+08, enrich_chr_9_14: 2.40E+08, other: 1.43E+08\n" + ] + } + ], + "source": [ + "for (condition_name, num_sequenced_bps_per_group) in num_sequenced_bps_per_group_per_condition.items():\n", + " total_bps = sum(num_sequenced_bps_per_group.values())\n", + " throughput_reduction = sum(num_sequenced_bps_per_group_per_condition[\"control\"].values()) / sum(num_sequenced_bps_per_group.values())\n", + " \n", + " bps_per_target_str = \", \".join(f\"{group}: {num_bps:.2E}\" for (group, num_bps) in num_sequenced_bps_per_group.items())\n", + " print(f\"{condition_name:20}: total: {total_bps:.2E} ({throughput_reduction:.3}), {bps_per_target_str}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Absolute enrichment of target in condition enrich_chr_1_8 : 1.4\n", + "Relative enrichment of target in condition enrich_chr_1_8 : 1.75\n", + "Absolute enrichment of target in condition enrich_chr_9_14 : 2.04\n", + "Relative enrichment of target in condition enrich_chr_9_14 : 2.82\n", + "Absolute enrichment of target in condition enrich_chr_16_20 : 2.7\n", + "Relative enrichment of target in condition enrich_chr_16_20 : 4.14\n" + ] + } + ], + "source": [ + "from simreaduntil.simulator.gap_sampling.gap_sampler_per_window_until_blocked import dict_without_items\n", + "\n", + "absolute_enrichment = {\n", + " condition_name: seq_bps_per_target[condition_name] / num_sequenced_bps_per_group_per_condition[\"control\"][condition_name]\n", + " for (condition_name, seq_bps_per_target) in dict_without_items(num_sequenced_bps_per_group_per_condition, [\"control\"]).items()\n", + "}\n", + "\n", + "# relative composition when no selective sequencing is happening\n", + "composition_noselseq = {\n", + " \"enrich_chr_1_8\": 0.496,\n", + " \"enrich_chr_9_14\": 0.247,\n", + " \"enrich_chr_16_20\": 
0.125\n", + "}\n", + "relative_enrichment = {\n", + " condition_name: (seq_bps_per_target[condition_name] / sum(seq_bps_per_target.values())) / composition_noselseq[condition_name]\n", + " for (condition_name, seq_bps_per_target) in dict_without_items(num_sequenced_bps_per_group_per_condition, [\"control\"]).items()\n", + "}\n", + "\n", + "for (condition_name, enrichment) in absolute_enrichment.items():\n", + " print(f\"Absolute enrichment of target in condition {condition_name:20}: {enrichment:.3}\")\n", + " print(f\"Relative enrichment of target in condition {condition_name:20}: {relative_enrichment[condition_name]:.3}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```\n", + "# aligned, fake mapper, accel10 (results_readfishexp_realreads_fakemapper_accel10)\n", + "control : total: 2.72E+09 (1.0 ), enrich_chr_16_20: 3.36E+08, enrich_chr_1_8: 1.35E+09, enrich_chr_9_14: 6.74E+08, other: 3.59E+08\n", + "enrich_chr_1_8 : total: 2.28E+09 (1.19), enrich_chr_16_20: 2.09E+07, enrich_chr_1_8: 2.20E+09, enrich_chr_9_14: 4.03E+07, other: 2.19E+07\n", + "enrich_chr_9_14 : total: 1.99E+09 (1.36), enrich_chr_16_20: 3.27E+07, enrich_chr_1_8: 1.31E+08, enrich_chr_9_14: 1.79E+09, other: 3.49E+07\n", + "enrich_chr_16_20 : total: 1.69E+09 (1.61), enrich_chr_16_20: 1.35E+09, enrich_chr_1_8: 1.92E+08, enrich_chr_9_14: 9.56E+07, other: 5.17E+07\n", + "Absolute enrichment of target in condition enrich_chr_1_8 : 1.63\n", + "Relative enrichment of target in condition enrich_chr_1_8 : 1.94\n", + "Absolute enrichment of target in condition enrich_chr_9_14 : 2.66\n", + "Relative enrichment of target in condition enrich_chr_9_14 : 3.65\n", + "Absolute enrichment of target in condition enrich_chr_16_20 : 4.0\n", + "Relative enrichment of target in condition enrich_chr_16_20 : 6.39\n", + "\n", + "# aligned, realmapper, accel5 (results_readfishexp_realreads_realmapper_accel5)\n", + "control : total: 2.72E+09 (1.0 ), enrich_chr_16_20: 3.38E+08, enrich_chr_1_8: 
1.35E+09, enrich_chr_9_14: 6.70E+08, other: 3.65E+08\n", + "enrich_chr_1_8 : total: 2.31E+09 (1.18), enrich_chr_16_20: 3.01E+07, enrich_chr_1_8: 2.16E+09, enrich_chr_9_14: 6.96E+07, other: 4.90E+07\n", + "enrich_chr_9_14 : total: 2.03E+09 (1.34), enrich_chr_16_20: 4.51E+07, enrich_chr_1_8: 1.71E+08, enrich_chr_9_14: 1.72E+09, other: 1.01E+08\n", + "enrich_chr_16_20 : total: 1.73E+09 (1.58), enrich_chr_16_20: 1.28E+09, enrich_chr_1_8: 2.27E+08, enrich_chr_9_14: 1.31E+08, other: 9.46E+07\n", + "Absolute enrichment of target in condition enrich_chr_1_8 : 1.59\n", + "Relative enrichment of target in condition enrich_chr_1_8 : 1.89\n", + "Absolute enrichment of target in condition enrich_chr_9_14 : 2.56\n", + "Relative enrichment of target in condition enrich_chr_9_14 : 3.42\n", + "Absolute enrichment of target in condition enrich_chr_16_20 : 3.78\n", + "Relative enrichment of target in condition enrich_chr_16_20 : 5.9\n", + "\n", + "# with unaligned, realmapper accel10 (results_readfishexp_realreads_realmapper_withunaligned_accel10)\n", + "control : total: 2.67E+09 (1.0), enrich_chr_16_20: 3.32E+08, enrich_chr_1_8: 1.33E+09, enrich_chr_9_14: 6.57E+08, other: 3.55E+08\n", + "enrich_chr_1_8 : total: 2.32E+09 (1.15), enrich_chr_16_20: 2.26E+08, enrich_chr_1_8: 1.40E+09, enrich_chr_9_14: 4.52E+08, other: 2.44E+08\n", + "enrich_chr_9_14 : total: 2.31E+09 (1.16), enrich_chr_16_20: 2.40E+08, enrich_chr_1_8: 9.53E+08, enrich_chr_9_14: 8.48E+08, other: 2.68E+08\n", + "enrich_chr_16_20 : total: 2.27E+09 (1.18), enrich_chr_16_20: 4.90E+08, enrich_chr_1_8: 1.00E+09, enrich_chr_9_14: 5.00E+08, other: 2.74E+08\n", + "Absolute enrichment of target in condition enrich_chr_1_8 : 1.06\n", + "Relative enrichment of target in condition enrich_chr_1_8 : 1.22\n", + "Absolute enrichment of target in condition enrich_chr_9_14 : 1.29\n", + "Relative enrichment of target in condition enrich_chr_9_14 : 1.49\n", + "Absolute enrichment of target in condition enrich_chr_16_20 : 1.48\n", + "Relative 
enrichment of target in condition enrich_chr_16_20 : 1.73\n", + "\n", + "# with unaligned, realmapper accel2 (results_readfishexp_realreads_realmapper_withunaligned_accel2)\n", + "control : total: 2.67E+09 (1.0), enrich_chr_16_20: 3.34E+08, enrich_chr_1_8: 1.32E+09, enrich_chr_9_14: 6.57E+08, other: 3.52E+08\n", + "enrich_chr_1_8 : total: 2.15E+09 (1.24), enrich_chr_16_20: 6.55E+07, enrich_chr_1_8: 1.86E+09, enrich_chr_9_14: 1.37E+08, other: 8.44E+07\n", + "enrich_chr_9_14 : total: 1.92E+09 (1.39), enrich_chr_16_20: 9.18E+07, enrich_chr_1_8: 3.61E+08, enrich_chr_9_14: 1.34E+09, other: 1.33E+08\n", + "enrich_chr_16_20 : total: 1.74E+09 (1.54), enrich_chr_16_20: 8.91E+08, enrich_chr_1_8: 4.61E+08, enrich_chr_9_14: 2.40E+08, other: 1.44E+08\n", + "Absolute enrichment of target in condition enrich_chr_1_8 : 1.41\n", + "Relative enrichment of target in condition enrich_chr_1_8 : 1.75\n", + "Absolute enrichment of target in condition enrich_chr_9_14 : 2.03\n", + "Relative enrichment of target in condition enrich_chr_9_14 : 2.81\n", + "Absolute enrichment of target in condition enrich_chr_16_20 : 2.67\n", + "Relative enrichment of target in condition enrich_chr_16_20 : 4.11\n", + "\n", + "# with unaligned, realmapper, accel3 (results_readfishexp_realreads_realmapper_withunaligned_accel3_longer)\n", + "control : total: 2.65E+09 (1.0), enrich_chr_16_20: 3.28E+08, enrich_chr_1_8: 1.32E+09, enrich_chr_9_14: 6.47E+08, other: 3.49E+08\n", + "enrich_chr_1_8 : total: 2.13E+09 (1.24), enrich_chr_16_20: 6.44E+07, enrich_chr_1_8: 1.85E+09, enrich_chr_9_14: 1.33E+08, other: 8.19E+07\n", + "enrich_chr_9_14 : total: 1.92E+09 (1.38), enrich_chr_16_20: 9.07E+07, enrich_chr_1_8: 3.55E+08, enrich_chr_9_14: 1.34E+09, other: 1.33E+08\n", + "enrich_chr_16_20 : total: 1.73E+09 (1.53), enrich_chr_16_20: 8.92E+08, enrich_chr_1_8: 4.56E+08, enrich_chr_9_14: 2.39E+08, other: 1.42E+08\n", + "Absolute enrichment of target in condition enrich_chr_1_8 : 1.4\n", + "Relative enrichment of target in 
condition enrich_chr_1_8 : 1.75\n", + "Absolute enrichment of target in condition enrich_chr_9_14 : 2.07\n", + "Relative enrichment of target in condition enrich_chr_9_14 : 2.82\n", + "Absolute enrichment of target in condition enrich_chr_16_20 : 2.72\n", + "Relative enrichment of target in condition enrich_chr_16_20 : 4.13\n", + "\n", + "# with unaligned, realmapper, accel5 (results_readfishexp_realreads_realmapper_withunaligned_accel5_longer)\n", + "control : total: 2.67E+09 (1.0), enrich_chr_16_20: 3.35E+08, enrich_chr_1_8: 1.32E+09, enrich_chr_9_14: 6.60E+08, other: 3.54E+08\n", + "enrich_chr_1_8 : total: 2.12E+09 (1.26), enrich_chr_16_20: 6.32E+07, enrich_chr_1_8: 1.84E+09, enrich_chr_9_14: 1.33E+08, other: 8.11E+07\n", + "enrich_chr_9_14 : total: 1.94E+09 (1.38), enrich_chr_16_20: 9.16E+07, enrich_chr_1_8: 3.60E+08, enrich_chr_9_14: 1.35E+09, other: 1.36E+08\n", + "enrich_chr_16_20 : total: 1.75E+09 (1.53), enrich_chr_16_20: 9.05E+08, enrich_chr_1_8: 4.61E+08, enrich_chr_9_14: 2.40E+08, other: 1.43E+08\n", + "Absolute enrichment of target in condition enrich_chr_1_8 : 1.4\n", + "Relative enrichment of target in condition enrich_chr_1_8 : 1.75\n", + "Absolute enrichment of target in condition enrich_chr_9_14 : 2.04\n", + "Relative enrichment of target in condition enrich_chr_9_14 : 2.82\n", + "Absolute enrichment of target in condition enrich_chr_16_20 : 2.7\n", + "Relative enrichment of target in condition enrich_chr_16_20 : 4.14\n", + "\n", + "\n", + "## unused \n", + "\n", + "# short: with unaligned, realmapper accel5 (results_readfishexp_realreads_realmapper_withunaligned_accel5)\n", + "control : total: 1.82E+09 (1.0), enrich_chr_16_20: 2.23E+08, enrich_chr_1_8: 9.07E+08, enrich_chr_9_14: 4.47E+08, other: 2.39E+08\n", + "enrich_chr_1_8 : total: 1.54E+09 (1.18), enrich_chr_16_20: 4.62E+07, enrich_chr_1_8: 1.34E+09, enrich_chr_9_14: 9.68E+07, other: 5.85E+07\n", + "enrich_chr_9_14 : total: 1.36E+09 (1.33), enrich_chr_16_20: 6.51E+07, enrich_chr_1_8: 
2.55E+08, enrich_chr_9_14: 9.48E+08, other: 9.54E+07\n", + "enrich_chr_16_20 : total: 1.27E+09 (1.43), enrich_chr_16_20: 6.53E+08, enrich_chr_1_8: 3.34E+08, enrich_chr_9_14: 1.74E+08, other: 1.05E+08\n", + "Absolute enrichment of target in condition enrich_chr_1_8 : 1.47\n", + "Relative enrichment of target in condition enrich_chr_1_8 : 1.75\n", + "Absolute enrichment of target in condition enrich_chr_9_14 : 2.12\n", + "Relative enrichment of target in condition enrich_chr_9_14 : 2.82\n", + "Absolute enrichment of target in condition enrich_chr_16_20 : 2.93\n", + "Relative enrichment of target in condition enrich_chr_16_20 : 4.13\n", + "\n", + "# for constant gap sampler (results_readfishexp_realreads_realmapper_withunaligned_constantgapsampler_accel5)\n", + "control : total: 2.27E+09 (1.0), enrich_chr_16_20: 2.82E+08, enrich_chr_1_8: 1.12E+09, enrich_chr_9_14: 5.63E+08, other: 3.00E+08\n", + "enrich_chr_1_8 : total: 2.21E+09 (1.03), enrich_chr_16_20: 6.70E+07, enrich_chr_1_8: 1.92E+09, enrich_chr_9_14: 1.40E+08, other: 8.49E+07\n", + "enrich_chr_9_14 : total: 2.15E+09 (1.05), enrich_chr_16_20: 1.03E+08, enrich_chr_1_8: 4.05E+08, enrich_chr_9_14: 1.49E+09, other: 1.51E+08\n", + "enrich_chr_16_20 : total: 2.08E+09 (1.09), enrich_chr_16_20: 1.07E+09, enrich_chr_1_8: 5.56E+08, enrich_chr_9_14: 2.88E+08, other: 1.72E+08\n", + "Absolute enrichment of target in condition enrich_chr_1_8 : 1.71\n", + "Relative enrichment of target in condition enrich_chr_1_8 : 1.75\n", + "Absolute enrichment of target in condition enrich_chr_9_14 : 2.65\n", + "Relative enrichment of target in condition enrich_chr_9_14 : 2.81\n", + "Absolute enrichment of target in condition enrich_chr_16_20 : 3.78\n", + "Relative enrichment of target in condition enrich_chr_16_20 : 4.1\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + 
"source": [ + "# yield-corrected\n", + "1.49 * 1.23/1.24,\\\n", + "2.2 * 1.44/1.89, \\\n", + "3.04 * 1.7/2.84" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# yield-corrected\n", + "# 1.59 * 1.15/1.24, \\\n", + "# 2.38 * 1.32/1.89, \\\n", + "# 3.23 * 1.56/2.84" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "ont_project_venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/usecases/configs/enrich_usecase/chr202122_run/sampler_per_window/config.toml b/usecases/configs/enrich_usecase/chr202122_run/sampler_per_window/config.toml index bd1024a..f50cd15 100644 --- a/usecases/configs/enrich_usecase/chr202122_run/sampler_per_window/config.toml +++ b/usecases/configs/enrich_usecase/chr202122_run/sampler_per_window/config.toml @@ -9,10 +9,11 @@ run_duration = 3600.0 ################################################# # reads_file = "nanosim_reads/perfect_reads_seed1_aligned_reads.fasta" +reads_len_range = [12000, 16000] ref_genome_path = "data/chm13v2.0_normalized3chroms.fa.gz" sim_params_file = "sim_params.dill" -rotating = true -mux_scan_period = 240 # 90 minutes +rotating_writeout = true +mux_scan_period = 240 # seconds mux_scan_duration = 40 # seconds use_grpc = true diff --git a/usecases/configs/enrich_usecase/chr202122_run/sampler_per_window/readfish_enrich_chr20.toml b/usecases/configs/enrich_usecase/chr202122_run/sampler_per_window/readfish_enrich_chr20.toml index 1ce1482..3909139 100644 --- a/usecases/configs/enrich_usecase/chr202122_run/sampler_per_window/readfish_enrich_chr20.toml +++ 
b/usecases/configs/enrich_usecase/chr202122_run/sampler_per_window/readfish_enrich_chr20.toml @@ -15,6 +15,6 @@ targets = ["chr20"] single_on = "stop_receiving" multi_on = "stop_receiving" single_off = "unblock" -multi_off = "unblock" +multi_off = "proceed" no_seq = "proceed" # unclear what it is, does not seem to be used no_map = "proceed" # if no_map happens after mux_chunks were received, the read is rejected \ No newline at end of file diff --git a/usecases/configs/enrich_usecase/full_genome_run/accelerations/config_accel1/README.md b/usecases/configs/enrich_usecase/full_genome_run/accelerations/config_accel1/README.md new file mode 100644 index 0000000..f5391ad --- /dev/null +++ b/usecases/configs/enrich_usecase/full_genome_run/accelerations/config_accel1/README.md @@ -0,0 +1 @@ +Used to enrich chr20, 21 from the human genome \ No newline at end of file diff --git a/usecases/configs/enrich_usecase/full_genome_run/accelerations/config_accel1/config.toml b/usecases/configs/enrich_usecase/full_genome_run/accelerations/config_accel1/config.toml new file mode 100644 index 0000000..d921479 --- /dev/null +++ b/usecases/configs/enrich_usecase/full_genome_run/accelerations/config_accel1/config.toml @@ -0,0 +1,29 @@ +run_dir = "simulator_run" # where reads, logs, pafs etc. 
will be written to +n_channels = 512 +acceleration_factor = 1 +run_duration = 2000 + +################################################# +# Optional arguments +################################################# + +# reads_file = "nanosim_reads/perfect_reads_seed1_aligned_reads.fasta" +ref_genome_path = "data/chm13v2.0_normalized.fa.gz" +sim_params_file = "sim_params.dill" +rotating_writeout = true +# mux_scan_period = 5400 # 90 minutes +# mux_scan_duration = 100 # seconds + +# readfish params +readfish_config_file = "configs/readfish_enrich_chr2021.toml" +# readfish_method = "unblock_all" +readfish_method = "targeted_seq" +# readfish_method = "control" + +################################################# +# Parameter extraction arguments +################################################# +seqsum_param_extr_file = "data/20190809_zymo_seqsum.txt" +n_channels_full = 512 +# gap_sampler_type = "sampler_per_window" +gap_sampler_type = "sampler_per_rolling_window_channel" \ No newline at end of file diff --git a/usecases/configs/enrich_usecase/full_genome_run/accelerations/config_accel1/readfish_enrich_chr2021.toml b/usecases/configs/enrich_usecase/full_genome_run/accelerations/config_accel1/readfish_enrich_chr2021.toml new file mode 100644 index 0000000..727125a --- /dev/null +++ b/usecases/configs/enrich_usecase/full_genome_run/accelerations/config_accel1/readfish_enrich_chr2021.toml @@ -0,0 +1,21 @@ +[caller_settings] +config_name = "ignored" +host = "ignored" +port = 9999 + +[conditions] +# reference = "data/chm13v2.0_normalized.mmi" +reference = "fake_mapper" + +[conditions.0] +name = "enrich_chr_20_21" +control = false +min_chunks = 0 # no decision made whenever <= min_chunks have been received from a read +max_chunks = 12 +targets = ["chr20", "chr21"] +single_on = "stop_receiving" +multi_on = "stop_receiving" +single_off = "unblock" +multi_off = "proceed" +no_seq = "proceed" # unclear what it is, does not seem to be used +no_map = "proceed" # if no_map happens after 
mux_chunks were received, the read is rejected \ No newline at end of file diff --git a/usecases/configs/enrich_usecase/full_genome_run/accelerations/config_accel10/README.md b/usecases/configs/enrich_usecase/full_genome_run/accelerations/config_accel10/README.md new file mode 100644 index 0000000..f5391ad --- /dev/null +++ b/usecases/configs/enrich_usecase/full_genome_run/accelerations/config_accel10/README.md @@ -0,0 +1 @@ +Used to enrich chr20, 21 from the human genome \ No newline at end of file diff --git a/usecases/configs/enrich_usecase/full_genome_run/accelerations/config_accel10/config.toml b/usecases/configs/enrich_usecase/full_genome_run/accelerations/config_accel10/config.toml new file mode 100644 index 0000000..4a1220d --- /dev/null +++ b/usecases/configs/enrich_usecase/full_genome_run/accelerations/config_accel10/config.toml @@ -0,0 +1,29 @@ +run_dir = "simulator_run" # where reads, logs, pafs etc. will be written to +n_channels = 512 +acceleration_factor = 10 +run_duration = 20000 + +################################################# +# Optional arguments +################################################# + +# reads_file = "nanosim_reads/perfect_reads_seed1_aligned_reads.fasta" +ref_genome_path = "data/chm13v2.0_normalized.fa.gz" +sim_params_file = "sim_params.dill" +rotating_writeout = true +# mux_scan_period = 5400 # 90 minutes +# mux_scan_duration = 100 # seconds + +# readfish params +readfish_config_file = "configs/readfish_enrich_chr2021.toml" +# readfish_method = "unblock_all" +readfish_method = "targeted_seq" +# readfish_method = "control" + +################################################# +# Parameter extraction arguments +################################################# +seqsum_param_extr_file = "data/20190809_zymo_seqsum.txt" +n_channels_full = 512 +# gap_sampler_type = "sampler_per_window" +gap_sampler_type = "sampler_per_rolling_window_channel" \ No newline at end of file diff --git 
a/usecases/configs/enrich_usecase/full_genome_run/accelerations/config_accel10/readfish_enrich_chr2021.toml b/usecases/configs/enrich_usecase/full_genome_run/accelerations/config_accel10/readfish_enrich_chr2021.toml new file mode 100644 index 0000000..727125a --- /dev/null +++ b/usecases/configs/enrich_usecase/full_genome_run/accelerations/config_accel10/readfish_enrich_chr2021.toml @@ -0,0 +1,21 @@ +[caller_settings] +config_name = "ignored" +host = "ignored" +port = 9999 + +[conditions] +# reference = "data/chm13v2.0_normalized.mmi" +reference = "fake_mapper" + +[conditions.0] +name = "enrich_chr_20_21" +control = false +min_chunks = 0 # no decision made whenever <= min_chunks have been received from a read +max_chunks = 12 +targets = ["chr20", "chr21"] +single_on = "stop_receiving" +multi_on = "stop_receiving" +single_off = "unblock" +multi_off = "proceed" +no_seq = "proceed" # unclear what it is, does not seem to be used +no_map = "proceed" # if no_map happens after mux_chunks were received, the read is rejected \ No newline at end of file diff --git a/usecases/configs/enrich_usecase/full_genome_run/accelerations/config_accel3/README.md b/usecases/configs/enrich_usecase/full_genome_run/accelerations/config_accel3/README.md new file mode 100644 index 0000000..f5391ad --- /dev/null +++ b/usecases/configs/enrich_usecase/full_genome_run/accelerations/config_accel3/README.md @@ -0,0 +1 @@ +Used to enrich chr20, 21 from the human genome \ No newline at end of file diff --git a/usecases/configs/enrich_usecase/full_genome_run/accelerations/config_accel3/config.toml b/usecases/configs/enrich_usecase/full_genome_run/accelerations/config_accel3/config.toml new file mode 100644 index 0000000..4183490 --- /dev/null +++ b/usecases/configs/enrich_usecase/full_genome_run/accelerations/config_accel3/config.toml @@ -0,0 +1,29 @@ +run_dir = "simulator_run" # where reads, logs, pafs etc. 
will be written to +n_channels = 512 +acceleration_factor = 3 +run_duration = 10000 + +################################################# +# Optional arguments +################################################# + +# reads_file = "nanosim_reads/perfect_reads_seed1_aligned_reads.fasta" +ref_genome_path = "data/chm13v2.0_normalized.fa.gz" +sim_params_file = "sim_params.dill" +rotating_writeout = true +# mux_scan_period = 5400 # 90 minutes +# mux_scan_duration = 100 # seconds + +# readfish params +readfish_config_file = "configs/readfish_enrich_chr2021.toml" +# readfish_method = "unblock_all" +readfish_method = "targeted_seq" +# readfish_method = "control" + +################################################# +# Parameter extraction arguments +################################################# +seqsum_param_extr_file = "data/20190809_zymo_seqsum.txt" +n_channels_full = 512 +# gap_sampler_type = "sampler_per_window" +gap_sampler_type = "sampler_per_rolling_window_channel" \ No newline at end of file diff --git a/usecases/configs/enrich_usecase/full_genome_run/accelerations/config_accel3/readfish_enrich_chr2021.toml b/usecases/configs/enrich_usecase/full_genome_run/accelerations/config_accel3/readfish_enrich_chr2021.toml new file mode 100644 index 0000000..727125a --- /dev/null +++ b/usecases/configs/enrich_usecase/full_genome_run/accelerations/config_accel3/readfish_enrich_chr2021.toml @@ -0,0 +1,21 @@ +[caller_settings] +config_name = "ignored" +host = "ignored" +port = 9999 + +[conditions] +# reference = "data/chm13v2.0_normalized.mmi" +reference = "fake_mapper" + +[conditions.0] +name = "enrich_chr_20_21" +control = false +min_chunks = 0 # no decision made whenever <= min_chunks have been received from a read +max_chunks = 12 +targets = ["chr20", "chr21"] +single_on = "stop_receiving" +multi_on = "stop_receiving" +single_off = "unblock" +multi_off = "proceed" +no_seq = "proceed" # unclear what it is, does not seem to be used +no_map = "proceed" # if no_map happens after 
mux_chunks were received, the read is rejected \ No newline at end of file diff --git a/usecases/configs/enrich_usecase/full_genome_run/accelerations/config_accel5/README.md b/usecases/configs/enrich_usecase/full_genome_run/accelerations/config_accel5/README.md new file mode 100644 index 0000000..f5391ad --- /dev/null +++ b/usecases/configs/enrich_usecase/full_genome_run/accelerations/config_accel5/README.md @@ -0,0 +1 @@ +Used to enrich chr20, 21 from the human genome \ No newline at end of file diff --git a/usecases/configs/enrich_usecase/full_genome_run/accelerations/config_accel5/config.toml b/usecases/configs/enrich_usecase/full_genome_run/accelerations/config_accel5/config.toml new file mode 100644 index 0000000..effacef --- /dev/null +++ b/usecases/configs/enrich_usecase/full_genome_run/accelerations/config_accel5/config.toml @@ -0,0 +1,29 @@ +run_dir = "simulator_run" # where reads, logs, pafs etc. will be written to +n_channels = 512 +acceleration_factor = 5 +run_duration = 10000 + +################################################# +# Optional arguments +################################################# + +# reads_file = "nanosim_reads/perfect_reads_seed1_aligned_reads.fasta" +ref_genome_path = "data/chm13v2.0_normalized.fa.gz" +sim_params_file = "sim_params.dill" +rotating_writeout = true +# mux_scan_period = 5400 # 90 minutes +# mux_scan_duration = 100 # seconds + +# readfish params +readfish_config_file = "configs/readfish_enrich_chr2021.toml" +# readfish_method = "unblock_all" +readfish_method = "targeted_seq" +# readfish_method = "control" + +################################################# +# Parameter extraction arguments +################################################# +seqsum_param_extr_file = "data/20190809_zymo_seqsum.txt" +n_channels_full = 512 +# gap_sampler_type = "sampler_per_window" +gap_sampler_type = "sampler_per_rolling_window_channel" \ No newline at end of file diff --git 
a/usecases/configs/enrich_usecase/full_genome_run/accelerations/config_accel5/readfish_enrich_chr2021.toml b/usecases/configs/enrich_usecase/full_genome_run/accelerations/config_accel5/readfish_enrich_chr2021.toml new file mode 100644 index 0000000..727125a --- /dev/null +++ b/usecases/configs/enrich_usecase/full_genome_run/accelerations/config_accel5/readfish_enrich_chr2021.toml @@ -0,0 +1,21 @@ +[caller_settings] +config_name = "ignored" +host = "ignored" +port = 9999 + +[conditions] +# reference = "data/chm13v2.0_normalized.mmi" +reference = "fake_mapper" + +[conditions.0] +name = "enrich_chr_20_21" +control = false +min_chunks = 0 # no decision made whenever <= min_chunks have been received from a read +max_chunks = 12 +targets = ["chr20", "chr21"] +single_on = "stop_receiving" +multi_on = "stop_receiving" +single_off = "unblock" +multi_off = "proceed" +no_seq = "proceed" # unclear what it is, does not seem to be used +no_map = "proceed" # if no_map happens after mux_chunks were received, the read is rejected \ No newline at end of file diff --git a/usecases/configs/enrich_usecase/full_genome_run/accelerations/config_accel7.5/README.md b/usecases/configs/enrich_usecase/full_genome_run/accelerations/config_accel7.5/README.md new file mode 100644 index 0000000..f5391ad --- /dev/null +++ b/usecases/configs/enrich_usecase/full_genome_run/accelerations/config_accel7.5/README.md @@ -0,0 +1 @@ +Used to enrich chr20, 21 from the human genome \ No newline at end of file diff --git a/usecases/configs/enrich_usecase/full_genome_run/accelerations/config_accel7.5/config.toml b/usecases/configs/enrich_usecase/full_genome_run/accelerations/config_accel7.5/config.toml new file mode 100644 index 0000000..ed36234 --- /dev/null +++ b/usecases/configs/enrich_usecase/full_genome_run/accelerations/config_accel7.5/config.toml @@ -0,0 +1,29 @@ +run_dir = "simulator_run" # where reads, logs, pafs etc. 
will be written to +n_channels = 512 +acceleration_factor = 7.5 +run_duration = 2000 + +################################################# +# Optional arguments +################################################# + +# reads_file = "nanosim_reads/perfect_reads_seed1_aligned_reads.fasta" +ref_genome_path = "data/chm13v2.0_normalized.fa.gz" +sim_params_file = "sim_params.dill" +rotating_writeout = true +# mux_scan_period = 5400 # 90 minutes +# mux_scan_duration = 100 # seconds + +# readfish params +readfish_config_file = "configs/readfish_enrich_chr2021.toml" +# readfish_method = "unblock_all" +readfish_method = "targeted_seq" +# readfish_method = "control" + +################################################# +# Parameter extraction arguments +################################################# +seqsum_param_extr_file = "data/20190809_zymo_seqsum.txt" +n_channels_full = 512 +# gap_sampler_type = "sampler_per_window" +gap_sampler_type = "sampler_per_rolling_window_channel" \ No newline at end of file diff --git a/usecases/configs/enrich_usecase/full_genome_run/accelerations/config_accel7.5/readfish_enrich_chr2021.toml b/usecases/configs/enrich_usecase/full_genome_run/accelerations/config_accel7.5/readfish_enrich_chr2021.toml new file mode 100644 index 0000000..727125a --- /dev/null +++ b/usecases/configs/enrich_usecase/full_genome_run/accelerations/config_accel7.5/readfish_enrich_chr2021.toml @@ -0,0 +1,21 @@ +[caller_settings] +config_name = "ignored" +host = "ignored" +port = 9999 + +[conditions] +# reference = "data/chm13v2.0_normalized.mmi" +reference = "fake_mapper" + +[conditions.0] +name = "enrich_chr_20_21" +control = false +min_chunks = 0 # no decision made whenever <= min_chunks have been received from a read +max_chunks = 12 +targets = ["chr20", "chr21"] +single_on = "stop_receiving" +multi_on = "stop_receiving" +single_off = "unblock" +multi_off = "proceed" +no_seq = "proceed" # unclear what it is, does not seem to be used +no_map = "proceed" # if no_map 
happens after mux_chunks were received, the read is rejected \ No newline at end of file diff --git a/usecases/configs/enrich_usecase/full_genome_run/readfish_exp/config_readfishexp/config.toml b/usecases/configs/enrich_usecase/full_genome_run/readfish_exp/config_readfishexp/config.toml new file mode 100644 index 0000000..87c581a --- /dev/null +++ b/usecases/configs/enrich_usecase/full_genome_run/readfish_exp/config_readfishexp/config.toml @@ -0,0 +1,30 @@ +run_dir = "simulator_run" # where reads, logs, pafs etc. will be written to +n_channels = 512 +acceleration_factor = 2 +run_duration = 120000 + +################################################# +# Optional arguments +################################################# + +# reads_file = "nanosim_reads/perfect_reads_seed1_aligned_reads.fasta" +reads_len_range = [12000, 16000] +ref_genome_path = "data/chm13v2.0_normalized.fa.gz" +sim_params_file = "sim_params.dill" +rotating_writeout = true +# mux_scan_period = 5400 # 90 minutes +# mux_scan_duration = 100 # seconds + +# readfish params +readfish_config_file = "configs/readfish_enrich_chr1620.toml" +# readfish_method = "unblock_all" +readfish_method = "targeted_seq" +# readfish_method = "control" + +################################################# +# Parameter extraction arguments +################################################# +seqsum_param_extr_file = "data/20190809_zymo_seqsum.txt" +n_channels_full = 512 +# gap_sampler_type = "sampler_per_window" +gap_sampler_type = "sampler_per_rolling_window_channel" \ No newline at end of file diff --git a/usecases/configs/enrich_usecase/full_genome_run/readfish_exp/config_readfishexp/readfish_enrich_chr1620.toml b/usecases/configs/enrich_usecase/full_genome_run/readfish_exp/config_readfishexp/readfish_enrich_chr1620.toml new file mode 100644 index 0000000..5703403 --- /dev/null +++ b/usecases/configs/enrich_usecase/full_genome_run/readfish_exp/config_readfishexp/readfish_enrich_chr1620.toml @@ -0,0 +1,21 @@ 
+[caller_settings] +config_name = "ignored" +host = "ignored" +port = 9999 + +[conditions] +reference = "data/chm13v2.0_normalized.mmi" +# reference = "fake_mapper" + +[conditions.0] +name = "enrich_chr_16_20" +control = false +min_chunks = 0 # no decision made whenever <= min_chunks have been received from a read +max_chunks = 12 +targets = ["chr16", "chr17", "chr18", "chr19", "chr20"] +single_on = "stop_receiving" +multi_on = "stop_receiving" +single_off = "unblock" +multi_off = "proceed" +no_seq = "proceed" # unclear what it is, does not seem to be used +no_map = "proceed" # if no_map happens after mux_chunks were received, the read is rejected \ No newline at end of file diff --git a/usecases/configs/enrich_usecase/full_genome_run/readfish_exp/config_readfishexp_chr1to8_fakemapper/config.toml b/usecases/configs/enrich_usecase/full_genome_run/readfish_exp/config_readfishexp_chr1to8_fakemapper/config.toml new file mode 100644 index 0000000..532e4c0 --- /dev/null +++ b/usecases/configs/enrich_usecase/full_genome_run/readfish_exp/config_readfishexp_chr1to8_fakemapper/config.toml @@ -0,0 +1,30 @@ +run_dir = "simulator_run" # where reads, logs, pafs etc. 
will be written to +n_channels = 512 +acceleration_factor = 10 +run_duration = 120000 + +################################################# +# Optional arguments +################################################# + +# reads_file = "nanosim_reads/perfect_reads_seed1_aligned_reads.fasta" +reads_len_range = [12000, 16000] +ref_genome_path = "data/chm13v2.0_normalized.fa.gz" +sim_params_file = "sim_params.dill" +rotating_writeout = true +# mux_scan_period = 5400 # 90 minutes +# mux_scan_duration = 100 # seconds + +# readfish params +readfish_config_file = "configs/readfish_enrich_chr1to8.toml" +# readfish_method = "unblock_all" +readfish_method = "targeted_seq" +# readfish_method = "control" + +################################################# +# Parameter extraction arguments +################################################# +seqsum_param_extr_file = "data/20190809_zymo_seqsum.txt" +n_channels_full = 512 +# gap_sampler_type = "sampler_per_window" +gap_sampler_type = "sampler_per_rolling_window_channel" \ No newline at end of file diff --git a/usecases/configs/enrich_usecase/full_genome_run/readfish_exp/config_readfishexp_chr1to8_fakemapper/readfish_enrich_chr1to8.toml b/usecases/configs/enrich_usecase/full_genome_run/readfish_exp/config_readfishexp_chr1to8_fakemapper/readfish_enrich_chr1to8.toml new file mode 100644 index 0000000..e875fd7 --- /dev/null +++ b/usecases/configs/enrich_usecase/full_genome_run/readfish_exp/config_readfishexp_chr1to8_fakemapper/readfish_enrich_chr1to8.toml @@ -0,0 +1,21 @@ +[caller_settings] +config_name = "ignored" +host = "ignored" +port = 9999 + +[conditions] +# reference = "data/chm13v2.0_normalized.mmi" +reference = "fake_mapper" + +[conditions.0] +name = "enrich_chr_1_8" +control = false +min_chunks = 0 # no decision made whenever <= min_chunks have been received from a read +max_chunks = 12 +targets = ["chr1", "chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8"] +single_on = "stop_receiving" +multi_on = "stop_receiving" 
+single_off = "unblock" +multi_off = "proceed" +no_seq = "proceed" # unclear what it is, does not seem to be used +no_map = "proceed" # if no_map happens after mux_chunks were received, the read is rejected \ No newline at end of file diff --git a/usecases/configs/enrich_usecase/full_genome_run/readfish_exp/config_readfishexp_chr9to14_fakemapper/config.toml b/usecases/configs/enrich_usecase/full_genome_run/readfish_exp/config_readfishexp_chr9to14_fakemapper/config.toml new file mode 100644 index 0000000..5e257fb --- /dev/null +++ b/usecases/configs/enrich_usecase/full_genome_run/readfish_exp/config_readfishexp_chr9to14_fakemapper/config.toml @@ -0,0 +1,30 @@ +run_dir = "simulator_run" # where reads, logs, pafs etc. will be written to +n_channels = 512 +acceleration_factor = 10 +run_duration = 120000 + +################################################# +# Optional arguments +################################################# + +# reads_file = "nanosim_reads/perfect_reads_seed1_aligned_reads.fasta" +reads_len_range = [12000, 16000] +ref_genome_path = "data/chm13v2.0_normalized.fa.gz" +sim_params_file = "sim_params.dill" +rotating_writeout = true +# mux_scan_period = 5400 # 90 minutes +# mux_scan_duration = 100 # seconds + +# readfish params +readfish_config_file = "configs/readfish_enrich_chr9to14.toml" +# readfish_method = "unblock_all" +readfish_method = "targeted_seq" +# readfish_method = "control" + +################################################# +# Parameter extraction arguments +################################################# +seqsum_param_extr_file = "data/20190809_zymo_seqsum.txt" +n_channels_full = 512 +# gap_sampler_type = "sampler_per_window" +gap_sampler_type = "sampler_per_rolling_window_channel" \ No newline at end of file diff --git a/usecases/configs/enrich_usecase/full_genome_run/readfish_exp/config_readfishexp_chr9to14_fakemapper/readfish_enrich_chr9to14.toml 
b/usecases/configs/enrich_usecase/full_genome_run/readfish_exp/config_readfishexp_chr9to14_fakemapper/readfish_enrich_chr9to14.toml new file mode 100644 index 0000000..b05315c --- /dev/null +++ b/usecases/configs/enrich_usecase/full_genome_run/readfish_exp/config_readfishexp_chr9to14_fakemapper/readfish_enrich_chr9to14.toml @@ -0,0 +1,21 @@ +[caller_settings] +config_name = "ignored" +host = "ignored" +port = 9999 + +[conditions] +# reference = "data/chm13v2.0_normalized.mmi" +reference = "fake_mapper" + +[conditions.0] +name = "enrich_chr_9_14" +control = false +min_chunks = 0 # no decision made whenever <= min_chunks have been received from a read +max_chunks = 12 +targets = ["chr9", "chr10", "chr11", "chr12", "chr13", "chr14"] +single_on = "stop_receiving" +multi_on = "stop_receiving" +single_off = "unblock" +multi_off = "proceed" +no_seq = "proceed" # unclear what it is, does not seem to be used +no_map = "proceed" # if no_map happens after mux_chunks were received, the read is rejected \ No newline at end of file diff --git a/usecases/configs/enrich_usecase/full_genome_run/readfish_exp/config_readfishexp_control/config.toml b/usecases/configs/enrich_usecase/full_genome_run/readfish_exp/config_readfishexp_control/config.toml new file mode 100644 index 0000000..799aea1 --- /dev/null +++ b/usecases/configs/enrich_usecase/full_genome_run/readfish_exp/config_readfishexp_control/config.toml @@ -0,0 +1,30 @@ +run_dir = "simulator_run" # where reads, logs, pafs etc. 
will be written to +n_channels = 512 +acceleration_factor = 2 +run_duration = 120000 + +################################################# +# Optional arguments +################################################# + +# reads_file = "nanosim_reads/perfect_reads_seed1_aligned_reads.fasta" +reads_len_range = [12000, 16000] +ref_genome_path = "data/chm13v2.0_normalized.fa.gz" +sim_params_file = "sim_params.dill" +rotating_writeout = true +# mux_scan_period = 5400 # 90 minutes +# mux_scan_duration = 100 # seconds + +# readfish params +readfish_config_file = "configs/readfish_enrich_chr1620.toml" +# readfish_method = "unblock_all" +# readfish_method = "targeted_seq" +readfish_method = "control" + +################################################# +# Parameter extraction arguments +################################################# +seqsum_param_extr_file = "data/20190809_zymo_seqsum.txt" +n_channels_full = 512 +# gap_sampler_type = "sampler_per_window" +gap_sampler_type = "sampler_per_rolling_window_channel" \ No newline at end of file diff --git a/usecases/configs/enrich_usecase/full_genome_run/readfish_exp/config_readfishexp_control/readfish_enrich_chr1620.toml b/usecases/configs/enrich_usecase/full_genome_run/readfish_exp/config_readfishexp_control/readfish_enrich_chr1620.toml new file mode 100644 index 0000000..5703403 --- /dev/null +++ b/usecases/configs/enrich_usecase/full_genome_run/readfish_exp/config_readfishexp_control/readfish_enrich_chr1620.toml @@ -0,0 +1,21 @@ +[caller_settings] +config_name = "ignored" +host = "ignored" +port = 9999 + +[conditions] +reference = "data/chm13v2.0_normalized.mmi" +# reference = "fake_mapper" + +[conditions.0] +name = "enrich_chr_16_20" +control = false +min_chunks = 0 # no decision made whenever <= min_chunks have been received from a read +max_chunks = 12 +targets = ["chr16", "chr17", "chr18", "chr19", "chr20"] +single_on = "stop_receiving" +multi_on = "stop_receiving" +single_off = "unblock" +multi_off = "proceed" +no_seq = 
"proceed" # unclear what it is, does not seem to be used +no_map = "proceed" # if no_map happens after mux_chunks were received, the read is rejected \ No newline at end of file diff --git a/usecases/configs/enrich_usecase/full_genome_run/readfish_exp/config_readfishexp_fakemapper/config.toml b/usecases/configs/enrich_usecase/full_genome_run/readfish_exp/config_readfishexp_fakemapper/config.toml new file mode 100644 index 0000000..a73bdbe --- /dev/null +++ b/usecases/configs/enrich_usecase/full_genome_run/readfish_exp/config_readfishexp_fakemapper/config.toml @@ -0,0 +1,30 @@ +run_dir = "simulator_run" # where reads, logs, pafs etc. will be written to +n_channels = 512 +acceleration_factor = 10 +run_duration = 120000 + +################################################# +# Optional arguments +################################################# + +# reads_file = "nanosim_reads/perfect_reads_seed1_aligned_reads.fasta" +reads_len_range = [12000, 16000] +ref_genome_path = "data/chm13v2.0_normalized.fa.gz" +sim_params_file = "sim_params.dill" +rotating_writeout = true +# mux_scan_period = 5400 # 90 minutes +# mux_scan_duration = 100 # seconds + +# readfish params +readfish_config_file = "configs/readfish_enrich_chr1620.toml" +# readfish_method = "unblock_all" +readfish_method = "targeted_seq" +# readfish_method = "control" + +################################################# +# Parameter extraction arguments +################################################# +seqsum_param_extr_file = "data/20190809_zymo_seqsum.txt" +n_channels_full = 512 +# gap_sampler_type = "sampler_per_window" +gap_sampler_type = "sampler_per_rolling_window_channel" \ No newline at end of file diff --git a/usecases/configs/enrich_usecase/full_genome_run/readfish_exp/config_readfishexp_fakemapper/readfish_enrich_chr1620.toml b/usecases/configs/enrich_usecase/full_genome_run/readfish_exp/config_readfishexp_fakemapper/readfish_enrich_chr1620.toml new file mode 100644 index 0000000..1001f7a --- /dev/null 
+++ b/usecases/configs/enrich_usecase/full_genome_run/readfish_exp/config_readfishexp_fakemapper/readfish_enrich_chr1620.toml @@ -0,0 +1,21 @@ +[caller_settings] +config_name = "ignored" +host = "ignored" +port = 9999 + +[conditions] +# reference = "data/chm13v2.0_normalized.mmi" +reference = "fake_mapper" + +[conditions.0] +name = "enrich_chr_16_20" +control = false +min_chunks = 0 # no decision made whenever <= min_chunks have been received from a read +max_chunks = 12 +targets = ["chr16", "chr17", "chr18", "chr19", "chr20"] +single_on = "stop_receiving" +multi_on = "stop_receiving" +single_off = "unblock" +multi_off = "proceed" +no_seq = "proceed" # unclear what it is, does not seem to be used +no_map = "proceed" # if no_map happens after mux_chunks were received, the read is rejected \ No newline at end of file diff --git a/usecases/configs/enrich_usecase/full_genome_run/readfish_exp/config_readfishexp_fakemapper_control/config.toml b/usecases/configs/enrich_usecase/full_genome_run/readfish_exp/config_readfishexp_fakemapper_control/config.toml new file mode 100644 index 0000000..7b7cc26 --- /dev/null +++ b/usecases/configs/enrich_usecase/full_genome_run/readfish_exp/config_readfishexp_fakemapper_control/config.toml @@ -0,0 +1,30 @@ +run_dir = "simulator_run" # where reads, logs, pafs etc. 
will be written to +n_channels = 512 +acceleration_factor = 10 +run_duration = 120000 + +################################################# +# Optional arguments +################################################# + +# reads_file = "nanosim_reads/perfect_reads_seed1_aligned_reads.fasta" +reads_len_range = [12000, 16000] +ref_genome_path = "data/chm13v2.0_normalized.fa.gz" +sim_params_file = "sim_params.dill" +rotating_writeout = true +# mux_scan_period = 5400 # 90 minutes +# mux_scan_duration = 100 # seconds + +# readfish params +readfish_config_file = "configs/readfish_enrich_chr1620.toml" +# readfish_method = "unblock_all" +# readfish_method = "targeted_seq" +readfish_method = "control" + +################################################# +# Parameter extraction arguments +################################################# +seqsum_param_extr_file = "data/20190809_zymo_seqsum.txt" +n_channels_full = 512 +# gap_sampler_type = "sampler_per_window" +gap_sampler_type = "sampler_per_rolling_window_channel" \ No newline at end of file diff --git a/usecases/configs/enrich_usecase/full_genome_run/readfish_exp/config_readfishexp_fakemapper_control/readfish_enrich_chr1620.toml b/usecases/configs/enrich_usecase/full_genome_run/readfish_exp/config_readfishexp_fakemapper_control/readfish_enrich_chr1620.toml new file mode 100644 index 0000000..1001f7a --- /dev/null +++ b/usecases/configs/enrich_usecase/full_genome_run/readfish_exp/config_readfishexp_fakemapper_control/readfish_enrich_chr1620.toml @@ -0,0 +1,21 @@ +[caller_settings] +config_name = "ignored" +host = "ignored" +port = 9999 + +[conditions] +# reference = "data/chm13v2.0_normalized.mmi" +reference = "fake_mapper" + +[conditions.0] +name = "enrich_chr_16_20" +control = false +min_chunks = 0 # no decision made whenever <= min_chunks have been received from a read +max_chunks = 12 +targets = ["chr16", "chr17", "chr18", "chr19", "chr20"] +single_on = "stop_receiving" +multi_on = "stop_receiving" +single_off = "unblock" 
+multi_off = "proceed" +no_seq = "proceed" # unclear what it is, does not seem to be used +no_map = "proceed" # if no_map happens after mux_chunks were received, the read is rejected \ No newline at end of file diff --git a/usecases/configs/enrich_usecase/full_genome_run/readfish_exp/config_readfishexp_realreads/config.toml b/usecases/configs/enrich_usecase/full_genome_run/readfish_exp/config_readfishexp_realreads/config.toml new file mode 100644 index 0000000..64d3652 --- /dev/null +++ b/usecases/configs/enrich_usecase/full_genome_run/readfish_exp/config_readfishexp_realreads/config.toml @@ -0,0 +1,38 @@ +run_dir = "simulator_run" # where reads, logs, pafs etc. will be written to +n_channels = 512 +acceleration_factor = 5 +run_duration = 108000 +# acceleration_factor = 5 +# run_duration = 120000 +# run_duration = 72000 +# run_duration = 720 # todo +# run_duration = 200 # todo + +################################################# +# Optional arguments +################################################# + +# reads_file = "nanosim_reads/perfect_reads_seed1_aligned_reads.fasta" +reads_file = "data/nanosim_reads/human_genome_med15000_alignedrate2" +# reads_file = "data/nanosim_reads/human_genome_med15000" +# reads_file = "data/nanosim_reads/human_genome_few" #todo +# reads_len_range = [12000, 16000] +ref_genome_path = "data/chm13v2.0_normalized.fa.gz" +sim_params_file = "sim_params.dill" # todo +rotating_writeout = true +# mux_scan_period = 5400 # 90 minutes +# mux_scan_duration = 100 # seconds + +# readfish params +readfish_config_file = "configs/readfish_enrich_per_quadrant.toml" +# readfish_method = "unblock_all" +readfish_method = "targeted_seq" +# readfish_method = "control" + +################################################# +# Parameter extraction arguments +################################################# +seqsum_param_extr_file = "data/20190809_zymo_seqsum.txt" +n_channels_full = 512 +# gap_sampler_type = "sampler_per_window" +gap_sampler_type = 
"sampler_per_rolling_window_channel" \ No newline at end of file diff --git a/usecases/configs/enrich_usecase/full_genome_run/readfish_exp/config_readfishexp_realreads/readfish_enrich_per_quadrant.toml b/usecases/configs/enrich_usecase/full_genome_run/readfish_exp/config_readfishexp_realreads/readfish_enrich_per_quadrant.toml new file mode 100644 index 0000000..2fa0382 --- /dev/null +++ b/usecases/configs/enrich_usecase/full_genome_run/readfish_exp/config_readfishexp_realreads/readfish_enrich_per_quadrant.toml @@ -0,0 +1,60 @@ +[caller_settings] +config_name = "ignored" +host = "ignored" +port = 9999 + +[conditions] +reference = "data/chm13v2.0_normalized.mmi" +# reference = "fake_mapper" + +[conditions.0] +name = "control" +control = true +min_chunks = 0 # no decision made whenever <= min_chunks have been received from a read +max_chunks = 12 +targets = [] +single_on = "stop_receiving" +multi_on = "stop_receiving" +single_off = "unblock" +multi_off = "proceed" +no_seq = "proceed" # unclear what it is, does not seem to be used +no_map = "proceed" # if no_map happens after mux_chunks were received, the read is rejected + +[conditions.1] +name = "enrich_chr_1_8" +control = false +min_chunks = 0 # no decision made whenever <= min_chunks have been received from a read +max_chunks = 12 +targets = ["chr1", "chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8"] +single_on = "stop_receiving" +multi_on = "stop_receiving" +single_off = "unblock" +multi_off = "proceed" +no_seq = "proceed" # unclear what it is, does not seem to be used +no_map = "proceed" # if no_map happens after mux_chunks were received, the read is rejected + +[conditions.2] +name = "enrich_chr_9_14" +control = false +min_chunks = 0 # no decision made whenever <= min_chunks have been received from a read +max_chunks = 12 +targets = ["chr9", "chr10", "chr11", "chr12", "chr13", "chr14"] +single_on = "stop_receiving" +multi_on = "stop_receiving" +single_off = "unblock" +multi_off = "proceed" +no_seq = 
"proceed" # unclear what it is, does not seem to be used +no_map = "proceed" # if no_map happens after mux_chunks were received, the read is rejected + +[conditions.3] +name = "enrich_chr_16_20" +control = false +min_chunks = 0 # no decision made whenever <= min_chunks have been received from a read +max_chunks = 12 +targets = ["chr16", "chr17", "chr18", "chr19", "chr20"] +single_on = "stop_receiving" +multi_on = "stop_receiving" +single_off = "unblock" +multi_off = "proceed" +no_seq = "proceed" # unclear what it is, does not seem to be used +no_map = "proceed" # if no_map happens after mux_chunks were received, the read is rejected \ No newline at end of file diff --git a/usecases/configs/enrich_usecase/full_genome_run/sampler_per_window/config.toml b/usecases/configs/enrich_usecase/full_genome_run/sampler_per_window/config.toml index 5026a1a..edbb315 100644 --- a/usecases/configs/enrich_usecase/full_genome_run/sampler_per_window/config.toml +++ b/usecases/configs/enrich_usecase/full_genome_run/sampler_per_window/config.toml @@ -1,8 +1,9 @@ run_dir = "simulator_run" # where reads, logs, pafs etc. 
will be written to n_channels = 512 acceleration_factor = 10 -# run_duration = 200 -run_duration = 15000 +# acceleration_factor = 1 +# run_duration = 100 +run_duration = 2000 # run_duration = 86400.0 ################################################# @@ -12,7 +13,7 @@ run_duration = 15000 # reads_file = "nanosim_reads/perfect_reads_seed1_aligned_reads.fasta" ref_genome_path = "data/chm13v2.0_normalized.fa.gz" sim_params_file = "sim_params.dill" -rotating = true +rotating_writeout = true # mux_scan_period = 5400 # 90 minutes # mux_scan_duration = 100 # seconds @@ -20,6 +21,7 @@ rotating = true readfish_config_file = "configs/readfish_enrich_chr2021.toml" # readfish_method = "unblock_all" readfish_method = "targeted_seq" +# readfish_method = "control" ################################################# # Parameter extraction arguments diff --git a/usecases/configs/enrich_usecase/full_genome_run/sampler_per_window/readfish_enrich_chr2021.toml b/usecases/configs/enrich_usecase/full_genome_run/sampler_per_window/readfish_enrich_chr2021.toml index cc0fcf5..727125a 100644 --- a/usecases/configs/enrich_usecase/full_genome_run/sampler_per_window/readfish_enrich_chr2021.toml +++ b/usecases/configs/enrich_usecase/full_genome_run/sampler_per_window/readfish_enrich_chr2021.toml @@ -16,6 +16,6 @@ targets = ["chr20", "chr21"] single_on = "stop_receiving" multi_on = "stop_receiving" single_off = "unblock" -multi_off = "unblock" +multi_off = "proceed" no_seq = "proceed" # unclear what it is, does not seem to be used no_map = "proceed" # if no_map happens after mux_chunks were received, the read is rejected \ No newline at end of file diff --git a/usecases/configs/enrich_usecase/test_config/sampler_per_window/config.toml b/usecases/configs/enrich_usecase/test_config/sampler_per_window/config.toml index b9d0a0a..15a4fe3 100644 --- a/usecases/configs/enrich_usecase/test_config/sampler_per_window/config.toml +++ b/usecases/configs/enrich_usecase/test_config/sampler_per_window/config.toml 
@@ -10,7 +10,7 @@ run_duration = 200 # reads_file = "nanosim_reads/perfect_reads_seed1_aligned_reads.fasta" ref_genome_path = "data/chm13v2.0_normalized.fa.gz" sim_params_file = "sim_params.dill" -rotating = true +rotating_writeout = true # mux_scan_period = 5400 # 90 minutes # mux_scan_duration = 100 # seconds diff --git a/usecases/configs/enrich_usecase/test_config/sampler_per_window/readfish_enrich_chr2021.toml b/usecases/configs/enrich_usecase/test_config/sampler_per_window/readfish_enrich_chr2021.toml index cc0fcf5..727125a 100644 --- a/usecases/configs/enrich_usecase/test_config/sampler_per_window/readfish_enrich_chr2021.toml +++ b/usecases/configs/enrich_usecase/test_config/sampler_per_window/readfish_enrich_chr2021.toml @@ -16,6 +16,6 @@ targets = ["chr20", "chr21"] single_on = "stop_receiving" multi_on = "stop_receiving" single_off = "unblock" -multi_off = "unblock" +multi_off = "proceed" no_seq = "proceed" # unclear what it is, does not seem to be used no_map = "proceed" # if no_map happens after mux_chunks were received, the read is rejected \ No newline at end of file diff --git a/usecases/create_nanosim_reads.ipynb b/usecases/create_nanosim_reads.ipynb index 76b6480..9e77699 100644 --- a/usecases/create_nanosim_reads.ipynb +++ b/usecases/create_nanosim_reads.ipynb @@ -7,21 +7,25 @@ "## Generate reads from the reference\n", "\n", "We generate reads with NanoSim.\n", - "We first extract the NanoSim read error model to some directory. This is only necessary once." + "We first extract the NanoSim read error model to some directory. This is only necessary once.\n", + "\n", + "**Rather take a look at `generate_nanosim_reads.sh`.**" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 2, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "The autoreload extension is already loaded. 
To reload it, use:\n", - " %reload_ext autoreload\n" - ] + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -37,66 +41,62 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "n_procs = 4\n", - "perfect = True\n", - "use_slurm = True\n", + "perfect = False\n", + "use_slurm = False\n", "on_cluster = False\n", "nanosim_dir = Path(\"external/ont_nanosim/\")\n", "nanosim_model_dir = Path(\"runs/nanosim_models\")\n", "nanosim_model_prefix = nanosim_model_dir / \"human_NA12878_DNA_FAB49712_guppy/training\"\n", "reads_output_dir = \"runs/enrich_usecase/nanosim_reads\"\n", - "ref_genome_path = \"runs/enrich_usecase/data/chm13v2.0_normalized1000000firsttwo.fa.gz\"\n", + "# ref_genome_path = \"runs/enrich_usecase/data/chm13v2.0_normalized1000000firsttwo.fa.gz\"\n", + "ref_genome_path = \"runs/enrich_usecase/data/chm13v2.0_normalized.fa.gz\"\n", "\n", "assert nanosim_dir.exists(), \"move to the repo root repository\"" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "x human_NA12878_DNA_FAB49712_guppy/\n", - "x human_NA12878_DNA_FAB49712_guppy/training_unaligned_length.pkl\n", - "x human_NA12878_DNA_FAB49712_guppy/training_reads_alignment_rate\n", - "x human_NA12878_DNA_FAB49712_guppy/training_model_profile\n", - "x human_NA12878_DNA_FAB49712_guppy/training_aligned_region.pkl\n", - "x human_NA12878_DNA_FAB49712_guppy/training_first_match.hist\n", - "x human_NA12878_DNA_FAB49712_guppy/training_strandness_rate\n", - "x human_NA12878_DNA_FAB49712_guppy/training_gap_length.pkl\n", - "x human_NA12878_DNA_FAB49712_guppy/training_error_markov_model\n", - "x human_NA12878_DNA_FAB49712_guppy/training_aligned_reads.pkl\n", - "x human_NA12878_DNA_FAB49712_guppy/training_chimeric_info\n", - "x 
human_NA12878_DNA_FAB49712_guppy/training_ht_ratio.pkl\n", - "x human_NA12878_DNA_FAB49712_guppy/training_match_markov_model\n", - "x human_NA12878_DNA_FAB49712_guppy/training_ht_length.pkl\n", - "x human_NA12878_DNA_FAB49712_guppy/training_error_rate.tsv\n" + "mkdir: cannot create directory ‘runs/nanosim_models’: File exists\n", + "tar -xvzf external/ont_nanosim/pre-trained_models/human_NA12878_DNA_FAB49712_guppy.tar.gz -C runs/nanosim_models\n" ] } ], "source": [ "# only necessary once\n", - "!mkdir runs/nanosim_models\n", - "!tar -xvzf external/ont_nanosim/pre-trained_models/human_NA12878_DNA_FAB49712_guppy.tar.gz -C \"{nanosim_model_dir}\"" + "!mkdir {nanosim_model_dir}\n", + "# !tar -xvzf external/ont_nanosim/pre-trained_models/human_NA12878_DNA_FAB49712_guppy.tar.gz -C \"{nanosim_model_dir}\"\n", + "!echo tar -xvzf external/ont_nanosim/pre-trained_models/human_NA12878_DNA_FAB49712_guppy.tar.gz -C \"{nanosim_model_dir}\"" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "2023-07-27 20:20:19,776 - Dry run, so not executing the command:\n", + "2024-02-26 14:20:44,568 - Dry run, so not executing the command:\n", "#!/usr/bin/bash\n", "seed=1\n", "conda run -n nanosim python -c \"import HTSeq; print(HTSeq.__version__)\"\n", @@ -105,19 +105,19 @@ "conda run -n nanosim \\\n", " python \"external/ont_nanosim/src/simulator.py\" genome \\\n", " --model_prefix \"runs/nanosim_models/human_NA12878_DNA_FAB49712_guppy/training\" \\\n", - " --ref_g \"runs/enrich_usecase/data/chm13v2.0_normalized1000000firsttwo.fa.gz\" \\\n", + " --ref_g \"runs/enrich_usecase/data/chm13v2.0_normalized.fa.gz\" \\\n", " -dna_type linear \\\n", - " --output \"runs/enrich_usecase/nanosim_reads/perfect_reads_seed$seed\" \\\n", - " --number 10 \\\n", + " --output 
\"runs/enrich_usecase/nanosim_reads/reads_seed$seed\" \\\n", + " --number 1000 \\\n", " --seed \"$seed\" \\\n", " --strandness 0.5 \\\n", " --basecaller guppy \\\n", " --aligned_rate \"100%\" \\\n", " --num_threads \"4\" \\\n", - " --perfect \\\n", + " \\\n", " --no_error_profile \\\n", " --no_flanking\n", - " #; exit --- 999482901.py:12 () WARNING ##\n" + " #; exit --- 2829796575.py:15 () WARNING ##\n" ] } ], @@ -125,12 +125,15 @@ "from simreaduntil.shared_utils.utils import print_cmd_and_run\n", "from simreaduntil.usecase_helpers.utils import get_gen_nanosim_reads_cmd\n", "\n", - "if on_cluster:\n", - " n_reads_per_sim = 1_000_000\n", - "else: \n", - " # n_reads_per_sim = 160_000\n", - " n_reads_per_sim = 10\n", - " use_slurm = False\n", + "# if on_cluster:\n", + "# n_reads_per_sim = 1_000_000\n", + "# else: \n", + "# # n_reads_per_sim = 160_000\n", + "# n_reads_per_sim = 10\n", + "# use_slurm = False\n", + "\n", + "n_reads_per_sim = 1_000\n", + "# n_reads_per_sim = 100_000\n", " \n", "nanosim_command = get_gen_nanosim_reads_cmd(nanosim_dir, nanosim_model_prefix, ref_genome_path, reads_dir=reads_output_dir, n_reads_per_sim=n_reads_per_sim, perfect=perfect, use_slurm=use_slurm)\n", "print_cmd_and_run(nanosim_command, dry=True)\n", diff --git a/usecases/enrich_usecase.py b/usecases/enrich_usecase.py index 29c1fc1..1c743f5 100644 --- a/usecases/enrich_usecase.py +++ b/usecases/enrich_usecase.py @@ -2,6 +2,7 @@ Combines SimReadUntil with ReadFish This script shows how to combine the SimReadUntil with ReadFish. +It creates the output in the current directory. It first learns a gap sampler from an existing run and saves it. Then, it runs the simulator in combination with ReadFish. on perfect reads generated from a reference genome, or from a reads file (if the config is adapted, e.g. NanoSim reads). 
@@ -41,30 +42,66 @@ import sys import warnings import numpy as np +import pandas as pd import toml from simreaduntil.shared_utils.debugging_helpers import is_test_mode -from simreaduntil.shared_utils.logging_utils import add_comprehensive_stream_handler_to_logger, print_logging_levels, setup_logger_simple +from simreaduntil.shared_utils.logging_utils import add_comprehensive_stream_handler_to_logger, logging_output_formatter, print_logging_levels, setup_logger_simple from simreaduntil.shared_utils.plotting import filter_seaborn_warnings -from simreaduntil.shared_utils.tee_stdouterr import TeeStdouterr -from simreaduntil.shared_utils.utils import delete_dir_if_exists, dill_dump, dill_load, print_cmd_and_run +# from simreaduntil.shared_utils.tee_stdouterr import TeeStdouterr +from simreaduntil.shared_utils.utils import delete_dir_if_exists, dill_dump, dill_load, print_cmd_and_run, tee_stdouterr_to_file from simreaduntil.simulator.utils import set_package_log_level from simreaduntil.usecase_helpers import simulator_with_readfish -from simreaduntil.usecase_helpers.utils import create_simparams_if_inexistent, get_gap_sampler_method, plot_condor_log_file_metrics +from simreaduntil.usecase_helpers.utils import create_simparams_if_inexistent, get_gap_sampler_method, plot_log_file_metrics from simreaduntil.usecase_helpers.utils import create_figures logger = setup_logger_simple(__name__) add_comprehensive_stream_handler_to_logger(None) set_package_log_level(logging.INFO).__enter__() -print_logging_levels() logging.getLogger(__name__).setLevel(logging.DEBUG) # logging.getLogger().setLevel(logging.DEBUG) # warnings from everywhere, not desired +file_handler = logging.FileHandler("log.txt", mode="a") # append in case we are just running the plotting part of the script +logging_output_formatter(file_handler) +logging.getLogger(None).addHandler(file_handler) +print_logging_levels() # import warnings # warnings.filterwarnings("error") filter_seaborn_warnings() +def 
create_minimap_index_if_inexistent(): + if sim_config["readfish_method"] != "unblock_all": + readfish_config = toml.load(sim_config["readfish_config_file"]) + mmi_filename = readfish_config["conditions"]["reference"] + if mmi_filename == "fake_mapper": + logger.info(f"Skipping minimap2 index creation, using fake wrapper") + return + + mmi_filename = Path(mmi_filename) + if mmi_filename.exists(): + logger.info(f"Minimap2 index '{mmi_filename}' already exists, skipping minimap2 index creation") + else: + logger.debug(f"Creating minimap2 index at location '{mmi_filename}' for ReadFish from reference genome '{ref_genome_path}'") + assert ref_genome_path is not None + print_cmd_and_run(f"""minimap2 -d {mmi_filename} {ref_genome_path}""") + else: + logger.debug("Skipping minimap2 index (not needed)") + +def run_readfish_simulation(): + logger.debug(f"#################################################################") + logger.debug(f"#################################################################") + logger.debug(f"Running the simulation from config file '{sim_config_file}' with ReadFish config file '{sim_config['readfish_config_file']}'") + delete_dir_if_exists(run_dir, ask=ask_dir_deletion) + # with set_package_log_level(logging.INFO): + # print_logging_levels() + seqsum_file = simulator_with_readfish.main(sim_config_file) + assert Path(seqsum_file).exists() + logger.debug(f"#################################################################") + logger.debug(f"#################################################################") + + return seqsum_file + ################################ ## PARAMS ################################ @@ -86,10 +123,8 @@ ################################ sim_config = toml.load(sim_config_file) -run_dir = Path(sim_config["run_dir"]) -# TeeStdouterr(run_dir / "stdouterr.txt").redirect() logger.debug(f"Read in simulation config file '{sim_config_file}'") - +run_dir = Path(sim_config["run_dir"]) ref_genome_path = sim_config.get("ref_genome_path", 
None) sim_params_filename = Path(sim_config["sim_params_file"]) if "sim_params_file" in sim_config else None seqsum_param_extr_file = Path(sim_config["seqsum_param_extr_file"]) if "seqsum_param_extr_file" in sim_config else None @@ -103,38 +138,6 @@ logger.info(f"""Loading ReadFish config file with content:\n{Path(sim_config["readfish_config_file"]).read_text()}""") logger.info("#"*80) -def create_minimap_index_if_inexistent(): - if sim_config["readfish_method"] != "unblock_all": - readfish_config = toml.load(sim_config["readfish_config_file"]) - mmi_filename = readfish_config["conditions"]["reference"] - if mmi_filename == "fake_mapper": - logger.info(f"Skipping minimap2 index creation, using fake wrapper") - return - - mmi_filename = Path(mmi_filename) - if mmi_filename.exists(): - logger.info(f"Minimap2 index '{mmi_filename}' already exists, skipping minimap2 index creation") - else: - logger.debug(f"Creating minimap2 index at location '{mmi_filename}' for ReadFish from reference genome '{ref_genome_path}'") - assert ref_genome_path is not None - print_cmd_and_run(f"""minimap2 -d {mmi_filename} {ref_genome_path}""") - else: - logger.debug("Skipping minimap2 index (not needed)") - -def run_readfish_simulation(): - logger.debug(f"#################################################################") - logger.debug(f"#################################################################") - logger.debug(f"Running the simulation from config file '{sim_config_file}' with ReadFish config file '{sim_config['readfish_config_file']}'") - delete_dir_if_exists(run_dir, ask=ask_dir_deletion) - # with set_package_log_level(logging.INFO): - # print_logging_levels() - seqsum_file = simulator_with_readfish.main(sim_config_file) - assert Path(seqsum_file).exists() - logger.debug(f"#################################################################") - logger.debug(f"#################################################################") - - return seqsum_file - # comment out as needed 
create_minimap_index_if_inexistent() # comment this out if you want to use minimap2 to align to a reference if sim_params_filename is None: @@ -149,11 +152,32 @@ def run_readfish_simulation(): delete_dir_if_exists(figure_dir, ask=ask_dir_deletion) figure_dir.mkdir(exist_ok=True) -plot_condor_log_file_metrics(figure_dir) -create_figures( - seqsum_filename, run_dir=run_dir, figure_dir=figure_dir, - ref_genome_path=ref_genome_path, cov_thresholds=[1, 2, 3, 4], - group_to_units={"target": toml.load(sim_config["readfish_config_file"])["conditions"]["0"]["targets"]}, -) +file_handler.flush() # logger writes to stderr +plot_log_file_metrics(file_handler.baseFilename, save_dir=figure_dir) + +readfish_conditions = [v for v in toml.load(sim_config["readfish_config_file"])["conditions"].values() if isinstance(v, dict)] +channel_assignments_toml = run_dir / "channels.toml" +channel_assignments_per_cond = toml.load(channel_assignments_toml) +channels_per_condition = {condition_dict["name"]: condition_dict["channels"] for condition_dict in channel_assignments_per_cond["conditions"].values()} + +logger.debug(f"Reading sequencing summary file '{seqsum_filename}'") +full_seqsum_df = pd.read_csv(seqsum_filename, sep="\t")#, nrows=100) # todo +logger.debug(f"Done reading sequencing summary file '{seqsum_filename}'") + +for condition in readfish_conditions: + condition_name = condition["name"] + subchannels = channels_per_condition[condition_name] + logger.info(f"Creating figures for condition '{condition_name}' with subchannels {subchannels}") + + partial_seqsum_df = full_seqsum_df[full_seqsum_df["channel"].isin([f"ch{i}" for i in subchannels])] + create_figures( + partial_seqsum_df, run_dir=run_dir, figure_dir=figure_dir / ("condition_" + condition_name), + ref_genome_path=ref_genome_path, cov_thresholds=[1, 2, 3, 4], + group_to_units={"target": condition["targets"]}, + ) + + logger.info(f"Done creating figures for condition '{condition_name}'") + + # break # todo 
logger.debug(f"Done with usecase script") \ No newline at end of file diff --git a/usecases/enrich_usecase_submission.sh b/usecases/enrich_usecase_submission.sh index 84e13f3..89611ed 100755 --- a/usecases/enrich_usecase_submission.sh +++ b/usecases/enrich_usecase_submission.sh @@ -1,15 +1,28 @@ #!/usr/bin/env bash -# it seems 2 CPUs are fine based on condor log average resource usage -##CONDOR request_cpus=2 +# it seems 2 CPUs are fine based on condor log average resource usage, simforward,muxscan,stopthread +##CONDOR request_cpus=4 # takes about 8GB of memory -##CONDOR request_memory=32000 +##CONDOR request_memory=64000 ##CONDOR request_disk=100G +##CONDOR +JobBatchName = "ont_enrich_usecase" ##CONDOR log = /home/mmordig/joblogs/job-$(ClusterId)-$(ProcId).log ##CONDOR output = /home/mmordig/joblogs/job-$(ClusterId)-$(ProcId).out ##CONDOR error = /home/mmordig/joblogs/job-$(ClusterId)-$(ProcId).err +#SBATCH --job-name=enrich_usecase-%j +#SBATCH --error=/cluster/home/mmordig/joblogs/job-%j.err +#SBATCH --output=/cluster/home/mmordig/joblogs/job-%j.out +#SBATCH --mem=16G +#SBATCH --cpus-per-task=4 +## #SBATCH --time=00:10:00 +#SBATCH --time=2:00:00 +# not avail #SBATCH --tmp=10G +#SBATCH --partition=compute + + # launch_condor_job 20 --- ~/ont_project_all/ont_project/usecases/enrich_usecase_submission.sh +# sbatch ~/ont_project_all/ont_project/usecases/enrich_usecase_submission.sh echo "Content of job ad file $_CONDOR_JOB_AD:"; cat "$_CONDOR_JOB_AD" echo "Starting job with args: " "$@" @@ -19,25 +32,31 @@ source ~/.bashrc cd ~/ont_project_all/ont_project/ source ~/ont_project_all/ont_project_venv/bin/activate +set -ex export PATH=~/ont_project_all/tools/bin:$PATH && which minimap2 -set -ex +output_dir=${1:-full_genome_run_sampler_per_window} +config_rel_dir=${2:-sampler_per_window} cd runs/enrich_usecase -rm -rf full_genome_run_sampler_per_window -mkdir full_genome_run_sampler_per_window -cd full_genome_run_sampler_per_window -ln -s ../data . 
-ln -s ../configs/full_genome_run/sampler_per_window configs +rm -rf "$output_dir" +mkdir -p "$output_dir" +ln -s "$(pwd)"/data "$output_dir" +cp -rL configs/full_genome_run/"${config_rel_dir}" "$output_dir"/configs # -L: expand symlinks +cd "$output_dir" +pwd python ~/ont_project_all/ont_project/usecases/enrich_usecase.py # symlink job output files to directory # parse stderr from job ad file # Err = "/home/mmordig/joblogs/job-13951206-0.err" if [ -n "$_CONDOR_JOB_AD" ]; then - grep -oP '(?<=Err = ").*(?=")' "$_CONDOR_JOB_AD" | xargs -I {} ln -s {} . - grep -oP '(?<=Out = ").*(?=")' "$_CONDOR_JOB_AD" | xargs -I {} ln -s {} . - grep -oP '(?<=Log = ").*(?=")' "$_CONDOR_JOB_AD" | xargs -I {} ln -s {} . + grep -oP '(?<=Err = ").*(?=")' "$_CONDOR_JOB_AD" | xargs -I {} cp {} . + grep -oP '(?<=Out = ").*(?=")' "$_CONDOR_JOB_AD" | xargs -I {} cp {} . + grep -oP '(?<=Log = ").*(?=")' "$_CONDOR_JOB_AD" | xargs -I {} cp {} . + # grep -oP '(?<=Err = ").*(?=")' "$_CONDOR_JOB_AD" | xargs -I {} ln -s {} . + # grep -oP '(?<=Out = ").*(?=")' "$_CONDOR_JOB_AD" | xargs -I {} ln -s {} . + # grep -oP '(?<=Log = ").*(?=")' "$_CONDOR_JOB_AD" | xargs -I {} ln -s {} . 
fi echo "Done with job, pwd $(pwd)" diff --git a/usecases/gen_example_sim_plot.py b/usecases/gen_example_sim_plot.py new file mode 100644 index 0000000..56d2734 --- /dev/null +++ b/usecases/gen_example_sim_plot.py @@ -0,0 +1,48 @@ +""" +Generate an example plot for the simulator with 2 channels (included in the paper) +""" + +import numpy as np +from simreaduntil.simulator.gap_sampling.constant_gaps_until_blocked import ConstantGapsUntilBlocked +from simreaduntil.simulator.readpool import ReadPoolFromIterable +from simreaduntil.simulator.readswriter import ArrayReadsWriter +from simreaduntil.simulator.simulator import ONTSimulator +from simreaduntil.simulator.simulator_params import SimParams + +sim_params = SimParams( + gap_samplers={f"channel_{i}": ConstantGapsUntilBlocked(short_gap_length=0.4, long_gap_length=1.5, prob_long_gap=0.5, time_until_blocked=8.1, read_delay=0) for i in range(2)}, + bp_per_second=10, min_chunk_size=4, default_unblock_duration=0.8, seed=0, +) + +rng = np.random.default_rng(0) +def reads_gen(): + for i in range(8): + l = rng.integers(1, 3)*10 + yield f"read{i}", "A" * l + +read_pool = ReadPoolFromIterable(reads_gen()) +simulator = ONTSimulator( + read_pool=read_pool, + reads_writer=ArrayReadsWriter(), + sim_params = sim_params, + output_dir="", +) +simulator.save_elems = True + +simulator.sync_start(0) +simulator.sync_forward(2) +simulator._channels[1].unblock() +simulator.sync_forward(5.5) +# simulator._channels[0].cur_elem. 
+simulator._channels[0].unblock() +simulator.sync_forward(8) +simulator.run_mux_scan(2, is_sync=True) +simulator.sync_forward(13) + +ax = simulator.plot_channels()#; import matplotlib.pyplot as plt; plt.show() +ax.figure.tight_layout() +ax.set_ylim([-0.2, 1.2]) +ax.autoscale() + +simulator.sync_stop() +ax.figure.savefig("simulator_example.png", dpi=300) \ No newline at end of file diff --git a/usecases/generate_nanosim_reads.sh b/usecases/generate_nanosim_reads.sh new file mode 100755 index 0000000..fff3beb --- /dev/null +++ b/usecases/generate_nanosim_reads.sh @@ -0,0 +1,98 @@ +#!/usr/bin/env bash + +# note: aligned and unaligned reads are not shuffled here because they are shuffled by the ReadPoolFromFile +# more efficient to run with 2 cpus and run many in parallel + +##CONDOR request_cpus=2 +##CONDOR request_memory=6000 +##CONDOR request_disk=2G +##CONDOR +JobBatchName = "ont_enrich_usecase" +##CONDOR log = /home/mmordig/joblogs/job-$(ClusterId)-$(ProcId).log +##CONDOR output = /home/mmordig/joblogs/job-$(ClusterId)-$(ProcId).out +##CONDOR error = /home/mmordig/joblogs/job-$(ClusterId)-$(ProcId).err +# seems to be broken/filesystem very slow +##CONDOR Requirements = (Machine != "g110.internal.cluster.is.localnet") + +# bash ~/ont_project_all/ont_project/usecases/generate_nanosim_reads.sh 1000 3 +# conda activate nanosim +# sbatch ~/ont_project_all/ont_project/usecases/generate_nanosim_reads.sh 100000 3 +# or use ##CONDOR queue 100 together with $(Item) +# for seed in range(1, 100+1): +# print(f"sbatch ~/ont_project_all/ont_project/usecases/generate_nanosim_reads.sh 100000 {seed}") +# +# initially +# mkdir -p runs/nanosim_models +# echo tar -xvzf external/ont_nanosim/pre-trained_models/human_NA12878_DNA_FAB49712_guppy.tar.gz -C "runs/nanosim_models" + +# set -x +# source ~/.bashrc +# conda hell: conda not found, not sure why, so hardcoding python executable from conda env + +# for python, samtools; conda activate not really working (need to copy entire env) 
+export PATH=/home/mmordig/tools/mambaforge/envs/nanosim/bin:$PATH
+# export PATH=/home/mmordig/miniforge3/envs/nanosim/bin:$PATH
+
+set -eux # abort on errors and on unset variables, trace every command
+
+cd ~/ont_project_all/ont_project/
+
+num_reads=${1:?usage: $0 NUM_READS SEED} # number of reads to simulate (passed to --number)
+seed=${2:?usage: $0 NUM_READS SEED} # random seed (passed to --seed), also part of the output file prefix
+
+# output_dir=runs/data/nanosim_reads/human_genome_med15000
+output_dir=runs/data/nanosim_reads/human_genome_med15000_alignedrate2
+# output_dir=runs/data/nanosim_reads/human_genome
+# output_dir=runs/data/nanosim_reads/human_genome_with_flanking
+mkdir -p "$output_dir"
+num_procs=$(nproc)
+# ((num_procs--)) # 1 manager process, not really needed
+# num_procs=1 #todo
+
+# genome=runs/data/random_genome.fasta # see below for how to generate
+genome="runs/enrich_usecase/data/chm13v2.0_normalized.fa.gz"
+# aligned_rate="100%"
+aligned_rate="2"
+
+echo "nanosim read generation: generating ${num_reads} using seed $seed using $num_procs threads from genome '$genome' with aligned_rate '$aligned_rate' into output_dir '$output_dir'"
+
+echo "Generating reads"
+# conda slow to run, instead use "conda activate nanosim" once and then launch the script several times
+# conda run -n nanosim python \
+rm -f "$output_dir/reads_seed${seed}_"* # remove outputs of a previous run of this seed only; the trailing "_" keeps seed 1 from also deleting the files of seeds 10-19 and 100; a real rm failure aborts because stale files would be merged below
+# in NanoSim, replaced by uniform distribution now because median length was unreliable, sd=6.9=ln(1000) is the std of the lognormal (lognormal = distribution whose log is normally distributed with stddeviation std); NOTE(review): the call below passes -sd 2000, not 6.9 -- confirm which value is intended
+python \
+    "external/ont_nanosim/src/simulator.py" genome \
+    --model_prefix "runs/nanosim_models/human_NA12878_DNA_FAB49712_guppy/training" \
+    --ref_g "$genome" \
+    -dna_type linear \
+    -med 15000 -max 20000 -min 400 -sd 2000 \
+    --output "$output_dir/reads_seed$seed" \
+    --number "${num_reads}" \
+    --seed "$seed" \
+    --strandness 0.5 \
+    --basecaller guppy \
+    --aligned_rate "$aligned_rate" \
+    --num_threads "$num_procs" \
+    --no_flanking \
+    --no_error_profile
+
+echo "Merging files"
+# merge 'reads_seed1_aligned_reads.fasta', 'reads_seed1_unaligned_reads.fasta' into 'reads_seed1_merged_reads.fasta'
+files=("$output_dir/reads_seed${seed}_"*) # glob into an array instead of parsing ls; expanded before the merged file exists, so the merged file is never its own input
+merged_file="$output_dir/reads_seed${seed}_merged_reads.fasta"
+# note: aligned and unaligned reads are not shuffled here because they are shuffled by the ReadPoolFromFile
+cat "${files[@]}" > "$merged_file"
+rm "${files[@]}"
+
+echo "Generating .fai file"
+samtools faidx "$merged_file"
+
+echo "nanosim read generation: done with seed $seed"
+
+
+# # generate small fake genome and write to file using pysam
+# from simreaduntil.shared_utils.dna import get_random_DNA_seq
+# from Bio import SeqIO
+# from Bio.Seq import Seq
+# with open("runs/data/random_genome.fasta", "w") as fasta:
+#     SeqIO.write((SeqIO.SeqRecord(id=f"fakechr_{i}", seq=Seq(get_random_DNA_seq(1_000_000))) for i in range(20)), fasta, "fasta")
\ No newline at end of file
diff --git a/usecases/install_usecase_deps.sh b/usecases/install_usecase_deps.sh index 47fe9ca..ec053af 100755 --- a/usecases/install_usecase_deps.sh +++ b/usecases/install_usecase_deps.sh @@ -16,6 +16,9 @@ trap on_exit ERR tools_dir=~/ont_project_all/tools conda_or_mamba="conda" # very slow +# if mamba is available, use mamba +which mamba && conda_or_mamba="mamba" +echo "Using $conda_or_mamba for conda environment creation" usage() { echo "Usage: $0 [-h] [-e ] [-t ]" @@ -50,10 +53,6 @@ echo "The current base directory to install minimap to is: $tools_dir" # echo "Updated path to: $tools_dir" # fi -# if mamba is available, use mamba -which mamba && conda_or_mamba="mamba" -echo "Using $conda_or_mamba for conda environment creation" - # check we are in the right directory by checking for a directory "external" [ -d "external" ] || (echo "Error: not in the right directory. Run this script from the ont_project root directory containing the external directory"; exit 1) @@ -94,6 +93,7 @@ echo "Installed minimap2 to location: $(which minimap2)" echo "Make sure to add this to your PATH variable, e.g."
echo "export \"PATH=$tools_dir/bin:\$PATH\"" +# exit 0 #################################################### # install NanoSim conda env diff --git a/usecases/replicate_run.py b/usecases/replicate_run.py index d174b88..a8eeed1 100644 --- a/usecases/replicate_run.py +++ b/usecases/replicate_run.py @@ -32,7 +32,7 @@ from simreaduntil.shared_utils.utils import delete_dir_if_exists, dill_dump, dill_load, num_lines_in_file, subset_dict from simreaduntil.simulator.gap_sampling.inactive_active_gaps_replication import get_read_durations_per_channel from simreaduntil.simulator.simfasta_to_seqsum import convert_simfasta_dir_to_seqsum, convert_simfasta_to_seqsum -from simreaduntil.simulator.simulator import assign_read_durations_to_channels, run_simulator_from_sampler_per_channel, run_simulator_from_sampler_per_channel_parallel, simulator_stats_to_disk +from simreaduntil.simulator.simulator import assign_read_durations_to_channels, run_simulator_from_sampler_per_channel, run_simulator_from_sampler_per_channel_parallel, write_simulator_stats from simreaduntil.simulator.simulator_params import SimParams from simreaduntil.simulator.utils import set_package_log_level from simreaduntil.usecase_helpers.utils import get_cleaned_seqsum_filename, create_figures, create_simparams_if_inexistent, get_gap_sampler_method, remove_mux_scans_and_clean_if_inexistent @@ -116,7 +116,7 @@ def run_simulator(seqsum_filename): logger.debug(f"#################################################################") logger.info("Saving simulator statistics") - simulator_stats_to_disk([simulator for (simulator, _) in simulators_and_read_filenames], output_dir=run_dir) + write_simulator_stats([simulator for (simulator, _) in simulators_and_read_filenames], output_dir=run_dir) logger.info(f"Writing sequencing summary file '{seqsum_filename}'") convert_simfasta_dir_to_seqsum(reads_dir, seqsummary_filename=seqsum_filename) logger.info("Wrote sequencing summary file") diff --git 
a/usecases/replicate_run_submission.sh b/usecases/replicate_run_submission.sh index 29479dd..9a0e07b 100755 --- a/usecases/replicate_run_submission.sh +++ b/usecases/replicate_run_submission.sh @@ -4,6 +4,7 @@ # takes about 8GB of memory ##CONDOR request_memory=32G ##CONDOR request_disk=100G +##CONDOR +JobBatchName = "ont_replicate_run" ##CONDOR log = /home/mmordig/joblogs/job-$(ClusterId)-$(ProcId).log ##CONDOR output = /home/mmordig/joblogs/job-$(ClusterId)-$(ProcId).out ##CONDOR error = /home/mmordig/joblogs/job-$(ClusterId)-$(ProcId).err @@ -25,9 +26,9 @@ source ~/.bashrc cd ~/ont_project_all/ont_project/ source ~/ont_project_all/ont_project_venv/bin/activate +set -ex export PATH=~/ont_project_all/tools/bin:$PATH && which minimap2 -set -ex cd runs/run_replication method=$1