From 89c55b002b8fc46a206f9e2e93bdcbe2929517b1 Mon Sep 17 00:00:00 2001 From: maxibor Date: Wed, 24 Jul 2024 17:47:22 +0200 Subject: [PATCH] feat: switch print to logging --- pydamage/__init__.py | 4 ++++ pydamage/damage.py | 52 +++++++++++--------------------------------- pydamage/filter.py | 10 ++++----- pydamage/main.py | 17 ++++++++------- pydamage/rescale.py | 1 - pydamage/utils.py | 6 ++--- tests/test_damage.py | 4 ++-- 7 files changed, 35 insertions(+), 59 deletions(-) diff --git a/pydamage/__init__.py b/pydamage/__init__.py index 6dce1fe..ca279e7 100644 --- a/pydamage/__init__.py +++ b/pydamage/__init__.py @@ -1,2 +1,6 @@ __version__ = "0.80" from pydamage.main import pydamage_analyze +import logging + +logging.basicConfig(level=logging.INFO, format="%(message)s") + diff --git a/pydamage/damage.py b/pydamage/damage.py index 2cabd00..9a8248f 100644 --- a/pydamage/damage.py +++ b/pydamage/damage.py @@ -3,10 +3,11 @@ import pysam from pydamage.parse_damage import damage_al from pydamage.model_fit import fit_models +from pydamage.contig_stats import get_contig_stats, get_ref_stats from pydamage import models import numpy as np -from numba import jit from tqdm import tqdm +import logging def sort_count_array_dict(int_array): """Sorts and counts unique values in an array @@ -62,7 +63,7 @@ def get_damage(self, show_al): self.no_mut = [] self.read_dict = {self.reference: dict()} if self.reference is None: - iterator = tqdm(self.alignments) + iterator = tqdm(self.alignments, desc="Compute damage for entire reference") else: iterator = self.alignments for al in iterator: @@ -146,20 +147,7 @@ def compute_damage(self): ) -@jit(parallel=True) -def avg_coverage_contig(pysam_cov): - """Computes average coverage of a reference - Args: - pysam_cov (np.array): Four dimensional array of coverage for each base - Returns: - float: mean coverage of reference - """ - return ( - np.mean( - np.sum(pysam_cov, axis=0) - ) - ) def check_model_fit(model_dict, wlen, verbose): @@ -175,7 +163,7 @@ def check_model_fit(model_dict, wlen, verbose): # Check that no fitted parameters or stdev are infinite if np.isinf(np.array(model_dict["model_params"])).any(): if verbose: - print(f"Unreliable model fit for {model_dict['reference']}") + logging.warning(f"Unreliable model fit for {model_dict['reference']}") return False return model_dict @@ -200,27 +188,11 @@ def test_damage(ref, bam, mode, wlen, g2a, subsample, show_al, process, verbose) al_handle = pysam.AlignmentFile(bam, mode=mode, threads=process) try: if ref is None: - all_references = al_handle.references - print("Computing coverage") - cov = avg_coverage_contig( - np.concatenate( - [np.array(al_handle.count_coverage(contig=ref), dtype="uint16") for ref in tqdm(all_references)], - axis=1 - ) - ) - print("Getting number of reads aligned") - nb_reads_aligned = np.sum( - [al_handle.count(contig=ref) for ref in tqdm(all_references)] - ) - print("Getting reference length") - reflen = np.sum( - [al_handle.get_reference_length(ref) for ref in tqdm(all_references)] - ) + logging.info("Computing alignment stats for entire reference") + reflen, cov, nb_reads_aligned = get_ref_stats(bam) refname = "reference" else: - cov = avg_coverage_contig(np.array(al_handle.count_coverage(contig=ref), dtype="uint16")) - nb_reads_aligned = al_handle.count(contig=ref) - reflen = al_handle.get_reference_length(ref) + reflen, cov, nb_reads_aligned = get_contig_stats(al_handle, ref) refname = ref if subsample: @@ -242,6 +214,8 @@ def test_damage(ref, bam, mode, wlen, g2a, subsample, show_al, process, verbose) all_damage, ) = al.compute_damage() + al_handle.close() + model_A = models.damage_model() model_B = models.null_model() test_res = fit_models( @@ -266,15 +240,15 @@ def test_damage(ref, bam, mode, wlen, g2a, subsample, show_al, process, verbose) GA_log[f"GtoA-{i}"] = GA_damage[i] test_res.update(CT_log) test_res.update(GA_log) - return (check_model_fit(test_res, wlen, verbose), read_dict) except (ValueError, RuntimeError) as e: if verbose: - print(f"Model fitting for {ref} failed") - print(f"Model fitting error: {e}") - print( + logging.warning(f"Model fitting for {ref} failed") + logging.warning(f"Model fitting error: {e}") + logging.warning( f"nb_reads_aligned: {nb_reads_aligned} - coverage: {cov}" f" - reflen: {reflen}\n" ) + al_handle.close() return False diff --git a/pydamage/filter.py b/pydamage/filter.py index 7071555..dd8ef88 100644 --- a/pydamage/filter.py +++ b/pydamage/filter.py @@ -3,7 +3,7 @@ from pydamage.utils import df_to_csv from pandas import read_csv import os - +import logging def define_threshold(pydam_df, min_knee=0.5, alpha=0.05): """Find kneedle point in PyDamage results @@ -32,8 +32,6 @@ def define_threshold(pydam_df, min_knee=0.5, alpha=0.05): direction="decreasing", online=True, ) - print(thresholds) - print(nb_contigs) return kneedle.knee @@ -63,13 +61,13 @@ def apply_filter(csv, threshold, outdir, alpha=0.05): outfile = "pydamage_filtered_results.csv" if threshold == 0: threshold = define_threshold(df) - print(f"Optimal prediction accuracy threshold found to be: {threshold}") + logging.info(f"Optimal prediction accuracy threshold found to be: {threshold}") filt_df = filter_pydamage_results(df, acc_thresh=threshold) - print( + logging.info( f"Filtering PyDamage results with qvalue <= {alpha} and predicted_accuracy >= {threshold}" ) if not os.path.exists(outdir): os.mkdir(outdir) df_to_csv(filt_df, outdir, outfile) - print(f"Filtered PyDamage results written to {outdir}/{outfile}") + logging.info(f"Filtered PyDamage results written to {outdir}/{outfile}") return filt_df diff --git a/pydamage/main.py b/pydamage/main.py index f64cff2..6540de3 100644 --- a/pydamage/main.py +++ b/pydamage/main.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -import pysam from pydamage import utils import multiprocessing from functools import partial @@ -10,11 +9,11 @@ from pydamage.rescale import rescale_bam from pydamage.models import glm_model_params import os -import sys from tqdm import tqdm import warnings from pydamage import __version__ from collections import ChainMap +import logging def pydamage_analyze( bam, @@ -58,7 +57,7 @@ def pydamage_analyze( raise ValueError("Cannot use subsample and rescale together") if verbose: - print(f"Pydamage version {__version__}\n") + logging.info(f"Pydamage version {__version__}\n") utils.makedir(outdir, force=force) refs, mode = utils.prepare_bam(bam, minlen=minlen) @@ -98,7 +97,7 @@ def pydamage_analyze( verbose=verbose, ) if group: - print("Estimating and testing Damage") + logging.info("Estimating and testing Damage") filt_res, read_dict = damage.test_damage( ref=None, bam=bam, @@ -133,14 +132,15 @@ def pydamage_analyze( ###################### ###################### - print(f"{len(filt_res)} contig(s) analyzed by Pydamage") + if not group: + logging.info(f"{len(filt_res)} contig(s) analyzed by Pydamage") if len(filt_res) == 0: warnings.warn( "No alignments were found, check your alignment file", PyDamageWarning ) if plot and len(filt_res) > 0: - print("\nGenerating Pydamage plots") + logging.info("Generating Pydamage plots") plotdir = f"{outdir}/plots" utils.makedir(plotdir, confirm=False) @@ -153,7 +153,8 @@ def pydamage_analyze( df_glm = glm_predict(prep_df_glm, glm_model_params) df = df_glm.merge(df_pydamage, left_index=True, right_index=True) - + utils.df_to_csv(df, outdir) + if rescale: rescale_bam( bam=bam, @@ -165,5 +166,5 @@ def pydamage_analyze( outname=os.path.join(outdir, "pydamage_rescaled.bam"), threads=process, ) - utils.df_to_csv(df, outdir) + return df diff --git a/pydamage/rescale.py b/pydamage/rescale.py index 62a1f80..0bcc0ab 100644 --- a/pydamage/rescale.py +++ b/pydamage/rescale.py @@ -86,7 +86,6 @@ def rescale_bam( "CL": " ".join(sys.argv), } ) - print(hd) refs = al.references with pysam.AlignmentFile(outname, "wb", threads=threads, header=hd) as out: for ref in tqdm(refs, desc="Rescaling quality scores"): diff --git a/pydamage/utils.py b/pydamage/utils.py index 4c445d7..a4ec1cc 100644 --- a/pydamage/utils.py +++ b/pydamage/utils.py @@ -6,7 +6,7 @@ import pandas as pd from typing import Tuple from pysam import AlignmentFile - +import logging def check_extension(filename: str) -> str: """Check alignment file format to give correct open mode @@ -38,7 +38,7 @@ def makedir(dirpath: str, confirm: bool = True, force: bool = False): """ if os.path.exists(dirpath): if confirm and force is False: - print( + logging.warning( f"Result directory, {dirpath}, already exists, it will be overwritten" ) if input("Do You Want To Continue? (y|n) ").lower() != "y": @@ -240,7 +240,7 @@ def prepare_bam(bam: str, minlen: int) -> Tuple[Tuple, str]: alf = AlignmentFile(bam, mode) if not alf.has_index(): - print( + logging.error( f"BAM file {bam} has no index. Sort BAM file and provide index " "before running pydamage." ) diff --git a/tests/test_damage.py b/tests/test_damage.py index 2f0a3dd..ba47edd 100644 --- a/tests/test_damage.py +++ b/tests/test_damage.py @@ -26,10 +26,10 @@ def test_al_to_damage(bamfile): def test_avg_coverage(): - from pydamage.damage import avg_coverage_contig + from pydamage.contig_stats import compute_coverage_sum cov = np.array([[1, 4, 5, 3], [2, 5, 6, 7], [5, 6, 3, 1], [2, 4, 8, 1]], np.int32) - assert avg_coverage_contig(cov) == 15.75 + assert compute_coverage_sum(cov) == 63 def test_check_model_fit():