Skip to content

Commit

Permalink
Improve generic parser: automatic full directory processing and compressed files
Browse files Browse the repository at this point in the history

On branch master
	modified:   .gitignore
	modified:   README.md
	modified:   rcf
	modified:   recentrifuge/__init__.py
	modified:   recentrifuge/generic.py
	modified:   setup.py
  • Loading branch information
khyox committed Sep 24, 2024
1 parent ef4d0cc commit 6e5502b
Show file tree
Hide file tree
Showing 6 changed files with 46 additions and 11 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -64,3 +64,4 @@ rcf_test/
build/
dist/
emg/
.vscode
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@


____
[![Retest](https://github.com/khyox/Recentrifuge/actions/workflows/retest.yaml/badge.svg?branch=v1.14.1)](https://github.com/khyox/recentrifuge/actions/workflows/retest.yaml)
[![Retest](https://github.com/khyox/Recentrifuge/actions/workflows/retest.yaml/badge.svg?branch=v1.15.0)](https://github.com/khyox/recentrifuge/actions/workflows/retest.yaml)
[![](https://img.shields.io/maintenance/yes/2024.svg)](http://www.recentrifuge.org)
[![](https://img.shields.io/github/languages/top/khyox/recentrifuge.svg)](https://pypi.org/project/recentrifuge/)
[![](https://img.shields.io/pypi/pyversions/recentrifuge.svg)](https://pypi.org/project/recentrifuge/)
Expand Down
12 changes: 8 additions & 4 deletions rcf
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ from recentrifuge.config import STR_CONTROL, STR_EXCLUSIVE, STR_SHARED
from recentrifuge.config import STR_CONTROL_SHARED
from recentrifuge.config import gray, red, green, yellow, blue, magenta
from recentrifuge.core import process_rank, summarize_analysis
from recentrifuge.generic import GenericFormat
from recentrifuge.generic import GenericFormat, select_generic_inputs
from recentrifuge.kraken import select_kraken_inputs
from recentrifuge.krona import COUNT, UNASSIGNED, SCORE
from recentrifuge.krona import KronaTree
Expand Down Expand Up @@ -156,8 +156,10 @@ def main():
metavar='FILE',
type=Filename,
help=('output file from a generic classifier; it requires the flag'
' --format (see such option for details); multiple -g is '
'available to include several generic samples')
' --format (see such option for details); if a single '
'directory is entered, every file inside will be taken as a '
'different sample; multiple -g is available to include '
'several generic samples by filename')
)
parser_filein.add_argument(
'-l', '--lmat',
Expand Down Expand Up @@ -188,7 +190,7 @@ def main():
help=('Kraken output files; if a single directory is entered, '
'every .krk file inside will be taken as a different sample;'
' multiple -k is available to include several Kraken '
'(version 1 or 2) samples')
'(version 1 or 2) samples by filename')
)
parser_out = parser.add_argument_group(
'output', 'Related to the Recentrifuge output files')
Expand Down Expand Up @@ -400,6 +402,8 @@ def main():
'generic classifier; using GENERIC.')
scoring = Scoring.GENERIC
input_files = generics
if len(generics) == 1 and os.path.isdir(generics[0]):
select_generic_inputs(generics)
elif krakens:
classifier = Classifier.KRAKEN
process = process_output
Expand Down
4 changes: 2 additions & 2 deletions recentrifuge/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@
__email__ = 'jse.mnl **AT** gmail.com'
__maintainer__ = 'Jose Manuel Marti'
__status__ = 'Production/Stable'
__date__ = 'Jul 2024'
__version__ = '1.14.1'
__date__ = 'Sep 2024'
__version__ = '1.15.0'

import sys
from Bio import SeqIO
Expand Down
36 changes: 33 additions & 3 deletions recentrifuge/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,11 @@

import collections as col
import io
import os
from enum import Enum
from math import log10
from statistics import mean
from typing import Tuple, Counter, Dict, List, Set
from typing import Tuple, Counter, Dict, List, Set, Any, Union, TextIO, IO

from recentrifuge.config import Filename, Id, Score, Scoring
from recentrifuge.config import gray, red, green, yellow, blue
Expand Down Expand Up @@ -106,6 +107,20 @@ def __str__(self):
f'LEN:{self.len}, SCO:{self.sco}, UNC:{self.unc}.')


def open_compressed_and_uncompressed(filename: Filename
                                     ) -> Union[TextIO, IO[Any]]:
    """Open a generic output file in text mode, decompressing if needed.

    The compression scheme is chosen from the file extension alone (the
    content is not inspected). The comparison is case-insensitive, so
    'sample.GZ' is handled like 'sample.gz'.

    Args:
        filename: path of the file to open.

    Returns:
        A text-mode, read-only file object: gzip for '.gz', bzip2 for
        '.bz2', LZMA for '.xz', and the builtin open() for anything else.
        The caller is responsible for closing it (e.g. using 'with').

    Raises:
        OSError: if the file cannot be opened.
    """
    ext = os.path.splitext(filename)[1].lower()
    # The compression modules are imported lazily so that only the one
    # actually required by the input gets loaded.
    if ext == '.gz':
        import gzip
        return gzip.open(filename, mode='rt')
    if ext == '.bz2':
        import bz2
        return bz2.open(filename, mode='rt')
    if ext == '.xz':
        import lzma
        return lzma.open(filename, mode='rt')
    return open(filename, mode='rt')


def read_generic_output(output_file: Filename,
scoring: Scoring = Scoring.GENERIC,
minscore: Score = None,
Expand Down Expand Up @@ -141,7 +156,7 @@ def read_generic_output(output_file: Filename,
raise Exception(red('\nERROR!'),
'Missing GenericFormat when reading a generic output.')
try:
with open(output_file, 'r') as file:
with open_compressed_and_uncompressed(output_file) as file:
# Main loop processing each file line
for raw_line in file:
raw_line = raw_line.strip(' \n\t')
Expand Down Expand Up @@ -221,7 +236,7 @@ def read_generic_output(output_file: Filename,
+ f'Cannot read any sequence from "{output_file}"')
filt_seqs: int = sum([len(scores) for scores in all_scores.values()])
if filt_seqs == 0:
raise Exception(red('\nERROR! ') + 'No sequence passed the filter!')
print(yellow('Warning!'), f'{output_file}: No seq passed the filter!')
# Get statistics
stat: SampleStats = SampleStats(
minscore=minscore, nt_read=nt_read, lens=all_length, scores=all_scores,
Expand Down Expand Up @@ -268,3 +283,18 @@ def read_generic_output(output_file: Filename,
f'Generic: Unsupported Scoring "{scoring}"')
# Return
return output.getvalue(), stat, counts, out_scores


def select_generic_inputs(generics: List[Filename]) -> None:
    """Replace a directory entry with the sorted list of files inside it.

    The first element of 'generics' is taken as a directory name. The list
    is then rewritten IN PLACE (the caller keeps using the same object)
    with one entry per regular file found directly inside that directory.
    Hidden entries (name starting with a dot) and sub-directories are
    skipped, as they cannot be read as samples. The resulting list is
    sorted and printed.

    Args:
        generics: list whose first element is the directory to scan; it
            is overwritten with the selected filenames.

    Raises:
        IndexError: if 'generics' is empty.
        OSError: if the directory cannot be scanned; in that case
            'generics' is left untouched.
    """
    dir_name = generics[0]
    # Treat '.', './' and equivalent spellings alike, so sample names in
    # the working directory never start with just the dot
    in_cwd: bool = os.path.normpath(dir_name) == os.curdir
    with os.scandir(dir_name) as dir_entries:
        found: List[Filename] = [
            Filename(entry.name if in_cwd
                     else os.path.join(dir_name, entry.name))
            for entry in dir_entries
            if not entry.name.startswith('.') and entry.is_file()
        ]
    found.sort()
    # Slice assignment keeps the identity of the list the caller holds
    generics[:] = found
    print(gray('Generic files to analyze:'), generics)
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

setup(
name='recentrifuge',
version='1.14.1',
version='1.15.0',
packages=['recentrifuge'],
url='http://www.recentrifuge.org',
license='AGPL except krona.js, with its own license by BNBI',
Expand Down

0 comments on commit 6e5502b

Please sign in to comment.