Skip to content
This repository has been archived by the owner on Jan 20, 2022. It is now read-only.

Commit

Permalink
Low hanging fruit optimization of cluster file parsing (#65)
Browse files Browse the repository at this point in the history
  • Loading branch information
morsecodist authored Dec 14, 2020
1 parent a1d21c8 commit a922fa8
Show file tree
Hide file tree
Showing 3 changed files with 9 additions and 27 deletions.
6 changes: 3 additions & 3 deletions short-read-mngs/idseq-dag/idseq_dag/steps/nonhost_fastq.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import os

from typing import Dict, Optional, Sequence, Set, Tuple
from typing import Dict, Optional, Sequence, Set, List, Tuple

import idseq_dag.util.command as command
import idseq_dag.util.command_patterns as command_patterns
Expand Down Expand Up @@ -33,7 +33,7 @@ def run_with_tax_ids(
self,
tax_ids: Optional[Set[int]],
filename: Optional[str],
clusters_dict: Dict[str, Tuple] = None,
clusters_dict: Dict[str, List] = None,
) -> None:
assert (tax_ids and filename) or not (
tax_ids or filename), 'Must be supplied with tax_ids and filename or neither'
Expand Down Expand Up @@ -135,7 +135,7 @@ def extract_header_from_line(line: str) -> Tuple[int, str, Set[int]]:
def generate_nonhost_headers(
self,
nonhost_fasta_file: str,
clusters_dict: Dict[str, Tuple] = None,
clusters_dict: Dict[str, List] = None,
tax_ids: Set[int] = None
):
# This var is only needed when tax_ids, because tax_id
Expand Down
20 changes: 1 addition & 19 deletions short-read-mngs/idseq-dag/idseq_dag/util/count.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import gzip
import multiprocessing
from enum import Enum
from subprocess import run, PIPE

Expand Down Expand Up @@ -98,7 +97,7 @@ def get_read_cluster_size(duplicate_cluster_sizes, read_id):
return cluster_size


def _load_duplicate_cluster_sizes_work(filename):
def load_duplicate_cluster_sizes(filename):
duplicate_cluster_sizes = {}
with open(filename, "r") as f:
for line in f:
Expand All @@ -107,28 +106,11 @@ def _load_duplicate_cluster_sizes_work(filename):
return duplicate_cluster_sizes


# Loading cluster sizes can be expensive prior to subsampling (for some exceptionally large
# samples with over 100 million reads). To ameliorate this cost, we make sure it is only
# paid once per stage (not once per step).
_DUPLICATE_CLUSTER_SIZES_CACHE = {}
_DUPLICATE_CLUSTER_SIZES_LOCK = multiprocessing.RLock()


def load_duplicate_cluster_sizes(filename):
with _DUPLICATE_CLUSTER_SIZES_LOCK:
if filename not in _DUPLICATE_CLUSTER_SIZES_CACHE:
_DUPLICATE_CLUSTER_SIZES_CACHE[filename] = _load_duplicate_cluster_sizes_work(filename)
return _DUPLICATE_CLUSTER_SIZES_CACHE[filename]


def save_duplicate_cluster_sizes(filename, duplicate_clusters):
    """Write duplicate cluster sizes to a TSV file.

    Each output line has the form ``<cluster_size>TAB<read_id>``, where
    cluster_size is the first element of that read's cluster entry
    (see parse_clusters_file, which builds entries as [size, read_id, ...]).

    :param filename: path of the TSV file to create or overwrite
    :param duplicate_clusters: dict mapping read id -> cluster entry whose
        first element is the cluster size
    """
    with open(filename, "w") as tsv:
        for read_id, clusters in duplicate_clusters.items():
            # The cluster entry's first element is the size; the remaining
            # elements (member read ids) are not persisted here.
            cluster_size = clusters[0]
            tsv.write(f"{cluster_size}\t{read_id}\n")


def reads_in_group(file_group, max_fragments=None, cluster_sizes=None, cluster_key=None):
Expand Down
10 changes: 5 additions & 5 deletions short-read-mngs/idseq-dag/idseq_dag/util/idseq_dedup_clusters.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,19 @@
a cluster, and the second column contains the read id.
"""
from csv import DictReader
from typing import Dict, Optional, Tuple
from typing import Dict, Optional, List


def parse_clusters_file(
    idseq_dedup_clusters_path: str,
) -> Dict[str, Optional[List]]:
    """Parse an idseq-dedup clusters CSV into a per-cluster summary dict.

    The input CSV has the columns "representative read id" and "read id";
    each row assigns a read to the cluster of its representative read.

    Returns a dict keyed by representative read id. Each value is a list
    whose first element is the cluster size (number of rows seen for that
    representative) followed by the read ids of the second and subsequent
    rows; the first row's read id is not stored (presumably it is the
    representative itself — confirm against the file producer).

    Entries are plain lists mutated in place rather than tuples rebuilt per
    row, so parsing stays linear for very large cluster files.
    """
    clusters_dict: Dict[str, Optional[List]] = {}
    with open(idseq_dedup_clusters_path) as f:
        for row in DictReader(f):
            r_read_id, read_id = row["representative read id"], row["read id"]
            if r_read_id not in clusters_dict:
                # First row for this representative: record only the count.
                clusters_dict[r_read_id] = [1]
            else:
                # Subsequent rows: bump the count and remember the member id.
                clusters_dict[r_read_id][0] += 1
                clusters_dict[r_read_id].append(read_id)
    return clusters_dict

0 comments on commit a922fa8

Please sign in to comment.