Skip to content
This repository has been archived by the owner on Jan 20, 2022. It is now read-only.

Commit

Permalink
Low hanging fruit optimization of cluster file parsing (#65)
Browse files Browse the repository at this point in the history
  • Loading branch information
morsecodist authored Dec 14, 2020
1 parent a1d21c8 commit a922fa8
Show file tree
Hide file tree
Showing 3 changed files with 9 additions and 27 deletions.
6 changes: 3 additions & 3 deletions short-read-mngs/idseq-dag/idseq_dag/steps/nonhost_fastq.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import os

from typing import Dict, Optional, Sequence, Set, Tuple
from typing import Dict, Optional, Sequence, Set, List, Tuple

import idseq_dag.util.command as command
import idseq_dag.util.command_patterns as command_patterns
Expand Down Expand Up @@ -33,7 +33,7 @@ def run_with_tax_ids(
self,
tax_ids: Optional[Set[int]],
filename: Optional[str],
clusters_dict: Dict[str, Tuple] = None,
clusters_dict: Dict[str, List] = None,
) -> None:
assert (tax_ids and filename) or not (
tax_ids or filename), 'Must be supplied with tax_ids and filename or neither'
Expand Down Expand Up @@ -135,7 +135,7 @@ def extract_header_from_line(line: str) -> Tuple[int, str, Set[int]]:
def generate_nonhost_headers(
self,
nonhost_fasta_file: str,
clusters_dict: Dict[str, Tuple] = None,
clusters_dict: Dict[str, List] = None,
tax_ids: Set[int] = None
):
# This var is only needed when tax_ids, because tax_id
Expand Down
20 changes: 1 addition & 19 deletions short-read-mngs/idseq-dag/idseq_dag/util/count.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import gzip
import multiprocessing
from enum import Enum
from subprocess import run, PIPE

Expand Down Expand Up @@ -98,7 +97,7 @@ def get_read_cluster_size(duplicate_cluster_sizes, read_id):
return cluster_size


def _load_duplicate_cluster_sizes_work(filename):
def load_duplicate_cluster_sizes(filename):
duplicate_cluster_sizes = {}
with open(filename, "r") as f:
for line in f:
Expand All @@ -107,28 +106,11 @@ def _load_duplicate_cluster_sizes_work(filename):
return duplicate_cluster_sizes


# Loading cluster sizes can be expensive prior to subsampling (for some exceptionally large
# samples with over 100 million reads). To ameliorate this cost, we make sure it is only
# paid once per stage (not once per step).
_DUPLICATE_CLUSTER_SIZES_CACHE = {}
_DUPLICATE_CLUSTER_SIZES_LOCK = multiprocessing.RLock()


def load_duplicate_cluster_sizes(filename):
with _DUPLICATE_CLUSTER_SIZES_LOCK:
if filename not in _DUPLICATE_CLUSTER_SIZES_CACHE:
_DUPLICATE_CLUSTER_SIZES_CACHE[filename] = _load_duplicate_cluster_sizes_work(filename)
return _DUPLICATE_CLUSTER_SIZES_CACHE[filename]


def save_duplicate_cluster_sizes(filename, duplicate_clusters):
    """Write duplicate cluster sizes to a TSV file.

    Each output line has the form ``<cluster_size>TAB<read_id>``, where
    cluster_size is the first element of that read's cluster entry
    (see parse_clusters_file, which builds entries as [size, read_id, ...]).

    :param filename: path of the TSV file to create or overwrite
    :param duplicate_clusters: dict mapping read id -> cluster entry whose
        first element is the cluster size
    """
    with open(filename, "w") as tsv:
        for read_id, clusters in duplicate_clusters.items():
            # The cluster entry's first element is the size; the remaining
            # elements (member read ids) are not persisted here.
            cluster_size = clusters[0]
            tsv.write(f"{cluster_size}\t{read_id}\n")


def reads_in_group(file_group, max_fragments=None, cluster_sizes=None, cluster_key=None):
Expand Down
10 changes: 5 additions & 5 deletions short-read-mngs/idseq-dag/idseq_dag/util/idseq_dedup_clusters.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,19 @@
a cluster, and the second column contains the read id.
"""
from csv import DictReader
from typing import Dict, Optional, Tuple
from typing import Dict, Optional, List


def parse_clusters_file(
    idseq_dedup_clusters_path: str,
) -> Dict[str, Optional[List]]:
    """Parse an idseq-dedup clusters CSV into a per-cluster summary dict.

    The input CSV has the columns "representative read id" and "read id";
    each row assigns a read to the cluster of its representative read.

    Returns a dict keyed by representative read id. Each value is a list
    whose first element is the cluster size (number of rows seen for that
    representative) followed by the read ids of the second and subsequent
    rows; the first row's read id is not stored (presumably it is the
    representative itself — confirm against the file producer).

    Entries are plain lists mutated in place rather than tuples rebuilt per
    row, so parsing stays linear for very large cluster files.
    """
    clusters_dict: Dict[str, Optional[List]] = {}
    with open(idseq_dedup_clusters_path) as f:
        for row in DictReader(f):
            r_read_id, read_id = row["representative read id"], row["read id"]
            if r_read_id not in clusters_dict:
                # First row for this representative: record only the count.
                clusters_dict[r_read_id] = [1]
            else:
                # Subsequent rows: bump the count and remember the member id.
                clusters_dict[r_read_id][0] += 1
                clusters_dict[r_read_id].append(read_id)
    return clusters_dict

0 comments on commit a922fa8

Please sign in to comment.