Fix dataset processing (#22)
* update general paths in scripts

* update process_datasets

* lower default n_obs_limit

* update submodule

* update changelog

* refactor subsample

* fix typo
KaiWaldrant authored Oct 2, 2024
1 parent f5021bb commit bfa2730
Showing 8 changed files with 37 additions and 31 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -44,6 +44,8 @@
 
 * Fix paths in scripts (PR #18).
 
+* Subsample datasets by batch if batch is defined (PR #22).
+
 ## transfer from openproblems-v2 repository
 
 ### NEW FUNCTIONALITY
2 changes: 1 addition & 1 deletion common (submodule pointer update)
2 changes: 1 addition & 1 deletion scripts/run_benchmark/run_full_local.sh
@@ -30,7 +30,7 @@ publish_dir: "$publish_dir"
 HERE
 
 # run the benchmark
-nextflow run openproblems-bio/task_template \
+nextflow run openproblems-bio/task_denoising \
   --revision build/main \
   -main-script target/nextflow/workflows/run_benchmark/main.nf \
   -profile docker \
6 changes: 3 additions & 3 deletions scripts/run_benchmark/run_full_seqeracloud.sh
@@ -10,11 +10,11 @@ set -e
 
 # generate a unique id
 RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)"
-publish_dir="s3://openproblems-data/resources/denoising/results/${RUN_ID}"
+publish_dir="s3://openproblems-data/resources/task_denoising/results/${RUN_ID}"
 
 # write the parameters to file
 cat > /tmp/params.yaml << HERE
-input_states: s3://openproblems-data/resources/denoising/datasets/**/state.yaml
+input_states: s3://openproblems-data/resources/task_denoising/datasets/**/state.yaml
 rename_keys: 'input_train:output_train;input_test:output_test'
 output_state: "state.yaml"
 publish_dir: "$publish_dir"
@@ -29,4 +29,4 @@ tw launch https://github.com/openproblems-bio/task_denoising.git \
   --params-file /tmp/params.yaml \
   --entry-name auto \
   --config common/nextflow_helpers/labels_tw.config \
-  --labels denoising,full
+  --labels task_denoising,full
6 changes: 3 additions & 3 deletions scripts/run_benchmark/run_test_seqeracloud.sh
@@ -8,8 +8,8 @@ cd "$REPO_ROOT"
 
 set -e
 
-resources_test_s3=s3://openproblems-data/resources_test/denoising
-publish_dir_s3="s3://openproblems-nextflow/temp/results/denoising/$(date +%Y-%m-%d_%H-%M-%S)"
+resources_test_s3=s3://openproblems-data/resources_test/task_denoising
+publish_dir_s3="s3://openproblems-nextflow/temp/results/task_denoising/$(date +%Y-%m-%d_%H-%M-%S)"
 
 # write the parameters to file
 cat > /tmp/params.yaml << HERE
@@ -28,4 +28,4 @@ tw launch https://github.com/openproblems-bio/task_denoising.git \
   --compute-env 6TeIFgV5OY4pJCk8I0bfOh \
   --params-file /tmp/params.yaml \
   --config common/nextflow_helpers/labels_tw.config \
-  --labels denoising,test
+  --labels task_denoising,test
2 changes: 1 addition & 1 deletion src/data_processors/process_dataset/config.vsh.yaml
@@ -22,7 +22,7 @@ arguments:
   - name: "--n_obs_limit"
     type: "integer"
     description: "The maximum number of cells the dataset may have before subsampling according to `obs.batch`."
-    default: 20000
+    default: 10000
 resources:
   - type: python_script
     path: script.py
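Note: the default `n_obs_limit` drops from 20000 to 10000, so mid-sized datasets that previously passed through whole are now subsampled. A minimal sketch of the effect, assuming `par` mirrors the Viash-injected parameter dict and using a made-up cell count:

# hedged sketch: `par` stands in for the Viash parameter dict; n_obs is illustrative
par = {"n_obs_limit": 10000}  # new default (was 20000)
n_obs = 15000                 # hypothetical dataset size
if n_obs > par["n_obs_limit"]:
    # kept whole under the old default, subsampled under the new one
    print(f"subsampling {n_obs} cells down to {par['n_obs_limit']}")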
42 changes: 24 additions & 18 deletions src/data_processors/process_dataset/script.py
@@ -5,15 +5,16 @@
 
 ## VIASH START
 par = {
-    'input': "resources_test/common/cxg_mouse_pancreas_atlas/dataset.h5ad",
-    'output_train': "train.h5ad",
-    'output_test': "test.h5ad",
+    'input': "resources/datasets/openproblems_v1/pancreas/log_cp10k/dataset.h5ad",
+    'output_train': "output/processed_datasets/train.h5ad",
+    'output_test': "output/processed_datasets/test.h5ad",
     'train_frac': 0.9,
-    'seed': 0
+    'seed': 0,
+    'n_obs_limit': 4000
 }
 meta = {
     "name": "process_dataset",
-    "resources_dir": "src/tasks/denoising/process_dataset"
+    "resources_dir": "src/data_processors/process_dataset"
 }
 ## VIASH END
 
@@ -29,27 +30,30 @@
 
 # limit to max number of observations
 adata_output = adata.copy()
-if adata.n_obs > par["n_obs_limit"]:
-    print(">> Subsampling the observations", flush=True)
-    print(f">> Setting seed to {par['seed']}")
+
+if "batch" in adata.obs:
+    print(f">> Subsampling observations by largest batch", flush=True)
+    batch_counts = adata.obs.groupby('batch').size()
+    sorted_batches = batch_counts.sort_values(ascending=False)
+    selected_batch = sorted_batches.index[0]
+    adata_output = adata[adata.obs["batch"]==selected_batch,:].copy()
+
+if adata_output.n_obs > par["n_obs_limit"]:
+    print(f">> Randomly subsampling observations to {par['n_obs_limit']}", flush=True)
+    print(f">> Setting seed to {par['seed']}", flush=True)
     random.seed(par["seed"])
-    if "batch" not in adata.obs:
-        obs_filt = np.ones(dtype=np.bool_, shape=adata.n_obs)
-        obs_index = np.random.choice(np.where(obs_filt)[0], par["n_obs_limit"], replace=False)
-        adata_output = adata[obs_index].copy()
-    else:
-        batch_counts = adata.obs.groupby('batch').size()
-        filtered_batches = batch_counts[batch_counts <= par["n_obs_limit"]]
-        sorted_filtered_batches = filtered_batches.sort_values(ascending=False)
-        selected_batch = sorted_filtered_batches.index[0]
-        adata_output = adata[adata.obs["batch"]==selected_batch,:].copy()
+    obs_filt = np.ones(dtype=np.bool_, shape=adata_output.n_obs)
+    obs_index = np.random.choice(np.where(obs_filt)[0], par["n_obs_limit"], replace=False)
+    adata_output = adata_output[obs_index].copy()
 
 # remove all layers except for counts
+print(">> Remove all layers except for counts", flush=True)
 for key in list(adata_output.layers.keys()):
     if key != "counts":
         del adata_output.layers[key]
 
 # round counts and convert to int
+print(">> Round counts and convert to int", flush=True)
 counts = np.array(adata_output.layers["counts"]).round().astype(int)
 
 print(">> process and split data", flush=True)
@@ -65,6 +69,7 @@
 X_test.eliminate_zeros()
 
 # copy adata to train_set, test_set
+print(">> Create AnnData output objects", flush=True)
 output_train = ad.AnnData(
     layers={"counts": X_train},
     obs=adata_output.obs[[]],
@@ -83,6 +88,7 @@
 output_test.uns["train_sum"] = X_train.sum()
 
 # Remove cells that do not have enough reads
+print(">> Remove cells that do not have enough reads", flush=True)
 is_missing = np.array(X_train.sum(axis=0) == 0)
 
 output_train = output_train[:, ~is_missing.flatten()]
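Taken together, the refactor changes the subsampling strategy: previously the script only acted when the full dataset exceeded `n_obs_limit`, and then either subsampled at random (no batch annotation) or kept the largest batch that still fit under the limit; now it always keeps the largest batch whenever `obs.batch` is present, and only subsamples at random if that batch still exceeds the limit. A self-contained sketch of the new behavior on synthetic data (toy sizes and names are illustrative, not from the repo; the sketch seeds NumPy directly, since `np.random.choice` draws from NumPy's global state):

import anndata as ad
import numpy as np
import pandas as pd

# toy dataset: three batches of unequal size
rng = np.random.default_rng(0)
obs = pd.DataFrame(
    {"batch": ["a"] * 50 + ["b"] * 30 + ["c"] * 20},
    index=[f"cell{i}" for i in range(100)],
)
adata = ad.AnnData(X=rng.poisson(1.0, size=(100, 10)).astype(np.float32), obs=obs)
n_obs_limit = 40

adata_output = adata.copy()

# step 1: if a batch annotation exists, keep only the largest batch
if "batch" in adata.obs:
    batch_counts = adata.obs.groupby("batch").size()
    selected_batch = batch_counts.sort_values(ascending=False).index[0]
    adata_output = adata[adata.obs["batch"] == selected_batch, :].copy()

# step 2: if that batch is still too large, subsample cells at random
if adata_output.n_obs > n_obs_limit:
    np.random.seed(0)
    obs_index = np.random.choice(adata_output.n_obs, n_obs_limit, replace=False)
    adata_output = adata_output[obs_index].copy()

print(adata_output.n_obs)  # 40: batch "a" (50 cells) subsampled down to the limit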
6 changes: 2 additions & 4 deletions src/workflows/process_datasets/run_test.sh
@@ -11,15 +11,13 @@ cd "$REPO_ROOT"
 
 set -e
 
-export NXF_VER=22.04.5
-
 nextflow run . \
-  -main-script target/nextflow/denoising/workflows/process_datasets/main.nf \
+  -main-script target/nextflow/workflows/process_datasets/main.nf \
   -profile docker \
   -entry auto \
   -c common/nextflow_helpers/labels_ci.config \
   --id run_test \
   --input_states "resources_test/common/**/state.yaml" \
   --rename_keys 'input:output_dataset' \
   --settings '{"output_train": "train.h5ad", "output_test": "test.h5ad"}' \
-  --publish_dir "resources_test/denoising"
+  --publish_dir "resources_test/task_denoising"
