Fix dataset processing (#22)
* update general paths in scripts

* update process_datasets

* lower default n_obs_limit

* update submodule

* update changelog

* refactor subsample

* fix typo
KaiWaldrant authored Oct 2, 2024
1 parent f5021bb commit bfa2730
Showing 8 changed files with 37 additions and 31 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -44,6 +44,8 @@
 
 * Fix paths in scripts (PR #18).
 
+* Subsample datasets by batch if batch is defined (PR #22).
+
 ## transfer from openproblems-v2 repository
 
 ### NEW FUNCTIONALITY
2 changes: 1 addition & 1 deletion common (submodule pointer update)
2 changes: 1 addition & 1 deletion scripts/run_benchmark/run_full_local.sh
@@ -30,7 +30,7 @@ publish_dir: "$publish_dir"
 HERE
 
 # run the benchmark
-nextflow run openproblems-bio/task_template \
+nextflow run openproblems-bio/task_denoising \
   --revision build/main \
   -main-script target/nextflow/workflows/run_benchmark/main.nf \
   -profile docker \
6 changes: 3 additions & 3 deletions scripts/run_benchmark/run_full_seqeracloud.sh
@@ -10,11 +10,11 @@ set -e
 
 # generate a unique id
 RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)"
-publish_dir="s3://openproblems-data/resources/denoising/results/${RUN_ID}"
+publish_dir="s3://openproblems-data/resources/task_denoising/results/${RUN_ID}"
 
 # write the parameters to file
 cat > /tmp/params.yaml << HERE
-input_states: s3://openproblems-data/resources/denoising/datasets/**/state.yaml
+input_states: s3://openproblems-data/resources/task_denoising/datasets/**/state.yaml
 rename_keys: 'input_train:output_train;input_test:output_test'
 output_state: "state.yaml"
 publish_dir: "$publish_dir"
@@ -29,4 +29,4 @@ tw launch https://github.com/openproblems-bio/task_denoising.git \
   --params-file /tmp/params.yaml \
   --entry-name auto \
   --config common/nextflow_helpers/labels_tw.config \
-  --labels denoising,full
+  --labels task_denoising,full
6 changes: 3 additions & 3 deletions scripts/run_benchmark/run_test_seqeracloud.sh
@@ -8,8 +8,8 @@ cd "$REPO_ROOT"
 
 set -e
 
-resources_test_s3=s3://openproblems-data/resources_test/denoising
-publish_dir_s3="s3://openproblems-nextflow/temp/results/denoising/$(date +%Y-%m-%d_%H-%M-%S)"
+resources_test_s3=s3://openproblems-data/resources_test/task_denoising
+publish_dir_s3="s3://openproblems-nextflow/temp/results/task_denoising/$(date +%Y-%m-%d_%H-%M-%S)"
 
 # write the parameters to file
 cat > /tmp/params.yaml << HERE
@@ -28,4 +28,4 @@ tw launch https://github.com/openproblems-bio/task_denoising.git \
   --compute-env 6TeIFgV5OY4pJCk8I0bfOh \
   --params-file /tmp/params.yaml \
   --config common/nextflow_helpers/labels_tw.config \
-  --labels denoising,test
+  --labels task_denoising,test
2 changes: 1 addition & 1 deletion src/data_processors/process_dataset/config.vsh.yaml
@@ -22,7 +22,7 @@ arguments:
   - name: "--n_obs_limit"
     type: "integer"
     description: "The maximum number of cells the dataset may have before subsampling according to `obs.batch`."
-    default: 20000
+    default: 10000
 resources:
   - type: python_script
     path: script.py
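Note: the default `n_obs_limit` drops from 20000 to 10000, so mid-sized datasets that previously passed through whole are now subsampled. A minimal sketch of the effect, assuming `par` mirrors the Viash-injected parameter dict and using a made-up cell count:

# hedged sketch: `par` stands in for the Viash parameter dict; n_obs is illustrative
par = {"n_obs_limit": 10000}  # new default (was 20000)
n_obs = 15000                 # hypothetical dataset size
if n_obs > par["n_obs_limit"]:
    # kept whole under the old default, subsampled under the new one
    print(f"subsampling {n_obs} cells down to {par['n_obs_limit']}")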
42 changes: 24 additions & 18 deletions src/data_processors/process_dataset/script.py
@@ -5,15 +5,16 @@
 
 ## VIASH START
 par = {
-    'input': "resources_test/common/cxg_mouse_pancreas_atlas/dataset.h5ad",
-    'output_train': "train.h5ad",
-    'output_test': "test.h5ad",
+    'input': "resources/datasets/openproblems_v1/pancreas/log_cp10k/dataset.h5ad",
+    'output_train': "output/processed_datasets/train.h5ad",
+    'output_test': "output/processed_datasets/test.h5ad",
     'train_frac': 0.9,
-    'seed': 0
+    'seed': 0,
+    'n_obs_limit': 4000
 }
 meta = {
     "name": "process_dataset",
-    "resources_dir": "src/tasks/denoising/process_dataset"
+    "resources_dir": "src/data_processors/process_dataset"
 }
 ## VIASH END
 
@@ -29,27 +30,30 @@
 
 # limit to max number of observations
 adata_output = adata.copy()
-if adata.n_obs > par["n_obs_limit"]:
-    print(">> Subsampling the observations", flush=True)
-    print(f">> Setting seed to {par['seed']}")
+
+if "batch" in adata.obs:
+    print(f">> Subsampling observations by largest batch", flush=True)
+    batch_counts = adata.obs.groupby('batch').size()
+    sorted_batches = batch_counts.sort_values(ascending=False)
+    selected_batch = sorted_batches.index[0]
+    adata_output = adata[adata.obs["batch"]==selected_batch,:].copy()
+
+if adata_output.n_obs > par["n_obs_limit"]:
+    print(f">> Randomly subsampling observations to {par['n_obs_limit']}", flush=True)
+    print(f">> Setting seed to {par['seed']}", flush=True)
     random.seed(par["seed"])
-    if "batch" not in adata.obs:
-        obs_filt = np.ones(dtype=np.bool_, shape=adata.n_obs)
-        obs_index = np.random.choice(np.where(obs_filt)[0], par["n_obs_limit"], replace=False)
-        adata_output = adata[obs_index].copy()
-    else:
-        batch_counts = adata.obs.groupby('batch').size()
-        filtered_batches = batch_counts[batch_counts <= par["n_obs_limit"]]
-        sorted_filtered_batches = filtered_batches.sort_values(ascending=False)
-        selected_batch = sorted_filtered_batches.index[0]
-        adata_output = adata[adata.obs["batch"]==selected_batch,:].copy()
+    obs_filt = np.ones(dtype=np.bool_, shape=adata_output.n_obs)
+    obs_index = np.random.choice(np.where(obs_filt)[0], par["n_obs_limit"], replace=False)
+    adata_output = adata_output[obs_index].copy()
 
 # remove all layers except for counts
+print(">> Remove all layers except for counts", flush=True)
 for key in list(adata_output.layers.keys()):
     if key != "counts":
         del adata_output.layers[key]
 
 # round counts and convert to int
+print(">> Round counts and convert to int", flush=True)
 counts = np.array(adata_output.layers["counts"]).round().astype(int)
 
 print(">> process and split data", flush=True)
@@ -65,6 +69,7 @@
 X_test.eliminate_zeros()
 
 # copy adata to train_set, test_set
+print(">> Create AnnData output objects", flush=True)
 output_train = ad.AnnData(
     layers={"counts": X_train},
     obs=adata_output.obs[[]],
@@ -83,6 +88,7 @@
 output_test.uns["train_sum"] = X_train.sum()
 
 # Remove cells that do not have enough reads
+print(">> Remove cells that do not have enough reads", flush=True)
 is_missing = np.array(X_train.sum(axis=0) == 0)
 
 output_train = output_train[:, ~is_missing.flatten()]
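Taken together, the refactor changes the subsampling strategy: previously the script only acted when the full dataset exceeded `n_obs_limit`, and then either subsampled at random (no batch annotation) or kept the largest batch that still fit under the limit; now it always keeps the largest batch whenever `obs.batch` is present, and only subsamples at random if that batch still exceeds the limit. A self-contained sketch of the new behavior on synthetic data (toy sizes and names are illustrative, not from the repo; the sketch seeds NumPy directly, since `np.random.choice` draws from NumPy's global state):

import anndata as ad
import numpy as np
import pandas as pd

# toy dataset: three batches of unequal size
rng = np.random.default_rng(0)
obs = pd.DataFrame(
    {"batch": ["a"] * 50 + ["b"] * 30 + ["c"] * 20},
    index=[f"cell{i}" for i in range(100)],
)
adata = ad.AnnData(X=rng.poisson(1.0, size=(100, 10)).astype(np.float32), obs=obs)
n_obs_limit = 40

adata_output = adata.copy()

# step 1: if a batch annotation exists, keep only the largest batch
if "batch" in adata.obs:
    batch_counts = adata.obs.groupby("batch").size()
    selected_batch = batch_counts.sort_values(ascending=False).index[0]
    adata_output = adata[adata.obs["batch"] == selected_batch, :].copy()

# step 2: if that batch is still too large, subsample cells at random
if adata_output.n_obs > n_obs_limit:
    np.random.seed(0)
    obs_index = np.random.choice(adata_output.n_obs, n_obs_limit, replace=False)
    adata_output = adata_output[obs_index].copy()

print(adata_output.n_obs)  # 40: batch "a" (50 cells) subsampled down to the limit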
6 changes: 2 additions & 4 deletions src/workflows/process_datasets/run_test.sh
@@ -11,15 +11,13 @@ cd "$REPO_ROOT"
 
 set -e
 
-export NXF_VER=22.04.5
-
 nextflow run . \
-  -main-script target/nextflow/denoising/workflows/process_datasets/main.nf \
+  -main-script target/nextflow/workflows/process_datasets/main.nf \
   -profile docker \
   -entry auto \
   -c common/nextflow_helpers/labels_ci.config \
   --id run_test \
   --input_states "resources_test/common/**/state.yaml" \
   --rename_keys 'input:output_dataset' \
   --settings '{"output_train": "train.h5ad", "output_test": "test.h5ad"}' \
-  --publish_dir "resources_test/denoising"
+  --publish_dir "resources_test/task_denoising"
