diff --git a/CHANGELOG.md b/CHANGELOG.md index 156515c5d41..573303e942a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,10 @@ * The `transfer/publish` component is deprecated and will be removed in a future major release (PR #941). +# NEW FUNCTIONALITY + +* `workflows/annotation/harmony_knn` workflow: Cell-type annotation based on harmony integration with KNN label transfer (PR #836). + # MINOR CHANGES * Several workflows: refactor neighbors, leiden and UMAP in a separate subworkflow (PR #942 and PR #949). diff --git a/resources_test_scripts/annotation_test_data.sh b/resources_test_scripts/annotation_test_data.sh index 083058ba0d5..1d163270e58 100755 --- a/resources_test_scripts/annotation_test_data.sh +++ b/resources_test_scripts/annotation_test_data.sh @@ -33,15 +33,28 @@ wget "https://zenodo.org/record/7580707/files/pretrained_models_Blood_ts.tar.gz? # Process Tabula Sapiens Blood reference h5ad # (Select one individual and 100 cells per cell type) +# normalize and log1p transform data python <=n].groupby('cell_ontology_class').head(n).index] # assert sub_ref_adata_final.shape == (500, 58870) +data_for_scanpy = ad.AnnData(X=sub_ref_adata_final.X) +sc.pp.normalize_total(data_for_scanpy, target_sum=10000) +sc.pp.log1p( + data_for_scanpy, + base=None, + layer=None, + copy=False, +) +sub_ref_adata_final.layers["log_normalized"] = data_for_scanpy.X + sub_ref_adata_final.write("${OUT}/TS_Blood_filtered.h5ad", compression='gzip') + HEREDOC diff --git a/src/feature_annotation/highly_variable_features_scanpy/script.py b/src/feature_annotation/highly_variable_features_scanpy/script.py index 089f9c1f089..19aab20c27d 100644 --- a/src/feature_annotation/highly_variable_features_scanpy/script.py +++ b/src/feature_annotation/highly_variable_features_scanpy/script.py @@ -140,7 +140,7 @@ try: out = sc.pp.highly_variable_genes(**hvg_args) if par["var_input"] is not None: - out.index = data[:, data.var[par["var_input"]]].var.index + out.index = input_anndata.var.index out = out.reindex(index=data.var.index, method=None) out.highly_variable = out.highly_variable.fillna(False) assert ( diff --git a/src/integrate/harmony/config.vsh.yaml b/src/integrate/harmony/config.vsh.yaml index fb835ddcc51..85972f7ca78 100644 --- a/src/integrate/harmony/config.vsh.yaml +++ b/src/integrate/harmony/config.vsh.yaml @@ -41,7 +41,7 @@ arguments: required: false description: "In which .obsm slot to store the resulting integrated embedding." - name: "--theta" - description: "Diversity clustering penalty parameter. Specify for each variable in group.by.vars. theta=0 does not encourage any diversity. Larger values of theta result in more diverse clusters." + description: "Diversity clustering penalty parameter. Can be set as a single value for all batch observations or as multiple values, one for each observation in the batches defined by --obs_covariates. theta=0 does not encourage any diversity. Larger values of theta result in more diverse clusters." type: double default: 2 multiple: true diff --git a/src/integrate/harmonypy/config.vsh.yaml b/src/integrate/harmonypy/config.vsh.yaml index 7f2f4d06af7..2dbd7479f56 100644 --- a/src/integrate/harmonypy/config.vsh.yaml +++ b/src/integrate/harmonypy/config.vsh.yaml @@ -41,7 +41,7 @@ arguments: required: false description: "In which .obsm slot to store the resulting integrated embedding." - name: "--theta" - description: "Diversity clustering penalty parameter. Specify for each variable in group.by.vars. theta=0 does not encourage any diversity. Larger values of theta result in more diverse clusters." + description: "Diversity clustering penalty parameter. Can be set as a single value for all batch observations or as multiple values, one for each observation in the batches defined by --obs_covariates. theta=0 does not encourage any diversity. Larger values of theta result in more diverse clusters." type: double default: 2 multiple: true diff --git a/src/utils/subset_vars.py b/src/utils/subset_vars.py index d8cb05b2d3d..6a68435fc0c 100644 --- a/src/utils/subset_vars.py +++ b/src/utils/subset_vars.py @@ -18,4 +18,14 @@ def subset_vars(adata, subset_col): f"Requested to use .var column '{subset_col}' as a selection of genes, but the column is not available." ) + if adata.var[subset_col].dtype == "boolean": + assert ( + adata.var[subset_col].isna().sum() == 0 + ), f"The .var column `{subset_col}` contains NaN values. Can not subset data." + adata.var[subset_col] = adata.var[subset_col].astype("bool") + + assert ( + adata.var[subset_col].dtype == "bool" + ), f"Expected dtype of .var column '{subset_col}' to be `bool`, but found {adata.var[subset_col].dtype}. Can not subset data." + return adata[:, adata.var[subset_col]].copy() diff --git a/src/workflows/annotation/harmony_knn/config.vsh.yaml b/src/workflows/annotation/harmony_knn/config.vsh.yaml new file mode 100644 index 00000000000..427d26b4319 --- /dev/null +++ b/src/workflows/annotation/harmony_knn/config.vsh.yaml @@ -0,0 +1,205 @@ +name: "harmony_knn" +namespace: "workflows/annotation" +description: "Cell type annotation workflow by performing harmony integration of reference and query dataset followed by KNN label transfer." +info: + name: "Harmony integration followed by KNN label transfer" + test_dependencies: + - name: harmony_knn_test + namespace: test_workflows/annotation +authors: + - __merge__: /src/authors/dorien_roosen.yaml + roles: [ author, maintainer ] + - __merge__: /src/authors/weiwei_schultz.yaml + roles: [ contributor ] + +argument_groups: + - name: Query Input + arguments: + - name: "--id" + required: true + type: string + description: ID of the sample. + example: foo + - name: "--input" + required: true + type: file + description: Input dataset consisting of the (unlabeled) query observations. The dataset is expected to be pre-processed in the same way as --reference. + example: input.h5mu + - name: "--modality" + description: Which modality to process. Should match the modality of the --reference dataset. + type: string + default: "rna" + required: false + - name: "--input_layer" + description: The layer of the input dataset to process if .X is not to be used. Should contain log normalized counts. + required: false + type: string + - name: "--input_obs_batch_label" + type: string + description: "The .obs field in the input (query) dataset containing the batch labels." + example: "sample" + required: true + - name: "--input_var_gene_names" + type: string + required: false + description: | + The .var field in the input (query) dataset containing gene names; if not provided, the .var index will be used. + - name: "--input_reference_gene_overlap" + type: integer + default: 100 + min: 1 + description: | + The minimum number of genes present in both the reference and query datasets. + - name: "--overwrite_existing_key" + type: boolean_true + description: If provided, will overwrite existing fields in the input dataset when data are copied during the reference alignment process. + + - name: Reference input + arguments: + - name: "--reference" + required: true + type: file + description: Reference dataset consisting of the labeled observations to train the KNN classifier on. The dataset is expected to be pre-processed in the same way as the --input query dataset. + example: reference.h5mu + - name: "--reference_layer" + description: The layer of the reference dataset to process if .X is not to be used. Should contain log normalized counts. + required: false + type: string + - name: "--reference_obs_target" + type: string + example: cell_type + required: true + description: The `.obs` key of the target cell type labels to transfer. + - name: "--reference_var_gene_names" + type: string + required: false + description: | + The .var field in the reference dataset containing gene names; if not provided, the .var index will be used. + - name: "--reference_obs_batch_label" + type: string + description: "The .obs field in the reference dataset containing the batch labels." + example: "sample" + required: true + + - name: "Highly Variable Genes calculation options" + arguments: + - name: "--hvg_flavor" + alternatives: ["--filter_with_hvg_flavor"] + type: string + default: "seurat" + choices: ["seurat", "cell_ranger", "seurat_v3"] + description: | + Choose the flavor for identifying highly variable features. For the dispersion based methods + in their default workflows, Seurat passes the cutoffs whereas Cell Ranger passes n_top_features. + - name: "--hvg_n_top_features" + alternatives: ["--filter_with_hvg_n_top_genes"] + required: false + type: integer + description: Number of highly-variable features to keep. Mandatory if filter_with_hvg_flavor is set to 'seurat_v3'. + + - name: PCA options + arguments: + - name: "--pca_num_components" + type: integer + example: 25 + description: Number of principal components to compute. Defaults to 50, or 1 - minimum dimension size of selected representation. + + - name: Harmony integration options + arguments: + - name: "--harmony_theta" + type: double + description: | + Diversity clustering penalty parameter. Can be set as a single value for all batch observations or as multiple values, one for each observation in the batches defined by --input_obs_batch_label. theta=0 does not encourage any diversity. Larger values of theta result in more diverse clusters." + min: 0 + default: [2] + multiple: true + + - name: Leiden clustering options + arguments: + - name: "--leiden_resolution" + type: double + description: Control the coarseness of the clustering. Higher values lead to more clusters. + min: 0 + default: [1] + multiple: true + + - name: Neighbor classifier arguments + arguments: + - name: "--knn_weights" + type: string + default: "uniform" + choices: ["uniform", "distance"] + description: | + Weight function used in prediction. Possible values are: + `uniform` (all points in each neighborhood are weighted equally) or + `distance` (weight points by the inverse of their distance) + - name: "--knn_n_neighbors" + type: integer + default: 15 + min: 5 + required: false + description: | + The number of neighbors to use in k-neighbor graph structure used for fast approximate nearest neighbor search with PyNNDescent. + Larger values will result in more accurate search results at the cost of computation time. + + - name: "Outputs" + arguments: + - name: "--output" + type: file + required: true + direction: output + description: The query data in .h5mu format with predicted labels predicted from the classifier trained on the reference. + example: output.h5mu + - name: "--output_obs_predictions" + type: string + required: false + multiple: true + description: | + In which `.obs` slots to store the predicted cell labels. + If provided, must have the same length as `--reference_obs_targets`. + If empty, will default to the `reference_obs_targets` combined with the `"_pred"` suffix. + - name: "--output_obs_probability" + type: string + required: false + multiple: true + description: | + In which `.obs` slots to store the probability of the predictions. + If provided, must have the same length as `--reference_obs_targets`. + If empty, will default to the `reference_obs_targets` combined with the `"_probability"` suffix. + - name: "--output_obsm_integrated" + type: string + default: "X_integrated_harmony" + required: false + description: "In which .obsm slot to store the integrated embedding." + - name: "--output_compression" + type: string + description: | + The compression format to be used on the output h5mu object. + choices: ["gzip", "lzf"] + required: false + example: "gzip" + +dependencies: + - name: workflows/integration/harmony_leiden + alias: harmony_leiden_workflow + - name: labels_transfer/knn + - name: dataflow/split_h5mu + - name: dataflow/concatenate_h5mu + - name: dimred/pca + - name: feature_annotation/align_query_reference + - name: feature_annotation/highly_variable_features_scanpy + +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + +test_resources: + - type: nextflow_script + path: test.nf + entrypoint: test_wf + - path: /resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu + - path: /resources_test/annotation_test_data/TS_Blood_filtered.h5mu + +runners: + - type: nextflow diff --git a/src/workflows/annotation/harmony_knn/integration_test.sh b/src/workflows/annotation/harmony_knn/integration_test.sh new file mode 100755 index 00000000000..829d005b158 --- /dev/null +++ b/src/workflows/annotation/harmony_knn/integration_test.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +nextflow \ + run . \ + -main-script src/workflows/annotation/harmony_knn/test.nf \ + -entry test_wf \ + -resume \ + -profile docker,no_publish \ + -c src/workflows/utils/labels_ci.config \ + -c src/workflows/utils/integration_tests.config \ diff --git a/src/workflows/annotation/harmony_knn/main.nf b/src/workflows/annotation/harmony_knn/main.nf new file mode 100644 index 00000000000..8c10adffb12 --- /dev/null +++ b/src/workflows/annotation/harmony_knn/main.nf @@ -0,0 +1,168 @@ +workflow run_wf { + take: + input_ch + + main: + + output_ch = input_ch + // Set aside the output for this workflow to avoid conflicts + | map {id, state -> + def new_state = state + ["workflow_output": state.output] + [id, new_state] + } + // Align query and reference datasets + | align_query_reference.run( + fromState: [ + "input": "input", + "modality": "modality", + "input_layer": "input_layer", + "input_obs_batch": "input_obs_batch_label", + "input_var_gene_names": "input_var_gene_names", + "reference": "reference", + "reference_layer": "reference_layer", + "reference_obs_batch": "reference_obs_batch_label", + "reference_obs_label": "reference_obs_target", + "reference_var_gene_names": "reference_var_gene_names", + "input_reference_gene_overlap": "input_reference_gene_overlap", + "overwrite_existing_key": "overwrite_existing_key" + ], + args: [ + "input_id": "query", + "reference_id": "reference", + "output_layer": "_counts", + "output_var_gene_names": "_gene_names", + "output_obs_batch": "_sample_id", + "output_obs_label": "_cell_type", + "output_obs_id": "_dataset", + "output_var_common_genes": "_common_vars" + ], + toState: [ + "input": "output_query", + "reference": "output_reference" + ] + ) + // Concatenate query and reference datasets prior to integration + | concatenate_h5mu.run( + fromState: { id, state -> [ + "input": [state.input, state.reference] + ] + }, + args: [ + "input_id": ["query", "reference"], + "other_axis_mode": "move" + ], + toState: ["input": "output"] + ) + | view {"After concatenation: $it"} + | highly_variable_features_scanpy.run( + fromState: [ + "input": "input", + "modality": "modality", + "flavor": "hvg_flavor", + "n_top_features": "hvg_n_top_features", + ], + args: [ + "layer": "_counts", + "var_input": "_common_vars", + "var_name_filter": "_common_hvg", + "obs_batch_key": "_sample_id" + ], + toState: [ + "input": "output" + ] + ) + | pca.run( + fromState: [ + "input": "input", + "modality": "modality", + "overwrite": "overwrite_existing_key", + "num_compontents": "pca_num_components" + ], + args: [ + "layer": "_counts", + "var_input": "_common_hvg", + "obsm_output": "X_pca_harmony", + "varm_output": "pca_loadings_harmony", + "uns_output": "pca_variance_harmony", + ], + toState: [ + "input": "output" + ] + ) + // Run harmony integration with leiden clustering + | harmony_leiden_workflow.run( + fromState: { id, state -> [ + "id": id, + "input": state.input, + "modality": state.modality, + "obsm_integrated": state.output_obsm_integrated, + "theta": state.harmony_theta, + "leiden_resolution": state.leiden_resolution, + ]}, + args: [ + "embedding": "X_pca_harmony", + "uns_neighbors": "harmonypy_integration_neighbors", + "obsp_neighbor_distances": "harmonypy_integration_distances", + "obsp_neighbor_connectivities": "harmonypy_integration_connectivities", + "obs_cluster": "harmony_integration_leiden", + "obsm_umap": "X_leiden_harmony_umap", + "obs_covariates": "_sample_id" + ], + toState: ["input": "output"] + ) + | view {"After integration: $it"} + // Split integrated dataset back into a separate reference and query dataset + | split_h5mu.run( + fromState: [ + "input": "input", + "modality": "modality" + ], + args: [ + "obs_feature": "_dataset", + "output_files": "sample_files.csv", + "drop_obs_nan": "true", + "output": "ref_query" + ], + toState: [ + "output": "output", + "output_files": "output_files" + ], + auto: [ publish: true ] + ) + | view {"After sample splitting: $it"} + // map the integrated query and reference datasets back to the state + | map {id, state -> + def outputDir = state.output + def files = readCsv(state.output_files.toUriString()) + def query_file = files.findAll{ dat -> dat.name == 'query' } + assert query_file.size() == 1, 'there should only be one query file' + def reference_file = files.findAll{ dat -> dat.name == 'reference' } + assert reference_file.size() == 1, 'there should only be one reference file' + def integrated_query = outputDir.resolve(query_file.filename) + def integrated_reference = outputDir.resolve(reference_file.filename) + def newKeys = ["integrated_query": integrated_query, "integrated_reference": integrated_reference] + [id, state + newKeys] + } + | view {"After splitting query: $it"} + // Perform KNN label transfer from integrated reference to integrated query + | knn.run( + fromState: [ + "input": "integrated_query", + "modality": "modality", + "input_obsm_features": "output_obsm_integrated", + "reference": "integrated_reference", + "reference_obsm_features": "output_obsm_integrated", + "reference_obs_targets": "reference_obs_target", + "output_obs_predictions": "output_obs_predictions", + "output_obs_probability": "output_obs_probability", + "output_compression": "output_compression", + "weights": "knn_weights", + "n_neighbors": "knn_n_neighbors", + "output": "workflow_output" + ], + toState: {id, output, state -> ["output": output.output]}, + ) + + emit: + output_ch +} diff --git a/src/workflows/annotation/harmony_knn/nextflow.config b/src/workflows/annotation/harmony_knn/nextflow.config new file mode 100644 index 00000000000..059100c489c --- /dev/null +++ b/src/workflows/annotation/harmony_knn/nextflow.config @@ -0,0 +1,10 @@ +manifest { + nextflowVersion = '!>=20.12.1-edge' +} + +params { + rootDir = java.nio.file.Paths.get("$projectDir/../../../../").toAbsolutePath().normalize().toString() +} + +// include common settings +includeConfig("${params.rootDir}/src/workflows/utils/labels.config") diff --git a/src/workflows/annotation/harmony_knn/test.nf b/src/workflows/annotation/harmony_knn/test.nf new file mode 100644 index 00000000000..cdb62f052a0 --- /dev/null +++ b/src/workflows/annotation/harmony_knn/test.nf @@ -0,0 +1,66 @@ +nextflow.enable.dsl=2 + +include { harmony_knn } from params.rootDir + "/target/nextflow/workflows/annotation/harmony_knn/main.nf" +include { harmony_knn_test } from params.rootDir + "/target/nextflow/test_workflows/annotation/harmony_knn_test/main.nf" +params.resources_test = params.rootDir + "/resources_test" + +workflow test_wf { + // allow changing the resources_test dir + resources_test = file(params.resources_test) + + output_ch = Channel.fromList( + [ + [ + id: "simple_execution_test", + input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"), + input_layer: "log_normalized", + reference: resources_test.resolve("annotation_test_data/TS_Blood_filtered.h5mu"), + reference_var_gene_names: "ensemblid", + input_obs_batch_label: "sample_id", + reference_layer: "log_normalized", + reference_obs_batch_label: "donor_assay", + reference_obs_target: "cell_type", + leiden_resolution: [1.0, 0.25] + ], + [ + id: "no_leiden_resolutions_test", + input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"), + input_layer: "log_normalized", + reference: resources_test.resolve("annotation_test_data/TS_Blood_filtered.h5mu"), + reference_var_gene_names: "ensemblid", + input_obs_batch_label: "sample_id", + reference_layer: "log_normalized", + reference_obs_batch_label: "donor_assay", + reference_obs_target: "cell_type", + leiden_resolution: [] + ] + ]) + | map{ state -> [state.id, state] } + | harmony_knn + | view { output -> + assert output.size() == 2 : "Outputs should contain two elements; [id, state]" + + // check id + def id = output[0] + assert id.endsWith("_test") : "Output ID should be same as input ID" + + // check output + def state = output[1] + assert state instanceof Map : "State should be a map. Found: ${state}" + assert state.containsKey("output") : "Output should contain key 'output'." + assert state.output.isFile() : "'output' should be a file." + assert state.output.toString().endsWith(".h5mu") : "Output file should end with '.h5mu'. Found: ${state.output}" + + "Output: $output" + } + | harmony_knn_test.run( + fromState: [ + "input": "output" + ] + ) + | toSortedList({a, b -> a[0] <=> b[0]}) + | map { output_list -> + assert output_list.size() == 2 : "output channel should contain 2 events" + assert output_list.collect{it[0]} == ["no_leiden_resolutions_test", "simple_execution_test"] + } + } \ No newline at end of file diff --git a/src/workflows/integration/harmony_leiden/config.vsh.yaml b/src/workflows/integration/harmony_leiden/config.vsh.yaml index 058375735c3..11e9d3d0d1d 100644 --- a/src/workflows/integration/harmony_leiden/config.vsh.yaml +++ b/src/workflows/integration/harmony_leiden/config.vsh.yaml @@ -71,8 +71,7 @@ argument_groups: - name: "--theta" type: double description: | - Diversity clustering penalty parameter. Specify for each variable in group.by.vars. - theta=0 does not encourage any diversity. Larger values of theta + Diversity clustering penalty parameter. Can be set as a single value for all batch observations or as multiple values, one for each observation in the batches defined by --obs_covariates. theta=0 does not encourage any diversity. Larger values of theta result in more diverse clusters." default: 2 multiple: true diff --git a/src/workflows/test_workflows/annotation/harmony_knn/config.vsh.yaml b/src/workflows/test_workflows/annotation/harmony_knn/config.vsh.yaml new file mode 100644 index 00000000000..f5fab1c5efc --- /dev/null +++ b/src/workflows/test_workflows/annotation/harmony_knn/config.vsh.yaml @@ -0,0 +1,35 @@ +name: "harmony_knn_test" +namespace: "test_workflows/annotation" +description: "This component tests the output of the annotation of the harmony_knn of workflow." +authors: + - __merge__: /src/authors/dorien_roosen.yaml +argument_groups: + - name: Inputs + arguments: + - name: "--input" + type: file + required: true + description: Path to h5mu output. + example: foo.final.h5mu +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py + - path: /src/base/openpipelinetestutils + dest: openpipelinetestutils +engines: + - type: docker + image: python:3.12-slim + setup: + - type: docker + copy: ["openpipelinetestutils /opt/openpipelinetestutils"] + - type: apt + packages: + - procps + - type: python + packages: /opt/openpipelinetestutils + - type: python + __merge__: [/src/base/requirements/anndata_mudata.yaml, /src/base/requirements/viashpy.yaml, .] +runners: + - type: executable + - type: nextflow \ No newline at end of file diff --git a/src/workflows/test_workflows/annotation/harmony_knn/script.py b/src/workflows/test_workflows/annotation/harmony_knn/script.py new file mode 100644 index 00000000000..cf20808a735 --- /dev/null +++ b/src/workflows/test_workflows/annotation/harmony_knn/script.py @@ -0,0 +1,38 @@ +from mudata import read_h5mu +import shutil +import os +import sys +from pathlib import Path +import pytest + +##VIASH START +par = {"input": "harmony_knn/output.h5mu"} + +meta = {"resources_dir": "resources_test"} +##VIASH END + + +def test_run(): + input_mudata = read_h5mu(par["input"]) + expected_obsm = ["X_integrated_harmony", "X_leiden_harmony_umap"] + expected_obs = ["cell_type_pred", "cell_type_probability"] + + assert "rna" in list(input_mudata.mod.keys()), "Input should contain rna modality." + assert all( + key in list(input_mudata.mod["rna"].obsm) for key in expected_obsm + ), f"Input mod['rna'] obs columns should be: {expected_obsm}, found: {input_mudata.mod['rna'].obsm.keys()}." + assert all( + key in list(input_mudata.mod["rna"].obs) for key in expected_obs + ), f"Input mod['rna'] obs columns should be: {expected_obs}, found: {input_mudata.mod['rna'].obs.keys()}." + + assert input_mudata.mod["rna"].obs["cell_type_pred"].dtype == "category" + assert input_mudata.mod["rna"].obs["cell_type_probability"].dtype == "float64" + + +if __name__ == "__main__": + HERE_DIR = Path(__file__).resolve().parent + shutil.copyfile( + os.path.join(meta["resources_dir"], "openpipelinetestutils", "conftest.py"), + os.path.join(HERE_DIR, "conftest.py"), + ) + sys.exit(pytest.main(["--import-mode=importlib"]))