diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8a023889596..9f981cca443 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -119,6 +119,8 @@
 
 * `metadata/copy_obs` component: Added a component to copy an .obs column from a MuData object to another (PR #874).
 
+* `workflows/annotation/scanorama_knn` workflow: Cell-type annotation based on scanorama integration with KNN label transfer (PR #884).
+
 ## MINOR CHANGES
 
 * `resources_test_scripts/cellranger_atac_tiny_bcl.sh` script: generate counts from fastq files using CellRanger atac count (PR #726).
diff --git a/src/workflows/annotation/scanorama_knn/config.vsh.yaml b/src/workflows/annotation/scanorama_knn/config.vsh.yaml
new file mode 100644
index 00000000000..82fa61b0df4
--- /dev/null
+++ b/src/workflows/annotation/scanorama_knn/config.vsh.yaml
@@ -0,0 +1,165 @@
+name: "scanorama_knn"
+namespace: "workflows/annotation"
+description: "Cell type annotation workflow that performs scanorama integration of the reference and query datasets, followed by KNN label transfer."
+authors:
+  - __merge__: /src/authors/dorien_roosen.yaml
+    roles: [ author, maintainer ]
+  - __merge__: /src/authors/weiwei_schultz.yaml
+    roles: [ contributor ]
+
+argument_groups:
+  - name: Query Input
+    arguments:
+      - name: "--id"
+        required: true
+        type: string
+        description: ID of the sample.
+        example: foo
+      - name: "--input"
+        required: true
+        type: file
+        description: Input dataset consisting of the (unlabeled) query observations. The dataset is expected to be pre-processed in the same way as --reference.
+        example: input.h5mu
+      - name: "--modality"
+        description: Which modality to process. Should match the modality of the --reference dataset.
+        type: string
+        default: "rna"
+        required: false
+      - name: "--input_obsm_embedding"
+        example: "X_pca"
+        type: string
+        description: Embedding .obsm column to use as input for integration. Should match the embedding .obsm column of the --reference dataset.
+      - name: "--input_obs_batch_label"
+        type: string
+        description: "The .obs field in the input (query) dataset containing the batch labels."
+        example: "sample"
+        required: true
+
+  - name: Reference input
+    arguments:
+      - name: "--reference"
+        required: true
+        type: file
+        description: Reference dataset consisting of the labeled observations to train the KNN classifier on. The dataset is expected to be pre-processed in the same way as the --input query dataset.
+        example: reference.h5mu
+      - name: "--reference_obs_targets"
+        type: string
+        example: [ ann_level_1, ann_level_2, ann_level_3, ann_level_4, ann_level_5, ann_finest_level ]
+        required: true
+        multiple: true
+        description: The `.obs` key(s) of the target labels to transfer.
+      - name: "--reference_obs_batch_label"
+        type: string
+        description: "The .obs field in the reference dataset containing the batch labels."
+        example: "sample"
+        required: true
+
+  - name: Scanorama integration options
+    arguments:
+      - name: "--knn"
+        type: integer
+        description: "Number of nearest neighbors to use for matching during scanorama integration."
+        default: 20
+      - name: "--batch_size"
+        type: integer
+        description: "The batch size used in the alignment vector computation. Useful when integrating very large (>100k samples) datasets. Set to a large value that still runs within available memory."
+        default: 5000
+      - name: "--sigma"
+        type: double
+        description: "Correction smoothing parameter on the Gaussian kernel."
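
For context on the classifier arguments above (--weights, --n_neighbors, --reference_obs_targets and the *_pred / *_probability outputs), here is a minimal sketch of what the label-transfer step amounts to, using scikit-learn's KNeighborsClassifier as a stand-in for the pynndescent_knn component. The file paths and the "cell_type" target are hypothetical placeholders, not values from this PR:

    # Minimal sketch, NOT the pynndescent_knn implementation: KNN label transfer
    # on a shared integrated embedding. Paths and "cell_type" are placeholders.
    import mudata as md
    from sklearn.neighbors import KNeighborsClassifier

    ref = md.read_h5mu("reference.h5mu").mod["rna"]  # labeled reference
    qry = md.read_h5mu("input.h5mu").mod["rna"]      # unlabeled query

    # Train on the integrated embedding, mirroring --weights / --n_neighbors.
    clf = KNeighborsClassifier(n_neighbors=15, weights="uniform")
    clf.fit(ref.obsm["X_integrated_scanorama"], ref.obs["cell_type"])

    # Predicted label plus the probability of that prediction, mirroring the
    # *_pred / *_probability output .obs columns.
    qry.obs["cell_type_pred"] = clf.predict(qry.obsm["X_integrated_scanorama"])
    qry.obs["cell_type_probability"] = clf.predict_proba(
        qry.obsm["X_integrated_scanorama"]
    ).max(axis=1)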
+        default: 15
+      - name: "--approx"
+        type: boolean
+        description: "Use approximate nearest neighbors with Python annoy; greatly speeds up matching runtime."
+        default: true
+      - name: "--alpha"
+        type: double
+        description: "Minimum alignment score cutoff."
+        default: 0.1
+
+  - name: Leiden clustering options
+    arguments:
+      - name: "--leiden_resolution"
+        type: double
+        description: Controls the coarseness of the clustering. Higher values lead to more clusters.
+        min: 0
+        default: [1]
+        multiple: true
+
+  - name: Neighbor classifier arguments
+    arguments:
+      - name: "--weights"
+        type: string
+        default: "uniform"
+        choices: ["uniform", "distance"]
+        description: |
+          Weight function used in prediction. Possible values are:
+          `uniform` (all points in each neighborhood are weighted equally) or
+          `distance` (weight points by the inverse of their distance).
+      - name: "--n_neighbors"
+        type: integer
+        default: 15
+        required: false
+        description: |
+          The number of neighbors to use in the k-neighbor graph structure used for fast approximate nearest neighbor search with PyNNDescent.
+          Larger values result in more accurate search results at the cost of computation time.
+
+  - name: "Outputs"
+    arguments:
+      - name: "--output"
+        type: file
+        required: true
+        direction: output
+        description: The query data in .h5mu format, with cell labels predicted by the classifier trained on the reference.
+        example: output.h5mu
+      - name: "--output_obs_predictions"
+        type: string
+        required: false
+        multiple: true
+        description: |
+          In which `.obs` slots to store the predicted cell labels.
+          If provided, must have the same length as `--reference_obs_targets`.
+          If empty, will default to the `reference_obs_targets` combined with the `"_pred"` suffix.
+      - name: "--output_obs_probability"
+        type: string
+        required: false
+        multiple: true
+        description: |
+          In which `.obs` slots to store the probability of the predictions.
+          If provided, must have the same length as `--reference_obs_targets`.
+          If empty, will default to the `reference_obs_targets` combined with the `"_probability"` suffix.
+      - name: "--output_obsm_integrated"
+        type: string
+        default: "X_integrated_scanorama"
+        required: false
+        description: "In which .obsm slot to store the integrated embedding."
+      - name: "--output_compression"
+        type: string
+        description: |
+          The compression format to be used on the output h5mu object.
+        choices: ["gzip", "lzf"]
+        required: false
+        example: "gzip"
+
+dependencies:
+  - name: workflows/integration/scanorama_leiden
+    alias: scanorama_leiden_workflow
+  - name: labels_transfer/pynndescent_knn
+  - name: dataflow/split_h5mu
+  - name: dataflow/concatenate_h5mu
+  - name: metadata/add_id
+  - name: metadata/copy_obs
+
+resources:
+  - type: nextflow_script
+    path: main.nf
+    entrypoint: run_wf
+
+test_resources:
+  - type: nextflow_script
+    path: test.nf
+    entrypoint: test_wf
+  - path: /resources_test/pbmc_1k_protein_v3
+  - path: /resources_test/annotation_test_data
+
+runners:
+  - type: nextflow
diff --git a/src/workflows/annotation/scanorama_knn/integration_test.sh b/src/workflows/annotation/scanorama_knn/integration_test.sh
new file mode 100755
index 00000000000..ce567527124
--- /dev/null
+++ b/src/workflows/annotation/scanorama_knn/integration_test.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+
+# get the root of the repository
+REPO_ROOT=$(git rev-parse --show-toplevel)
+
+# ensure that the commands below are run from the root of the repository
+cd "$REPO_ROOT"
+
+export NXF_VER=21.10.6
+
+viash ns build -q scanorama_knn
+
+nextflow \
+  run . \
+  -main-script src/workflows/annotation/scanorama_knn/test.nf \
+  -entry test_wf \
+  -resume \
+  -profile no_publish \
+  -c src/workflows/utils/labels_ci.config \
+  -c src/workflows/utils/integration_tests.config \
+  -with-trace work/trace.txt
diff --git a/src/workflows/annotation/scanorama_knn/main.nf b/src/workflows/annotation/scanorama_knn/main.nf
new file mode 100644
index 00000000000..03330da8a38
--- /dev/null
+++ b/src/workflows/annotation/scanorama_knn/main.nf
@@ -0,0 +1,164 @@
+workflow run_wf {
+  take:
+    input_ch
+
+  main:
+    output_ch = input_ch
+      // Set aside the output for this workflow to avoid conflicts
+      | map {id, state ->
+        def new_state = state + ["workflow_output": state.output]
+        [id, new_state]
+      }
+      // Add id as _meta join id, to be able to merge with the source channel at the end of the workflow
+      | map{ id, state ->
+        def new_state = state + ["_meta": ["join_id": id]]
+        [id, new_state]
+      }
+      | view {"After adding join_id: $it"}
+      // Add 'query' id to .obs columns of query dataset
+      | add_id.run(
+        fromState: [
+          "input": "input",
+        ],
+        args: [
+          "input_id": "query",
+          "obs_output": "dataset",
+        ],
+        toState: ["input": "output"])
+      // Add 'reference' id to .obs columns of reference dataset
+      | add_id.run(
+        fromState: [
+          "input": "reference",
+        ],
+        args: [
+          "input_id": "reference",
+          "obs_output": "dataset"
+        ],
+        toState: ["reference": "output"])
+      // Make sure that the query and reference datasets have batch information in the same .obs column,
+      // by copying the respective .obs columns to the .obs column "batch_label"
+      | copy_obs.run(
+        fromState: [
+          "input": "input",
+          "modality": "modality",
+          "input_obs_key": "input_obs_batch_label",
+        ],
+        args: [
+          "output_obs_key": "batch_label"
+        ],
+        toState: [
+          "input": "output"
+        ]
+      )
+      | copy_obs.run(
+        fromState: [
+          "input": "reference",
+          "modality": "modality",
+          "input_obs_key": "reference_obs_batch_label",
+        ],
+        args: [
+          "output_obs_key": "batch_label"
+        ],
+        toState: [
+          "reference": "output"
+        ]
+      )
+      // Concatenate query and reference datasets prior to integration
+      | concatenate_h5mu.run(
+        fromState: { id, state -> [
+            "input": [state.input, state.reference]
+          ]
+        },
+        args: [
+          "input_id": ["query", "reference"],
+          "other_axis_mode": "move"
+        ],
+        toState: ["input": "output"]
+      )
+      | view {"After concatenation: $it"}
+      // Run scanorama integration with leiden clustering
+      | scanorama_leiden_workflow.run(
+        fromState: { id, state ->
+          [
+            "id": id,
+            "input": state.input,
+            "modality": state.modality,
+            "obsm_input": state.input_obsm_embedding,
+            "obsm_output": state.output_obsm_integrated,
+            "leiden_resolution": state.leiden_resolution,
+            "knn": state.knn,
+            "batch_size": state.batch_size,
+            "sigma": state.sigma,
+            "approx": state.approx,
+            "alpha": state.alpha
+          ]},
+        args: [
+          "uns_neighbors": "scanorama_integration_neighbors",
+          "obsp_neighbor_distances": "scanorama_integration_distances",
+          "obsp_neighbor_connectivities": "scanorama_integration_connectivities",
+          "obs_cluster": "scanorama_integration_leiden",
+          "obsm_umap": "X_leiden_scanorama_umap",
+          "obs_batch": "batch_label"
+        ],
+        toState: ["input": "output"]
+      )
+      | view {"After integration: $it"}
+      // Split integrated dataset back into a separate reference and query dataset
+      | split_h5mu.run(
+        fromState: [
+          "input": "input",
+          "modality": "modality"
+        ],
+        args: [
+          "obs_feature": "dataset",
+          "output_files": "sample_files.csv",
+          "drop_obs_nan": "true",
+          "output": "ref_query"
+        ],
+        toState: [
+          "output": "output",
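
The add_id -> copy_obs -> concatenate -> integrate -> split chain in main.nf keys everything on the "dataset" .obs column. Roughly equivalent logic in plain anndata, continuing the ref/qry objects from the earlier sketch (an assumed equivalence for illustration; the workflow itself does this through the concatenate_h5mu and split_h5mu components):

    # Sketch of the concatenate/split round trip via anndata.concat; the dict
    # keys become the values of the "dataset" label column, as add_id does here.
    import anndata as ad

    combined = ad.concat({"query": qry, "reference": ref}, label="dataset")
    # ... scanorama integration would fill combined.obsm["X_integrated_scanorama"] ...
    qry_int = combined[combined.obs["dataset"] == "query"].copy()
    ref_int = combined[combined.obs["dataset"] == "reference"].copy()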
"output_files": "output_files" + ], + auto: [ publish: true ] + ) + | view {"After sample splitting: $it"} + // map the integrated query and reference datasets back to the state + | map {id, state -> + def outputDir = state.output + def files = readCsv(state.output_files.toUriString()) + def query_file = files.findAll{ dat -> dat.name == 'query' } + assert query_file.size() == 1, 'there should only be one query file' + def reference_file = files.findAll{ dat -> dat.name == 'reference' } + assert reference_file.size() == 1, 'there should only be one reference file' + def integrated_query = outputDir.resolve(query_file.filename) + def integrated_reference = outputDir.resolve(reference_file.filename) + def newKeys = ["integrated_query": integrated_query, "integrated_reference": integrated_reference] + [id, state + newKeys] + } + | view {"After splitting query: $it"} + // Perform KNN label transfer from integrated reference to integrated query + | pynndescent_knn.run( + fromState: [ + "input": "integrated_query", + "modality": "modality", + "input_obsm_features": "output_obsm_integrated", + "reference": "integrated_reference", + "reference_obsm_features": "output_obsm_integrated", + "reference_obs_targets": "reference_obs_targets", + "output_obs_predictions": "output_obs_predictions", + "output_obs_probability": "output_obs_probability", + "output_compression": "output_compression", + "weights": "weights", + "n_neighbors": "n_neighbors", + "output": "workflow_output" + ], + toState: {id, output, state -> ["output": output.output]}, + auto: [ publish: true ] + ) + + emit: + output_ch +} diff --git a/src/workflows/annotation/scanorama_knn/nextflow.config b/src/workflows/annotation/scanorama_knn/nextflow.config new file mode 100644 index 00000000000..059100c489c --- /dev/null +++ b/src/workflows/annotation/scanorama_knn/nextflow.config @@ -0,0 +1,10 @@ +manifest { + nextflowVersion = '!>=20.12.1-edge' +} + +params { + rootDir = java.nio.file.Paths.get("$projectDir/../../../../").toAbsolutePath().normalize().toString() +} + +// include common settings +includeConfig("${params.rootDir}/src/workflows/utils/labels.config") diff --git a/src/workflows/annotation/scanorama_knn/test.nf b/src/workflows/annotation/scanorama_knn/test.nf new file mode 100644 index 00000000000..c814ff51fc6 --- /dev/null +++ b/src/workflows/annotation/scanorama_knn/test.nf @@ -0,0 +1,59 @@ +nextflow.enable.dsl=2 + +include {scanorama_knn } from params.rootDir + "/target/nextflow/workflows/annotation/scanorama_knn/main.nf" +include { scanorama_knn_test } from params.rootDir + "/target/nextflow/test_workflows/annotation/scanorama_knn_test/main.nf" + +workflow test_wf { + // allow changing the resources_test dir + resources_test = file("${params.rootDir}/resources_test") + + output_ch = Channel.fromList( + [ + [ + id: "simple_execution_test", + input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"), + reference: resources_test.resolve("annotation_test_data/TS_Blood_filtered.h5mu"), + input_obs_batch_label: "sample_id", + reference_obs_batch_label: "donor_assay", + reference_obs_targets: "cell_type", + leiden_resolution: [1.0, 0.25] + ], + [ + id: "no_leiden_resolutions_test", + input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"), + reference: resources_test.resolve("annotation_test_data/TS_Blood_filtered.h5mu"), + input_obs_batch_label: "sample_id", + reference_obs_batch_label: "donor_assay", + reference_obs_targets: "cell_type", + leiden_resolution: [] + ] + ]) + | map{ state 
+    | scanorama_knn
+    | view { output ->
+      assert output.size() == 2 : "Outputs should contain two elements; [id, state]"
+
+      // check id
+      def id = output[0]
+      assert id.endsWith("_test") : "Output ID should be same as input ID"
+
+      // check output
+      def state = output[1]
+      assert state instanceof Map : "State should be a map. Found: ${state}"
+      assert state.containsKey("output") : "State should contain key 'output'."
+      assert state.output.isFile() : "'output' should be a file."
+      assert state.output.toString().endsWith(".h5mu") : "Output file should end with '.h5mu'. Found: ${state.output}"
+
+      "Output: $output"
+    }
+    | scanorama_knn_test.run(
+      fromState: [
+        "input": "output"
+      ]
+    )
+    | toSortedList({a, b -> a[0] <=> b[0]})
+    | map { output_list ->
+      assert output_list.size() == 2 : "output channel should contain 2 events"
+      assert output_list.collect{it[0]} == ["no_leiden_resolutions_test", "simple_execution_test"]
+    }
+}
diff --git a/src/workflows/test_workflows/annotation/scanorama_knn/config.vsh.yaml b/src/workflows/test_workflows/annotation/scanorama_knn/config.vsh.yaml
new file mode 100644
index 00000000000..c7a2072d673
--- /dev/null
+++ b/src/workflows/test_workflows/annotation/scanorama_knn/config.vsh.yaml
@@ -0,0 +1,35 @@
+name: "scanorama_knn_test"
+namespace: "test_workflows/annotation"
+description: "This component tests the output of the scanorama_knn annotation workflow."
+authors:
+  - __merge__: /src/authors/dorien_roosen.yaml
+argument_groups:
+  - name: Inputs
+    arguments:
+      - name: "--input"
+        type: file
+        required: true
+        description: Path to h5mu output.
+        example: foo.final.h5mu
+resources:
+  - type: python_script
+    path: script.py
+  - path: /src/utils/setup_logger.py
+  - path: /src/base/openpipelinetestutils
+    dest: openpipelinetestutils
+engines:
+  - type: docker
+    image: python:3.12-slim
+    setup:
+      - type: docker
+        copy: ["openpipelinetestutils /opt/openpipelinetestutils"]
+      - type: apt
+        packages:
+          - procps
+      - type: python
+        packages: /opt/openpipelinetestutils
+      - type: python
+        __merge__: [/src/base/requirements/anndata_mudata.yaml, /src/base/requirements/viashpy.yaml, .]
+runners:
+  - type: executable
+  - type: nextflow
diff --git a/src/workflows/test_workflows/annotation/scanorama_knn/script.py b/src/workflows/test_workflows/annotation/scanorama_knn/script.py
new file mode 100644
index 00000000000..8e5e77edfdb
--- /dev/null
+++ b/src/workflows/test_workflows/annotation/scanorama_knn/script.py
@@ -0,0 +1,35 @@
+from mudata import read_h5mu
+import shutil
+import os
+import sys
+from pathlib import Path
+import pytest
+
+##VIASH START
+par = {
+    "input": "scanorama_knn/output.h5mu"
+}
+
+meta = {
+    "resources_dir": "resources_test"
+}
+##VIASH END
+
+
+def test_run():
+    input_mudata = read_h5mu(par["input"])
+    expected_obsm = ["X_integrated_scanorama", "X_leiden_scanorama_umap"]
+    expected_obs = ["cell_type_pred", "cell_type_probability"]
+    expected_obsp = ["scanorama_integration_connectivities", "scanorama_integration_distances"]
+
+    assert "rna" in list(input_mudata.mod.keys()), "Input should contain rna modality."
+    assert all(key in list(input_mudata.mod["rna"].obsm) for key in expected_obsm), f"Input mod['rna'] obsm columns should be: {expected_obsm}, found: {input_mudata.mod['rna'].obsm.keys()}."
+    assert all(key in list(input_mudata.mod["rna"].obs) for key in expected_obs), f"Input mod['rna'] obs columns should be: {expected_obs}, found: {input_mudata.mod['rna'].obs.keys()}."
+    assert all(key in list(input_mudata.mod["rna"].obsp) for key in expected_obsp), f"Input mod['rna'] obsp columns should be: {expected_obsp}, found: {input_mudata.mod['rna'].obsp.keys()}."
+
+
+if __name__ == "__main__":
+    HERE_DIR = Path(__file__).resolve().parent
+    shutil.copyfile(
+        os.path.join(meta["resources_dir"], "openpipelinetestutils", "conftest.py"),
+        os.path.join(HERE_DIR, "conftest.py"),
+    )
+    sys.exit(pytest.main(["--import-mode=importlib"]))
\ No newline at end of file
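
For a quick manual spot-check of a file produced by this workflow (the path below is a placeholder, not something defined in this PR), the same keys asserted by the test component can be inspected directly:

    # Spot-check of a workflow output; "output.h5mu" is a placeholder path.
    import mudata as md

    out = md.read_h5mu("output.h5mu").mod["rna"]
    print(list(out.obsm))  # expect X_integrated_scanorama, X_leiden_scanorama_umap
    print(list(out.obsp))  # expect scanorama_integration_{connectivities,distances}
    print(out.obs[["cell_type_pred", "cell_type_probability"]].head())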