diff --git a/CHANGELOG.md b/CHANGELOG.md index 467ca28..b6d94f6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,9 @@ +# task_predict_modality 0.1.1 + +## NEW FUNCTIONALITY + +* Added Simple MLP method (PR #3). + # task_predict_modality 0.1.0 Initial release after migrating the codebase. diff --git a/README.md b/README.md index 00f01fc..11451c0 100644 --- a/README.md +++ b/README.md @@ -41,6 +41,8 @@ data shows that this is not trivial. | Kaiwen Deng | contributor | | Louise Deconinck | author | | Robrecht Cannoodt | author, maintainer | +| Xueer Chen | contributor | +| Jiwei Liu | contributor | ## API diff --git a/_viash.yaml b/_viash.yaml index 1584b62..6952bf5 100644 --- a/_viash.yaml +++ b/_viash.yaml @@ -73,7 +73,18 @@ authors: info: github: rcannood orcid: "0000-0003-3641-729X" - + - name: Xueer Chen + roles: [ contributor ] + info: + github: xuerchen + email: xc2579@columbia.edu + - name: Jiwei Liu + roles: [ contributor ] + info: + github: daxiongshu + email: jiweil@nvidia.com + orcid: "0000-0002-8799-9763" + links: issue_tracker: https://github.com/openproblems-bio/task_predict_modality/issues repository: https://github.com/openproblems-bio/task_predict_modality @@ -84,8 +95,8 @@ info: test_resources: - type: s3 - path: s3://openproblems-data/resources_test/common/ - dest: resources_test/common + path: s3://openproblems-data/resources_test/common/openproblems_neurips2021 + dest: resources_test/common/openproblems_neurips2021 - type: s3 path: s3://openproblems-data/resources_test/task_predict_modality/ dest: resources_test/task_predict_modality diff --git a/common b/common index 65e05af..b505fd6 160000 --- a/common +++ b/common @@ -1 +1 @@ -Subproject commit 65e05af68a11ee87853fcf7a3c6b579001f21abe +Subproject commit b505fd616154190db807490589cb42b5e8b32192 diff --git a/scripts/create_datasets/test_resources.sh b/scripts/create_datasets/test_resources.sh index a913e88..d869d00 100755 --- a/scripts/create_datasets/test_resources.sh +++ 
b/scripts/create_datasets/test_resources.sh
@@ -29,31 +29,25 @@ nextflow run . \
 
 echo "Run one method"
 
-viash run src/methods/knnr_py/config.vsh.yaml -- \
-  --input_train_mod1 $OUTPUT_DIR/openproblems_neurips2021/bmmc_cite/normal/train_mod1.h5ad \
-  --input_train_mod2 $OUTPUT_DIR/openproblems_neurips2021/bmmc_cite/normal/train_mod2.h5ad \
-  --input_test_mod1 $OUTPUT_DIR/openproblems_neurips2021/bmmc_cite/normal/test_mod1.h5ad \
-  --output $OUTPUT_DIR/openproblems_neurips2021/bmmc_cite/normal/prediction.h5ad
-
-viash run src/methods/knnr_py/config.vsh.yaml -- \
-  --input_train_mod1 $OUTPUT_DIR//openproblems_neurips2021/bmmc_cite/swap/train_mod1.h5ad \
-  --input_train_mod2 $OUTPUT_DIR//openproblems_neurips2021/bmmc_cite/swap/train_mod2.h5ad \
-  --input_test_mod1 $OUTPUT_DIR//openproblems_neurips2021/bmmc_cite/swap/test_mod1.h5ad \
-  --output $OUTPUT_DIR//openproblems_neurips2021/bmmc_cite/swap/prediction.h5ad
-
-viash run src/methods/knnr_py/config.vsh.yaml -- \
-  --input_train_mod1 $OUTPUT_DIR/openproblems_neurips2021/bmmc_multiome/normal/train_mod1.h5ad \
-  --input_train_mod2 $OUTPUT_DIR/openproblems_neurips2021/bmmc_multiome/normal/train_mod2.h5ad \
-  --input_test_mod1 $OUTPUT_DIR/openproblems_neurips2021/bmmc_multiome/normal/test_mod1.h5ad \
-  --output $OUTPUT_DIR/openproblems_neurips2021/bmmc_multiome/normal/prediction.h5ad
-
-viash run src/methods/knnr_py/config.vsh.yaml -- \
-  --input_train_mod1 $OUTPUT_DIR/openproblems_neurips2021/bmmc_multiome/swap/train_mod1.h5ad \
-  --input_train_mod2 $OUTPUT_DIR/openproblems_neurips2021/bmmc_multiome/swap/train_mod2.h5ad \
-  --input_test_mod1 $OUTPUT_DIR/openproblems_neurips2021/bmmc_multiome/swap/test_mod1.h5ad \
-  --output $OUTPUT_DIR/openproblems_neurips2021/bmmc_multiome/swap/prediction.h5ad
+for name in bmmc_cite/normal bmmc_cite/swap bmmc_multiome/normal bmmc_multiome/swap; do
+  viash run src/methods/knnr_py/config.vsh.yaml -- \
+    --input_train_mod1 $OUTPUT_DIR/openproblems_neurips2021/$name/train_mod1.h5ad \
+    --input_train_mod2 $OUTPUT_DIR/openproblems_neurips2021/$name/train_mod2.h5ad \
+    --input_test_mod1 $OUTPUT_DIR/openproblems_neurips2021/$name/test_mod1.h5ad \
+    --output $OUTPUT_DIR/openproblems_neurips2021/$name/prediction.h5ad
+
+  # pre-train simple_mlp; -f lets the cleanup succeed when the model dir does not exist yet
+  rm -rf $OUTPUT_DIR/openproblems_neurips2021/$name/models/simple_mlp/
+  mkdir -p $OUTPUT_DIR/openproblems_neurips2021/$name/models/simple_mlp/
+  viash run src/methods/simple_mlp/train/config.vsh.yaml -- \
+    --input_train_mod1 $OUTPUT_DIR/openproblems_neurips2021/$name/train_mod1.h5ad \
+    --input_train_mod2 $OUTPUT_DIR/openproblems_neurips2021/$name/train_mod2.h5ad \
+    --input_test_mod1 $OUTPUT_DIR/openproblems_neurips2021/$name/test_mod1.h5ad \
+    --output $OUTPUT_DIR/openproblems_neurips2021/$name/models/simple_mlp/
+done
 
 # only run this if you have access to the openproblems-data bucket
 aws s3 sync --profile op \
-  "$DATASET_DIR" s3://openproblems-data/resources_test/task_predict_modality \
+  resources_test/task_predict_modality \
+  s3://openproblems-data/resources_test/task_predict_modality \
   --delete --dryrun
diff --git a/src/methods/simple_mlp/predict/config.vsh.yaml b/src/methods/simple_mlp/predict/config.vsh.yaml
new file mode 100644
index 0000000..4fc47e0
--- /dev/null
+++ b/src/methods/simple_mlp/predict/config.vsh.yaml
@@ -0,0 +1,28 @@
+__merge__: /src/api/comp_method_predict.yaml
+name: simplemlp_predict
+
+info:
+  test_setup:
+    with_model:
+      input_model: resources_test/task_predict_modality/openproblems_neurips2021/bmmc_cite/swap/models/simple_mlp
+
+resources:
+  - type: python_script
+    path: script.py
+  - path: ../resources/
+
+engines:
+  - type: docker
+    image: openproblems/base_pytorch_nvidia:1.0.0
+    # run_args: ["--gpus all --ipc=host"]
+    setup:
+      - type: python
+        pypi:
+          - scikit-learn
+          - scanpy
+          - pytorch-lightning
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [highmem, hightime, midcpu, gpu, highsharedmem]
diff --git a/src/methods/simple_mlp/predict/script.py
b/src/methods/simple_mlp/predict/script.py
new file mode 100644
index 0000000..f95aada
--- /dev/null
+++ b/src/methods/simple_mlp/predict/script.py
@@ -0,0 +1,121 @@
+"""Predict with the pre-trained Simple MLP checkpoints and write the result as an h5ad file."""
+from glob import glob
+import sys
+import numpy as np
+from scipy.sparse import csc_matrix
+import anndata as ad
+import torch
+from torch.utils.data import TensorDataset,DataLoader
+
+## VIASH START
+par = {
+  'input_train_mod1': 'resources_test/task_predict_modality/openproblems_neurips2021/bmmc_multiome/swap/train_mod1.h5ad',
+  'input_train_mod2': 'resources_test/task_predict_modality/openproblems_neurips2021/bmmc_multiome/swap/train_mod2.h5ad',
+  'input_test_mod1': 'resources_test/task_predict_modality/openproblems_neurips2021/bmmc_multiome/swap/test_mod1.h5ad',
+  'input_model': 'output/model',
+  'output': 'output/prediction'
+}
+meta = {
+  'config': 'target/executable/methods/simplemlp_predict/.config.vsh.yaml',
+  'resources_dir': 'target/executable/methods/simplemlp_predict',
+  'functionality_name': 'simplemlp_predict',
+  'cpus': 10
+}
+## VIASH END
+
+# models.py, utils.py and the yaml configs are read from the component's resources dir
+resources_dir = f"{meta['resources_dir']}/resources"
+sys.path.append(resources_dir)
+from models import MLP
+import utils
+
+def _predict(model,dl):
+    """Return the model outputs for every batch of `dl`, stacked into one numpy array.
+
+    The model and the first tensor of each batch are moved to the GPU when CUDA
+    is available and to the CPU otherwise; gradients are disabled throughout.
+    """
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+    model = model.to(device)
+    model.eval()
+    yps = []
+    with torch.no_grad():
+        for x in dl:
+            yp = model(x[0].to(device))
+            yps.append(yp.detach().cpu().numpy())
+    return np.vstack(yps)
+
+
+print('Load data', flush=True)
+input_train_mod2 = ad.read_h5ad(par['input_train_mod2'])
+input_test_mod1 = ad.read_h5ad(par['input_test_mod1'])
+
+# determine variables
+mod_1 = input_test_mod1.uns['modality']
+mod_2 = input_train_mod2.uns['modality']
+
+task = f'{mod_1}2{mod_2}'
+# predictions are modality-2 features, so the output width comes from train_mod2, not test_mod1
+n_out = input_train_mod2.n_vars
+
+print('Load ymean', flush=True)
+ymean_path = f"{par['input_model']}/{task}_ymean.npy"
+ymean = np.load(ymean_path)
+
+print('Start predict', flush=True)
+if task == 'GEX2ATAC':
+    # no checkpoint is loaded for this task: broadcast the stored ymean to every test cell
+    y_pred = ymean*np.ones([input_test_mod1.n_obs, n_out])
+else:
+    folds = [0, 1, 2]
+
+    ymean = torch.from_numpy(ymean).float()
+    yaml_path=f"{resources_dir}/yaml/mlp_{task}.yaml"
+    config = utils.load_yaml(yaml_path)
+    X = input_test_mod1.layers["normalized"].toarray()
+    X = torch.from_numpy(X).float()
+
+    # the test loader is identical for every fold, so build it once
+    te_ds = TensorDataset(X)
+    te_loader = DataLoader(
+        te_ds,
+        batch_size=config.batch_size,
+        num_workers=0,
+        shuffle=False,
+        drop_last=False
+    )
+
+    yp = 0
+    for fold in folds:
+        # load_path = f"{par['input_model']}/{task}_fold_{fold}/version_0/checkpoints/*"
+        # without recursive=True, '**' matches like '*': only checkpoints directly in the fold dir
+        load_path = f"{par['input_model']}/{task}_fold_{fold}/**.ckpt"
+        print(load_path)
+        ckpts = sorted(glob(load_path))
+        if not ckpts:
+            raise FileNotFoundError(f"No checkpoint found for fold {fold}: {load_path}")
+        model_inf = MLP.load_from_checkpoint(
+            ckpts[0],
+            in_dim=X.shape[1],
+            out_dim=n_out,
+            ymean=ymean,
+            config=config
+        )
+        yp = yp + _predict(model_inf, te_loader)
+
+    # average the per-fold predictions
+    y_pred = yp/len(folds)
+
+y_pred = csc_matrix(y_pred)
+
+adata = ad.AnnData(
+    layers={"normalized": y_pred},
+    shape=y_pred.shape,
+    uns={
+        'dataset_id': input_test_mod1.uns['dataset_id'],
+        'method_id': meta['functionality_name'],
+    },
+)
+
+print('Write data', flush=True)
+adata.write_h5ad(par['output'], compression = "gzip")
diff --git a/src/methods/simple_mlp/resources/models.py b/src/methods/simple_mlp/resources/models.py
new file mode 100644
index 0000000..cf3af25
--- /dev/null
+++ b/src/methods/simple_mlp/resources/models.py
@@ -0,0 +1,71 @@
+import torch
+import pytorch_lightning as pl
+import torch.nn as nn
+import torch.nn.functional as F
+
+class MLP(pl.LightningModule):
+    def __init__(self,in_dim,out_dim,ymean,config):
+        super(MLP, self).__init__()
+        if torch.cuda.is_available():
+            self.ymean = ymean.cuda()
+        else:
+            self.ymean = ymean
+        H1 = config.H1
+        H2 = config.H2
+        p = config.dropout
+        self.config = config
+        self.fc1 = nn.Linear(in_dim, H1)
+        self.fc2 = nn.Linear(H1,H2)
+        self.fc3 = nn.Linear(H1+H2, out_dim)
+        self.dp2 = nn.Dropout(p=p)
+
+    def forward(self, x):
+        x0 = x
+        x1 = F.relu(self.fc1(x))
+        x1 = self.dp2(x1)
+        x =
F.relu(self.fc2(x1)) + x = torch.cat([x,x1],dim=1) + x = self.fc3(x) + x = self.apply_mask(x) + return x + + def apply_mask(self,yp): + tmp = torch.ones_like(yp).float()*self.ymean + mask = tmp