Merge remote-tracking branch 'origin/main' into add-novel

openproblems-bio · Jan 8, 2025 · 4e15a89 · 4e15a89
2 parents 8197460 + 39f5cec
commit 4e15a89
Show file tree

Hide file tree

Showing 18 changed files with 600 additions and 29 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,9 @@
+# task_predict_modality 0.1.1
+
+## NEW FUNCTIONALITY
+
+* Added Simple MLP method (PR #3).
+
 # task_predict_modality 0.1.0
 
 Initial release after migrating the codebase.

diff --git a/README.md b/README.md
@@ -41,6 +41,8 @@ data shows that this is not trivial.
 | Kaiwen Deng        | contributor        |
 | Louise Deconinck   | author             |
 | Robrecht Cannoodt  | author, maintainer |
+| Xueer Chen         | contributor        |
+| Jiwei Liu          | contributor        |
 
 ## API
 

diff --git a/_viash.yaml b/_viash.yaml
@@ -73,7 +73,18 @@ authors:
     info:
       github: rcannood
       orcid: "0000-0003-3641-729X"
-
+  - name: Xueer Chen
+    roles: [ contributor ]
+    info:
+      github: xuerchen
+      email: [email protected]
+  - name: Jiwei Liu
+    roles: [ contributor ]
+    info:
+      github: daxiongshu
+      email: [email protected]
+      orcid: "0000-0002-8799-9763"
+
 links:
   issue_tracker: https://github.com/openproblems-bio/task_predict_modality/issues
   repository: https://github.com/openproblems-bio/task_predict_modality
@@ -84,8 +95,8 @@ info:
 
   test_resources:
     - type: s3
-      path: s3://openproblems-data/resources_test/common/
-      dest: resources_test/common
+      path: s3://openproblems-data/resources_test/common/openproblems_neurips2021
+      dest: resources_test/common/openproblems_neurips2021
     - type: s3
       path: s3://openproblems-data/resources_test/task_predict_modality/
       dest: resources_test/task_predict_modality

diff --git a/common b/common
diff --git a/scripts/create_datasets/test_resources.sh b/scripts/create_datasets/test_resources.sh
@@ -29,31 +29,25 @@ nextflow run . \
 
 echo "Run one method"
 
-viash run src/methods/knnr_py/config.vsh.yaml -- \
-  --input_train_mod1 $OUTPUT_DIR/openproblems_neurips2021/bmmc_cite/normal/train_mod1.h5ad \
-  --input_train_mod2 $OUTPUT_DIR/openproblems_neurips2021/bmmc_cite/normal/train_mod2.h5ad \
-  --input_test_mod1 $OUTPUT_DIR/openproblems_neurips2021/bmmc_cite/normal/test_mod1.h5ad \
-  --output $OUTPUT_DIR/openproblems_neurips2021/bmmc_cite/normal/prediction.h5ad
-
-viash run src/methods/knnr_py/config.vsh.yaml -- \
-  --input_train_mod1 $OUTPUT_DIR//openproblems_neurips2021/bmmc_cite/swap/train_mod1.h5ad \
-  --input_train_mod2 $OUTPUT_DIR//openproblems_neurips2021/bmmc_cite/swap/train_mod2.h5ad \
-  --input_test_mod1 $OUTPUT_DIR//openproblems_neurips2021/bmmc_cite/swap/test_mod1.h5ad \
-  --output $OUTPUT_DIR//openproblems_neurips2021/bmmc_cite/swap/prediction.h5ad
-
-viash run src/methods/knnr_py/config.vsh.yaml -- \
-  --input_train_mod1 $OUTPUT_DIR/openproblems_neurips2021/bmmc_multiome/normal/train_mod1.h5ad \
-  --input_train_mod2 $OUTPUT_DIR/openproblems_neurips2021/bmmc_multiome/normal/train_mod2.h5ad \
-  --input_test_mod1 $OUTPUT_DIR/openproblems_neurips2021/bmmc_multiome/normal/test_mod1.h5ad \
-  --output $OUTPUT_DIR/openproblems_neurips2021/bmmc_multiome/normal/prediction.h5ad
-
-viash run src/methods/knnr_py/config.vsh.yaml -- \
-  --input_train_mod1 $OUTPUT_DIR/openproblems_neurips2021/bmmc_multiome/swap/train_mod1.h5ad \
-  --input_train_mod2 $OUTPUT_DIR/openproblems_neurips2021/bmmc_multiome/swap/train_mod2.h5ad \
-  --input_test_mod1 $OUTPUT_DIR/openproblems_neurips2021/bmmc_multiome/swap/test_mod1.h5ad \
-  --output $OUTPUT_DIR/openproblems_neurips2021/bmmc_multiome/swap/prediction.h5ad
+# For every dataset/split: run the knnr_py baseline, then pre-train simple_mlp
+# so its model directory is available as a test resource.
+for name in bmmc_cite/normal bmmc_cite/swap bmmc_multiome/normal bmmc_multiome/swap; do
+  # All inputs and outputs of one dataset/split live under the same directory.
+  dataset_dir="$OUTPUT_DIR/openproblems_neurips2021/$name"
+  model_dir="$dataset_dir/models/simple_mlp"
+
+  viash run src/methods/knnr_py/config.vsh.yaml -- \
+    --input_train_mod1 "$dataset_dir/train_mod1.h5ad" \
+    --input_train_mod2 "$dataset_dir/train_mod2.h5ad" \
+    --input_test_mod1 "$dataset_dir/test_mod1.h5ad" \
+    --output "$dataset_dir/prediction.h5ad"
+
+  # pre-train simple_mlp
+  # -f: start from a clean model directory without failing when it does not
+  # exist yet (e.g. on the very first run).
+  rm -rf "$model_dir"
+  mkdir -p "$model_dir"
+  viash run src/methods/simple_mlp/train/config.vsh.yaml -- \
+    --input_train_mod1 "$dataset_dir/train_mod1.h5ad" \
+    --input_train_mod2 "$dataset_dir/train_mod2.h5ad" \
+    --input_test_mod1 "$dataset_dir/test_mod1.h5ad" \
+    --output "$model_dir/"
+done
 
 # only run this if you have access to the openproblems-data bucket
 aws s3 sync --profile op \
-  "$DATASET_DIR" s3://openproblems-data/resources_test/task_predict_modality \
+  resources_test/task_predict_modality \
+  s3://openproblems-data/resources_test/task_predict_modality \
   --delete --dryrun
diff --git a/src/methods/simple_mlp/predict/config.vsh.yaml b/src/methods/simple_mlp/predict/config.vsh.yaml
@@ -0,0 +1,28 @@
+__merge__: /src/api/comp_method_predict.yaml
+name: simplemlp_predict
+
+info:
+  test_setup:
+    with_model:
+      input_model: resources_test/task_predict_modality/openproblems_neurips2021/bmmc_cite/swap/models/simple_mlp
+
+resources:
+  - type: python_script
+    path: script.py
+  - path: ../resources/
+
+engines:
+  - type: docker
+    image: openproblems/base_pytorch_nvidia:1.0.0
+    # run_args: ["--gpus all --ipc=host"]
+    setup:
+      - type: python
+        pypi:
+          - scikit-learn
+          - scanpy
+          - pytorch-lightning
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [highmem, hightime, midcpu, gpu, highsharedmem]
diff --git a/src/methods/simple_mlp/predict/script.py b/src/methods/simple_mlp/predict/script.py
@@ -0,0 +1,112 @@
+from glob import glob
+import sys
+import numpy as np
+from scipy.sparse import csc_matrix
+import anndata as ad
+import torch
+from torch.utils.data import TensorDataset,DataLoader
+
+## VIASH START
+# Placeholder arguments for running this script directly during development.
+# NOTE(review): viash is expected to overwrite everything between the
+# VIASH START / VIASH END markers when the component is built - confirm.
+par = {
+    'input_train_mod1': 'resources_test/task_predict_modality/openproblems_neurips2021/bmmc_multiome/swap/train_mod1.h5ad',
+    'input_train_mod2': 'resources_test/task_predict_modality/openproblems_neurips2021/bmmc_multiome/swap/train_mod2.h5ad',
+    'input_test_mod1': 'resources_test/task_predict_modality/openproblems_neurips2021/bmmc_multiome/swap/test_mod1.h5ad',
+    'input_model': 'output/model',
+    'output': 'output/prediction'
+}
+meta = {
+    'config': 'target/executable/methods/simplemlp_predict/.config.vsh.yaml',
+    'resources_dir': 'target/executable/methods/simplemlp_predict',
+    # Read further down as the method_id of the written prediction; without it
+    # a direct run of this script fails with a KeyError.
+    'functionality_name': 'simplemlp_predict',
+    'cpus': 10
+}
+## VIASH END
+
+resources_dir = f"{meta['resources_dir']}/resources"
+sys.path.append(resources_dir)
+from models import MLP
+import utils
+
+def _predict(model,dl):
+    """Run `model` over every batch of `dl` and stack the outputs.
+
+    The model and the inputs are placed on CUDA when it is available and on
+    the CPU otherwise. The model is switched to eval mode and no gradients
+    are tracked.
+
+    Args:
+        model: callable torch module.
+        dl: iterable of batches; only element 0 of each batch is fed to the
+            model.
+
+    Returns:
+        A numpy array built by vertically stacking the per-batch outputs.
+    """
+    use_gpu = torch.cuda.is_available()
+    model = model.cuda() if use_gpu else model.cpu()
+    model.eval()
+
+    outputs = []
+    with torch.no_grad():
+        for batch in dl:
+            inputs = batch[0].cuda() if use_gpu else batch[0].cpu()
+            outputs.append(model(inputs).detach().cpu().numpy())
+    return np.vstack(outputs)
+
+
+print('Load data', flush=True)
+input_train_mod2 = ad.read_h5ad(par['input_train_mod2'])
+input_test_mod1 = ad.read_h5ad(par['input_test_mod1'])
+
+# The task name (e.g. 'GEX2ATAC') selects the stored ymean file, the
+# hyper-parameter YAML and the per-fold checkpoint directories.
+mod_1 = input_test_mod1.uns['modality']
+mod_2 = input_train_mod2.uns['modality']
+task = f'{mod_1}2{mod_2}'
+
+# Predictions live in modality-2 feature space, so the output width must come
+# from the modality-2 training data rather than from the modality-1 input.
+n_targets = input_train_mod2.n_vars
+
+print('Load ymean', flush=True)
+ymean_path = f"{par['input_model']}/{task}_ymean.npy"
+ymean = np.load(ymean_path)
+
+print('Start predict', flush=True)
+if task == 'GEX2ATAC':
+    # No network is used for this direction: the stored ymean (presumably the
+    # per-feature mean of the training targets - confirm against the train
+    # component) is broadcast to every test cell.
+    y_pred = ymean*np.ones([input_test_mod1.n_obs, n_targets])
+else:
+    folds = [0, 1, 2]
+
+    ymean = torch.from_numpy(ymean).float()
+    yaml_path=f"{resources_dir}/yaml/mlp_{task}.yaml"
+    config = utils.load_yaml(yaml_path)
+    X = input_test_mod1.layers["normalized"].toarray()
+    X = torch.from_numpy(X).float()
+
+    # The test loader does not depend on the fold, so it is built only once.
+    te_ds = TensorDataset(X)
+    te_loader = DataLoader(
+        te_ds,
+        batch_size=config.batch_size,
+        num_workers=0,
+        shuffle=False,
+        drop_last=False
+    )
+
+    # Sum the predictions of one model per fold, then average.
+    yp = 0
+    for fold in folds:
+        load_path = f"{par['input_model']}/{task}_fold_{fold}/**.ckpt"
+        print(load_path)
+        # Sorted so the choice is deterministic if several checkpoints exist.
+        ckpts = sorted(glob(load_path))
+        if not ckpts:
+            raise FileNotFoundError(f"No checkpoint found matching '{load_path}'")
+        model_inf = MLP.load_from_checkpoint(
+            ckpts[0],
+            in_dim=X.shape[1],
+            out_dim=n_targets,
+            ymean=ymean,
+            config=config
+        )
+        yp = yp + _predict(model_inf, te_loader)
+
+    y_pred = yp/len(folds)
+
+y_pred = csc_matrix(y_pred)
+
+# NOTE(review): only the layer, shape and uns are set here; no obs/var
+# annotations are copied from the inputs - confirm downstream components do
+# not rely on obs_names/var_names of the prediction.
+adata = ad.AnnData(
+    layers={"normalized": y_pred},
+    shape=y_pred.shape,
+    uns={
+        'dataset_id': input_test_mod1.uns['dataset_id'],
+        'method_id': meta['functionality_name'],
+    },
+)
+
+print('Write data', flush=True)
+adata.write_h5ad(par['output'], compression = "gzip")
diff --git a/src/methods/simple_mlp/resources/models.py b/src/methods/simple_mlp/resources/models.py
@@ -0,0 +1,71 @@
+import torch
+import pytorch_lightning as pl
+import torch.nn as nn
+import torch.nn.functional as F
+
+class MLP(pl.LightningModule):
+    """MLP regressor with two hidden layers and a skip connection.
+
+    ``fc1`` maps the input to ``H1`` units and ``fc2`` maps those to ``H2``
+    units; ``fc3`` maps the concatenation of both hidden activations to
+    ``out_dim``. Outputs whose ``ymean`` entry is below ``config.threshold``
+    are replaced by that ``ymean`` entry (see ``apply_mask``).
+
+    Args:
+        in_dim: number of input features.
+        out_dim: number of output features.
+        ymean: torch tensor that is broadcast against the model output in
+            ``apply_mask`` (presumably the per-target training mean - confirm
+            against the training component).
+        config: object exposing ``H1``, ``H2``, ``dropout``, ``threshold``,
+            ``lr``, ``wd``, ``lr_schedule`` and, for the ``'adam_cosin'``
+            schedule, ``epochs``.
+    """
+
+    def __init__(self,in_dim,out_dim,ymean,config):
+        super().__init__()
+        # ymean is a plain attribute, not a registered buffer, so it is not
+        # moved together with the module; place it on the GPU by hand.
+        if torch.cuda.is_available():
+            self.ymean = ymean.cuda()
+        else:
+            self.ymean = ymean
+        H1 = config.H1
+        H2 = config.H2
+        p = config.dropout
+        self.config = config
+        self.fc1 = nn.Linear(in_dim, H1)
+        self.fc2 = nn.Linear(H1,H2)
+        # fc3 consumes both hidden activations concatenated (skip connection).
+        self.fc3 = nn.Linear(H1+H2, out_dim)
+        self.dp2 = nn.Dropout(p=p)
+
+    def forward(self, x):
+        """Return the masked prediction for a batch of inputs."""
+        x1 = F.relu(self.fc1(x))
+        x1 = self.dp2(x1)
+        x = F.relu(self.fc2(x1))
+        x = torch.cat([x,x1],dim=1)
+        x = self.fc3(x)
+        x = self.apply_mask(x)
+        return x
+
+    def apply_mask(self,yp):
+        """Replace outputs whose ymean is below the threshold by ymean."""
+        tmp = torch.ones_like(yp).float()*self.ymean
+        mask = tmp<self.config.threshold
+        mask = mask.float()
+        return yp*(1-mask) + tmp*mask
+
+    def training_step(self, batch, batch_nb):
+        """Return the MSE loss of one training batch and log it as 'train_loss'."""
+        x,y = batch
+        yp = self(x)
+        criterion = nn.MSELoss()
+        loss = criterion(yp, y)
+        self.log('train_loss', loss, prog_bar=True)
+        return loss
+
+    def validation_step(self, batch, batch_idx):
+        """Return the MSE loss of one validation batch; its square root is logged as 'valid_RMSE'."""
+        x,y = batch
+        yp = self(x)
+        criterion = nn.MSELoss()
+        loss = criterion(yp, y)
+        self.log('valid_RMSE', loss**0.5, prog_bar=True)
+        return loss
+
+    def predict_step(self, batch, batch_idx):
+        """Return predictions for a batch given as a tensor or as a list/tuple whose first element is the input."""
+        # Decide on the container type rather than on len(): a bare tensor
+        # with two rows also has len() == 2 and must not be unpacked, and a
+        # one-element list (e.g. from a TensorDataset) must be unwrapped.
+        if isinstance(batch, (list, tuple)):
+            x = batch[0]
+        else:
+            x = batch
+        return self(x)
+
+    def configure_optimizers(self):
+        """Build Adam, optionally with cosine annealing, from the config.
+
+        Raises:
+            ValueError: if ``config.lr_schedule`` is neither ``'adam'`` nor
+                ``'adam_cosin'``.
+        """
+        lr = self.config.lr
+        # float() because a YAML value such as 1e-5 may be loaded as a string.
+        wd = float(self.config.wd)
+        adam = torch.optim.Adam(self.parameters(), lr=lr, weight_decay=wd)
+        if self.config.lr_schedule == 'adam':
+            return adam
+        elif self.config.lr_schedule == 'adam_cosin':
+            slr = torch.optim.lr_scheduler.CosineAnnealingLR(adam, self.config.epochs)
+            return [adam], [slr]
+        else:
+            # An explicit exception instead of `assert 0`, which is stripped
+            # when Python runs with -O.
+            raise ValueError(f"Unknown lr_schedule: {self.config.lr_schedule!r}")
diff --git a/src/methods/simple_mlp/resources/utils.py b/src/methods/simple_mlp/resources/utils.py
@@ -0,0 +1,37 @@
+import yaml
+from collections import namedtuple
+
+
+def to_site_donor(data):
+    """Build a frame that splits each batch label into site and donor.
+
+    Reads ``data.obs['batch']`` and returns a new frame with the columns
+    ``index`` (the former obs index), ``batch``, ``site`` (first two
+    characters of the batch label) and ``donor`` (the remaining characters).
+    """
+    frame = data.obs['batch'].copy().to_frame().reset_index()
+    frame.columns = ['index','batch']
+    labels = frame['batch']
+    frame['site'] = labels.apply(lambda label: label[:2])
+    frame['donor'] = labels.apply(lambda label: label[2:])
+    return frame
+
+
+def split(tr1, tr2, fold):
+    """Split two paired datasets into a training and a held-out part by site.
+
+    Rows whose site (as derived from ``tr1`` by ``to_site_donor``) equals
+    ``f's{fold+1}'`` are held out; all other rows are kept for training. The
+    same row mask is applied to ``tr1`` and ``tr2``.
+
+    Returns:
+        ``(X, y, Xt, yt)``: the "normalized" layers converted with
+        ``toarray()`` - training rows of ``tr1`` and ``tr2``, followed by the
+        held-out rows of ``tr1`` and ``tr2``.
+    """
+    site_frame = to_site_donor(tr1)
+    held_out = site_frame['site'] == f's{fold+1}'
+    kept = ~held_out
+
+    def dense(adata, rows):
+        return adata[rows].layers["normalized"].toarray()
+
+    Xt, yt = dense(tr1, held_out), dense(tr2, held_out)
+    X, y = dense(tr1, kept), dense(tr2, kept)
+
+    print(f"{X.shape}, {y.shape}, {Xt.shape}, {yt.shape}")
+
+    return X,y,Xt,yt
+
+
+def load_yaml(path):
+    """Load a YAML file of ``name: {value: ...}`` entries into a namedtuple.
+
+    Only the ``value`` field of every top-level entry is kept. The resulting
+    ``Config`` namedtuple is printed and returned.
+    """
+    with open(path) as handle:
+        raw = yaml.safe_load(handle)
+    values = {key: raw[key]['value'] for key in raw}
+    config = namedtuple('Config', values.keys())(**values)
+    print(config)
+    return config
diff --git a/src/methods/simple_mlp/resources/yaml/mlp_ADT2GEX.yaml b/src/methods/simple_mlp/resources/yaml/mlp_ADT2GEX.yaml
@@ -0,0 +1,29 @@
+
+# sample config defaults file
+epochs:
+  desc: Number of epochs to train over
+  value: 10
+batch_size:
+  desc: Size of each mini-batch
+  value: 512
+H1:
+  desc: Number of hidden neurons in 1st layer of MLP
+  value: 256
+H2:
+  desc: Number of hidden neurons in 2nd layer of MLP
+  value: 128
+dropout:
+  desc: probs of zeroing values
+  value: 0
+lr:
+  desc: learning rate
+  value: 0.001
+wd:
+  desc: weight decay
+  value: 1e-5
+threshold:
+  desc: outputs whose ymean is below this threshold are replaced by ymean
+  value: 0
+lr_schedule:
+  desc: learning rate scheduler
+  value: adam