nadeemlab · jimmymathews · Jun 21, 2024 · Jun 5, 2024 · Jun 7, 2024 · Jun 7, 2024
diff --git a/.gitignore b/.gitignore
@@ -20,6 +20,7 @@ build/*/Dockerfile
 /Dockerfile
 **.whl
 build/*/docker.built
+build/plugins/**/*.built
 build/*/requirements.txt
 build/*/specific_requirements.txt
 build/*/dlogs.*.txt

diff --git a/.gitmodules b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "plugin/graph_processing/graph-transformer/tmi2022"]
+	path = plugin/graph_processing/graph-transformer/tmi2022
+	url = git@github.com:CarlinLiao/tmi2022.git
diff --git a/Makefile b/Makefile
diff --git a/build/plugins/graph_processing/cg-gnn-cuda.dockerfile b/build/plugins/graph_processing/cg-gnn-cuda.dockerfile
@@ -0,0 +1,32 @@
+FROM pytorch/pytorch:2.1.2-cuda11.8-cudnn8-runtime
+WORKDIR /app
+
+# Install apt packages you need here. Refreshing the index, installing, and
+# cleaning up happen in a single layer: a standalone `apt-get update` layer can
+# be reused stale from the build cache, and removing the package lists in a
+# later layer does not make the image any smaller.
+RUN apt-get update \
+    && apt-get install -y \
+        gfortran \
+        libatlas-base-dev \
+        libblas-dev \
+        libhdf5-serial-dev \
+        liblapack-dev \
+        libpq-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install python packages you need here
+ENV PIP_NO_CACHE_DIR=1
+RUN pip install \
+        h5py==3.10.0 \
+        numpy==1.24.3 \
+        scipy==1.10.1
+# NOTE(review): dgl and dglgo are not version-pinned, so rebuilds may pick up
+# different releases; pin them once a known-good pair is confirmed.
+RUN pip install dgl -f https://data.dgl.ai/wheels/cu118/repo.html
+RUN pip install dglgo -f https://data.dgl.ai/wheels-test/repo.html
+ENV DGLBACKEND=pytorch
+RUN pip install cg-gnn==0.3.2
+
+# Make the files you need in this directory available everywhere in the container.
+# COPY is used rather than ADD because only a plain local copy is needed.
+COPY . /app
+# Mark the three entry points executable and expose them on PATH under the
+# command names the SPT plugin interface expects.
+RUN chmod +x \
+        /app/train.py \
+        /app/print_graph_config.sh \
+        /app/print_training_config.sh \
+    && mv /app/train.py /usr/local/bin/spt-plugin-train-on-graphs \
+    && mv /app/print_graph_config.sh /usr/local/bin/spt-plugin-print-graph-request-configuration \
+    && mv /app/print_training_config.sh /usr/local/bin/spt-plugin-print-training-configuration
diff --git a/build/plugins/graph_processing/cg-gnn.dockerfile b/build/plugins/graph_processing/cg-gnn.dockerfile
@@ -0,0 +1,34 @@
+# Use cg-gnn-cuda.dockerfile instead if you have a CUDA-enabled GPU
+# NOTE(review): Debian buster has reached end of life; confirm that `apt-get update`
+# still works against this base image and consider moving to a supported release.
+FROM python:3.11-slim-buster
+WORKDIR /app
+
+# Install apt packages you need here. Refreshing the index, installing, and
+# cleaning up happen in a single layer: a standalone `apt-get update` layer can
+# be reused stale from the build cache, and removing the package lists in a
+# later layer does not make the image any smaller.
+RUN apt-get update \
+    && apt-get install -y \
+        gfortran \
+        libatlas-base-dev \
+        libblas-dev \
+        libhdf5-serial-dev \
+        liblapack-dev \
+        libpq-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install python packages you need here
+ENV PIP_NO_CACHE_DIR=1
+RUN pip install \
+        h5py==3.10.0 \
+        numpy==1.24.3 \
+        scipy==1.10.1
+# torch is pinned to the version shipped in the CUDA image's base
+# (pytorch/pytorch:2.1.2-...) so that both variants run the same release.
+RUN pip install torch==2.1.2 --index-url https://download.pytorch.org/whl/cpu
+# NOTE(review): dgl and dglgo are not version-pinned, so rebuilds may pick up
+# different releases; pin them once a known-good pair is confirmed.
+RUN pip install dgl -f https://data.dgl.ai/wheels/repo.html
+RUN pip install dglgo -f https://data.dgl.ai/wheels-test/repo.html
+ENV DGLBACKEND=pytorch
+RUN pip install cg-gnn==0.3.2
+
+# Make the files you need in this directory available everywhere in the container.
+# COPY is used rather than ADD because only a plain local copy is needed.
+COPY . /app
+# Mark the three entry points executable and expose them on PATH under the
+# command names the SPT plugin interface expects.
+RUN chmod +x \
+        /app/train.py \
+        /app/print_graph_config.sh \
+        /app/print_training_config.sh \
+    && mv /app/train.py /usr/local/bin/spt-plugin-train-on-graphs \
+    && mv /app/print_graph_config.sh /usr/local/bin/spt-plugin-print-graph-request-configuration \
+    && mv /app/print_training_config.sh /usr/local/bin/spt-plugin-print-training-configuration
diff --git a/build/plugins/graph_processing/graph-transformer.dockerfile b/build/plugins/graph_processing/graph-transformer.dockerfile
@@ -0,0 +1,27 @@
+FROM pytorch/pytorch:2.1.2-cuda11.8-cudnn8-runtime
+WORKDIR /app
+
+# No apt packages are installed for this image at the moment, so the index is
+# not refreshed either. If packages become necessary, refresh the index,
+# install, and clean up within a single RUN so that no stale or leftover
+# package lists end up in a layer, e.g.:
+#   RUN apt-get update \
+#       && apt-get install -y <packages> \
+#       && rm -rf /var/lib/apt/lists/*
+
+# Install python packages you need here
+ENV PIP_NO_CACHE_DIR=1
+RUN pip install \
+        h5py==3.10.0 \
+        numpy==1.24.3 \
+        scipy==1.10.1
+# NOTE(review): the packages below are not version-pinned, so rebuilds may pick
+# up different releases; pin them once a known-good set is confirmed.
+RUN pip install pandas
+RUN pip install pillow
+RUN pip install tensorboardX
+RUN pip install opencv-python
+RUN pip install einops
+RUN pip install torch-geometric
+
+# Make the files you need in this directory available everywhere in the container.
+# COPY is used rather than ADD because only a plain local copy is needed.
+COPY . /app
+# Mark the three entry points executable and expose them on PATH under the
+# command names the SPT plugin interface expects.
+RUN chmod +x \
+        /app/train.py \
+        /app/print_graph_config.sh \
+        /app/print_training_config.sh \
+    && mv /app/train.py /usr/local/bin/spt-plugin-train-on-graphs \
+    && mv /app/print_graph_config.sh /usr/local/bin/spt-plugin-print-graph-request-configuration \
+    && mv /app/print_training_config.sh /usr/local/bin/spt-plugin-print-training-configuration
diff --git a/plugin/README.md b/plugin/README.md
@@ -0,0 +1,3 @@
+# Plugins
+
+This directory contains various plugins and plugin archetypes for the SPT platform.
diff --git a/plugin/graph_processing/README.md b/plugin/graph_processing/README.md
@@ -0,0 +1,22 @@
+# Graph processing
+
+These plugins create and process cell graphs to make downstream predictions.
+
+## Development process
+
+Addition of new graph processing plugins to SPT is done in three steps:
+1. **Exploratory stage**: A user evaluates their processing model by manually using the output and upload functions described in the following section and examining the results locally or in an SPT instance after upload.
+2. **Proposal stage**: If the user determines that their graph model is a good candidate for inclusion in SPT as a default graph deep learning plugin that could be run on every new study imported into SPT, they containerize their model according to the Docker template defined in `template/`, upload it to a fork of this repository, and open an issue or pull request asking for their plugin to be included in SPT.
+3. **Inclusion stage**: The SPT maintainers review the proposed plugin and, if it is accepted, upload the container to the SPT Docker page and modify the SPT code to pull down and use that Docker image by default.
+
+## Graph processing plugins should look like this
+
+Graph processing plugins are to be made available as Docker images, built from a Dockerfile following the template provided in `Dockerfile`.
+
+Each plugin should have the following commands available from anywhere in the Docker image:
+* `spt-plugin-print-graph-request-configuration`, which prints to `stdout` the configuration file intended to be used by this plugin to fetch graphs from an SPT instance to use for model training. An empty configuration file and a shell script to do this are provided in this repo, as well as the command needed to make this available in the template `Dockerfile`.
+* `spt-plugin-train-on-graphs`, which trains the model and outputs a CSV of importance scores that can be read by `spt graphs upload-importances`. A template `train.py` is provided that uses a command line interface specified in `train_cli.py`. The template `Dockerfile` provides a command to make this script available anywhere in the Docker image. Its arguments are:
+    1. `--input_directory`, the path to the directory containing the graphs to train on.
+    2. `--config_file`, the path to the configuration file. This should be optional, and if not provided `spt-plugin-train-on-graphs` should use reasonable defaults.
+    3. `--output_directory`, the path to the directory in which to save the trained model, importance scores, and any other artifacts deemed important enough to save, like performance reports.
+* `spt-plugin-print-training-configuration`, which prints to `stdout` an example configuration file for running `spt-plugin-train-on-graphs`, populated either with example values or the reasonable defaults used by the command. An empty configuration file and a shell script to do this are provided in this repo, as well as the command needed to make this available in the template `Dockerfile`.
diff --git a/plugin/graph_processing/cg-gnn/README.md b/plugin/graph_processing/cg-gnn/README.md
@@ -0,0 +1,3 @@
+# spt-cg-gnn
+
+This builds the cg-gnn SPT plugin as a Docker image.
diff --git a/plugin/graph_processing/cg-gnn/graph.config b/plugin/graph_processing/cg-gnn/graph.config
@@ -0,0 +1,7 @@
+[graph-generation]
+validation_data_percent = 0
+test_data_percent = 15
+; use_channels = true
+; use_phenotypes = true
+; cells_per_roi_target = 5000
+target_name = <name of your target>
diff --git a/plugin/graph_processing/cg-gnn/print_graph_config.sh b/plugin/graph_processing/cg-gnn/print_graph_config.sh
@@ -0,0 +1,2 @@
+#!/bin/sh
+# Write the graph request configuration stored at /app/graph.config to stdout.
+exec cat /app/graph.config
diff --git a/plugin/graph_processing/cg-gnn/print_training_config.sh b/plugin/graph_processing/cg-gnn/print_training_config.sh
@@ -0,0 +1,2 @@
+#!/bin/sh
+# Write the training configuration stored at /app/training.config to stdout.
+exec cat /app/training.config
diff --git a/plugin/graph_processing/cg-gnn/train.py b/plugin/graph_processing/cg-gnn/train.py
@@ -0,0 +1,130 @@
+#!/usr/bin/env python3
+"""Convert SPT graph objects to CG-GNN graph objects and run training and evaluation with them."""
+
+from sys import path
+from configparser import ConfigParser
+from os import remove
+from os.path import join, exists
+from configparser import ConfigParser
+from warnings import warn
+
+from numpy import nonzero  # type: ignore
+from networkx import to_scipy_sparse_array  # type: ignore
+from torch import (
+    FloatTensor,
+    IntTensor,  # type: ignore
+)
+from dgl import DGLGraph, graph
+from cggnn.util import GraphData, save_cell_graphs, load_cell_graphs
+from cggnn.util.constants import INDICES, CENTROIDS, FEATURES, IMPORTANCES
+from cggnn.run import train_and_evaluate
+
+path.append('/app')  # noqa
+from train_cli import parse_arguments, DEFAULT_CONFIG_FILE
+from util import HSGraph, GraphData as SPTGraphData, load_hs_graphs, save_hs_graphs
+
+
+def _convert_spt_graph(g_spt: HSGraph) -> DGLGraph:
+    """Convert a SPT HSGraph to a CG-GNN cell graph."""
+    num_nodes = g_spt.node_features.shape[0]
+    g_dgl = graph([])
+    g_dgl.add_nodes(num_nodes)
+    g_dgl.ndata[INDICES] = IntTensor(g_spt.histological_structure_ids)
+    g_dgl.ndata[CENTROIDS] = FloatTensor(g_spt.centroids)
+    g_dgl.ndata[FEATURES] = FloatTensor(g_spt.node_features)
+    # Note: channels and phenotypes are binary variables, but DGL only supports FloatTensors
+    edge_list = nonzero(g_spt.adj.toarray())
+    g_dgl.add_edges(list(edge_list[0]), list(edge_list[1]))
+    return g_dgl
+
+
+def _convert_spt_graph_data(g_spt: SPTGraphData) -> GraphData:
+    """Convert a SPT GraphData object to a CG-GNN/DGL GraphData object."""
+    return GraphData(
+        graph=_convert_spt_graph(g_spt.graph),
+        label=g_spt.label,
+        name=g_spt.name,
+        specimen=g_spt.specimen,
+        set=g_spt.set,
+    )
+
+
+def _convert_spt_graphs_data(graphs_data: list[SPTGraphData]) -> list[GraphData]:
+    """Convert a list of SPT HSGraphs to CG-GNN cell graphs."""
+    return [_convert_spt_graph_data(g_spt) for g_spt in graphs_data]
+
+
+def _convert_dgl_graph(g_dgl: DGLGraph) -> HSGraph:
+    """Convert a DGLGraph to a CG-GNN cell graph."""
+    return HSGraph(
+        adj=to_scipy_sparse_array(g_dgl.to_networkx()),
+        node_features=g_dgl.ndata[FEATURES].detach().cpu().numpy(),
+        centroids=g_dgl.ndata[CENTROIDS].detach().cpu().numpy(),
+        histological_structure_ids=g_dgl.ndata[INDICES].detach().cpu().numpy(),
+        importances=g_dgl.ndata[IMPORTANCES].detach().cpu().numpy() if (IMPORTANCES in g_dgl.ndata)
+        else None,
+    )
+
+
+def _convert_dgl_graph_data(g_dgl: GraphData) -> SPTGraphData:
+    return SPTGraphData(
+        graph=_convert_dgl_graph(g_dgl.graph),
+        label=g_dgl.label,
+        name=g_dgl.name,
+        specimen=g_dgl.specimen,
+        set=g_dgl.set,
+    )
+
+
+def _convert_dgl_graphs_data(graphs_data: list[GraphData]) -> list[SPTGraphData]:
+    """Convert a list of DGLGraphs to CG-GNN cell graphs."""
+    return [_convert_dgl_graph_data(g_dgl) for g_dgl in graphs_data]
+
+
+def _handle_random_seed_values(random_seed_value: str | None) -> int | None:
+    if (random_seed_value is not None) and (str(random_seed_value).strip().lower() != "none"):
+        return int(random_seed_value)
+    return None
+
+
+if __name__ == '__main__':
+    args = parse_arguments()
+    config_file = ConfigParser()
+    config_file.read(args.config_file)
+    random_seed: int | None = None
+    if 'general' in config_file:
+        random_seed = _handle_random_seed_values(config_file['general'].get('random_seed', None))
+    if 'cg-gnn' not in config_file:
+        warn('No cg-gnn section in config file. Using default values.')
+        config_file.read(DEFAULT_CONFIG_FILE)
+    config = config_file['cg-gnn']
+
+    in_ram: bool = config.getboolean('in_ram', True)
+    batch_size: int = config.getint('batch_size', 32)
+    epochs: int = config.getint('epochs', 10)
+    learning_rate: float = config.getfloat('learning_rate', 1e-3)
+    k_folds: int = config.getint('k_folds', 5)
+    explainer: str = config.get('explainer', 'pp')
+    merge_rois: bool = config.getboolean('merge_rois', True)
+    if random_seed is None:
+        random_seed = _handle_random_seed_values(config.get('random_seed', None))
+
+    spt_graphs, _ = load_hs_graphs(args.input_directory)
+    save_cell_graphs(_convert_spt_graphs_data(spt_graphs), args.output_directory)
+
+    model, graphs_data, hs_id_to_importances = train_and_evaluate(args.output_directory,
+                                                                  in_ram,
+                                                                  batch_size,
+                                                                  epochs,
+                                                                  learning_rate,
+                                                                  k_folds,
+                                                                  explainer,
+                                                                  merge_rois,
+                                                                  random_seed)
+
+    save_hs_graphs(_convert_dgl_graphs_data(load_cell_graphs(args.output_directory)[0]),
+                   args.output_directory)
+    for filename in ('graphs.bin', 'graph_info.pkl'):
+        graphs_file = join(args.output_directory, filename)
+        if exists(graphs_file):
+            remove(graphs_file)
diff --git a/plugin/graph_processing/cg-gnn/train_cli.py b/plugin/graph_processing/cg-gnn/train_cli.py
@@ -0,0 +1,28 @@
+#!/usr/bin/env python3
+"""Process arguments to training command."""
+
+from argparse import ArgumentParser
+
+DEFAULT_CONFIG_FILE = 'training.config'
+
+
+def parse_arguments():
+    """Parse arguments."""
+    arg_parser = ArgumentParser()
+    arg_parser.add_argument(
+        '--input_directory',
+        type=str,
+        help='Path to the directory containing the cell graphs to be used for training.',
+    )
+    arg_parser.add_argument(
+        '--config_file',
+        type=str,
+        help='Path to config file.',
+        default=DEFAULT_CONFIG_FILE,
+    )
+    arg_parser.add_argument(
+        '--output_directory',
+        type=str,
+        help='Path to the directory containing the cell graphs to be used for training.',
+    )
+    return arg_parser.parse_args()
diff --git a/plugin/graph_processing/cg-gnn/training.config b/plugin/graph_processing/cg-gnn/training.config
@@ -0,0 +1,8 @@
+[cg-gnn]
+; in_ram = true
+batch_size = 1
+epochs = 5
+learning_rate = 1e-3
+k_folds = 0
+; explainer = pp
+merge_rois = true
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		# Plugins

		This directory contains various plugins and plugin archetypes for the SPT platform.
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		# spt-cg-gnn

		This builds the cg-gnn SPT plugin as a Docker image.