Merge remote-tracking branch 'origin/main' into docs
RaulPPelaez committed Oct 30, 2023
2 parents a113068 + e5fc011 commit cf446bf
Showing 8 changed files with 161 additions and 27 deletions.
28 changes: 26 additions & 2 deletions README.md
@@ -13,15 +13,22 @@ TorchMD-NET provides state-of-the-art neural network potentials (NNPs) and a me
- [TensorNet](https://arxiv.org/abs/2306.06482)


## Installation
TorchMD-Net is available in [conda-forge](https://conda-forge.org/) and can be installed with:
```shell
mamba install torchmd-net
```
We recommend using [Mamba](https://github.com/conda-forge/miniforge/#mambaforge) instead of conda.

### Install from source

1. Clone the repository:
```shell
git clone https://github.com/torchmd/torchmd-net.git
cd torchmd-net
```

2. Install the dependencies listed in `environment.yml`. You can do it via pip, but we recommend [Mambaforge](https://github.com/conda-forge/miniforge/#mambaforge) instead.

3. Create an environment and activate it:
```shell
mamba env create -f environment.yml
mamba activate torchmd-net
```

@@ -36,6 +43,23 @@
This will install TorchMD-NET in editable mode, so that changes to the source code are immediately available.
Besides making all Python utilities available environment-wide, this will also install the `torchmd-train` command line utility.
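For example, after installation you can launch a training run directly from the command line (the configuration file here is one of the bundled examples; substitute your own):
```shell
torchmd-train --conf examples/ET-QM9.yaml
```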


#### CUDA-enabled installation

Besides the dependencies listed in the environment file, you will also need the CUDA `nvcc` compiler suite to build TorchMD-Net.
If your system lacks `nvcc`, you can install it via conda-forge:
```shell
mamba install cudatoolkit-dev
```
Or from the nvidia channel:
```shell
mamba install -c nvidia cuda-nvcc cuda-cudart-dev cuda-libraries-dev
```
Make sure you install a major version compatible with your PyTorch installation, which you can check with:
```shell
python -c "import torch; print(torch.version.cuda)"
```
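As a quick sanity check, assuming `nvcc` is on your `PATH`, the major version it reports should match the one above:
```shell
nvcc --version | grep release
python -c "import torch; print(torch.version.cuda)"
```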

## Usage
Training arguments can be specified either via a YAML configuration file or directly on the command line. Several examples of architectural and training specifications for some models and datasets can be found in [examples/](https://github.com/torchmd/torchmd-net/tree/main/examples). Note that if a parameter is present both in the YAML file and on the command line, the command-line version takes precedence.
GPUs can be selected by setting the `CUDA_VISIBLE_DEVICES` environment variable. Otherwise, the argument `--ngpus` can be used to select the number of GPUs to train on (-1, the default, uses all available GPUs or the ones specified in `CUDA_VISIBLE_DEVICES`). Keep in mind that the [GPU ID reported by nvidia-smi might not be the same as the one `CUDA_VISIBLE_DEVICES` uses](https://stackoverflow.com/questions/26123252/inconsistency-of-ids-between-nvidia-smi-l-and-cudevicegetname).
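For example, a hypothetical run that trains on two specific GPUs with a user-supplied configuration file could look like:
```shell
CUDA_VISIBLE_DEVICES=0,1 torchmd-train --conf my-config.yaml --ngpus 2
```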
39 changes: 38 additions & 1 deletion tests/test_calculator.py
@@ -1,10 +1,11 @@
import torch
from torch.testing import assert_allclose
import pytest
from pytest import mark
from glob import glob
from os.path import dirname, join
from torchmdnet.calculators import External
from torchmdnet.models.model import load_model, create_model

from utils import create_example_batch

@@ -21,6 +22,42 @@ def test_compare_forward():
assert_allclose(e_calc, e_pred)
assert_allclose(f_calc, f_pred.unsqueeze(0))

def test_compare_forward_cuda_graph():
if not torch.cuda.is_available():
pytest.skip("CUDA not available")
checkpoint = join(dirname(dirname(__file__)), "tests", "example.ckpt")
args = {"model": "tensornet",
"embedding_dimension": 128,
"num_layers": 2,
"num_rbf": 32,
"rbf_type": "expnorm",
"trainable_rbf": False,
"activation": "silu",
"cutoff_lower": 0.0,
"cutoff_upper": 5.0,
"max_z": 100,
"max_num_neighbors": 128,
"equivariance_invariance_group": "O(3)",
"prior_model": None,
"atom_filter": -1,
"derivative": True,
"output_model": "Scalar",
"reduce_op": "sum",
"precision": 32 }
model = create_model(args).to(device="cuda")
z, pos, _ = create_example_batch(multiple_batches=False)
z = z.to("cuda")
pos = pos.to("cuda")
calc = External(checkpoint, z.unsqueeze(0), use_cuda_graph=False, device="cuda")
calc_graph = External(checkpoint, z.unsqueeze(0), use_cuda_graph=True, device="cuda")
calc.model = model
calc_graph.model = model
for _ in range(10):
e_calc, f_calc = calc.calculate(pos, None)
e_pred, f_pred = calc_graph.calculate(pos, None)
assert_allclose(e_calc, e_pred)
assert_allclose(f_calc, f_pred)


def test_compare_forward_multiple():
checkpoint = join(dirname(dirname(__file__)), "tests", "example.ckpt")
4 changes: 2 additions & 2 deletions tests/test_cfconv.py
@@ -77,5 +77,5 @@ def test_cfconv(device, num_atoms, num_filters, num_rbfs, cutoff_upper):
total.backward()
grad = pos.grad.clone()

pt.testing.assert_close(ref_output, output, atol=1e-6, rtol=1e-5)
pt.testing.assert_close(ref_grad, grad, atol=1e-6, rtol=1e-5)
4 changes: 2 additions & 2 deletions tests/test_optimize.py
@@ -52,5 +52,5 @@ def test_gn(device, num_atoms):
# Execute the optimize model
energy, gradient = model(elements, positions)

pt.testing.assert_close(ref_energy, energy, rtol=1e-5, atol=1e-5)
pt.testing.assert_close(ref_gradient, gradient, rtol=1e-4, atol=1e-5)
86 changes: 76 additions & 10 deletions torchmdnet/calculators.py
@@ -20,15 +20,43 @@

class External:
"""
This is an adapter to use TorchMD-Net models in TorchMD.

Parameters
----------
netfile : str or torch.nn.Module
Path to the checkpoint file of the model or the model itself.
embeddings : torch.Tensor
Embeddings of the atoms in the system.
device : str, optional
Device on which the model should be run. Default: "cpu"
output_transform : str or callable, optional
Transform to apply to the energy and forces.
If a string is given, it should be a key in the `transforms` dict.
If a callable is given, it should take two arguments (energy and forces) and return two tensors of the same shape.
Default: None
use_cuda_graph : bool, optional
Whether to use CUDA graphs to speed up the calculation. Default: False
cuda_graph_warmup_steps : int, optional
Number of steps to run as warmup before recording the CUDA graph. Default: 12
"""

def __init__(
self,
netfile,
embeddings,
device="cpu",
output_transform=None,
use_cuda_graph=False,
cuda_graph_warmup_steps=12,
):
if isinstance(netfile, str):
self.model = load_model(netfile, device=device, derivative=True)
elif isinstance(netfile, torch.nn.Module):
self.model = netfile
else:
raise ValueError(
f"Expected a path to a checkpoint file or a torch.nn.Module, got {type(netfile)}"
)
self.device = device
self.n_atoms = embeddings.size(1)
self.embeddings = embeddings.reshape(-1).to(device)
@@ -46,11 +74,49 @@
self.output_transformer = tranforms[output_transform]
else:
self.output_transformer = eval(output_transform)
if use_cuda_graph and not torch.cuda.is_available():
raise ValueError("CUDA graphs require a CUDA-capable device")
self.use_cuda_graph = use_cuda_graph
self.cuda_graph_warmup_steps = cuda_graph_warmup_steps
self.cuda_graph = None
self.energy = None
self.forces = None
self.pos = None

def _init_cuda_graph(self):
stream = torch.cuda.Stream()
self.cuda_graph = torch.cuda.CUDAGraph()
with torch.cuda.stream(stream):
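# Warm-up: run the model a few times on a side stream so allocator and autograd state settle before capture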
for _ in range(self.cuda_graph_warmup_steps):
self.energy, self.forces = self.model(
self.embeddings, self.pos, self.batch
)
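# Capture a single forward pass; replay() later reruns it on the same input/output buffers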
with torch.cuda.graph(self.cuda_graph):
self.energy, self.forces = self.model(
self.embeddings, self.pos, self.batch
)

def calculate(self, pos, box):
pos = pos.to(self.device).type(torch.float32).reshape(-1, 3)

if self.use_cuda_graph:
if self.pos is None:
self.pos = (
pos.clone()
.to(self.device)
.detach()
.requires_grad_(pos.requires_grad)
)
if self.cuda_graph is None:
self._init_cuda_graph()
assert self.cuda_graph is not None, "CUDA graph is not initialized. This should not have happened."
with torch.no_grad():
self.pos.copy_(pos)
self.cuda_graph.replay()
else:
self.energy, self.forces = self.model(self.embeddings, pos, self.batch)
assert self.forces is not None, "The model is not returning forces"
assert self.energy is not None, "The model is not returning energy"
return self.output_transformer(
self.energy.clone().detach(),
self.forces.clone().reshape(-1, self.n_atoms, 3).detach(),
)
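For reference, a minimal usage sketch of the adapter (the checkpoint path and atomic numbers are placeholders, and a CUDA-capable device is assumed since `use_cuda_graph=True`):

```python
import torch
from torchmdnet.calculators import External

# One batch of three atoms; embeddings are atomic numbers with a leading batch dimension
embeddings = torch.tensor([[8, 1, 1]])
calc = External("example.ckpt", embeddings, device="cuda", use_cuda_graph=True)

pos = torch.rand(3, 3, device="cuda")  # positions of the three atoms, in the model's units
energy, forces = calc.calculate(pos, box=None)
print(energy.shape, forces.shape)  # forces come back reshaped to (n_batches, n_atoms, 3)
```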
4 changes: 2 additions & 2 deletions torchmdnet/models/model.py
@@ -337,8 +337,8 @@ def forward(
[y],
[pos],
grad_outputs=grad_outputs,
create_graph=self.training,
retain_graph=self.training,
)[0]
if dy is None:
raise RuntimeError("Autograd returned None for the force prediction.")
19 changes: 12 additions & 7 deletions torchmdnet/models/tensornet.py
@@ -207,26 +207,31 @@ def forward(
assert (
edge_vec is not None
), "Distance module did not return directional information"
# The distance module returns -1 for non-existing edges. To avoid resizing tensors when static shapes are required (for CUDA graphs), we make all non-existing edges pertain to a ghost atom.
zp = z
if self.static_shapes:
mask = (edge_index[0] < 0).unsqueeze(0).expand_as(edge_index)
zp = torch.cat((z, torch.zeros(1, device=z.device, dtype=z.dtype)), dim=0)
# I trick the model into thinking that the masked edges pertain to the extra atom
# WARNING: This can hurt performance if max_num_pairs >> actual_num_pairs
edge_index = edge_index.masked_fill(mask, z.shape[0])
edge_weight = edge_weight.masked_fill(mask[0], 0)
edge_vec = edge_vec.masked_fill(mask[0].unsqueeze(-1).expand_as(edge_vec), 0)
edge_attr = self.distance_expansion(edge_weight)
mask = edge_index[0] == edge_index[1]
# Normalizing edge vectors by their length can result in NaNs, breaking Autograd.
# I avoid dividing by zero by setting the weight of self edges and self loops to 1
edge_vec = edge_vec / edge_weight.masked_fill(mask, 1).unsqueeze(1)
X = self.tensor_embedding(zp, edge_index, edge_weight, edge_vec, edge_attr)
for layer in self.layers:
X = layer(X, edge_index, edge_weight, edge_attr)
I, A, S = _decompose_tensor(X)
x = torch.cat((_tensor_norm(I), _tensor_norm(A), _tensor_norm(S)), dim=-1)
x = self.out_norm(x)
x = self.act(self.linear((x)))
# Remove the extra atom
if self.static_shapes:
x = x[:-1]
return x, None, z, pos, batch
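Below is a minimal, self-contained sketch of this ghost-atom trick (sizes and names are invented for illustration; this is not the repository's code):

```python
import torch

num_atoms = 4
z = torch.tensor([1, 6, 8, 1])
# Edge list padded to a static size; the distance module marks non-existing edges with -1
edge_index = torch.tensor([[0, 1, -1, -1],
                           [1, 2, -1, -1]])
mask = (edge_index[0] < 0).unsqueeze(0).expand_as(edge_index)
# Append a ghost atom (z = 0) and redirect every masked edge to its index
zp = torch.cat((z, torch.zeros(1, dtype=z.dtype)), dim=0)
edge_index = edge_index.masked_fill(mask, num_atoms)
# Message passing on (zp, edge_index) now sees static shapes; afterwards the
# ghost atom's row is simply dropped from the per-atom output, e.g. x = x[:-1]
print(edge_index)  # tensor([[0, 1, 4, 4], [1, 2, 4, 4]])
```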


4 changes: 3 additions & 1 deletion torchmdnet/module.py
@@ -97,7 +97,9 @@ def _compute_losses(self, y, neg_y, batch, loss_fn, stage):
# Returns:
# loss_y: loss for the predicted value
# loss_neg_y: loss for the predicted negative derivative
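# Initialize both losses as zero tensors on the current device so dtype and device stay consistent even when a term is never computed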
loss_y, loss_neg_y = torch.tensor(0.0, device=self.device), torch.tensor(
0.0, device=self.device
)
loss_name = loss_fn.__name__
if self.hparams.derivative and "neg_dy" in batch:
loss_neg_y = loss_fn(neg_y, batch.neg_dy)
