Implement on-the-fly descriptor calculation #630

Merged
Changes from 1 commit
Commits
24 commits
e154d2a
On-the-fly training works for the RAM case
RandomDefaultUser Jan 6, 2025
88a5edb
Lazy Loading training works now
RandomDefaultUser Jan 6, 2025
0e4ddfe
Checkpointing works now as well
RandomDefaultUser Jan 7, 2025
c43fefc
Made method private
RandomDefaultUser Jan 7, 2025
7a90ed5
Tester class now also works with on-the-fly calculations
RandomDefaultUser Jan 7, 2025
39a40c3
Prefetching works
RandomDefaultUser Jan 7, 2025
2201e34
Is this already enough to get DDP working?
RandomDefaultUser Jan 7, 2025
bc79637
Merge branch 'refs/heads/develop_lenz' into descriptors_on_the_fly
RandomDefaultUser Jan 7, 2025
c1737c5
Renamed "additional info", since it will be used more regularly with …
RandomDefaultUser Jan 7, 2025
4001d0a
Fixing a parallel writing bug
RandomDefaultUser Jan 8, 2025
e6a0723
Can I use DDP and MPI at the same time?
RandomDefaultUser Jan 8, 2025
5162404
It does not help
RandomDefaultUser Jan 8, 2025
3997229
Getting rid of the parallel modification for now
RandomDefaultUser Jan 8, 2025
15ff5dc
Shuffling from atomic positions works now
RandomDefaultUser Jan 9, 2025
8e8cb3f
Shuffling now works as part of the temporary pipeline
RandomDefaultUser Jan 9, 2025
615792b
Fixed docstrings
RandomDefaultUser Jan 9, 2025
d0e8de6
Added automatic snapshot type detection
RandomDefaultUser Jan 9, 2025
07126f1
Added new temporary framework to examples
RandomDefaultUser Jan 14, 2025
a7d7bd3
Fixed examples
RandomDefaultUser Jan 14, 2025
f0738ce
Added documentation
RandomDefaultUser Jan 14, 2025
d359057
Small adjustment in example
RandomDefaultUser Jan 14, 2025
9d87a2f
Implemented rudimentary json+openpmd
RandomDefaultUser Jan 14, 2025
d1960d4
Added tests for on-the-fly calculations
RandomDefaultUser Jan 14, 2025
ac255b2
Small adjustment in test
RandomDefaultUser Jan 15, 2025
On-the-fly training works for the RAM case
RandomDefaultUser committed Jan 6, 2025
commit e154d2a49bcab8778f02f04f19802753f5dd85de
39 changes: 38 additions & 1 deletion mala/datahandling/data_handler.py
@@ -1,6 +1,7 @@
"""DataHandler class that loads and scales data."""

import os
import tempfile

import numpy as np
import torch
@@ -169,6 +170,19 @@ def clear_data(self):
self.output_data_scaler.reset()
super(DataHandler, self).clear_data()

def delete_temporary_data(self):
"""
Delete temporary data files.

These may have been created during a training or testing process
when using atomic positions for on-the-fly calculation of descriptors
rather than precomputed data files.
"""
for snapshot in self.parameters.snapshot_directories_list:
if snapshot.temporary_input_file is not None:
if os.path.isfile(snapshot.temporary_input_file):
os.remove(snapshot.temporary_input_file)

# Preparing data
######################

@@ -595,14 +609,37 @@ def __load_data(self, function, data_type):
snapshot.input_npy_directory, snapshot.input_npy_file
)
units = snapshot.input_units

# If the input for the descriptors is actually a JSON
# file then we need to calculate the descriptors.
if snapshot.snapshot_type == "json+numpy":
snapshot.temporary_input_file = (
tempfile.NamedTemporaryFile(
delete=False,
prefix=snapshot.input_npy_file.split(".")[0],
suffix=".in.npy",
dir=snapshot.input_npy_directory,
).name
)
descriptors, grid = (
self.descriptor_calculator.calculate_from_json(
file
)
)
np.save(snapshot.temporary_input_file, descriptors)
file = snapshot.temporary_input_file

else:
file = os.path.join(
snapshot.output_npy_directory,
snapshot.output_npy_file,
)
units = snapshot.output_units

if snapshot.snapshot_type == "numpy":
if (
snapshot.snapshot_type == "numpy"
or snapshot.snapshot_type == "json+numpy"
):
calculator.read_from_numpy_file(
file,
units=units,
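
In short: when a snapshot is of the new "json+numpy" type, the descriptors are computed on the fly from the atomic positions in the JSON file, cached in a temporary .in.npy file next to the original data, and removed again by delete_temporary_data() after training or testing. A minimal sketch of that flow outside the DataHandler, with placeholder file names and assuming an already configured MALA descriptor calculator:

import os
import tempfile

import numpy as np

def compute_temporary_descriptors(descriptor_calculator, json_file, directory):
    # Compute descriptors directly from the atomic positions stored in the
    # JSON file (this is what calculate_from_json() does above).
    descriptors, grid = descriptor_calculator.calculate_from_json(json_file)

    # Cache them in a temporary .in.npy file so the rest of the pipeline can
    # treat the snapshot like a precomputed numpy snapshot.
    temporary_file = tempfile.NamedTemporaryFile(
        delete=False, suffix=".in.npy", dir=directory
    ).name
    np.save(temporary_file, descriptors)
    return temporary_file

def remove_temporary_descriptors(temporary_file):
    # Counterpart of delete_temporary_data(): clean up once training is done.
    if temporary_file is not None and os.path.isfile(temporary_file):
        os.remove(temporary_file)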
14 changes: 13 additions & 1 deletion mala/datahandling/data_handler_base.py
@@ -207,6 +207,15 @@ def _check_snapshots(self, comm=None):
),
comm=comm,
)
elif snapshot.snapshot_type == "json+numpy":
tmp_dimension = (
self.descriptor_calculator.read_dimensions_from_json(
os.path.join(
snapshot.input_npy_directory,
snapshot.input_npy_file,
)
)
)
else:
raise Exception("Unknown snapshot file type.")

@@ -235,7 +244,10 @@ def _check_snapshots(self, comm=None):
snapshot.output_npy_directory,
min_verbosity=1,
)
if snapshot.snapshot_type == "numpy":
if (
snapshot.snapshot_type == "numpy"
or snapshot.snapshot_type == "json+numpy"
):
tmp_dimension = (
self.target_calculator.read_dimensions_from_numpy_file(
os.path.join(
3 changes: 3 additions & 0 deletions mala/datahandling/snapshot.py
@@ -133,6 +133,9 @@ def __init__(
self.input_dimension = None
self.output_dimension = None

# Temporary descriptor files, which may be needed.
self.temporary_input_file = None

@classmethod
def from_json(cls, json_dict):
"""
4 changes: 4 additions & 0 deletions mala/descriptors/atomic_density.py
@@ -119,6 +119,10 @@ def _calculate(self, outdir, **kwargs):
else:
return self.__calculate_python(**kwargs)

def _read_feature_dimension_from_json(self, json_dict):
# For now, has to be adapted in the multielement case.
return 4

def __calculate_lammps(self, outdir, **kwargs):
"""Perform actual Gaussian descriptor calculation."""
# For version compatibility; older lammps versions (the serial version
32 changes: 19 additions & 13 deletions mala/descriptors/bispectrum.py
@@ -120,6 +120,24 @@ def _calculate(self, outdir, **kwargs):
else:
return self.__calculate_python(**kwargs)

def _read_feature_dimension_from_json(self, json_dict):
if self.parameters.descriptors_contain_xyz:
return self.__get_feature_size() - 3
else:
return self.__get_feature_size()

def __get_feature_size(self):
ncols0 = 3

# Analytical relation for fingerprint length
ncoeff = (
(self.parameters.bispectrum_twojmax + 2)
* (self.parameters.bispectrum_twojmax + 3)
* (self.parameters.bispectrum_twojmax + 4)
)
ncoeff = ncoeff // 24 # integer division
return ncols0 + ncoeff

def __calculate_lammps(self, outdir, **kwargs):
"""
Perform bispectrum calculation using LAMMPS.
@@ -173,19 +191,7 @@ def __calculate_lammps(self, outdir, **kwargs):

# Do the LAMMPS calculation and clean up.
lmp.file(self.parameters.lammps_compute_file)

# Set things not accessible from LAMMPS
# First 3 cols are x, y, z, coords
ncols0 = 3

# Analytical relation for fingerprint length
ncoeff = (
(self.parameters.bispectrum_twojmax + 2)
* (self.parameters.bispectrum_twojmax + 3)
* (self.parameters.bispectrum_twojmax + 4)
)
ncoeff = ncoeff // 24 # integer division
self.feature_size = ncols0 + ncoeff
self.feature_size = self.__get_feature_size()

# Extract data from LAMMPS calculation.
# This is different for the parallel and the serial case.
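
The refactored __get_feature_size() encodes the analytical relation for the bispectrum fingerprint length, ncoeff = (2J + 2)(2J + 3)(2J + 4) / 24 with 2J = bispectrum_twojmax, plus three leading columns for the x, y, z coordinates. A quick arithmetic check (the twojmax values below are illustrative, not defaults set by this PR):

def bispectrum_feature_size(twojmax):
    ncols0 = 3  # first three columns hold the x, y, z coordinates
    ncoeff = (twojmax + 2) * (twojmax + 3) * (twojmax + 4) // 24
    return ncols0 + ncoeff

# twojmax = 6  -> 3 +  8 * 9 * 10  // 24 = 3 + 30 = 33
# twojmax = 10 -> 3 + 12 * 13 * 14 // 24 = 3 + 91 = 94
# _read_feature_dimension_from_json() subtracts the 3 coordinate columns
# again when descriptors_contain_xyz is set.
print(bispectrum_feature_size(6), bispectrum_feature_size(10))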
27 changes: 26 additions & 1 deletion mala/descriptors/descriptor.py
@@ -1,11 +1,12 @@
"""Base class for all descriptor calculators."""

from abc import abstractmethod
from functools import cached_property
import json
import os
import tempfile

import ase
from ase.cell import Cell
from ase.units import m
from ase.neighborlist import NeighborList, NewPrimitiveNeighborList
import numpy as np
@@ -375,6 +376,16 @@ def calculate_from_qe_out(

return self._calculate(working_directory, **kwargs)

def calculate_from_json(self, json_file, working_directory=".", **kwargs):
if isinstance(json_file, str):
json_dict = json.load(open(json_file, encoding="utf-8"))
else:
json_dict = json.load(json_file)
self.grid_dimensions = json_dict["grid_dimensions"]
self._atoms = ase.Atoms.fromdict(json_dict["atoms"])
self._voxel = Cell(json_dict["voxel"]["array"])
return self._calculate(working_directory, **kwargs)

def calculate_from_atoms(
self, atoms, grid_dimensions, working_directory=".", **kwargs
):
@@ -573,6 +584,16 @@ def convert_local_to_3d(self, descriptors_np):
).transpose([2, 1, 0, 3])
return descriptors_full, local_offset, local_reach

def read_dimensions_from_json(self, json_file):
if isinstance(json_file, str):
json_dict = json.load(open(json_file, encoding="utf-8"))
else:
json_dict = json.load(json_file)
grid_dimensions = json_dict["grid_dimensions"] + [
self._read_feature_dimension_from_json(json_dict)
]
return grid_dimensions

# Private methods
#################

@@ -1021,5 +1042,9 @@ def _grid_to_coord(self, gridpoint):
def _calculate(self, outdir, **kwargs):
pass

@abstractmethod
def _read_feature_dimension_from_json(self, json_dict):
pass

def _set_feature_size_from_array(self, array):
self.feature_size = np.shape(array)[-1]
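
Both new methods expect the JSON file to provide the keys used above: "grid_dimensions", an "atoms" dictionary compatible with ase.Atoms.fromdict(), and a "voxel" entry whose "array" holds the cell spanned by a single grid voxel. A read-side sketch (the file name and example values are placeholders):

import json

import ase
from ase.cell import Cell

with open("snapshot0.info.json", encoding="utf-8") as f:
    json_dict = json.load(f)

grid_dimensions = json_dict["grid_dimensions"]    # e.g. [18, 18, 27]
atoms = ase.Atoms.fromdict(json_dict["atoms"])    # atomic positions and cell
voxel = Cell(json_dict["voxel"]["array"])         # cell spanned by one voxel

# read_dimensions_from_json() additionally appends the descriptor feature
# dimension, yielding [nx, ny, nz, feature_size] without running the full
# (and potentially expensive) descriptor calculation.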
6 changes: 6 additions & 0 deletions mala/descriptors/minterpy_descriptors.py
@@ -87,6 +87,12 @@ def backconvert_units(array, out_units):
else:
raise Exception("Unsupported unit for Minterpy descriptors.")

def _read_feature_dimension_from_json(self, json_dict):
raise Exception(
"This feature has not been implemented for Minterpy "
"descriptors."
)

def _calculate(self, atoms, outdir, grid_dimensions, **kwargs):
# For version compatibility; older lammps versions (the serial version
# we still use on some machines) have these constants as part of the
3 changes: 3 additions & 0 deletions mala/network/trainer.py
@@ -597,6 +597,9 @@ def train_network(self):
)
self.final_validation_loss = vloss

# Cleaning up temporary data files.
self.data.delete_temporary_data()

# Clean-up for pre-fetching lazy loading.
if self.data.parameters.use_lazy_loading_prefetch:
self._training_data_loaders.cleanup()
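
Combined with the DataHandler changes, training from atomic positions now looks roughly like the sketch below; file names and paths are placeholders, and the add_snapshot call is assumed to keep its existing signature apart from the new snapshot_type value:

import mala

parameters = mala.Parameters()
data_handler = mala.DataHandler(parameters)

# Atomic positions (JSON) as descriptor input, precomputed targets as output.
data_handler.add_snapshot(
    "snapshot0.info.json", "/path/to/data",
    "snapshot0.out.npy", "/path/to/data",
    "tr", snapshot_type="json+numpy",
)
data_handler.prepare_data()

network = mala.Network(parameters)
trainer = mala.Trainer(parameters, network, data_handler)

# Descriptors are computed on the fly into temporary files; at the end of
# train_network(), data.delete_temporary_data() removes them again.
trainer.train_network()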
2 changes: 1 addition & 1 deletion mala/targets/target.py
@@ -653,7 +653,7 @@ def read_additional_calculation_data(self, data, data_type=None):
}
self.atomic_forces_dft = None
self.entropy_contribution_dft_calculation = None
self.grid_dimensions = [0, 0, 0]
self.grid_dimensions = json_dict["grid_dimensions"]
self.atoms = None

for key in json_dict: