Merge branch 'develop' into fix_data_scaling
RandomDefaultUser authored Nov 22, 2024
2 parents 608ba39 + a402f79 commit 1525529
Showing 29 changed files with 481 additions and 331 deletions.
6 changes: 4 additions & 2 deletions docs/source/advanced_usage/predictions.rst
@@ -81,11 +81,13 @@ Gaussian representation of atomic positions. In this algorithm, most of the
computational overhead of the total energy calculation is offloaded to the
computation of this Gaussian representation. This calculation is realized via
LAMMPS and can therefore be GPU accelerated (parallelized) in the same fashion
as the bispectrum descriptor calculation. Simply activate this option via
as the bispectrum descriptor calculation. If a GPU is activated (and LAMMPS
is available), this option will be used by default. It can also manually be
activated via

.. code-block:: python

      parameters.descriptors.use_atomic_density_energy_formula = True
      parameters.use_atomic_density_formula = True
The Gaussian representation algorithm is described in
the publication `Predicting electronic structures at any length scale with machine learning <doi.org/10.1038/s41524-023-01070-z>`_.
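
For orientation, a minimal usage sketch of the renamed switch as introduced in this changeset (the automatic default mirrors the setter logic added to mala/common/parameters.py further down; whether the formula is actually enabled still depends on a GPU being found):

import mala

parameters = mala.Parameters()

# Explicit opt-in, independent of the available hardware:
parameters.use_atomic_density_formula = True

# Or rely on the new default: with LAMMPS enabled (the default) and a GPU
# requested, the setters switch the formula on automatically.
parameters.use_lammps = True
parameters.use_gpu = True
print(parameters.use_atomic_density_formula)  # True once a GPU was found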
1 change: 0 additions & 1 deletion docs/source/conf.py
@@ -72,7 +72,6 @@
"scipy",
"oapackage",
"matplotlib",
"horovod",
"lammps",
"total_energy",
"pqkmeans",
100 changes: 100 additions & 0 deletions examples/advanced/ex10_convert_numpy_openpmd.py
@@ -0,0 +1,100 @@
import mala

from mala.datahandling.data_repo import data_path
import os

parameters = mala.Parameters()
parameters.descriptors.descriptors_contain_xyz = False

# First, convert from Numpy files to openPMD.

data_converter = mala.DataConverter(parameters)

for snapshot in range(2):
    data_converter.add_snapshot(
        descriptor_input_type="numpy",
        descriptor_input_path=os.path.join(
            data_path, "Be_snapshot{}.in.npy".format(snapshot)
        ),
        target_input_type="numpy",
        target_input_path=os.path.join(
            data_path, "Be_snapshot{}.out.npy".format(snapshot)
        ),
        additional_info_input_type=None,
        additional_info_input_path=None,
        target_units=None,
    )

data_converter.convert_snapshots(
    descriptor_save_path="./",
    target_save_path="./",
    additional_info_save_path="./",
    naming_scheme="converted_from_numpy_*.bp5",
    descriptor_calculation_kwargs={"working_directory": "./"},
)

# Convert those files back to Numpy to verify the data stays the same.

data_converter = mala.DataConverter(parameters)

for snapshot in range(2):
    data_converter.add_snapshot(
        descriptor_input_type="openpmd",
        descriptor_input_path="converted_from_numpy_{}.in.bp5".format(
            snapshot
        ),
        target_input_type="openpmd",
        target_input_path="converted_from_numpy_{}.out.bp5".format(snapshot),
        additional_info_input_type=None,
        additional_info_input_path=None,
        target_units=None,
    )

data_converter.convert_snapshots(
    descriptor_save_path="./",
    target_save_path="./",
    additional_info_save_path="./",
    naming_scheme="verify_against_original_numpy_data_*.npy",
    descriptor_calculation_kwargs={"working_directory": "./"},
)

for snapshot in range(2):
    for i_o in ["in", "out"]:
        original = os.path.join(
            data_path, "Be_snapshot{}.{}.npy".format(snapshot, i_o)
        )
        roundtrip = "verify_against_original_numpy_data_{}.{}.npy".format(
            snapshot, i_o
        )
        import numpy as np

        original_a = np.load(original)
        roundtrip_a = np.load(roundtrip)
        np.testing.assert_allclose(original_a, roundtrip_a)

# Now, convert some openPMD data back to Numpy.

data_converter = mala.DataConverter(parameters)

for snapshot in range(2):
    data_converter.add_snapshot(
        descriptor_input_type="openpmd",
        descriptor_input_path=os.path.join(
            data_path, "Be_snapshot{}.in.h5".format(snapshot)
        ),
        target_input_type="openpmd",
        target_input_path=os.path.join(
            data_path, "Be_snapshot{}.out.h5".format(snapshot)
        ),
        additional_info_input_type=None,
        additional_info_input_path=None,
        target_units=None,
    )

data_converter.convert_snapshots(
    descriptor_save_path="./",
    target_save_path="./",
    additional_info_save_path="./",
    naming_scheme="converted_from_openpmd_*.npy",
    descriptor_calculation_kwargs={"working_directory": "./"},
)
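
As a quick, MALA-independent sanity check, the generated openPMD series can also be opened directly; this is a sketch assuming the openpmd-api Python bindings are installed, with the file name taken from the naming scheme used above:

import openpmd_api as io

# Open the first converted descriptor series in read-only mode.
series = io.Series("converted_from_numpy_0.in.bp5", io.Access.read_only)
for index in series.iterations:
    iteration = series.iterations[index]
    # List the mesh records that were written into this iteration.
    print("Iteration", index, "contains meshes:", list(iteration.meshes))
del series  # drop the handle so the series is closed and flushed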
5 changes: 3 additions & 2 deletions external_modules/total_energy_module/total_energy.f90
@@ -11,7 +11,8 @@ SUBROUTINE initialize(file_name, y_planes_in, calculate_eigts_in)
  USE mp_global, ONLY : mp_startup
  USE mp, ONLY : mp_size
  USE read_input, ONLY : read_input_file
  USE command_line_options, ONLY: input_file_, command_line, ndiag_, nyfft_
  USE command_line_options, ONLY: input_file_, command_line, ndiag_, nyfft_, &
                                  pencil_decomposition_
  !
  IMPLICIT NONE
  CHARACTER(len=256) :: srvaddress
@@ -37,9 +38,9 @@ SUBROUTINE initialize(file_name, y_planes_in, calculate_eigts_in)
  IF (PRESENT(y_planes_in)) THEN
    IF (y_planes_in > 1) THEN
      nyfft_ = y_planes_in
      pencil_decomposition_ = .true.
    ENDIF
  ENDIF

  !! checks if first string is contained in the second
  !
  CALL mp_startup ( start_images=.true., images_only=.true.)
108 changes: 87 additions & 21 deletions mala/common/parameters.py
Expand Up @@ -40,6 +40,7 @@ def __init__(
"openpmd_configuration": {},
"openpmd_granularity": 1,
"lammps": True,
"atomic_density_formula": False,
}
pass

@@ -88,6 +89,11 @@ def _update_openpmd_granularity(self, new_granularity):
    def _update_lammps(self, new_lammps):
        self._configuration["lammps"] = new_lammps

    def _update_atomic_density_formula(self, new_atomic_density_formula):
        self._configuration["atomic_density_formula"] = (
            new_atomic_density_formula
        )

    @staticmethod
    def _member_to_json(member):
        if isinstance(member, (int, float, type(None), str)):
@@ -306,9 +312,9 @@ class ParametersDescriptors(ParametersBase):
        descriptors.
    bispectrum_twojmax : int
        Bispectrum calculation: 2*jmax-parameter used for calculation of SNAP
        descriptors. Default value for jmax is 5, so default value for
        twojmax is 10.
        Bispectrum calculation: 2*jmax-parameter used for calculation of
        bispectrum descriptors. Default value for jmax is 5, so default value
        for twojmax is 10.
    lammps_compute_file : string
        Bispectrum calculation: LAMMPS input file that is used to calculate the
@@ -322,11 +328,6 @@ class ParametersDescriptors(ParametersBase):
    atomic_density_sigma : float
        Sigma used for the calculation of the Gaussian descriptors.
    use_atomic_density_energy_formula : bool
        If True, Gaussian descriptors will be calculated for the
        calculation of the Ewald sum as part of the total energy module.
        Default is False.
    """

    def __init__(self):
@@ -356,7 +357,6 @@ def __init__(self):
        # atomic density may be used at the same time, if e.g. bispectrum
        # descriptors are used for a full inference, which then uses the atomic
        # density for the calculation of the Ewald sum.
        self.use_atomic_density_energy_formula = False
        self.atomic_density_sigma = None
        self.atomic_density_cutoff = None

@@ -556,11 +556,6 @@ class ParametersData(ParametersBase):
    Attributes
    ----------
    descriptors_contain_xyz : bool
        Legacy option. If True, it is assumed that the first three entries of
        the descriptor vector are the xyz coordinates and they are cut from the
        descriptor vector. If False, no such cutting is performed.
    snapshot_directories_list : list
        A list of all added snapshots.
@@ -1204,9 +1199,6 @@ class Parameters:
    hyperparameters : ParametersHyperparameterOptimization
        Parameters used for hyperparameter optimization.
    debug : ParametersDebug
        Container for all debugging parameters.
    manual_seed: int
        If not none, this value is used as manual seed for the neural networks.
        Can be used to make experiments comparable. Default: None.
@@ -1238,6 +1230,7 @@ def __init__(self):
        # different.
        self.openpmd_granularity = 1
        self.use_lammps = True
        self.use_atomic_density_formula = False

    @property
    def openpmd_granularity(self):
@@ -1289,7 +1282,7 @@ def verbosity(self, value):

    @property
    def use_gpu(self):
        """Control whether or not a GPU is used (provided there is one)."""
        """Control whether a GPU is used (provided there is one)."""
        return self._use_gpu

    @use_gpu.setter
@@ -1304,6 +1297,12 @@ def use_gpu(self, value):
"GPU requested, but no GPU found. MALA will "
"operate with CPU only."
)
if self._use_gpu and self.use_lammps:
printout(
"Enabling atomic density formula because LAMMPS and GPU "
"are used."
)
self.use_atomic_density_formula = True

        # Invalidate, will be updated in setter.
        self.device = None
@@ -1316,7 +1315,7 @@

    @property
    def use_ddp(self):
        """Control whether or not dd is used for parallel training."""
        """Control whether ddp is used for parallel training."""
        return self._use_ddp

    @use_ddp.setter
@@ -1367,7 +1366,7 @@ def device(self, value):

    @property
    def use_mpi(self):
        """Control whether or not MPI is used for parallel inference."""
        """Control whether MPI is used for parallel inference."""
        return self._use_mpi

    @use_mpi.setter
@@ -1411,19 +1410,67 @@ def openpmd_configuration(self, value):

    @property
    def use_lammps(self):
        """Control whether or not to use LAMMPS for descriptor calculation."""
        """Control whether to use LAMMPS for descriptor calculation."""
        return self._use_lammps

    @use_lammps.setter
    def use_lammps(self, value):
        self._use_lammps = value
        if self.use_gpu and value:
            printout(
                "Enabling atomic density formula because LAMMPS and GPU "
                "are used."
            )
            self.use_atomic_density_formula = True
        self.network._update_lammps(self.use_lammps)
        self.descriptors._update_lammps(self.use_lammps)
        self.targets._update_lammps(self.use_lammps)
        self.data._update_lammps(self.use_lammps)
        self.running._update_lammps(self.use_lammps)
        self.hyperparameters._update_lammps(self.use_lammps)

    @property
    def use_atomic_density_formula(self):
        """Control whether to use the atomic density formula.

        This formula uses a Gaussian representation of the atomic density
        to calculate the structure factor and with it, the Ewald energy
        and parts of the exchange-correlation energy. By using it, one can
        go from N^2 to NlogN scaling, and offloads most of the computational
        overhead of energy calculation from QE to LAMMPS. This is beneficial
        since LAMMPS can benefit from GPU acceleration (QE GPU acceleration
        is not used in the portion of the QE code MALA employs). If set
        to True, this means MALA will perform another LAMMPS calculation
        during inference. The hyperparameters for this atomic density
        calculation are set via the parameters.descriptors object.
        Default is False, except for when both use_gpu and use_lammps
        are True, in which case this value will be set to True as well.
        """
        return self._use_atomic_density_formula

    @use_atomic_density_formula.setter
    def use_atomic_density_formula(self, value):
        self._use_atomic_density_formula = value

        self.network._update_atomic_density_formula(
            self.use_atomic_density_formula
        )
        self.descriptors._update_atomic_density_formula(
            self.use_atomic_density_formula
        )
        self.targets._update_atomic_density_formula(
            self.use_atomic_density_formula
        )
        self.data._update_atomic_density_formula(
            self.use_atomic_density_formula
        )
        self.running._update_atomic_density_formula(
            self.use_atomic_density_formula
        )
        self.hyperparameters._update_atomic_density_formula(
            self.use_atomic_density_formula
        )

    def show(self):
        """Print name and values of all attributes of this object."""
        printout(
@@ -1616,6 +1663,18 @@ def load_from_file(
                    ].from_json(json_dict[key])
                    setattr(loaded_parameters, key, sub_parameters)

                    # Backwards compatibility:
                    if key == "descriptors":
                        if (
                            "use_atomic_density_energy_formula"
                            in json_dict[key]
                        ):
                            loaded_parameters.use_atomic_density_formula = (
                                json_dict[key][
                                    "use_atomic_density_energy_formula"
                                ]
                            )

            # We iterate a second time, to set global values, so that they
            # are properly forwarded.
            for key in json_dict:
@@ -1629,6 +1688,13 @@
                    setattr(loaded_parameters, key, json_dict[key])
            if no_snapshots is True:
                loaded_parameters.data.snapshot_directories_list = []
            # Backwards compatibility: since the transfer of old property
            # to new property happens _before_ all children descriptor classes
            # are instantiated, it is not properly propagated. Thus, we
            # simply have to set it to its own value again.
            loaded_parameters.use_atomic_density_formula = (
                loaded_parameters.use_atomic_density_formula
            )
        else:
            raise Exception("Unsupported parameter save format.")
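
The backwards-compatibility branch above can be exercised roughly as follows; the file name is hypothetical, and the call assumes the JSON save format handled in this hunk:

import mala

# "old_parameters.json" stands for a parameters file written before this
# change, i.e. one that still stores the per-descriptor flag
# use_atomic_density_energy_formula.
parameters = mala.Parameters.load_from_file("old_parameters.json")

# After loading, the legacy flag has been transferred to the new global
# switch and re-propagated to all parameter sub-objects.
print(parameters.use_atomic_density_formula)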

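As noted in the use_atomic_density_formula docstring above, the hyperparameters of the underlying Gaussian atomic-density calculation still live on the descriptor parameters; a short sketch with placeholder values:

import mala

parameters = mala.Parameters()
parameters.use_atomic_density_formula = True

# Both attributes appear in ParametersDescriptors above; the numbers here
# are placeholders, not recommended settings.
parameters.descriptors.atomic_density_sigma = 0.2
parameters.descriptors.atomic_density_cutoff = 4.0
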
13 changes: 11 additions & 2 deletions mala/common/physical_data.py
@@ -555,6 +555,11 @@ def write_to_openpmd_iteration(
            atoms_openpmd["position"][str(atom)].unit_SI = 1.0e-10
            atoms_openpmd["positionOffset"][str(atom)].unit_SI = 1.0e-10

        if any(i == 0 for i in self.grid_dimensions) and not isinstance(
            array, self.SkipArrayWriting
        ):
            self.grid_dimensions = array.shape[0:-1]

        dataset = (
            array.dataset
            if isinstance(array, self.SkipArrayWriting)
@@ -564,8 +569,12 @@
        # Global feature sizes:
        feature_global_from = 0
        feature_global_to = self.feature_size
        if feature_global_to == 0 and isinstance(array, self.SkipArrayWriting):
            feature_global_to = array.feature_size
        if feature_global_to == 0:
            feature_global_to = (
                array.feature_size
                if isinstance(array, self.SkipArrayWriting)
                else array.shape[-1]
            )

        # First loop: Only metadata, write metadata equivalently across ranks
        for current_feature in range(feature_global_from, feature_global_to):