Implement on-the-fly descriptor calculation #630

Merged
Changes from 1 commit
Commits
24 commits
e154d2a
On-the-fly training works for the RAM case
RandomDefaultUser Jan 6, 2025
88a5edb
Lazy Loading training works now
RandomDefaultUser Jan 6, 2025
0e4ddfe
Checkpointing works now as well
RandomDefaultUser Jan 7, 2025
c43fefc
Made method private
RandomDefaultUser Jan 7, 2025
7a90ed5
Tester class now also works with on-the-fly calculations
RandomDefaultUser Jan 7, 2025
39a40c3
Prefetching works
RandomDefaultUser Jan 7, 2025
2201e34
Is this already enough to get DDP working?
RandomDefaultUser Jan 7, 2025
bc79637
Merge branch 'refs/heads/develop_lenz' into descriptors_on_the_fly
RandomDefaultUser Jan 7, 2025
c1737c5
Renamed "additional info", since it will be used more regularly with …
RandomDefaultUser Jan 7, 2025
4001d0a
Fixing a parallel writing bug
RandomDefaultUser Jan 8, 2025
e6a0723
Can I use DDP and MPI at the same time?
RandomDefaultUser Jan 8, 2025
5162404
It does not help
RandomDefaultUser Jan 8, 2025
3997229
Getting rid of the parallel modification for now
RandomDefaultUser Jan 8, 2025
15ff5dc
Shuffling from atomic positions works now
RandomDefaultUser Jan 9, 2025
8e8cb3f
Shuffling now works as part of the temporary pipeline
RandomDefaultUser Jan 9, 2025
615792b
Fixed docstrings
RandomDefaultUser Jan 9, 2025
d0e8de6
Added automatic snapshot type detection
RandomDefaultUser Jan 9, 2025
07126f1
Added new temporary framework to examples
RandomDefaultUser Jan 14, 2025
a7d7bd3
Fixed examples
RandomDefaultUser Jan 14, 2025
f0738ce
Added documentation
RandomDefaultUser Jan 14, 2025
d359057
Small adjustment in example
RandomDefaultUser Jan 14, 2025
9d87a2f
Implemented rudimentary json+openpmd
RandomDefaultUser Jan 14, 2025
d1960d4
Added tests for on-the-fly calculations
RandomDefaultUser Jan 14, 2025
ac255b2
Small adjustment in test
RandomDefaultUser Jan 15, 2025
On-the-fly training works for the RAM case
RandomDefaultUser committed Jan 6, 2025
commit e154d2a49bcab8778f02f04f19802753f5dd85de
39 changes: 38 additions & 1 deletion mala/datahandling/data_handler.py
@@ -1,6 +1,7 @@
"""DataHandler class that loads and scales data."""

import os
import tempfile

import numpy as np
import torch
@@ -169,6 +170,19 @@ def clear_data(self):
self.output_data_scaler.reset()
super(DataHandler, self).clear_data()

def delete_temporary_data(self):
"""
Delete temporary data files.

These may have been created during a training or testing process
when using atomic positions for on-the-fly calculation of descriptors
rather than precomputed data files.
"""
for snapshot in self.parameters.snapshot_directories_list:
if snapshot.temporary_input_file is not None:
if os.path.isfile(snapshot.temporary_input_file):
os.remove(snapshot.temporary_input_file)

# Preparing data
######################

@@ -595,14 +609,37 @@ def __load_data(self, function, data_type):
snapshot.input_npy_directory, snapshot.input_npy_file
)
units = snapshot.input_units

# If the input for the descriptors is actually a JSON
# file then we need to calculate the descriptors.
if snapshot.snapshot_type == "json+numpy":
snapshot.temporary_input_file = (
tempfile.NamedTemporaryFile(
delete=False,
prefix=snapshot.input_npy_file.split(".")[0],
suffix=".in.npy",
dir=snapshot.input_npy_directory,
).name
)
descriptors, grid = (
self.descriptor_calculator.calculate_from_json(
file
)
)
np.save(snapshot.temporary_input_file, descriptors)
file = snapshot.temporary_input_file

else:
file = os.path.join(
snapshot.output_npy_directory,
snapshot.output_npy_file,
)
units = snapshot.output_units

if snapshot.snapshot_type == "numpy":
if (
snapshot.snapshot_type == "numpy"
or snapshot.snapshot_type == "json+numpy"
):
calculator.read_from_numpy_file(
file,
units=units,
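
In short: when a snapshot is of the new "json+numpy" type, the descriptors are computed on the fly from the atomic positions in the JSON file, cached in a temporary .in.npy file next to the original data, and removed again by delete_temporary_data() after training or testing. A minimal sketch of that flow outside the DataHandler, with placeholder file names and assuming an already configured MALA descriptor calculator:

import os
import tempfile

import numpy as np

def compute_temporary_descriptors(descriptor_calculator, json_file, directory):
    # Compute descriptors directly from the atomic positions stored in the
    # JSON file (this is what calculate_from_json() does above).
    descriptors, grid = descriptor_calculator.calculate_from_json(json_file)

    # Cache them in a temporary .in.npy file so the rest of the pipeline can
    # treat the snapshot like a precomputed numpy snapshot.
    temporary_file = tempfile.NamedTemporaryFile(
        delete=False, suffix=".in.npy", dir=directory
    ).name
    np.save(temporary_file, descriptors)
    return temporary_file

def remove_temporary_descriptors(temporary_file):
    # Counterpart of delete_temporary_data(): clean up once training is done.
    if temporary_file is not None and os.path.isfile(temporary_file):
        os.remove(temporary_file)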
14 changes: 13 additions & 1 deletion mala/datahandling/data_handler_base.py
@@ -207,6 +207,15 @@ def _check_snapshots(self, comm=None):
),
comm=comm,
)
elif snapshot.snapshot_type == "json+numpy":
tmp_dimension = (
self.descriptor_calculator.read_dimensions_from_json(
os.path.join(
snapshot.input_npy_directory,
snapshot.input_npy_file,
)
)
)
else:
raise Exception("Unknown snapshot file type.")

@@ -235,7 +244,10 @@ def _check_snapshots(self, comm=None):
snapshot.output_npy_directory,
min_verbosity=1,
)
if snapshot.snapshot_type == "numpy":
if (
snapshot.snapshot_type == "numpy"
or snapshot.snapshot_type == "json+numpy"
):
tmp_dimension = (
self.target_calculator.read_dimensions_from_numpy_file(
os.path.join(
3 changes: 3 additions & 0 deletions mala/datahandling/snapshot.py
@@ -133,6 +133,9 @@ def __init__(
self.input_dimension = None
self.output_dimension = None

# Temporary descriptor files, which may be needed.
self.temporary_input_file = None

@classmethod
def from_json(cls, json_dict):
"""
4 changes: 4 additions & 0 deletions mala/descriptors/atomic_density.py
@@ -119,6 +119,10 @@ def _calculate(self, outdir, **kwargs):
else:
return self.__calculate_python(**kwargs)

def _read_feature_dimension_from_json(self, json_dict):
# For now, has to be adapted in the multielement case.
return 4

def __calculate_lammps(self, outdir, **kwargs):
"""Perform actual Gaussian descriptor calculation."""
# For version compatibility; older lammps versions (the serial version
32 changes: 19 additions & 13 deletions mala/descriptors/bispectrum.py
@@ -120,6 +120,24 @@ def _calculate(self, outdir, **kwargs):
else:
return self.__calculate_python(**kwargs)

def _read_feature_dimension_from_json(self, json_dict):
if self.parameters.descriptors_contain_xyz:
return self.__get_feature_size() - 3
else:
return self.__get_feature_size()

def __get_feature_size(self):
ncols0 = 3

# Analytical relation for fingerprint length
ncoeff = (
(self.parameters.bispectrum_twojmax + 2)
* (self.parameters.bispectrum_twojmax + 3)
* (self.parameters.bispectrum_twojmax + 4)
)
ncoeff = ncoeff // 24 # integer division
return ncols0 + ncoeff

def __calculate_lammps(self, outdir, **kwargs):
"""
Perform bispectrum calculation using LAMMPS.
@@ -173,19 +191,7 @@ def __calculate_lammps(self, outdir, **kwargs):

# Do the LAMMPS calculation and clean up.
lmp.file(self.parameters.lammps_compute_file)

# Set things not accessible from LAMMPS
# First 3 cols are x, y, z, coords
ncols0 = 3

# Analytical relation for fingerprint length
ncoeff = (
(self.parameters.bispectrum_twojmax + 2)
* (self.parameters.bispectrum_twojmax + 3)
* (self.parameters.bispectrum_twojmax + 4)
)
ncoeff = ncoeff // 24 # integer division
self.feature_size = ncols0 + ncoeff
self.feature_size = self.__get_feature_size()

# Extract data from LAMMPS calculation.
# This is different for the parallel and the serial case.
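
The refactored __get_feature_size() encodes the analytical relation for the bispectrum fingerprint length, ncoeff = (2J + 2)(2J + 3)(2J + 4) / 24 with 2J = bispectrum_twojmax, plus three leading columns for the x, y, z coordinates. A quick arithmetic check (the twojmax values below are illustrative, not defaults set by this PR):

def bispectrum_feature_size(twojmax):
    ncols0 = 3  # first three columns hold the x, y, z coordinates
    ncoeff = (twojmax + 2) * (twojmax + 3) * (twojmax + 4) // 24
    return ncols0 + ncoeff

# twojmax = 6  -> 3 +  8 * 9 * 10  // 24 = 3 + 30 = 33
# twojmax = 10 -> 3 + 12 * 13 * 14 // 24 = 3 + 91 = 94
# _read_feature_dimension_from_json() subtracts the 3 coordinate columns
# again when descriptors_contain_xyz is set.
print(bispectrum_feature_size(6), bispectrum_feature_size(10))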
27 changes: 26 additions & 1 deletion mala/descriptors/descriptor.py
@@ -1,11 +1,12 @@
"""Base class for all descriptor calculators."""

from abc import abstractmethod
from functools import cached_property
import json
import os
import tempfile

import ase
from ase.cell import Cell
from ase.units import m
from ase.neighborlist import NeighborList, NewPrimitiveNeighborList
import numpy as np
@@ -375,6 +376,16 @@ def calculate_from_qe_out(

return self._calculate(working_directory, **kwargs)

def calculate_from_json(self, json_file, working_directory=".", **kwargs):
if isinstance(json_file, str):
json_dict = json.load(open(json_file, encoding="utf-8"))
else:
json_dict = json.load(json_file)
self.grid_dimensions = json_dict["grid_dimensions"]
self._atoms = ase.Atoms.fromdict(json_dict["atoms"])
self._voxel = Cell(json_dict["voxel"]["array"])
return self._calculate(working_directory, **kwargs)

def calculate_from_atoms(
self, atoms, grid_dimensions, working_directory=".", **kwargs
):
@@ -573,6 +584,16 @@ def convert_local_to_3d(self, descriptors_np):
).transpose([2, 1, 0, 3])
return descriptors_full, local_offset, local_reach

def read_dimensions_from_json(self, json_file):
if isinstance(json_file, str):
json_dict = json.load(open(json_file, encoding="utf-8"))
else:
json_dict = json.load(json_file)
grid_dimensions = json_dict["grid_dimensions"] + [
self._read_feature_dimension_from_json(json_dict)
]
return grid_dimensions

# Private methods
#################

@@ -1021,5 +1042,9 @@ def _grid_to_coord(self, gridpoint):
def _calculate(self, outdir, **kwargs):
pass

@abstractmethod
def _read_feature_dimension_from_json(self, json_dict):
pass

def _set_feature_size_from_array(self, array):
self.feature_size = np.shape(array)[-1]
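
Both new methods expect the JSON file to provide the keys used above: "grid_dimensions", an "atoms" dictionary compatible with ase.Atoms.fromdict(), and a "voxel" entry whose "array" holds the cell spanned by a single grid voxel. A read-side sketch (the file name and example values are placeholders):

import json

import ase
from ase.cell import Cell

with open("snapshot0.info.json", encoding="utf-8") as f:
    json_dict = json.load(f)

grid_dimensions = json_dict["grid_dimensions"]    # e.g. [18, 18, 27]
atoms = ase.Atoms.fromdict(json_dict["atoms"])    # atomic positions and cell
voxel = Cell(json_dict["voxel"]["array"])         # cell spanned by one voxel

# read_dimensions_from_json() additionally appends the descriptor feature
# dimension, yielding [nx, ny, nz, feature_size] without running the full
# (and potentially expensive) descriptor calculation.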
6 changes: 6 additions & 0 deletions mala/descriptors/minterpy_descriptors.py
@@ -87,6 +87,12 @@ def backconvert_units(array, out_units):
else:
raise Exception("Unsupported unit for Minterpy descriptors.")

def _read_feature_dimension_from_json(self, json_dict):
raise Exception(
"This feature has not been implemented for Minterpy "
"descriptors."
)

def _calculate(self, atoms, outdir, grid_dimensions, **kwargs):
# For version compatibility; older lammps versions (the serial version
# we still use on some machines) have these constants as part of the
3 changes: 3 additions & 0 deletions mala/network/trainer.py
@@ -597,6 +597,9 @@ def train_network(self):
)
self.final_validation_loss = vloss

# Cleaning up temporary data files.
self.data.delete_temporary_data()

# Clean-up for pre-fetching lazy loading.
if self.data.parameters.use_lazy_loading_prefetch:
self._training_data_loaders.cleanup()
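
Combined with the DataHandler changes, training from atomic positions now looks roughly like the sketch below; file names and paths are placeholders, and the add_snapshot call is assumed to keep its existing signature apart from the new snapshot_type value:

import mala

parameters = mala.Parameters()
data_handler = mala.DataHandler(parameters)

# Atomic positions (JSON) as descriptor input, precomputed targets as output.
data_handler.add_snapshot(
    "snapshot0.info.json", "/path/to/data",
    "snapshot0.out.npy", "/path/to/data",
    "tr", snapshot_type="json+numpy",
)
data_handler.prepare_data()

network = mala.Network(parameters)
trainer = mala.Trainer(parameters, network, data_handler)

# Descriptors are computed on the fly into temporary files; at the end of
# train_network(), data.delete_temporary_data() removes them again.
trainer.train_network()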
2 changes: 1 addition & 1 deletion mala/targets/target.py
@@ -653,7 +653,7 @@ def read_additional_calculation_data(self, data, data_type=None):
}
self.atomic_forces_dft = None
self.entropy_contribution_dft_calculation = None
self.grid_dimensions = [0, 0, 0]
self.grid_dimensions = json_dict["grid_dimensions"]
self.atoms = None

for key in json_dict: