Skip to content

Commit

Permalink
Merge pull request #91 from chrisiacovella/caching_and_more_data
Browse files Browse the repository at this point in the history
Dataset caching overhaul and additional datasets
  • Loading branch information
chrisiacovella authored May 2, 2024
2 parents e10ebe2 + 0aa23dd commit 02f53c7
Show file tree
Hide file tree
Showing 35 changed files with 4,566 additions and 1,231 deletions.
73 changes: 61 additions & 12 deletions modelforge/curation/ani1x_curation.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from modelforge.curation.curation_baseclass import DatasetCuration
from modelforge.utils.units import *
from typing import Optional
from loguru import logger
from openff.units import unit


class ANI1xCuration(DatasetCuration):
Expand Down Expand Up @@ -223,7 +223,9 @@ def _process_downloaded(
self,
local_path_dir: str,
name: str,
unit_testing_max_records: Optional[int] = None,
max_records: Optional[int] = None,
max_conformers_per_record: Optional[int] = None,
total_conformers: Optional[int] = None,
):
"""
Processes a downloaded dataset: extracts relevant information.
Expand All @@ -234,8 +236,15 @@ def _process_downloaded(
Path to the directory that contains the raw hdf5 datafile
name: str, required
Name of the raw hdf5 file,
unit_testing_max_records: int, optional, default=None
If set to an integer ('n') the routine will only process the first 'n' records; useful for unit tests.
max_records: int, optional, default=None
If set to an integer, 'n_r', the routine will only process the first 'n_r' records, useful for unit tests.
Can be used in conjunction with max_conformers_per_record and total_conformers.
max_conformers_per_record: int, optional, default=None
If set to an integer, 'n_c', the routine will only process the first 'n_c' conformers per record, useful for unit tests.
Can be used in conjunction with max_records and total_conformers.
total_conformers: int, optional, default=None
If set to an integer, 'n_t', the routine will only process the first 'n_t' conformers in total, useful for unit tests.
Can be used in conjunction with max_records and max_conformers_per_record.
Examples
--------
Expand Down Expand Up @@ -269,14 +278,32 @@ def _process_downloaded(
}
with h5py.File(input_file_name, "r") as hf:
names = list(hf.keys())
if unit_testing_max_records is None:
if max_records is None:
n_max = len(names)
else:
n_max = unit_testing_max_records
elif max_records is not None:
n_max = max_records

conformers_counter = 0

for i, name in tqdm(enumerate(names[0:n_max]), total=n_max):
if total_conformers is not None:
if conformers_counter >= total_conformers:
break

# Extract the total number of configurations for a given molecule
n_configs = hf[name]["coordinates"].shape[0]

if max_conformers_per_record is not None:
conformers_per_molecule = min(
hf[name]["coordinates"].shape[0], max_conformers_per_record
)
else:
conformers_per_molecule = hf[name]["coordinates"].shape[0]

if total_conformers is not None:
if conformers_counter + conformers_per_molecule > total_conformers:
conformers_per_molecule = total_conformers - conformers_counter

n_configs = conformers_per_molecule

keys_list = list(hf[name].keys())

Expand All @@ -300,13 +327,17 @@ def _process_downloaded(
if param_in in add_new_axis:
temp = temp[..., newaxis]

temp = temp[0:conformers_per_molecule]

param_unit = param_data["u_in"]
if param_unit is not None:
ani1x_temp[param_out] = temp * param_unit
else:
ani1x_temp[param_out] = temp

self.data.append(ani1x_temp)
conformers_counter += conformers_per_molecule

if self.convert_units:
self._convert_units()
# From documentation: By default, objects inside group are iterated in alphanumeric order.
Expand All @@ -318,7 +349,9 @@ def _process_downloaded(
def process(
self,
force_download: bool = False,
unit_testing_max_records: Optional[int] = None,
max_records: Optional[int] = None,
max_conformers_per_record: Optional[int] = None,
total_conformers: Optional[int] = None,
) -> None:
"""
Downloads the dataset, extracts relevant information, and writes an hdf5 file.
Expand All @@ -328,8 +361,15 @@ def process(
force_download: bool, optional, default=False
If the raw data_file is present in the local_cache_dir, the local copy will be used.
If True, this will force the software to download the data again, even if present.
unit_testing_max_records: int, optional, default=None
If set to an integer, 'n', the routine will only process the first 'n' records, useful for unit tests.
max_records: int, optional, default=None
If set to an integer, 'n_r', the routine will only process the first 'n_r' records, useful for unit tests.
Can be used in conjunction with max_conformers_per_record and total_conformers.
max_conformers_per_record: int, optional, default=None
If set to an integer, 'n_c', the routine will only process the first 'n_c' conformers per record, useful for unit tests.
Can be used in conjunction with max_records and total_conformers.
total_conformers: int, optional, default=None
If set to an integer, 'n_t', the routine will only process the first 'n_t' conformers in total, useful for unit tests.
Can be used in conjunction with max_records and max_conformers_per_record.
Examples
--------
Expand All @@ -338,6 +378,11 @@ def process(
>>> ani1_data.process()
"""
if max_records is not None and total_conformers is not None:
raise Exception(
"max_records and total_conformers cannot be set at the same time."
)

from modelforge.utils.remote import download_from_figshare

url = self.dataset_download_url
Expand All @@ -356,7 +401,11 @@ def process(
if self.name is None:
raise Exception("Failed to retrieve name of file from figshare.")
self._process_downloaded(
self.local_cache_dir, self.name, unit_testing_max_records
self.local_cache_dir,
self.name,
max_records=max_records,
max_conformers_per_record=max_conformers_per_record,
total_conformers=total_conformers,
)

self._generate_hdf5()
84 changes: 68 additions & 16 deletions modelforge/curation/ani2x_curation.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
from modelforge.curation.curation_baseclass import DatasetCuration
from modelforge.utils.units import *
from typing import Optional
from loguru import logger
from openff.units import unit


class ANI2xCuration(DatasetCuration):
"""
Routines to fetch and process the ANI-2x dataset into a curated hdf5 file.
The ANI-2x data set includes properties for small organic molecules that contain
H, C, N, O, S, F, and Cl. This dataset contains 9651712 conformers for 200,000
H, C, N, O, S, F, and Cl. This dataset contains 9651712 conformers for nearly 200,000 molecules.
This will fetch data generated with the wB97X/631Gd level of theory
used in the original ANI-2x paper, calculated using Gaussian 09
Expand Down Expand Up @@ -100,7 +100,9 @@ def _process_downloaded(
self,
local_path_dir: str,
name: str,
unit_testing_max_records: Optional[int] = None,
max_records: Optional[int] = None,
max_conformers_per_record: Optional[int] = None,
total_conformers: Optional[int] = None,
):
"""
Processes a downloaded dataset: extracts relevant information.
Expand All @@ -111,8 +113,16 @@ def _process_downloaded(
Path to the directory that contains the raw hdf5 datafile
name: str, required
Name of the raw hdf5 file,
unit_testing_max_records: int, optional, default=None
If set to an integer ('n') the routine will only process the first 'n' records; useful for unit tests.
max_records: int, optional, default=None
If set to an integer, 'n_r', the routine will only process the first 'n_r' records, useful for unit tests.
Can be used in conjunction with max_conformers_per_record.
max_conformers_per_record: int, optional, default=None
If set to an integer, 'n_c', the routine will only process the first 'n_c' conformers per record, useful for unit tests.
Can be used in conjunction with max_records or total_conformers.
total_conformers: int, optional, default=None
If set to an integer, 'n_t', the routine will only process the first 'n_t' conformers in total, useful for unit tests.
Can be used in conjunction with max_conformers_per_record.
Examples
--------
Expand All @@ -122,6 +132,9 @@ def _process_downloaded(

input_file_name = f"{local_path_dir}/{name}"
logger.debug(f"Processing {input_file_name}.")

conformers_counter = 0

with h5py.File(input_file_name, "r") as hf:
# The ani2x hdf5 file groups molecules by number of atoms
# we need to break up each of these groups into individual molecules
Expand All @@ -145,16 +158,24 @@ def _process_downloaded(

unique_molecules = np.unique(species, axis=0)

if unit_testing_max_records is None:
if max_records is None:
n_max = unique_molecules.shape[0]
else:
n_max = min(unit_testing_max_records, unique_molecules.shape[0])
unit_testing_max_records -= n_max
n_max = min(max_records, unique_molecules.shape[0])
max_records -= n_max

if n_max == 0:
break

for i, molecule in tqdm(
enumerate(unique_molecules[0:n_max]), total=n_max
):
# stop processing if we have reached the total number of conformers

if total_conformers is not None:
if conformers_counter >= total_conformers:
break

ds_temp = {}
                    # molecule represents an array of atomic species, e.g., [ 8, 8 ] is O_2
                    # here we will create an array of shape (num_conformers, num_atoms) of bools
Expand All @@ -174,27 +195,41 @@ def _process_downloaded(
ds_temp["name"] = molecule_as_string
ds_temp["atomic_numbers"] = molecule.reshape(-1, 1)

ds_temp["n_configs"] = int(np.sum(mask))
conformers_per_molecule = int(np.sum(mask))
if max_conformers_per_record is not None:
conformers_per_molecule = min(
conformers_per_molecule, max_conformers_per_record
)
if total_conformers is not None:
conformers_per_molecule = min(
conformers_per_molecule,
total_conformers - conformers_counter,
)
ds_temp["n_configs"] = conformers_per_molecule

ds_temp["geometry"] = (
coordinates[mask] * self.qm_parameters["geometry"]["u_in"]
)
)[0:conformers_per_molecule]
ds_temp["energies"] = (
energies[mask].reshape(-1, 1)
* self.qm_parameters["energies"]["u_in"]
)
)[0:conformers_per_molecule]
ds_temp["forces"] = (
forces[mask] * self.qm_parameters["forces"]["u_in"]
)
)[0:conformers_per_molecule]

self.data.append(ds_temp)
conformers_counter += conformers_per_molecule

if self.convert_units:
self._convert_units()

def process(
self,
force_download: bool = False,
unit_testing_max_records: Optional[int] = None,
max_records: Optional[int] = None,
max_conformers_per_record: Optional[int] = None,
total_conformers: Optional[int] = None,
) -> None:
"""
Downloads the dataset, extracts relevant information, and writes an hdf5 file.
Expand All @@ -204,8 +239,16 @@ def process(
force_download: bool, optional, default=False
If the raw data_file is present in the local_cache_dir, the local copy will be used.
If True, this will force the software to download the data again, even if present.
unit_testing_max_records: int, optional, default=None
If set to an integer, 'n', the routine will only process the first 'n' records, useful for unit tests.
max_records: int, optional, default=None
If set to an integer, 'n_r', the routine will only process the first 'n_r' records, useful for unit tests.
Can be used in conjunction with max_conformers_per_record.
max_conformers_per_record: int, optional, default=None
If set to an integer, 'n_c', the routine will only process the first 'n_c' conformers per record, useful for unit tests.
Can be used in conjunction with max_records or total_conformers.
total_conformers: int, optional, default=None
If set to an integer, 'n_t', the routine will only process the first 'n_t' conformers in total, useful for unit tests.
Can be used in conjunction with max_conformers_per_record.
Examples
--------
Expand All @@ -214,6 +257,11 @@ def process(
>>> ani2_data.process()
"""
if max_records is not None and total_conformers is not None:
raise Exception(
"max_records and total_conformers cannot be set at the same time."
)

from modelforge.utils.remote import download_from_zenodo

url = self.dataset_download_url
Expand Down Expand Up @@ -247,7 +295,11 @@ def process(

# process the rest of the dataset
self._process_downloaded(
f"{self.local_cache_dir}/final_h5/", hdf5_filename, unit_testing_max_records
f"{self.local_cache_dir}/final_h5/",
hdf5_filename,
max_records,
max_conformers_per_record,
total_conformers,
)

self._generate_hdf5()
Loading

0 comments on commit 02f53c7

Please sign in to comment.