Skip to content

Commit

Permalink
Merge pull request #91 from chrisiacovella/caching_and_more_data
Browse files Browse the repository at this point in the history
Dataset caching overhaul and additional datasets
  • Loading branch information
chrisiacovella authored May 2, 2024
2 parents e10ebe2 + 0aa23dd commit 02f53c7
Show file tree
Hide file tree
Showing 35 changed files with 4,566 additions and 1,231 deletions.
73 changes: 61 additions & 12 deletions modelforge/curation/ani1x_curation.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from modelforge.curation.curation_baseclass import DatasetCuration
from modelforge.utils.units import *
from typing import Optional
from loguru import logger
from openff.units import unit


class ANI1xCuration(DatasetCuration):
Expand Down Expand Up @@ -223,7 +223,9 @@ def _process_downloaded(
self,
local_path_dir: str,
name: str,
unit_testing_max_records: Optional[int] = None,
max_records: Optional[int] = None,
max_conformers_per_record: Optional[int] = None,
total_conformers: Optional[int] = None,
):
"""
Processes a downloaded dataset: extracts relevant information.
Expand All @@ -234,8 +236,15 @@ def _process_downloaded(
Path to the directory that contains the raw hdf5 datafile
name: str, required
Name of the raw hdf5 file,
unit_testing_max_records: int, optional, default=None
If set to an integer ('n') the routine will only process the first 'n' records; useful for unit tests.
max_records: int, optional, default=None
If set to an integer, 'n_r', the routine will only process the first 'n_r' records, useful for unit tests.
Can be used in conjunction with max_conformers_per_record and total_conformers.
max_conformers_per_record: int, optional, default=None
If set to an integer, 'n_c', the routine will only process the first 'n_c' conformers per record, useful for unit tests.
Can be used in conjunction with max_records and total_conformers.
total_conformers: int, optional, default=None
If set to an integer, 'n_t', the routine will only process the first 'n_t' conformers in total, useful for unit tests.
Can be used in conjunction with max_records and max_conformers_per_record.
Examples
--------
Expand Down Expand Up @@ -269,14 +278,32 @@ def _process_downloaded(
}
with h5py.File(input_file_name, "r") as hf:
names = list(hf.keys())
if unit_testing_max_records is None:
if max_records is None:
n_max = len(names)
else:
n_max = unit_testing_max_records
elif max_records is not None:
n_max = max_records

conformers_counter = 0

for i, name in tqdm(enumerate(names[0:n_max]), total=n_max):
if total_conformers is not None:
if conformers_counter >= total_conformers:
break

# Extract the total number of configurations for a given molecule
n_configs = hf[name]["coordinates"].shape[0]

if max_conformers_per_record is not None:
conformers_per_molecule = min(
hf[name]["coordinates"].shape[0], max_conformers_per_record
)
else:
conformers_per_molecule = hf[name]["coordinates"].shape[0]

if total_conformers is not None:
if conformers_counter + conformers_per_molecule > total_conformers:
conformers_per_molecule = total_conformers - conformers_counter

n_configs = conformers_per_molecule

keys_list = list(hf[name].keys())

Expand All @@ -300,13 +327,17 @@ def _process_downloaded(
if param_in in add_new_axis:
temp = temp[..., newaxis]

temp = temp[0:conformers_per_molecule]

param_unit = param_data["u_in"]
if param_unit is not None:
ani1x_temp[param_out] = temp * param_unit
else:
ani1x_temp[param_out] = temp

self.data.append(ani1x_temp)
conformers_counter += conformers_per_molecule

if self.convert_units:
self._convert_units()
# From documentation: By default, objects inside group are iterated in alphanumeric order.
Expand All @@ -318,7 +349,9 @@ def _process_downloaded(
def process(
self,
force_download: bool = False,
unit_testing_max_records: Optional[int] = None,
max_records: Optional[int] = None,
max_conformers_per_record: Optional[int] = None,
total_conformers: Optional[int] = None,
) -> None:
"""
Downloads the dataset, extracts relevant information, and writes an hdf5 file.
Expand All @@ -328,8 +361,15 @@ def process(
force_download: bool, optional, default=False
If the raw data_file is present in the local_cache_dir, the local copy will be used.
If True, this will force the software to download the data again, even if present.
unit_testing_max_records: int, optional, default=None
If set to an integer, 'n', the routine will only process the first 'n' records, useful for unit tests.
max_records: int, optional, default=None
If set to an integer, 'n_r', the routine will only process the first 'n_r' records, useful for unit tests.
Can be used in conjunction with max_conformers_per_record and total_conformers.
max_conformers_per_record: int, optional, default=None
If set to an integer, 'n_c', the routine will only process the first 'n_c' conformers per record, useful for unit tests.
Can be used in conjunction with max_records and total_conformers.
total_conformers: int, optional, default=None
If set to an integer, 'n_t', the routine will only process the first 'n_t' conformers in total, useful for unit tests.
Can be used in conjunction with max_records and max_conformers_per_record.
Examples
--------
Expand All @@ -338,6 +378,11 @@ def process(
>>> ani1_data.process()
"""
if max_records is not None and total_conformers is not None:
raise Exception(
"max_records and total_conformers cannot be set at the same time."
)

from modelforge.utils.remote import download_from_figshare

url = self.dataset_download_url
Expand All @@ -356,7 +401,11 @@ def process(
if self.name is None:
raise Exception("Failed to retrieve name of file from figshare.")
self._process_downloaded(
self.local_cache_dir, self.name, unit_testing_max_records
self.local_cache_dir,
self.name,
max_records=max_records,
max_conformers_per_record=max_conformers_per_record,
total_conformers=total_conformers,
)

self._generate_hdf5()
84 changes: 68 additions & 16 deletions modelforge/curation/ani2x_curation.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
from modelforge.curation.curation_baseclass import DatasetCuration
from modelforge.utils.units import *
from typing import Optional
from loguru import logger
from openff.units import unit


class ANI2xCuration(DatasetCuration):
"""
Routines to fetch and process the ANI-2x dataset into a curated hdf5 file.
The ANI-2x data set includes properties for small organic molecules that contain
H, C, N, O, S, F, and Cl. This dataset contains 9651712 conformers for 200,000
H, C, N, O, S, F, and Cl. This dataset contains 9651712 conformers for nearly 200,000 molecules.
This will fetch data generated with the wB97X/631Gd level of theory
used in the original ANI-2x paper, calculated using Gaussian 09
Expand Down Expand Up @@ -100,7 +100,9 @@ def _process_downloaded(
self,
local_path_dir: str,
name: str,
unit_testing_max_records: Optional[int] = None,
max_records: Optional[int] = None,
max_conformers_per_record: Optional[int] = None,
total_conformers: Optional[int] = None,
):
"""
Processes a downloaded dataset: extracts relevant information.
Expand All @@ -111,8 +113,16 @@ def _process_downloaded(
Path to the directory that contains the raw hdf5 datafile
name: str, required
Name of the raw hdf5 file,
unit_testing_max_records: int, optional, default=None
If set to an integer ('n') the routine will only process the first 'n' records; useful for unit tests.
max_records: int, optional, default=None
If set to an integer, 'n_r', the routine will only process the first 'n_r' records, useful for unit tests.
Can be used in conjunction with max_conformers_per_record.
max_conformers_per_record: int, optional, default=None
If set to an integer, 'n_c', the routine will only process the first 'n_c' conformers per record, useful for unit tests.
Can be used in conjunction with max_records or total_conformers.
total_conformers: int, optional, default=None
If set to an integer, 'n_t', the routine will only process the first 'n_t' conformers in total, useful for unit tests.
Can be used in conjunction with max_conformers_per_record.
Examples
--------
Expand All @@ -122,6 +132,9 @@ def _process_downloaded(

input_file_name = f"{local_path_dir}/{name}"
logger.debug(f"Processing {input_file_name}.")

conformers_counter = 0

with h5py.File(input_file_name, "r") as hf:
# The ani2x hdf5 file groups molecules by number of atoms
# we need to break up each of these groups into individual molecules
Expand All @@ -145,16 +158,24 @@ def _process_downloaded(

unique_molecules = np.unique(species, axis=0)

if unit_testing_max_records is None:
if max_records is None:
n_max = unique_molecules.shape[0]
else:
n_max = min(unit_testing_max_records, unique_molecules.shape[0])
unit_testing_max_records -= n_max
n_max = min(max_records, unique_molecules.shape[0])
max_records -= n_max

if n_max == 0:
break

for i, molecule in tqdm(
enumerate(unique_molecules[0:n_max]), total=n_max
):
# stop processing if we have reached the total number of conformers

if total_conformers is not None:
if conformers_counter >= total_conformers:
break

ds_temp = {}
                    # molecule represents an array of atomic species, e.g., [ 8, 8 ] is O_2
                    # here we will create an array of shape (num_conformers, num_atoms) of bools
Expand All @@ -174,27 +195,41 @@ def _process_downloaded(
ds_temp["name"] = molecule_as_string
ds_temp["atomic_numbers"] = molecule.reshape(-1, 1)

ds_temp["n_configs"] = int(np.sum(mask))
conformers_per_molecule = int(np.sum(mask))
if max_conformers_per_record is not None:
conformers_per_molecule = min(
conformers_per_molecule, max_conformers_per_record
)
if total_conformers is not None:
conformers_per_molecule = min(
conformers_per_molecule,
total_conformers - conformers_counter,
)
ds_temp["n_configs"] = conformers_per_molecule

ds_temp["geometry"] = (
coordinates[mask] * self.qm_parameters["geometry"]["u_in"]
)
)[0:conformers_per_molecule]
ds_temp["energies"] = (
energies[mask].reshape(-1, 1)
* self.qm_parameters["energies"]["u_in"]
)
)[0:conformers_per_molecule]
ds_temp["forces"] = (
forces[mask] * self.qm_parameters["forces"]["u_in"]
)
)[0:conformers_per_molecule]

self.data.append(ds_temp)
conformers_counter += conformers_per_molecule

if self.convert_units:
self._convert_units()

def process(
self,
force_download: bool = False,
unit_testing_max_records: Optional[int] = None,
max_records: Optional[int] = None,
max_conformers_per_record: Optional[int] = None,
total_conformers: Optional[int] = None,
) -> None:
"""
Downloads the dataset, extracts relevant information, and writes an hdf5 file.
Expand All @@ -204,8 +239,16 @@ def process(
force_download: bool, optional, default=False
If the raw data_file is present in the local_cache_dir, the local copy will be used.
If True, this will force the software to download the data again, even if present.
unit_testing_max_records: int, optional, default=None
If set to an integer, 'n', the routine will only process the first 'n' records, useful for unit tests.
max_records: int, optional, default=None
If set to an integer, 'n_r', the routine will only process the first 'n_r' records, useful for unit tests.
Can be used in conjunction with max_conformers_per_record.
max_conformers_per_record: int, optional, default=None
If set to an integer, 'n_c', the routine will only process the first 'n_c' conformers per record, useful for unit tests.
Can be used in conjunction with max_records or total_conformers.
total_conformers: int, optional, default=None
If set to an integer, 'n_t', the routine will only process the first 'n_t' conformers in total, useful for unit tests.
Can be used in conjunction with max_conformers_per_record.
Examples
--------
Expand All @@ -214,6 +257,11 @@ def process(
>>> ani2_data.process()
"""
if max_records is not None and total_conformers is not None:
raise Exception(
"max_records and total_conformers cannot be set at the same time."
)

from modelforge.utils.remote import download_from_zenodo

url = self.dataset_download_url
Expand Down Expand Up @@ -247,7 +295,11 @@ def process(

# process the rest of the dataset
self._process_downloaded(
f"{self.local_cache_dir}/final_h5/", hdf5_filename, unit_testing_max_records
f"{self.local_cache_dir}/final_h5/",
hdf5_filename,
max_records,
max_conformers_per_record,
total_conformers,
)

self._generate_hdf5()
Loading

0 comments on commit 02f53c7

Please sign in to comment.