Skip to content

Commit

Permalink
Rebuild a fits manifest from an HSC data directory and speedup HSC da…
Browse files Browse the repository at this point in the history
…ta loading (#115)

# Rebuild a fits manifest from an HSC data directory.
* Added a new verb rebuild_manifest
* When run with the HSC dataset class this verb will:
  0) Scan the data directory and ingest HSC cutout files
  1) Read in the original catalog file configured for download for metadata
  2) Write out rebuilt_manifest.fits in the data directory

* Fixed up config resolution so that fibad_config.toml in the cwd
  works again for CLI invocations.
* Added progressive logging for long-running steps.
* The rebuild command never opens or uses the manifest file in the data directory,
   because that file is assumed to be corrupt.

# Speeding up HSC Data loading
* Parallelized _scan_file_dimensions() using Schwimmbad and
   multiprocessing to extract the dimensions of files in
   HSCDataSet, giving a speedup of 124x on datasets of 10M+ files.

* Added progressive log entries for HSCDataSet file scan
* Use the manifest by default when no filter_catalog is provided.
   This skips the file scan on large datasets.
* Choose number of processes in a way that doesn't run afoul of system limits

Co-authored-by: Drew Oldag <[email protected]>
  • Loading branch information
mtauraso and drewoldag authored Nov 19, 2024
1 parent ae484c1 commit 4eb8301
Show file tree
Hide file tree
Showing 7 changed files with 345 additions and 67 deletions.
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ dependencies = [
"toml", # Used to load configuration files as dictionaries
"torch", # Used for CNN model and in train.py
"torchvision", # Used in hsc data loader, example autoencoder, and CNN model data set
"schwimmbad", # Used to speedup hsc data loader file scans
]

[project.scripts]
Expand Down
63 changes: 31 additions & 32 deletions src/fibad/config_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,15 +76,15 @@ def __init__(
runtime_config_filepath: Union[Path, str] = None,
default_config_filepath: Union[Path, str] = DEFAULT_CONFIG_FILEPATH,
):
self.fibad_default_config = self._read_runtime_config(default_config_filepath)
self.fibad_default_config = ConfigManager._read_runtime_config(default_config_filepath)

self.runtime_config_filepath = runtime_config_filepath
if self.runtime_config_filepath is None:
self.runtime_config_filepath = ConfigManager.resolve_runtime_config(runtime_config_filepath)
if self.runtime_config_filepath is DEFAULT_CONFIG_FILEPATH:
self.user_specific_config = ConfigDict()
else:
self.user_specific_config = self._read_runtime_config(self.runtime_config_filepath)
self.user_specific_config = ConfigManager._read_runtime_config(self.runtime_config_filepath)

self.external_library_config_paths = self._find_external_library_default_config_paths(
self.external_library_config_paths = ConfigManager._find_external_library_default_config_paths(
self.user_specific_config
)

Expand All @@ -93,7 +93,7 @@ def __init__(

self.config = self.merge_configs(self.overall_default_config, self.user_specific_config)
if not self.config["general"]["dev_mode"]:
self._validate_runtime_config(self.config, self.overall_default_config)
ConfigManager._validate_runtime_config(self.config, self.overall_default_config)

@staticmethod
def _read_runtime_config(config_filepath: Union[Path, str] = DEFAULT_CONFIG_FILEPATH) -> ConfigDict:
Expand Down Expand Up @@ -232,38 +232,37 @@ def _validate_runtime_config(runtime_config: ConfigDict, default_config: ConfigD
raise RuntimeError(msg)
ConfigManager._validate_runtime_config(runtime_config[key], default_config[key])

@staticmethod
def resolve_runtime_config(runtime_config_filepath: Union[Path, str, None] = None) -> Path:
"""Resolve a user-supplied runtime config to where we will actually pull config from.
def resolve_runtime_config(runtime_config_filepath: Union[Path, str, None] = None) -> Path:
"""Resolve a user-supplied runtime config to where we will actually pull config from.
1) If a runtime config file is specified, we will use that file
2) If no file is specified and there is a file named "fibad_config.toml" in the cwd we will use that file
3) If no file is specified and there is no file named "fibad_config.toml" in the current working directory
we will exclusively work off the configuration defaults in the packaged "fibad_default_config.toml"
file.
1) If a runtime config file is specified, we will use that file.
2) If no file is specified and there is a file named "fibad_config.toml" in the cwd we will use it.
3) If no file is specified and there is no file named "fibad_config.toml" in the cwd we will
exclusively work off the configuration defaults in the packaged "fibad_default_config.toml" file.
Parameters
----------
runtime_config_filepath : Union[Path, str, None], optional
Location of the supplied config file, by default None
Parameters
----------
runtime_config_filepath : Union[Path, str, None], optional
Location of the supplied config file, by default None
Returns
-------
Path
Path to the configuration file ultimately used for config resolution. When we fall back to the
package supplied default config file, the Path to that file is returned.
"""
if isinstance(runtime_config_filepath, str):
runtime_config_filepath = Path(runtime_config_filepath)
Returns
-------
Path
Path to the configuration file ultimately used for config resolution. When we fall back to the
package supplied default config file, the Path to that file is returned.
"""
if isinstance(runtime_config_filepath, str):
runtime_config_filepath = Path(runtime_config_filepath)

# If a named config exists in cwd, and no config specified on cmdline, use cwd.
if runtime_config_filepath is None and DEFAULT_USER_CONFIG_FILEPATH.exists():
runtime_config_filepath = DEFAULT_USER_CONFIG_FILEPATH
# If a named config exists in cwd, and no config specified on cmdline, use cwd.
if runtime_config_filepath is None and DEFAULT_USER_CONFIG_FILEPATH.exists():
runtime_config_filepath = DEFAULT_USER_CONFIG_FILEPATH

if runtime_config_filepath is None:
runtime_config_filepath = DEFAULT_CONFIG_FILEPATH
if runtime_config_filepath is None:
runtime_config_filepath = DEFAULT_CONFIG_FILEPATH

return runtime_config_filepath
return runtime_config_filepath


def create_results_dir(config: ConfigDict, postfix: Union[Path, str]) -> Path:
Expand Down
Loading

0 comments on commit 4eb8301

Please sign in to comment.