Skip to content

Commit

Permalink
Merge pull request #1391 from pyiron/content
Browse files Browse the repository at this point in the history
Move recursive loading of DataContainer to HDF5Content
  • Loading branch information
pmrv authored Mar 25, 2024
2 parents 3099b16 + 9b760bd commit c1aa430
Show file tree
Hide file tree
Showing 2 changed files with 103 additions and 55 deletions.
154 changes: 101 additions & 53 deletions pyiron_base/jobs/job/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,81 @@
"""


def recursive_load_from_hdf(project_hdf5: ProjectHDFio, item: str):
"""
Load given item from HDF, but check also for DataContainer along the way.
If `item` exists as is in HDF, return it, otherwise break it up along every slash and try to load a
:class:`~.DataContainer` and then try to index with the remainder of the path, i.e.
>>> recursive_load_from_hdf(hdf, 'my/path/to/value')
is equivalent to one of (in this order)
>>> hdf['my/path/to'].to_object()['value']
>>> hdf['my/path'].to_object()['to/value']
>>> hdf['my'].to_object()['path/to/value']
in case
>>> hdf['/my/path/to/value']
doesn't exist.
Args:
project_hdf5 (ProjectHDFio): HDF file to access
item (str): path to value, may contain `/`
Returns:
object: whatever was found in the HDF file
None: if nothing was found in the HDF file
"""

def successive_path_splits(name_lst):
"""
Yield successive split/joins of a path, i.e.
/a/b/c/d
gives
/a/b/c, d
/a/b, c/d
/a, b/c/d
"""
for i in range(1, len(name_lst)):
# where we are looking for the data container
container_path = "/".join(name_lst[:-i])
# where we are looking for data in the container
data_path = "/".join(name_lst[-1:])
yield container_path, data_path

try:
group = project_hdf5[item]
if (
isinstance(group, ProjectHDFio)
and "NAME" in group
and group["NAME"] == "DataContainer"
):
return group.to_object(lazy=True)
else:
return group
except ValueError:
pass

name_lst = item.split("/")

for container_path, data_path in successive_path_splits(name_lst):
try:
group = project_hdf5[container_path]
if (
isinstance(group, ProjectHDFio)
and "NAME" in group
and group["NAME"] == "DataContainer"
):
return group.to_object(lazy=True)[data_path]
except (ValueError, IndexError, KeyError):
# either group does not contain a data container or it is does, but it does not have the path we're
# looking for
pass

class JobCore(HasGroups):
__doc__ = (
"""
Expand Down Expand Up @@ -906,60 +981,26 @@ def __getitem__(self, item):
Returns:
dict, list, float, int, :class:`.DataContainer`, None: data or data object; if nothing is found None is returned
"""

# first try to access HDF5 directly to make the common case fast
try:
group = self._hdf5[item]
if (
isinstance(group, ProjectHDFio)
and "NAME" in group
and group["NAME"] == "DataContainer"
):
return group.to_object(lazy=True)
else:
return group
except ValueError:
pass

name_lst = item.split("/")

def successive_path_splits(name_lst):
"""
Yield successive split/joins of a path, i.e.
/a/b/c/d
gives
/a/b/c, d
/a/b, c/d
/a, b/c/d
"""
for i in range(1, len(name_lst)):
# where we are looking for the data container
container_path = "/".join(name_lst[:-i])
# where we are looking for data in the container
data_path = "/".join(name_lst[-1:])
yield container_path, data_path

for container_path, data_path in successive_path_splits(name_lst):
try:
group = self._hdf5[container_path]
if (
isinstance(group, ProjectHDFio)
and "NAME" in group
and group["NAME"] == "DataContainer"
):
return group.to_object(lazy=True)[data_path]
except (ValueError, IndexError, KeyError):
# either group does not contain a data container or it is does, but it does not have the path we're
# looking for
pass

if item in self.files.list():
value = recursive_load_from_hdf(self._hdf5, item)
if value is not None:
return value

# only try to read files when no slashes are present:
# downstream code will often do something like job['path/to/output'] to check if certain values exist and branch
# on that. In cases where they don't exists this would then trigger us to decompress the job files in memory on
# every check which slows down things a lot. Generally these value checks will be of the form output/.../...
# i.e. contain slashes and file access tend to be just the file name without slashes, so I separate those cases
# here like this. In those cases where we actually have sub directories in the job folders we can beef up the
# file browser.
if "/" not in item and item in self.files.list():
warnings.warn(
"Using __getitem__ on a job to access files in deprecated: use job.files instead!",
category=DeprecationWarning,
)
return _job_read_file(self, item)

name_lst = item.split("/")
item_obj = name_lst[0]
if item_obj in self._list_ext_childs():
# ToDo: Murn['strain_0.9'] - sucht im HDF5 file, dort gibt es aber die entsprechenden Gruppen noch nicht.
Expand Down Expand Up @@ -1089,7 +1130,6 @@ def __getattr__(self, name):
def __repr__(self):
return f"{self.__class__.__name__}({repr(self._job_dict)})"


class HDF5Content(object):
"""
Access the HDF5 file of the job
Expand All @@ -1099,12 +1139,20 @@ def __init__(self, project_hdf5):
self._project_hdf5 = project_hdf5

def __getattr__(self, name):
if name in self._project_hdf5.list_nodes():
return self._project_hdf5.__getitem__(name)
elif name in self._project_hdf5.list_groups():
return HDF5Content(self._project_hdf5.__getitem__(name))
try:
return self[name]
except KeyError:
raise AttributeError(name) from None

def __getitem__(self, item):
value = recursive_load_from_hdf(self._project_hdf5, item)
if value is not None:
return value

if name in self._project_hdf5.list_groups():
return HDF5Content(self._project_hdf5[item])
else:
raise AttributeError
raise KeyError(item)

def __dir__(self):
return self._project_hdf5.list_nodes() + self._project_hdf5.list_groups()
Expand Down
4 changes: 2 additions & 2 deletions tests/job/test_hdf5content.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from pyiron_base._tests import PyironTestCase


class DatabasePropertyIntegration(PyironTestCase):
class InspectTest(PyironTestCase):
@classmethod
def setUpClass(cls):
cls.file_location = os.path.dirname(os.path.abspath(__file__))
Expand All @@ -32,7 +32,7 @@ def test_inspect_job(self):
job_inspect.content.input.__repr__(), job_inspect["input"].__repr__()
)
self.assertEqual(
sorted(dir(job_inspect.content.input)),
sorted((job_inspect.content.input).keys()),
sorted(job_inspect["input"].list_nodes()
+ job_inspect["input"].list_groups())
)
Expand Down

0 comments on commit c1aa430

Please sign in to comment.