Move recursive loading of DataContainer to HDF5Content #1391

Merged
merged 7 commits on Mar 25, 2024
Changes from 6 commits
125 changes: 72 additions & 53 deletions pyiron_base/jobs/job/core.py
@@ -108,6 +108,52 @@
"""


def recursive_load_from_hdf(project_hdf5, item):
try:
group = project_hdf5[item]
if (
isinstance(group, ProjectHDFio)
and "NAME" in group
and group["NAME"] == "DataContainer"
):
return group.to_object(lazy=True)
else:
return group
except ValueError:
pass

name_lst = item.split("/")

def successive_path_splits(name_lst):
"""
Yield successive split/joins of a path, i.e.
/a/b/c/d
gives
/a/b/c, d
/a/b, c/d
/a, b/c/d
"""
for i in range(1, len(name_lst)):
# where we are looking for the data container
container_path = "/".join(name_lst[:-i])
# where we are looking for data in the container
data_path = "/".join(name_lst[-i:])
yield container_path, data_path

for container_path, data_path in successive_path_splits(name_lst):
try:
group = project_hdf5[container_path]
if (
isinstance(group, ProjectHDFio)
and "NAME" in group
and group["NAME"] == "DataContainer"
):
return group.to_object(lazy=True)[data_path]
except (ValueError, IndexError, KeyError):
# either group does not contain a data container, or it does but does not have the path we're
# looking for
pass
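
To make the traversal order concrete, here is a minimal standalone sketch of the successive splits yielded above, assuming the illustrative path "a/b/c/d":

    name_lst = "a/b/c/d".split("/")
    for i in range(1, len(name_lst)):
        container_path = "/".join(name_lst[:-i])  # candidate DataContainer group
        data_path = "/".join(name_lst[-i:])  # remainder resolved inside it
        print(container_path, "->", data_path)
    # prints:
    # a/b/c -> d
    # a/b -> c/d
    # a -> b/c/d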

class JobCore(HasGroups):
__doc__ = (
"""
@@ -906,60 +952,26 @@ def __getitem__(self, item):
Returns:
dict, list, float, int, :class:`.DataContainer`, None: data or data object; if nothing is found None is returned
"""

# first try to access HDF5 directly to make the common case fast
try:
group = self._hdf5[item]
if (
isinstance(group, ProjectHDFio)
and "NAME" in group
and group["NAME"] == "DataContainer"
):
return group.to_object(lazy=True)
else:
return group
except ValueError:
pass

name_lst = item.split("/")

def successive_path_splits(name_lst):
"""
Yield successive split/joins of a path, i.e.
/a/b/c/d
gives
/a/b/c, d
/a/b, c/d
/a, b/c/d
"""
for i in range(1, len(name_lst)):
# where we are looking for the data container
container_path = "/".join(name_lst[:-i])
# where we are looking for data in the container
data_path = "/".join(name_lst[-i:])
yield container_path, data_path

for container_path, data_path in successive_path_splits(name_lst):
try:
group = self._hdf5[container_path]
if (
isinstance(group, ProjectHDFio)
and "NAME" in group
and group["NAME"] == "DataContainer"
):
return group.to_object(lazy=True)[data_path]
except (ValueError, IndexError, KeyError):
# either group does not contain a data container, or it does but does not have the path we're
# looking for
pass

if item in self.files.list():
value = recursive_load_from_hdf(self._hdf5, item)
if value is not None:
return value

# only try to read files when no slashes are present:
# downstream code will often do something like job['path/to/output'] to check if certain values exist and branch
# on that. In cases where they don't exist, this would trigger us to decompress the job files in memory on
# every check, which slows things down a lot. Generally these value checks will be of the form output/.../...,
# i.e. contain slashes, while file accesses tend to be just the file name without slashes, so I separate those
# cases here. In those cases where we actually have subdirectories in the job folders we can beef up the
# file browser.
if "/" not in item and item in self.files.list():
warnings.warn(
"Using __getitem__ on a job to access files in deprecated: use job.files instead!",
category=DeprecationWarning,
)
return _job_read_file(self, item)

name_lst = item.split("/")
item_obj = name_lst[0]
if item_obj in self._list_ext_childs():
# ToDo: Murn['strain_0.9'] - searches the HDF5 file, but the corresponding groups do not exist there yet.
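
As a hedged sketch of the replacement suggested by the deprecation warning above (`job` and the file name "log.out" are hypothetical; job.files.list() appears in this diff, and item access on job.files is assumed to mirror it):

    # assuming `job` is a loaded pyiron job
    if "log.out" in job.files.list():  # hypothetical file name
        contents = job.files["log.out"]  # preferred over the deprecated job["log.out"]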
@@ -1089,7 +1101,6 @@ def __getattr__(self, name):
def __repr__(self):
return f"{self.__class__.__name__}({repr(self._job_dict)})"


class HDF5Content(object):
"""
Access the HDF5 file of the job
@@ -1099,12 +1110,20 @@ def __init__(self, project_hdf5):
self._project_hdf5 = project_hdf5

def __getattr__(self, name):
if name in self._project_hdf5.list_nodes():
return self._project_hdf5.__getitem__(name)
elif name in self._project_hdf5.list_groups():
return HDF5Content(self._project_hdf5.__getitem__(name))
try:
return self[name]
except KeyError:
raise AttributeError(name) from None

def __getitem__(self, item):
value = recursive_load_from_hdf(self._project_hdf5, item)
if value is not None:
return value

if item in self._project_hdf5.list_groups():
return HDF5Content(self._project_hdf5[item])
else:
raise AttributeError
raise KeyError(item)

def __dir__(self):
return self._project_hdf5.list_nodes() + self._project_hdf5.list_groups()
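
With this change, attribute and item access on job.content share a single lookup path; a hedged usage sketch (assuming a loaded pyiron job `job` with an "input" group; the nested path is illustrative):

    inp_a = job.content.input  # __getattr__ delegates to __getitem__
    inp_b = job.content["input"]  # same lookup; raises KeyError on a miss
    # entries stored inside a DataContainer resolve via recursive_load_from_hdf:
    value = job.content["input/some/nested/path"]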
4 changes: 2 additions & 2 deletions tests/job/test_hdf5content.py
@@ -8,7 +8,7 @@
from pyiron_base._tests import PyironTestCase


class DatabasePropertyIntegration(PyironTestCase):
class InspectTest(PyironTestCase):
@classmethod
def setUpClass(cls):
cls.file_location = os.path.dirname(os.path.abspath(__file__))
@@ -32,7 +32,7 @@ def test_inspect_job(self):
job_inspect.content.input.__repr__(), job_inspect["input"].__repr__()
)
self.assertEqual(
sorted(dir(job_inspect.content.input)),
sorted((job_inspect.content.input).keys()),
sorted(job_inspect["input"].list_nodes()
+ job_inspect["input"].list_groups())
)