From 868d123c357cf056e953c30ae509c3a97ab93427 Mon Sep 17 00:00:00 2001
From: Marvin Poul
Date: Tue, 19 Mar 2024 17:23:11 +0100
Subject: [PATCH 1/7] Move recursive loading of DataContainer to HDF5Content

This will allow users and internal code to use job.content[...] if what
they are looking for is saved in HDF5 storage, and it obviates the need
to rely on job[...], which may be slow because it also searches the
compressed job files.

I will need some logic like this to fix the slow loading of
pyiron_atomistics jobs.
---
 pyiron_base/jobs/job/core.py | 113 +++++++++++++++++++----------------
 1 file changed, 62 insertions(+), 51 deletions(-)

diff --git a/pyiron_base/jobs/job/core.py b/pyiron_base/jobs/job/core.py
index ce3a5835d..911da59e8 100644
--- a/pyiron_base/jobs/job/core.py
+++ b/pyiron_base/jobs/job/core.py
@@ -108,6 +108,52 @@
 """
 
 
+def recursive_load_from_hdf(project_hdf5, item):
+    try:
+        group = project_hdf5[item]
+        if (
+            isinstance(group, ProjectHDFio)
+            and "NAME" in group
+            and group["NAME"] == "DataContainer"
+        ):
+            return group.to_object(lazy=True)
+        else:
+            return group
+    except ValueError:
+        pass
+
+    name_lst = item.split("/")
+
+    def successive_path_splits(name_lst):
+        """
+        Yield successive split/joins of a path, i.e.
+        /a/b/c/d
+        gives
+        /a/b/c, d
+        /a/b, c/d
+        /a, b/c/d
+        """
+        for i in range(1, len(name_lst)):
+            # where we are looking for the data container
+            container_path = "/".join(name_lst[:-i])
+            # where we are looking for data in the container
+            data_path = "/".join(name_lst[-i:])
+            yield container_path, data_path
+
+    for container_path, data_path in successive_path_splits(name_lst):
+        try:
+            group = project_hdf5[container_path]
+            if (
+                isinstance(group, ProjectHDFio)
+                and "NAME" in group
+                and group["NAME"] == "DataContainer"
+            ):
+                return group.to_object(lazy=True)[data_path]
+        except (ValueError, IndexError, KeyError):
+            # either group does not contain a data container or it does, but
+            # it does not have the path we're looking for
+            pass
+
+
 class JobCore(HasGroups):
     __doc__ = (
         """
@@ -906,52 +952,10 @@ def __getitem__(self, item):
         Returns:
             dict, list, float, int, :class:`.DataContainer`, None: data or data object; if nothing is found None is returned
         """
         # first try to access HDF5 directly to make the common case fast
-        try:
-            group = self._hdf5[item]
-            if (
-                isinstance(group, ProjectHDFio)
-                and "NAME" in group
-                and group["NAME"] == "DataContainer"
-            ):
-                return group.to_object(lazy=True)
-            else:
-                return group
-        except ValueError:
-            pass
-
-        name_lst = item.split("/")
-
-        def successive_path_splits(name_lst):
-            """
-            Yield successive split/joins of a path, i.e.
-            /a/b/c/d
-            gives
-            /a/b/c, d
-            /a/b, c/d
-            /a, b/c/d
-            """
-            for i in range(1, len(name_lst)):
-                # where we are looking for the data container
-                container_path = "/".join(name_lst[:-i])
-                # where we are looking for data in the container
-                data_path = "/".join(name_lst[-i:])
-                yield container_path, data_path
-
-        for container_path, data_path in successive_path_splits(name_lst):
-            try:
-                group = self._hdf5[container_path]
-                if (
-                    isinstance(group, ProjectHDFio)
-                    and "NAME" in group
-                    and group["NAME"] == "DataContainer"
-                ):
-                    return group.to_object(lazy=True)[data_path]
-            except (ValueError, IndexError, KeyError):
-                # either group does not contain a data container or it does, but
-                # it does not have the path we're looking for
-                pass
+        value = recursive_load_from_hdf(self._project_hdf5, item)
+        if value is not None:
+            return value
 
         if item in self.files.list():
             warnings.warn(
@@ -1089,7 +1093,6 @@ def __getattr__(self, name):
 
     def __repr__(self):
        return f"{self.__class__.__name__}({repr(self._job_dict)})"
 
-
 class HDF5Content(object):
     """
     Access the HDF5 file of the job
@@ -1099,12 +1102,20 @@ def __init__(self, project_hdf5):
         self._project_hdf5 = project_hdf5
 
     def __getattr__(self, name):
-        if name in self._project_hdf5.list_nodes():
-            return self._project_hdf5.__getitem__(name)
-        elif name in self._project_hdf5.list_groups():
-            return HDF5Content(self._project_hdf5.__getitem__(name))
+        try:
+            return self[name]
+        except KeyError:
+            raise AttributeError(name) from None
+
+    def __getitem__(self, item):
+        value = recursive_load_from_hdf(self._project_hdf5, item)
+        if value is not None:
+            return value
+
+        if item in self._project_hdf5.list_groups():
+            return HDF5Content(self._project_hdf5[item])
         else:
-            raise AttributeError
+            raise KeyError(item)
 
     def __dir__(self):
         return self._project_hdf5.list_nodes() + self._project_hdf5.list_groups()

From cf8bc28b6080dc8298b708a16ff16e1067621e2d Mon Sep 17 00:00:00 2001
From: Marvin Poul
Date: Tue, 19 Mar 2024 23:07:53 +0100
Subject: [PATCH 2/7] Fix typo

---
 pyiron_base/jobs/job/core.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyiron_base/jobs/job/core.py b/pyiron_base/jobs/job/core.py
index 911da59e8..d6fc89055 100644
--- a/pyiron_base/jobs/job/core.py
+++ b/pyiron_base/jobs/job/core.py
@@ -953,7 +953,7 @@ def __getitem__(self, item):
             dict, list, float, int, :class:`.DataContainer`, None: data or data object; if nothing is found None is returned
         """
         # first try to access HDF5 directly to make the common case fast
-        value = recursive_load_from_hdf(self._project_hdf5, item)
+        value = recursive_load_from_hdf(self._hdf5, item)
         if value is not None:
             return value

From b93ad8fbeeaca32b9b66a5d6a56857e4b89b78fa Mon Sep 17 00:00:00 2001
From: Marvin Poul
Date: Tue, 19 Mar 2024 23:57:18 +0100
Subject: [PATCH 3/7] redefine missing var

---
 pyiron_base/jobs/job/core.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyiron_base/jobs/job/core.py b/pyiron_base/jobs/job/core.py
index d6fc89055..55a5cd613 100644
--- a/pyiron_base/jobs/job/core.py
+++ b/pyiron_base/jobs/job/core.py
@@ -964,6 +964,7 @@ def __getitem__(self, item):
             )
             return _job_read_file(self, item)
 
+        name_lst = item.split("/")
         item_obj = name_lst[0]
         if item_obj in self._list_ext_childs():
             # ToDo: Murn['strain_0.9'] - searches the HDF5 file, but the corresponding groups do not exist there yet.
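For context on what the refactoring above enables: recursive_load_from_hdf lets a single job.content[...] access resolve values nested inside stored DataContainer objects. The sketch below illustrates the intended access pattern; the project and job names are hypothetical, and it assumes a finished job whose output was written through a DataContainer:

    from pyiron_base import Project

    pr = Project("demo")        # hypothetical project name
    job = pr.load("my_job")     # hypothetical job with DataContainer output

    # a plain HDF5 node is returned directly
    status = job.content["status"]

    # a value nested inside a stored DataContainer is resolved by splitting
    # the path: hdf["output/data/energy"] is tried first, then
    # hdf["output/data"].to_object()["energy"], then
    # hdf["output"].to_object()["data/energy"]
    energy = job.content["output/data/energy"]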
From 5dfd269273fd23db3ee67d3db35207001c33ecab Mon Sep 17 00:00:00 2001
From: Marvin Poul
Date: Fri, 22 Mar 2024 15:01:54 +0100
Subject: [PATCH 4/7] Rename test case

Old name made no sense.
---
 tests/job/test_hdf5content.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/job/test_hdf5content.py b/tests/job/test_hdf5content.py
index 80cfd4dec..6b7e88d6f 100644
--- a/tests/job/test_hdf5content.py
+++ b/tests/job/test_hdf5content.py
@@ -8,7 +8,7 @@
 from pyiron_base._tests import PyironTestCase
 
 
-class DatabasePropertyIntegration(PyironTestCase):
+class InspectTest(PyironTestCase):
     @classmethod
     def setUpClass(cls):
         cls.file_location = os.path.dirname(os.path.abspath(__file__))

From c931e3190d851a0d1ac130a85fa5936b3766f0ae Mon Sep 17 00:00:00 2001
From: Marvin Poul
Date: Fri, 22 Mar 2024 15:09:48 +0100
Subject: [PATCH 5/7] Fix unit test

---
 tests/job/test_hdf5content.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/job/test_hdf5content.py b/tests/job/test_hdf5content.py
index 6b7e88d6f..0bf4c0d87 100644
--- a/tests/job/test_hdf5content.py
+++ b/tests/job/test_hdf5content.py
@@ -32,7 +32,7 @@ def test_inspect_job(self):
             job_inspect.content.input.__repr__(), job_inspect["input"].__repr__()
         )
         self.assertEqual(
-            sorted(dir(job_inspect.content.input)),
+            sorted(job_inspect.content.input.keys()),
             sorted(job_inspect["input"].list_nodes()
                    + job_inspect["input"].list_groups())
         )

From 3420de744c224a755205bbb1a83dc2182f611cef Mon Sep 17 00:00:00 2001
From: Marvin Poul
Date: Fri, 22 Mar 2024 16:14:44 +0100
Subject: [PATCH 6/7] Add check to not decompress files accidentally

---
 pyiron_base/jobs/job/core.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/pyiron_base/jobs/job/core.py b/pyiron_base/jobs/job/core.py
index 55a5cd613..4045286a0 100644
--- a/pyiron_base/jobs/job/core.py
+++ b/pyiron_base/jobs/job/core.py
@@ -957,7 +957,14 @@ def __getitem__(self, item):
         if value is not None:
             return value
 
-        if item in self.files.list():
+        # only try to read files when no slashes are present:
+        # downstream code will often do something like job['path/to/output'] to check whether certain values exist
+        # and branch on that.  In cases where they don't exist, this would trigger us to decompress the job files in
+        # memory on every check, which slows things down a lot.  Generally these value checks will be of the form
+        # output/.../..., i.e. contain slashes, while file accesses tend to be just the file name without slashes,
+        # so I separate those cases here like this.  In those cases where we actually have subdirectories in the
+        # job folders, we can beef up the file browser.
+        if "/" not in item and item in self.files.list():
             warnings.warn(
                 "Using __getitem__ on a job to access files is deprecated: use job.files instead!",
                 category=DeprecationWarning,

From 9b760bde2b5106b0296b0c834f52483c4a7f590a Mon Sep 17 00:00:00 2001
From: Marvin Poul
Date: Fri, 22 Mar 2024 16:44:14 +0100
Subject: [PATCH 7/7] Move internal function def; add docstring

---
 pyiron_base/jobs/job/core.py | 57 +++++++++++++++++++++++++++---------
 1 file changed, 43 insertions(+), 14 deletions(-)

diff --git a/pyiron_base/jobs/job/core.py b/pyiron_base/jobs/job/core.py
index 4045286a0..bd0bffeef 100644
--- a/pyiron_base/jobs/job/core.py
+++ b/pyiron_base/jobs/job/core.py
@@ -108,21 +108,35 @@
 """
 
 
-def recursive_load_from_hdf(project_hdf5, item):
-    try:
-        group = project_hdf5[item]
-        if (
-            isinstance(group, ProjectHDFio)
-            and "NAME" in group
-            and group["NAME"] == "DataContainer"
-        ):
-            return group.to_object(lazy=True)
-        else:
-            return group
-    except ValueError:
-        pass
-
-    name_lst = item.split("/")
-
+def recursive_load_from_hdf(project_hdf5: ProjectHDFio, item: str):
+    """
+    Load given item from HDF, but check also for DataContainer along the way.
+
+    If `item` exists as-is in HDF, return it; otherwise break it up along every slash, try to load a
+    :class:`~.DataContainer` and then index it with the remainder of the path, i.e.
+
+    >>> recursive_load_from_hdf(hdf, 'my/path/to/value')
+
+    is equivalent to one of (in this order)
+
+    >>> hdf['my/path/to'].to_object()['value']
+    >>> hdf['my/path'].to_object()['to/value']
+    >>> hdf['my'].to_object()['path/to/value']
+
+    in case
+
+    >>> hdf['my/path/to/value']
+
+    doesn't exist.
+
+    Args:
+        project_hdf5 (ProjectHDFio): HDF file to access
+        item (str): path to value, may contain `/`
+
+    Returns:
+        object: whatever was found in the HDF file
+        None: if nothing was found in the HDF file
+    """
+
     def successive_path_splits(name_lst):
         """
         Yield successive split/joins of a path, i.e.
         /a/b/c/d
         gives
         /a/b/c, d
         /a/b, c/d
         /a, b/c/d
         """
         for i in range(1, len(name_lst)):
             # where we are looking for the data container
             container_path = "/".join(name_lst[:-i])
             # where we are looking for data in the container
             data_path = "/".join(name_lst[-i:])
             yield container_path, data_path
 
+    try:
+        group = project_hdf5[item]
+        if (
+            isinstance(group, ProjectHDFio)
+            and "NAME" in group
+            and group["NAME"] == "DataContainer"
+        ):
+            return group.to_object(lazy=True)
+        else:
+            return group
+    except ValueError:
+        pass
+
+    name_lst = item.split("/")
+
     for container_path, data_path in successive_path_splits(name_lst):
         try:
             group = project_hdf5[container_path]
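As a quick sanity check of the split order documented above, successive_path_splits can be exercised standalone; this snippet mirrors the generator from the patch and prints the candidate (container_path, data_path) pairs for the docstring's example path:

    name_lst = "my/path/to/value".split("/")

    def successive_path_splits(name_lst):
        # mirrors the helper inside recursive_load_from_hdf
        for i in range(1, len(name_lst)):
            container_path = "/".join(name_lst[:-i])
            data_path = "/".join(name_lst[-i:])
            yield container_path, data_path

    print(list(successive_path_splits(name_lst)))
    # [('my/path/to', 'value'), ('my/path', 'to/value'), ('my', 'path/to/value')]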