From 868d123c357cf056e953c30ae509c3a97ab93427 Mon Sep 17 00:00:00 2001
From: Marvin Poul
Date: Tue, 19 Mar 2024 17:23:11 +0100
Subject: [PATCH 1/7] Move recursive loading of DataContainer to HDF5Content

This will allow users and internal code to use job.content[...] if what
they are looking for is saved in HDF5 storage, and it obviates the need
to rely on job[...], which may be slow because it also searches the
compressed job files.

I will need some logic like this to fix the slow loading of
pyiron_atomistics jobs.
---
 pyiron_base/jobs/job/core.py | 113 +++++++++++++++++++----------------
 1 file changed, 62 insertions(+), 51 deletions(-)

diff --git a/pyiron_base/jobs/job/core.py b/pyiron_base/jobs/job/core.py
index ce3a5835d..911da59e8 100644
--- a/pyiron_base/jobs/job/core.py
+++ b/pyiron_base/jobs/job/core.py
@@ -108,6 +108,52 @@
 """
 
 
+def recursive_load_from_hdf(project_hdf5, item):
+    try:
+        group = project_hdf5[item]
+        if (
+            isinstance(group, ProjectHDFio)
+            and "NAME" in group
+            and group["NAME"] == "DataContainer"
+        ):
+            return group.to_object(lazy=True)
+        else:
+            return group
+    except ValueError:
+        pass
+
+    name_lst = item.split("/")
+
+    def successive_path_splits(name_lst):
+        """
+        Yield successive split/joins of a path, i.e.
+        /a/b/c/d
+        gives
+        /a/b/c, d
+        /a/b, c/d
+        /a, b/c/d
+        """
+        for i in range(1, len(name_lst)):
+            # where we are looking for the data container
+            container_path = "/".join(name_lst[:-i])
+            # where we are looking for data in the container
+            data_path = "/".join(name_lst[-i:])
+            yield container_path, data_path
+
+    for container_path, data_path in successive_path_splits(name_lst):
+        try:
+            group = project_hdf5[container_path]
+            if (
+                isinstance(group, ProjectHDFio)
+                and "NAME" in group
+                and group["NAME"] == "DataContainer"
+            ):
+                return group.to_object(lazy=True)[data_path]
+        except (ValueError, IndexError, KeyError):
+            # either group does not contain a data container or it does, but
+            # it does not have the path we're looking for
+            pass
+
+
 class JobCore(HasGroups):
     __doc__ = (
         """
@@ -906,52 +952,10 @@ def __getitem__(self, item):
         Returns:
             dict, list, float, int, :class:`.DataContainer`, None: data or data object; if nothing is found None is returned
         """
         # first try to access HDF5 directly to make the common case fast
-        try:
-            group = self._hdf5[item]
-            if (
-                isinstance(group, ProjectHDFio)
-                and "NAME" in group
-                and group["NAME"] == "DataContainer"
-            ):
-                return group.to_object(lazy=True)
-            else:
-                return group
-        except ValueError:
-            pass
-
-        name_lst = item.split("/")
-
-        def successive_path_splits(name_lst):
-            """
-            Yield successive split/joins of a path, i.e.
-            /a/b/c/d
-            gives
-            /a/b/c, d
-            /a/b, c/d
-            /a, b/c/d
-            """
-            for i in range(1, len(name_lst)):
-                # where we are looking for the data container
-                container_path = "/".join(name_lst[:-i])
-                # where we are looking for data in the container
-                data_path = "/".join(name_lst[-i:])
-                yield container_path, data_path
-
-        for container_path, data_path in successive_path_splits(name_lst):
-            try:
-                group = self._hdf5[container_path]
-                if (
-                    isinstance(group, ProjectHDFio)
-                    and "NAME" in group
-                    and group["NAME"] == "DataContainer"
-                ):
-                    return group.to_object(lazy=True)[data_path]
-            except (ValueError, IndexError, KeyError):
-                # either group does not contain a data container or it does, but
-                # it does not have the path we're looking for
-                pass
+        value = recursive_load_from_hdf(self._project_hdf5, item)
+        if value is not None:
+            return value
 
         if item in self.files.list():
             warnings.warn(
@@ -1089,7 +1093,6 @@ def __getattr__(self, name):
 
     def __repr__(self):
        return f"{self.__class__.__name__}({repr(self._job_dict)})"
 
-
 class HDF5Content(object):
     """
     Access the HDF5 file of the job
@@ -1099,12 +1102,20 @@ def __init__(self, project_hdf5):
         self._project_hdf5 = project_hdf5
 
     def __getattr__(self, name):
-        if name in self._project_hdf5.list_nodes():
-            return self._project_hdf5.__getitem__(name)
-        elif name in self._project_hdf5.list_groups():
-            return HDF5Content(self._project_hdf5.__getitem__(name))
+        try:
+            return self[name]
+        except KeyError:
+            raise AttributeError(name) from None
+
+    def __getitem__(self, item):
+        value = recursive_load_from_hdf(self._project_hdf5, item)
+        if value is not None:
+            return value
+
+        if item in self._project_hdf5.list_groups():
+            return HDF5Content(self._project_hdf5[item])
         else:
-            raise AttributeError
+            raise KeyError(item)
 
     def __dir__(self):
         return self._project_hdf5.list_nodes() + self._project_hdf5.list_groups()

From cf8bc28b6080dc8298b708a16ff16e1067621e2d Mon Sep 17 00:00:00 2001
From: Marvin Poul
Date: Tue, 19 Mar 2024 23:07:53 +0100
Subject: [PATCH 2/7] Fix typo

---
 pyiron_base/jobs/job/core.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyiron_base/jobs/job/core.py b/pyiron_base/jobs/job/core.py
index 911da59e8..d6fc89055 100644
--- a/pyiron_base/jobs/job/core.py
+++ b/pyiron_base/jobs/job/core.py
@@ -953,7 +953,7 @@ def __getitem__(self, item):
             dict, list, float, int, :class:`.DataContainer`, None: data or data object; if nothing is found None is returned
         """
         # first try to access HDF5 directly to make the common case fast
-        value = recursive_load_from_hdf(self._project_hdf5, item)
+        value = recursive_load_from_hdf(self._hdf5, item)
         if value is not None:
             return value

From b93ad8fbeeaca32b9b66a5d6a56857e4b89b78fa Mon Sep 17 00:00:00 2001
From: Marvin Poul
Date: Tue, 19 Mar 2024 23:57:18 +0100
Subject: [PATCH 3/7] redefine missing var

---
 pyiron_base/jobs/job/core.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyiron_base/jobs/job/core.py b/pyiron_base/jobs/job/core.py
index d6fc89055..55a5cd613 100644
--- a/pyiron_base/jobs/job/core.py
+++ b/pyiron_base/jobs/job/core.py
@@ -964,6 +964,7 @@ def __getitem__(self, item):
             )
             return _job_read_file(self, item)
 
+        name_lst = item.split("/")
         item_obj = name_lst[0]
         if item_obj in self._list_ext_childs():
             # ToDo: Murn['strain_0.9'] - searches the HDF5 file, but the corresponding groups do not exist there yet.
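For context on what the refactoring above enables: recursive_load_from_hdf lets a single job.content[...] access resolve values nested inside stored DataContainer objects. The sketch below illustrates the intended access pattern; the project and job names are hypothetical, and it assumes a finished job whose output was written through a DataContainer:

    from pyiron_base import Project

    pr = Project("demo")        # hypothetical project name
    job = pr.load("my_job")     # hypothetical job with DataContainer output

    # a plain HDF5 node is returned directly
    status = job.content["status"]

    # a value nested inside a stored DataContainer is resolved by splitting
    # the path: hdf["output/data/energy"] is tried first, then
    # hdf["output/data"].to_object()["energy"], then
    # hdf["output"].to_object()["data/energy"]
    energy = job.content["output/data/energy"]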
From 5dfd269273fd23db3ee67d3db35207001c33ecab Mon Sep 17 00:00:00 2001
From: Marvin Poul
Date: Fri, 22 Mar 2024 15:01:54 +0100
Subject: [PATCH 4/7] Rename test case

Old name made no sense.
---
 tests/job/test_hdf5content.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/job/test_hdf5content.py b/tests/job/test_hdf5content.py
index 80cfd4dec..6b7e88d6f 100644
--- a/tests/job/test_hdf5content.py
+++ b/tests/job/test_hdf5content.py
@@ -8,7 +8,7 @@
 from pyiron_base._tests import PyironTestCase
 
 
-class DatabasePropertyIntegration(PyironTestCase):
+class InspectTest(PyironTestCase):
     @classmethod
     def setUpClass(cls):
         cls.file_location = os.path.dirname(os.path.abspath(__file__))

From c931e3190d851a0d1ac130a85fa5936b3766f0ae Mon Sep 17 00:00:00 2001
From: Marvin Poul
Date: Fri, 22 Mar 2024 15:09:48 +0100
Subject: [PATCH 5/7] Fix unit test

---
 tests/job/test_hdf5content.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/job/test_hdf5content.py b/tests/job/test_hdf5content.py
index 6b7e88d6f..0bf4c0d87 100644
--- a/tests/job/test_hdf5content.py
+++ b/tests/job/test_hdf5content.py
@@ -32,7 +32,7 @@ def test_inspect_job(self):
             job_inspect.content.input.__repr__(), job_inspect["input"].__repr__()
         )
         self.assertEqual(
-            sorted(dir(job_inspect.content.input)),
+            sorted(job_inspect.content.input.keys()),
             sorted(job_inspect["input"].list_nodes()
                    + job_inspect["input"].list_groups())
         )

From 3420de744c224a755205bbb1a83dc2182f611cef Mon Sep 17 00:00:00 2001
From: Marvin Poul
Date: Fri, 22 Mar 2024 16:14:44 +0100
Subject: [PATCH 6/7] Add check to not decompress files accidentally

---
 pyiron_base/jobs/job/core.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/pyiron_base/jobs/job/core.py b/pyiron_base/jobs/job/core.py
index 55a5cd613..4045286a0 100644
--- a/pyiron_base/jobs/job/core.py
+++ b/pyiron_base/jobs/job/core.py
@@ -957,7 +957,14 @@ def __getitem__(self, item):
         if value is not None:
             return value
 
-        if item in self.files.list():
+        # only try to read files when no slashes are present:
+        # downstream code will often do something like job['path/to/output'] to check whether certain values exist
+        # and branch on that.  In cases where they don't exist, this would trigger us to decompress the job files in
+        # memory on every check, which slows things down a lot.  Generally these value checks will be of the form
+        # output/.../..., i.e. contain slashes, while file accesses tend to be just the file name without slashes,
+        # so I separate those cases here like this.  In those cases where we actually have subdirectories in the
+        # job folders, we can beef up the file browser.
+        if "/" not in item and item in self.files.list():
             warnings.warn(
                 "Using __getitem__ on a job to access files is deprecated: use job.files instead!",
                 category=DeprecationWarning,

From 9b760bde2b5106b0296b0c834f52483c4a7f590a Mon Sep 17 00:00:00 2001
From: Marvin Poul
Date: Fri, 22 Mar 2024 16:44:14 +0100
Subject: [PATCH 7/7] Move internal function def; add docstring

---
 pyiron_base/jobs/job/core.py | 57 +++++++++++++++++++++++++++---------
 1 file changed, 43 insertions(+), 14 deletions(-)

diff --git a/pyiron_base/jobs/job/core.py b/pyiron_base/jobs/job/core.py
index 4045286a0..bd0bffeef 100644
--- a/pyiron_base/jobs/job/core.py
+++ b/pyiron_base/jobs/job/core.py
@@ -108,21 +108,35 @@
 """
 
 
-def recursive_load_from_hdf(project_hdf5, item):
-    try:
-        group = project_hdf5[item]
-        if (
-            isinstance(group, ProjectHDFio)
-            and "NAME" in group
-            and group["NAME"] == "DataContainer"
-        ):
-            return group.to_object(lazy=True)
-        else:
-            return group
-    except ValueError:
-        pass
-
-    name_lst = item.split("/")
-
+def recursive_load_from_hdf(project_hdf5: ProjectHDFio, item: str):
+    """
+    Load given item from HDF, but check also for DataContainer along the way.
+
+    If `item` exists as-is in HDF, return it; otherwise break it up along every slash, try to load a
+    :class:`~.DataContainer` and then index it with the remainder of the path, i.e.
+
+    >>> recursive_load_from_hdf(hdf, 'my/path/to/value')
+
+    is equivalent to one of (in this order)
+
+    >>> hdf['my/path/to'].to_object()['value']
+    >>> hdf['my/path'].to_object()['to/value']
+    >>> hdf['my'].to_object()['path/to/value']
+
+    in case
+
+    >>> hdf['my/path/to/value']
+
+    doesn't exist.
+
+    Args:
+        project_hdf5 (ProjectHDFio): HDF file to access
+        item (str): path to value, may contain `/`
+
+    Returns:
+        object: whatever was found in the HDF file
+        None: if nothing was found in the HDF file
+    """
+
     def successive_path_splits(name_lst):
         """
         Yield successive split/joins of a path, i.e.
         /a/b/c/d
         gives
         /a/b/c, d
         /a/b, c/d
         /a, b/c/d
         """
         for i in range(1, len(name_lst)):
             # where we are looking for the data container
             container_path = "/".join(name_lst[:-i])
             # where we are looking for data in the container
             data_path = "/".join(name_lst[-i:])
             yield container_path, data_path
 
+    try:
+        group = project_hdf5[item]
+        if (
+            isinstance(group, ProjectHDFio)
+            and "NAME" in group
+            and group["NAME"] == "DataContainer"
+        ):
+            return group.to_object(lazy=True)
+        else:
+            return group
+    except ValueError:
+        pass
+
+    name_lst = item.split("/")
+
     for container_path, data_path in successive_path_splits(name_lst):
         try:
             group = project_hdf5[container_path]
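As a quick sanity check of the split order documented above, successive_path_splits can be exercised standalone; this snippet mirrors the generator from the patch and prints the candidate (container_path, data_path) pairs for the docstring's example path:

    name_lst = "my/path/to/value".split("/")

    def successive_path_splits(name_lst):
        # mirrors the helper inside recursive_load_from_hdf
        for i in range(1, len(name_lst)):
            container_path = "/".join(name_lst[:-i])
            data_path = "/".join(name_lst[-i:])
            yield container_path, data_path

    print(list(successive_path_splits(name_lst)))
    # [('my/path/to', 'value'), ('my/path', 'to/value'), ('my', 'path/to/value')]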