
Units tests changes #134

Merged · 34 commits · Jan 21, 2025
Commits
8c74f57 · updated plugins and sqlite backend with units stored as dict and rais… (Dec 15, 2024)
ac0382c · core raises error if mismatched units for same column and updated bac… (Dec 15, 2024)
82e8948 · updated documentation and added new github CI files with updated DSI … (Dec 15, 2024)
0eb9f32 · added h5 and metadata readers, included systemkernel tests in test_env (Dec 18, 2024)
101b4f2 · added import json in test_env.py (Vedant1, Dec 18, 2024)
b1094ae · uncommented a line in one test in test_env.py (Vedant1, Dec 18, 2024)
5f534a9 · added h5py to requirements.txt (Vedant1, Dec 18, 2024)
79f4e7f · changed output_collector key name (Vedant1, Dec 18, 2024)
5c2c213 · commented out a systemkernel test function with error (Vedant1, Dec 18, 2024)
3f8c96c · commented out another systemkernel test with error (Vedant1, Dec 18, 2024)
5c12f22 · specified interactive jupyter notebook FALSE (Vedant1, Dec 18, 2024)
36b3786 · updated inspect_artifacts call in test_sqlite.py (Vedant1, Dec 18, 2024)
1b71c08 · created CI file for sqlalchemy test (Vedant1, Dec 18, 2024)
dc108bb · removed unused imports in sqlalchemy.py (Vedant1, Dec 18, 2024)
f613c63 · updated all CI files with new requirements.extra.txt file (Vedant1, Dec 18, 2024)
2c5150f · requirements.txt just has base imports. extras has large imports whic… (Vedant1, Dec 18, 2024)
1e640b7 · removed h5 reader as it is not needed (Vedant1, Dec 18, 2024)
9769783 · add initial reader docs (Vedant1, Dec 19, 2024)
faf7409 · add type checking documentation stub for plugins (Vedant1, Dec 19, 2024)
ee74c66 · refine reader docs (Aug 2, 2023)
68ab7ca · formatting changes (Aug 2, 2023)
86c1a53 · added loading, contributing, etc. (Aug 8, 2023)
b67ac13 · Update readers documentation with alternate add_rows() (Vedant1, Dec 15, 2024)
593dfa8 · added exception for pragma (jpulidojr, Dec 12, 2024)
5541771 · wildfire core terminal 'get' example (jpulidojr, Dec 12, 2024)
11b24fe · Additional sqlalchemy tests to match the sqlite tests (hugegreenbug, Dec 5, 2024)
7257c7e · added wildfire query example to return table with col names (jpulidojr, Dec 17, 2024)
da92de6 · incrementing version for follow-up publish (jpulidojr, Dec 17, 2024)
d7dae73 · inlining git dependency (jpulidojr, Dec 17, 2024)
ad0a7a0 · inline git import (jpulidojr, Dec 17, 2024)
7558476 · Merge branch 'main' into units_tests_changes (Vedant1, Dec 19, 2024)
bac716e · deleted old example data files (Vedant1, Jan 13, 2025)
3d1e25f · updated metadata plugin reader (Vedant1, Jan 13, 2025)
9e86f0e · updated coreterminal to comment out metadata reader test (Vedant1, Jan 14, 2025)
34 changes: 34 additions & 0 deletions .github/workflows/test_core.yml
@@ -0,0 +1,34 @@
name: core.py test

on:
push:
branches:
- main
pull_request:
branches:
- main


jobs:
linux:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ['3.11']

steps:
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
pip install -r requirements.extras.txt
pip install .
- name: Test reader
run: |
pip install pytest
pytest dsi/tests/test_core.py
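
Note: this new workflow mirrors the existing reader/writer CI jobs: install the base requirements, the extras, and the package itself, then run a single pytest target. The job can be reproduced locally with a short driver such as this sketch (it assumes the repository root as the working directory and that both requirements files are already installed):

    import pytest

    # Run the same test file the workflow targets, and exit with pytest's status code.
    raise SystemExit(pytest.main(["dsi/tests/test_core.py"]))
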
34 changes: 34 additions & 0 deletions .github/workflows/test_env.yml
@@ -0,0 +1,34 @@
name: env.py test

on:
push:
branches:
- main
pull_request:
branches:
- main


jobs:
linux:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ['3.11']

steps:
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
pip install -r requirements.extras.txt
pip install .
- name: Test reader
run: |
pip install pytest
pytest dsi/plugins/tests/test_env.py
3 changes: 1 addition & 2 deletions .github/workflows/test_file_reader.yml
@@ -9,7 +9,6 @@ on:
- main



jobs:
linux:
runs-on: ubuntu-latest
@@ -27,8 +26,8 @@ jobs:
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
pip install -r requirements.extras.txt
pip install .
pip install graphviz
- name: Test reader
run: |
pip install pytest
3 changes: 1 addition & 2 deletions .github/workflows/test_file_writer.yml
@@ -26,9 +26,8 @@ jobs:
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
python -m pip install opencv-python
pip install -r requirements.extras.txt
pip install .
pip install graphviz
sudo apt-get install graphviz
- name: Test reader
run: |
34 changes: 34 additions & 0 deletions .github/workflows/test_sqlalchemy.yml
@@ -0,0 +1,34 @@
name: sqlalchemy.py test

on:
push:
branches:
- main
pull_request:
branches:
- main


jobs:
linux:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ['3.11']

steps:
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
pip install -r requirements.extras.txt
pip install .
- name: Test reader
run: |
pip install pytest
pytest dsi/backends/tests/test_sqlalchemy.py
2 changes: 1 addition & 1 deletion .github/workflows/test_sqlite.yml
@@ -26,8 +26,8 @@ jobs:
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
pip install -r requirements.extras.txt
pip install .
pip install ipykernel
- name: Test reader
run: |
pip install pytest
1 change: 1 addition & 0 deletions docs/index.rst
@@ -12,6 +12,7 @@ The Data Science Infrastructure Project (DSI)

introduction
installation
contributing_readers
plugins
backends
core
2 changes: 1 addition & 1 deletion docs/introduction.rst
@@ -1,7 +1,7 @@



The goal of the Data Science Infrastructure Project (DSI) is to manage data through metadata capture and curation. DSI capabilities can be used to develop workflows to support management of simulation data, AI/ML approaches, ensemble data, and other sources of data typically found in scientific computing. DSI infrastructure is designed to be flexible and with these considerations in mind:
The goal of the Data Science Infrastructure Project (DSI) is to manage data through metadata capture and curation. DSI capabilities can be used to develop workflows to support management of simulation data, AI/ML approaches, ensemble data, and other sources of data typically found in scientific computing. DSI infrastructure is designed to be flexible and with these considerations in mind:

- Data management is subject to strict, POSIX-enforced, file security.
- DSI capabilities support a wide range of common metadata queries.
5 changes: 0 additions & 5 deletions dsi/backends/sqlalchemy.py
@@ -8,11 +8,6 @@
from sqlalchemy.orm import relationship
from sqlalchemy import create_engine
from sqlalchemy.orm import Session
import csv
import json
import re
import yaml
import toml

from dsi.backends.filesystem import Filesystem

43 changes: 25 additions & 18 deletions dsi/backends/sqlite.py
@@ -170,13 +170,18 @@ def put_artifacts(self, collection, isVerbose=False):
self.cur.execute(create_query)
for tableName, tableData in artifacts["dsi_units"].items():
if len(tableData) > 0:
for col_unit_pair in tableData:
str_query = f'INSERT OR IGNORE INTO dsi_units VALUES ("{tableName}", "{col_unit_pair[0]}", "{col_unit_pair[1]}")'
try:
self.cur.execute(str_query)
except sqlite3.Error as e:
for col, unit in tableData.items():
str_query = f'INSERT INTO dsi_units VALUES ("{tableName}", "{col}", "{unit}")'
unit_result = self.cur.execute(f"SELECT unit FROM dsi_units WHERE column = '{col}';").fetchone()
if unit_result and unit_result[0] != unit:
self.con.rollback()
return e
return f"Cannot ingest different units for the column {col} in {tableName}"
elif not unit_result:
try:
self.cur.execute(str_query)
except sqlite3.Error as e:
self.con.rollback()
return e

try:
self.con.commit()
@@ -218,10 +223,11 @@ def get_artifacts(self, query, isVerbose=False, dict_return = False):
else:
return data

def inspect_artifacts(self, collection, interactive=False):
def inspect_artifacts(self, interactive=False):
import nbconvert as nbc
import nbformat as nbf
dsi_relations, dsi_units = None, None
collection = self.read_to_artifact(only_units_relations=True)
if "dsi_relations" in collection.keys():
dsi_relations = dict(collection["dsi_relations"])
if "dsi_units" in collection.keys():
@@ -319,7 +325,7 @@ def inspect_artifacts(self, collection, interactive=False):
fh.write(html_content)

# SQLITE READER FUNCTION
def read_to_artifact(self):
def read_to_artifact(self, only_units_relations = False):
artifact = OrderedDict()
artifact["dsi_relations"] = OrderedDict([("primary_key",[]), ("foreign_key", [])])

@@ -340,14 +346,15 @@ def read_to_artifact(self):
if colInfo[5] == 1:
pkList.append((tableName, colInfo[1]))

data = self.cur.execute(f"SELECT * FROM {tableName};").fetchall()
for row in data:
for colName, val in zip(colDict.keys(), row):
if val == "NULL":
colDict[colName].append(None)
else:
colDict[colName].append(val)
artifact[tableName] = colDict
if only_units_relations == False:
data = self.cur.execute(f"SELECT * FROM {tableName};").fetchall()
for row in data:
for colName, val in zip(colDict.keys(), row):
if val == "NULL":
colDict[colName].append(None)
else:
colDict[colName].append(val)
artifact[tableName] = colDict

fkData = self.cur.execute(f"PRAGMA foreign_key_list({tableName});").fetchall()
for row in fkData:
@@ -372,8 +379,8 @@ def read_units_helper(self):
for row in unitsTable:
tableName = row[0]
if tableName not in unitsDict.keys():
unitsDict[tableName] = []
unitsDict[tableName].append((row[1], row[2]))
unitsDict[tableName] = {}
unitsDict[tableName][row[1]] = row[2]
return unitsDict

# Closes connection to server
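The sqlite.py changes above rework how units are stored and validated: dsi_units now maps each table to a column-to-unit dictionary instead of a list of (column, unit) tuples, put_artifacts returns an error string when a column is re-ingested with a conflicting unit, and read_to_artifact gains an only_units_relations flag so inspect_artifacts can pull units and relations directly from the database. A minimal sketch of the new shape and the conflict rule (the table, columns, and units here are hypothetical):

    from collections import OrderedDict

    # Old shape: {"wildfire": [("temperature", "C"), ("wind_speed", "m/s")]}
    # New shape: one unit per column, keyed by column name.
    dsi_units = OrderedDict()
    dsi_units["wildfire"] = {"temperature": "C", "wind_speed": "m/s"}

    # Re-ingesting {"temperature": "F"} for the same table now returns
    # "Cannot ingest different units for the column temperature in wildfire"
    # rather than silently inserting a second, conflicting row.
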
2 changes: 1 addition & 1 deletion dsi/backends/tests/test_sqlite.py
@@ -67,7 +67,7 @@ def test_artifact_inspect():
os.remove(dbpath)
store = Sqlite(dbpath, run_table=False)
store.put_artifacts(valid_middleware_datastructure)
store.inspect_artifacts(valid_middleware_datastructure)
store.inspect_artifacts()
assert True

def test_artifact_read():
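Since inspect_artifacts now reads units and relations back from the database itself (via read_to_artifact(only_units_relations=True)), the test no longer passes the in-memory collection. A rough usage sketch under the new signature (the filename is a placeholder):

    from dsi.backends.sqlite import Sqlite

    store = Sqlite("example.db", run_table=False)        # placeholder database path
    store.put_artifacts(valid_middleware_datastructure)  # ingest data first
    store.inspect_artifacts()  # notebook is generated from the stored data
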
46 changes: 16 additions & 30 deletions dsi/core.py
@@ -24,7 +24,7 @@ class Terminal():
BACKEND_IMPLEMENTATIONS = ['gufi', 'sqlite', 'parquet']
PLUGIN_PREFIX = ['dsi.plugins']
PLUGIN_IMPLEMENTATIONS = ['env', 'file_reader', 'file_writer']
VALID_PLUGINS = ['Hostname', 'SystemKernel', 'GitInfo', 'Bueno', 'Csv', 'ER_Diagram', 'YAML1', 'TOML1', "Table_Plot", "Schema", "Csv_Writer"]
VALID_PLUGINS = ['Hostname', 'SystemKernel', 'GitInfo', 'Bueno', 'Csv', 'ER_Diagram', 'YAML1', 'TOML1', "Table_Plot", "Schema", "Csv_Writer", "MetadataReader1"]
VALID_BACKENDS = ['Gufi', 'Sqlite', 'Parquet']
VALID_MODULES = VALID_PLUGINS + VALID_BACKENDS
VALID_MODULE_FUNCTIONS = {'plugin': ['reader', 'writer'],
@@ -151,10 +151,14 @@ def load_module(self, mod_type, mod_name, mod_function, **kwargs):
for colName, colData in table_metadata.items():
if colName in self.active_metadata[table_name].keys() and table_name != "dsi_units":
self.active_metadata[table_name][colName] += colData
elif colName not in self.active_metadata[table_name].keys():# and table_name == "dsi_units":
elif colName in self.active_metadata[table_name].keys() and table_name == "dsi_units":
for key, col_unit in colData.items():
if key not in self.active_metadata[table_name][colName]:
self.active_metadata[table_name][colName][key] = col_unit
elif key in self.active_metadata[table_name][colName] and self.active_metadata[table_name][colName][key] != col_unit:
raise ValueError(f"Cannot have a different set of units for column {key} in {colName}")
elif colName not in self.active_metadata[table_name].keys():
self.active_metadata[table_name][colName] = colData
# elif colName not in self.active_metadata[table_name].keys() and table_name != "dsi_units":
# raise ValueError(f"Mismatched column input for table {table_name}")
elif mod_type == "backend":
if "run_table" in class_.__init__.__code__.co_varnames:
kwargs['run_table'] = self.runTable
@@ -207,7 +211,6 @@ def add_external_python_module(self, mod_type, mod_name, mod_path):

term = Terminal()
term.add_external_python_module('plugin', 'my_python_file',

'/the/path/to/my_python_file.py')

term.load_module('plugin', 'MyPlugin', 'reader')
@@ -270,7 +273,8 @@ def artifact_handler(self, interaction_type, query = None, **kwargs):

if interaction_type in ['put', 'set'] and module_type == 'back-write':
if self.backup_db_flag == True and os.path.getsize(obj.filename) > 100:
backup_file = obj.filename[:obj.filename.rfind('.')] + "_backup" + obj.filename[obj.filename.rfind('.'):]
formatted_datetime = datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
backup_file = obj.filename[:obj.filename.rfind('.')] + "_backup_" + formatted_datetime + obj.filename[obj.filename.rfind('.'):]
shutil.copyfile(obj.filename, backup_file)
errorMessage = obj.put_artifacts(collection = self.active_metadata, **kwargs)
if errorMessage is not None:
@@ -284,28 +288,20 @@ def artifact_handler(self, interaction_type, query = None, **kwargs):
self.logger.info(f"Query to get data: {query}")
kwargs['query'] = query
get_artifact_data = obj.get_artifacts(**kwargs)
# else:
# #raise ValueError("Need to specify a query of the database to return data")
# # This is a valid use-case, may give a warning for now
# get_artifact_data = obj.get_artifacts(**kwargs)
operation_success = True

elif interaction_type == 'inspect':
# if module_type == 'back-write':
# errorMessage = obj.put_artifacts(
# collection=self.active_metadata, **kwargs)
# if errorMessage is not None:
# print("Error in ingesting data to db in inspect artifact handler. Generating Jupyter notebook with previous instance of db")
if not self.active_metadata:
raise ValueError("Error in inspect artifact handler: Need to ingest data to DSI abstraction before generating Jupyter notebook")
obj.inspect_artifacts(collection=self.active_metadata, **kwargs)
operation_success = True
if os.path.getsize(obj.filename) > 100:
obj.inspect_artifacts(**kwargs)
operation_success = True
else:
raise ValueError("Error in inspect artifact handler: Need to ingest data into a backend before generating Jupyter notebook")

elif interaction_type == "read" and module_type == 'back-read':
self.active_metadata = obj.read_to_artifact()
operation_success = True
elif interaction_type == "read" and module_type == 'back-write':
raise ValueError("Can only call read to artifact handler with a back-READ backend")
raise ValueError("Can only call read artifact handler with a back-READ backend")

end = datetime.now()
self.logger.info(f"Runtime: {end-start}")
@@ -332,16 +328,6 @@ def update_abstraction(self, table_name, table_data):
if not isinstance(table_data, OrderedDict):
raise ValueError("table_data needs to be in the form of an Ordered Dictionary")
self.active_metadata[table_name] = table_data

#allow more plugins to be loaded and can call transload again
# self.transload_lock = False

#need to unload all loaded plugins to prevent duplicate reading when transload called again
# mods = self.active_modules
# for obj in mods['reader']:
# self.unload_module('plugin', obj.__class__.__name__, "reader")
# for obj in mods['writer']:
# self.unload_module('plugin', obj.__class__.__name__, "writer")


class Sync():
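Two behavioral changes in core.py stand out. First, load_module now merges dsi_units from multiple readers column by column and raises a ValueError if the same column arrives with a different unit; second, artifact_handler timestamps backup files (name_backup_YYYY-MM-DD_HH:MM:SS.ext) instead of overwriting a single _backup copy. MetadataReader1 also joins VALID_PLUGINS, so it should be loadable with term.load_module('plugin', 'MetadataReader1', 'reader'). The merge rule, restated as a standalone sketch (table and column names are hypothetical):

    def merge_units(active, incoming, table_name):
        # Both dicts map column name -> unit for one table in dsi_units.
        for col, unit in incoming.items():
            if col not in active:
                active[col] = unit
            elif active[col] != unit:
                raise ValueError(
                    f"Cannot have a different set of units for column {col} in {table_name}")

    units = {"temperature": "C"}
    merge_units(units, {"temperature": "C", "wind_speed": "m/s"}, "wildfire")
    # merge_units(units, {"temperature": "F"}, "wildfire") would raise ValueError
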
2 changes: 1 addition & 1 deletion dsi/plugins/env.py
@@ -7,7 +7,7 @@

from dsi.plugins.metadata import StructuredMetadata
from dsi.plugins.plugin_models import (
GitInfoModel, HostnameModel, SystemKernelModel
EnvironmentModel, GitInfoModel, HostnameModel, SystemKernelModel, create_dynamic_model
)

