Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Visualize dag #903

Closed
wants to merge 36 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
52850ca
Connect BEE Node to new Workflow
Jul 8, 2024
35a9c51
Add SOME queries to check if wf_id matches workflow
Jul 8, 2024
65c8680
Add checks for workflow ID in dependency queries
Jul 9, 2024
cd20720
Most major checking for wf_id added within cypher / passing from driv…
Jul 9, 2024
3ccb511
Remove and skip over gdb_interface
Jul 11, 2024
f76d101
Changed gdb_driver take wf_id as parameter & wf_interface to call wf_…
Jul 11, 2024
ad4424f
Working for multiple workflows in one database (lacks pause / cancel …
Jul 17, 2024
530db47
Delete beeflow/wf_manager/resources/:
kabir-vats Jul 17, 2024
3f4376a
Add GDB to beeflow client processes, GDB now launches on beeflow core…
Jul 24, 2024
489b4ff
Merge branch 'Multiple-Workflows-One-Database' of github.com:lanl/BEE…
Jul 24, 2024
778e516
Merge branch 'develop' into Multiple-Workflows-One-Database
kabir-vats Jul 24, 2024
b489e6b
change default sleep time for gdb to one second
Jul 25, 2024
5e3b5d9
Restructure dep_manager and separate container, redis, and neo4j from…
Jul 26, 2024
59ad3c5
Merge pull request #891 from lanl/Launch-Deps-Background
kabir-vats Jul 26, 2024
d318d4c
Remove some unncessesary code from wfi/neo4j database and rewrite mos…
Jul 30, 2024
0f61ef1
Fixed linting issues
Jul 30, 2024
0b00ff7
minor linting fixes
Jul 30, 2024
298fb25
Fix failing unit tests
Aug 12, 2024
f20df89
linting
Aug 12, 2024
036bdc8
linting
Aug 12, 2024
7ce1524
linting
Aug 12, 2024
614a7bc
Add noqa to gdb_driver connection and popen
Aug 12, 2024
319ba73
check for neo4j for certs dir, remove debug prints
Aug 13, 2024
20ba911
Document Graph Database Structure
Aug 14, 2024
0b0b61a
Revise GDB Design Documentation
kabir-vats Aug 14, 2024
01e866f
added call to export_dag
Aug 19, 2024
c53de6d
added export_dag function and call to export_dag in driver
Aug 19, 2024
8cc81ad
added export_dag abstract method
Aug 19, 2024
2c16645
added export_dag function
Aug 19, 2024
adbc4a9
added export_dag function
Aug 19, 2024
a040388
subtracted indent
Aug 19, 2024
80aab29
Merge branch 'develop' into visualize-dag
Aug 20, 2024
e2f35ac
initial working graphml commit
Aug 20, 2024
b3dc890
Merge branch 'develop' into visualize-dag to add pre-release version …
Aug 22, 2024
12c3725
saving dags in a dags folder in .beeflow
Aug 22, 2024
6a244f8
export and graph generation first commit
Aug 26, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions beeflow/client/bee_client.py
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The changes made to bee_client.py and wf_utils.py are causing the NOT_RESPONDING state to appear -- I think. But this is my first attempt at having the "beeflow dag" command.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do the steps actually complete and then show NOT_RESPONDING ?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes

Original file line number Diff line number Diff line change
Expand Up @@ -592,6 +592,14 @@ def reexecute(wf_name: str = typer.Argument(..., help='The workflow name'),
logging.info(f'ReExecute Workflow: {resp.text}')
return wf_id

@app.command()
def dag(wf_id: str = typer.Argument(..., callback=match_short_id)):
    """Export a DAG of the workflow to a GraphML file.

    :param wf_id: short workflow ID, expanded/validated by match_short_id
    """
    try:
        # Keep the try body minimal: only the export call can legitimately
        # fail here. Broad Exception catch is acceptable at this CLI
        # boundary since any failure should terminate with a message.
        wf_utils.export_workflow_dag(wf_id)
    except Exception as err:
        error_exit(f"Failed to export DAG: {err}")
    else:
        # Report success only after the export actually completed; the
        # echo itself is outside the try so its errors are not masked
        # as export failures.
        typer.echo(f"DAG for workflow {wf_id} has been exported successfully.")

@app.callback(invoke_without_command=True)
def version_callback(version: bool = False):
Expand Down
81 changes: 23 additions & 58 deletions beeflow/client/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,9 @@
from beeflow.common import paths
from beeflow.wf_manager.resources import wf_utils

from beeflow.common.db import wfm_db
from beeflow.common.db.bdb import connect_db
from beeflow.wf_manager.common import dep_manager
from beeflow.common.deps import container_manager
from beeflow.common.deps import neo4j_manager
from beeflow.common.deps import redis_manager


class ComponentManager:
Expand Down Expand Up @@ -194,49 +194,34 @@ def celery():
"""Start the celery task queue."""
log = open_log('celery')
# Setting --pool=solo to avoid preforking multiple processes
return subprocess.Popen(['celery', '-A', 'beeflow.common.celery', 'worker', '--pool=solo'],
stdout=log, stderr=log)
return subprocess.Popen(['celery', '-A', 'beeflow.common.deps.celery_manager',
'worker', '--pool=solo'], stdout=log, stderr=log)

# Run this before daemonizing in order to avoid slow background start
container_path = paths.redis_container()
# container_path = paths.redis_container()
# If it exists, we assume that it actually has a valid container
if not os.path.exists(container_path):
# if not os.path.exists(container_path):
# print('Unpacking Redis image...')
# subprocess.check_call(['ch-convert', '-i', 'tar', '-o', 'dir',
# bc.get('DEFAULT', 'redis_image'), container_path])
if not container_manager.check_container_dir('redis'):
print('Unpacking Redis image...')
subprocess.check_call(['ch-convert', '-i', 'tar', '-o', 'dir',
bc.get('DEFAULT', 'redis_image'), container_path])
container_manager.create_image('redis')

if not container_manager.check_container_dir('neo4j'):
print('Unpacking Neo4j image...')
container_manager.create_image('neo4j')

@mgr.component('neo4j-database', ('wf_manager',))
def start_neo4j():
    """Start the neo4j graph database.

    Delegates process launch entirely to neo4j_manager.start() and
    returns its result (used by the component manager to track the
    process).
    """
    # NOTE(review): the ('wf_manager',) tuple presumably expresses a
    # dependency relation between this component and wf_manager —
    # confirm against ComponentManager.component's contract.
    return neo4j_manager.start()

@mgr.component('redis', ())
def redis():
def start_redis():
"""Start redis."""
data_dir = 'data'
os.makedirs(os.path.join(paths.redis_root(), data_dir), exist_ok=True)
conf_name = 'redis.conf'
container_path = paths.redis_container()
# Dump the config
conf_path = os.path.join(paths.redis_root(), conf_name)
if not os.path.exists(conf_path):
with open(conf_path, 'w', encoding='utf-8') as fp:
# Don't listen on TCP
print('port 0', file=fp)
print('dir', os.path.join('/mnt', data_dir), file=fp)
print('maxmemory 2mb', file=fp)
print('unixsocket', os.path.join('/mnt', paths.redis_sock_fname()), file=fp)
print('unixsocketperm 700', file=fp)
cmd = [
'ch-run',
f'--bind={paths.redis_root()}:/mnt',
container_path,
'redis-server',
'/mnt/redis.conf',
]
log = open_log('redis')
# Ran into a strange "Failed to configure LOCALE for invalid locale name."
# from Redis, so setting LANG=C. This could have consequences for UTF-8
# strings.
env = dict(os.environ)
env['LANG'] = 'C'
env['LC_ALL'] = 'C'
return subprocess.Popen(cmd, env=env, stdout=log, stderr=log)
return redis_manager.start(log)

# Workflow manager and task manager need to be opened with PIPE for their stdout/stderr
if need_slurmrestd():
Expand Down Expand Up @@ -477,23 +462,6 @@ def stop(query='yes'):
print(f'Beeflow has stopped. Check the log at "{beeflow_log}".')


def kill_active_workflows(active_states, workflow_list):
    """Kill the GDB process of every workflow whose state is active.

    :param active_states: collection of states considered active
    :param workflow_list: iterable of (name, wf_id, state) tuples
    :return: True if every active workflow had a killable process
    """
    db = connect_db(wfm_db, wf_utils.get_db_path())
    success = True
    for name, wf_id, state in workflow_list:
        # Skip workflows that are not in an active state.
        if state not in active_states:
            continue
        pid = db.workflows.get_gdb_pid(wf_id)
        if pid <= 0:
            # Failure most likely caused by an Initializing workflow.
            print(f"No process for {name}, {wf_id}, {state}.")
            success = False
            continue
        dep_manager.kill_gdb(pid)
    return success


def archive_dir(dir_to_archive):
"""Archive directories for archive flag in reset."""
archive_dirs = ['logs', 'container_archive', 'archives', 'workflows']
Expand Down Expand Up @@ -561,9 +529,6 @@ def reset(archive: bool = typer.Option(False, '--archive', '-a',
# Exit out if the user didn't really mean to do a reset
sys.exit()
elif absolutely_sure in ("y", "yes"):
# First stop all active workflow processes
workflow_list = bee_client.get_wf_list()
kill_active_workflows(active_states, workflow_list)
# Stop all of the beeflow processes
stop("quiet")
print("Beeflow is shutting down.")
Expand Down
2 changes: 1 addition & 1 deletion beeflow/common/config_driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -301,7 +301,7 @@ def validate_chrun_opts(opts):
info='HTTPS port used for the graph database')
VALIDATOR.option('graphdb', 'gdb_image_mntdir', default=join_path('/tmp', USER),
info='graph database image mount directory', validator=validation.make_dir)
VALIDATOR.option('graphdb', 'sleep_time', validator=int, default=10,
VALIDATOR.option('graphdb', 'sleep_time', validator=int, default=1,
info='how long to wait for the graph database to come up (this can take a while, '
'depending on the system)')
# Builder
Expand Down
84 changes: 31 additions & 53 deletions beeflow/common/db/wfm_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ class WorkflowInfo:

def __init__(self, db_file):
"""Initialize Info and db file."""
self.Info = namedtuple("Info", "id wfm_port tm_port sched_port num_workflows") # noqa Snake Case
self.Info = namedtuple("Info", "id wfm_port tm_port sched_port num_workflows bolt_port http_port https_port gdb_pid") # noqa Snake Case
self.db_file = db_file

def set_port(self, component, new_port):
Expand All @@ -22,7 +22,7 @@ def get_port(self, component):
"""Return port for the specified component."""
# Need to add code here to make sure we chose a valid component.
stmt = f"SELECT {component}_port FROM info"
result = bdb.getone(self.db_file, stmt)
result = bdb.getone(self.db_file, stmt)[0]
port = result
return port

Expand All @@ -45,6 +45,18 @@ def get_info(self):
info = self.Info(*result)
return info

def get_gdb_pid(self):
    """Return the PID stored for the graph database process."""
    # Single row in the info table; getone returns a tuple.
    return bdb.getone(self.db_file, "SELECT gdb_pid FROM info")[0]

def update_gdb_pid(self, gdb_pid):
    """Record the graph database PID in the info table."""
    bdb.run(self.db_file, "UPDATE info SET gdb_pid=?", [gdb_pid])


class Workflows:
"""Workflow database object."""
Expand All @@ -53,7 +65,7 @@ def __init__(self, db_file):
"""Initialize Task, db_file, and Workflow object."""
self.Task = namedtuple("Task", "id task_id workflow_id name resource state slurm_id") #noqa
self.db_file = db_file
self.Workflow = namedtuple("Workflow", "id workflow_id name state run_dir bolt_port http_port https_port gdb_pid") #noqa
self.Workflow = namedtuple("Workflow", "id workflow_id name state run_dir") #noqa

def get_workflow(self, workflow_id):
"""Return a workflow object."""
Expand All @@ -69,13 +81,11 @@ def get_workflows(self):
workflows = [self.Workflow(*workflow) for workflow in result]
return workflows

def init_workflow(self, workflow_id, name, run_dir, bolt_port, http_port, https_port):
def init_workflow(self, workflow_id, name, run_dir):
"""Insert a new workflow into the database."""
stmt = """INSERT INTO workflows (workflow_id, name, state, run_dir,
bolt_port, http_port, https_port, gdb_pid)
VALUES(?, ?, ?, ?, ?, ?, ?, ?);"""
bdb.run(self.db_file, stmt, [workflow_id, name, 'Initializing', run_dir,
bolt_port, http_port, https_port, -1])
stmt = """INSERT INTO workflows (workflow_id, name, state, run_dir)
VALUES(?, ?, ?, ?);"""
bdb.run(self.db_file, stmt, [workflow_id, name, 'Initializing', run_dir])

def delete_workflow(self, workflow_id):
"""Delete a workflow from the database."""
Expand Down Expand Up @@ -123,42 +133,9 @@ def get_task(self, task_id, workflow_id):
result = bdb.getone(self.db_file, stmt, [task_id, workflow_id])
return result

def get_bolt_port(self, workflow_id):
    """Return the bolt port associated with a workflow."""
    query = "SELECT bolt_port FROM workflows WHERE workflow_id=?"
    # getone returns a row tuple; the port is its only column.
    return bdb.getone(self.db_file, query, [workflow_id])[0]

def get_http_port(self, workflow_id):
    """Return the HTTP port associated with a workflow."""
    stmt = "SELECT http_port FROM workflows WHERE workflow_id=?"
    result = bdb.getone(self.db_file, stmt, [workflow_id])[0]
    http_port = result
    return http_port

def get_https_port(self, workflow_id):
    """Return the HTTPS port associated with a workflow."""
    stmt = "SELECT https_port FROM workflows WHERE workflow_id=?"
    result = bdb.getone(self.db_file, stmt, [workflow_id])[0]
    https_port = result
    return https_port

def get_gdb_pid(self, workflow_id):
    """Return the graph database PID associated with a workflow."""
    stmt = "SELECT gdb_pid FROM workflows WHERE workflow_id=?"
    result = bdb.getone(self.db_file, stmt, [workflow_id])[0]
    gdb_pid = result
    return gdb_pid

def update_gdb_pid(self, workflow_id, gdb_pid):
    """Update the gdb PID associated with a workflow."""
    query = "UPDATE workflows SET gdb_pid=? WHERE workflow_id=?"
    bdb.run(self.db_file, query, [gdb_pid, workflow_id])

def get_run_dir(self, workflow_id):
    """Return the run directory associatedated with a workflow.

    The run_dir column lives in the per-workflow `workflows` table
    (the single-row `info` table has neither a run_dir nor a
    workflow_id column), so the query must target `workflows`.
    """
    stmt = "SELECT run_dir FROM workflows WHERE workflow_id=?"
    result = bdb.getone(self.db_file, stmt, [workflow_id])[0]
    run_dir = result
    return run_dir
Expand All @@ -180,11 +157,8 @@ def _init_tables(self):
workflow_id INTEGER UNIQUE,
name TEXT,
state TEST NOT NULL,
run_dir STR,
bolt_port INTEGER,
http_port INTEGER,
https_port INTEGER,
gdb_pid INTEGER);"""
run_dir STR
);"""

tasks_stmt = """CREATE TABLE IF NOT EXISTS tasks (
id INTEGER PRIMARY KEY,
Expand All @@ -205,17 +179,21 @@ def _init_tables(self):
wfm_port INTEGER,
tm_port INTEGER,
sched_port INTEGER,
num_workflows INTEGER
num_workflows INTEGER,
bolt_port INTEGER,
http_port INTEGER,
https_port INTEGER,
gdb_pid INTEGER
);"""

bdb.create_table(self.db_file, workflows_stmt)
bdb.create_table(self.db_file, tasks_stmt)
if not bdb.table_exists(self.db_file, 'info'):
bdb.create_table(self.db_file, info_stmt)
# insert a new workflow into the database
stmt = """INSERT INTO info (wfm_port, tm_port, sched_port, num_workflows)
VALUES(?, ?, ?, ?);"""
bdb.run(self.db_file, stmt, [-1, -1, -1, 0])
stmt = """INSERT INTO info (wfm_port, tm_port, sched_port, num_workflows,
bolt_port, http_port, https_port, gdb_pid) VALUES(?, ?, ?, ?, ?, ?, ?, ?);"""
bdb.run(self.db_file, stmt, [-1, -1, -1, 0, -1, -1, -1, -1])

@property
def workflows(self):
Expand Down
File renamed without changes.
88 changes: 88 additions & 0 deletions beeflow/common/deps/container_manager.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
#!/usr/bin/env python3

"""Functions for managing the BEE depency container and associated bind mounts."""

import os
import shutil
import subprocess

from beeflow.common.config_driver import BeeConfig as bc
from beeflow.common import paths
from celery import shared_task #noqa pylama can't find celery


class NoContainerRuntime(Exception):
    """Raised when no container runtime (e.g. Charliecloud or Singularity) is installed."""


def check_container_runtime():
    """Verify that a container runtime is installed.

    Only Charliecloud (ch-convert / ch-run) is detected for now.

    :raises NoContainerRuntime: if either Charliecloud tool is missing
    """
    # TODO: needs to support singularity as well
    if shutil.which("ch-convert") is None or shutil.which("ch-run") is None:
        print("ch-convert or ch-run not found. Charliecloud required"
              " for neo4j container.")
        # Carry a diagnostic message instead of an empty string so the
        # exception is meaningful when logged by callers.
        raise NoContainerRuntime('ch-convert or ch-run not found in PATH')


def make_dep_dir():
    """Ensure the BEE dependency container directory exists."""
    bee_workdir = paths.workdir()
    bee_dir = f'{bee_workdir}/deps'
    # exist_ok avoids the check-then-create race (TOCTOU) of testing
    # isdir() before calling makedirs().
    os.makedirs(bee_dir, exist_ok=True)


def get_dep_dir():
    """Return the dependency directory path (with trailing slash)."""
    return f'{paths.workdir()}/deps/'


def get_container_dir(dep_name):
    """Return the dependency container path for *dep_name*."""
    # get_dep_dir() already ends with a slash, so plain concatenation
    # yields e.g. <workdir>/deps/neo4j_container.
    return get_dep_dir() + dep_name + '_container'


def check_container_dir(dep_name):
    """Return True if the container directory for *dep_name* exists."""
    return os.path.isdir(get_container_dir(dep_name))


def create_image(dep_name):
    """Create a new BEE dependency container if one does not exist.

    By default, the container is stored in /tmp/<user>/beeflow/deps.

    :param dep_name: dependency name, e.g. 'redis' or 'neo4j'; the image
        tarball path is read from the '<dep_name>_image' config option
    :raises NoContainerRuntime: (from check_container_runtime) when no
        container runtime is available

    On a failed ch-convert the partial image directory (if any) is
    removed and the function returns without raising.
    """
    # Can throw an exception that needs to be handled by the caller
    check_container_runtime()

    image = bc.get('DEFAULT', dep_name + '_image')

    # Check for BEE dependency container directory:
    if check_container_dir(dep_name):
        print(f"Already have {dep_name} container")
        return

    make_dep_dir()
    container_dir = get_container_dir(dep_name)

    # Build new dependency container
    try:
        subprocess.run(["ch-convert", "-i", "tar", "-o", "dir",
                        str(image), str(container_dir)], check=True)
    except subprocess.CalledProcessError as error:
        print(f"ch-convert failed: {error}")
        # ch-convert may fail before the directory is ever created;
        # only remove it when it exists so a FileNotFoundError from
        # rmtree does not mask the real conversion error.
        if os.path.isdir(container_dir):
            shutil.rmtree(container_dir)
            print(f"{dep_name} container mount directory {container_dir} removed")
        return

    # If neo4j, make the certificates directory
    if dep_name == 'neo4j':
        container_certs_path = os.path.join(container_dir, 'var/lib/neo4j/certificates')
        os.makedirs(container_certs_path, exist_ok=True)
Loading