From 90f12912c0c0a7cc13c4daf12b20c945efff420b Mon Sep 17 00:00:00 2001
From: Charles Cowart <ccowart@ucsd.edu>
Date: Wed, 24 Jul 2024 19:08:41 -0700
Subject: [PATCH] Migrate ConvertJob to use Jinja2 template.

---
 sequence_processing_pipeline/ConvertJob.py    | 162 ++++++++++--------
 sequence_processing_pipeline/NuQCJob.py       |   3 +-
 .../templates/convert_job.sh                  |  17 ++
 .../tests/test_ConvertJob.py                  |  21 ++-
 4 files changed, 121 insertions(+), 82 deletions(-)
 create mode 100644 sequence_processing_pipeline/templates/convert_job.sh

diff --git a/sequence_processing_pipeline/ConvertJob.py b/sequence_processing_pipeline/ConvertJob.py
index 3d8d2244..0a4ed289 100644
--- a/sequence_processing_pipeline/ConvertJob.py
+++ b/sequence_processing_pipeline/ConvertJob.py
@@ -1,9 +1,32 @@
-from os.path import join, exists
 from sequence_processing_pipeline.Job import Job
 from sequence_processing_pipeline.PipelineError import (PipelineError,
                                                         JobFailedError)
 import logging
 import re
+from jinja2 import BaseLoader, Environment, TemplateNotFound
+import pathlib
+from os.path import join, exists, getmtime
+
+
+# taken from https://jinja.palletsprojects.com/en/3.0.x/api/#jinja2.BaseLoader
+class KISSLoader(BaseLoader):  # minimal file-system loader for Jinja2
+    def __init__(self, path):  # path: directory relative to this module
+        # anchor the template directory to the directory containing this
+        # file (the sequence_processing_pipeline package) so that lookups
+        # do not depend on the caller's current working directory.
+        self.path = join(pathlib.Path(__file__).parent.resolve(), path)
+
+    def get_source(self, environment, template):  # BaseLoader hook
+        path = join(self.path, template)
+        if not exists(path):
+            raise TemplateNotFound(template)
+        mtime = getmtime(path)  # snapshot used by the reload check below
+        with open(path) as f:
+            source = f.read()
+        return source, path, lambda: mtime == getmtime(path)
+
+
+logging.basicConfig(level=logging.DEBUG)
 
 
 class ConvertJob(Job):
@@ -39,12 +62,23 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name,
         self.node_count = node_count
         self.nprocs = nprocs
         self.wall_time_limit = wall_time_limit
+
+        # TODO: This value is currently a string e.g.: '1gb' or '10gb' read
+        #  in from the configuration json file. However this param should be
+        #  changed to process_mem_in_gb or similar and the string changed to
+        #  a numerical value.
         self.pmem = pmem
         self.bcl_tool = bcl_tool_path
         self.qiita_job_id = qiita_job_id
+        # path the rendered job script is written to.
         self.job_script_path = join(self.output_path, f"{self.job_name}.sh")
         self.suffix = 'fastq.gz'
 
+        # for projects that use sequence_processing_pipeline as a dependency,
+        # jinja_env must be set to sequence_processing_pipeline's root path,
+        # rather than the project's root path.
+        self.jinja_env = Environment(loader=KISSLoader('templates'))
+
         tmp = False
         for executable_name in ['bcl2fastq', 'bcl-convert']:
             if executable_name in self.bcl_tool:
@@ -63,76 +97,62 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name,
         self._generate_job_script()
 
     def _generate_job_script(self):
-        """
-        Generate a Torque job script for processing supplied root_directory.
-        :return: The path to the newly-created job-script.
-        """
-        lines = []
-
-        lines.append("#!/bin/bash")
-        lines.append(f"#SBATCH --job-name {self.qiita_job_id}_{self.job_name}")
-        lines.append(f"#SBATCH -p {self.queue_name}")
-        lines.append(f'#SBATCH -N {self.node_count}')
-        lines.append(f'#SBATCH -n {self.nprocs}')
-        lines.append("#SBATCH --time %d" % self.wall_time_limit)
-
-        # send an email to the list of users defined below when a job starts,
-        # terminates, or aborts. This is used to confirm that the package's
-        # own reporting mechanism is reporting correctly.
-        lines.append("#SBATCH --mail-type=ALL")
-
-        # list of users to be contacted independently of this package's
-        # notification system, when a job starts, terminates, or gets aborted.
-        lines.append("#SBATCH --mail-user qiita.help@gmail.com")
-
-        lines.append(f"#SBATCH --mem-per-cpu {self.pmem}")
-
-        lines.append("set -x")
-        lines.append('date')
-        lines.append('hostname')
-        lines.append(f'cd {self.root_dir}')
-
-        if self.modules_to_load:
-            lines.append("module load " + ' '.join(self.modules_to_load))
-
-        # Assume that the bcl-convert tool is named 'bcl-convert' and choose
-        # accordingly.
-        if 'bcl-convert' in self.bcl_tool:
-            lines.append(('%s '
-                          '--sample-sheet "%s" '
-                          '--output-directory %s '
-                          '--bcl-input-directory . '
-                          '--bcl-num-decompression-threads 16 '
-                          '--bcl-num-conversion-threads 16 '
-                          '--bcl-num-compression-threads 16 '
-                          '--bcl-num-parallel-tiles 16 '
-                          '--bcl-sampleproject-subdirectories true '
-                          '--force') % (self.bcl_tool,
-                                        self.sample_sheet_path,
-                                        self.output_path))
-
-            # equivalent cp for bcl-conversion (see below) needed.
-        else:
-            lines.append(('%s '
-                          '--sample-sheet "%s" '
-                          '--minimum-trimmed-read-length 1 '
-                          '--mask-short-adapter-reads 1 '
-                          '-R . '
-                          '-o %s '
-                          '--loading-threads 16 '
-                          '--processing-threads 16 '
-                          '--writing-threads 16 '
-                          '--create-fastq-for-index-reads '
-                          '--ignore-missing-positions ') %
-                         (self.bcl_tool,
-                          self.sample_sheet_path,
-                          self.output_path))
-
-        with open(self.job_script_path, 'w') as f:
-            for line in lines:
-                # remove long spaces in some lines.
-                line = re.sub(r'\s+', ' ', line)
-                f.write(f"{line}\n")
+        # render templates/convert_job.sh into self.job_script_path; skipped
+        # for a force-fail job, since no script is needed in that case.
+        if self.force_job_fail:
+            return None  # nothing was written
+
+        template = self.jinja_env.get_template("convert_job.sh")
+
+        job_name = f'{self.qiita_job_id}_{self.job_name}'  # for #SBATCH -J
+
+        with open(self.job_script_path, mode="w", encoding="utf-8") as f:
+            if 'bcl-convert' in self.bcl_tool:  # otherwise assume bcl2fastq
+                cmd_line = (f'{self.bcl_tool} '
+                            f'--sample-sheet "{self.sample_sheet_path}" '
+                            f'--output-directory {self.output_path} '
+                            '--bcl-input-directory . '
+                            '--bcl-num-decompression-threads 16 '
+                            '--bcl-num-conversion-threads 16 '
+                            '--bcl-num-compression-threads 16 '
+                            '--bcl-num-parallel-tiles 16 '
+                            '--bcl-sampleproject-subdirectories true '
+                            '--force')
+                # equivalent cp for bcl-conversion (see below) needed.
+            else:
+                cmd_line = (f'{self.bcl_tool} '
+                            f'--sample-sheet "{self.sample_sheet_path}" '
+                            '--minimum-trimmed-read-length 1 '
+                            '--mask-short-adapter-reads 1 '
+                            '-R . '
+                            f'-o {self.output_path} '
+                            '--loading-threads 16 '
+                            '--processing-threads 16 '
+                            '--writing-threads 16 '
+                            '--create-fastq-for-index-reads '
+                            '--ignore-missing-positions ')
+
+            params = {'job_name': job_name,
+                      'queue_name': self.queue_name,
+                      'node_count': self.node_count,
+                      'nprocs': self.nprocs,
+                      'wall_time_limit': self.wall_time_limit,
+                      'mem_per_cpu': self.pmem,
+                      'run_dir': self.root_dir,
+                      'sheet_path': self.sample_sheet_path,  # not in template
+                      'cmd_line': cmd_line}
+
+            # build the space-separated list of system modules to load
+            # before processing begins.
+            if self.modules_to_load:
+                # the template emits "module load <modules to load>" only when
+                # modules_to_load is defined and truthy; leaving the key out
+                # of params omits that line from the rendered script.
+                params['modules_to_load'] = ' '.join(self.modules_to_load)
+
+            f.write(template.render(**params))
+
+        return self.job_script_path  # path of the script just written
 
     def run(self, callback=None):
         """
diff --git a/sequence_processing_pipeline/NuQCJob.py b/sequence_processing_pipeline/NuQCJob.py
index b1c27900..fa431eef 100644
--- a/sequence_processing_pipeline/NuQCJob.py
+++ b/sequence_processing_pipeline/NuQCJob.py
@@ -1,4 +1,4 @@
-from jinja2 import BaseLoader, TemplateNotFound
+from jinja2 import BaseLoader, Environment, TemplateNotFound
 from metapool import load_sample_sheet
 from os import stat, makedirs, rename
 from os.path import join, basename, dirname, exists, abspath, getmtime
@@ -10,7 +10,6 @@
 import logging
 from sequence_processing_pipeline.Commands import split_similar_size_bins
 from sequence_processing_pipeline.util import iter_paired_files
-from jinja2 import Environment
 import glob
 import re
 from sys import executable
diff --git a/sequence_processing_pipeline/templates/convert_job.sh b/sequence_processing_pipeline/templates/convert_job.sh
new file mode 100644
index 00000000..5c2a2835
--- /dev/null
+++ b/sequence_processing_pipeline/templates/convert_job.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+#SBATCH -J {{job_name}}
+#SBATCH -p {{queue_name}}
+#SBATCH -N {{node_count}}
+#SBATCH -n {{nprocs}}
+#SBATCH --time {{wall_time_limit}}
+#SBATCH --mail-type=ALL
+#SBATCH --mail-user qiita.help@gmail.com
+#SBATCH --mem-per-cpu {{mem_per_cpu}}
+set -x
+date
+hostname
+cd {{run_dir}}
+{% if modules_to_load %}
+    module load {{modules_to_load}}
+{% endif %}
+{{cmd_line}}
\ No newline at end of file
diff --git a/sequence_processing_pipeline/tests/test_ConvertJob.py b/sequence_processing_pipeline/tests/test_ConvertJob.py
index df81fdcf..c2cf318b 100644
--- a/sequence_processing_pipeline/tests/test_ConvertJob.py
+++ b/sequence_processing_pipeline/tests/test_ConvertJob.py
@@ -910,6 +910,7 @@ def tearDown(self):
         rmtree(self.good_output_path)
 
     def test_creation(self):
+        self.maxDiff = None
         run_dir = self.base_path('211021_A00000_0000_SAMPLE')
         inv_input_directory = self.base_path('inv_input_directory')
         qiita_id = 'abcdabcdabcdabcdabcdabcdabcdabcd'
@@ -934,13 +935,15 @@ def test_creation(self):
                        'ConvertJob.sh')) as f:
             obs = ''.join(f.readlines())
 
-        # ssp should be just the value of the self.path() partial function by
-        # itself. For readability, SCRIPT_EXP addresses the '/' separator.
-        # Hence, the trailing '/' is redundant and should be removed here.
-        self.assertEqual(obs,
-                         SCRIPT_EXP.format(ssp=self.base_path('').rstrip('/'),
-                                           gop=self.good_output_path,
-                                           run_dir=run_dir))
+        # substitute variables in expected output with the run-time values
+        # that we expect.
+        exp = SCRIPT_EXP.replace("{run_dir}", run_dir).\
+            replace("{gop}", self.good_output_path).\
+            replace("{ssp}/", self.base_path(''))
+
+        # ignore trailing whitespace at the end of both the observed and the
+        # expected script, since it's not important.
+        self.assertEqual(obs.rstrip(), exp.rstrip())
 
     def test_error_msg_from_logs(self):
         run_dir = self.base_path('211021_A00000_0000_SAMPLE')
@@ -998,7 +1001,7 @@ def test_parse_sample_sheet(self):
 
 SCRIPT_EXP = ''.join([
     '#!/bin/bash\n',
-    '#SBATCH --job-name abcdabcdabcdabcdabcdabcdabcdabcd_ConvertJob\n',
+    '#SBATCH -J abcdabcdabcdabcdabcdabcdabcdabcd_ConvertJob\n',
     '#SBATCH -p qiita\n',
     '#SBATCH -N 1\n',
     '#SBATCH -n 16\n',
@@ -1009,7 +1012,7 @@ def test_parse_sample_sheet(self):
     'set -x\n',
     'date\n',
     'hostname\n',
-    'cd {run_dir}\n',
+    'cd {run_dir}\n\n',
     'tests/bin/bcl-convert --sample-sheet "{ssp}/good-sample-sheet.csv" '
     '--output-directory {gop}/ConvertJob '
     '--bcl-input-directory . '