From 5b2d8293f17ae14ee1cf031a8dfc7a7bc472bc0a Mon Sep 17 00:00:00 2001 From: aledj2 Date: Wed, 25 May 2022 16:06:09 +0100 Subject: [PATCH 1/8] add support for TSO and update some documentation. fix#28 fix#29 --- wscleaner/README.md | 8 +++++++ wscleaner/wscleaner/lib.py | 37 +++++++++++++++++++++++++++++- wscleaner/wscleaner_command_dev.sh | 12 ++++++++++ 3 files changed, 56 insertions(+), 1 deletion(-) create mode 100644 wscleaner/wscleaner_command_dev.sh diff --git a/wscleaner/README.md b/wscleaner/README.md index a19d3af..92467aa 100644 --- a/wscleaner/README.md +++ b/wscleaner/README.md @@ -8,6 +8,12 @@ When executed, Runfolders in the input (root) directory are deleted based on the * All local FASTQ files are uploaded and in a 'closed' state * Six logfiles are present in the DNA Nexus project /Logfiles directory +or if the run is identified as a TSO500 run, based on: + * the bcl2fastq2_output.log file created by the automated scripts + + AND + * Presence of `_TSO` in the human readable DNANexus project name + A DNAnexus API key must be cached locally using the `--set-key` option. ## Install @@ -17,6 +23,8 @@ git clone https://github.com/moka-guys/workstation_housekeeping.git pip install workstation_housekeeping/wscleaner wscleaner --version # Print version number ``` +Note that may need to activate the environment before installing with pip. +On the workstation 2 environments exist - wscleaner and wscleaner_test (for development work) ## Quickstart diff --git a/wscleaner/wscleaner/lib.py b/wscleaner/wscleaner/lib.py index 1b542a3..50b0b04 100644 --- a/wscleaner/wscleaner/lib.py +++ b/wscleaner/wscleaner/lib.py @@ -12,6 +12,7 @@ import shutil import time from pathlib import Path +import os import dxpy @@ -63,6 +64,35 @@ def find_fastqs(self, count=False): else: self.logger.debug(f'{self.name} contains {len(fastq_filenames)} fastq files: {fastq_filenames}') return fastq_filenames + + def TSO500_check(self): + """ + Checks if the run is a TSO500 run. 
These need to be cleaned up but do not contain fastqs + Returns True if TSO run detected. + """ + logfile_check=False + project_name=False + bcl2fastq_filepath=os.path.join(self.path,"bcl2fastq2_output.log") + # ensure not trying to open files that don't exist + if os.path.isdir(self.path) and os.path.exists(bcl2fastq_filepath): + # open bcl2fastq file - should contain a standard statement from automated scripts + with open(bcl2fastq_filepath) as demultiplexing_file: + # take last line of the logfile - look for statement produced by automated scripts for TSO runs + if demultiplexing_file.readlines()[-1].startswith("TSO500 run."): + logfile_check=True + self.logger.debug(f'bcl2fastq2_output.log for {self.name} contains the string expected for TSO500 runs') + # may be an issue identifying the DNAnexus project + # get the dnanexus project name to assess if contains "_TSO" + if self.dx_project.id: + nexus_project_name = dxpy.describe(self.dx_project.id)["name"] + if "_TSO" in nexus_project_name: + self.logger.debug(f'DNANexus project name {nexus_project_name} contains the string "_TSO"') + project_name=True + else: + self.logger.debug(f'DNANexus project name {nexus_project_name} does NOT contain the string "_TSO"') + # if both checks pass return true + if project_name and logfile_check: + return True class DxProjectRunFolder(): @@ -185,8 +215,13 @@ def find_runfolders(self, min_age=None): runfolder_objects = [] for directory in subdirectories: rf = RunFolder(directory) + self.logger.debug(f'ASSESING IF {rf.name} IS TSO500 RUNFOLDER.') + # catch TSO500 runfolders here (do not contain fastqs) + if (rf.age >= min_age) and (rf.TSO500_check()): + self.logger.debug(f'{rf.name} IS TSO500 RUNFOLDER.') + runfolder_objects.append(rf) # Criteria for runfolder: Older than or equal to min_age and contains fastq.gz files - if (rf.age >= min_age) and (rf.find_fastqs(count=True) > 0): + elif (rf.age >= min_age) and (rf.find_fastqs(count=True) > 0): self.logger.debug(f'{rf.name} IS 
RUNFOLDER.') runfolder_objects.append(rf) else: diff --git a/wscleaner/wscleaner_command_dev.sh b/wscleaner/wscleaner_command_dev.sh new file mode 100644 index 0000000..a9694f5 --- /dev/null +++ b/wscleaner/wscleaner_command_dev.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +# Activate wscleaner environment +eval "$(/usr/local/bin/miniconda3/bin/conda shell.bash hook)" # Add conda environment to system path +conda activate wscleaner_test + +# Set variables +logfile="/usr/local/src/mokaguys/automate_demultiplexing_logfiles/wscleaner_logs/$(date -d now +%y%m%d)_wscleaner.log" +runfolders="/media/data3/share" + +# Execute +/usr/local/bin/miniconda3/envs/wscleaner_test/bin/python3 /usr/local/src/mokaguys/development_area/workstation_housekeeping/wscleaner/wscleaner/main.py $runfolders --logfile $logfile --dry-run --min-age=1 From 4e13f0ed5baec3a6206f0e650bf877aa17fdf513 Mon Sep 17 00:00:00 2001 From: aledj2 Date: Wed, 25 May 2022 16:07:21 +0100 Subject: [PATCH 2/8] adsave readmeix#28 --- wscleaner/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wscleaner/README.md b/wscleaner/README.md index 92467aa..7499639 100644 --- a/wscleaner/README.md +++ b/wscleaner/README.md @@ -6,7 +6,7 @@ When executed, Runfolders in the input (root) directory are deleted based on the * A single DNAnexus project is found matching the runfolder name * All local FASTQ files are uploaded and in a 'closed' state -* Six logfiles are present in the DNA Nexus project /Logfiles directory +* 5 logfiles are present in the DNA Nexus project /Logfiles directory or if the run is identified as a TSO500 run, based on: * the bcl2fastq2_output.log file created by the automated scripts From 881ebdba1ec8f73da6a4ffe698a353d1d04b4ddd Mon Sep 17 00:00:00 2001 From: aledj2 Date: Wed, 25 May 2022 16:09:02 +0100 Subject: [PATCH 3/8] remove extra log statement --- wscleaner/wscleaner/lib.py | 1 - 1 file changed, 1 deletion(-) diff --git a/wscleaner/wscleaner/lib.py b/wscleaner/wscleaner/lib.py index 
50b0b04..0d53c3e 100644 --- a/wscleaner/wscleaner/lib.py +++ b/wscleaner/wscleaner/lib.py @@ -215,7 +215,6 @@ def find_runfolders(self, min_age=None): runfolder_objects = [] for directory in subdirectories: rf = RunFolder(directory) - self.logger.debug(f'ASSESING IF {rf.name} IS TSO500 RUNFOLDER.') # catch TSO500 runfolders here (do not contain fastqs) if (rf.age >= min_age) and (rf.TSO500_check()): self.logger.debug(f'{rf.name} IS TSO500 RUNFOLDER.') From f1d42cc0d13abd1a982ab3130a25ade12423d69d Mon Sep 17 00:00:00 2001 From: aledj2 Date: Fri, 27 May 2022 14:05:59 +0100 Subject: [PATCH 4/8] add info to readme to make installation and usage clearer --- wscleaner/README.md | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/wscleaner/README.md b/wscleaner/README.md index 7499639..155ea2b 100644 --- a/wscleaner/README.md +++ b/wscleaner/README.md @@ -16,23 +16,26 @@ or if the run is identified as a TSO500 run, based on: A DNAnexus API key must be cached locally using the `--set-key` option. +## Workstation Environment +The directory `env/` in this repository contains conda environment scripts for the workstation. These remove conflicts in the PYTHONPATH environment variable by editing the variable when conda is activated. The conda documentation describes where to place these scripts under ['saving environment variables'](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html#macos-and-linux). + ## Install +As described above, on the workstation 2 environments exist - wscleaner and wscleaner_test (for development work). +You need to activate one of these environments before installing with pip (as below). + ```bash git clone https://github.com/moka-guys/workstation_housekeeping.git pip install workstation_housekeeping/wscleaner wscleaner --version # Print version number ``` -Note that may need to activate the environment before installing with pip. 
-On the workstation 2 environments exist - wscleaner and wscleaner_test (for development work) -## Quickstart +## Automated usage +The script `wscleaner_command.sh` is called by the crontab. This activates the environment and passes the logfile path (and any other non-default arguments). +A development command script `wscleaner_command_dev.sh` can be used to call the test environment and provide testing arguments, e.g. `--dry-run` -```bash -wscleaner ROOT_DIRECTORY -``` -## Usage +## Manual Usage ``` usage: wscleaner [-h] [--auth AUTH] [--dry-run] [--logfile LOGFILE] @@ -63,10 +66,6 @@ optional arguments: pytest . --auth_token DNA_NEXUS_KEY ``` -## Workstation Environment -The directory `env/` in this repository contains conda environment scripts for the workstation. These remove conflicts in the PYTHONPATH environment variable by editing the variable when conda is activated. The conda documentation describes where to place these scripts under ['saving environment variables'](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html#macos-and-linux). 
- - ## License Developed by Viapath Genome Informatics From 30b3d91c50e156673088a5fa65e6baaf77b58de3 Mon Sep 17 00:00:00 2001 From: aledj2 Date: Fri, 27 May 2022 15:13:21 +0100 Subject: [PATCH 5/8] add egg-info --- .gitignore | 1 - wscleaner/wscleaner.egg-info/PKG-INFO | 11 +++++ wscleaner/wscleaner.egg-info/SOURCES.txt | 14 +++++++ .../wscleaner.egg-info/dependency_links.txt | 1 + wscleaner/wscleaner.egg-info/entry_points.txt | 3 ++ wscleaner/wscleaner.egg-info/not-zip-safe | 1 + wscleaner/wscleaner.egg-info/requires.txt | 5 +++ wscleaner/wscleaner.egg-info/top_level.txt | 1 + wscleaner/wscleaner/lib.py | 40 ++++++++++++------- 9 files changed, 62 insertions(+), 15 deletions(-) create mode 100644 wscleaner/wscleaner.egg-info/PKG-INFO create mode 100644 wscleaner/wscleaner.egg-info/SOURCES.txt create mode 100644 wscleaner/wscleaner.egg-info/dependency_links.txt create mode 100644 wscleaner/wscleaner.egg-info/entry_points.txt create mode 100644 wscleaner/wscleaner.egg-info/not-zip-safe create mode 100644 wscleaner/wscleaner.egg-info/requires.txt create mode 100644 wscleaner/wscleaner.egg-info/top_level.txt diff --git a/.gitignore b/.gitignore index cf1f3f1..71ad737 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,4 @@ *.pyc -*.egg-info wscleaner/wscleaner/config.json wscleaner/test/test_dir*.txt wscleaner/test/data \ No newline at end of file diff --git a/wscleaner/wscleaner.egg-info/PKG-INFO b/wscleaner/wscleaner.egg-info/PKG-INFO new file mode 100644 index 0000000..21a7b3c --- /dev/null +++ b/wscleaner/wscleaner.egg-info/PKG-INFO @@ -0,0 +1,11 @@ +Metadata-Version: 1.10 +Name: wscleaner +Version: 1.10 +Summary: Package to remove uploaded runfolders from the Viapath Genome Informatics NGS workstation +Home-page: https://github.com/moka-guys/workstation_housekeeping +Author: Nana Mensah +Author-email: gst-tr.MokaGuys@nhs.net +License: MIT +Description: UNKNOWN +Platform: UNKNOWN +Requires-Python: >=3.6.8 diff --git a/wscleaner/wscleaner.egg-info/SOURCES.txt 
b/wscleaner/wscleaner.egg-info/SOURCES.txt new file mode 100644 index 0000000..3f0673e --- /dev/null +++ b/wscleaner/wscleaner.egg-info/SOURCES.txt @@ -0,0 +1,14 @@ +README.md +setup.py +test/test_all.py +wscleaner/__init__.py +wscleaner/lib.py +wscleaner/main.py +wscleaner/mokaguys_logger.py +wscleaner.egg-info/PKG-INFO +wscleaner.egg-info/SOURCES.txt +wscleaner.egg-info/dependency_links.txt +wscleaner.egg-info/entry_points.txt +wscleaner.egg-info/not-zip-safe +wscleaner.egg-info/requires.txt +wscleaner.egg-info/top_level.txt \ No newline at end of file diff --git a/wscleaner/wscleaner.egg-info/dependency_links.txt b/wscleaner/wscleaner.egg-info/dependency_links.txt new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/wscleaner/wscleaner.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/wscleaner/wscleaner.egg-info/entry_points.txt b/wscleaner/wscleaner.egg-info/entry_points.txt new file mode 100644 index 0000000..88bbf2f --- /dev/null +++ b/wscleaner/wscleaner.egg-info/entry_points.txt @@ -0,0 +1,3 @@ +[console_scripts] +wscleaner = wscleaner.main:main + diff --git a/wscleaner/wscleaner.egg-info/not-zip-safe b/wscleaner/wscleaner.egg-info/not-zip-safe new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/wscleaner/wscleaner.egg-info/not-zip-safe @@ -0,0 +1 @@ + diff --git a/wscleaner/wscleaner.egg-info/requires.txt b/wscleaner/wscleaner.egg-info/requires.txt new file mode 100644 index 0000000..83c9b55 --- /dev/null +++ b/wscleaner/wscleaner.egg-info/requires.txt @@ -0,0 +1,5 @@ +docutils>=0.3 +dxpy==0.279.0 +pytest==4.4.0 +pytest-cov==2.6.1 +Sphinx==2.0.1 diff --git a/wscleaner/wscleaner.egg-info/top_level.txt b/wscleaner/wscleaner.egg-info/top_level.txt new file mode 100644 index 0000000..85d1fd0 --- /dev/null +++ b/wscleaner/wscleaner.egg-info/top_level.txt @@ -0,0 +1 @@ +wscleaner diff --git a/wscleaner/wscleaner/lib.py b/wscleaner/wscleaner/lib.py index 0d53c3e..8843ff3 100644 --- a/wscleaner/wscleaner/lib.py +++ 
b/wscleaner/wscleaner/lib.py @@ -35,6 +35,7 @@ class RunFolder(): def __init__(self, path): self.logger = logging.getLogger(__name__ + '.RunFolder') self.path = Path(path) + self.RTA_complete_exists = os.path.isfile(os.path.join(self.path,"RTAComplete.txt")) self.name = self.path.name self.logger.debug(f'Initiating RunFolder instance for {self.name}') self.dx_project = DxProjectRunFolder(self.name) @@ -211,24 +212,33 @@ def find_runfolders(self, min_age=None): Returns: runfolder_objects(list): List of wscleaner.lib.RunFolder objects. """ - subdirectories = self.root.iterdir() runfolder_objects = [] - for directory in subdirectories: + # list all directories in the runfolder dir. + for directory in [directory for directory in self.root.iterdir() if directory.is_dir()]: rf = RunFolder(directory) - # catch TSO500 runfolders here (do not contain fastqs) - if (rf.age >= min_age) and (rf.TSO500_check()): - self.logger.debug(f'{rf.name} IS TSO500 RUNFOLDER.') - runfolder_objects.append(rf) - # Criteria for runfolder: Older than or equal to min_age and contains fastq.gz files - elif (rf.age >= min_age) and (rf.find_fastqs(count=True) > 0): - self.logger.debug(f'{rf.name} IS RUNFOLDER.') - runfolder_objects.append(rf) + # skip any folders that do not have an RTAComplete.txt file + if not rf.RTA_complete_exists: + self.logger.debug(f'{rf.name} is not a runfolder, or sequencing has not yet finished.') else: - self.logger.debug(f'{rf.name} IS NOT RUNFOLDER.') + # catch TSO500 runfolders here (do not contain fastqs) + if (rf.age >= min_age) and (rf.TSO500_check()): + self.logger.debug(f'{rf.name} IS TSO500 RUNFOLDER.') + runfolder_objects.append(rf) + # Criteria for runfolder: Older than or equal to min_age and contains fastq.gz files + elif (rf.age >= min_age) and (rf.find_fastqs(count=True) > 0): + self.logger.debug(f'{rf.name} IS RUNFOLDER.') + runfolder_objects.append(rf) + # shouldn't get this far anymore - leave in just incase. 
+ else: + self.logger.debug(f'{rf.name} IS NOT RUNFOLDER.') + return runfolder_objects def check_fastqs(self, runfolder): - """Returns true if a runfolder's fastq.gz files match those in it's DNAnexus project.""" + """ + Returns true if a runfolder's fastq.gz files match those in it's DNAnexus project. + Ensures all fastqs were uploaded. + """ dx_fastqs = runfolder.dx_project.find_fastqs() local_fastqs = runfolder.find_fastqs() fastq_bool = all([fastq in dx_fastqs for fastq in local_fastqs]) @@ -236,8 +246,10 @@ def check_fastqs(self, runfolder): return fastq_bool def check_logfiles(self, runfolder, logfile_count): - """Returns true if a runfolder's DNAnexus project contains 6 logfiles in the - expected location""" + """Returns true if a runfolder's DNAnexus project contains X logfiles in the + expected location. + X is defined in the --logfile-count argument provided (default = 5) + """ dx_logfiles = runfolder.dx_project.count_logfiles() logfile_bool = (dx_logfiles == logfile_count) self.logger.debug(f'{runfolder.name} LOGFILE BOOL: {logfile_bool}') From 5adf9774620be00853ac123e951e172106f10984 Mon Sep 17 00:00:00 2001 From: aledj2 Date: Fri, 27 May 2022 15:30:21 +0100 Subject: [PATCH 6/8] add some comments and log statements --- wscleaner/wscleaner/lib.py | 8 +++++--- wscleaner/wscleaner/main.py | 3 ++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/wscleaner/wscleaner/lib.py b/wscleaner/wscleaner/lib.py index 8843ff3..5d84e65 100644 --- a/wscleaner/wscleaner/lib.py +++ b/wscleaner/wscleaner/lib.py @@ -82,6 +82,8 @@ def TSO500_check(self): if demultiplexing_file.readlines()[-1].startswith("TSO500 run."): logfile_check=True self.logger.debug(f'bcl2fastq2_output.log for {self.name} contains the string expected for TSO500 runs') + else: + self.logger.debug(f'bcl2fastq2_output.log for {self.name} DOES NOT contain expected TSO500 string') # may be an issue identifying the DNAnexus project # get the dnanexus project name to assess if contains "_TSO" if 
self.dx_project.id: @@ -222,15 +224,15 @@ def find_runfolders(self, min_age=None): else: # catch TSO500 runfolders here (do not contain fastqs) if (rf.age >= min_age) and (rf.TSO500_check()): - self.logger.debug(f'{rf.name} IS TSO500 RUNFOLDER.') + self.logger.debug(f'{rf.name} is a TSO500 runfolder and is >= {min_age} days old.') runfolder_objects.append(rf) # Criteria for runfolder: Older than or equal to min_age and contains fastq.gz files elif (rf.age >= min_age) and (rf.find_fastqs(count=True) > 0): - self.logger.debug(f'{rf.name} IS RUNFOLDER.') + self.logger.debug(f'{rf.name} contains 1 or more fastq and is >= {min_age} days old.') runfolder_objects.append(rf) # shouldn't get this far anymore - leave in just incase. else: - self.logger.debug(f'{rf.name} IS NOT RUNFOLDER.') + self.logger.debug(f'{rf.name} has 0 fastqs, is not a TSO runfolder or is < {min_age} days old.') return runfolder_objects diff --git a/wscleaner/wscleaner/main.py b/wscleaner/wscleaner/main.py index e5b5fd2..a90798c 100644 --- a/wscleaner/wscleaner/main.py +++ b/wscleaner/wscleaner/main.py @@ -54,8 +54,9 @@ def main(): # If dry-run CLI flag is given, no directories are deleted by the runfolder manager. 
RFM = RunFolderManager(args.root, dry_run=args.dry_run) logger.info(f'Root directory {args.root}') + logger.info(f'Identifying local runfolders to consider deleting') local_runfolders = RFM.find_runfolders(min_age=args.min_age) - logger.debug(f'Found local runfolders: {[rf.name for rf in local_runfolders]}') + logger.debug(f'Found local runfolders to consider deleting: {[rf.name for rf in local_runfolders]}') for runfolder in local_runfolders: logger.info(f'Processing {runfolder.name}') From e5a66511a57bff686a9b540ae638e5662306b21d Mon Sep 17 00:00:00 2001 From: aledj2 Date: Fri, 27 May 2022 15:32:36 +0100 Subject: [PATCH 7/8] amend readme --- wscleaner/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wscleaner/README.md b/wscleaner/README.md index 155ea2b..1d01119 100644 --- a/wscleaner/README.md +++ b/wscleaner/README.md @@ -6,7 +6,7 @@ When executed, Runfolders in the input (root) directory are deleted based on the * A single DNAnexus project is found matching the runfolder name * All local FASTQ files are uploaded and in a 'closed' state -* 5 logfiles are present in the DNA Nexus project /Logfiles directory +* X logfiles are present in the DNA Nexus project /Logfiles directory (NB X can be added as a command line argument - default is 5) or if the run is identified as a TSO500 run, based on: * the bcl2fastq2_output.log file created by the automated scripts From 396652b490b0ee3a588ba97ab02b413604b45e98 Mon Sep 17 00:00:00 2001 From: aledj2 Date: Fri, 27 May 2022 15:33:04 +0100 Subject: [PATCH 8/8] amend readme --- wscleaner/README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/wscleaner/README.md b/wscleaner/README.md index 1d01119..9322b0d 100644 --- a/wscleaner/README.md +++ b/wscleaner/README.md @@ -10,7 +10,6 @@ When executed, Runfolders in the input (root) directory are deleted based on the or if the run is identified as a TSO500 run, based on: * the bcl2fastq2_output.log file created by the automated scripts - AND * Presence 
of `_TSO` in the human readable DNANexus project name