From ef7b59fc4a0918481652ed6ae60128ed04055558 Mon Sep 17 00:00:00 2001 From: Rachel Duffin Date: Thu, 14 Mar 2024 12:55:57 +0000 Subject: [PATCH 1/3] Remove obsolete scripts - backup runfolder now in automated_scripts, findfastqs.sh not needed --- .gitignore | 2 - README.md | 43 ----- backup_runfolder.py | 397 ----------------------------------------- findfastqs.sh | 15 -- housekeeping_config.py | 2 - 5 files changed, 459 deletions(-) delete mode 100755 backup_runfolder.py delete mode 100755 findfastqs.sh delete mode 100644 housekeeping_config.py diff --git a/.gitignore b/.gitignore index 71ad737..42bdb55 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,2 @@ *.pyc wscleaner/wscleaner/config.json -wscleaner/test/test_dir*.txt -wscleaner/test/data \ No newline at end of file diff --git a/README.md b/README.md index f529369..16307bb 100644 --- a/README.md +++ b/README.md @@ -2,49 +2,6 @@ Scripts to manage data on the NGS workstation ---- - -## backup_runfolder.py - -Uploads an Illumina runfolder to DNANexus. - -### Usage - -```bash -backup_runfolder.py [-h] -i RUNFOLDER [-a AUTH_TOKEN] [--ignore IGNORE] [-p PROJECT] [--logpath LOGPATH] -``` - -### What are the dependencies for this script? - -This tool requires the DNAnexus utilities `ua` (upload agent) and `dx` (DNAnexus toolkit) to be available in the system PATH. Python3 is required, and this tool uses packages from the standard library. - -### How does this tool work? - -* The script parses the input parameters, asserting that the given runfolder exists. -* If the `-p` option is given, the script attempts to find a matching DNAnexus project. Otherwise, it looks for a single project matching the runfolder name. If more or less than 1 project matches, the script logs an error and exits. -* The runfolder is traversed and a list of files in each folder is obtained. If any comma-separated strings passed to the `--ignore` argument are present within the filepath, or filename the file is excluded. 
- -* The DNAnexus `ua` utility is used to upload files in batches of 100 at a time. The number of upload tries is set to 100 with the `--tries` flag. -* Orthogonal tests are performed to: - * A count of files that should be uploaded (using the ignore terms if provided) - * A count of files in the DNA Nexus project - * (If relevant) A count of files in the DNA Nexus project containing a pattern to be ignored. NB this may not be accurate if the ignore term is found in the result of dx find data (eg present in project name) -* Logs from this and the script are written to a logfile, named "runfolder_backup_runfolder.log". A destination for this file can be passed to the `--logpath` flag. - ---- - -## findfastqs.sh - -Report the number of gzipped fastq files in an Illumina runfolder. - -### Usage - -```bash -$ findfastqs.sh RUNFOLDER ->>> RUNFOLDER has 156 demultiplexed fastq files with 2 undetermined. Total: 158 -``` - ---- ## Workstation Cleaner (wscleaner) diff --git a/backup_runfolder.py b/backup_runfolder.py deleted file mode 100755 index 8434255..0000000 --- a/backup_runfolder.py +++ /dev/null @@ -1,397 +0,0 @@ -#!/usr/bin/env python3 -"""backup_runfolder - -Uploads an Illumina runfolder to DNANexus. - -Example: - usage: backup_runfolder.py [-h] -i RUNFOLDER -a AUTH [--ignore IGNORE] [-p PROJECT] - [--logpath LOGPATH] - where IGNORE is a comma seperated string of terms which prevents the upload of files if that term is present in the filename or filepath. -""" - -import argparse -import re -import os -import sys -import subprocess -from distutils.spawn import find_executable -import math - -import logging -from logging.config import dictConfig -import housekeeping_config as config - -def log_setup(args): - """Set up script logging object. - Logs are written to STDERR and appended to a logfile, named after the input runfolder. - Create loggers by assigning logging.getLogger('') objects. Call using desired log level - as method. 
Formatter objects define the format of the log string, while Handler objects dictate - where the log is recorded. See the 'logging' module docs in the standard library for detail. - - Logs can be written with different levels, in order of event severity: DEBUG, INFO, WARNING, ERROR, - CRITICAL. Each has a corresponding method that can be used to log events at that level of severity. - Logs are written by passing a string to one of these methods (see example below). - - An example of the logging protocol: - # Create logging object. This can be performed anywhere in the script once config created. - logger = logging.getLogger('backup_runfolder') - # Write to log with level 'INFO' - logger.info('Searching for executables...') - >>> 2018-11-06 10:11:30,071 backup_runfolder INFO - Searching for executables... - """ - # If logfile path passed to --logpath, prepend to logfile name, else write to current directory - logpath = args.logpath if args.logpath else "" - # Set logfile name as runfolder name with '.log' extension - logfile_name = "".join([os.path.basename(args.runfolder.strip("/")), "_backup_runfolder.log"]) - logfile_fullpath = os.path.join(logpath, logfile_name) - - # Create dictionary with logging config parameters. - # Loggers can be configured explicitly through code, config files, or the dictConfig module. Here, - # a dictionary is created to define a logger the writes messages to both the terminal (logging.StreamHandler, - # which writes to STDERR) and a logfile (logging.FileHandler, set to 'runfoldername.log', prefxied with - # a path if --logpath given at the command line). These parameters are added to a root logger, - # from which all future loggers in the module, initiated with logging.getLogger, will inherit. 
- logging_config = dict( - version=1.0, - formatters={'log_formatter': {'format': "{asctime} {name} {levelname} - {message}", 'style': '{'}}, - handlers={ - 'stream_handler': {'class': 'logging.StreamHandler', 'formatter': 'log_formatter', 'level': logging.DEBUG}, - 'file_handler': {'class': 'logging.FileHandler', 'formatter': 'log_formatter', 'level': logging.DEBUG, - 'filename': os.path.join(logpath, logfile_name)}}, - root={'handlers': ['file_handler'], 'level': logging.DEBUG} - ) - - # Read the logging config and initaite root logger for the script. - dictConfig(logging_config) - # Log the beginning of the script with the root logger. - logging.info('START. Logging to %s', logfile_fullpath) - -def cli_arguments(args): - """Parses command line arguments. - Args: - args: A list containing the expected commandline arguments. Example: - ['backup_runfolder.py', '-i', 'media/data1/share/180216_M02353_0185_000000000-D357Y/', '-a', - 'AUTH_TOKEN', '-p', '003_180924_TrioPipelineGATK', '--ignore', '.txt'] - Returns: - An argparse.parser object with methods named after long-option command-line arguments. Example: - --runfolder "media/data1/share/runfolder" --> parser.parse_args(args).runfolder - """ - # Define arguments. - parser = argparse.ArgumentParser() - # The runfolder string argument is immediately passed to os.path.expanduser using the *type* argument, and this - # value is contained as the .runfolder() method in the object returned by parser.parser_args(). - # Os.path.expanduser allows expands tilde signs (~) to a string containing the user home directory. - parser.add_argument('-i', '--runfolder', required=True, help='An Illumina runfolder directory', type=os.path.expanduser) - parser.add_argument('-a', '--auth-token', help='A string or file containing a DNAnexus authorisation key used to access the DNANexus project. 
Default = /usr/local/src/mokaguys/.dnanexus_auth_token', default='/usr/local/src/mokaguys/.dnanexus_auth_token', type=os.path.expanduser) - parser.add_argument('--ignore', default=None, help="Comma-separated list of patterns which prevents the file from being uploaded if any pattern is present in filename or filepath.") - # Note: When no project is given to the -p argument below, this script searches for a project in DNAnexus. See UAcaller.find_nexus_project() for details. - parser.add_argument('-p', '--project', default=None, help='The name of an existing DNAnexus project for the given runfolder') - parser.add_argument('--logpath', help='Logfile output directory', type=os.path.expanduser) - # Collect arguments and return - return parser.parse_args(args) - -def find_executables(programs): - """Check programs (input arguments) exist in system path. - Args: - programs - A list of executeable program names. E.g. ['dx','ua'] - these are commands that - would execute on the command line. - """ - logger = logging.getLogger('backup_runfolder.find_executables') - # all() returns True if all items in a list evaluate True. Used here to raise error if any calls - # to find_executable() fail. This function uses the distutils.spawn.find_executable package to - # assert the programs are callable by parsing the directories in the system PATH variable (i.e. bash `which` command). - if not all([find_executable(program) for program in programs]): - logger.exception('Could not find one of the following programs: %s', programs) - else: - logger.info('Found programs: %s', ",".join(programs)) - -class UAcaller(): - """Uploads a runfolder to DNA Nexus. - Attributes: - runfolder: Runfolder path as given on command line - runfolder_name: The name of the runfolder without parent directories - auth_token: DNAnexus api key. Passed as string or filename argument. - project: DNAnexus project corresponding to the input runfolder - ignore: A comma-separated string of regular expressions. 
Used to skip files for upload. - logger: Class-level logger object - - Methods: - find_nexus_project(project): Searches DNAnexus for a project matching the input. If the - input argument is 'None', searches for the first project matching self.runfolder. - call_upload_agent(): Calls the DNAnexus upload agent using the class attributes - """ - def __init__(self, runfolder, auth_token, project, ignore): - # Initiate class-lvel logging object. This object inherits from any root loggers defined using - # the python logging module. All subsequent calls to self.logger will log as per the root configuration. - self.logger = logging.getLogger('backup_runfolder.UAcaller') - - # Set runfolder directory path strings - # Get the full (absolute) path of the input runfolder with os.path.abspath - self.runfolder = os.path.abspath(runfolder) - # Check runfolder exists - if not os.path.isdir(self.runfolder): - raise IOError('Invalid runfolder given as input') - self.runfolder_name = os.path.basename(self.runfolder) - - # Set DNAnexus authentication token from input. This function will distinguish between a file or - # string provided as an argument. If not provided, the crednetials file in the home directory is used. - self.auth_token = self.read_auth_token(auth_token) - # Set DNAnexus project. If no project given, search DNAnexus for a project matching the runfolder name. - self.project = self.find_nexus_project(project) - # List of patterns to exclude files from upload - self.ignore = ignore - # set upload agent path - self.ua_path = config.ua_path - - def read_auth_token(self, key_input): - """Return the DNAnexus authentication toxen from the first line of an input file or an input string. 
- Args: - key_file_string: A file or string containing a DNAnexus authentication key.""" - self.logger.info('Reading authentication token...') - # Attempt to read the auth key from the first line of the input, assuming it is a file - try: - with open(key_input, "r") as infile: - auth_token = infile.readlines()[0].strip() - # If the file does not exist, use the input auth key as provided - except FileNotFoundError: - auth_token = key_input.strip() - return auth_token - - def find_nexus_project(self, project): - """Search DNAnexus for the project given as an input argument. If the input is 'None', - searches for a project matching self.runfolder. - Args: - project: The name of a project on DNAnexus. If None, searches using runfolder name. - """ - self.logger.info('Searching for DNAnexus project...') - # Get list of projects from DNAnexus as a string. Due to python3's default use of bytestrings - # from various modules, bytes.decode() must be called to return the output as a pyton str object. - # This is required for pattern matching with the re module. - projects = subprocess.check_output(['dx', 'find', 'projects', '--auth',self.auth_token]).decode() - # Set the regular expression pattern for asserting that the project exists in DNAnexus. 
- # The bytes() function is required to create bytestrings - if project is None: - # If no project given, search for one or more word character, using \w+ ([a-zA-Z0-9_]), - # either side of the runfolder name given to the class - pattern = r'(\w*{}\w*)'.format(self.runfolder_name) - else: - # Else, search for the exact project name passed to the function - pattern = r'({})'.format(project) - - # List all strings captured by the regular expression pattern defined to match the project - project_matches = re.findall(pattern, projects) - - # If only one project is found, return this value - if len(project_matches) == 1: - return project_matches[0] - # Else if any other number of matching projects is foud, log this event and raise an Error - else: - self.logger.error('DNAnexus projects found: %s', project_matches) - self.logger.error('%s matching DNAnexus projects were found for pattern: %s. '\ - 'Repeat script by giving explicit project to -p/--project flag', len(project_matches), pattern) - raise ValueError('Invalid DNAnexus project name. 0 or >1 matching projects found.') - - def get_nexus_filepath(self, folder_path): - """ - To recreate the directory structure in DNA Nexus need to take relative path of each the subfolder. - This subfolder path is prefixed with the top level folder in DNA Nexus(the project name without the first four characters (002_)). - Returns a tuple (DNAnexus upload folder path, full DNAnexus file path) - DNAnexus upload folder path is used in the upload agent's '--folder' argument. - Args: - folder_path - The path of a local folder containing files to be uploaded to DNAnexus. - Returns: - A tuple: (DNAnexus upload folder path, full DNAneuxs file path) - Example: - self.get_nexus_filepath('/media/data1/share/runfolder/RTALogs/') - >>> (runfolder/RTALogs, PROJECT:/runfolder/RTALogs/) - """ - # Clean the runfolder name and parent folders from the input file path. 
Features of the regular expression below: - # {} - Replaced with the runfolder name by call to str.format(self.runfolder) - # [\/] - Looks a forward or backward slash in this position, accounting for linux or windows filesystems - # (.*)$ - Capture all characters to the end of the line. - # Parentheses in regular expressions capture a group, the first of which can be returned from re.search().group(1) - # if we are uploading files in the root of a runfolder need to skip this step - if folder_path == self.runfolder: - clean_runfolder_path = "" - else: - clean_runfolder_path = re.search(r'{}[\/](.*)$'.format(self.runfolder), folder_path).group(1) - - # Prepend the nexus folder path to cleaned path. the nexus folder path is the project name without the first four characters (002_). - nexus_path = "'/" + os.path.join(self.project[4:],clean_runfolder_path) + "'" - - # Return the nexus folder and full project filepath - return nexus_path, "{}:{}".format(self.project, nexus_path) - - - def ignore_file(self,filepath): - # if an ignore pattern was specified - if self.ignore: - # split this string on comma and loop through list - for pattern in self.ignore.split(","): - # make ignore pattern and filepath upper case and search filepath for the pattern - if pattern.upper() in filepath.upper(): - # if present return True to indicate the file should not be uploaded - return True - # if no search patterns given, or pattern not found in filepath return False to say file can be uploaded - return False - - - def call_upload_agent(self): - """ - Loop through the runfolder and build the upload agent command. 
- It is quicker to upload files in paralell so all files in a folder are added to a list and a single command issued per folder - """ - # create a dictionary to hold the directories as a key, and the list of files as the value - file_dict = {} - # walk through run folder - for root, subfolders, files in os.walk(self.runfolder): - # for any subfolders - for folder in subfolders: - # build path to the folder - folderpath = os.path.join(root, folder) - # create a dictionary entry for this folder - file_dict[folderpath] = [] - # create a list of filepaths for all files in the folder - filepath_list = [os.path.join(folderpath,file) for file in os.listdir(folderpath) if os.path.isfile(os.path.join(folderpath, file))] - # loop through this list - for filepath in filepath_list: - # test filepath for ignore patterns - if not self.ignore_file(filepath): - # if ignore pattern not found add filepath to list - file_dict[folderpath].append(filepath) - # repeat for the root (not just subfolders) - # build path to the folder - folderpath = os.path.join(root) - # create a dictionary entry for this folder - file_dict[folderpath] = [] - # create a list of filepaths for all files in the folder - filepath_list = [os.path.join(folderpath,file) for file in os.listdir(folderpath) if os.path.isfile(os.path.join(folderpath, file))] - # loop through this list - for filepath in filepath_list: - # test filepath for ignore patterns - if not self.ignore_file(filepath): - # if ignore pattern not found add filepath to list - file_dict[folderpath].append(filepath) - - # report the folders and files to be uploaded - self.logger.info('Files for upload: %s', file_dict) - - # call upload agent on each folder - for path in file_dict: - # if there are any files to upload - if file_dict[path]: - # create the nexus path for each dir - nexus_path, project_filepath = self.get_nexus_filepath(path) - self.logger.info('Calling upload agent on %s to location %s', path, project_filepath) - # upload agent has a 
max number of uploads of 1000 per command. uploadingmultiple files at a time is quicker, but uploading too many at a time has caused it to hang. - # count number of files in list and divide by 100.0 eg 20/100.0 = 0.02. ceil rounds up to the nearest integer (0.02->1). If there are 100, ceil(100/100.0)=1.0 if there are 750 ceil(750/100.0)=8.0 - iterations_needed = math.ceil(len(file_dict[path]) / 100.0) - # set the iterations count to 1 - iteration_count = 1 - # will pass a slice of the file list to the upload agent so set variables for start and stop so it uploads files 0-999 - start = 0 - stop = 100 - # while we haven't finished the iterations - while iteration_count <= iterations_needed: - # if it's the last iteration, set stop == length of list so not to ask for elements that aren't in the list (if 4 items in list len(list)=4 and slice of 0:4 won't miss the last element) - if iteration_count == iterations_needed: - stop = len(file_dict[path]) - self.logger.info('uploading files %d to %d', start, stop) - # the upload agent command can take multiple files seperated by a space. 
the full file path is required for each file - files_string = "" - # take a slice of list using from and to - for file in file_dict[path][start:stop]: - files_string = files_string + " '" + os.path.join(path, file) + "'" - - # increase the iteration_count and start and stop by 1000 for the next iteration so second iteration will do files 1000-1999 - iteration_count += 1 - start += 100 - stop += 100 - - # Create DNAnexus upload command - nexus_upload_command = ('{ua_path} --auth-token {auth_token} --project {nexus_project} --folder {nexus_folder} --do-not-compress --upload-threads 10 --tries 100 {files}'.format( - ua_path=self.ua_path, auth_token=self.auth_token, nexus_project=self.project, nexus_folder=nexus_path, files=files_string)) - - # Mask the autentication key in the upload command and log - masked_nexus_upload_command = nexus_upload_command.replace(self.auth_token, "") - self.logger.info(masked_nexus_upload_command) - # Call upload command redirecting stderr to stdout - proc = subprocess.Popen([nexus_upload_command], stderr=subprocess.STDOUT, stdout=subprocess.PIPE, shell=True) - # Capture output streams (err is redirected to out above) - (out, err) = proc.communicate() - # Write output stream to logfile and terminal - self.logger.debug(out.decode()) - - def count_uploaded_files(self): - # count number of files to be uploaded - # if ignore terms given need to add a grep step - if self.ignore: - # -v excludes any files matching the given terms (stated with -e) - # -i makes this search case insensitive - grep_ignore = "| grep -v -i " - # split ignore string on comma and loop through list - for pattern in self.ignore.split(","): - grep_ignore = grep_ignore + ' -e "' + pattern + '" ' - else: - grep_ignore = "" - - local_file_count = "find " + self.runfolder + " -type f " + grep_ignore + " | wc -l" - - # Call upload command redirecting stderr to stdout - proc = subprocess.Popen([local_file_count], stderr=subprocess.STDOUT, stdout=subprocess.PIPE, shell=True) - # 
Capture output streams (err is redirected to out above) - (out, err) = proc.communicate() - # Write output stream to logfile and terminal - self.logger.info('%s files that should have been uploaded (excluding any with ignore terms in filename or path)', out.decode().rstrip()) - - # count number of uploaded files - uploaded_file_count = "dx find data --project %s | wc -l" % (self.project) - - # Call upload command redirecting stderr to stdout - proc = subprocess.Popen([uploaded_file_count], stderr=subprocess.STDOUT, stdout=subprocess.PIPE, shell=True) - # Capture output streams (err is redirected to out above) - (out, err) = proc.communicate() - # Write output stream to logfile and terminal - self.logger.info('%s files present in DNANexus project', out.decode().rstrip()) - - if self.ignore: - # test for presense of any ignore strings in project - uploaded_file_count_ignore = "dx find data --project %s " % (self.project) + grep_ignore.replace("-v","") + " | wc -l" - - # Call upload command redirecting stderr to stdout - proc = subprocess.Popen([uploaded_file_count_ignore], stderr=subprocess.STDOUT, stdout=subprocess.PIPE, shell=True) - # Capture output streams (err is redirected to out above) - (out, err) = proc.communicate() - # Write output stream to logfile and terminal - self.logger.info('%s files present in DNANexus project containing one of the ignore terms. NB this may not be accurate if the ignore term is found in the result of dx find data (eg present in project name)', out.decode().rstrip()) - - -def main(args): - """Uploads runfolder to DNAnexus by passing given arguments to the DNAnexus upload agent.""" - # Get command line arguments - parsed_args = cli_arguments(args) - # Set up logger - log_setup(parsed_args) - logger = logging.getLogger('backup_runfolder') - logger.info('Parsed args: %s', args) - - # Check DNAnexus utilities exist in system path. 
- logger.info('Searching for executables...') - find_executables([config.ua_path, 'dx']) - - # Create an object to set up the upload agent command - logger.info('Creating UAcaller object with the following arguments: %s', vars(parsed_args)) - ua_object = UAcaller(runfolder=parsed_args.runfolder, project=parsed_args.project, auth_token=parsed_args.auth_token, ignore=parsed_args.ignore) - - # Call upload agent on runfolder - logger.info('Arguments read to object. Calling upload agent for input files.') - ua_object.call_upload_agent() - - # run tests to count files - logger.info('Counting the number of files that need to be uploaded, have been uploaded and check if any that should have been ignored are in Nexus.') - ua_object.count_uploaded_files() - - logger.info('END.') - -if __name__ == '__main__': - main(sys.argv[1:]) diff --git a/findfastqs.sh b/findfastqs.sh deleted file mode 100755 index cc2d00b..0000000 --- a/findfastqs.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash - -# Commands to help with runfolder managemenet (Nana) -findfastqs(){ - input_directory=$1 - # Find all fastqs in the input directory. - # Grep -v filters out undetermined fastqs from this list, which are not typically uploaded to DNAnexus - numfqs=$(find $input_directory -iname "*.fastq.gz" | grep -v "Undetermined" | wc -l); - # Count the number of undetermined fastqs in input directory - undetermined=$(find $1 -iname "Undetermined*.fastq.gz" | wc -l); - total=$((numfqs + undetermined)) - echo "$input_directory has $numfqs demultiplexed fastq files with $undetermined undetermined. 
Total: $total"; - } - -findfastqs $1 \ No newline at end of file diff --git a/housekeeping_config.py b/housekeeping_config.py deleted file mode 100644 index b211541..0000000 --- a/housekeeping_config.py +++ /dev/null @@ -1,2 +0,0 @@ -ua_path="/usr/local/src/mokaguys/apps/dnanexus-upload-agent-1.5.17-linux/ua" - From 850b0d8df7371ad89fcf0026626b27ba6411580b Mon Sep 17 00:00:00 2001 From: RachelDuffin Date: Thu, 2 May 2024 11:33:24 +0100 Subject: [PATCH 2/3] Remove ngrok_start --- README.md | 70 ++++++++++++++++++++++++++++++++++----------- ngrok_start.sh | 37 ------------------------ wscleaner/README.md | 70 --------------------------------------------- 3 files changed, 54 insertions(+), 123 deletions(-) delete mode 100755 ngrok_start.sh delete mode 100644 wscleaner/README.md diff --git a/README.md b/README.md index 16307bb..6239425 100644 --- a/README.md +++ b/README.md @@ -1,32 +1,70 @@ -# Workstation Housekeeping v1.11 +## Workstation Cleaner (wscleaner) -Scripts to manage data on the NGS workstation +Workstation Cleaner (wscleaner) deletes local directories that have been uploaded to the DNAnexus cloud storage service. +When executed, Runfolders in the input (root) directory are deleted based on the following criteria: -## Workstation Cleaner (wscleaner) +* A single DNAnexus project is found matching the runfolder name +* All local FASTQ files are uploaded and in a 'closed' state +* X logfiles are present in the DNA Nexus project /Logfiles directory (NB X can be added as a command line argument - default is 5) + +or if the run is identified as a TSO500 run, based on: + * the bcl2fastq2_output.log file created by the automated scripts + AND + * Presence of `_TSO` in the human readable DNANexus project name + +A DNAnexus API key must be cached locally using the `--set-key` option. + +## Workstation Environment +The directory `env/` in this repository contains conda environment scripts for the workstation. 
These remove conflicts in the PYTHONPATH environment variable by editing the variable when conda is activated. The conda documentation describes where to place these scripts under ['saving environment variables'](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html#macos-and-linux). -Delete local directories that have been uploaded to the DNAnexus cloud storage service. -See wscleaner readme for more info +## Install +As described above, on the workstation 2 environments exist - wscleaner and wscleaner_test (for development work). +You need to activate one of these environments before installing with pip (as below). -## ngrok_start.sh -Allow SSH access to the system by running ngrok as a background process. As of v1.11 supports dockerised ngrok instance. +```bash +git clone https://github.com/moka-guys/workstation_housekeeping.git +pip install workstation_housekeeping/wscleaner +wscleaner --version # Print version number +``` -### Installation +## Automated usage +The script `wscleaner_command.sh` is called by the crontab. This activates the environment and passes the logfile path (and any other non-default arguments). +A development command script `wscleaner_command_dev.sh` can be used to call the test environment and provide testing arguments, eg --dry-run -See knowledge base article for ngrok installation. 
-### Usage +## Manual Usage -Non-dockerised ngrok: +``` +usage: wscleaner [-h] [--auth AUTH] [--dry-run] [--logfile LOGFILE] + [--min-age MIN_AGE] [--logfile-count LOGFILE_COUNT] + [--version] + root -`sudo bash ngrok_start.sh` +positional arguments: + root A directory containing runfolders to process -Dockerised ngrok: +optional arguments: + -h, --help show this help message and exit + --auth AUTH A text file containing the DNANexus authentication + token + --dry-run Perform a dry run without deleting files + --logfile LOGFILE A path for the application logfile + --min-age MIN_AGE The age (days) a runfolder must be to be deleted + --logfile-count LOGFILE_COUNT + The number of logfiles a runfolder must have in + /Logfiles + --version Print version +``` -`sudo bash ngrok_start.sh docker` +## Test -### output +```bash +# Run from the cloned repo directory after installation +pytest . --auth_token DNA_NEXUS_KEY +``` -The script will output the ngrok connection details +## License +Developed by Viapath Genome Informatics diff --git a/ngrok_start.sh b/ngrok_start.sh deleted file mode 100755 index 54490fb..0000000 --- a/ngrok_start.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash -# ngrok_start.sh - A script to allow SSH access to the system by running ngrok as a background process -# Prints SSH details if ngrok is already running. 
-# Note: The ngrok process can be closed at anytime with `kill $(pidof ngrok)` - -# Get the process ID of ngrok if it is already running on the system -EXISTING_PROCESS=$(pidof ngrok) -ngrok_instance=$1 -# If ngrok is not running, start as a background process and print the url for SSH access -if [ -z $EXISTING_PROCESS ] ;then - if [[ $ngrok_instance == "docker" ]]; then - # cat the ngrok password into a variable so it can be passed as a environment argument to docker (-e) - # run docker container in detached mode (-d) - # name the instance - # set --net=host attaches the container to the host network, rather than the docker network - this means ports don't need to be remapped. - # and use ngrok/ngrok:latest tcp --region eu 22 : Open an connection to ngrok server on port 22 - # &> /dev/null : Discard stdout and stderr to empty output stream - ngrok_token=$(cat /usr/local/src/mokaguys/.ngrok) - docker run -d --name NGROK --net=host -it -e NGROK_AUTHTOKEN=$ngrok_token ngrok/ngrok:latest tcp 22 --region eu &> /dev/null - else - # nohup [command] : Keep the process running the command even after you quit the session - # ngrok tcp --region eu 22 : Open an connection to ngrok server on port 22 - # &> /dev/null : Discard stdout and stderr to empty output stream - # & : Run as a background process - nohup ngrok tcp --region eu 22 &> /dev/null & - fi - # Pause for a few seconds to allow the connection to complete. - sleep 3 - # Write the ngrok public url for SSH access to the syslog. - # Triggers alert in slack with ssh url details and writes to stderr. 
- NGROK_URL=$(curl http://localhost:4040/api/tunnels 2>/dev/null | jq ".tunnels[0].public_url") - logger -s "ngrok_start - new workstation host - $NGROK_URL" -else - # If ngrok is already running, print the public url for SSH access to stderr - NGROK_URL=$(curl http://localhost:4040/api/tunnels 2>/dev/null | jq ".tunnels[0].public_url") - echo "ngrok_start - $NGROK_URL" 1>&2 -fi diff --git a/wscleaner/README.md b/wscleaner/README.md deleted file mode 100644 index 9322b0d..0000000 --- a/wscleaner/README.md +++ /dev/null @@ -1,70 +0,0 @@ -# Workstation Cleaner - -Workstation Cleaner (wscleaner) deletes local directories that have been uploaded to the DNAnexus cloud storage service. - -When executed, Runfolders in the input (root) directory are deleted based on the following criteria: - -* A single DNAnexus project is found matching the runfolder name -* All local FASTQ files are uploaded and in a 'closed' state -* X logfiles are present in the DNA Nexus project /Logfiles directory (NB X can be added as a command line argument - default is 5) - -or if the run is identified as a TSO500 run, based on: - * the bcl2fastq2_output.log file created by the automated scripts - AND - * Presence of `_TSO` in the human readable DNANexus project name - -A DNAnexus API key must be cached locally using the `--set-key` option. - -## Workstation Environment -The directory `env/` in this repository contains conda environment scripts for the workstation. These remove conflicts in the PYTHONPATH environment variable by editing the variable when conda is activated. The conda documentation describes where to place these scripts under ['saving environment variables'](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html#macos-and-linux). - -## Install -As descibed above, on the workstation 2 environments exist - wscleaner and wscleaner_test (for development work). -You need to activate these environment before installing with pip (as below). 
- - -```bash -git clone https://github.com/moka-guys/workstation_housekeeping.git -pip install workstation_housekeeping/wscleaner -wscleaner --version # Print version number -``` - -## Automated usage -The script `wscleaner_command.sh` is called by the crontab. This activates the enviroment and passes the logfile path (and any other non-default arguments). -A development command script `wscleaner_command_dev.sh` can be used to call the test environment and provide testing arguments, eg --dry-run - - -## Manual Usage - -``` -usage: wscleaner [-h] [--auth AUTH] [--dry-run] [--logfile LOGFILE] - [--min-age MIN_AGE] [--logfile-count LOGFILE_COUNT] - [--version] - root - -positional arguments: - root A directory containing runfolders to process - -optional arguments: - -h, --help show this help message and exit - --auth AUTH A text file containing the DNANexus authentication - token - --dry-run Perform a dry run without deleting files - --logfile LOGFILE A path for the application logfile - --min-age MIN_AGE The age (days) a runfolder must be to be deleted - --logfile-count LOGFILE_COUNT - The number of logfiles a runfolder must have in - /Logfiles - --version Print version -``` - -## Test - -```bash -# Run from the cloned repo directory after installation -pytest . --auth_token DNA_NEXUS_KEY -``` - -## License - -Developed by Viapath Genome Informatics From 074865a4408434fd8cc48862a1c7728e51ae861e Mon Sep 17 00:00:00 2001 From: RachelDuffin Date: Fri, 7 Jun 2024 15:54:46 +0100 Subject: [PATCH 3/3] Correct company name --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6239425..85b3305 100644 --- a/README.md +++ b/README.md @@ -67,4 +67,4 @@ pytest . --auth_token DNA_NEXUS_KEY ## License -Developed by Viapath Genome Informatics +Developed by Synnovis Genome Informatics