diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..cf1f3f1 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +*.pyc +*.egg-info +wscleaner/wscleaner/config.json +wscleaner/test/test_dir*.txt +wscleaner/test/data \ No newline at end of file diff --git a/README.md b/README.md index babc175..a177f44 100644 --- a/README.md +++ b/README.md @@ -1,20 +1,25 @@ -# Workstation Housekeeping v1.4 +# Workstation Housekeeping v1.5 + Scripts to manage data on the NGS workstation --- ## backup_runfolder.py + Uploads an Illumina runfolder to DNANexus. -### Quickstart -``` - usage: backup_runfolder.py [-h] -i RUNFOLDER [-a AUTH_TOKEN] [--ignore IGNORE] [-p PROJECT] [--logpath LOGPATH] +### Usage + +```bash +backup_runfolder.py [-h] -i RUNFOLDER [-a AUTH_TOKEN] [--ignore IGNORE] [-p PROJECT] [--logpath LOGPATH] ``` ### What are the dependencies for this script? + This tool requires the DNAnexus utilities `ua` (upload agent) and `dx` (DNAnexus toolkit) to be available in the system PATH. Python3 is required, and this tool uses packages from the standard library. ### How does this tool work? + * The script parses the input parameters, asserting that the given runfolder exists. * If the `-p` option is given, the script attempts to find a matching DNAnexus project. Otherwise, it looks for a single project matching the runfolder name. If more or less than 1 project matches, the script logs an error and exits. * The runfolder is traversed and a list of files in each folder is obtained. If any comma-separated strings passed to the `--ignore` argument are present within the filepath, or filename the file is excluded. @@ -26,14 +31,38 @@ This tool requires the DNAnexus utilities `ua` (upload agent) and `dx` (DNAnexus * (If relevant) A count of files in the DNA Nexus project containing a pattern to be ignored. 
NB this may not be accurate if the ignore term is found in the result of dx find data (eg present in project name) * Logs from this and the script are written to a logfile, named after the runfolder. A destination for this file can be passed to the `--logpath` flag. +--- ## findfastqs.sh + Report the number of gzipped fastq files in an Illumina runfolder. ### Usage -``` + +```bash $ findfastqs.sh RUNFOLDER >>> RUNFOLDER has 156 demultiplexed fastq files with 2 undetermined. Total: 158 ``` --- + +## Workstation Cleaner (wscleaner) + +Delete local directories that have been uploaded to the DNAnexus cloud storage service. + +### Install + +```bash +git clone https://github.com/moka-guys/workstation_housekeeping.git +pip install workstation_housekeeping/wscleaner +wscleaner --version # Print version number +``` + +### Usage + +```bash +wscleaner --set-key DNA_NEXUS_KEY # Cache dnanexus api key +wscleaner ROOT_DIRECTORY --logfile LOGFILE_PATH +``` + +--- \ No newline at end of file diff --git a/wscleaner/DESIGN.md b/wscleaner/DESIGN.md new file mode 100644 index 0000000..8259925 --- /dev/null +++ b/wscleaner/DESIGN.md @@ -0,0 +1,32 @@ +# Workstation Cleaner Design Document + +Owner: Nana Mensah +Date: 30/05/19 +Status: Draft + +## Brief + +The Viapath Genome Informatics team use a Linux workstation to manage sequencing files. These files are uploaded to the DNAnexus service for storage; however, clearing the workstation is time-intensive. + +## User Story + +As a Clinical Bioinformatician, I need to automate the deletion of sequencing folders that have been successfully backed up, so that I can free up time for other duties. + +## Functional requirements + +FR1. Accurately detect sequencing folders that have been successfully backed up +FR2. Delete old sequencing folders that are successfully backed up +FR3. Log all activity to a local logfile + +## Non-functional requirements + +NF1. Run from the Linux command line +NF2. Process runfolders within 24 hours +NF3. 
Use any available DNAnexus SDKs +NF4. Attempt to process all folders at least once + +## Design Summary + +A RunFolderManager class will instantiate objects for local Runfolders, each of which has an associated DNAnexus project object. The manager loops over the runfolders and deletes them if all checks pass. + +DNAnexus projects are accessed with the dxpy module, a Python wrapper for the DNAnexus API. Credentials are cached locally using the command-line option '--set-key'. diff --git a/wscleaner/README.md b/wscleaner/README.md new file mode 100644 index 0000000..515e4cb --- /dev/null +++ b/wscleaner/README.md @@ -0,0 +1,57 @@ +# Workstation Cleaner + +Workstation Cleaner (wscleaner) deletes local directories that have been uploaded to the DNAnexus cloud storage service. + +When executed, Runfolders in the input (root) directory are deleted based on the following criteria: + +* A single DNAnexus project is found matching the runfolder name +* All local FASTQ files are uploaded and in a 'closed' state +* At least six logfiles are present in the DNAnexus project /Logfiles directory + +A DNAnexus API key must be cached locally using the `--set-key` option. 
+ +## Install + +```bash +git clone https://github.com/moka-guys/workstation_housekeeping.git +pip install workstation_housekeeping/wscleaner +wscleaner --version # Print version number +``` + +## Quickstart + +```bash +wscleaner --set-key DNA_NEXUS_KEY # Cache dnanexus api key +wscleaner ROOT_DIRECTORY +``` + +## Usage + +``` +wscleaner [-h] [--set-key SET_KEY] [--print-key] [--dry-run] + [--logfile LOGFILE] [--min-age MIN_AGE] [--version] + root + +positional arguments: + root A directory containing runfolders to process + +optional arguments: + -h, --help show this help message and exit + --set-key SET_KEY Cache a DNA Nexus API key + --print-key Print the cached DNA Nexus API key + --dry-run Perform a dry run without deleting files + --logfile LOGFILE A path for the application logfile + --min-age MIN_AGE The age (days) a runfolder must be to be deleted + --version Print version +``` + +## Test + +```bash +# Run from the cloned repo directory after installation +pytest . --auth_token DNA_NEXUS_KEY +``` + +## License + +Developed by Viapath Genome Informatics diff --git a/wscleaner/setup.py b/wscleaner/setup.py new file mode 100644 index 0000000..186120a --- /dev/null +++ b/wscleaner/setup.py @@ -0,0 +1,24 @@ +from setuptools import setup, find_packages + +setup(name='wscleaner', + version='1.0', + description='Package to remove uploaded runfolders from \ + the Viapath Genome Informatics NGS workstation', + url='https://github.com/NMNS93/wscleaner', + author='Nana Mensah', + author_email='gst-tr.MokaGuys@nhs.net', + license='MIT', + packages=find_packages(), + zip_safe=False, + + python_requires = '>=3.6.8', + install_requires = ['docutils>=0.3', 'dxpy==0.279.0', 'pytest==4.4.0', 'pytest-cov==2.6.1', + 'Sphinx==2.0.1', 'psutil==5.6.1'], + + package_data = {}, + + entry_points={ + 'console_scripts': 'wscleaner = wscleaner.main:main' + } + + ) diff --git a/wscleaner/test/conftest.py b/wscleaner/test/conftest.py new file mode 100644 index 0000000..6ac8d67 --- 
/dev/null +++ b/wscleaner/test/conftest.py @@ -0,0 +1,39 @@ +"""conftest.py + +Config for pytest. +""" +import pytest +import pathlib + +def pytest_addoption(parser): + """Add command line options to pytest""" + parser.addoption("--auth_token", action="store", default=None, help="A DNANexus authentication key") + +@pytest.fixture +def auth_token(request): + """Create pytest fixture from command line argument for authentication token""" + return request.config.getoption("--auth_token") + +@pytest.fixture(scope="session") +def data_test_runfolders(): + """A fixture that returns a list of tuples containing (runfolder_name, fastq_list_file).""" + return [ + ('190408_NB551068_0234_AHJ7MTAFXY_NGS265B', 'test/test_dir_1_fastqs.txt'), + ('190410_NB551068_0235_AHKGMGAFXY_NGS265C', 'test/test_dir_2_fastqs.txt') + ] + +@pytest.fixture(scope="session", autouse=True) +def create_test_dirs(request, data_test_runfolders): + """Create test data for testing. + + This is an autouse fixture with session scope, meaning it is run once before any tests are collected. + """ + for runfolder_name, fastq_list_file in data_test_runfolders: + # Create the runfolder directory as per Illumina spec + test_path = f'test/data/{runfolder_name}/Data/Intensities/BaseCalls' + pathlib.Path(test_path).mkdir(parents=True, exist_ok=True) + # Generate empty fastqfiles in runfolder + with open(fastq_list_file) as f: + fastq_list = f.read().splitlines() + for fastq_file in fastq_list: + pathlib.Path(test_path, fastq_file).touch(mode=777, exist_ok=True) diff --git a/wscleaner/test/coverage.txt b/wscleaner/test/coverage.txt new file mode 100644 index 0000000..d94e13e --- /dev/null +++ b/wscleaner/test/coverage.txt @@ -0,0 +1,21 @@ +============================= test session starts ============================== +platform linux -- Python 3.6.8, pytest-4.4.0, py-1.8.0, pluggy-0.9.0 +rootdir: /home/nana/Documents/MOKAGUYS/wscleaner +plugins: cov-2.6.1 +collected 9 items + +test/test_all.py ......... 
[100%] + +----------- coverage: platform linux, python 3.6.8-final-0 ----------- +Name Stmts Miss Cover +-------------------------------------------------- +wscleaner/__init__.py 0 0 100% +wscleaner/auth.py 35 14 60% +wscleaner/lib.py 101 6 94% +wscleaner/main.py 43 26 40% +wscleaner/mokaguys_logger.py 10 5 50% +-------------------------------------------------- +TOTAL 189 51 73% + + +========================== 9 passed in 44.68 seconds =========================== diff --git a/wscleaner/test/generate.py b/wscleaner/test/generate.py new file mode 100644 index 0000000..5f45155 --- /dev/null +++ b/wscleaner/test/generate.py @@ -0,0 +1,30 @@ +"""generate.py + +Generates dummy data for testing. +""" + +import pathlib + +def data_test_runfolders(): + """A fixture that returns a list of tuples containing (runfolder_name, fastq_list_file).""" + return [ + ('190408_NB551068_0234_AHJ7MTAFXY_NGS265B', 'test/test_dir_1_fastqs.txt'), + ('190410_NB551068_0235_AHKGMGAFXY_NGS265C', 'test/test_dir_2_fastqs.txt') + ] + +def create_test_dirs(test_data): + """Create test data for testing. + + This is an autouse fixture with session scope, meaning it is run once before any tests are collected. 
+ """ + for runfolder_name, fastq_list_file in test_data: + # Create the runfolder directory as per Illumina spec + test_path = f'test/data/{runfolder_name}/Data/Intensities/BaseCalls' + pathlib.Path(test_path).mkdir(parents=True, exist_ok=True) + # Generate empty fastqfiles in runfolder + with open(fastq_list_file) as f: + fastq_list = f.read().splitlines() + for fastq_file in fastq_list: + pathlib.Path(test_path, fastq_file).touch(mode=777, exist_ok=True) + +create_test_dirs(data_test_runfolders()) \ No newline at end of file diff --git a/wscleaner/test/test_all.py b/wscleaner/test/test_all.py new file mode 100644 index 0000000..fcc7f2a --- /dev/null +++ b/wscleaner/test/test_all.py @@ -0,0 +1,109 @@ +import pytest +import dxpy +from pathlib import Path +import argparse +import json +import sys +import shutil + +from pkg_resources import resource_filename +from wscleaner.auth import SetKeyAction, dx_set_auth, CONFIG_FILE +from wscleaner.main import cli_parser +from wscleaner.lib import RunFolderManager, RunFolder + +# AUTH: Set DNAnexus authentication for tests +def test_auth(auth_token): + """Test that an authentication token is passed to pytest as a command line argument""" + assert auth_token is not None + +@pytest.fixture(autouse=True) +def set_auth(auth_token): + """Set the authenticatino token for all subsequent tests""" + dx_set_auth(auth_token) + + +# FIXTURES: Define functions to use in downstream tests +@pytest.fixture +def rfm(): + """Return an instance of the runfolder manager with the test/data directory""" + test_path = Path(str(Path(__file__).parent), 'data') + rfm = RunFolderManager(str(test_path)) + return rfm + +@pytest.fixture +def rfm_dry(): + """Return an instance of the runfolder manager with the test/data directory""" + test_path = Path(str(Path(__file__).parent), 'data') + rfm_dry = RunFolderManager(str(test_path), dry_run=True) + return rfm_dry + +# TESTS +class TestAuth: + def test_set_auth(self, auth_token): + """test that the 
authentication token is set correctly""" + authobj = dx_set_auth(auth_token) + assert dxpy.SECURITY_CONTEXT['auth_token'] == auth_token + + def test_setkey(self, monkeypatch, auth_token): + """test that the --set-key command-line argument caches the authentication token""" + # Set setkey cli arguments + sys.argv = ['python', 'wscleaner', '--set-key', auth_token] + # Mock Action object + # Parse args + with pytest.raises(SystemExit) as err: + args = cli_parser() + # Make assertions on created config file + fn = resource_filename('wscleaner',CONFIG_FILE) + with open(fn, 'r') as f: + assert auth_token in f.read() + # Delete temp config + Path(fn).unlink() + +class TestFolders: + def test_runfolders_ready(self, data_test_runfolders, rfm): + """Test that runfolders in the test directory pass checks for deletion. Est. 20 seconds.""" + for runfolder in rfm.find_runfolders(min_age=0): + assert all([runfolder.dx_project, rfm.check_fastqs(runfolder), rfm.check_logfiles(runfolder)]) + + def test_find_fastqs(self, data_test_runfolders): + """Tests the correct number of fastqs are present in local and uploaded directories""" + for runfolder_name, fastq_list_file in data_test_runfolders: + rf = RunFolder(Path('test/data', runfolder_name)) + with open(fastq_list_file) as f: + test_folder_fastqs = len(f.readlines()) + assert len(rf.find_fastqs()) == test_folder_fastqs + assert len(rf.dx_project.find_fastqs()) == test_folder_fastqs + + def test_min_age(self, rfm): + """test that the runfolder age function records age""" + runfolders = rfm.find_runfolders(min_age=10) + # Asser that this test runfolder was recently generated + assert all([ rf.age > 10 for rf in runfolders ]) + +class TestRFM: + def test_find_runfolders(self, data_test_runfolders, rfm): + """test the runfolder manager directory finding function""" + rfm_runfolders = rfm.find_runfolders(min_age=0) + runfolder_names = [str(folder.path.name) for folder in rfm_runfolders] + test_runfolder_names = [ rf for rf, 
fastq_list_file in data_test_runfolders ] + runfolders_bools = [ item in runfolder_names for item in test_runfolder_names ] + assert all(runfolders_bools) + + def test_validate(self, rfm): + """test the runfoldermanager _validate function correctly reads the path""" + assert rfm.root.name == Path(str(Path(__file__).parent), 'data').name + + def test_delete(self, monkeypatch, rfm): + """test that the runfolder manager delete call creates the log of deleted files. + Here, the pytest monkeypatch fixture is used to overwrite the delete function and persist the test directories. + """ + test_folder = rfm.find_runfolders(min_age=0)[0] + monkeypatch.setattr(shutil, 'rmtree', lambda x: 'TEST_DELETED') + rfm.delete(test_folder) + assert test_folder.name in rfm.deleted + + def test_dry_run(self, rfm_dry): + """test that the dry_run option does not cause the test directory to be deleted""" + test_folder = rfm_dry.find_runfolders(min_age=0)[0] + rfm_dry.delete(test_folder) + assert test_folder.name not in rfm_dry.deleted diff --git a/wscleaner/wscleaner/__init__.py b/wscleaner/wscleaner/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/wscleaner/wscleaner/auth.py b/wscleaner/wscleaner/auth.py new file mode 100644 index 0000000..f823574 --- /dev/null +++ b/wscleaner/wscleaner/auth.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python3 +"""auth.py + +Utlily classes for the workstation cleaner module. 
+ +Methods: + get_config(): Read the DNANexus API token from the application cache file + dx_set_auth(): Set the DNAnexus authentication token used in each instance of the application + +Classes: + SetKeyAction: Set the DNAnexus authentication token used in future instances of the application and exit + PrintKeyAction: Print the cached DNAnexus authentication key +""" + +from pathlib import Path +from pkg_resources import resource_filename +import json +import argparse +import dxpy + +import logging +logger = logging.getLogger(__name__) + +CONFIG_FILE = 'config.json' + +def get_config(config=CONFIG_FILE): + """Read the DNANexus API token from the application cache file + + Returns: + filename (object): A python file object + Raises: + AttirbuteError: Config file not found. + """ + # Return the file object containing the cached DNAnexus token if it exists + filename = resource_filename('wscleaner',config) + logger.debug(f'Config: {Path(filename).name}') + if Path(filename).is_file(): + return filename + else: + raise AttributeError('Config file not found. Set auth key with --set-key.') + +def dx_set_auth(auth_token=None): + """Set the DNAnexus authentication token used in future instances of the application and exit + + Args: + auth_token (str): A DNAnexus authentication key""" + if auth_token: + security_context = {'auth_token_type': 'Bearer', 'auth_token': auth_token} + else: + filename = get_config() + with open(filename, 'r') as f: + # Password is written to the cache as a dictionary. Loaded here using json module + pwd = json.load(f) + security_context = {'auth_token_type': 'Bearer', 'auth_token': pwd['auth_token']} + dxpy.set_security_context(security_context) + +class SetKeyAction(argparse.Action): + """Set the DNAnexus authentication key based on command line arguments and exit the program. + + Inherits from argparse.Action, which initiates __call__() when the linked argument is present. 
+ """ + # Override argparse.Action.__call__() with desired behaviour + def __call__(self, parser, namespace, values, option_string=None, config=CONFIG_FILE): + filename = resource_filename('wscleaner',config) + with open(filename, 'w') as f: + # 'values' contains authentication token given on the command line. Store for future + # wscleaner calls to set as the DNAnexus dxpy security context. + json.dump({'auth_token': values}, f) + parser.exit() + + +class PrintKeyAction(argparse.Action): + """Print the cached DNAnexus authentication key + + Inherits from argparse.Action, which initiates __call__() when the linked argument is present. + """ + # Override argparse.Action.__call__() with desired behaviour + def __call__(self, parser, namespace, values, option_string=None): + filename = get_config() + with open(filename, 'r') as f: + pwd = json.load(f) + print(pwd) + parser.exit() diff --git a/wscleaner/wscleaner/lib.py b/wscleaner/wscleaner/lib.py new file mode 100644 index 0000000..7fc1040 --- /dev/null +++ b/wscleaner/wscleaner/lib.py @@ -0,0 +1,219 @@ +#!/usr/bin/env python3 +"""lib.py + +Utlily classes for the workstation cleaner module. + +Classes: + RunFolder: A local directory containing files with the 'fastq.gz' extension + DxProjectRunFolder: A DNAnexus project + RunFolderManager: Contains methods for finding, checking and deleting runfolders in a root directory. 
+""" +import logging +import shutil +import time +from pathlib import Path + +import dxpy + + +logger = logging.getLogger(__name__) + +class RunFolder(): + """A local directory containing files with the 'fastq.gz' extension + + Arguments: + path (str): The path of a local directory + Attributes: + path (Pathlib.Path): A path object created from the input directory + name (str): The runfolder/directory name + dx_project (DxProjectRunfolder): A DX Project object + age (int): Age of the runfolder in days + Methods: + find_fastqs: Returns a list of local files with the 'fastq.gz' extension + """ + def __init__(self, path): + self.logger = logging.getLogger(__name__ + '.RunFolder') + self.path = Path(path) + self.name = self.path.name + self.logger.debug(f'Initiating RunFolder instance for {self.name}') + self.dx_project = DxProjectRunFolder(self.name) + + @property + def age(self): + """Returns runfolder age in days""" + age_in_seconds = time.time() - self.path.stat().st_mtime + age_in_days = age_in_seconds // (24 * 3600) + self.logger.debug(f'{self.name} age is {age_in_days}') + return age_in_days + + def find_fastqs(self, count=False): + """Returns a list or count of local files with the 'fastq.gz' extension + Args: + count(bool): Returns number of fastqs if True. + """ + # Find paths of files with fastq.gz extension + fastq_paths = self.path.rglob('*.fastq.gz') + # Sort fastq filenames for cleaner logfile outputs + fastq_filenames_unsorted = [ path.name for path in fastq_paths ] + fastq_filenames = sorted(fastq_filenames_unsorted) + # Return number of fastqs if count is True, otherwise return fastq file names + if count: + self.logger.debug(f'{self.name} contains {len(fastq_filenames)} fastq files') + return len(fastq_filenames) + else: + self.logger.debug(f'{self.name} contains {len(fastq_filenames)} fastq files: {fastq_filenames}') + return fastq_filenames + + +class DxProjectRunFolder(): + """A DNAnexus project. 
+ + Arguments: + runfolder_name (str): The name of a local runfolder + Attributes: + runfolder (str): Runfolder name + id (str): Project ID of the matching runfolder project in DNANexus. + Methods: + find_fastqs: Returns a list of files in the DNAnexus project (self.id) with the fastq.gz extension + count_logfiles: Count logfiles in the DNAnexus project (self.id). Logfiles are in an expected location + """ + def __init__(self, runfolder_name): + self.logger = logging.getLogger(__name__ + '.DXProjectRunFolder') + self.runfolder = runfolder_name + self.id = self.__dx_find_one_project() + + def find_fastqs(self): + """Returns a list of files in the DNAnexus project (self.id) with the fastq.gz extension""" + # Search dnanexus for files with the fastq.gz extension. + # name_mode='regexp' tells dxpy to look for any occurence of 'fastq.gz' in the filename + search_response = dxpy.find_data_objects( + project=self.id, classname='file', name='fastq.gz', name_mode='regexp' + ) + file_ids = [ result['id'] for result in search_response ] + + # Gather a list of uploaded fastq files with the state 'closed', indicating a completed upload. + fastq_filenames_unsorted = [] + for dx_file in file_ids: + file_description = dxpy.describe(dx_file) + if file_description['state'] == 'closed': + fastq_filenames_unsorted.append(file_description['name']) + # Sort fastq filenames for cleaner logfile output + fastq_filenames = sorted(fastq_filenames_unsorted) + self.logger.debug(f'{self.id} contains {len(fastq_filenames)} "closed" fastq files: {fastq_filenames}') + return fastq_filenames + + def count_logfiles(self): + """Count logfiles in the DNAnexus project (self.id). Logfiles are in an expected location. + Returns: + logfile_count (int): A count of logfiles""" + # Set uploaded runfolder name. Runfolder is renamed upon upload to the DNANexus project + # without the first four characters + uploaded_runfolder = dxpy.describe(self.id)['name'][4:] + # Set logfile location in DNANexus project. 
This is expected in 'Logfiles/', a subdirectory of the uploaded runfolder + logfile_dir = str(Path('/',uploaded_runfolder,'Logfiles')) + logfile_list = dxpy.find_data_objects(project=self.id, folder=logfile_dir, classname='file') + logfile_count = len(list(logfile_list)) + return logfile_count + + def __dx_find_one_project(self): + """Find a single DNAnexus project from the input runfolder name + + Returns: + A DNAnexus project ID. If the search fails, returns None. + """ + try: + # Search for the project matching self.runfolder. + # name_mode='regexp' - look for any occurence of the runfolder name in the project name. + # Setting more_ok/zero_ok to False ensures only one project is succesfully returned. + project = dxpy.find_one_project(name=self.runfolder, name_mode='regexp', more_ok=False, zero_ok=False) + self.logger.debug(f'{self.runfolder} DNAnexus project: {project["id"]}') + return project['id'] + except dxpy.exceptions.DXSearchError: + # Catch exception and raise none + self.logger.debug(f'0 or >1 DNAnexus projects found for {self.runfolder}') + return None + + def __bool__(self): + """Allows boolean expressions on class instances which return True if a single DNAnexus project was found.""" + if self.id: + return True + else: + return False + +class RunFolderManager(): + """Contains methods for finding, checking and deleting runfolders in a root directory. + + Args: + directory (str): A parent directory containing runfolders to process + dry_run (bool): Do not delete directories + Attributes: + root(pathlib.Path): A path object to the root directory + deleted(List): A list of deleted runfolders populated by calls to self.delete() + Methods: + find_runfolders(): Search the parent directory for subdirectories containing fastq.gz files. + Returns wscleaner.lib.RunFolder objects. + check_fastqs(): Returns true if a runfolder's fastq.gz files match those in it's DNAnexus project. 
+ check_logfiles(): Returns true if a runfolder's DNAnexus project contains 6 logfiles in the + expected location + delete(): Delete the local runfolder from the root directory and append name to self.deleted. + Raises: + __validate():ValueError: The directory passed to the class instance does not exist. + """ + def __init__(self, directory, dry_run=False): + self.logger = logging.getLogger(__name__ + '.RunFolderManager') + self.__validate(directory) + self.root = Path(directory) + self.__dry_run = dry_run + self.deleted = [] # Delete runfolders appended here by self.deleted + + def __validate(self, directory): + """Check that input directory exists. Log and raise error if otherwise.""" + try: + assert Path(directory).is_dir() + except AssertionError: + self.logger.error(f'Directory does not exist: {directory}', exc_info=True) + raise + + def find_runfolders(self, min_age=None): + """Search the parent directory for subdirectories containing fastq.gz files. + Args: + min_age(int): Minimum age in days of runfolders returned. + Returns: + runfolder_objects(list): List of wscleaner.lib.RunFolder objects. 
+ """ + subdirectories = self.root.iterdir() + runfolder_objects = [] + for directory in subdirectories: + rf = RunFolder(directory) + # Criteria for runfolder: Older than or equal to min_age and contains fastq.gz files + if (rf.age >= min_age) and (rf.find_fastqs(count=True) > 0): + self.logger.debug(f'{rf.name} IS RUNFOLDER.') + runfolder_objects.append(rf) + else: + self.logger.debug(f'{rf.name} IS NOT RUNFOLDER.') + return runfolder_objects + + def check_fastqs(self, runfolder): + """Returns true if a runfolder's fastq.gz files match those in it's DNAnexus project.""" + dx_fastqs = runfolder.dx_project.find_fastqs() + local_fastqs = runfolder.find_fastqs() + fastq_bool = all([fastq in dx_fastqs for fastq in local_fastqs]) + self.logger.debug(f'{runfolder.name} FASTQ BOOL: {fastq_bool}') + return fastq_bool + + def check_logfiles(self, runfolder): + """Returns true if a runfolder's DNAnexus project contains 6 logfiles in the + expected location""" + dx_logfiles = runfolder.dx_project.count_logfiles() + logfile_bool = dx_logfiles >= 6 + self.logger.debug(f'{runfolder.name} LOGFILE BOOL: {logfile_bool}') + return logfile_bool + + def delete(self, runfolder): + """Delete the local runfolder from the root directory and append name to self.deleted.""" + if self.__dry_run: + self.logger.info(f'DRY RUN DELETE {runfolder.name}') + else: + self.deleted.append(runfolder.name) + shutil.rmtree(runfolder.path) + self.logger.info(f'{runfolder.name} DELETED.') diff --git a/wscleaner/wscleaner/main.py b/wscleaner/wscleaner/main.py new file mode 100644 index 0000000..cf8a51e --- /dev/null +++ b/wscleaner/wscleaner/main.py @@ -0,0 +1,86 @@ +#!/usr/bin/env python3 +"""wscleaner + +Delete runfolders in a root directory on the condition that it has uploaded to DNA Nexus. 
+ +Methods: + cli_parser(): Parses command line arguments + main(): Process input directory or API keys +""" + +import argparse +import logging +import pkg_resources +from wscleaner import mokaguys_logger +from wscleaner.auth import SetKeyAction, PrintKeyAction, dx_set_auth +from wscleaner.lib import RunFolder, RunFolderManager + + + +def cli_parser(): + """Parses command line arguments. + Args: None. The argparse.ArgumentParser auto-collects arguments from sys.args + Returns: Argument parser object with a 'root' attribute if root directory given. + Otherwise, --set-key and --print-key exit after actions are performed. + """ + parser = argparse.ArgumentParser() + # argparse API for adding custom routines. SetKeyAction and PrintKeyAction are classes with + # routines that exit the software after an action is performed. + parser.register('action', 'setkey', SetKeyAction) + parser.register('action', 'printkey', PrintKeyAction) + # Define CLI arguments + parser.add_argument('--set-key', action='setkey', help='Cache a DNA Nexus API key') + parser.add_argument('--print-key', nargs=0, action='printkey', help='Print the cached DNA Nexus API key') + parser.add_argument('--dry-run', help='Perform a dry run without deleting files', action='store_true', default=False) + parser.add_argument('root', help='A directory containing runfolders to process') + parser.add_argument('--logfile', help='A path for the application logfile', default='mokaguys_logger.log') + parser.add_argument('--min-age', help='The age (days) a runfolder must be to be deleted', type=int, default=14) + # Get version from setup.py as version CLI response + version_number = pkg_resources.require("wscleaner")[0].version + parser.add_argument('--version', help='Print version', action='version', version=f"wscleaner v{version_number}") + args = parser.parse_args() + return args + +def main(): + # Parse CLI arguments. Some arguments will exit the program intentionally. See docstring for detail. 
+ args = cli_parser() + + # Setup logging for module. Submodules inherit log handlers and filters + mokaguys_logger.log_setup(args.logfile) + logger = logging.getLogger(__name__) + logger.info(f'START') + + # Setup dxpy with cached authentication token + dx_set_auth() + + # Set root directory and search it for runfolders + # If dry-run CLI flag is given, no directories are deleted by the runfolder manager. + RFM = RunFolderManager(args.root, dry_run=args.dry_run) + logger.info(f'Root directory {args.root}') + local_runfolders = RFM.find_runfolders(min_age=args.min_age) + logger.debug(f'Found local runfolders: {[rf.name for rf in local_runfolders]}') + + for runfolder in local_runfolders: + logger.info(f'Processing {runfolder.name}') + # Delete runfolder if it meets the backup criteria + # runfolder.dx_project is evaluated first as following criteria checks depend on it + if runfolder.dx_project: + fastqs_uploaded = RFM.check_fastqs(runfolder) + logfiles_uploaded = RFM.check_logfiles(runfolder) + if fastqs_uploaded and logfiles_uploaded: + RFM.delete(runfolder) + elif not fastqs_uploaded: + logger.warning(f'{runfolder.name} - FASTQ MISMATCH') + elif not logfiles_uploaded: + logger.warning(f'{runfolder.name} - LOGFILE MISMATCH') + else: + logger.warning(f'{runfolder.name} - DX PROJECT MISMATCH') + + # Record runfolders removed by this iteration + logger.info(f'Runfolders deleted in this instance: {RFM.deleted}') + logger.info(f'END') + # END + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/wscleaner/wscleaner/mokaguys_logger.py b/wscleaner/wscleaner/mokaguys_logger.py new file mode 100644 index 0000000..83f605b --- /dev/null +++ b/wscleaner/wscleaner/mokaguys_logger.py @@ -0,0 +1,41 @@ +#!/usr/bin/env python3 +"""mokaguys_logger.py + +Log messages using the python standard library logging module. 
+ +Version: 1.0 +Timestamp: 30/05/19 +""" + +import logging +from logging.config import dictConfig + +def log_setup(logfile_name, syslog='/dev/log'): + """Setup application logging using python's standard library logging module + + Args: + logfile_name(str): The name of the output logfile written to by the file handler + syslog(str): Output target for the system log handler + """ + logging_config = dict( + version=1.0, + formatters={'log_formatter': {'format': "{asctime} {name}.{module}: {levelname} - {message}", + 'style': '{', 'datefmt': r'%Y-%m-%d %H:%M:%S'}}, + handlers={ + # DEBUG message are ommitted from the console output by setting the stream handler level + # to INFO, making console outputs easier to read. DEBUG messages are still written to + # the application logfile and system log. + 'stream_handler': {'class': 'logging.StreamHandler', 'formatter': 'log_formatter', 'level': logging.INFO}, + 'file_handler': {'class': 'logging.FileHandler', 'formatter': 'log_formatter', 'level': logging.DEBUG, + 'filename': logfile_name}, + 'syslog_handler': {'class': 'logging.handlers.SysLogHandler', 'formatter': 'log_formatter', 'level': logging.DEBUG, + 'address': syslog}}, + root={'handlers': ['file_handler', 'stream_handler', 'syslog_handler'], 'level': logging.DEBUG} + ) + dictConfig(logging_config) + + +if __name__ == '__main__': + log_setup() + log = logging.getLogger('TEST') + log.info('TEST')