FASTQ_Pairkey_Matcher.py

#!/usr/bin/python3

from os import listdir
from datetime import datetime
import getpass
import re
import textwrap
import itertools
import argparse

# we allow you to specify the filename for the shell script
# and provides -h and --help messages for free!
parser = argparse.ArgumentParser(description='build shell script')
parser.add_argument('--shellFilename', help='filename for the shell script',
                        default="junk.sh"
                    )
parser.add_argument('--debug', help='print debug output',
                        action="store_true"
                    )
args = parser.parse_args()
# after the above line args.filename will equal either the specified filename
# or junk.sh if the user didn't specify a filename

# in production we will find the filenames in the filesystem
# files = listdir(".")
# example San_Diego_Honeybee_015_S30_L008_R1_001.fastq

# we are currently working with just a text file of the filenames
files = open("./names.txt").read().split()

# regex to parse the parts of the filename
filePat = re.compile('san_diego_honeybee_(.*)_(.*)_(.*)_(.*)_(.*)', re.IGNORECASE)

# apply the parsing regex and return a dict with info from the parsed filename
def getEntry(f) :
    m = filePat.match(f)
    # the numbered groups from a regex match are 0 (the whole matched string)
    # and then 1 (the 1st paren match), 2 (the 2nd paren match), etc
    return {
        'filename':     m.group(0),
        'beeNumber':    m.group(1),
        'run':          m.group(2),
        'lane':         m.group(3),
        'read':         m.group(4),
        # our pairkey is beenNumber:run:lane, eg "015:S30:L008"
        'pairKey':      m.group(1) + ":" + m.group(2) + ":" + m.group(3)
    }
    

# get a list of the filenames converted to dicts from getEntry()
elist = list(map(getEntry, files))

# get a list of the keys from the elist
pairKeys = list(map(lambda e: e.get("pairKey"), elist))

print("Creating Pairkey Dictionary")
# this will have one value for each pairkey which will be another dict
# which has an R1 and and R2 key
# eg pkDictionary.get("015:S30:L008") will return a dict with R1 and R2
# values representing the R1 and R2 files for that pairkey
pkDictionary = {}

#get the unique pairKeys
pairKeySet = set(pairKeys)

# we are going to create pair dicts for each pair of R1 and R2 filenames
# with the keys R1 and R2.  We will then loop over the getEntry dicts
# putting each file into the appropriate pair dict in the overall pkDictionary
for pk in pairKeySet: 
    # each entry will be an empty dictionary to start.  Later we will
    # add R1 and R2 to these dictionaries
    # for now pkDictionary.get("015:S30:L008") will return an empty dict 
    pkDictionary[pk] = {}

# place the entries in the Pairkey dictionary.  
for fileInfo in elist: 
    pk                  = fileInfo["pairKey"]   # get the pairkey for this file
    keyDict             = pkDictionary[pk]      # find the matching pair dict
    fileRead            = fileInfo["read"]      # find R1 or R2 from filename
    keyDict[fileRead]   = fileInfo              # add to pair dict as R1 or R2

# print out the items for debugging/logging if debug flag is used 
if ( args.debug ):
    for k, v in pkDictionary.items():
        print(textwrap.dedent(f"""
        -----
        R1 is {v.get("R1")}
        R2 is {v.get("R2")} """))

# lets validate!
# look for dicts in pkDictionary missing one of the filenames
# and print a warning message
for k, v in pkDictionary.items():
    if  ( (v.get("R1") == None) or (v.get("R2") == None) ):
        print("\n++++++\n")
        print("Invalid file pair found:")
        print(v)

# output a shell script
# FIXME -- what if junk.sh already exists?
print(f"outputting shell script to {args.shellFilename}")
with open(args.shellFilename, "w") as output:
    header = textwrap.dedent(f"""
        #/bin/bash
        # this script generated by newGen.py
        # creation time {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
        # created by user {getpass.getuser()}
        # 

        """)
# deleted print header section to facilitate downstream analysis
    print(file=output)
    for k, v in pkDictionary.items():
        r1 = v.get("R1")
        r2 = v.get("R2")
        if  ( (r1 is not None) and (r2 is not None) ):
            # the "combine" below is the example command for the shell script
            cmd = textwrap.dedent(f"""
                # command generated by newGen.py
                combine {r1["filename"]} {r2["filename"]}
                """)
# deleted the print cmd option in order to  facilitate downstream analysis 
            print( file=output)