From 45eb1aef33b95f15db86aa78f1ff4075c4684b13 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Tue, 1 Dec 2020 10:41:18 -0800 Subject: [PATCH 1/9] Output all informational BAM tags from 10x bams MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The current implementation of the `bam2fasta` step does not retain information about whether the read was aligned or not in the read ID for the fasta. This can be found by looking at the aligned/unaligned fastas separately, but I'd like to have that information entirely in the read name and not need to look anywhere else. ### What are all the supported flags? Here are links to all supported flags and some of the important ones - [10x Genomics specific tags](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/output/bam) - `GN`: *Semicolon-separated list of gene names that are compatible with this alignment. Gene names are specified with `gene_name` key in the reference GTF attribute column.* - `RE`: *Single character indicating the region type of this alignment (E = exonic, N = intronic, I = intergenic).* - [Official SAM tags](https://samtools.github.io/hts-specs/SAMtags.pdf) - `NH`: *Number of reported alignments that contain the query in the current record* - `HI`: *Query hit index* ### Aligned, but no gene assigned - [10x Genomics specific tags](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/output/bam) - `GN`: not present - `RE`: present - [Official SAM tags](https://samtools.github.io/hts-specs/SAMtags.pdf) - `NH`: integer > 0 - `HI`: integer >= 0
``` Tue 1 Dec - 10:25  ~/data_sm/immune-evolution/pipeline-results/mouse/kmermaid/lung--mouse--remove-ribo/10x-bams  olga@lrrr  samtools view MACA_18m_M_LUNG_52__possorted_genome_bam.bam | head A00111:77:H3YKNDMXX:1:2115:20862:34710 272 chr1 3014879 0 90M * 0 0 GGGCTTATAAAGTTTGCAAGTCTAATGGGCCTCTATTTGCTGTGATGGCTGAGTAGGCCATCTGTGGATACATTGGCTGCTAGTGACAAG FF----F8FF--FF8-F--FF-------F----F-FFF-F--F-----FF8F-F-F-F-FF---8-FFFF--F---F-F--F--F-F--F NH:i:5 HI:i:3 AS:i:68 nM:i:10 RE:A:I BC:Z:GCAGTAGA QT:Z:F8FFFF8F CR:Z:TTAGGACCACGAAATA CY:Z:8-8-88F8FFFFFFFF CB:Z:TTAGGACCACGAAATA-1 UR:Z:GGGCTCCACA UY:Z:--F-FFFFFF UB:Z:GGGCTCCACA RG:Z:MACA_18m_M_LUNG_52:MissingLibrary:1:H3YKNDMXX:1 A00111:77:H3YKNDMXX:1:1405:5104:9893 272 chr1 3014893 0 90M * 0 0 TGCAAGTCCAATGGGCCTCTATTTGCAGTGATGGCCGACTAGGCCATCTTTTGATACATATGCAGCAAGAGACAAGAGCTTCGGGGTACT FFF-FFFF8--FFFFFFFFF-F8--F-F8FF-FFFFFFF-8F-F88-F-FFFF-F-F-F-FFF-FF--FFFFF--FFFFFFFFFFFFFFF NH:i:10 HI:i:8 AS:i:84 nM:i:2 RE:A:I BC:Z:CAGTACTG QT:Z:F8FFFFFF CR:Z:ACGTCAACAGTAAGAT CY:Z:8F88888F88-FFFFF CB:Z:ACGTCAACAGTAAGAT-1 UR:Z:CGGACACGGT UY:Z:FF-FFFFFFF UB:Z:CGGACACGGT RG:Z:MACA_18m_M_LUNG_52:MissingLibrary:1:H3YKNDMXX:1 A00111:77:H3YKNDMXX:1:2323:22173:34882 272 chr1 3014912 0 88M2S * 0 0 TATTTGCAGTGATGGCCGACTAGGCCATCTTTTGTTAGATTTGCAGCTAGAGACAAGAGCTCCGGGGTACTGGTTTGTTCATATTGTTGC F-FF-8FFF-F-8-FF--F8F-FF---F-FFF8F-FF-FF-F---FF-FF-F----FFFF---FFFFFF-FFFFF-F-FFFFFF--F--- NH:i:6 HI:i:3 AS:i:76 nM:i:5 RE:A:I BC:Z:GCAGTAGA QT:Z:FFFFFFFF CR:Z:TGGGCGTAGATCATGG CY:Z:888F888FFFFFFFFF CB:Z:TGGGCGTAGATCACGG-1 UR:Z:GACGGTGCGG UY:Z:FFFFFFFFFF UB:Z:GACGGTGCGG RG:Z:MACA_18m_M_LUNG_52:MissingLibrary:1:H3YKNDMXX:1 A00111:77:H3YKNDMXX:2:1280:1832:14591 272 chr1 3015028 0 90M * 0 0 TAGCTCCTTGGGTAATTTCTCTAGCTCCTCCATTGGGGGCCGTGTGACCCATCCAATAGCTGACTGTGATCATCCACTTATGTGTTTGCT F-FFFFFFFFFFFF--FFFFF--FFFFF-FF-FFFFFFFFFFFFFFFFFF-FFF--FFFFFFFFFFFFFFFFFFFFFFF-FFFFFFFFFF NH:i:7 HI:i:4 AS:i:86 nM:i:1 RE:A:I BC:Z:TTCCCGAC QT:Z:FFFFFFFF CR:Z:CTCGTCATCTGACCTC 
CY:Z:88888F88FFFFFF8F CB:Z:CTCGTCATCTGACCTC-1 UR:Z:GAATAGCAGC UY:Z:FFF-FFFFFF UB:Z:GAATAGCAGC RG:Z:MACA_18m_M_LUNG_52:MissingLibrary:1:H3YKNDMXX:2 A00111:77:H3YKNDMXX:1:1430:23863:10379 272 chr1 3016933 0 90M * 0 0 ATGTATTTTATATTATTTGTGACTATTGAGAAGGGTGTTGTTTCCCTAATTTCTTTCTCAGCCTGTTTATCCTTTGTGTACAGAAAGGCC FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF NH:i:6 HI:i:5 AS:i:88 nM:i:0 RE:A:I BC:Z:TTCCCGAC QT:Z:FFFFFFFF CR:Z:CGTTAGAAGACGCTTT CY:Z:F8F88FFFF8FFFFFF CB:Z:CGTTAGAAGACGCTTT-1 UR:Z:AACTGTTTCG UY:Z:FFFFFFFFFF UB:Z:AACTGTTTCG RG:Z:MACA_18m_M_LUNG_52:MissingLibrary:1:H3YKNDMXX:1 A00111:77:H3YKNDMXX:1:1227:6424:20165 272 chr1 3018511 3 90M * 0 0 TTATTTATGTTGTTATTGAAGATCAGCCTTAGTCCATGGTGATCTGATAGGATGCATGGGACAATTTCAATATTTTTGTATATGTTGACG FFF-FFFFFFFFFFFFFFFFFFFFF-FFFFFFFFFFFFFFFFFFFFFFFFFFF-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF NH:i:2 HI:i:2 AS:i:88 nM:i:0 RE:A:I BC:Z:CAGTACTG QT:Z:8FFFFFFF CR:Z:CATATGGGTTGTTTGG CY:Z:8F88-F88FF-FFF-F CB:Z:CATATGGGTTGTTTGG-1 UR:Z:CTTTATGTGT UY:Z:-FFFFFFFFF UB:Z:CTTTATGTGT RG:Z:MACA_18m_M_LUNG_52:MissingLibrary:1:H3YKNDMXX:1 A00111:77:H3YKNDMXX:1:2214:12319:26866 0 chr1 3025870 255 90M * 0 0 CTGGTAATAAAGACACATGCCCTATTATGTTCATAGCAGCCTTATTTATAAAAGCCAGAAGCTGGAAAGAACCCAGATGCCCCTCAACAG FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF NH:i:1 HI:i:1 AS:i:88 nM:i:0 RE:A:I BC:Z:TTCCCGAC QT:Z:FFFFFFFF CR:Z:CCTTGAGTTTCATGTG CY:Z:FF8888FFFFFFFFFF UR:Z:TTTTGCAAAT UY:Z:FFFFFFFFFF UB:Z:TTTTGCAAAT RG:Z:MACA_18m_M_LUNG_52:MissingLibrary:1:H3YKNDMXX:1 A00111:77:H3YKNDMXX:1:2206:23827:3834 256 chr1 3026959 1 41M302444N49M * 0 0 TAACTAGTGTCGCAACAATAAAATTTGAGCTTTGATCAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF-FFFFFFFFFFFFFFF-F8FFFFF-FFFFFFFFFFFFFFFF8F8F-FFFF NH:i:3 HI:i:3 AS:i:77 nM:i:0 RE:A:I BC:Z:GCAGTAGA QT:Z:FFFFFFFF CR:Z:CGATGGCGTCAACATC CY:Z:FF888F8FFFFFFFFF CB:Z:CGATGGCGTCAACATC-1 UR:Z:CGCCCGTTGC UY:Z:FFFFFFFFFF 
UB:Z:CGCCCGTTGC RG:Z:MACA_18m_M_LUNG_52:MissingLibrary:1:H3YKNDMXX:1 A00111:77:H3YKNDMXX:1:2363:25120:29496 256 chr1 3026959 1 41M302444N49M * 0 0 TAACTAGTGTCGCAACAATAAAATTTGAGCTTTGATCAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF-FFFFFFF88FF8FFFFFFFFFFF8F-FFFFFFFF-F NH:i:3 HI:i:3 AS:i:77 nM:i:0 RE:A:I BC:Z:TTCCCGAC QT:Z:FFFFFFFF CR:Z:TAGTGGTCAGCGTTCG CY:Z:888F88F8FFFFFFFF CB:Z:TAGTGGTCAGCGTTCG-1 UR:Z:CGTTCATGCC UY:Z:FFFFFFFFFF UB:Z:CGTTCATGCC RG:Z:MACA_18m_M_LUNG_52:MissingLibrary:1:H3YKNDMXX:1 A00111:77:H3YKNDMXX:2:1380:6903:5149 16 chr1 3027849 255 90M * 0 0 ACATGTATTTCTCTCATTTTTACAACACAGTTTTGTTATTGACACTTCACTCTAACATCAGAAGTGATTGCAAGAAAAAAGTTGTTTTTT FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF NH:i:1 HI:i:1 AS:i:88 nM:i:0 RE:A:I BC:Z:TTCCCGAC QT:Z:FFFFFFFF CR:Z:CAGTAACGTACGAAAT CY:Z:FFFFFFFFFFFFFFFF CB:Z:CAGTAACGTACGAAAT-1 UR:Z:AAGCGTGGAT UY:Z:FFFFFFFFFF UB:Z:AAGCGTGGAT RG:Z:MACA_18m_M_LUNG_52:MissingLibrary:1:H3YKNDMXX:2 ```
### Aligned, with assigned gene - [10x Genomics specific tags](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/output/bam) - `GN`: present - `RE`: present - [Official SAM tags](https://samtools.github.io/hts-specs/SAMtags.pdf) - `NH`: integer > 0 - `HI`: integer >= 0
``` Tue 1 Dec - 10:25  ~/data_sm/immune-evolution/pipeline-results/mouse/kmermaid/lung--mouse--remove-ribo/10x-bams  olga@lrrr  samtools view MACA_18m_M_LUNG_52__possorted_genome_bam.bam | rg GN: | head A00111:77:H3YKNDMXX:1:2433:27670:3756 1040 chr1 3365854 255 90M * 0 0 AAAGAAATTGTGATATATTTCTTTGTCACATGATCAGCATAGTAGATGATGTGTCTTCATTTCCTACAAAAAAGAGGAAGCAGTTAAAAT FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF NH:i:1 HI:i:1 AS:i:88 nM:i:0 TX:Z:ENSMUST00000195335.1,+2606,90M GX:Z:ENSMUSG00000103377.1 GN:Z:Gm37180 RE:A:E BC:Z:GCAGTAGA QT:Z:FFFFFFFF CR:Z:GCTGCTTTCGATGAGG CY:Z:88888888FFFFFFFF CB:Z:GCTGCTTTCGATGAGG-1 UR:Z:CGTTGATAGT UY:Z:FFFFFFFFFF UB:Z:CGTTGATAGT RG:Z:MACA_18m_M_LUNG_52:MissingLibrary:1:H3YKNDMXX:1 A00111:77:H3YKNDMXX:2:2488:25409:10833 1040 chr1 3365875 255 90M * 0 0 TTTGTCACATGATCAGCATAGTAGATGATGTGTCTTCATTTCCTACAAAAAAGAGGAAGCAGTTAAAATTGTGTGTGTGTGGTTCTGGAT FFFFF8FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF--FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF NH:i:1 HI:i:1 AS:i:88 nM:i:0 TX:Z:ENSMUST00000195335.1,+2585,90M GX:Z:ENSMUSG00000103377.1 GN:Z:Gm37180 RE:A:E BC:Z:GCAGTAGA QT:Z:FFFFFFFF CR:Z:ATCATGGAGTGGTAGC CY:Z:8F888FFFFFFFFFFF CB:Z:ATCATGGAGTGGTAGC-1 UR:Z:CCCTTTATGT UY:Z:FFFFFFFFFF UB:Z:CCCTTTATGT RG:Z:MACA_18m_M_LUNG_52:MissingLibrary:1:H3YKNDMXX:2 A00111:77:H3YKNDMXX:1:2424:27543:35164 16 chr1 3365890 255 90M * 0 0 GCATAGTAGATGATGTGTCTTCATTTCCTACAAAAAAGAGGAGGCAGTTAAAATTGTGTGTGTGTGGTTCTGGATTAAATATTATTAATC FF-FFFFFFFFFFFFFFFFFFFFF8FFFFFFFFFFFFFFFFFFFFF8-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF NH:i:1 HI:i:1 AS:i:86 nM:i:1 TX:Z:ENSMUST00000195335.1,+2570,90M GX:Z:ENSMUSG00000103377.1 GN:Z:Gm37180 RE:A:E BC:Z:AGTAGTCT QT:Z:FFFFFFFF CR:Z:ATCATGGAGTGGTAGC CY:Z:8F88888FFFFFFFFF CB:Z:ATCATGGAGTGGTAGC-1 UR:Z:CCCTTTATGT UY:Z:FFFFFFFFFF UB:Z:CCCTTTATGT RG:Z:MACA_18m_M_LUNG_52:MissingLibrary:1:H3YKNDMXX:1 A00111:77:H3YKNDMXX:2:2137:20609:29387 1040 chr1 3365906 255 79M1I10M * 0 0 
GTCTTCATTTCCTACAAAAAAGAGGAAGCAGTTAAAATTGTGTGTGTGTGGTTCTGGATTAAATATTATTAATCAAAAAAGGGGGCTGTC FFFFFFFF8FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF NH:i:1 HI:i:1 AS:i:83 nM:i:0 TX:Z:ENSMUST00000195335.1,+2555,10M1I79M GX:Z:ENSMUSG00000103377.1 GN:Z:Gm37180 RE:A:E BC:Z:CAGTACTG QT:Z:FFFFFFFF CR:Z:ATCATGGAGTGGTAGC CY:Z:8F88FF8FFFFFFFFF CB:Z:ATCATGGAGTGGTAGC-1 UR:Z:CCCTTTATGT UY:Z:FFFFFFFFFF UB:Z:CCCTTTATGT RG:Z:MACA_18m_M_LUNG_52:MissingLibrary:1:H3YKNDMXX:2 A00111:77:H3YKNDMXX:1:1252:23511:15781 1040 chr1 3365915 255 90M * 0 0 TCCTACAAAAAAGAGGAAGCAGTTAAAATTGTGTGTGTGTGGTTCTGGATTAAATATTATTAATCAAAAAGGGGGCTGTCAGTAGGATGA FFF8FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF NH:i:1 HI:i:1 AS:i:88 nM:i:0 TX:Z:ENSMUST00000195335.1,+2545,90M GX:Z:ENSMUSG00000103377.1 GN:Z:Gm37180 RE:A:E BC:Z:GCAGTAGA QT:Z:FFFFFFFF CR:Z:GCTGCTTTCGATGAGG CY:Z:FF8FF8FFFFFFFFFF CB:Z:GCTGCTTTCGATGAGG-1 UR:Z:CGTTGATAGT UY:Z:FFFFFFFFFF UB:Z:CGTTGATAGT RG:Z:MACA_18m_M_LUNG_52:MissingLibrary:1:H3YKNDMXX:1 A00111:77:H3YKNDMXX:2:1378:22209:19664 1040 chr1 3365950 255 35M1I54M * 0 0 TGTGTGGTTCTGGATTAAATATTATTAATCAAAAAAGGGGGCTGTCAGTAGGATGATATAAGATATAGATGTAGTTTATCTCCTAATCCC FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF NH:i:1 HI:i:1 AS:i:83 nM:i:0 TX:Z:ENSMUST00000195335.1,+2511,54M1I35M GX:Z:ENSMUSG00000103377.1 GN:Z:Gm37180 RE:A:E BC:Z:CAGTACTG QT:Z:FFFFFFFF CR:Z:ATCATGGAGTGGTAGC CY:Z:8F88FFFFFFFFFFFF CB:Z:ATCATGGAGTGGTAGC-1 UR:Z:CCCTTTATGT UY:Z:FFFFFFFFFF UB:Z:CCCTTTATGT RG:Z:MACA_18m_M_LUNG_52:MissingLibrary:1:H3YKNDMXX:2 A00111:77:H3YKNDMXX:1:1214:26558:27007 16 chr1 3365959 255 90M * 0 0 CTGGATTAAATATTATTAATCAAAAAGGGGGCTGTCAGTAGGATGATATAAGATATAGATGTAGTTTATCTCCTAATCCCACCCTTCCTC FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF NH:i:1 HI:i:1 AS:i:88 nM:i:0 TX:Z:ENSMUST00000195335.1,+2501,90M GX:Z:ENSMUSG00000103377.1 GN:Z:Gm37180 RE:A:E 
BC:Z:TTCCCGAC QT:Z:F8FFFFFF CR:Z:GCTGCTTTCGATGAGG CY:Z:8FF8F8FFFF8FFFFF CB:Z:GCTGCTTTCGATGAGG-1 UR:Z:CGTTGATAGT UY:Z:FFFFFFFFFF UB:Z:CGTTGATAGT RG:Z:MACA_18m_M_LUNG_52:MissingLibrary:1:H3YKNDMXX:1 A00111:77:H3YKNDMXX:2:1169:24306:30655 1040 chr1 3365985 255 90M * 0 0 GGGGGCTGTCAGTAGGATGATATAAGATATAGATGTAGTTTATCTCCTAATCCCACCCTTCCTCAAAGATTTCTGTCAGTGACATTGTTA FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF NH:i:1 HI:i:1 AS:i:88 nM:i:0 TX:Z:ENSMUST00000195335.1,+2475,90M GX:Z:ENSMUSG00000103377.1 GN:Z:Gm37180 RE:A:E BC:Z:GCAGTAGA QT:Z:FFFFFFFF CR:Z:GCTGCTTTCGATGAGG CY:Z:8F88F8FFFFFFFFFF CB:Z:GCTGCTTTCGATGAGG-1 UR:Z:CGTTGATAGT UY:Z:FFFFFFFFFF UB:Z:CGTTGATAGT RG:Z:MACA_18m_M_LUNG_52:MissingLibrary:1:H3YKNDMXX:2 A00111:77:H3YKNDMXX:1:1339:19226:22232 1040 chr1 3366061 255 90M * 0 0 CAGTGACATTGTTATCAGACTCAAACATGGGGATGATTCTGCCAGTGACTTTAATTACTTTCCCATCAAAGGCCCATTGAGCAGTTTCAC FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF NH:i:1 HI:i:1 AS:i:88 nM:i:0 TX:Z:ENSMUST00000195335.1,+2399,90M GX:Z:ENSMUSG00000103377.1 GN:Z:Gm37180 RE:A:E BC:Z:TTCCCGAC QT:Z:FFFFFFFF CR:Z:GCTGCTTTCGATGAGG CY:Z:FFFFFFFFFFFFFFFF CB:Z:GCTGCTTTCGATGAGG-1 UR:Z:CGTTGATAGT UY:Z:FFFFFFFFFF UB:Z:CGTTGATAGT RG:Z:MACA_18m_M_LUNG_52:MissingLibrary:1:H3YKNDMXX:1 A00111:77:H3YKNDMXX:2:2120:12717:35509 1040 chr1 3376586 255 90M * 0 0 TTGTTTTGGTTTTGTGTGGGTGTGGTTTTTTTAAATATATTCTTGTTTTCTTGGGTTTTGTGAAGACAGTTCTCTTGAATTGTGTTTCGG FFFFFF8FFF-FFFFFFF8FFFFFFFF8FFFFFF-FFF--8FFFFFFFFF-FFFFFF-F88FF8F8FFFFFFFF8FFFFFFFFFFFFFF- NH:i:1 HI:i:1 AS:i:88 nM:i:0 TX:Z:ENSMUST00000192336.1,+1113,90M GX:Z:ENSMUSG00000104017.1 GN:Z:Gm37363 RE:A:E BC:Z:AGTAGTCT QT:Z:8FF8F8FF CR:Z:CATTATCGTACTCGCG CY:Z:FF8F8-FF8FFFFF8F CB:Z:CATTATCGTACTCGCG-1 UR:Z:AACTAAGATA UY:Z:FFFFFFFFFF UB:Z:AACTAAGATA RG:Z:MACA_18m_M_LUNG_52:MissingLibrary:1:H3YKNDMXX:2 ```
### *Unaligned* (thus no gene assigned) - [10x Genomics specific tags](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/output/bam) - `GN`: Not present - `RE`: Not present - [Official SAM tags](https://samtools.github.io/hts-specs/SAMtags.pdf) - `NH`: integer == 0 - `HI`: integer == 0
``` Tue 1 Dec - 10:35  ~/data_sm/immune-evolution/pipeline-results/mouse/kmermaid/lung--mouse--remove-ribo/10x-bams  olga@lrrr  tail MACA_18m_M_LUNG_52__CTACATTGTTCGGGCT.sam A00111:77:H3YKNDMXX:2:1184:18973:15248 4 * 0 0 * * 0 0 CTCCTACGGGCCAGGGGGATCCTATCACAAAAGAATAAAGCAGCCTGATTGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF NH:i:0 HI:i:0 AS:i:57 nM:i:1 uT:A:1 BC:Z:GCAGTAGA QT:Z:FFFF8FFF CR:Z:CTACATTGTTCGGGCT CY:Z:FF8FFFFFFFFFFFFF CB:Z:CTACATTGTTCGGGCT-1 UR:Z:TAACGTTAGC UY:Z:FFFFFFFFFF UB:Z:TAACGTTAGC RG:Z:MACA_18m_M_LUNG_52:MissingLibrary:1:H3YKNDMXX:2 A00111:77:H3YKNDMXX:2:1185:7355:3771 4 * 0 0 * * 0 0 GCAGTGGTATCAACGCAGAGTACATGGGGCTCATCCGGTCTCTTTGGCCTCGCCGGTAGAAGCAAGATGACGAAGGGACCGTCATCCTTT FFFFFFFFF8FFFFFFFFFFFFFF-FFFFFFF-FFFFFFF-F-F-FFFFFFFFFFFF-FFFFFF-FFFFFFFFFFFFF-FFFF-FFFF-F NH:i:0 HI:i:0 AS:i:57 nM:i:2 uT:A:1 BC:Z:GCAGTAGA QT:Z:FFFFFFFF CR:Z:CTACATTGTTCGGGCT CY:Z:F88F8F8FFFFFFFFF CB:Z:CTACATTGTTCGGGCT-1 UR:Z:ACTAGATAAC UY:Z:FFFFFFFFFF UB:Z:ACTAGATAAC RG:Z:MACA_18m_M_LUNG_52:MissingLibrary:1:H3YKNDMXX:2 A00111:77:H3YKNDMXX:2:1187:27362:31563 4 * 0 0 * * 0 0 CGGTTAATAAAAAAAAATACCCAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGGAATAAAATTTAAAAAAAAAAAAAAATATA F-F-8--F-F-F-F-FF-FFF---FFFF-FF-FFFF-FF--F-FF-F-FFFF-FF--FF--F----F------F--FF-FFFF---F-FF NH:i:0 HI:i:0 AS:i:53 nM:i:5 uT:A:1 BC:Z:GCAGTAGA QT:Z:FFFFFFF8 CR:Z:CTACATTGTTCGGGCT CY:Z:88F88FFFFFFFFFFF CB:Z:CTACATTGTTCGGGCT-1 UR:Z:CGCAGAATTC UY:Z:FFFFFFFFFF UB:Z:CGCAGAATTC RG:Z:MACA_18m_M_LUNG_52:MissingLibrary:1:H3YKNDMXX:2 A00111:77:H3YKNDMXX:2:1201:28736:20353 4 * 0 0 * * 0 0 GCTCCAGCTCCTGCCCACCCACCCCCAAATACCATAACATACACTTATTAAAATACCCACAATTAGAGCCCTTGCAGAGATTTATAAAAA FFFFFFFFFFFFF-FF-FFF--FFFF-----FF-F-F--F-F---8--F-F----F8F----FF--F-8F--F--F-8--88F---FF-- NH:i:0 HI:i:0 AS:i:37 nM:i:10 uT:A:1 BC:Z:GCAGTAGA QT:Z:FFFFFFFF CR:Z:CTACATTGTTCGGGCT CY:Z:888FF8FFFFFFFFFF CB:Z:CTACATTGTTCGGGCT-1 UR:Z:ATCCAGTAGA UY:Z:FFFFFFFFFF 
UB:Z:ATCCAGTAGA RG:Z:MACA_18m_M_LUNG_52:MissingLibrary:1:H3YKNDMXX:2 A00111:77:H3YKNDMXX:2:1202:12680:3349 4 * 0 0 * * 0 0 GGGGGGGGGGGGGGGGGGGGGGGGTGTGGGTTGGGGAGGTGAGTGGGGGGCTGAGGTGGGGGATGATAAGAAAAGGGAAGGGAATAGGAA F-FF-FFFFFFFF-F----FFF-F-F-F----F-F-------------F------------------F--F--------8--------8- NH:i:0 HI:i:0 AS:i:32 nM:i:3 uT:A:1 BC:Z:GCAGTAGA QT:Z:FFFFFFFF CR:Z:CTACATTGTTCGGGCT CY:Z:8F8-F8-8FFF-F-FF CB:Z:CTACATTGTTCGGGCT-1 UR:Z:AGCAGGGAGC UY:Z:FFFFF-FFFF UB:Z:AGCAGGGAGC RG:Z:MACA_18m_M_LUNG_52:MissingLibrary:1:H3YKNDMXX:2 A00111:77:H3YKNDMXX:2:1203:30400:17096 4 * 0 0 * * 0 0 AAGCAGTGGTATCAACGCAGAGTACATGGGGATTTATTTTCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTACTTTCTAAAT FFFFFFFF-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF-F-FFFF-FFFFFFFFFF-FFF-FF8FF8F8-FF-F8F---F NH:i:0 HI:i:0 AS:i:53 nM:i:1 uT:A:1 BC:Z:GCAGTAGA QT:Z:F8FFF8FF CR:Z:CTACATTGTTCGGGCT CY:Z:-F88F--F-FFFF8FF CB:Z:CTACATTGTTCGGGCT-1 UR:Z:TCCCTTACGC UY:Z:FFFFFFFFFF UB:Z:TCCCTTACGC RG:Z:MACA_18m_M_LUNG_52:MissingLibrary:1:H3YKNDMXX:2 A00111:77:H3YKNDMXX:2:1206:15483:21825 4 * 0 0 * * 0 0 CGGTGTTTAAAAAAAAAAAAAAATAAAAAAAAAAAAAATAAACAAAAAAAAAAAAAAAAAAAAAATAATTTAAAAAAAAAACAAAAAAAA F---------F-F--F-FF-FF--F--FFFFFFFF-FF-FFFF-FF-F--F-FFFFFFFFFF8F--FF-F-F--8-FFF---FFFF--8F NH:i:0 HI:i:0 AS:i:50 nM:i:10 uT:A:1 BC:Z:GCAGTAGA QT:Z:FFFFFFFF CR:Z:CTACATTGTTCGGGCT CY:Z:8F8FFF8FFFFFFFFF CB:Z:CTACATTGTTCGGGCT-1 UR:Z:ACCTTTAATC UY:Z:FFFFFFFFFF UB:Z:ACCTTTAATC RG:Z:MACA_18m_M_LUNG_52:MissingLibrary:1:H3YKNDMXX:2 A00111:77:H3YKNDMXX:2:1208:4155:18208 4 * 0 0 * * 0 0 GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTAGGGGGGAGAAAAGACCAAGGAGAGGGAGAACAGGGCGCAGGCGGAG -FFFFFFFFF-FFFFFFFFF--FF-FF-F-F-F--------------------------------------------------------- NH:i:0 HI:i:0 AS:i:43 nM:i:3 uT:A:1 BC:Z:GCAGTAGA QT:Z:FFFFFFFF CR:Z:CTACATTGTTCGGGCT CY:Z:FF888FFFFFFFFFFF CB:Z:CTACATTGTTCGGGCT-1 UR:Z:AAGACACTAT UY:Z:FFFFFFFFFF UB:Z:AAGACACTAT RG:Z:MACA_18m_M_LUNG_52:MissingLibrary:1:H3YKNDMXX:2 A00111:77:H3YKNDMXX:2:1208:27380:25207 4 * 0 0 * * 
0 0 AGGGGAAAAAAAAAAAAAAAATGCAAAAAAAAAAAAAAAAATAAAATAAAAAAAAAAAAAAAAAATCAGAAAATAAAAAAAAAAAAATAA --F---F-FF-FF-FFF--F-----FFFFF----FF-F--F-F--F-FFFFFFF---FFFFF-----F-------FFF-FFFF8-F---- NH:i:0 HI:i:0 AS:i:40 nM:i:7 uT:A:1 BC:Z:GCAGTAGA QT:Z:FFFFFFFF CR:Z:CTACATTGTTCGGGCT CY:Z:FF8FFFFFFFFFFFFF CB:Z:CTACATTGTTCGGGCT-1 UR:Z:CTCCATACTG UY:Z:FFFFFFFFFF UB:Z:CTCCATACTG RG:Z:MACA_18m_M_LUNG_52:MissingLibrary:1:H3YKNDMXX:2 A00111:77:H3YKNDMXX:2:1209:7663:1736 4 * 0 0 * * 0 0 GTGGTATCAACGCAGAGTACATGGGATTGAAGAAATTGCAGAAAACTGTAGAAGGATAAGCCGGCCCTTATATAAACATTTTTGTAGGAT -F-F-F-FFFFFFFFFFFFFFFFFFFF--FFFF-F--FFF--FFFFFFFF-FFF-F---F-----F---FFFFFFFF8F-FFFFFF-FFF NH:i:0 HI:i:0 AS:i:57 nM:i:3 uT:A:1 BC:Z:GCAGTAGA QT:Z:FFFFFFFF CR:Z:CTACATTGTTCTGGCT CY:Z:-F--FFF8FFF-FFFF CB:Z:CTACATTGTTCGGGCT-1 UR:Z:CAACCTGCGA UY:Z:FFFFFF-FFF UB:Z:CAACCTGCGA RG:Z:MACA_18m_M_LUNG_52:MissingLibrary:1:H3YKNDMXX:2 ```
Thus, this PR adds at least `NH`, `HI`, and `RE` tags, plus all known tags just in case they're needed for downstream processing. --- nextflow.config | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nextflow.config b/nextflow.config index 35cbc9b3..b72c4365 100644 --- a/nextflow.config +++ b/nextflow.config @@ -21,7 +21,7 @@ params { // Parsing 10x bam files tenx_tgz = false - tenx_tags = "CB,XC,UB,XM,XB,RG,GN,GX,TX" + tenx_tags = "CB,CR,CY,XC,UB,UR,UY,AN,TR,XM,XB,RG,GN,GX,TX,NH,HI,AS,nM,RE,MM,pa,xf,fb,fr,fq,fx" tenx_cell_barcode_pattern = '(CB|XC):Z:([ACGT]+)(\\-1)?' tenx_molecular_barcode_pattern = '(UB|XB|XM):Z:([ACGT]+)' tenx_min_umi_per_cell = 0 @@ -219,4 +219,4 @@ def check_max(obj, type) { return obj } } -} \ No newline at end of file +} From ca38bac93157889e8455d8e3c0b318fdcf151547 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Mon, 4 Jan 2021 16:48:51 -0800 Subject: [PATCH 2/9] Update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index f3031008..70da83be 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -36,6 +36,7 @@ barcode fastq * Fix the use of `skip_multiqc` flag condition with if and not when * Updated sencha=1.0.3 to fix the bug in memory errors possibly with the numpy array on unique filenames ([PR #96 on sencha](https://github.com/czbiohub/leaftea/pull/96)) * Do `sourmash compute` on all input ksizes, and all peptide molecule types, at once to save disk reading/writing efforts +* Don't save translate csvs and jsons by default, add separate `--save_translate_json` and `--save_translate_csv` ### `Dependencies` From 285ba5883d312092ab12f0dcd1d0e507a215b1b5 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Mon, 4 Jan 2021 16:49:03 -0800 Subject: [PATCH 3/9] add save_translate_{json,csv} params --- nextflow.config | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/nextflow.config b/nextflow.config index 6829d7e0..80aba6cd 100644 --- a/nextflow.config +++ 
b/nextflow.config @@ -50,6 +50,10 @@ params { translate_jaccard_threshold = 0.95 reference_proteome_fasta = false bloomfilter_tablesize = '1e8' + // Saving the translate results for each dataset makes it take extra long + // Recommended for debugging purposes only + save_translate_csv = false + save_translate_json = false // Ribosomal RNA removal From bc905d43e53c55f271e378682346ef629bc5d87b Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Mon, 4 Jan 2021 16:49:15 -0800 Subject: [PATCH 4/9] Don't save json and csv from translate by default --- main.nf | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/main.nf b/main.nf index 3ae51693..afaea958 100644 --- a/main.nf +++ b/main.nf @@ -440,6 +440,10 @@ Channel .map { row -> file(row) } .set { sortmerna_fasta } +// --- Parse Translate parameters --- +save_translate_csv = params.save_translate_csv +save_translate_json = params.save_translate_json + // --- Parse the Sourmash parameters ---- ksizes = params.ksizes?.toString().tokenize(',') @@ -1195,13 +1199,16 @@ if (!params.remove_ribo_rna) { set val(sample_id), file("${sample_id}__coding_summary.json") into ch_coding_scores_json script: + csv_flag = save_translate_csv ? "--csv ${sample_id}__coding_scores.csv" : '' + json_flag = save_translate_json ? 
"--json-summary ${sample_id}__coding_summary.json" : '' + """ sencha translate \\ --molecule ${molecule} \\ --coding-nucleotide-fasta ${sample_id}__coding_reads_nucleotides.fasta \\ --noncoding-nucleotide-fasta ${sample_id}__noncoding_reads_nucleotides.fasta \\ - --csv ${sample_id}__coding_scores.csv \\ - --json-summary ${sample_id}__coding_summary.json \\ + ${csv_flag} \\ + ${json_flag} \\ --jaccard-threshold ${jaccard_threshold} \\ --peptide-ksize ${peptide_ksize} \\ --peptides-are-bloom-filter \\ From c884bd3d836c9147219de6e5a57011bd601a881a Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Mon, 4 Jan 2021 17:07:22 -0800 Subject: [PATCH 5/9] Add dummy csv and json so nextflow doesn't complain --- main.nf | 49 ++++++++++++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 19 deletions(-) diff --git a/main.nf b/main.nf index afaea958..e46eb008 100644 --- a/main.nf +++ b/main.nf @@ -1184,7 +1184,14 @@ if (!params.remove_ribo_rna) { process translate { tag "${sample_id}" label "low_memory_long" - publishDir "${params.outdir}/translate/", mode: params.publish_dir_mode + publishDir "${params.outdir}/translate/", mode: params.publish_dir_mode, + saveAs: { + filename -> + if (save_translate_csv and filename.indexOf(".csv") > 0) "description/$filename" + if (save_translate_json and filename.indexOf(".json") > 0) "description/$filename" + else if (filename.indexOf(".sig") > 0) "sigs/$filename" + else null + } input: set bloom_id, molecule, file(bloom_filter) from ch_sencha_bloom_filter.collect() @@ -1195,26 +1202,30 @@ if (!params.remove_ribo_rna) { set val(sample_id), file("${sample_id}__noncoding_reads_nucleotides.fasta") into ch_noncoding_nucleotides_potentially_empty set val(sample_id), file("${sample_id}__coding_reads_peptides.fasta") into ch_translated_protein_seqs set val(sample_id), file("${sample_id}__coding_reads_nucleotides.fasta") into ch_translatable_nucleotide_seqs - set val(sample_id), file("${sample_id}__coding_scores.csv") into 
ch_coding_scores_csv - set val(sample_id), file("${sample_id}__coding_summary.json") into ch_coding_scores_json + set val(sample_id), file(translate_csv) into ch_coding_scores_csv + set val(sample_id), file(translate_json) into ch_coding_scores_json - script: - csv_flag = save_translate_csv ? "--csv ${sample_id}__coding_scores.csv" : '' - json_flag = save_translate_json ? "--json-summary ${sample_id}__coding_summary.json" : '' + script: + translate_json = "${sample_id}__coding_summary.json" + translate_csv = "${sample_id}__coding_scores.csv" + csv_flag = save_translate_csv ? "--csv ${translate_csv}" : '' + json_flag = save_translate_json ? "--json-summary ${translate_json}" : '' - """ - sencha translate \\ - --molecule ${molecule} \\ - --coding-nucleotide-fasta ${sample_id}__coding_reads_nucleotides.fasta \\ - --noncoding-nucleotide-fasta ${sample_id}__noncoding_reads_nucleotides.fasta \\ - ${csv_flag} \\ - ${json_flag} \\ - --jaccard-threshold ${jaccard_threshold} \\ - --peptide-ksize ${peptide_ksize} \\ - --peptides-are-bloom-filter \\ - ${bloom_filter} \\ - ${reads} > ${sample_id}__coding_reads_peptides.fasta - """ + """ + sencha translate \\ + --molecule ${molecule} \\ + --coding-nucleotide-fasta ${sample_id}__coding_reads_nucleotides.fasta \\ + --noncoding-nucleotide-fasta ${sample_id}__noncoding_reads_nucleotides.fasta \\ + ${csv_flag} \\ + ${json_flag} \\ + --jaccard-threshold ${jaccard_threshold} \\ + --peptide-ksize ${peptide_ksize} \\ + --peptides-are-bloom-filter \\ + ${bloom_filter} \\ + ${reads} > ${sample_id}__coding_reads_peptides.fasta + touch ${translate_csv} + touch ${translate_json} + """ } // Remove empty files From dd40741bf493202eb4726b2e67f8ad725b3e528d Mon Sep 17 00:00:00 2001 From: Pranathi Vemuri Date: Tue, 5 Jan 2021 13:08:16 -0800 Subject: [PATCH 6/9] Update main.nf --- main.nf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/main.nf b/main.nf index e46eb008..8b9bf3d6 100644 --- a/main.nf +++ b/main.nf @@ -1187,8 
+1187,8 @@ if (!params.remove_ribo_rna) { publishDir "${params.outdir}/translate/", mode: params.publish_dir_mode, saveAs: { filename -> - if (save_translate_csv and filename.indexOf(".csv") > 0) "description/$filename" - if (save_translate_json and filename.indexOf(".json") > 0) "description/$filename" + if (save_translate_csv && filename.indexOf(".csv") > 0) "description/$filename" + if (save_translate_json && filename.indexOf(".json") > 0) "description/$filename" else if (filename.indexOf(".sig") > 0) "sigs/$filename" else null } @@ -1685,4 +1685,4 @@ def checkHostname() { } } } -} \ No newline at end of file +} From 8c4ea003ac0d3a57a96185f48d5518ada2f41af1 Mon Sep 17 00:00:00 2001 From: Pranathi Vemuri Date: Tue, 5 Jan 2021 13:12:17 -0800 Subject: [PATCH 7/9] Update Dockerfile --- Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index ce612a0e..46764af7 100755 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM nfcore/base:1.12 +FROM nfcore/base:1.12.1 LABEL authors="Olga Botvinnik" \ description="Docker image containing all software requirements for the nf-core/kmermaid pipeline" @@ -14,4 +14,4 @@ RUN conda env export --name nf-core-kmermaid-0.1.0dev > nf-core-kmermaid-0.1.0de # Instruct R processes to use these empty files instead of clashing with a local version RUN touch .Rprofile -RUN touch .Renviron \ No newline at end of file +RUN touch .Renviron From 0547970b54bce5f092726a91b9dc60bafcb88eab Mon Sep 17 00:00:00 2001 From: pranathivemuri Date: Tue, 5 Jan 2021 13:43:03 -0800 Subject: [PATCH 8/9] correct the schema for 2 params added --- nextflow_schema.json | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index aa93d85c..c8dd9ccb 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -485,5 +485,17 @@ { "$ref": "#/definitions/generic_options" } - ] + ], + "properties": { + "save_translate_csv": { + "type": 
"string", + "description": "Path to save the coding scores as a csv", + "default": "False" + }, + "save_translate_json": { + "type": "string", + "description": "Path to save summarization of coding/\" \"noncoding/other categorizations, the \" \"min/max/mean/median/stddev of Jaccard scores, and other as a json", + "default": "False" + } + } } \ No newline at end of file From 67bdaeaf5aabedb0b8ee7476a20e82a849a27019 Mon Sep 17 00:00:00 2001 From: Pranathi Vemuri Date: Tue, 5 Jan 2021 14:12:18 -0800 Subject: [PATCH 9/9] Update Dockerfile --- Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index ce612a0e..46764af7 100755 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM nfcore/base:1.12 +FROM nfcore/base:1.12.1 LABEL authors="Olga Botvinnik" \ description="Docker image containing all software requirements for the nf-core/kmermaid pipeline" @@ -14,4 +14,4 @@ RUN conda env export --name nf-core-kmermaid-0.1.0dev > nf-core-kmermaid-0.1.0de # Instruct R processes to use these empty files instead of clashing with a local version RUN touch .Rprofile -RUN touch .Renviron \ No newline at end of file +RUN touch .Renviron