From 8b332791a1626d587352aea8f8c51980615b0b1a Mon Sep 17 00:00:00 2001 From: John Lees Date: Thu, 7 Mar 2024 17:31:03 +0000 Subject: [PATCH] Proposed fix for repeat ref bias Write all mapped middle bases at the end This ensures consistency with the ska nk output, but is not necessarily 'more correct' --- Cargo.toml | 2 +- src/ska_ref/aln_writer.rs | 27 +++++++++---------- tests/test_results_correct/map_aln_k9.stdout | 2 +- .../map_aln_k9_filter.stdout | 2 +- 4 files changed, 15 insertions(+), 18 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 9fb4247..959a834 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "ska" -version = "0.3.6" +version = "0.3.7" authors = [ "John Lees ", "Simon Harris ", diff --git a/src/ska_ref/aln_writer.rs b/src/ska_ref/aln_writer.rs index 1a8afb2..618ad79 100644 --- a/src/ska_ref/aln_writer.rs +++ b/src/ska_ref/aln_writer.rs @@ -41,8 +41,8 @@ pub struct AlnWriter<'a> { repeat_regions: &'a Vec, /// Whether to treat all ambiguous bases as Ns mask_ambig: bool, - /// Buffer for amibguous bases, which are written at finalisation. - _ambig_out: Vec<(u8, usize)>, + /// Buffer for middle bases, which are written at finalisation. + _middle_out: Vec<(u8, usize)>, } impl<'a> AlnWriter<'a> { @@ -68,7 +68,7 @@ impl<'a> AlnWriter<'a> { finalised: false, repeat_regions, mask_ambig, - _ambig_out: Vec::new(), + _middle_out: Vec::new(), } } @@ -125,14 +125,13 @@ impl<'a> AlnWriter<'a> { while mapped_chrom > self.curr_chrom { self.fill_contig(); } - // Ambiguous bases may clash with the flanks, which are copied from - // reference. Deal with these in `finalise`. - if is_ambiguous(base) { - self._ambig_out.push(( - if self.mask_ambig { b'N' } else { base }, - mapped_pos + self.chrom_offset, - )); - } + // Middle bases may clash with the flanks in complex repeats, which then + // are copied from reference. Deal with these in `finalise`. + self._middle_out.push(( + if is_ambiguous(base) && self.mask_ambig { b'N' } else { base }, + mapped_pos + self.chrom_offset, + )); + if mapped_pos < self.next_pos { self.last_mapped = mapped_pos; } else { @@ -146,8 +145,6 @@ impl<'a> AlnWriter<'a> { let end = mapped_pos; self.seq_out[(start + self.chrom_offset)..(end + self.chrom_offset)] .copy_from_slice(&self.ref_seq[self.curr_chrom][start..end]); - // Middle base - self.seq_out[mapped_pos + self.chrom_offset] = base; // update indices self.next_pos = mapped_pos + self.half_split_len + 1; @@ -164,8 +161,8 @@ impl<'a> AlnWriter<'a> { self.fill_contig(); } // Make sure any ambiguous bases are correct - for (ambig_base, ambig_pos) in &self._ambig_out { - self.seq_out[*ambig_pos] = *ambig_base; + for (middle_base, middle_pos) in &self._middle_out { + self.seq_out[*middle_pos] = *middle_base; } // Mask repeats for repeat_idx in self.repeat_regions { diff --git a/tests/test_results_correct/map_aln_k9.stdout b/tests/test_results_correct/map_aln_k9.stdout index 8888733..0c9243c 100644 --- a/tests/test_results_correct/map_aln_k9.stdout +++ b/tests/test_results_correct/map_aln_k9.stdout @@ -1,4 +1,4 @@ >test_1 ------------GATTTGAAGGCGTTTATGATATTTAGCTGTACGAGAGTSTTTTAAGAGTSTTTTGATGGTTT------------ >test_2 -------------GATTTGAAGGCGTTTATGATATATAGCTGTACGAGAGTCTTTTAAAAGTGTTTTGATGGTTT------------ +------------GATTTGAAGGCGTTTATGATATATAGCTGTACGAGAGTCTTTTAAAAGTCTTTTGATGGTTT------------ diff --git a/tests/test_results_correct/map_aln_k9_filter.stdout b/tests/test_results_correct/map_aln_k9_filter.stdout index 549e2be..f1a0840 100644 --- a/tests/test_results_correct/map_aln_k9_filter.stdout +++ b/tests/test_results_correct/map_aln_k9_filter.stdout @@ -1,4 +1,4 @@ >test_1 ------------GATTTGAAGGCGTTTATGATATTTAGCTGTACGAGAGTNTTTTAAGAGTNTTTTGATGGTTT------------ >test_2 -------------GATTTGAAGGCGTTTATGATATATAGCTGTACGAGAGTCTTTTAAAAGTGTTTTGATGGTTT------------ +------------GATTTGAAGGCGTTTATGATATATAGCTGTACGAGAGTCTTTTAAAAGTCTTTTGATGGTTT------------