diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 4a11c5268..2a9890772 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -52,10 +52,10 @@ jobs: with: submodules: recursive - - name: Install stable toolchain + - name: Install nightly toolchain uses: actions-rs/toolchain@v1.0.6 with: - toolchain: stable + toolchain: nightly override: true - name: Install system dependencies @@ -67,8 +67,9 @@ jobs: - name: Run cargo-tarpaulin uses: actions-rs/tarpaulin@v0.1 with: + # TODO: update to latest tarpaulin once artefact download is fixed: https://github.com/actions-rs/tarpaulin/pull/23 version: "0.22.0" - args: "--all-features --out Lcov -- --test-threads 1" + args: "--all-features --run-types Tests,Doctests --out Lcov -- --test-threads 1" - name: Upload coverage uses: coverallsapp/github-action@v1 @@ -84,9 +85,6 @@ jobs: target: - no-default-features - all-features - - musl-release-no-default-features - - musl-release-all-features - - musl-all-features include: - target: no-default-features args: --no-default-features @@ -95,18 +93,6 @@ jobs: args: --all-features toolchain_target: x86_64-unknown-linux-musl use_cross: false - - target: musl-release-no-default-features - args: --release --target x86_64-unknown-linux-musl --no-default-features - toolchain_target: x86_64-unknown-linux-musl - use_cross: true - - target: musl-release-all-features - args: --release --target x86_64-unknown-linux-musl --all-features --verbose - toolchain_target: x86_64-unknown-linux-musl - use_cross: true - - target: musl-all-features - args: --target x86_64-unknown-linux-musl --all-features --verbose - toolchain_target: x86_64-unknown-linux-musl - use_cross: true steps: - name: Checkout repository @@ -139,29 +125,28 @@ jobs: strategy: matrix: target: - - intel-catalina - - intel-bigsur - - m1-bigsur + - intel-monterey + - intel-ventura + - silicon-sonoma include: - - target: intel-catalina - os: macOS-10.15 + - target: intel-monterey + os: macOS-12.0 toolchain_target: x86_64-apple-darwin toolchain: stable - aux_args: "" - default: false - - target: intel-bigsur - os: macOS-11.0 + aux_args: --target x86_64-apple-darwin + default: true + - target: intel-ventura + os: macOS-13.0 toolchain_target: x86_64-apple-darwin toolchain: stable + aux_args: --target x86_64-apple-darwin + default: true + - target: silicon-sonoma + os: macOS-14.0 + toolchain_target: aarch64-apple-darwin + toolchain: stable aux_args: "" default: false - # TODO enable again and try to find out why this fails - # - target: m1-bigsur - # os: macOS-11.0 - # toolchain_target: aarch64-apple-darwin - # toolchain: nightly - # aux_args: --target aarch64-apple-darwin - # default: true steps: - name: Checkout repository @@ -172,19 +157,14 @@ jobs: - name: Install stable toolchain uses: actions-rs/toolchain@v1.0.6 with: - toolchain: stable + toolchain: ${{ matrix.toolchain }} + target: ${{ matrix.toolchain_target }} override: true + default: ${{ matrix.default }} - name: Install htslib dependencies run: brew install bzip2 zlib xz curl-openssl - #- uses: actions-rs/toolchain@v1.0.6 - # with: - # toolchain: ${{ matrix.toolchain }} - # target: ${{ matrix.toolchain_target }} - # #override: true - # default: ${{ matrix.default }} - - name: Test uses: actions-rs/cargo@v1.0.1 with: diff --git a/CHANGELOG.md b/CHANGELOG.md index cbf2d54fc..b01750214 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,67 @@ All notable changes to this project will be documented in this file. This project adheres to [Semantic Versioning](http://semver.org/). +## [0.47.0](https://github.com/rust-bio/rust-htslib/compare/v0.46.0...v0.47.0) (2024-05-22) + + +### Features + +* Add fasta::build function + FaidxBuildError ([#418](https://github.com/rust-bio/rust-htslib/issues/418)) ([7c575ef](https://github.com/rust-bio/rust-htslib/commit/7c575ef549908745f34d9371986551f3d70ed444)) +* Add rust_htslib::bcf::index::build ([#408](https://github.com/rust-bio/rust-htslib/issues/408)) ([79d70cd](https://github.com/rust-bio/rust-htslib/commit/79d70cd6683f1a019e9052baa495dada709db432)) +* derive PartialEq and Eq for bam:: and bcf::Format ([#428](https://github.com/rust-bio/rust-htslib/issues/428)) ([528e543](https://github.com/rust-bio/rust-htslib/commit/528e54367367487a28bbc2566bd37de995f8ed1d)) + + +### Bug Fixes + +* bam::Record:new should return a valid record ([#361](https://github.com/rust-bio/rust-htslib/issues/361)) ([87f2011](https://github.com/rust-bio/rust-htslib/commit/87f20116c4337eda17a416ebafb8976abc188d87)) +* build for macOS ([#431](https://github.com/rust-bio/rust-htslib/issues/431)) ([d869fdd](https://github.com/rust-bio/rust-htslib/commit/d869fdda03900cafae0f4f60b033121dcd57b723)) +* in bam record buffer, change the start of the window to the first added item in last iteration ([#430](https://github.com/rust-bio/rust-htslib/issues/430)) ([56ee2bd](https://github.com/rust-bio/rust-htslib/commit/56ee2bd562788dad0dc8516d0e3db90ffa916320)) +* Panic on trailing omitted FORMAT records ([#417](https://github.com/rust-bio/rust-htslib/issues/417)) ([9f575ee](https://github.com/rust-bio/rust-htslib/commit/9f575ee40e15737731bc8234812c0cf36c1157f4)) + +## [0.46.0](https://github.com/rust-bio/rust-htslib/compare/v0.45.0...v0.46.0) (2024-02-22) + + +### Features + +* making several RecordBuffer methods public ([6757f52](https://github.com/rust-bio/rust-htslib/commit/6757f5219955fd4edba4f61e62978ce1e001068e)) + + +### Bug Fixes + +* fix building libz-sys ([#420](https://github.com/rust-bio/rust-htslib/issues/420)) ([01c8849](https://github.com/rust-bio/rust-htslib/commit/01c884945686e7a6756406b579fde28657f70b36)) + +## [0.45.0](https://github.com/rust-bio/rust-htslib/compare/v0.44.1...v0.45.0) (2024-02-07) + + +### Features + +* adding function to get sequence length to faidx mod ([#410](https://github.com/rust-bio/rust-htslib/issues/410)) ([ae79eba](https://github.com/rust-bio/rust-htslib/commit/ae79eba82ef6929105bdbe08246a8e973660899e)) + + +### Bug Fixes + +* Loosen acceptable types to support current linux build on aarch64 ([#415](https://github.com/rust-bio/rust-htslib/issues/415)) ([1d78d12](https://github.com/rust-bio/rust-htslib/commit/1d78d1251a052461605d28cd8cf832ccad93ef73)) + +## [0.44.1](https://github.com/rust-bio/rust-htslib/compare/v0.44.0...v0.44.1) (2023-06-21) + + +### Bug Fixes + +* use correct return value in bcf_get_format and bcf_get_info_values ([#398](https://github.com/rust-bio/rust-htslib/issues/398)) ([f9a1981](https://github.com/rust-bio/rust-htslib/commit/f9a1981fa84eef39e35f868ddfc773ea265b94b3)) + +## [0.44.0](https://github.com/rust-bio/rust-htslib/compare/v0.43.1...v0.44.0) (2023-06-20) + + +### Features + +* implement Clone for bcf::Record ([#394](https://github.com/rust-bio/rust-htslib/issues/394)) ([e89538d](https://github.com/rust-bio/rust-htslib/commit/e89538d5a9971c6508ac38d92ac468f3d70241aa)) +* implement htslib basemod api ([#385](https://github.com/rust-bio/rust-htslib/issues/385)) ([8beee14](https://github.com/rust-bio/rust-htslib/commit/8beee145a116f7ae936f1b6e36d876116dca18f1)) + + +### Bug Fixes + +* include doctests in test coverage calculations ([#397](https://github.com/rust-bio/rust-htslib/issues/397)) ([8ed0837](https://github.com/rust-bio/rust-htslib/commit/8ed083783fa1dce09535564a090d37f687fc832f)) + ## [0.43.1](https://github.com/rust-bio/rust-htslib/compare/v0.43.0...v0.43.1) (2023-05-16) diff --git a/Cargo.toml b/Cargo.toml index 035aab298..d95dcb3a9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,18 +9,19 @@ license = "MIT" name = "rust-htslib" readme = "README.md" repository = "https://github.com/rust-bio/rust-htslib.git" -version = "0.43.1" +version = "0.47.0" [package.metadata.release] pre-release-commit-message = "release version {{version}}" tag-message = "Version {{version}} of Rust-HTSlib." [dependencies] +libz-sys = ">=1.1.15" bio-types = ">=0.9" byteorder = "1.3" custom_derive = "0.1" derive-new = "0.5" -hts-sys = {version = "2.0.3", default-features = false} +hts-sys = {version = "2.1.4", default-features = false, features = ["bindgen"]} ieee754 = "0.2" lazy_static = "1.4" libc = "0.2" diff --git a/Cross.toml b/Cross.toml deleted file mode 100644 index c6bb51e09..000000000 --- a/Cross.toml +++ /dev/null @@ -1,11 +0,0 @@ -[build.env] -passthrough = [ - "RUST_DEBUG", - "RUST_BACKTRACE", - "CFLAGS" -] - -[target.x86_64-unknown-linux-musl] -image = "brainstorm/cross-x86_64-unknown-linux-musl:1.0.4" -[target.x86_64-unknown-linux-gnu] -image = "brainstorm/cross-x86_64-unknown-linux-gnu:1.0.4" diff --git a/README.md b/README.md index 3a5343405..0662a4cc8 100644 --- a/README.md +++ b/README.md @@ -22,36 +22,6 @@ If you only want to use the library, there is no need to clone the repository. G rust-htslib comes with pre-built bindings to htslib for Mac and Linux. You will need a C toolchain compatible with the `cc` crate. The build script for this crate will automatically build a link htslib. - -### MUSL build -To compile this for MUSL crate you need docker and cross: - -```shell -$ cargo install cross -$ cross build # will build with GNU GCC or LLVM toolchains -``` - -If you want to run rust-htslib code on AWS lambda, [you'll need to statically compile it with MUSL](https://github.com/awslabs/aws-lambda-rust-runtime/issues/17#issuecomment-577490373) as follows: - -```shell -$ cross build --target x86_64-unknown-linux-musl # will build with MUSL toolchain -``` - -Alternatively, you can also install it locally by installing the development headers of zlib, bzip2 and xz. For instance, in Debian systems one needs the following dependencies: - -```shell -$ sudo apt-get install zlib1g-dev libbz2-dev liblzma-dev clang pkg-config -``` - -We provide Dockerfile bases that provide these dependencies. Refer to the [docker](https://github.com/rust-bio/rust-htslib/tree/master/docker) directory in this repository for the latest instructions, including LLVM installation. - -On OSX: - -```shell -$ brew install FiloSottile/musl-cross/musl-cross -$ brew install bzip2 zlib xz curl-openssl -``` - ## Usage Add this to your `Cargo.toml`: diff --git a/docker/Dockerfile.gnu b/docker/Dockerfile.gnu deleted file mode 100644 index 6c2078bf2..000000000 --- a/docker/Dockerfile.gnu +++ /dev/null @@ -1,7 +0,0 @@ -FROM rustembedded/cross:x86_64-unknown-linux-gnu - -ENV LIBCLANG_PATH /usr/lib/llvm-10/lib -ENV LLVM_CONFIG_PATH /usr/bin -RUN apt-get update -RUN apt-get install -y build-essential wget gnupg lsb-release software-properties-common apt-transport-https ca-certificates # Otherwise LLVM bump below fails -RUN bash -c "$(wget -O - https://apt.llvm.org/llvm.sh)" diff --git a/docker/Dockerfile.musl b/docker/Dockerfile.musl deleted file mode 100644 index 4988ab7e4..000000000 --- a/docker/Dockerfile.musl +++ /dev/null @@ -1,16 +0,0 @@ -FROM brainstorm/cross-x86_64-unknown-linux-musl:upstream - -ENV DEBIAN_FRONTEND noninteractive -ENV PKG_CONFIG_ALLOW_CROSS 1 - -ENV LIBCLANG_PATH /usr/lib/llvm-10/lib -ENV LLVM_CONFIG_PATH /usr/bin - -WORKDIR /root - -# Otherwise LLVM bump below fails -RUN apt-get install -y wget gnupg lsb-release software-properties-common apt-transport-https ca-certificates - -# Autodetect and fetch latest LLVM repos for the current distro, avoids LLVM warnings and other issues, might generate slower builds for now though, see: -# https://www.phoronix.com/scan.php?page=news_item&px=Rust-Hurt-On-LLVM-10 -RUN bash -c "$(wget -O - https://apt.llvm.org/llvm.sh)" diff --git a/docker/README.md b/docker/README.md deleted file mode 100644 index 8ce0ee1cd..000000000 --- a/docker/README.md +++ /dev/null @@ -1,17 +0,0 @@ -# cross rustembedded containers - -Allows to compile (rust-)htslib in a variety of environments and architectures via [rustembedded cross](https://github.com/rust-embedded/cross). - -## Quickstart - -```shell -$ cd docker -$ docker build -t brainstorm/cross-x86_64-unknown-linux-musl:1.0.0 . -f Dockerfile.musl -$ docker build -t brainstorm/cross-x86_64-unknown-linux-gnu:1.0.0 . -f Dockerfile.gnu -``` - -Then, to build and test rust-htslib with the above containers, proceed as you would with `cargo`, using `cross` instead, i.e: - -```shell -$ cross build --target x86_64-unknown-linux-musl -``` diff --git a/docker/config-musl-cross-make.mak b/docker/config-musl-cross-make.mak deleted file mode 100644 index 424678462..000000000 --- a/docker/config-musl-cross-make.mak +++ /dev/null @@ -1,6 +0,0 @@ -TARGET = x86_64-linux-musl -OUTPUT = /usr/local/musl -GCC_VER = 9.2.0 -BINUTILS_VER = 2.33.1 -MUSL_VER=1.2.0 - diff --git a/src/bam/buffer.rs b/src/bam/buffer.rs index 0e71fdfa7..2bf41d589 100644 --- a/src/bam/buffer.rs +++ b/src/bam/buffer.rs @@ -11,7 +11,6 @@ use std::str; use crate::bam; use crate::bam::Read; use crate::errors::{Error, Result}; - /// A buffer for BAM records. This allows access regions in a sorted BAM file while iterating /// over it in a single pass. /// The buffer is implemented as a ringbuffer, such that extension or movement to the right has @@ -25,6 +24,7 @@ pub struct RecordBuffer { cache_cigar: bool, min_refetch_distance: u64, buffer_record: Rc, + start_pos: Option, } unsafe impl Sync for RecordBuffer {} @@ -45,6 +45,7 @@ impl RecordBuffer { cache_cigar, min_refetch_distance: 1, buffer_record: Rc::new(bam::Record::new()), + start_pos: Some(0), } } @@ -57,16 +58,16 @@ impl RecordBuffer { } /// Return start position of buffer - fn start(&self) -> Option { + pub fn start(&self) -> Option { self.inner.front().map(|rec| rec.pos() as u64) } /// Return end position of buffer. - fn end(&self) -> Option { + pub fn end(&self) -> Option { self.inner.back().map(|rec| rec.pos() as u64) } - fn tid(&self) -> Option { + pub fn tid(&self) -> Option { self.inner.back().map(|rec| rec.tid()) } @@ -89,7 +90,7 @@ impl RecordBuffer { if self.inner.is_empty() || window_start.saturating_sub(self.end().unwrap()) >= self.min_refetch_distance || self.tid().unwrap() != tid as i32 - || self.start().unwrap() > window_start + || self.start().unwrap() > self.start_pos.unwrap() { let end = self.reader.header.target_len(tid).unwrap(); self.reader.fetch((tid, window_start, end))?; @@ -145,6 +146,7 @@ impl RecordBuffer { added += 1; } } + self.start_pos = Some(self.start().unwrap_or(window_start)); Ok((added, deleted)) } else { diff --git a/src/bam/ext.rs b/src/bam/ext.rs index c113d2920..614498049 100644 --- a/src/bam/ext.rs +++ b/src/bam/ext.rs @@ -434,6 +434,9 @@ impl BamRecordExtensions for bam::Record { fn reference_start(&self) -> i64 { self.pos() } + + /// Calculate the rightmost base position of an alignment on the reference genome. + /// Returns the coordinate of the first base after the alignment (0-based). fn reference_end(&self) -> i64 { unsafe { htslib::bam_endpos(self.inner_ptr()) } } diff --git a/src/bam/mod.rs b/src/bam/mod.rs index 2d7651c4f..478b0ac63 100644 --- a/src/bam/mod.rs +++ b/src/bam/mod.rs @@ -677,7 +677,7 @@ impl IndexedReader { /// /// Both iterating (with [.records()](trait.Read.html#tymethod.records)) and looping without allocation (with [.read()](trait.Read.html#tymethod.read) are a two stage process: /// 1. 'fetch' the region of interest - /// 2. iter/loop trough the reads. + /// 2. iter/loop through the reads. /// /// Example: /// ``` @@ -703,6 +703,8 @@ impl IndexedReader { /// coordinates' expansion), i32, u32, and u64 (with a possible panic! if the coordinate /// won't fit an i64). /// + /// `start` and `stop` are zero-based. `start` is inclusive, `stop` is exclusive. + /// /// This replaces the old fetch and fetch_str implementations. pub fn fetch<'a, T: Into>>(&mut self, fetch_definition: T) -> Result<()> { //this 'compile time redirect' safes us @@ -1037,7 +1039,7 @@ impl Drop for IndexedReader { } } -#[derive(Debug, Clone, Copy)] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum Format { Sam, Bam, @@ -1125,7 +1127,7 @@ impl Writer { ); (*rec).text = text as *mut c_char; - (*rec).l_text = l_text as u64; + (*rec).l_text = l_text; rec }; @@ -1390,9 +1392,9 @@ impl HeaderView { header_string.len(), ); - let rec = htslib::sam_hdr_parse((l_text + 1) as u64, text as *const c_char); + let rec = htslib::sam_hdr_parse((l_text + 1), text as *const c_char); (*rec).text = text as *mut c_char; - (*rec).l_text = l_text as u64; + (*rec).l_text = l_text; rec }; @@ -1578,7 +1580,7 @@ CCCCCCCCCCCCCCCCCCC"[..], assert_eq!(c1.inner().core.l_qname, b1.inner().core.l_qname); assert_eq!(c1.inner().core.n_cigar, b1.inner().core.n_cigar); assert_eq!(c1.inner().core.l_qseq, b1.inner().core.l_qseq); - assert_eq!(c1.inner().core.isize, b1.inner().core.isize); + assert_eq!(c1.inner().core.isize_, b1.inner().core.isize_); //... except m_data } } @@ -2449,7 +2451,7 @@ CCCCCCCCCCCCCCCCCCC"[..], where F: Fn(&record::Record) -> Option, { - let mut bam_reader = Reader::from_path(bamfile).unwrap(); // internal functions, just unwarp + let mut bam_reader = Reader::from_path(bamfile).unwrap(); // internal functions, just unwrap let header = header::Header::from_template(bam_reader.header()); let mut sam_writer = Writer::from_path(samfile, &header, Format::Sam).unwrap(); for record in bam_reader.records() { @@ -3000,6 +3002,58 @@ CCCCCCCCCCCCCCCCCCC"[..], assert_eq!(header_refseqs[0].get("LN").unwrap(), "10000000",); } + #[test] + fn test_bam_new() { + // Create the path to write the tmp test BAM + let tmp = tempfile::Builder::new() + .prefix("rust-htslib") + .tempdir() + .expect("Cannot create temp dir"); + let bampath = tmp.path().join("test.bam"); + + // write an unmapped BAM record (uBAM) + { + // Build the header + let mut header = Header::new(); + + // Add the version + header.push_record( + HeaderRecord::new(b"HD") + .push_tag(b"VN", &"1.6") + .push_tag(b"SO", &"unsorted"), + ); + + // Build the writer + let mut writer = Writer::from_path(&bampath, &header, Format::Bam).unwrap(); + + // Build an empty record + let mut record = Record::new(); + + // Write the record (this previously seg-faulted) + assert!(writer.write(&record).is_ok()); + } + + // Read the record + { + // Build th reader + let mut reader = Reader::from_path(&bampath).expect("Error opening file."); + + // Read the record + let mut rec = Record::new(); + match reader.read(&mut rec) { + Some(r) => r.expect("Failed to read record."), + None => panic!("No record read."), + }; + + // Check a few things + assert!(rec.is_unmapped()); + assert_eq!(rec.tid(), -1); + assert_eq!(rec.pos(), -1); + assert_eq!(rec.mtid(), -1); + assert_eq!(rec.mpos(), -1); + } + } + #[test] fn test_idxstats_bam() { let mut reader = IndexedReader::from_path("test/test.bam").unwrap(); diff --git a/src/bam/record.rs b/src/bam/record.rs index f69b252f7..ced3cc8aa 100644 --- a/src/bam/record.rs +++ b/src/bam/record.rs @@ -4,6 +4,7 @@ // except according to those terms. use std::convert::TryFrom; +use std::convert::TryInto; use std::ffi; use std::fmt; use std::marker::PhantomData; @@ -113,12 +114,23 @@ fn extranul_from_qname(qname: &[u8]) -> usize { impl Record { /// Create an empty BAM record. pub fn new() -> Self { - Record { + let mut record = Record { inner: unsafe { MaybeUninit::zeroed().assume_init() }, own: true, cigar: None, header: None, - } + }; + // The read/query name needs to be set as empty to properly initialize + // the record + record.set_qname(b""); + // Developer note: these are needed so the returned record is properly + // initialized as unmapped. + record.set_unmapped(); + record.set_tid(-1); + record.set_pos(-1); + record.set_mpos(-1); + record.set_mtid(-1); + record } pub fn from_inner(from: *mut htslib::bam1_t) -> Self { @@ -151,8 +163,8 @@ impl Record { let mut sam_string = htslib::kstring_t { s: sam_copy.as_ptr() as *mut c_char, - l: sam_copy.len() as u64, - m: sam_copy.len() as u64, + l: sam_copy.len(), + m: sam_copy.len(), }; let succ = unsafe { @@ -285,12 +297,12 @@ impl Record { /// Get insert size. pub fn insert_size(&self) -> i64 { - self.inner().core.isize + self.inner().core.isize_ } /// Set insert size. pub fn set_insert_size(&mut self, insert_size: i64) { - self.inner_mut().core.isize = insert_size; + self.inner_mut().core.isize_ = insert_size; } fn qname_capacity(&self) -> usize { @@ -1020,6 +1032,48 @@ impl Record { } } + /// Access the base modifications associated with this Record through the MM tag. + /// Example: + /// ``` + /// use rust_htslib::bam::{Read, Reader, Record}; + /// let mut bam = Reader::from_path(&"test/base_mods/MM-orient.sam").unwrap(); + /// let mut mod_count = 0; + /// for r in bam.records() { + /// let record = r.unwrap(); + /// if let Ok(mods) = record.basemods_iter() { + /// // print metadata for the modifications present in this record + /// for mod_code in mods.recorded() { + /// if let Ok(mod_metadata) = mods.query_type(*mod_code) { + /// println!("mod found with code {}/{} flags: [{} {} {}]", + /// mod_code, *mod_code as u8 as char, + /// mod_metadata.strand, mod_metadata.implicit, mod_metadata.canonical as u8 as char); + /// } + /// } + /// + /// // iterate over the modifications in this record + /// // the modifications are returned as a tuple with the + /// // position within SEQ and an hts_base_mod struct + /// for res in mods { + /// if let Ok( (position, m) ) = res { + /// println!("{} {},{}", position, m.modified_base as u8 as char, m.qual); + /// mod_count += 1; + /// } + /// } + /// }; + /// } + /// assert_eq!(mod_count, 14); + /// ``` + pub fn basemods_iter(&self) -> Result { + BaseModificationsIter::new(self) + } + + /// An iterator that returns all of the modifications for each position as a vector. + /// This is useful for the case where multiple possible modifications can be annotated + /// at a single position (for example a C could be 5-mC or 5-hmC) + pub fn basemods_position_iter(&self) -> Result { + BaseModificationsPositionIter::new(self) + } + /// Infer read pair orientation from record. Returns `SequenceReadPairOrientation::None` if record /// is not paired, mates are not mapping to the same contig, or mates start at the /// same position. @@ -2171,6 +2225,242 @@ impl fmt::Display for CigarStringView { } } +pub struct BaseModificationMetadata { + pub strand: i32, + pub implicit: i32, + pub canonical: u8, +} + +/// struct containing the internal state required to access +/// the base modifications for a bam::Record +pub struct BaseModificationState<'a> { + record: &'a Record, + state: *mut htslib::hts_base_mod_state, + buffer: Vec, + buffer_pos: i32, +} + +impl BaseModificationState<'_> { + /// Initialize a new BaseModification struct from a bam::Record + /// This function allocates memory for the state structure + /// and initializes the iterator to the start of the modification + /// records. + fn new<'a>(r: &'a Record) -> Result> { + let mut bm = unsafe { + BaseModificationState { + record: r, + state: hts_sys::hts_base_mod_state_alloc(), + buffer: Vec::new(), + buffer_pos: -1, + } + }; + + if bm.state.is_null() { + panic!("Unable to allocate memory for hts_base_mod_state"); + } + + // parse the MM tag to initialize the state + unsafe { + let ret = hts_sys::bam_parse_basemod(bm.record.inner_ptr(), bm.state); + if ret != 0 { + return Err(Error::BamBaseModificationTagNotFound); + } + } + + let types = bm.recorded(); + bm.buffer.reserve(types.len()); + return Ok(bm); + } + + pub fn buffer_next_mods(&mut self) -> Result { + unsafe { + let ret = hts_sys::bam_next_basemod( + self.record.inner_ptr(), + self.state, + self.buffer.as_mut_ptr(), + self.buffer.capacity() as i32, + &mut self.buffer_pos, + ); + + if ret < 0 { + return Err(Error::BamBaseModificationIterationFailed); + } + + // the htslib API won't write more than buffer.capacity() mods to the output array but it will + // return the actual number of modifications found. We return an error to the caller + // in the case where there was insufficient storage to return all mods. + if ret as usize > self.buffer.capacity() { + return Err(Error::BamBaseModificationTooManyMods); + } + + // we read the modifications directly into the vector, which does + // not update the length so needs to be manually set + self.buffer.set_len(ret as usize); + + return Ok(ret as usize); + } + } + + /// Return an array containing the modification codes listed for this record. + /// Positive values are ascii character codes (eg m), negative values are chEBI codes. + pub fn recorded<'a>(&self) -> &'a [i32] { + unsafe { + let mut n: i32 = 0; + let data_ptr: *const i32 = hts_sys::bam_mods_recorded(self.state, &mut n); + + // htslib should not return a null pointer, even when there are no base mods + if data_ptr.is_null() { + panic!("Unable to obtain pointer to base modifications"); + } + assert!(n >= 0); + return slice::from_raw_parts(data_ptr, n as usize); + } + } + + /// Return metadata for the specified character code indicating the strand + /// the base modification was called on, whether the tag uses implicit mode + /// and the ascii code for the canonical base. + /// If there are multiple modifications with the same code this will return the data + /// for the first mod. See https://github.com/samtools/htslib/issues/1635 + pub fn query_type<'a>(&self, code: i32) -> Result { + unsafe { + let mut strand: i32 = 0; + let mut implicit: i32 = 0; + // This may be i8 or u8 in hts_sys. + let mut canonical: c_char = 0; + + let ret = hts_sys::bam_mods_query_type( + self.state, + code, + &mut strand, + &mut implicit, + &mut canonical, + ); + if ret == -1 { + return Err(Error::BamBaseModificationTypeNotFound); + } else { + return Ok(BaseModificationMetadata { + strand, + implicit, + canonical: canonical.try_into().unwrap(), + }); + } + } + } +} + +impl Drop for BaseModificationState<'_> { + fn drop<'a>(&mut self) { + unsafe { + hts_sys::hts_base_mod_state_free(self.state); + } + } +} + +/// Iterator over the base modifications that returns +/// a vector for all of the mods at each position +pub struct BaseModificationsPositionIter<'a> { + mod_state: BaseModificationState<'a>, +} + +impl BaseModificationsPositionIter<'_> { + fn new<'a>(r: &'a Record) -> Result> { + let state = BaseModificationState::new(r)?; + Ok(BaseModificationsPositionIter { mod_state: state }) + } + + pub fn recorded<'a>(&self) -> &'a [i32] { + return self.mod_state.recorded(); + } + + pub fn query_type<'a>(&self, code: i32) -> Result { + return self.mod_state.query_type(code); + } +} + +impl<'a> Iterator for BaseModificationsPositionIter<'a> { + type Item = Result<(i32, Vec)>; + + fn next(&mut self) -> Option { + let ret = self.mod_state.buffer_next_mods(); + + // Three possible things happened in buffer_next_mods: + // 1. the htslib API call was successful but there are no more mods + // 2. ths htslib API call was successful and we read some mods + // 3. the htslib API call failed, we propogate the error wrapped in an option + match ret { + Ok(num_mods) => { + if num_mods == 0 { + return None; + } else { + let data = (self.mod_state.buffer_pos, self.mod_state.buffer.clone()); + return Some(Ok(data)); + } + } + Err(e) => return Some(Err(e)), + } + } +} + +/// Iterator over the base modifications that returns +/// the next modification found, one by one +pub struct BaseModificationsIter<'a> { + mod_state: BaseModificationState<'a>, + buffer_idx: usize, +} + +impl BaseModificationsIter<'_> { + fn new<'a>(r: &'a Record) -> Result> { + let state = BaseModificationState::new(r)?; + Ok(BaseModificationsIter { + mod_state: state, + buffer_idx: 0, + }) + } + + pub fn recorded<'a>(&self) -> &'a [i32] { + return self.mod_state.recorded(); + } + + pub fn query_type<'a>(&self, code: i32) -> Result { + return self.mod_state.query_type(code); + } +} + +impl<'a> Iterator for BaseModificationsIter<'a> { + type Item = Result<(i32, hts_sys::hts_base_mod)>; + + fn next(&mut self) -> Option { + if self.buffer_idx == self.mod_state.buffer.len() { + // need to use the internal state to read the next + // set of modifications into the buffer + let ret = self.mod_state.buffer_next_mods(); + + match ret { + Ok(num_mods) => { + if num_mods == 0 { + // done iterating + return None; + } else { + // we read some mods, reset the position in the buffer then fall through + self.buffer_idx = 0; + } + } + Err(e) => return Some(Err(e)), + } + } + + // if we got here when there are mods buffered that we haven't emitted yet + assert!(self.buffer_idx < self.mod_state.buffer.len()); + let data = ( + self.mod_state.buffer_pos, + self.mod_state.buffer[self.buffer_idx], + ); + self.buffer_idx += 1; + return Some(Ok(data)); + } +} + #[cfg(test)] mod tests { use super::*; @@ -2616,3 +2906,83 @@ mod alignment_cigar_tests { } } } + +#[cfg(test)] +mod basemod_tests { + use crate::bam::{Read, Reader}; + + #[test] + pub fn test_count_recorded() { + let mut bam = Reader::from_path(&"test/base_mods/MM-double.sam").unwrap(); + + for r in bam.records() { + let record = r.unwrap(); + if let Ok(mods) = record.basemods_iter() { + let n = mods.recorded().len(); + assert_eq!(n, 3); + }; + } + } + + #[test] + pub fn test_query_type() { + let mut bam = Reader::from_path(&"test/base_mods/MM-orient.sam").unwrap(); + + let mut n_fwd = 0; + let mut n_rev = 0; + + for r in bam.records() { + let record = r.unwrap(); + if let Ok(mods) = record.basemods_iter() { + for mod_code in mods.recorded() { + if let Ok(mod_metadata) = mods.query_type(*mod_code) { + if mod_metadata.strand == 0 { + n_fwd += 1; + } + if mod_metadata.strand == 1 { + n_rev += 1; + } + } + } + }; + } + assert_eq!(n_fwd, 2); + assert_eq!(n_rev, 2); + } + + #[test] + pub fn test_mod_iter() { + let mut bam = Reader::from_path(&"test/base_mods/MM-double.sam").unwrap(); + let expected_positions = [1, 7, 12, 13, 13, 22, 30, 31]; + let mut i = 0; + + for r in bam.records() { + let record = r.unwrap(); + for res in record.basemods_iter().unwrap() { + if let Ok((position, _m)) = res { + assert_eq!(position, expected_positions[i]); + i += 1; + } + } + } + } + + #[test] + pub fn test_position_iter() { + let mut bam = Reader::from_path(&"test/base_mods/MM-double.sam").unwrap(); + let expected_positions = [1, 7, 12, 13, 22, 30, 31]; + let expected_counts = [1, 1, 1, 2, 1, 1, 1]; + let mut i = 0; + + for r in bam.records() { + let record = r.unwrap(); + for res in record.basemods_position_iter().unwrap() { + if let Ok((position, elements)) = res { + assert_eq!(position, expected_positions[i]); + assert_eq!(elements.len(), expected_counts[i]); + i += 1; + } + } + } + } +} diff --git a/src/bam/record_serde.rs b/src/bam/record_serde.rs index 7d457c7b7..1bf8a2fa1 100644 --- a/src/bam/record_serde.rs +++ b/src/bam/record_serde.rs @@ -8,6 +8,8 @@ use serde_bytes::{ByteBuf, Bytes}; use crate::bam::record::Record; fn fix_l_extranul(rec: &mut Record) { + // first, reset the number of extranuls to 0 for calling .qname(); then calculate how many we actually have + rec.inner_mut().core.l_extranul = 0; let l_extranul = rec.qname().iter().rev().take_while(|x| **x == 0u8).count() as u8; rec.inner_mut().core.l_extranul = l_extranul; } @@ -29,7 +31,7 @@ impl Serialize for Record { state.serialize_field("seq_len", &core.l_qseq)?; state.serialize_field("mtid", &core.mtid)?; state.serialize_field("mpos", &core.mpos)?; - state.serialize_field("isize", &core.isize)?; + state.serialize_field("isize", &core.isize_)?; state.serialize_field("data", Bytes::new(self.data()))?; state.end() } @@ -138,7 +140,7 @@ impl<'de> Deserialize<'de> for Record { let mpos = seq .next_element()? .ok_or_else(|| de::Error::invalid_length(0, &self))?; - let isize = seq + let isize_ = seq .next_element()? .ok_or_else(|| de::Error::invalid_length(0, &self))?; let data = seq @@ -159,7 +161,7 @@ impl<'de> Deserialize<'de> for Record { m.l_qseq = seq_len; m.mtid = mtid; m.mpos = mpos; - m.isize = isize; + m.isize_ = isize_; } rec.set_data(&data); @@ -271,7 +273,7 @@ impl<'de> Deserialize<'de> for Record { let seq_len = seq_len.ok_or_else(|| de::Error::missing_field("seq_len"))?; let mtid = mtid.ok_or_else(|| de::Error::missing_field("mtid"))?; let mpos = mpos.ok_or_else(|| de::Error::missing_field("mpos"))?; - let isize = isize.ok_or_else(|| de::Error::missing_field("isize"))?; + let isize_ = isize.ok_or_else(|| de::Error::missing_field("isize"))?; let data = data .ok_or_else(|| de::Error::missing_field("data"))? .into_vec(); @@ -289,7 +291,7 @@ impl<'de> Deserialize<'de> for Record { m.l_qseq = seq_len; m.mtid = mtid; m.mpos = mpos; - m.isize = isize; + m.isize_ = isize_; } rec.set_data(&data); diff --git a/src/bcf/index.rs b/src/bcf/index.rs new file mode 100644 index 000000000..332fa215e --- /dev/null +++ b/src/bcf/index.rs @@ -0,0 +1,100 @@ +use crate::{htslib, utils}; + +#[derive(Debug)] +pub struct BcfBuildError { + pub msg: String, +} + +/// Index type to build. +pub enum Type { + /// Tabix index + Tbx, + /// CSI index, with given minimum shift + Csi(u32), +} + +impl Type { + fn min_shift(&self) -> i32 { + match self { + Self::Tbx => 0, + Self::Csi(x) => *x as i32, + } + } +} + +impl BcfBuildError { + pub fn error_message(error: i32) -> &'static str { + match error { + -1 => "indexing failed", + -2 => "opening @fn failed", + -3 => "format not indexable", + -4 => "failed to create and/or save the index", + _ => "unknown error", + } + } +} +impl std::error::Error for BcfBuildError {} + +impl std::fmt::Display for BcfBuildError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "BcfBuildError{{msg: {}}}", self.msg) + } +} + +/// Build a bcf or vcf.gz index. +/// Builds tbi or csi depending on index_type. +/// +///``` +/// // Index a sorted bcf file with csi. +/// let bcf_path = concat!(env!("CARGO_MANIFEST_DIR"), "/test/test_multi.bcf"); +/// rust_htslib::bcf::index::build(&bcf_path, Some(&"built_test.csi"), /*n_threads=*/ 4u32, rust_htslib::bcf::index::Type::Csi(14)).expect("Failed to build csi index for bcf file."); +/// assert!(std::path::Path::new(&"built_test.csi").exists()); +/// +/// // Index a bgzip-compresed vcf file with tabix. +/// let vcf_path = concat!(env!("CARGO_MANIFEST_DIR"), "/test/test_left.vcf.gz"); +/// rust_htslib::bcf::index::build(&vcf_path, Some(&"built_test_vcf.tbx"), /*n_threads=*/ 4u32, rust_htslib::bcf::index::Type::Tbx).expect("Failed to build tbx index for vcf file."); +/// assert!(std::path::Path::new(&"built_test_vcf.tbx").exists()); +/// +/// // Cannot build a tbi index for a bcf file: returns an Err(BcfBuildError). +/// assert!(std::panic::catch_unwind(|| rust_htslib::bcf::index::build(bcf_path, Some("built_test.tbi"), 4u32, rust_htslib::bcf::index::Type::Tbx).unwrap()).is_err()); +/// +/// // Cannot built a csi index for a vcf file: returns an Err(BcfBuildError). +/// let vcf_path = concat!(env!("CARGO_MANIFEST_DIR"), "/test/test_various.vcf"); +/// assert!(std::panic::catch_unwind(|| rust_htslib::bcf::index::build(&vcf_path, Some(&"built_test_vcf.csi"), /*n_threads=*/ 4u32, rust_htslib::bcf::index::Type::Csi(14)).expect("Failed to build csi index for vcf file.")).is_err()); +///``` +/// +pub fn build>( + bcf_path: P, + idx_path: Option

, + n_threads: u32, + index_type: Type, +) -> Result<(), BcfBuildError> { + let min_shift = index_type.min_shift(); + let idx_path_cstr = idx_path.and_then(|x| utils::path_to_cstring(&x)); + let bcf_path = utils::path_to_cstring(&bcf_path).ok_or(BcfBuildError { + msg: format!( + "Failed to format bcf_path to cstring: {:?}", + bcf_path.as_ref().display() + ), + })?; + let return_code = unsafe { + htslib::bcf_index_build3( + bcf_path.as_ptr(), + idx_path_cstr + .as_ref() + .map_or(std::ptr::null(), |p| p.as_ptr()), + min_shift, + n_threads as i32, + ) + }; + if return_code == 0 { + Ok(()) + } else { + Err(BcfBuildError { + msg: format!( + "Failed to build bcf index. Error: {return_code:?}/{}", + BcfBuildError::error_message(return_code) + ), + }) + } +} diff --git a/src/bcf/mod.rs b/src/bcf/mod.rs index 3e3b0d6ac..149ddd9a2 100644 --- a/src/bcf/mod.rs +++ b/src/bcf/mod.rs @@ -112,6 +112,7 @@ use url::Url; pub mod buffer; pub mod header; +pub mod index; pub mod record; use crate::bcf::header::{HeaderView, SampleSubset}; @@ -631,7 +632,7 @@ pub mod synced { } } -#[derive(Clone, Copy, Debug)] +#[derive(Clone, Copy, Debug, PartialEq, Eq)] pub enum Format { Vcf, Bcf, @@ -1022,15 +1023,12 @@ mod tests { .expect("Missing tag")[0], format!("string{}", i + 1).as_bytes() ); - println!( - "{}", - String::from_utf8_lossy( - record - .format(b"FS1") - .string() - .expect("Error reading string.")[0] - ) - ); + let fs1_str_vec = record + .format_shared_buffer(b"FS1", &mut buffer) + .string() + .expect("Error reading string."); + assert_eq!(fs1_str_vec.len(), 2); + println!("{}", String::from_utf8_lossy(fs1_str_vec[0])); assert_eq!( record .format(b"FS1") @@ -1551,6 +1549,24 @@ mod tests { ); } + #[test] + fn test_trailing_omitted_format_fields() { + let mut reader = Reader::from_path("test/test_trailing_omitted_format.vcf").unwrap(); + let first_record = reader + .records() + .next() + .unwrap() + .expect("Fail to read record"); + + let expected: Vec<&[u8]> = Vec::new(); + assert_eq!(*first_record.format(b"STR").string().unwrap(), expected,); + assert_eq!( + *first_record.format(b"INT").integer().unwrap(), + vec![&[i32::missing()]], + ); + assert!(first_record.format(b"FLT").float().unwrap()[0][0].is_nan(),); + } + // #[test] // fn test_buffer_lifetime() { // let mut reader = Reader::from_path("test/obs-cornercase.vcf").unwrap(); diff --git a/src/bcf/record.rs b/src/bcf/record.rs index f95c2773b..c1066cd30 100644 --- a/src/bcf/record.rs +++ b/src/bcf/record.rs @@ -1008,7 +1008,7 @@ impl Record { } pub fn remove_alleles(&mut self, remove: &[bool]) -> Result<()> { - let rm_set = unsafe { htslib::kbs_init(remove.len() as u64) }; + let rm_set = unsafe { htslib::kbs_init(remove.len()) }; for (i, &r) in remove.iter().enumerate() { if r { @@ -1251,7 +1251,7 @@ impl<'a, 'b, B: BorrowMut + Borrow + 'b> Info<'a, B> { str::from_utf8(self.tag).unwrap().to_owned() } - fn data(&mut self, data_type: u32) -> Result> { + fn data(&mut self, data_type: u32) -> Result> { let mut n: i32 = self.buffer.borrow().len; let c_str = ffi::CString::new(self.tag).unwrap(); let ret = unsafe { @@ -1270,7 +1270,7 @@ impl<'a, 'b, B: BorrowMut + Borrow + 'b> Info<'a, B> { -1 => Err(Error::BcfUndefinedTag { tag: self.desc() }), -2 => Err(Error::BcfUnexpectedType { tag: self.desc() }), -3 => Ok(None), - ret => Ok(Some((n as usize, ret))), + ret => Ok(Some(ret)), } } @@ -1284,9 +1284,10 @@ impl<'a, 'b, B: BorrowMut + Borrow + 'b> Info<'a, B> { /// memory. pub fn integer(mut self) -> Result>> { self.data(htslib::BCF_HT_INT).map(|data| { - data.map(|(n, ret)| { - let values = - unsafe { slice::from_raw_parts(self.buffer.borrow().inner as *const i32, n) }; + data.map(|ret| { + let values = unsafe { + slice::from_raw_parts(self.buffer.borrow().inner as *const i32, ret as usize) + }; BufferBacked::new(&values[..ret as usize], self.buffer) }) }) @@ -1302,9 +1303,10 @@ impl<'a, 'b, B: BorrowMut + Borrow + 'b> Info<'a, B> { /// memory. pub fn float(mut self) -> Result>> { self.data(htslib::BCF_HT_REAL).map(|data| { - data.map(|(n, ret)| { - let values = - unsafe { slice::from_raw_parts(self.buffer.borrow().inner as *const f32, n) }; + data.map(|ret| { + let values = unsafe { + slice::from_raw_parts(self.buffer.borrow().inner as *const f32, ret as usize) + }; BufferBacked::new(&values[..ret as usize], self.buffer) }) }) @@ -1313,7 +1315,7 @@ impl<'a, 'b, B: BorrowMut + Borrow + 'b> Info<'a, B> { /// Get flags from tag. `false` if not set. pub fn flag(&mut self) -> Result { self.data(htslib::BCF_HT_FLAG).map(|data| match data { - Some((_, ret)) => ret == 1, + Some(ret) => ret == 1, None => false, }) } @@ -1326,7 +1328,7 @@ impl<'a, 'b, B: BorrowMut + Borrow + 'b> Info<'a, B> { /// memory. pub fn string(mut self) -> Result, B>>> { self.data(htslib::BCF_HT_STR).map(|data| { - data.map(|(_, ret)| { + data.map(|ret| { BufferBacked::new( unsafe { slice::from_raw_parts(self.buffer.borrow().inner as *const u8, ret as usize) @@ -1402,7 +1404,7 @@ impl<'a, 'b, B: BorrowMut + Borrow + 'b> Format<'a, B> { } /// Read and decode format data into a given type. - fn data(&mut self, data_type: u32) -> Result<(usize, i32)> { + fn data(&mut self, data_type: u32) -> Result { let mut n: i32 = self.buffer.borrow().len; let c_str = ffi::CString::new(self.tag).unwrap(); let ret = unsafe { @@ -1423,7 +1425,7 @@ impl<'a, 'b, B: BorrowMut + Borrow + 'b> Format<'a, B> { tag: self.desc(), record: self.record.desc(), }), - ret => Ok((n as usize, ret)), + ret => Ok(ret), } } @@ -1434,12 +1436,17 @@ impl<'a, 'b, B: BorrowMut + Borrow + 'b> Format<'a, B> { /// the BufferBacked object is already dropped, you will access unallocated /// memory. pub fn integer(mut self) -> Result, B>> { - self.data(htslib::BCF_HT_INT).map(|(n, _)| { + self.data(htslib::BCF_HT_INT).map(|ret| { BufferBacked::new( - unsafe { slice::from_raw_parts(self.buffer.borrow_mut().inner as *const i32, n) } - .chunks(self.values_per_sample()) - .map(|s| trim_slice(s)) - .collect(), + unsafe { + slice::from_raw_parts( + self.buffer.borrow_mut().inner as *const i32, + ret as usize, + ) + } + .chunks(self.values_per_sample()) + .map(|s| trim_slice(s)) + .collect(), self.buffer, ) }) @@ -1452,12 +1459,17 @@ impl<'a, 'b, B: BorrowMut + Borrow + 'b> Format<'a, B> { /// the BufferBacked object is already dropped, you will access unallocated /// memory. pub fn float(mut self) -> Result, B>> { - self.data(htslib::BCF_HT_REAL).map(|(n, _)| { + self.data(htslib::BCF_HT_REAL).map(|ret| { BufferBacked::new( - unsafe { slice::from_raw_parts(self.buffer.borrow_mut().inner as *const f32, n) } - .chunks(self.values_per_sample()) - .map(|s| trim_slice(s)) - .collect(), + unsafe { + slice::from_raw_parts( + self.buffer.borrow_mut().inner as *const f32, + ret as usize, + ) + } + .chunks(self.values_per_sample()) + .map(|s| trim_slice(s)) + .collect(), self.buffer, ) }) @@ -1470,17 +1482,22 @@ impl<'a, 'b, B: BorrowMut + Borrow + 'b> Format<'a, B> { /// the BufferBacked object is already dropped, you will access unallocated /// memory. pub fn string(mut self) -> Result, B>> { - self.data(htslib::BCF_HT_STR).map(|(n, _)| { + self.data(htslib::BCF_HT_STR).map(|ret| { + if ret == 0 { + return BufferBacked::new(Vec::new(), self.buffer); + } BufferBacked::new( - unsafe { slice::from_raw_parts(self.buffer.borrow_mut().inner as *const u8, n) } - .chunks(self.values_per_sample()) - .map(|s| { - // stop at zero character - s.split(|c| *c == 0u8) - .next() - .expect("Bug: returned string should not be empty.") - }) - .collect(), + unsafe { + slice::from_raw_parts(self.buffer.borrow_mut().inner as *const u8, ret as usize) + } + .chunks(self.values_per_sample()) + .map(|s| { + // stop at zero character + s.split(|c| *c == 0u8) + .next() + .expect("Bug: returned string should not be empty.") + }) + .collect(), self.buffer, ) }) diff --git a/src/bgzf/mod.rs b/src/bgzf/mod.rs index a99af4de4..06d6ee7a1 100644 --- a/src/bgzf/mod.rs +++ b/src/bgzf/mod.rs @@ -112,11 +112,7 @@ impl Reader { impl std::io::Read for Reader { fn read(&mut self, buf: &mut [u8]) -> std::io::Result { let nbytes = unsafe { - htslib::bgzf_read( - self.inner, - buf.as_mut_ptr() as *mut libc::c_void, - buf.len() as u64, - ) + htslib::bgzf_read(self.inner, buf.as_mut_ptr() as *mut libc::c_void, buf.len()) }; if nbytes < 0 { Err(std::io::Error::new( @@ -257,13 +253,8 @@ impl Writer { impl std::io::Write for Writer { fn write(&mut self, buf: &[u8]) -> std::io::Result { - let nbytes = unsafe { - htslib::bgzf_write( - self.inner, - buf.as_ptr() as *mut libc::c_void, - buf.len() as u64, - ) - }; + let nbytes = + unsafe { htslib::bgzf_write(self.inner, buf.as_ptr() as *mut libc::c_void, buf.len()) }; if nbytes < 0 { Err(std::io::Error::new( std::io::ErrorKind::Other, diff --git a/src/errors.rs b/src/errors.rs index 4e459c1ca..7ee071d2a 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -33,6 +33,8 @@ pub enum Error { FaidxPositionTooLarge, #[error("bad conversion of sequence name")] FaidxBadSeqName, + #[error("failed to build index for fasta file {path:?}")] + FaidxBuildFailed { path: std::path::PathBuf }, // Errors for Tbx #[error("previous iterator generation failed")] @@ -90,6 +92,16 @@ pub enum Error { #[error("failed to add aux field, tag is already present")] BamAuxTagAlreadyPresent, + // Errors for base modification fields + #[error("no base modification tag found for record")] + BamBaseModificationTagNotFound, + #[error("no base modification with the specified code found in record")] + BamBaseModificationTypeNotFound, + #[error("base modification iteration failed")] + BamBaseModificationIterationFailed, + #[error("base modification found too many modifications")] + BamBaseModificationTooManyMods, + // Errors for BCF #[error("error allocating internal data structure for BCF/VCF reader (out of memory?)")] BcfAllocationError, diff --git a/src/faidx/mod.rs b/src/faidx/mod.rs index 838fdc810..50dd33016 100644 --- a/src/faidx/mod.rs +++ b/src/faidx/mod.rs @@ -22,6 +22,31 @@ pub struct Reader { inner: *mut htslib::faidx_t, } +/// +/// Build a faidx for input path. +/// +/// # Errors +/// If indexing fails. Could be malformatted or file could not be accessible. +/// +///``` +/// use rust_htslib::faidx::build; +/// let path = std::path::PathBuf::from(concat!(env!("CARGO_MANIFEST_DIR"),"/test/test_cram.fa")); +/// build(&path).expect("Failed to build fasta index"); +///``` +/// +pub fn build( + path: impl Into, +) -> Result<(), std::boxed::Box> { + let path = path.into(); + let os_path = std::ffi::CString::new(path.display().to_string())?; + let rc = unsafe { htslib::fai_build(os_path.as_ptr()) }; + if rc < 0 { + Err(Error::FaidxBuildFailed { path })? + } else { + Ok(()) + } +} + impl Reader { /// Create a new Reader from a path. /// @@ -126,6 +151,40 @@ impl Reader { Ok(out) } + + /// Fetches the length of the given sequence name. + /// + /// # Arguments + /// + /// * `name` - the name of the template sequence (e.g., "chr1") + pub fn fetch_seq_len>(&self, name: N) -> u64 { + let cname = ffi::CString::new(name.as_ref().as_bytes()).unwrap(); + let seq_len = unsafe { htslib::faidx_seq_len(self.inner, cname.as_ptr()) }; + seq_len as u64 + } + + /// Returns a Result> for all seq names. + /// # Errors + /// + /// * `errors::Error::FaidxBadSeqName` - missing sequence name for sequence id. + /// + /// If thrown, the index is malformed, and the number of sequences in the index does not match the number of sequence names available. + ///``` + /// use rust_htslib::faidx::build; + /// let path = std::path::PathBuf::from(concat!(env!("CARGO_MANIFEST_DIR"),"/test/test_cram.fa")); + /// build(&path).expect("Failed to build fasta index"); + /// let reader = rust_htslib::faidx::Reader::from_path(path).expect("Failed to open faidx"); + /// assert_eq!(reader.seq_names(), Ok(vec!["chr1".to_string(), "chr2".to_string(), "chr3".to_string()])); + ///``` + /// + pub fn seq_names(&self) -> Result> { + let num_seq = self.n_seqs(); + let mut ret = Vec::with_capacity(num_seq as usize); + for seq_id in 0..num_seq { + ret.push(self.seq_name(seq_id as i32)?); + } + Ok(ret) + } } impl Drop for Reader { @@ -247,6 +306,15 @@ mod tests { assert_eq!(n, "chr2"); } + #[test] + fn faidx_get_seq_len() { + let r = open_reader(); + let chr1_len = r.fetch_seq_len("chr1"); + let chr2_len = r.fetch_seq_len("chr2"); + assert_eq!(chr1_len, 120u64); + assert_eq!(chr2_len, 120u64); + } + #[test] fn open_many_readers() { for _ in 0..500_000 { diff --git a/test/base_mods/MM-double.sam b/test/base_mods/MM-double.sam new file mode 100644 index 000000000..608516fc1 --- /dev/null +++ b/test/base_mods/MM-double.sam @@ -0,0 +1,3 @@ +@CO Modifications called on both strands of the same record, +@CO including potentially at the same location simultaneously. +* 0 * 0 0 * * 0 0 AGGATCTCTAGCGGATCGGCGGGGGATATGCCATAT * Mm:Z:C+m,1,3,0;G-m,0,2,0,4;G+o,4; Ml:B:C,128,153,179,115,141,166,192,102 diff --git a/test/base_mods/MM-orient.sam b/test/base_mods/MM-orient.sam new file mode 100644 index 000000000..363e7c2be --- /dev/null +++ b/test/base_mods/MM-orient.sam @@ -0,0 +1,6 @@ +@CO Testing mods on top and bottom strand, but also in +@CO original vs reverse-complemented orientation +top-fwd 0 * 0 0 * * 0 0 AGGATCTCTAGCGGATCGGCGGGGGATATGCCATAT * Mm:Z:C+m,1,3,0; Ml:B:C,128,153,179 +top-rev 16 * 0 0 * * 0 0 ATATGGCATATCCCCCGCCGATCCGCTAGAGATCCT * Mm:Z:C+m,1,3,0; Ml:B:C,128,153,179 +bot-fwd 0 * 0 0 * * 0 0 AGGATCTCTAGCGGATCGGCGGGGGATATGCCATAT * Mm:Z:G-m,0,0,4,3; Ml:B:C,115,141,166,192 +bot-rev 16 * 0 0 * * 0 0 ATATGGCATATCCCCCGCCGATCCGCTAGAGATCCT * Mm:Z:G-m,0,0,4,3; Ml:B:C,115,141,166,192 diff --git a/test/test_trailing_omitted_format.vcf b/test/test_trailing_omitted_format.vcf new file mode 100644 index 000000000..68892198b --- /dev/null +++ b/test/test_trailing_omitted_format.vcf @@ -0,0 +1,9 @@ +##fileformat=VCFv4.3 +##contig= +##INFO= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE1 +chr1 1234 . t a . . FOO=1 GT:STR:FLT:INT .