Skip to content

Commit

Permalink
fix: makes Header.to_hashmap() not call unwrap.
Browse files Browse the repository at this point in the history
Regex still calls unwrap, add test to make sure it's safe. Change API to
return a Result.
  • Loading branch information
ArtRand committed Jun 14, 2023
1 parent e89538d commit bc5c9f8
Show file tree
Hide file tree
Showing 4 changed files with 65 additions and 31 deletions.
88 changes: 60 additions & 28 deletions src/bam/header.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
// except according to those terms.

use crate::bam::HeaderView;
use crate::errors::{Error, Result};
use lazy_static::lazy_static;
use linear_map::LinearMap;
use regex::Regex;
Expand Down Expand Up @@ -67,42 +68,50 @@ impl Header {
/// This returns a header as a HashMap.
/// Comment lines starting with "@CO" will NOT be included in the HashMap.
/// Comment lines can be obtained by the `comments` function.
pub fn to_hashmap(&self) -> HashMap<String, Vec<LinearMap<String, String>>> {
pub fn to_hashmap(&self) -> Result<HashMap<String, Vec<LinearMap<String, String>>>> {
let mut header_map = HashMap::default();

lazy_static! {
static ref REC_TYPE_RE: Regex = Regex::new(r"@([A-Z][A-Z])").unwrap();
static ref TAG_RE: Regex = Regex::new(r"([A-Za-z][A-Za-z0-9]):([ -~]*)").unwrap();
}

let header_string = String::from_utf8(self.to_bytes()).unwrap();

for line in header_string.split('\n').filter(|x| !x.is_empty()) {
let parts: Vec<_> = line.split('\t').filter(|x| !x.is_empty()).collect();
// assert!(rec_type_re.is_match(parts[0]));
let record_type = REC_TYPE_RE
.captures(parts[0])
.unwrap()
.get(1)
.unwrap()
.as_str()
.to_owned();
if record_type.eq("CO") {
continue;
}
let mut field = LinearMap::default();
for part in parts.iter().skip(1) {
let cap = TAG_RE.captures(part).unwrap();
let tag = cap.get(1).unwrap().as_str().to_owned();
let value = cap.get(2).unwrap().as_str().to_owned();
field.insert(tag, value);
if let Ok(header_string) = String::from_utf8(self.to_bytes()) {
for line in header_string.split('\n').filter(|x| !x.is_empty()) {
let parts: Vec<_> = line.split('\t').filter(|x| !x.is_empty()).collect();
if parts.is_empty() {
continue;
}
let record_type = REC_TYPE_RE
.captures(parts[0])
.and_then(|captures| captures.get(1))
.map(|m| m.as_str().to_owned());

if let Some(record_type) = record_type {
if record_type == "CO" {
continue
}
let mut field = LinearMap::default();
for part in parts.iter().skip(1) {
if let Some(cap) = TAG_RE.captures(part) {
let tag = cap.get(1).unwrap().as_str().to_owned();
let value = cap.get(2).unwrap().as_str().to_owned();
field.insert(tag, value);
} else {
return Err(Error::HeaderParse);
}
}
header_map
.entry(record_type)
.or_insert_with(Vec::new)
.push(field);
} else {
return Err(Error::HeaderParse);
}
}
header_map
.entry(record_type)
.or_insert_with(Vec::new)
.push(field);
Ok(header_map)
} else {
Err(Error::HeaderParse)
}
header_map
}

/// Returns an iterator of comment lines.
Expand Down Expand Up @@ -160,6 +169,7 @@ impl<'a> HeaderRecord<'a> {
#[cfg(test)]
mod tests {
use super::HeaderRecord;
use crate::bam::Header;

#[test]
fn test_push_tag() {
Expand All @@ -174,4 +184,26 @@ mod tests {

assert_eq!(record.to_bytes(), b"@HD\tX1:0\tX2:0\tX3:x\tX4:x\tX5:x");
}

#[test]
fn test_header_hash_map() {
let mut records = Vec::new();
let mut record = HeaderRecord::new(b"HD");
record.push_tag(b"X1", 0);
records.push(record);
let mut record = HeaderRecord::new(b"PG");
record.push_tag(b"ID", "mytool");
records.push(record);
let mut record = HeaderRecord::new(b"PG");
record.push_tag(b"ID", "other_tool");
records.push(record);
let header = Header {
records: records.into_iter().map(|rec| rec.to_bytes()).collect(),
};
let hm = header.to_hashmap().unwrap();
assert!(hm.contains_key("HD"));
assert!(hm.contains_key("PG"));
assert_eq!(hm.get("HD").unwrap().len(), 1);
assert_eq!(hm.get("PG").unwrap().len(), 2);
}
}
2 changes: 1 addition & 1 deletion src/bam/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2994,7 +2994,7 @@ CCCCCCCCCCCCCCCCCCC"[..],
#[test]
fn test_bam_header_sync() {
let reader = Reader::from_path("test/test_issue_156_no_text.bam").unwrap();
let header_hashmap = Header::from_template(reader.header()).to_hashmap();
let header_hashmap = Header::from_template(reader.header()).to_hashmap().unwrap();
let header_refseqs = header_hashmap.get("SQ".into()).unwrap();
assert_eq!(header_refseqs[0].get("SN").unwrap(), "ref_1",);
assert_eq!(header_refseqs[0].get("LN").unwrap(), "10000000",);
Expand Down
2 changes: 2 additions & 0 deletions src/errors.rs
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,8 @@ pub enum Error {
BamPileup,
#[error("file is not sorted by position")]
BamUnsorted,
#[error("error parsing header")]
HeaderParse,

// Errors for BAM auxiliary fields
#[error("failed to add aux field (out of memory?)")]
Expand Down
4 changes: 2 additions & 2 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@
//! let bam = bam::Reader::from_path(&"test/test.bam").unwrap();
//! let header = bam::Header::from_template(bam.header());
//!
//! // print header records to the terminal, akin to samtool
//! for (key, records) in header.to_hashmap() {
//! // print header records to the terminal, akin to samtools
//! for (key, records) in header.to_hashmap().expect("should parse header") {
//! for record in records {
//! println!("@{}\tSN:{}\tLN:{}", key, record["SN"], record["LN"]);
//! }
Expand Down

0 comments on commit bc5c9f8

Please sign in to comment.