From 257597f4219f33686a315eea2fa9c0b4d5a46c0c Mon Sep 17 00:00:00 2001 From: adamnemecek Date: Sun, 18 Jun 2023 18:58:33 -0700 Subject: [PATCH] Ran cargo fmt, some nitpicks. (#60) * ran cargo fmt * use Self * refactoring * use mathces * use Self * use matches + use Self * some refactoring in parser. * refactoring in parser.rs --- examples/utf8.rs | 138 ++--- examples/utf8_mixed.rs | 71 +-- src/char_class.rs | 6 +- src/lib.rs | 2 +- src/parser.rs | 153 +++--- src/utf8.rs | 1113 ++++++++++++++++++++-------------------- 6 files changed, 725 insertions(+), 758 deletions(-) diff --git a/examples/utf8.rs b/examples/utf8.rs index f781c87..918e356 100644 --- a/examples/utf8.rs +++ b/examples/utf8.rs @@ -1,68 +1,70 @@ -// Example shows basic UTF-8 combinators - -use pom::utf8::*; - -fn main() { - // Informal, Spanish-language movie database format - let input = "\ -Título: Abre los ojos -Año: 1997 -Director: Alejandro Amenábar - -Título: Amores Perros -Director: Alejandro González Iñárritu -Año: 2000 - -Título: La montaña sagrada -Año: 1973 -Director: Alejandro Jodorowsky -"; - - enum DataLine<'a> { - Title(&'a str), - Director(&'a str), - Year(i32), - } - - fn positive<'a>() -> Parser<'a, i32> { -// let integer = (one_of("123456789") - one_of("0123456789").repeat(0..)) | sym(b'0'); // TODO - let digit = one_of("0123456789"); - let integer = digit.discard().repeat(1..); - integer.collect().convert(|x|x.parse::()) - } - - fn rest_str<'a>() -> Parser<'a, &'a str> { - any().repeat(1..).collect() - } - - fn separator<'a>() ->Parser<'a, ()> { - seq(": ").discard() - } - - let parser = - (seq("Título") * separator() * rest_str().map(|s| DataLine::Title(s))) - | (seq("Director") * separator() * rest_str().map(|s| DataLine::Director(s))) - | (seq("Año") * separator() * positive().map(|i| DataLine::Year(i))); - - { - let mut title_opt:Option<&str> = None; - let mut year_opt:Option = None; - let mut director_opt:Option<&str> = None; - - for line in input.lines() { - if !line.is_empty() { // Skip blank lines without parsing - // Parse line - match parser.parse_str(line).unwrap() { - DataLine::Title(s) => title_opt = Some(s), - DataLine::Director(s) => director_opt = Some(s), - DataLine::Year(s) => year_opt = Some(s), - } - // When all three line types have been collected, print them - if let (Some(title), Some(year), Some(director)) = (title_opt,year_opt,director_opt) { - println!("Title: {}\nDirector: {}\nYear: {}\n", title, director, year); - (title_opt, year_opt, director_opt) = (None,None,None); - } - } - } - } -} +// Example shows basic UTF-8 combinators + +use pom::utf8::*; + +fn main() { + // Informal, Spanish-language movie database format + let input = "\ +Título: Abre los ojos +Año: 1997 +Director: Alejandro Amenábar + +Título: Amores Perros +Director: Alejandro González Iñárritu +Año: 2000 + +Título: La montaña sagrada +Año: 1973 +Director: Alejandro Jodorowsky +"; + + enum DataLine<'a> { + Title(&'a str), + Director(&'a str), + Year(i32), + } + + fn positive<'a>() -> Parser<'a, i32> { + // let integer = (one_of("123456789") - one_of("0123456789").repeat(0..)) | sym(b'0'); // TODO + let digit = one_of("0123456789"); + let integer = digit.discard().repeat(1..); + integer.collect().convert(|x| x.parse::()) + } + + fn rest_str<'a>() -> Parser<'a, &'a str> { + any().repeat(1..).collect() + } + + fn separator<'a>() -> Parser<'a, ()> { + seq(": ").discard() + } + + let parser = (seq("Título") * separator() * rest_str().map(|s| DataLine::Title(s))) + | (seq("Director") * separator() * rest_str().map(|s| DataLine::Director(s))) + | (seq("Año") * separator() * positive().map(|i| DataLine::Year(i))); + + { + let mut title_opt: Option<&str> = None; + let mut year_opt: Option = None; + let mut director_opt: Option<&str> = None; + + for line in input.lines() { + if !line.is_empty() { + // Skip blank lines without parsing + // Parse line + match parser.parse_str(line).unwrap() { + DataLine::Title(s) => title_opt = Some(s), + DataLine::Director(s) => director_opt = Some(s), + DataLine::Year(s) => year_opt = Some(s), + } + // When all three line types have been collected, print them + if let (Some(title), Some(year), Some(director)) = + (title_opt, year_opt, director_opt) + { + println!("Title: {}\nDirector: {}\nYear: {}\n", title, director, year); + (title_opt, year_opt, director_opt) = (None, None, None); + } + } + } + } +} diff --git a/examples/utf8_mixed.rs b/examples/utf8_mixed.rs index b1c86fc..63bbed9 100644 --- a/examples/utf8_mixed.rs +++ b/examples/utf8_mixed.rs @@ -1,35 +1,36 @@ -// Example shows UTF-8 combinators intermixed with binary combinators - -use pom::parser::*; -use pom::utf8; - -fn main() { - // A parser for MsgPack (but only messages encoding a string) - let testcases: [Vec; 6] = [ - vec![0b10100100, 0b11110000, 0b10011111, 0b10100100, 0b10010100], // 🤔, max-size 31 format - vec![0xd9, 4, 0b11110000, 0b10011111, 0b10011000, 0b10101110], // 😮, max-size 255 format - vec![0xda, 0, 4, 0b11110000, 0b10011111, 0b10100100, 0b10101111], // 🤯, max-size 2^16-1 format - vec![0xdb, 0, 0, 0, 4, 0b11110000, 0b10011111, 0b10010010, 0b10100101], // 💥, max-size 2^32-1 format - vec![0xc4, 4, 0b11110000, 0b10011111, 0b10011000, 0b10101110], // Valid MsgPack, but not a string (binary) - vec![0b10100100, 0b10010100, 0b10100100, 0b10011111, 0b11110000], // A MsgPack string, but invalid UTF-8 - ]; - - const MASK:u8 = 0b11100000; // size 31 format is denoted by 3 high bits == 101 - const SIZE_31:u8 = 0b10100000; - - fn rest_as_str<'a>() -> utf8::Parser<'a, &'a str> { - utf8::any().repeat(0..).collect() - } - - // Demo parser does not verify that the claimed length matches the actual length (but checking so is simple with >>) - let parser = - (sym(0xdb) * any().repeat(4) * rest_as_str()) // 2^32-1 format - | (sym(0xda) * any().repeat(2) * rest_as_str()) // 2^16-1 format - | (sym(0xd9) * any() * rest_as_str()) // 255 format - | (is_a(|x| x&MASK == SIZE_31) * rest_as_str()) // 31 format - - end(); - - for testcase in testcases.iter() { - println!("{:?}", parser.parse(testcase)); - } -} +// Example shows UTF-8 combinators intermixed with binary combinators + +use pom::parser::*; +use pom::utf8; + +fn main() { + // A parser for MsgPack (but only messages encoding a string) + let testcases: [Vec; 6] = [ + vec![0b10100100, 0b11110000, 0b10011111, 0b10100100, 0b10010100], // 🤔, max-size 31 format + vec![0xd9, 4, 0b11110000, 0b10011111, 0b10011000, 0b10101110], // 😮, max-size 255 format + vec![0xda, 0, 4, 0b11110000, 0b10011111, 0b10100100, 0b10101111], // 🤯, max-size 2^16-1 format + vec![ + 0xdb, 0, 0, 0, 4, 0b11110000, 0b10011111, 0b10010010, 0b10100101, + ], // 💥, max-size 2^32-1 format + vec![0xc4, 4, 0b11110000, 0b10011111, 0b10011000, 0b10101110], // Valid MsgPack, but not a string (binary) + vec![0b10100100, 0b10010100, 0b10100100, 0b10011111, 0b11110000], // A MsgPack string, but invalid UTF-8 + ]; + + const MASK: u8 = 0b11100000; // size 31 format is denoted by 3 high bits == 101 + const SIZE_31: u8 = 0b10100000; + + fn rest_as_str<'a>() -> utf8::Parser<'a, &'a str> { + utf8::any().repeat(0..).collect() + } + + // Demo parser does not verify that the claimed length matches the actual length (but checking so is simple with >>) + let parser = (sym(0xdb) * any().repeat(4) * rest_as_str()) // 2^32-1 format + | (sym(0xda) * any().repeat(2) * rest_as_str()) // 2^16-1 format + | (sym(0xd9) * any() * rest_as_str()) // 255 format + | (is_a(|x| x&MASK == SIZE_31) * rest_as_str()) // 31 format + - end(); + + for testcase in testcases.iter() { + println!("{:?}", parser.parse(testcase)); + } +} diff --git a/src/char_class.rs b/src/char_class.rs index b05c053..73c2e06 100644 --- a/src/char_class.rs +++ b/src/char_class.rs @@ -19,7 +19,7 @@ pub fn alphanum(term: u8) -> bool { /// Recognises a hexadecimal digit, `0-9a-fA-F`. #[inline] pub fn hex_digit(term: u8) -> bool { - (0x30..=0x39).contains(&term) || (0x41..=0x46).contains(&term) || (0x61..=0x66).contains(&term) + matches!(term, 0x30..=0x39 | 0x41..=0x46 | 0x61..=0x66) } /// Recognises an octal digit, `0-7`. @@ -31,11 +31,11 @@ pub fn oct_digit(term: u8) -> bool { /// Recognises a space or tab. #[inline] pub fn space(term: u8) -> bool { - term == b' ' || term == b'\t' + matches!(term, b' ' | b'\t') } /// Recognises a space, tab, line feed, or carriage return. #[inline] pub fn multispace(term: u8) -> bool { - space(term) || term == b'\n' || term == b'\r' + space(term) || matches!(term, b'\n' | b'\r') } diff --git a/src/lib.rs b/src/lib.rs index 395a619..6c3bd5c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -9,7 +9,7 @@ pub mod parser; pub mod char_class; /// Variants of parser functions specialized for matching UTF-8 strings and returning chars. -/// Method and constructor names/functionality are generally the same as in base parser module. +/// Method and constructor names/functionality are generally the same as in base parser module. #[cfg(feature = "utf8")] pub mod utf8; diff --git a/src/parser.rs b/src/parser.rs index 75961e7..7d8be24 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -14,11 +14,11 @@ pub struct Parser<'a, I, O> { impl<'a, I, O> Parser<'a, I, O> { /// Create new parser. - pub fn new

(parse: P) -> Parser<'a, I, O> + pub fn new

(parse: P) -> Self where P: Fn(&'a [I], usize) -> Result<(O, usize)> + 'a, { - Parser { + Self { method: Box::new(parse), } } @@ -66,14 +66,14 @@ impl<'a, I, O> Parser<'a, I, O> { } /// Cache parser output result to speed up backtracking. - pub fn cache(self) -> Parser<'a, I, O> + pub fn cache(self) -> Self where O: Clone + 'a, { use std::cell::RefCell; use std::collections::HashMap; let results = RefCell::new(HashMap::new()); - Parser::new(move |input: &'a [I], start: usize| { + Self::new(move |input: &'a [I], start: usize| { let key = (start, format!("{:p}", &self.method)); results .borrow_mut() @@ -153,12 +153,9 @@ impl<'a, I, O> Parser<'a, I, O> { Unbounded => (), } - if let Ok((item, item_pos)) = (self.method)(input, pos) { - items.push(item); - pos = item_pos; - } else { - break; - } + let Ok((item, item_pos)) = (self.method)(input, pos) else { break }; + items.push(item); + pos = item_pos; } if let Included(&min_count) = range.start() { if items.len() < min_count { @@ -177,7 +174,7 @@ impl<'a, I, O> Parser<'a, I, O> { } /// Give parser a name to identify parsing errors. - pub fn name(self, name: &'a str) -> Parser<'a, I, O> + pub fn name(self, name: &'a str) -> Self where O: 'a, { @@ -197,7 +194,7 @@ impl<'a, I, O> Parser<'a, I, O> { } /// Mark parser as expected, abort early when failed in ordered choice. - pub fn expect(self, name: &'a str) -> Parser<'a, I, O> + pub fn expect(self, name: &'a str) -> Self where O: 'a, { @@ -225,14 +222,13 @@ where I: Clone, { Parser::new(|input: &[I], start: usize| { - if let Some(s) = input.get(start) { - Ok((s.clone(), start + 1)) - } else { - Err(Error::Mismatch { + let Some(s) = input.get(start) else { + return Err(Error::Mismatch { message: "end of input reached".to_owned(), position: start, }) - } + }; + Ok((s.clone(), start + 1)) }) } @@ -242,18 +238,14 @@ where I: Clone + PartialEq + Display, { Parser::new(move |input: &'a [I], start: usize| { - if let Some(s) = input.get(start) { - if t == *s { - Ok((s.clone(), start + 1)) - } else { - Err(Error::Mismatch { - message: format!("expect: {}, found: {}", t, s), - position: start, - }) - } - } else { - Err(Error::Incomplete) + let Some(s) = input.get(start) else { return Err(Error::Incomplete) }; + if t != *s { + return Err(Error::Mismatch { + message: format!("expect: {}, found: {}", t, s), + position: start, + }); } + Ok((s.clone(), start + 1)) }) } @@ -269,15 +261,12 @@ where if index == tag.len() { return Ok((tag, pos)); } - if let Some(s) = input.get(pos) { - if tag[index] != *s { - return Err(Error::Mismatch { - message: format!("seq {:?} expect: {:?}, found: {:?}", tag, tag[index], s), - position: pos, - }); - } - } else { - return Err(Error::Incomplete); + let Some(s) = input.get(pos) else { return Err(Error::Incomplete) }; + if tag[index] != *s { + return Err(Error::Mismatch { + message: format!("seq {:?} expect: {:?}, found: {:?}", tag, tag[index], s), + position: pos, + }); } index += 1; } @@ -289,15 +278,12 @@ pub fn tag<'a, 'b: 'a>(tag: &'b str) -> Parser<'a, char, &'a str> { Parser::new(move |input: &'a [char], start: usize| { let mut pos = start; for c in tag.chars() { - if let Some(&s) = input.get(pos) { - if c != s { - return Err(Error::Mismatch { - message: format!("tag {:?} expect: {:?}, found: {}", tag, c, s), - position: pos, - }); - } - } else { - return Err(Error::Incomplete); + let Some(&s) = input.get(pos) else { return Err(Error::Incomplete) }; + if c != s { + return Err(Error::Mismatch { + message: format!("tag {:?} expect: {:?}, found: {}", tag, c, s), + position: pos, + }); } pos += 1; } @@ -341,18 +327,14 @@ where S: Set + ?Sized, { Parser::new(move |input: &'a [I], start: usize| { - if let Some(s) = input.get(start) { - if set.contains(s) { - Ok((s.clone(), start + 1)) - } else { - Err(Error::Mismatch { - message: format!("expect one of: {}, found: {}", set.to_str(), s), - position: start, - }) - } - } else { - Err(Error::Incomplete) - } + let Some(s) = input.get(start) else {return Err(Error::Incomplete) }; + if !set.contains(s) { + return Err(Error::Mismatch { + message: format!("expect one of: {}, found: {}", set.to_str(), s), + position: start, + }); + }; + Ok((s.clone(), start + 1)) }) } @@ -363,18 +345,14 @@ where S: Set + ?Sized, { Parser::new(move |input: &'a [I], start: usize| { - if let Some(s) = input.get(start) { - if set.contains(s) { - Err(Error::Mismatch { - message: format!("expect none of: {}, found: {}", set.to_str(), s), - position: start, - }) - } else { - Ok((s.clone(), start + 1)) - } - } else { - Err(Error::Incomplete) + let Some(s) = input.get(start) else {return Err(Error::Incomplete) }; + if set.contains(s) { + return Err(Error::Mismatch { + message: format!("expect none of: {}, found: {}", set.to_str(), s), + position: start, + }); } + Ok((s.clone(), start + 1)) }) } @@ -385,18 +363,15 @@ where F: Fn(I) -> bool + 'a, { Parser::new(move |input: &'a [I], start: usize| { - if let Some(s) = input.get(start) { - if predicate(s.clone()) { - Ok((s.clone(), start + 1)) - } else { - Err(Error::Mismatch { - message: format!("is_a predicate failed on: {}", s), - position: start, - }) - } - } else { - Err(Error::Incomplete) + let Some(s) = input.get(start) else { return Err(Error::Incomplete) }; + if !predicate(s.clone()) { + return Err(Error::Mismatch { + message: format!("is_a predicate failed on: {}", s), + position: start, + }); } + + Ok((s.clone(), start + 1)) }) } @@ -407,18 +382,14 @@ where F: Fn(I) -> bool + 'a, { Parser::new(move |input: &'a [I], start: usize| { - if let Some(s) = input.get(start) { - if predicate(s.clone()) { - Err(Error::Mismatch { - message: format!("not_a predicate failed on: {}", s), - position: start, - }) - } else { - Ok((s.clone(), start + 1)) - } - } else { - Err(Error::Incomplete) + let Some(s) = input.get(start) else { return Err(Error::Incomplete) }; + if predicate(s.clone()) { + return Err(Error::Mismatch { + message: format!("not_a predicate failed on: {}", s), + position: start, + }); } + Ok((s.clone(), start + 1)) }) } diff --git a/src/utf8.rs b/src/utf8.rs index c6fbb81..3a53172 100644 --- a/src/utf8.rs +++ b/src/utf8.rs @@ -1,560 +1,553 @@ -// Variants of parser functions specialized for matching UTF-8 strings and returning chars - -use super::{Error, Result}; -use super::parser; -use crate::range::RangeArgument; -use crate::set::Set; -use std::str; -use std::fmt::Debug; -use bstr::decode_utf8; -use std::ops::{Add, BitOr, Mul, Neg, Not, Shr, Sub}; - -// / Parser combinator. -//type Parse<'a, O> = dyn Fn(&'a [u8], usize) -> Result<(O, usize)> + 'a; - -/// Being wrapped in this struct guarantees that the parser within will only match valid UTF-8 strings. -pub struct Parser<'a, O> (parser::Parser<'a, u8, O>); - -impl<'a, O> Parser<'a, O> { - /// Create new parser. - pub fn new

(parse: P) -> Parser<'a, O> - where - P: Fn(&'a [u8], usize) -> Result<(O, usize)> + 'a, - { - Parser( parser::Parser::new(parse) ) - } - - /// Collect all matched input symbols. - // This method is the primary reason utf8::Parser exists at all. - pub fn collect(self) -> Parser<'a, &'a str> - where - O: 'a, - { - Parser( self.0.collect().map( - // UNSAFE: Because we only could have constructed this object from other utf8::Parser objects, the match space must be valid UTF-8 - |s| unsafe { str::from_utf8_unchecked(s) } - ) ) - } - - // Remaining methods in impl only delegate to base parser::Parser - - /// Apply the parser to parse input. - pub fn parse(&self, input: &'a [u8]) -> Result { - self.0.parse(input) - } - - /// Parse input at specified byte position. - pub fn parse_at(&self, input: &'a [u8], start: usize) -> Result<(O, usize)> { - self.0.parse_at(input, start) - } - - /// Apply the parser to parse input. - pub fn parse_str(&self, input: &'a str) -> Result { - self.0.parse(input.as_bytes()) - } - - /// Convert parser result to desired value. - pub fn map(self, f: F) -> Parser<'a, U> - where - F: Fn(O) -> U + 'a, - O: 'a, - U: 'a, - { - Parser( self.0.map(f) ) - } - - /// Convert parser result to desired value, fail in case of conversion error. - pub fn convert(self, f: F) -> Parser<'a, U> - where - F: Fn(O) -> ::std::result::Result + 'a, - E: Debug, - O: 'a, - U: 'a, - { - Parser( self.0.convert(f) ) - } - - /// Cache parser output result to speed up backtracking. - pub fn cache(self) -> Parser<'a, O> - where - O: Clone + 'a, - { - Parser( self.0.cache() ) - } - - /// Get input position after matching parser. - pub fn pos(self) -> Parser<'a, usize> - where - O: 'a, - { - Parser( self.0.pos() ) - } - - /// Discard parser output. - pub fn discard(self) -> Parser<'a, ()> - where - O: 'a, - { - Parser( self.0.discard() ) - } - - /// Make parser optional. - pub fn opt(self) -> Parser<'a, Option> - where - O: 'a, - { - Parser( self.0.opt() ) - } - - /// `p.repeat(5)` repeat p exactly 5 times - /// `p.repeat(0..)` repeat p zero or more times - /// `p.repeat(1..)` repeat p one or more times - /// `p.repeat(1..4)` match p at least 1 and at most 3 times - pub fn repeat(self, range: R) -> Parser<'a, Vec> - where - R: RangeArgument + Debug + 'a, - O: 'a, - { - Parser( self.0.repeat(range) ) - } - - /// Give parser a name to identify parsing errors. - pub fn name(self, name: &'a str) -> Parser<'a, O> - where - O: 'a, - { - Parser( self.0.name(name) ) - } - - /// Mark parser as expected, abort early when failed in ordered choice. - pub fn expect(self, name: &'a str) -> Parser<'a, O> - where - O: 'a, - { - Parser( self.0.expect(name) ) - } -} - -impl<'a, O> From> for parser::Parser<'a, u8, O> { - fn from(parser: Parser<'a, O>) -> Self { - parser.0 // Simply unwrap - } -} - -// Helper for functions that decode_utf8 and fail -fn no_utf8(start: usize, size: usize) -> Result { - if size == 0 { - Err(Error::Mismatch { - message: "end of input reached".to_owned(), - position: start, - }) - } else { - Err(Error::Mismatch { - message: "not UTF-8".to_owned(), - position: start, - }) - } -} - -/// Match any UTF-8 character. -pub fn any<'a>() -> Parser<'a, char> -{ - Parser::new(|input: &[u8], start: usize| { - let (ch, size) = decode_utf8(&input[start..]); - - if let Some(ch) = ch { - let pos = start+size; - - Ok((ch, pos)) - } else { - no_utf8(start, size) - } - }) -} - -/// Match specific UTF-8 character. -pub fn sym<'a>(tag: char) -> Parser<'a, char> -{ - Parser::new(move |input: &[u8], start: usize| { - let (ch, size) = decode_utf8(&input[start..]); - - if let Some(ch) = ch { - if ch == tag { - let pos = start+size; - - Ok((ch, pos)) - } else { - Err(Error::Mismatch { - message: format!("expect: {}, found: {}", tag, ch), - position: start, - }) - } - } else { - no_utf8(start, size) - } - }) -} - -/// Success when sequence of chars matches current input. -pub fn seq<'a, 'b: 'a>(tag_str: &'b str) -> Parser<'a, &'a str> -{ - let tag = tag_str.as_bytes(); - Parser::new(move |input: &'a [u8], start: usize| { - let mut index = 0; - loop { - let pos = start + index; - if index == tag.len() { - let result = &input[start..pos]; - // UNSAFE: Because slice is byte-identical to a str, it is known valid UTF-8 - let result_str = unsafe { str::from_utf8_unchecked(result) }; - return Ok((result_str, pos)); - } - if let Some(s) = input.get(pos) { - if tag[index] != *s { - return Err(Error::Mismatch { - message: format!("seq {:?} at byte index: {}", tag, pos), - position: pos, - }); - } - } else { - return Err(Error::Incomplete); - } - index += 1; - } - }) -} - -/// Success when current input symbol is one of the set. -pub fn one_of<'a, S>(set: &'a S) -> Parser<'a, char> -where - S: Set + ?Sized, -{ - Parser::new(move |input: &'a [u8], start: usize| { - let (ch, size) = decode_utf8(&input[start..]); - - if let Some(ch) = ch { - if set.contains(&ch) { - let pos = start+size; - - Ok((ch, pos)) - } else { - Err(Error::Mismatch { - message: format!("expect one of: {}, found: {}", set.to_str(), ch), - position: start, - }) - } - } else { - no_utf8(start, size) - } - }) -} - -/// Success when current input symbol is none of the set. -pub fn none_of<'a, S>(set: &'a S) -> Parser<'a, char> -where - S: Set + ?Sized, -{ - Parser::new(move |input: &'a [u8], start: usize| { - let (ch, size) = decode_utf8(&input[start..]); - - if let Some(ch) = ch { - if !set.contains(&ch) { - let pos = start+size; - - Ok((ch, pos)) - } else { - Err(Error::Mismatch { - message: format!("expect one of: {}, found: {}", set.to_str(), ch), - position: start, - }) - } - } else { - no_utf8(start, size) - } - }) -} - -/// Success when predicate returns true on current input symbol. -pub fn is_a<'a, F>(predicate: F) -> Parser<'a, char> -where - F: Fn(char) -> bool + 'a, -{ - Parser::new(move |input: &'a [u8], start: usize| { - let (ch, size) = decode_utf8(&input[start..]); - - if let Some(ch) = ch { - if predicate(ch) { - let pos = start+size; - - Ok((ch, pos)) - } else { - Err(Error::Mismatch { - message: format!("is_a predicate failed on: {}", ch), - position: start, - }) - } - } else { - no_utf8(start, size) - } - }) -} - -/// Success when predicate returns false on current input symbol. -pub fn not_a<'a, F>(predicate: F) -> Parser<'a, char> -where - F: Fn(char) -> bool + 'a, -{ - Parser::new(move |input: &'a [u8], start: usize| { - let (ch, size) = decode_utf8(&input[start..]); - - if let Some(ch) = ch { - if !predicate(ch) { - let pos = start+size; - - Ok((ch, pos)) - } else { - Err(Error::Mismatch { - message: format!("is_a predicate failed on: {}", ch), - position: start, - }) - } - } else { - no_utf8(start, size) - } - }) -} - -/// Read n chars. -pub fn take<'a>(n: usize) -> Parser<'a, &'a str> { - Parser::new(move |input: &'a [u8], start: usize| { - let mut byte_pos = start; - for _ in 0..n { - let (ch, size) = decode_utf8(&input[start..]); - if ch.is_none() { - return no_utf8(byte_pos, size) - } - byte_pos += size; - } - let result = &input[start..byte_pos]; - // UNSAFE: Because every char has been checked by decode_utf8, this string is known utf8 - let result_str = unsafe { str::from_utf8_unchecked(result) }; - Ok((result_str, byte_pos)) - }) -} - -/// Skip n symbols. -pub fn skip<'a, I>(n: usize) -> Parser<'a, ()> { - Parser::new(move |input: &'a [u8], start: usize| { - let mut byte_pos = start; - for _ in 0..n { - let (ch, size) = decode_utf8(&input[start..]); - if ch.is_none() { - return no_utf8(byte_pos, size) - } - byte_pos += size; - } - Ok(((), byte_pos)) - }) -} - -/// Read n bytes exactly. -pub fn take_bytes<'a>(n: usize) -> Parser<'a, &'a str> { - Parser::new(move |input: &'a [u8], start: usize| { - // FIXME: This runs in linear time because it checks each character. - // If we could remember which inputs were passed in from parse_str() instead of parse(), - // we could assume the characters are valid utf8 and run this in constant time by only checking - // the final character using bstr::decode_last_utf8. - let mut byte_pos = start; - loop { - let (ch, size) = decode_utf8(&input[start..]); - if ch.is_none() { - return no_utf8(byte_pos, size) - } - byte_pos += size; - if byte_pos > n { - return Err(Error::Mismatch { - message: "range splits a UTF-8 character".to_owned(), - position: start, - }) - } - if byte_pos == n { - let result = &input[start..byte_pos]; - // UNSAFE: Because every char has been checked by decode_utf8, this string is known utf8 - let result_str = unsafe { str::from_utf8_unchecked(result) }; - return Ok((result_str, byte_pos)) - } - } - }) -} - -/// Skip n bytes exactly. -pub fn skip_bytes<'a>(n: usize) -> Parser<'a, ()> { - Parser::new(move |input: &'a [u8], start: usize| { - // FIXME: See note on take_bytes. - let mut byte_pos = start; - loop { - let (ch, size) = decode_utf8(&input[start..]); - if ch.is_none() { - return no_utf8(byte_pos, size) - } - byte_pos += size; - if byte_pos > n { - return Err(Error::Mismatch { - message: "range splits a UTF-8 character".to_owned(), - position: start, - }) - } - if byte_pos == n { - return Ok(((), byte_pos)) - } - } - }) -} - -/// Chain two parsers where the second parser depends on the first's result. -impl<'a, O: 'a, U: 'a, F: Fn(O) -> Parser<'a, U> + 'a> Shr for Parser<'a, O> { - type Output = Parser<'a, U>; - - fn shr(self, other: F) -> Self::Output { - Parser::new(move |input: &'a [u8], start: usize| { - (self.0.method)(input, start).and_then(|(out, pos)| (other(out).0.method)(input, pos)) - }) - } -} - -// Note: There are no "degrade to parser::Parser" implementations for >> -// because Rust cannot tell the difference between an FN(O)->U and an FN(O)->V. - -// Remaining functions in file only delegate to base parser::Parser - -/// Always succeeds, consume no input. -pub fn empty<'a>() -> Parser<'a, ()> { - Parser( parser::empty() ) -} - -/// Parse separated list. -pub fn list<'a, I, O, U>( - item: Parser<'a, O>, - separator: Parser<'a, U>, -) -> Parser<'a, Vec> -where - O: 'a, - U: 'a, -{ - Parser( parser::list(item.0, separator.0) ) -} - -/// Call a parser factory, can be used to create recursive parsers. -pub fn call<'a, O, F>(parser_factory: F) -> Parser<'a, O> -where - O: 'a, - F: Fn() -> Parser<'a, O> + 'a, -{ - Parser( parser::call(move || parser_factory().0) ) -} - -/// Success when end of input is reached. -pub fn end<'a, I>() -> Parser<'a, ()> -{ - Parser( parser::end() ) -} - -// And, Sub and Mul are similar enough we can implement them with macros - -macro_rules! utf_op { - ( $impl_name:ident, $fn_name:ident, $op:tt, $return_type:ty, $doc:expr ) => { - #[doc=$doc] - impl<'a, Left: 'a, Right: 'a> $impl_name> for Parser<'a, Left> { - type Output = Parser<'a, $return_type>; - - fn $fn_name (self, other: Parser<'a, Right>) -> Self::Output { - Parser(self.0 $op other.0) - } - } - }; -} - -macro_rules! utf_u8_op { - ( $impl_name:ident, $fn_name:ident, $op:tt, $return_type:ty, $doc:expr ) => { - #[doc=concat!($doc, " (but degrade to non-utf8 parser)")] - impl<'a, Left: 'a, Right: 'a> $impl_name> for Parser<'a, Left> { - type Output = parser::Parser<'a, u8, $return_type>; - - fn $fn_name (self, other: parser::Parser<'a, u8, Right>) -> Self::Output { - self.0 $op other - } - } - }; -} - -macro_rules! u8_utf_op { - ( $impl_name:ident, $fn_name:ident, $op:tt, $return_type:ty, $doc:expr ) => { - #[doc=concat!($doc, " (but degrade to non-utf8 parser)")] - impl<'a, Left: 'a, Right: 'a> $impl_name> for parser::Parser<'a, u8, Left> { - type Output = parser::Parser<'a, u8, $return_type>; - - fn $fn_name (self, other: Parser<'a, Right>) -> Self::Output { - self $op other.0 - } - } - }; -} - -macro_rules! all_op { - ( $impl_name:ident, $fn_name:ident, $op:tt, $return_type:ty, $doc:expr ) => { - utf_op!($impl_name, $fn_name, $op, $return_type, $doc); - utf_u8_op!($impl_name, $fn_name, $op, $return_type, $doc); - u8_utf_op!($impl_name, $fn_name, $op, $return_type, $doc); - } -} - -all_op!(Add, add, +, (Left, Right), "Sequence reserve value"); - -all_op!(Sub, sub, -, Left, "Sequence discard second value"); - -all_op!(Mul, mul, *, Right, "Sequence discard first value"); - -/// Ordered choice -impl<'a, O: 'a> BitOr for Parser<'a, O> { - type Output = Parser<'a, O>; - - fn bitor(self, other: Parser<'a, O>) -> Self::Output { - Parser(self.0 | other.0) - } -} - -/// Ordered choice (but degrade to non-utf8 parser) -impl<'a, O: 'a> BitOr> for Parser<'a, O> { - type Output = parser::Parser<'a, u8, O>; - - fn bitor(self, other: parser::Parser<'a, u8, O>) -> Self::Output { - self.0 | other - } -} - -/// Ordered choice (but degrade to non-utf8 parser) -impl<'a, O: 'a> BitOr> for parser::Parser<'a, u8, O> { - type Output = parser::Parser<'a, u8, O>; - - fn bitor(self, other: Parser<'a, O>) -> Self::Output { - self | other.0 - } -} - -/// And predicate -impl<'a, O: 'a> Neg for Parser<'a, O> { - type Output = Parser<'a, bool>; - - fn neg(self) -> Self::Output { - Parser( -self.0 ) - } -} - -/// Not predicate -impl<'a, O: 'a> Not for Parser<'a, O> { - type Output = Parser<'a, bool>; - - fn not(self) -> Self::Output { - Parser( !self.0 ) - } -} +// Variants of parser functions specialized for matching UTF-8 strings and returning chars + +use super::parser; +use super::{Error, Result}; +use crate::range::RangeArgument; +use crate::set::Set; +use bstr::decode_utf8; +use std::fmt::Debug; +use std::ops::{Add, BitOr, Mul, Neg, Not, Shr, Sub}; +use std::str; + +// / Parser combinator. +//type Parse<'a, O> = dyn Fn(&'a [u8], usize) -> Result<(O, usize)> + 'a; + +/// Being wrapped in this struct guarantees that the parser within will only match valid UTF-8 strings. +pub struct Parser<'a, O>(parser::Parser<'a, u8, O>); + +impl<'a, O> Parser<'a, O> { + /// Create new parser. + pub fn new

(parse: P) -> Self + where + P: Fn(&'a [u8], usize) -> Result<(O, usize)> + 'a, + { + Self(parser::Parser::new(parse)) + } + + /// Collect all matched input symbols. + // This method is the primary reason utf8::Parser exists at all. + pub fn collect(self) -> Parser<'a, &'a str> + where + O: 'a, + { + Parser(self.0.collect().map( + // UNSAFE: Because we only could have constructed this object from other utf8::Parser objects, the match space must be valid UTF-8 + |s| unsafe { str::from_utf8_unchecked(s) }, + )) + } + + // Remaining methods in impl only delegate to base parser::Parser + + /// Apply the parser to parse input. + pub fn parse(&self, input: &'a [u8]) -> Result { + self.0.parse(input) + } + + /// Parse input at specified byte position. + pub fn parse_at(&self, input: &'a [u8], start: usize) -> Result<(O, usize)> { + self.0.parse_at(input, start) + } + + /// Apply the parser to parse input. + pub fn parse_str(&self, input: &'a str) -> Result { + self.0.parse(input.as_bytes()) + } + + /// Convert parser result to desired value. + pub fn map(self, f: F) -> Parser<'a, U> + where + F: Fn(O) -> U + 'a, + O: 'a, + U: 'a, + { + Parser(self.0.map(f)) + } + + /// Convert parser result to desired value, fail in case of conversion error. + pub fn convert(self, f: F) -> Parser<'a, U> + where + F: Fn(O) -> ::std::result::Result + 'a, + E: Debug, + O: 'a, + U: 'a, + { + Parser(self.0.convert(f)) + } + + /// Cache parser output result to speed up backtracking. + pub fn cache(self) -> Self + where + O: Clone + 'a, + { + Self(self.0.cache()) + } + + /// Get input position after matching parser. + pub fn pos(self) -> Parser<'a, usize> + where + O: 'a, + { + Parser(self.0.pos()) + } + + /// Discard parser output. + pub fn discard(self) -> Parser<'a, ()> + where + O: 'a, + { + Parser(self.0.discard()) + } + + /// Make parser optional. + pub fn opt(self) -> Parser<'a, Option> + where + O: 'a, + { + Parser(self.0.opt()) + } + + /// `p.repeat(5)` repeat p exactly 5 times + /// `p.repeat(0..)` repeat p zero or more times + /// `p.repeat(1..)` repeat p one or more times + /// `p.repeat(1..4)` match p at least 1 and at most 3 times + pub fn repeat(self, range: R) -> Parser<'a, Vec> + where + R: RangeArgument + Debug + 'a, + O: 'a, + { + Parser(self.0.repeat(range)) + } + + /// Give parser a name to identify parsing errors. + pub fn name(self, name: &'a str) -> Self + where + O: 'a, + { + Self(self.0.name(name)) + } + + /// Mark parser as expected, abort early when failed in ordered choice. + pub fn expect(self, name: &'a str) -> Self + where + O: 'a, + { + Self(self.0.expect(name)) + } +} + +impl<'a, O> From> for parser::Parser<'a, u8, O> { + fn from(parser: Parser<'a, O>) -> Self { + parser.0 // Simply unwrap + } +} + +// Helper for functions that decode_utf8 and fail +fn no_utf8(start: usize, size: usize) -> Result { + if size == 0 { + Err(Error::Mismatch { + message: "end of input reached".to_owned(), + position: start, + }) + } else { + Err(Error::Mismatch { + message: "not UTF-8".to_owned(), + position: start, + }) + } +} + +/// Match any UTF-8 character. +pub fn any<'a>() -> Parser<'a, char> { + Parser::new(|input: &[u8], start: usize| { + let (ch, size) = decode_utf8(&input[start..]); + + if let Some(ch) = ch { + let pos = start + size; + + Ok((ch, pos)) + } else { + no_utf8(start, size) + } + }) +} + +/// Match specific UTF-8 character. +pub fn sym<'a>(tag: char) -> Parser<'a, char> { + Parser::new(move |input: &[u8], start: usize| { + let (ch, size) = decode_utf8(&input[start..]); + + if let Some(ch) = ch { + if ch == tag { + let pos = start + size; + + Ok((ch, pos)) + } else { + Err(Error::Mismatch { + message: format!("expect: {}, found: {}", tag, ch), + position: start, + }) + } + } else { + no_utf8(start, size) + } + }) +} + +/// Success when sequence of chars matches current input. +pub fn seq<'a, 'b: 'a>(tag_str: &'b str) -> Parser<'a, &'a str> { + let tag = tag_str.as_bytes(); + Parser::new(move |input: &'a [u8], start: usize| { + let mut index = 0; + loop { + let pos = start + index; + if index == tag.len() { + let result = &input[start..pos]; + // UNSAFE: Because slice is byte-identical to a str, it is known valid UTF-8 + let result_str = unsafe { str::from_utf8_unchecked(result) }; + return Ok((result_str, pos)); + } + if let Some(s) = input.get(pos) { + if tag[index] != *s { + return Err(Error::Mismatch { + message: format!("seq {:?} at byte index: {}", tag, pos), + position: pos, + }); + } + } else { + return Err(Error::Incomplete); + } + index += 1; + } + }) +} + +/// Success when current input symbol is one of the set. +pub fn one_of<'a, S>(set: &'a S) -> Parser<'a, char> +where + S: Set + ?Sized, +{ + Parser::new(move |input: &'a [u8], start: usize| { + let (ch, size) = decode_utf8(&input[start..]); + + if let Some(ch) = ch { + if set.contains(&ch) { + let pos = start + size; + + Ok((ch, pos)) + } else { + Err(Error::Mismatch { + message: format!("expect one of: {}, found: {}", set.to_str(), ch), + position: start, + }) + } + } else { + no_utf8(start, size) + } + }) +} + +/// Success when current input symbol is none of the set. +pub fn none_of<'a, S>(set: &'a S) -> Parser<'a, char> +where + S: Set + ?Sized, +{ + Parser::new(move |input: &'a [u8], start: usize| { + let (ch, size) = decode_utf8(&input[start..]); + + if let Some(ch) = ch { + if !set.contains(&ch) { + let pos = start + size; + + Ok((ch, pos)) + } else { + Err(Error::Mismatch { + message: format!("expect one of: {}, found: {}", set.to_str(), ch), + position: start, + }) + } + } else { + no_utf8(start, size) + } + }) +} + +/// Success when predicate returns true on current input symbol. +pub fn is_a<'a, F>(predicate: F) -> Parser<'a, char> +where + F: Fn(char) -> bool + 'a, +{ + Parser::new(move |input: &'a [u8], start: usize| { + let (ch, size) = decode_utf8(&input[start..]); + + if let Some(ch) = ch { + if predicate(ch) { + let pos = start + size; + + Ok((ch, pos)) + } else { + Err(Error::Mismatch { + message: format!("is_a predicate failed on: {}", ch), + position: start, + }) + } + } else { + no_utf8(start, size) + } + }) +} + +/// Success when predicate returns false on current input symbol. +pub fn not_a<'a, F>(predicate: F) -> Parser<'a, char> +where + F: Fn(char) -> bool + 'a, +{ + Parser::new(move |input: &'a [u8], start: usize| { + let (ch, size) = decode_utf8(&input[start..]); + + if let Some(ch) = ch { + if !predicate(ch) { + let pos = start + size; + + Ok((ch, pos)) + } else { + Err(Error::Mismatch { + message: format!("is_a predicate failed on: {}", ch), + position: start, + }) + } + } else { + no_utf8(start, size) + } + }) +} + +/// Read n chars. +pub fn take<'a>(n: usize) -> Parser<'a, &'a str> { + Parser::new(move |input: &'a [u8], start: usize| { + let mut byte_pos = start; + for _ in 0..n { + let (ch, size) = decode_utf8(&input[start..]); + if ch.is_none() { + return no_utf8(byte_pos, size); + } + byte_pos += size; + } + let result = &input[start..byte_pos]; + // UNSAFE: Because every char has been checked by decode_utf8, this string is known utf8 + let result_str = unsafe { str::from_utf8_unchecked(result) }; + Ok((result_str, byte_pos)) + }) +} + +/// Skip n symbols. +pub fn skip<'a, I>(n: usize) -> Parser<'a, ()> { + Parser::new(move |input: &'a [u8], start: usize| { + let mut byte_pos = start; + for _ in 0..n { + let (ch, size) = decode_utf8(&input[start..]); + if ch.is_none() { + return no_utf8(byte_pos, size); + } + byte_pos += size; + } + Ok(((), byte_pos)) + }) +} + +/// Read n bytes exactly. +pub fn take_bytes<'a>(n: usize) -> Parser<'a, &'a str> { + Parser::new(move |input: &'a [u8], start: usize| { + // FIXME: This runs in linear time because it checks each character. + // If we could remember which inputs were passed in from parse_str() instead of parse(), + // we could assume the characters are valid utf8 and run this in constant time by only checking + // the final character using bstr::decode_last_utf8. + let mut byte_pos = start; + loop { + let (ch, size) = decode_utf8(&input[start..]); + if ch.is_none() { + return no_utf8(byte_pos, size); + } + byte_pos += size; + if byte_pos > n { + return Err(Error::Mismatch { + message: "range splits a UTF-8 character".to_owned(), + position: start, + }); + } + if byte_pos == n { + let result = &input[start..byte_pos]; + // UNSAFE: Because every char has been checked by decode_utf8, this string is known utf8 + let result_str = unsafe { str::from_utf8_unchecked(result) }; + return Ok((result_str, byte_pos)); + } + } + }) +} + +/// Skip n bytes exactly. +pub fn skip_bytes<'a>(n: usize) -> Parser<'a, ()> { + Parser::new(move |input: &'a [u8], start: usize| { + // FIXME: See note on take_bytes. + let mut byte_pos = start; + loop { + let (ch, size) = decode_utf8(&input[start..]); + if ch.is_none() { + return no_utf8(byte_pos, size); + } + byte_pos += size; + if byte_pos > n { + return Err(Error::Mismatch { + message: "range splits a UTF-8 character".to_owned(), + position: start, + }); + } + if byte_pos == n { + return Ok(((), byte_pos)); + } + } + }) +} + +/// Chain two parsers where the second parser depends on the first's result. +impl<'a, O: 'a, U: 'a, F: Fn(O) -> Parser<'a, U> + 'a> Shr for Parser<'a, O> { + type Output = Parser<'a, U>; + + fn shr(self, other: F) -> Self::Output { + Parser::new(move |input: &'a [u8], start: usize| { + (self.0.method)(input, start).and_then(|(out, pos)| (other(out).0.method)(input, pos)) + }) + } +} + +// Note: There are no "degrade to parser::Parser" implementations for >> +// because Rust cannot tell the difference between an FN(O)->U and an FN(O)->V. + +// Remaining functions in file only delegate to base parser::Parser + +/// Always succeeds, consume no input. +pub fn empty<'a>() -> Parser<'a, ()> { + Parser(parser::empty()) +} + +/// Parse separated list. +pub fn list<'a, I, O, U>(item: Parser<'a, O>, separator: Parser<'a, U>) -> Parser<'a, Vec> +where + O: 'a, + U: 'a, +{ + Parser(parser::list(item.0, separator.0)) +} + +/// Call a parser factory, can be used to create recursive parsers. +pub fn call<'a, O, F>(parser_factory: F) -> Parser<'a, O> +where + O: 'a, + F: Fn() -> Parser<'a, O> + 'a, +{ + Parser(parser::call(move || parser_factory().0)) +} + +/// Success when end of input is reached. +pub fn end<'a, I>() -> Parser<'a, ()> { + Parser(parser::end()) +} + +// And, Sub and Mul are similar enough we can implement them with macros + +macro_rules! utf_op { + ( $impl_name:ident, $fn_name:ident, $op:tt, $return_type:ty, $doc:expr ) => { + #[doc=$doc] + impl<'a, Left: 'a, Right: 'a> $impl_name> for Parser<'a, Left> { + type Output = Parser<'a, $return_type>; + + fn $fn_name (self, other: Parser<'a, Right>) -> Self::Output { + Parser(self.0 $op other.0) + } + } + }; +} + +macro_rules! utf_u8_op { + ( $impl_name:ident, $fn_name:ident, $op:tt, $return_type:ty, $doc:expr ) => { + #[doc=concat!($doc, " (but degrade to non-utf8 parser)")] + impl<'a, Left: 'a, Right: 'a> $impl_name> for Parser<'a, Left> { + type Output = parser::Parser<'a, u8, $return_type>; + + fn $fn_name (self, other: parser::Parser<'a, u8, Right>) -> Self::Output { + self.0 $op other + } + } + }; +} + +macro_rules! u8_utf_op { + ( $impl_name:ident, $fn_name:ident, $op:tt, $return_type:ty, $doc:expr ) => { + #[doc=concat!($doc, " (but degrade to non-utf8 parser)")] + impl<'a, Left: 'a, Right: 'a> $impl_name> for parser::Parser<'a, u8, Left> { + type Output = parser::Parser<'a, u8, $return_type>; + + fn $fn_name (self, other: Parser<'a, Right>) -> Self::Output { + self $op other.0 + } + } + }; +} + +macro_rules! all_op { + ( $impl_name:ident, $fn_name:ident, $op:tt, $return_type:ty, $doc:expr ) => { + utf_op!($impl_name, $fn_name, $op, $return_type, $doc); + utf_u8_op!($impl_name, $fn_name, $op, $return_type, $doc); + u8_utf_op!($impl_name, $fn_name, $op, $return_type, $doc); + }; +} + +all_op!(Add, add, +, (Left, Right), "Sequence reserve value"); + +all_op!(Sub, sub, -, Left, "Sequence discard second value"); + +all_op!(Mul, mul, *, Right, "Sequence discard first value"); + +/// Ordered choice +impl<'a, O: 'a> BitOr for Parser<'a, O> { + type Output = Self; + + fn bitor(self, other: Self) -> Self { + Self(self.0 | other.0) + } +} + +/// Ordered choice (but degrade to non-utf8 parser) +impl<'a, O: 'a> BitOr> for Parser<'a, O> { + type Output = parser::Parser<'a, u8, O>; + + fn bitor(self, other: parser::Parser<'a, u8, O>) -> Self::Output { + self.0 | other + } +} + +/// Ordered choice (but degrade to non-utf8 parser) +impl<'a, O: 'a> BitOr> for parser::Parser<'a, u8, O> { + type Output = parser::Parser<'a, u8, O>; + + fn bitor(self, other: Parser<'a, O>) -> Self::Output { + self | other.0 + } +} + +/// And predicate +impl<'a, O: 'a> Neg for Parser<'a, O> { + type Output = Parser<'a, bool>; + + fn neg(self) -> Self::Output { + Parser(-self.0) + } +} + +/// Not predicate +impl<'a, O: 'a> Not for Parser<'a, O> { + type Output = Parser<'a, bool>; + + fn not(self) -> Self::Output { + Parser(!self.0) + } +}