From 7efec98b0d422a9d511254972010a1b980e90cd3 Mon Sep 17 00:00:00 2001 From: mcclure Date: Tue, 3 Jan 2023 07:57:00 -0500 Subject: [PATCH] "utf8" module supporting matching UTF-8/returning &str (#59) * Create a utf8::Parser newtype so that collect() can safely return a str * Replace utf8 parser.as_bytes() with parser.into() * Example program mixing u8 parser and UTF-8 parser * utf8 sym tag / fix any tag return type * utf8 one_of/none_of * is_a/not_a * Remaining parser constructors for utf8 * More operator overloads: And, Sub, degrade versions for BitOr and Mul * Doc comments on utf8 macro expansions, utf8 Shr * Doc comments on utf8 macro expansions, rest of parser functions in utf8 * parse_str convenience calls .as_bytes for you * Feature for utf8, enabled by default * utf::take_bytes() and utf::skip_bytes() --- Cargo.lock | 46 ++++ Cargo.toml | 7 + examples/utf8.rs | 68 +++++ examples/utf8_mixed.rs | 35 +++ src/lib.rs | 5 + src/utf8.rs | 560 +++++++++++++++++++++++++++++++++++++++++ 6 files changed, 721 insertions(+) create mode 100644 Cargo.lock create mode 100644 examples/utf8.rs create mode 100644 examples/utf8_mixed.rs create mode 100644 src/utf8.rs diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..b722626 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,46 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+version = 3 + +[[package]] +name = "bstr" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b45ea9b00a7b3f2988e9a65ad3917e62123c38dba709b666506207be96d1790b" +dependencies = [ + "memchr", + "once_cell", + "regex-automata", + "serde", +] + +[[package]] +name = "memchr" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" + +[[package]] +name = "once_cell" +version = "1.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86f0b0d4bf799edbc74508c1e8bf170ff5f41238e5f8225603ca7caaae2b7860" + +[[package]] +name = "pom" +version = "3.2.0" +dependencies = [ + "bstr", +] + +[[package]] +name = "regex-automata" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" + +[[package]] +name = "serde" +version = "1.0.152" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb7d1f0d3021d347a83e556fc4683dea2ea09d87bccdf88ff5c12545d89d5efb" diff --git a/Cargo.toml b/Cargo.toml index 1bf55ef..fbd61b6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,3 +14,10 @@ keywords = ["parser", "parser-combinators", "parsing", "PEG"] [badges] travis-ci = { repository = "J-F-Liu/pom" } + +[dependencies] +bstr = {version = "1.1.0", features = []} # Only uses one function, so no features needed. 
+
+[features]
+default = ["utf8"]
+utf8 = []
diff --git a/examples/utf8.rs b/examples/utf8.rs
new file mode 100644
index 0000000..f781c87
--- /dev/null
+++ b/examples/utf8.rs
@@ -0,0 +1,68 @@
+// Example shows basic UTF-8 combinators: each non-blank input line is parsed into a typed `DataLine`
+
+use pom::utf8::*;
+
+fn main() {
+    // Informal, Spanish-language movie database format (records are separated by blank lines)
+    let input = "\
+Título: Abre los ojos
+Año: 1997
+Director: Alejandro Amenábar
+
+Título: Amores Perros
+Director: Alejandro González Iñárritu
+Año: 2000
+
+Título: La montaña sagrada
+Año: 1973
+Director: Alejandro Jodorowsky
+";
+
+    enum DataLine<'a> {
+        Title(&'a str),
+        Director(&'a str),
+        Year(i32),
+    }
+
+    fn positive<'a>() -> Parser<'a, i32> {
+//      let integer = (one_of("123456789") - one_of("0123456789").repeat(0..)) | sym(b'0'); // TODO
+        let digit = one_of("0123456789");
+        let integer = digit.discard().repeat(1..);
+        integer.collect().convert(|x|x.parse::<i32>())
+    }
+
+    fn rest_str<'a>() -> Parser<'a, &'a str> {
+        any().repeat(1..).collect()
+    }
+
+    fn separator<'a>() ->Parser<'a, ()> {
+        seq(": ").discard()
+    }
+
+    let parser =
+        (seq("Título") * separator() * rest_str().map(|s| DataLine::Title(s)))
+        | (seq("Director") * separator() * rest_str().map(|s| DataLine::Director(s)))
+        | (seq("Año") * separator() * positive().map(|i| DataLine::Year(i)));
+
+    {
+        let mut title_opt:Option<&str> = None;
+        let mut year_opt:Option<i32> = None;
+        let mut director_opt:Option<&str> = None;
+
+        for line in input.lines() {
+            if !line.is_empty() { // Skip blank lines without parsing
+                // Parse line (the example input is known-good, so a parse failure simply panics)
+                match parser.parse_str(line).unwrap() {
+                    DataLine::Title(s) => title_opt = Some(s),
+                    DataLine::Director(s) => director_opt = Some(s),
+                    DataLine::Year(s) => year_opt = Some(s),
+                }
+                // When all three line types have been collected, print them and start the next record
+                if let (Some(title), Some(year), Some(director)) = (title_opt,year_opt,director_opt) {
+                    println!("Title: {}\nDirector: {}\nYear: {}\n", title, director, year);
+                    (title_opt, year_opt, director_opt) = (None,None,None);
+                }
+            }
+        }
+    }
+}
diff --git a/examples/utf8_mixed.rs b/examples/utf8_mixed.rs
new file mode 100644
index 0000000..b1c86fc
--- /dev/null
+++ b/examples/utf8_mixed.rs
@@ -0,0 +1,35 @@
+// Example shows UTF-8 combinators intermixed with binary combinators
+
+use pom::parser::*;
+use pom::utf8;
+
+fn main() {
+    // A parser for MsgPack (but only messages encoding a string)
+    let testcases: [Vec<u8>; 6] = [
+        vec![0b10100100, 0b11110000, 0b10011111, 0b10100100, 0b10010100], // 🤔, max-size 31 format
+        vec![0xd9, 4, 0b11110000, 0b10011111, 0b10011000, 0b10101110], // 😮, max-size 255 format
+        vec![0xda, 0, 4, 0b11110000, 0b10011111, 0b10100100, 0b10101111], // 🤯, max-size 2^16-1 format
+        vec![0xdb, 0, 0, 0, 4, 0b11110000, 0b10011111, 0b10010010, 0b10100101], // 💥, max-size 2^32-1 format
+        vec![0xc4, 4, 0b11110000, 0b10011111, 0b10011000, 0b10101110], // Valid MsgPack, but not a string (binary)
+        vec![0b10100100, 0b10010100, 0b10100100, 0b10011111, 0b11110000], // A MsgPack string, but invalid UTF-8
+    ];
+
+    const MASK:u8 = 0b11100000; // size 31 format is denoted by 3 high bits == 101
+    const SIZE_31:u8 = 0b10100000;
+
+    fn rest_as_str<'a>() -> utf8::Parser<'a, &'a str> {
+        utf8::any().repeat(0..).collect()
+    }
+
+    // Demo parser does not verify that the claimed length matches the actual length (but checking so is simple with >>)
+    let parser =
+        ( (sym(0xdb) * any().repeat(4) * rest_as_str()) // 2^32-1 format
+        | (sym(0xda) * any().repeat(2) * rest_as_str()) // 2^16-1 format
+        | (sym(0xd9) * any() * rest_as_str()) // 255 format
+        | (is_a(|x| x&MASK == SIZE_31) * rest_as_str()) ) // 31 format
+        - end(); // `-` binds tighter than `|`, so the alternatives are grouped to make end() apply to all of them
+
+    for testcase in testcases.iter() {
+        println!("{:?}", parser.parse(testcase));
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index 3305898..395a619 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -8,6 +8,11 @@ pub mod parser;
 /// Utility functions to recognize char class of byte value.
pub mod char_class; +/// Variants of parser functions specialized for matching UTF-8 strings and returning chars. +/// Method and constructor names/functionality are generally the same as in base parser module. +#[cfg(feature = "utf8")] +pub mod utf8; + pub use crate::result::{Error, Result}; /// Parser type, `Parser` is alias of `parser::Parser<'static, I, O>`. diff --git a/src/utf8.rs b/src/utf8.rs new file mode 100644 index 0000000..c6fbb81 --- /dev/null +++ b/src/utf8.rs @@ -0,0 +1,560 @@ +// Variants of parser functions specialized for matching UTF-8 strings and returning chars + +use super::{Error, Result}; +use super::parser; +use crate::range::RangeArgument; +use crate::set::Set; +use std::str; +use std::fmt::Debug; +use bstr::decode_utf8; +use std::ops::{Add, BitOr, Mul, Neg, Not, Shr, Sub}; + +// / Parser combinator. +//type Parse<'a, O> = dyn Fn(&'a [u8], usize) -> Result<(O, usize)> + 'a; + +/// Being wrapped in this struct guarantees that the parser within will only match valid UTF-8 strings. +pub struct Parser<'a, O> (parser::Parser<'a, u8, O>); + +impl<'a, O> Parser<'a, O> { + /// Create new parser. + pub fn new

(parse: P) -> Parser<'a, O> + where + P: Fn(&'a [u8], usize) -> Result<(O, usize)> + 'a, + { + Parser( parser::Parser::new(parse) ) + } + + /// Collect all matched input symbols. + // This method is the primary reason utf8::Parser exists at all. + pub fn collect(self) -> Parser<'a, &'a str> + where + O: 'a, + { + Parser( self.0.collect().map( + // UNSAFE: Because we only could have constructed this object from other utf8::Parser objects, the match space must be valid UTF-8 + |s| unsafe { str::from_utf8_unchecked(s) } + ) ) + } + + // Remaining methods in impl only delegate to base parser::Parser + + /// Apply the parser to parse input. + pub fn parse(&self, input: &'a [u8]) -> Result { + self.0.parse(input) + } + + /// Parse input at specified byte position. + pub fn parse_at(&self, input: &'a [u8], start: usize) -> Result<(O, usize)> { + self.0.parse_at(input, start) + } + + /// Apply the parser to parse input. + pub fn parse_str(&self, input: &'a str) -> Result { + self.0.parse(input.as_bytes()) + } + + /// Convert parser result to desired value. + pub fn map(self, f: F) -> Parser<'a, U> + where + F: Fn(O) -> U + 'a, + O: 'a, + U: 'a, + { + Parser( self.0.map(f) ) + } + + /// Convert parser result to desired value, fail in case of conversion error. + pub fn convert(self, f: F) -> Parser<'a, U> + where + F: Fn(O) -> ::std::result::Result + 'a, + E: Debug, + O: 'a, + U: 'a, + { + Parser( self.0.convert(f) ) + } + + /// Cache parser output result to speed up backtracking. + pub fn cache(self) -> Parser<'a, O> + where + O: Clone + 'a, + { + Parser( self.0.cache() ) + } + + /// Get input position after matching parser. + pub fn pos(self) -> Parser<'a, usize> + where + O: 'a, + { + Parser( self.0.pos() ) + } + + /// Discard parser output. + pub fn discard(self) -> Parser<'a, ()> + where + O: 'a, + { + Parser( self.0.discard() ) + } + + /// Make parser optional. 
+ pub fn opt(self) -> Parser<'a, Option> + where + O: 'a, + { + Parser( self.0.opt() ) + } + + /// `p.repeat(5)` repeat p exactly 5 times + /// `p.repeat(0..)` repeat p zero or more times + /// `p.repeat(1..)` repeat p one or more times + /// `p.repeat(1..4)` match p at least 1 and at most 3 times + pub fn repeat(self, range: R) -> Parser<'a, Vec> + where + R: RangeArgument + Debug + 'a, + O: 'a, + { + Parser( self.0.repeat(range) ) + } + + /// Give parser a name to identify parsing errors. + pub fn name(self, name: &'a str) -> Parser<'a, O> + where + O: 'a, + { + Parser( self.0.name(name) ) + } + + /// Mark parser as expected, abort early when failed in ordered choice. + pub fn expect(self, name: &'a str) -> Parser<'a, O> + where + O: 'a, + { + Parser( self.0.expect(name) ) + } +} + +impl<'a, O> From> for parser::Parser<'a, u8, O> { + fn from(parser: Parser<'a, O>) -> Self { + parser.0 // Simply unwrap + } +} + +// Helper for functions that decode_utf8 and fail +fn no_utf8(start: usize, size: usize) -> Result { + if size == 0 { + Err(Error::Mismatch { + message: "end of input reached".to_owned(), + position: start, + }) + } else { + Err(Error::Mismatch { + message: "not UTF-8".to_owned(), + position: start, + }) + } +} + +/// Match any UTF-8 character. +pub fn any<'a>() -> Parser<'a, char> +{ + Parser::new(|input: &[u8], start: usize| { + let (ch, size) = decode_utf8(&input[start..]); + + if let Some(ch) = ch { + let pos = start+size; + + Ok((ch, pos)) + } else { + no_utf8(start, size) + } + }) +} + +/// Match specific UTF-8 character. 
+pub fn sym<'a>(tag: char) -> Parser<'a, char>
+{
+    Parser::new(move |input: &[u8], start: usize| {
+        let (ch, size) = decode_utf8(&input[start..]);
+
+        if let Some(ch) = ch {
+            if ch == tag {
+                let pos = start+size;
+
+                Ok((ch, pos))
+            } else {
+                Err(Error::Mismatch {
+                    message: format!("expect: {}, found: {}", tag, ch),
+                    position: start,
+                })
+            }
+        } else {
+            no_utf8(start, size)
+        }
+    })
+}
+
+/// Success when sequence of chars matches current input; returns the matched span of the input as `&str`.
+pub fn seq<'a, 'b: 'a>(tag_str: &'b str) -> Parser<'a, &'a str>
+{
+    let tag = tag_str.as_bytes();
+    Parser::new(move |input: &'a [u8], start: usize| {
+        let mut index = 0;
+        loop {
+            let pos = start + index;
+            if index == tag.len() {
+                let result = &input[start..pos];
+                // SAFETY: Because slice is byte-identical to a str, it is known valid UTF-8
+                let result_str = unsafe { str::from_utf8_unchecked(result) };
+                return Ok((result_str, pos));
+            }
+            if let Some(s) = input.get(pos) {
+                if tag[index] != *s {
+                    return Err(Error::Mismatch {
+                        message: format!("seq {:?} at byte index: {}", tag_str, pos),
+                        position: pos,
+                    });
+                }
+            } else {
+                return Err(Error::Incomplete);
+            }
+            index += 1;
+        }
+    })
+}
+
+/// Success when current input symbol is one of the set; returns the matched char.
+pub fn one_of<'a, S>(set: &'a S) -> Parser<'a, char>
+where
+    S: Set<char> + ?Sized,
+{
+    Parser::new(move |input: &'a [u8], start: usize| {
+        let (ch, size) = decode_utf8(&input[start..]);
+
+        if let Some(ch) = ch {
+            if set.contains(&ch) {
+                let pos = start+size;
+
+                Ok((ch, pos))
+            } else {
+                Err(Error::Mismatch {
+                    message: format!("expect one of: {}, found: {}", set.to_str(), ch),
+                    position: start,
+                })
+            }
+        } else {
+            no_utf8(start, size)
+        }
+    })
+}
+
+/// Success when current input symbol is none of the set.
+pub fn none_of<'a, S>(set: &'a S) -> Parser<'a, char>
+where
+    S: Set<char> + ?Sized,
+{
+    Parser::new(move |input: &'a [u8], start: usize| {
+        let (ch, size) = decode_utf8(&input[start..]);
+
+        if let Some(ch) = ch {
+            if !set.contains(&ch) {
+                let pos = start+size;
+
+                Ok((ch, pos))
+            } else {
+                Err(Error::Mismatch {
+                    message: format!("expect none of: {}, found: {}", set.to_str(), ch),
+                    position: start,
+                })
+            }
+        } else {
+            no_utf8(start, size)
+        }
+    })
+}
+
+/// Success when predicate returns true on current input symbol; returns the matched char.
+pub fn is_a<'a, F>(predicate: F) -> Parser<'a, char>
+where
+    F: Fn(char) -> bool + 'a,
+{
+    Parser::new(move |input: &'a [u8], start: usize| {
+        let (ch, size) = decode_utf8(&input[start..]);
+
+        if let Some(ch) = ch {
+            if predicate(ch) {
+                let pos = start+size;
+
+                Ok((ch, pos))
+            } else {
+                Err(Error::Mismatch {
+                    message: format!("is_a predicate failed on: {}", ch),
+                    position: start,
+                })
+            }
+        } else {
+            no_utf8(start, size)
+        }
+    })
+}
+
+/// Success when predicate returns false on current input symbol; returns the matched char.
+pub fn not_a<'a, F>(predicate: F) -> Parser<'a, char>
+where
+    F: Fn(char) -> bool + 'a,
+{
+    Parser::new(move |input: &'a [u8], start: usize| {
+        let (ch, size) = decode_utf8(&input[start..]);
+
+        if let Some(ch) = ch {
+            if !predicate(ch) {
+                let pos = start+size;
+
+                Ok((ch, pos))
+            } else {
+                Err(Error::Mismatch {
+                    message: format!("not_a predicate failed on: {}", ch),
+                    position: start,
+                })
+            }
+        } else {
+            no_utf8(start, size)
+        }
+    })
+}
+
+/// Read n chars.
+pub fn take<'a>(n: usize) -> Parser<'a, &'a str> {
+    Parser::new(move |input: &'a [u8], start: usize| {
+        let mut byte_pos = start;
+        for _ in 0..n {
+            let (ch, size) = decode_utf8(&input[byte_pos..]); // decode at the running position, not at `start`
+            if ch.is_none() {
+                return no_utf8(byte_pos, size)
+            }
+            byte_pos += size;
+        }
+        let result = &input[start..byte_pos];
+        // SAFETY: Because every char in start..byte_pos has been checked by decode_utf8, this string is known utf8
+        let result_str = unsafe { str::from_utf8_unchecked(result) };
+        Ok((result_str, byte_pos))
+    })
+}
+
+/// Skip n symbols (chars, not bytes). NOTE(review): type parameter `I` is unused and cannot be inferred, so callers have to name it explicitly.
+pub fn skip<'a, I>(n: usize) -> Parser<'a, ()> {
+    Parser::new(move |input: &'a [u8], start: usize| {
+        let mut byte_pos = start;
+        for _ in 0..n {
+            let (ch, size) = decode_utf8(&input[byte_pos..]); // decode at the running position, not at `start`
+            if ch.is_none() {
+                return no_utf8(byte_pos, size)
+            }
+            byte_pos += size;
+        }
+        Ok(((), byte_pos))
+    })
+}
+
+/// Read n bytes exactly; fails if the n-byte range is not valid UTF-8 or ends in the middle of a character.
+pub fn take_bytes<'a>(n: usize) -> Parser<'a, &'a str> {
+    Parser::new(move |input: &'a [u8], start: usize| {
+        // FIXME: This runs in linear time because it checks each character.
+        // If we could remember which inputs were passed in from parse_str() instead of parse(),
+        // we could assume the characters are valid utf8 and run this in constant time by only checking
+        // the final character using bstr::decode_last_utf8.
+        let mut byte_pos = start;
+        loop {
+            if byte_pos - start == n {
+                let result = &input[start..byte_pos];
+                // SAFETY: Because every char in start..byte_pos has been checked by decode_utf8, this string is known utf8
+                let result_str = unsafe { str::from_utf8_unchecked(result) };
+                return Ok((result_str, byte_pos))
+            }
+            if byte_pos - start > n {
+                return Err(Error::Mismatch {
+                    message: "range splits a UTF-8 character".to_owned(),
+                    position: start,
+                })
+            }
+            let (ch, size) = decode_utf8(&input[byte_pos..]);
+            if ch.is_none() {
+                return no_utf8(byte_pos, size)
+            }
+            byte_pos += size;
+        }
+    })
+}
+
+/// Skip n bytes exactly.
+pub fn skip_bytes<'a>(n: usize) -> Parser<'a, ()> {
+    Parser::new(move |input: &'a [u8], start: usize| {
+        // FIXME: See note on take_bytes.
+        let mut byte_pos = start;
+        loop {
+            if byte_pos - start == n {
+                return Ok(((), byte_pos))
+            }
+            if byte_pos - start > n {
+                return Err(Error::Mismatch {
+                    message: "range splits a UTF-8 character".to_owned(),
+                    position: start,
+                })
+            }
+            let (ch, size) = decode_utf8(&input[byte_pos..]);
+            if ch.is_none() {
+                return no_utf8(byte_pos, size)
+            }
+            byte_pos += size;
+        }
+    })
+}
+
+/// Chain two parsers where the second parser depends on the first's result.
+impl<'a, O: 'a, U: 'a, F: Fn(O) -> Parser<'a, U> + 'a> Shr<F> for Parser<'a, O> {
+    type Output = Parser<'a, U>;
+
+    fn shr(self, other: F) -> Self::Output {
+        Parser::new(move |input: &'a [u8], start: usize| {
+            (self.0.method)(input, start).and_then(|(out, pos)| (other(out).0.method)(input, pos))
+        })
+    }
+}
+
+// Note: There are no "degrade to parser::Parser" implementations for >>
+// because Rust cannot tell the difference between an FN(O)->U and an FN(O)->V.
+
+// Remaining functions in file only delegate to base parser::Parser
+
+/// Always succeeds, consume no input.
+pub fn empty<'a>() -> Parser<'a, ()> {
+    Parser( parser::empty() )
+}
+
+/// Parse separated list. NOTE(review): type parameter `I` is unused and cannot be inferred, so callers have to name it explicitly.
+pub fn list<'a, I, O, U>(
+    item: Parser<'a, O>,
+    separator: Parser<'a, U>,
+) -> Parser<'a, Vec<O>>
+where
+    O: 'a,
+    U: 'a,
+{
+    Parser( parser::list(item.0, separator.0) )
+}
+
+/// Call a parser factory, can be used to create recursive parsers.
+pub fn call<'a, O, F>(parser_factory: F) -> Parser<'a, O>
+where
+    O: 'a,
+    F: Fn() -> Parser<'a, O> + 'a,
+{
+    Parser( parser::call(move || parser_factory().0) )
+}
+
+/// Success when end of input is reached. NOTE(review): type parameter `I` is unused and cannot be inferred, so callers have to name it explicitly.
+pub fn end<'a, I>() -> Parser<'a, ()>
+{
+    Parser( parser::end() )
+}
+
+// Add, Sub and Mul are similar enough we can implement them with macros
+
+macro_rules! utf_op {
+    ( $impl_name:ident, $fn_name:ident, $op:tt, $return_type:ty, $doc:expr ) => {
+        #[doc=$doc]
+        impl<'a, Left: 'a, Right: 'a> $impl_name<Parser<'a, Right>> for Parser<'a, Left> {
+            type Output = Parser<'a, $return_type>;
+
+            fn $fn_name (self, other: Parser<'a, Right>) -> Self::Output {
+                Parser(self.0 $op other.0)
+            }
+        }
+    };
+}
+
+macro_rules! utf_u8_op {
+    ( $impl_name:ident, $fn_name:ident, $op:tt, $return_type:ty, $doc:expr ) => {
+        #[doc=concat!($doc, " (but degrade to non-utf8 parser)")]
+        impl<'a, Left: 'a, Right: 'a> $impl_name<parser::Parser<'a, u8, Right>> for Parser<'a, Left> {
+            type Output = parser::Parser<'a, u8, $return_type>;
+
+            fn $fn_name (self, other: parser::Parser<'a, u8, Right>) -> Self::Output {
+                self.0 $op other
+            }
+        }
+    };
+}
+
+macro_rules! u8_utf_op {
+    ( $impl_name:ident, $fn_name:ident, $op:tt, $return_type:ty, $doc:expr ) => {
+        #[doc=concat!($doc, " (but degrade to non-utf8 parser)")]
+        impl<'a, Left: 'a, Right: 'a> $impl_name<Parser<'a, Right>> for parser::Parser<'a, u8, Left> {
+            type Output = parser::Parser<'a, u8, $return_type>;
+
+            fn $fn_name (self, other: Parser<'a, Right>) -> Self::Output {
+                self $op other.0
+            }
+        }
+    };
+}
+
+macro_rules! all_op {
+    ( $impl_name:ident, $fn_name:ident, $op:tt, $return_type:ty, $doc:expr ) => {
+        utf_op!($impl_name, $fn_name, $op, $return_type, $doc);
+        utf_u8_op!($impl_name, $fn_name, $op, $return_type, $doc);
+        u8_utf_op!($impl_name, $fn_name, $op, $return_type, $doc);
+    }
+}
+
+all_op!(Add, add, +, (Left, Right), "Sequence reserve value");
+
+all_op!(Sub, sub, -, Left, "Sequence discard second value");
+
+all_op!(Mul, mul, *, Right, "Sequence discard first value");
+
+/// Ordered choice
+impl<'a, O: 'a> BitOr for Parser<'a, O> {
+    type Output = Parser<'a, O>;
+
+    fn bitor(self, other: Parser<'a, O>) -> Self::Output {
+        Parser(self.0 | other.0)
+    }
+}
+
+/// Ordered choice (but degrade to non-utf8 parser)
+impl<'a, O: 'a> BitOr<parser::Parser<'a, u8, O>> for Parser<'a, O> {
+    type Output = parser::Parser<'a, u8, O>;
+
+    fn bitor(self, other: parser::Parser<'a, u8, O>) -> Self::Output {
+        self.0 | other
+    }
+}
+
+/// Ordered choice (but degrade to non-utf8 parser)
+impl<'a, O: 'a> BitOr<Parser<'a, O>> for parser::Parser<'a, u8, O> {
+    type Output = parser::Parser<'a, u8, O>;
+
+    fn bitor(self, other: Parser<'a, O>) -> Self::Output {
+        self | other.0
+    }
+}
+
+/// And predicate
+impl<'a, O: 'a> Neg for Parser<'a, O> {
+    type Output = Parser<'a, bool>;
+
+    fn neg(self) -> Self::Output {
+        Parser( -self.0 )
+    }
+}
+
+/// Not predicate
+impl<'a, O: 'a> Not for Parser<'a, O> {
+    type Output = Parser<'a, bool>;
+
+    fn not(self) -> Self::Output {
+        Parser( !self.0 )
+    }
+}