"utf8" module supporting matching UTF-8/returning &str (#59)

* Create a utf8::Parser newtype so that collect() can safely return a str
* Replace utf8 parser.as_bytes() with parser.into<parser::Parser>()
* Example program mixing u8 parser and UTF-8 parser
* utf8 sym tag / fix any tag return type
* utf8 one_of/none_of
* is_a/not_a
* Remaining parser constructors for utf8
* More operator overloads: And, Sub, degrade versions for BitOr and Mul
* Doc comments on utf8 macro expansions, utf8 Shr
* Doc comments on utf8 macro expansions, rest of parser functions in utf8
* parse_str convenience calls .as_bytes for you
* Feature for utf8, enabled by default
* utf8::take_bytes() and utf8::skip_bytes()
J-F-Liu · Jan 3, 2023 · 7efec98 · 7efec98
1 parent 9f4a265
commit 7efec98
Show file tree

Hide file tree

Showing 6 changed files with 721 additions and 0 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -14,3 +14,10 @@ keywords = ["parser", "parser-combinators", "parsing", "PEG"]
 
 [badges]
 travis-ci = { repository = "J-F-Liu/pom" }
+
+[dependencies] 
+bstr = {version = "1.1.0", features = []} # Only uses one function, so no features needed.
+
+[features]
+default = ["utf8"]
+utf8 = []
diff --git a/examples/utf8.rs b/examples/utf8.rs
@@ -0,0 +1,68 @@
+// Example shows basic UTF-8 combinators
+
+use pom::utf8::*;
+
+fn main() {
+	// Movie database in an informal, Spanish-language "Key: value" line format
+	let input = "\
+Título: Abre los ojos
+Año: 1997
+Director: Alejandro Amenábar
+
+Título: Amores Perros
+Director: Alejandro González Iñárritu
+Año: 2000
+
+Título: La montaña sagrada
+Año: 1973
+Director: Alejandro Jodorowsky
+";
+
+	// The value carried by one non-blank input line
+	enum DataLine<'a> {
+		Title(&'a str),
+		Director(&'a str),
+		Year(i32),
+	}
+
+	// Matches one or more decimal digits and converts the matched text to an i32
+	fn positive<'a>() -> Parser<'a, i32> {
+//		let integer = (one_of("123456789") - one_of("0123456789").repeat(0..)) | sym(b'0'); // TODO
+		one_of("0123456789")
+			.discard()
+			.repeat(1..)
+			.collect()
+			.convert(|digits| digits.parse::<i32>())
+	}
+
+	// Matches one or more characters and returns them as a single string slice
+	fn rest_str<'a>() -> Parser<'a, &'a str> {
+		any().repeat(1..).collect()
+	}
+
+	// Matches the ": " between a key and its value, producing nothing
+	fn separator<'a>() ->Parser<'a, ()> {
+		seq(": ").discard()
+	}
+
+	// One sub-parser per recognized key; a line must match one of them
+	let title_line    = seq("Título")   * separator() * rest_str().map(|text| DataLine::Title(text));
+	let director_line = seq("Director") * separator() * rest_str().map(|text| DataLine::Director(text));
+	let year_line     = seq("Año")      * separator() * positive().map(|number| DataLine::Year(number));
+	let parser = title_line | director_line | year_line;
+
+	// Most recently seen value of each field for the record being assembled
+	let mut title_opt:Option<&str> = None;
+	let mut year_opt:Option<i32> = None;
+	let mut director_opt:Option<&str> = None;
+
+	// Blank lines are skipped without being handed to the parser
+	for line in input.lines().filter(|line| !line.is_empty()) {
+		// Parse the line and remember its value
+		match parser.parse_str(line).unwrap() {
+			DataLine::Title(text) =>    title_opt = Some(text),
+			DataLine::Director(text) => director_opt = Some(text),
+			DataLine::Year(number) =>   year_opt = Some(number),
+		}
+		// Once all three fields are present, print the record and start over
+		if let (Some(title), Some(year), Some(director)) = (title_opt,year_opt,director_opt) {
+			println!("Title: {}\nDirector: {}\nYear: {}\n", title, director, year);
+			title_opt = None;
+			year_opt = None;
+			director_opt = None;
+		}
+	}
+}
diff --git a/examples/utf8_mixed.rs b/examples/utf8_mixed.rs
@@ -0,0 +1,35 @@
+// Example shows UTF-8 combinators intermixed with binary combinators
+
+use pom::parser::*;
+use pom::utf8;
+
+fn main() {
+	// A parser for MsgPack (but only messages encoding a string)
+	let testcases: [Vec<u8>; 6] = [
+		 vec![0b10100100,       0b11110000, 0b10011111, 0b10100100, 0b10010100], // 🤔, max-size 31 format
+		 vec![0xd9, 4,          0b11110000, 0b10011111, 0b10011000, 0b10101110], // 😮, max-size 255 format
+		 vec![0xda, 0, 4,       0b11110000, 0b10011111, 0b10100100, 0b10101111], // 🤯, max-size 2^16-1 format
+		 vec![0xdb, 0, 0, 0, 4, 0b11110000, 0b10011111, 0b10010010, 0b10100101], // 💥, max-size 2^32-1 format
+		 vec![0xc4, 4,          0b11110000, 0b10011111, 0b10011000, 0b10101110], // Valid MsgPack, but not a string (binary)
+		 vec![0b10100100,       0b10010100, 0b10100100, 0b10011111, 0b11110000], // A MsgPack string, but invalid UTF-8
+	];
+
+	const MASK:u8    = 0b11100000; // size 31 format is denoted by 3 high bits == 101
+	const SIZE_31:u8 = 0b10100000;
+
+	// Collects zero or more UTF-8 characters from the current position into a &str
+	fn rest_as_str<'a>() -> utf8::Parser<'a, &'a str> {
+		utf8::any().repeat(0..).collect()
+	}
+
+	// Demo parser does not verify that the claimed length matches the actual length (but checking so is simple with >>)
+	// The alternation is parenthesized as a whole because `-` binds tighter than `|` in Rust:
+	// without the outer parentheses, `end()` would only be required after the 31 format
+	// alternative, and the other formats could succeed with unparsed bytes left over.
+	let parser =
+		( (sym(0xdb) * any().repeat(4) * rest_as_str()) // 2^32-1 format
+		| (sym(0xda) * any().repeat(2) * rest_as_str()) // 2^16-1 format
+		| (sym(0xd9) * any()           * rest_as_str()) // 255 format
+		| (is_a(|x| x&MASK == SIZE_31) * rest_as_str()) // 31 format
+		) - end();
+
+	// Print the parse result (or error) for every test message
+	for testcase in testcases.iter() {
+		println!("{:?}", parser.parse(testcase));
+	}
+}
diff --git a/src/lib.rs b/src/lib.rs
@@ -8,6 +8,11 @@ pub mod parser;
 /// Utility functions to recognize char class of byte value.
 pub mod char_class;
 
+/// Variants of parser functions specialized for matching UTF-8 strings and returning chars.
+/// Method and constructor names/functionality are generally the same as in the base parser module.
+#[cfg(feature = "utf8")]
+pub mod utf8;
+
 pub use crate::result::{Error, Result};
 
 /// Parser type, `Parser<I, O>` is alias of `parser::Parser<'static, I, O>`.