Skip to content

Commit

Permalink
"utf8" module supporting matching UTF-8/returning &str (#59)
Browse files Browse the repository at this point in the history
* Create a utf8::Parser newtype so that collect() can safely return a str

* Replace utf8 parser.as_bytes() with parser.into<parser::Parser>()

* Example program mixing u8 parser and UTF-8 parser

* utf8 sym tag / fix any tag return type

* utf8 one_of/none_of

* is_a/not_a

* Remaining parser constructors for utf8

* More operator overloads: And, Sub, degrade versions for BitOr and Mul

* Doc comments on utf8 macro expansions, utf8 Shr

* Doc comments on utf8 macro expansions, rest of parser functions in utf8

* parse_str convenience calls .as_bytes for you

* Feature for utf8, enabled by default

* utf8::take_bytes() and utf8::skip_bytes()
  • Loading branch information
mcclure authored Jan 3, 2023
1 parent 9f4a265 commit 7efec98
Show file tree
Hide file tree
Showing 6 changed files with 721 additions and 0 deletions.
46 changes: 46 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 7 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,10 @@ keywords = ["parser", "parser-combinators", "parsing", "PEG"]

[badges]
travis-ci = { repository = "J-F-Liu/pom" }

[dependencies]
# NOTE(review): an empty `features` list enables nothing extra, but it does not switch off
# bstr's default features; Cargo only does that with `default-features = false`.
# Confirm which of the two was intended.
bstr = {version = "1.1.0", features = []} # Only uses one function, so no features needed.

[features]
# `utf8` gates the `utf8` module in src/lib.rs (`#[cfg(feature = "utf8")]`); it is on by default.
default = ["utf8"]
utf8 = []
68 changes: 68 additions & 0 deletions examples/utf8.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
// Example shows basic UTF-8 combinators

use pom::utf8::*;

fn main() {
    // Informal, Spanish-language movie database format
    let input = "\
Título: Abre los ojos
Año: 1997
Director: Alejandro Amenábar
Título: Amores Perros
Director: Alejandro González Iñárritu
Año: 2000
Título: La montaña sagrada
Año: 1973
Director: Alejandro Jodorowsky
";

    // One parsed line of the database: the field it carries plus its value.
    enum DataLine<'a> {
        Title(&'a str),
        Director(&'a str),
        Year(i32),
    }

    // Matches one or more ASCII digits and converts the matched text to an `i32`.
    fn positive<'a>() -> Parser<'a, i32> {
        // TODO: accept either a lone `0` or a nonzero digit followed by more digits,
        // so that numbers with leading zeros are rejected.
        one_of("0123456789")
            .discard()
            .repeat(1..)
            .collect()
            .convert(|digits| digits.parse::<i32>())
    }

    // Matches one or more characters and returns them as a single string slice.
    fn rest_str<'a>() -> Parser<'a, &'a str> {
        any().repeat(1..).collect()
    }

    // Matches the ": " between a field label and its value, producing nothing.
    fn separator<'a>() -> Parser<'a, ()> {
        seq(": ").discard()
    }

    // Each alternative is: field label, separator, then the value wrapped in its `DataLine` variant.
    let title_line = seq("Título") * separator() * rest_str().map(DataLine::Title);
    let director_line = seq("Director") * separator() * rest_str().map(DataLine::Director);
    let year_line = seq("Año") * separator() * positive().map(DataLine::Year);
    let parser = title_line | director_line | year_line;

    // Fields of the record currently being assembled; the lines may arrive in any order.
    let mut title: Option<&str> = None;
    let mut year: Option<i32> = None;
    let mut director: Option<&str> = None;

    // Blank lines are skipped without being handed to the parser.
    for line in input.lines().filter(|text| !text.is_empty()) {
        // The input is fixed above, so a line that fails to parse is a bug in this example.
        match parser.parse_str(line).unwrap() {
            DataLine::Title(text) => title = Some(text),
            DataLine::Director(text) => director = Some(text),
            DataLine::Year(number) => year = Some(number),
        }

        // Once all three fields are present, print the record and start a new one.
        if let (Some(t), Some(y), Some(d)) = (title, year, director) {
            println!("Title: {}\nDirector: {}\nYear: {}\n", t, d, y);
            title = None;
            year = None;
            director = None;
        }
    }
}
35 changes: 35 additions & 0 deletions examples/utf8_mixed.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
// Example shows UTF-8 combinators intermixed with binary combinators

use pom::parser::*;
use pom::utf8;

fn main() {
    // A parser for MsgPack (but only messages encoding a string)
    let testcases: [Vec<u8>; 6] = [
        vec![0b10100100, 0b11110000, 0b10011111, 0b10100100, 0b10010100], // 🤔, max-size 31 format
        vec![0xd9, 4, 0b11110000, 0b10011111, 0b10011000, 0b10101110], // 😮, max-size 255 format
        vec![0xda, 0, 4, 0b11110000, 0b10011111, 0b10100100, 0b10101111], // 🤯, max-size 2^16-1 format
        vec![0xdb, 0, 0, 0, 4, 0b11110000, 0b10011111, 0b10010010, 0b10100101], // 💥, max-size 2^32-1 format
        vec![0xc4, 4, 0b11110000, 0b10011111, 0b10011000, 0b10101110], // Valid MsgPack, but not a string (binary)
        vec![0b10100100, 0b10010100, 0b10100100, 0b10011111, 0b11110000], // A MsgPack string, but invalid UTF-8
    ];

    const MASK: u8 = 0b11100000; // size 31 format is denoted by 3 high bits == 101
    const SIZE_31: u8 = 0b10100000;

    // Matches zero or more UTF-8 characters and returns them as one string slice.
    fn rest_as_str<'a>() -> utf8::Parser<'a, &'a str> {
        utf8::any().repeat(0..).collect()
    }

    // Demo parser does not verify that the claimed length matches the actual length
    // (but adding that check is simple with >>).
    //
    // The outer parentheses matter: Rust gives binary `-` higher precedence than `|`, so
    // without them `- end()` would attach only to the last alternative and the other three
    // formats would accept input with bytes left over after `rest_as_str()`.
    let parser = (
        (sym(0xdb) * any().repeat(4) * rest_as_str()) // 2^32-1 format
        | (sym(0xda) * any().repeat(2) * rest_as_str()) // 2^16-1 format
        | (sym(0xd9) * any() * rest_as_str()) // 255 format
        | (is_a(|x| x & MASK == SIZE_31) * rest_as_str()) // 31 format
    ) - end();

    // Print the parse result (or error) for every test message.
    for testcase in testcases.iter() {
        println!("{:?}", parser.parse(testcase));
    }
}
5 changes: 5 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,11 @@ pub mod parser;
/// Utility functions to recognize char class of byte value.
pub mod char_class;

/// Variants of parser functions specialized for matching UTF-8 strings and returning chars.
/// Method and constructor names/functionality are generally the same as in base parser module.
#[cfg(feature = "utf8")]
pub mod utf8;

pub use crate::result::{Error, Result};

/// Parser type, `Parser<I, O>` is alias of `parser::Parser<'static, I, O>`.
Expand Down
Loading

0 comments on commit 7efec98

Please sign in to comment.