-
Notifications
You must be signed in to change notification settings - Fork 33
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
"utf8" module supporting matching UTF-8/returning &str (#59)
* Create a utf8::Parser newtype so that collect() can safely return a str
* Replace utf8 parser.as_bytes() with parser.into<parser::Parser>()
* Example program mixing u8 parser and UTF-8 parser
* utf8 sym tag / fix any tag return type
* utf8 one_of/none_of
* is_a/not_a
* Remaining parser constructors for utf8
* More operator overloads: And, Sub, degrade versions for BitOr and Mul
* Doc comments on utf8 macro expansions, utf8 Shr
* Doc comments on utf8 macro expansions, rest of parser functions in utf8
* parse_str convenience calls .as_bytes for you
* Feature for utf8, enabled by default
* utf8::take_bytes() and utf8::skip_bytes()
- Loading branch information
Showing
6 changed files
with
721 additions
and
0 deletions.
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
// Example shows basic UTF-8 combinators | ||
|
||
use pom::utf8::*; | ||
|
||
fn main() { | ||
// Informal, Spanish-language movie database format | ||
let input = "\ | ||
Título: Abre los ojos | ||
Año: 1997 | ||
Director: Alejandro Amenábar | ||
Título: Amores Perros | ||
Director: Alejandro González Iñárritu | ||
Año: 2000 | ||
Título: La montaña sagrada | ||
Año: 1973 | ||
Director: Alejandro Jodorowsky | ||
"; | ||
|
||
enum DataLine<'a> { | ||
Title(&'a str), | ||
Director(&'a str), | ||
Year(i32), | ||
} | ||
|
||
fn positive<'a>() -> Parser<'a, i32> { | ||
// let integer = (one_of("123456789") - one_of("0123456789").repeat(0..)) | sym(b'0'); // TODO | ||
let digit = one_of("0123456789"); | ||
let integer = digit.discard().repeat(1..); | ||
integer.collect().convert(|x|x.parse::<i32>()) | ||
} | ||
|
||
fn rest_str<'a>() -> Parser<'a, &'a str> { | ||
any().repeat(1..).collect() | ||
} | ||
|
||
fn separator<'a>() ->Parser<'a, ()> { | ||
seq(": ").discard() | ||
} | ||
|
||
let parser = | ||
(seq("Título") * separator() * rest_str().map(|s| DataLine::Title(s))) | ||
| (seq("Director") * separator() * rest_str().map(|s| DataLine::Director(s))) | ||
| (seq("Año") * separator() * positive().map(|i| DataLine::Year(i))); | ||
|
||
{ | ||
let mut title_opt:Option<&str> = None; | ||
let mut year_opt:Option<i32> = None; | ||
let mut director_opt:Option<&str> = None; | ||
|
||
for line in input.lines() { | ||
if !line.is_empty() { // Skip blank lines without parsing | ||
// Parse line | ||
match parser.parse_str(line).unwrap() { | ||
DataLine::Title(s) => title_opt = Some(s), | ||
DataLine::Director(s) => director_opt = Some(s), | ||
DataLine::Year(s) => year_opt = Some(s), | ||
} | ||
// When all three line types have been collected, print them | ||
if let (Some(title), Some(year), Some(director)) = (title_opt,year_opt,director_opt) { | ||
println!("Title: {}\nDirector: {}\nYear: {}\n", title, director, year); | ||
(title_opt, year_opt, director_opt) = (None,None,None); | ||
} | ||
} | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
// Example shows UTF-8 combinators intermixed with binary combinators | ||
|
||
use pom::parser::*; | ||
use pom::utf8; | ||
|
||
fn main() { | ||
// A parser for MsgPack (but only messages encoding a string) | ||
let testcases: [Vec<u8>; 6] = [ | ||
vec![0b10100100, 0b11110000, 0b10011111, 0b10100100, 0b10010100], // 🤔, max-size 31 format | ||
vec![0xd9, 4, 0b11110000, 0b10011111, 0b10011000, 0b10101110], // 😮, max-size 255 format | ||
vec![0xda, 0, 4, 0b11110000, 0b10011111, 0b10100100, 0b10101111], // 🤯, max-size 2^16-1 format | ||
vec![0xdb, 0, 0, 0, 4, 0b11110000, 0b10011111, 0b10010010, 0b10100101], // 💥, max-size 2^32-1 format | ||
vec![0xc4, 4, 0b11110000, 0b10011111, 0b10011000, 0b10101110], // Valid MsgPack, but not a string (binary) | ||
vec![0b10100100, 0b10010100, 0b10100100, 0b10011111, 0b11110000], // A MsgPack string, but invalid UTF-8 | ||
]; | ||
|
||
const MASK:u8 = 0b11100000; // size 31 format is denoted by 3 high bits == 101 | ||
const SIZE_31:u8 = 0b10100000; | ||
|
||
fn rest_as_str<'a>() -> utf8::Parser<'a, &'a str> { | ||
utf8::any().repeat(0..).collect() | ||
} | ||
|
||
// Demo parser does not verify that the claimed length matches the actual length (but checking so is simple with >>) | ||
let parser = | ||
(sym(0xdb) * any().repeat(4) * rest_as_str()) // 2^32-1 format | ||
| (sym(0xda) * any().repeat(2) * rest_as_str()) // 2^16-1 format | ||
| (sym(0xd9) * any() * rest_as_str()) // 255 format | ||
| (is_a(|x| x&MASK == SIZE_31) * rest_as_str()) // 31 format | ||
- end(); | ||
|
||
for testcase in testcases.iter() { | ||
println!("{:?}", parser.parse(testcase)); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.