diff --git a/Cargo.toml b/Cargo.toml index d9c018d2..d88e70a4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -41,7 +41,7 @@ default-features = false [dev-dependencies] doc-comment = "0.3" proptest = "=1.0.0" - +nom-language = { path = "./nom-language" } [package.metadata.docs.rs] features = ["alloc", "std", "docsrs"] @@ -66,6 +66,10 @@ name = "css" [[test]] name = "custom_errors" +[[test]] +name = "expression_ast" +required-features = ["alloc"] + [[test]] name = "float" @@ -142,4 +146,4 @@ coveralls = { repository = "Geal/nom", branch = "main", service = "github" } maintenance = { status = "actively-developed" } [workspace] -members = [".", "benchmarks/"] +members = [".", "benchmarks/", "nom-language"] diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index 689630cc..50caef2d 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -56,3 +56,4 @@ harness = false [dev-dependencies] codspeed-criterion-compat = "2.4.1" +nom-language = { path = "../nom-language" } diff --git a/benchmarks/benches/json.rs b/benchmarks/benches/json.rs index 0c6babbd..805f4d50 100644 --- a/benchmarks/benches/json.rs +++ b/benchmarks/benches/json.rs @@ -7,13 +7,14 @@ use nom::{ bytes::{tag, take}, character::{anychar, char, multispace0, none_of}, combinator::{map, map_opt, map_res, value, verify}, - error::{Error, ErrorKind, FromExternalError, ParseError, VerboseError}, + error::{Error, ErrorKind, FromExternalError, ParseError}, multi::{fold, separated_list0}, number::double, number::recognize_float, sequence::{delimited, preceded, separated_pair}, Check, Complete, Emit, IResult, Mode, OutputM, Parser, }; +use nom_language::error::VerboseError; use std::{collections::HashMap, marker::PhantomData, num::ParseIntError}; diff --git a/doc/choosing_a_combinator.md b/doc/choosing_a_combinator.md index 3363f63c..dfdee094 100644 --- a/doc/choosing_a_combinator.md +++ b/doc/choosing_a_combinator.md @@ -106,6 +106,7 @@ The following parsers could be found on [docs.rs number 
section](https://docs.rs - [`escaped`](https://docs.rs/nom/latest/nom/bytes/complete/fn.escaped.html): Matches a byte string with escaped characters - [`escaped_transform`](https://docs.rs/nom/latest/nom/bytes/complete/fn.escaped_transform.html): Matches a byte string with escaped characters, and returns a new string with the escaped characters replaced +- [`precedence`](https://docs.rs/nom/latest/nom/precedence/fn.precedence.html): Parses an expression with regards to operator precedence ## Binary format parsing diff --git a/examples/json.rs b/examples/json.rs index d47d99b3..85003b05 100644 --- a/examples/json.rs +++ b/examples/json.rs @@ -5,12 +5,13 @@ use nom::{ bytes::complete::{escaped, tag, take_while}, character::complete::{alphanumeric1 as alphanumeric, char, one_of}, combinator::{cut, map, opt, value}, - error::{context, convert_error, ContextError, ErrorKind, ParseError, VerboseError}, + error::{context, ContextError, ErrorKind, ParseError}, multi::separated_list0, number::complete::double, sequence::{delimited, preceded, separated_pair, terminated}, Err, IResult, Parser, }; +use nom_language::error::{convert_error, VerboseError}; use std::collections::HashMap; use std::str; diff --git a/examples/s_expression.rs b/examples/s_expression.rs index a85513be..e034b9c0 100644 --- a/examples/s_expression.rs +++ b/examples/s_expression.rs @@ -9,11 +9,12 @@ use nom::{ bytes::complete::tag, character::complete::{alpha1, char, digit1, multispace0, multispace1, one_of}, combinator::{cut, map, map_res, opt}, - error::{context, VerboseError}, + error::context, multi::many, sequence::{delimited, preceded, terminated}, IResult, Parser, }; +use nom_language::error::VerboseError; /// We start by defining the types that define the shape of data that we want. 
/// In this case, we want something tree-like diff --git a/nom-language/Cargo.toml b/nom-language/Cargo.toml new file mode 100644 index 00000000..0503cd7d --- /dev/null +++ b/nom-language/Cargo.toml @@ -0,0 +1,11 @@ +[package] +name = "nom-language" +version = "0.0.1" +authors = ["contact@geoffroycouprie.com"] +description = "Language parsing focused combinators for the nom parser library" +edition = "2021" +license = "MIT" +repository = "https://github.com/rust-bakery/nom" + +[dependencies] +nom = { path = "..", version = "8.0.0-alpha2" } \ No newline at end of file diff --git a/nom-language/src/error.rs b/nom-language/src/error.rs new file mode 100644 index 00000000..4d38f429 --- /dev/null +++ b/nom-language/src/error.rs @@ -0,0 +1,262 @@ +use std::fmt; + +use nom::{ + error::{ContextError, ErrorKind, FromExternalError, ParseError}, + ErrorConvert, +}; + +/// This error type accumulates errors and their position when backtracking +/// through a parse tree. With some post processing, +/// it can be used to display user friendly error messages +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct VerboseError { + /// List of errors accumulated by `VerboseError`, containing the affected + /// part of input data, and some context + pub errors: Vec<(I, VerboseErrorKind)>, +} + +#[derive(Clone, Debug, Eq, PartialEq)] +/// Error context for `VerboseError` +pub enum VerboseErrorKind { + /// Static string added by the `context` function + Context(&'static str), + /// Indicates which character was expected by the `char` function + Char(char), + /// Error kind given by various nom parsers + Nom(ErrorKind), +} + +impl ParseError for VerboseError { + fn from_error_kind(input: I, kind: ErrorKind) -> Self { + VerboseError { + errors: vec![(input, VerboseErrorKind::Nom(kind))], + } + } + + fn append(input: I, kind: ErrorKind, mut other: Self) -> Self { + other.errors.push((input, VerboseErrorKind::Nom(kind))); + other + } + + fn from_char(input: I, c: char) -> Self { + 
VerboseError { + errors: vec![(input, VerboseErrorKind::Char(c))], + } + } +} + +impl ContextError for VerboseError { + fn add_context(input: I, ctx: &'static str, mut other: Self) -> Self { + other.errors.push((input, VerboseErrorKind::Context(ctx))); + other + } +} + +impl FromExternalError for VerboseError { + /// Create a new error from an input position and an external error + fn from_external_error(input: I, kind: ErrorKind, _e: E) -> Self { + Self::from_error_kind(input, kind) + } +} + +impl fmt::Display for VerboseError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + writeln!(f, "Parse error:")?; + for (input, error) in &self.errors { + match error { + VerboseErrorKind::Nom(e) => writeln!(f, "{:?} at: {}", e, input)?, + VerboseErrorKind::Char(c) => writeln!(f, "expected '{}' at: {}", c, input)?, + VerboseErrorKind::Context(s) => writeln!(f, "in section '{}', at: {}", s, input)?, + } + } + + Ok(()) + } +} + +impl std::error::Error for VerboseError {} + +impl From> for VerboseError> { + fn from(value: VerboseError<&[u8]>) -> Self { + VerboseError { + errors: value + .errors + .into_iter() + .map(|(i, e)| (i.to_owned(), e)) + .collect(), + } + } +} + +impl From> for VerboseError { + fn from(value: VerboseError<&str>) -> Self { + VerboseError { + errors: value + .errors + .into_iter() + .map(|(i, e)| (i.to_owned(), e)) + .collect(), + } + } +} + +impl ErrorConvert> for VerboseError<(I, usize)> { + fn convert(self) -> VerboseError { + VerboseError { + errors: self.errors.into_iter().map(|(i, e)| (i.0, e)).collect(), + } + } +} + +impl ErrorConvert> for VerboseError { + fn convert(self) -> VerboseError<(I, usize)> { + VerboseError { + errors: self.errors.into_iter().map(|(i, e)| ((i, 0), e)).collect(), + } + } +} + +/// Transforms a `VerboseError` into a trace with input position information +/// +/// The errors contain references to input data that must come from `input`, +/// because nom calculates byte offsets between them +pub fn 
convert_error>(input: I, e: VerboseError) -> String { + use nom::Offset; + use std::fmt::Write; + + let mut result = String::new(); + + for (i, (substring, kind)) in e.errors.iter().enumerate() { + let offset = input.offset(substring); + + if input.is_empty() { + match kind { + VerboseErrorKind::Char(c) => { + write!(&mut result, "{}: expected '{}', got empty input\n\n", i, c) + } + VerboseErrorKind::Context(s) => write!(&mut result, "{}: in {}, got empty input\n\n", i, s), + VerboseErrorKind::Nom(e) => write!(&mut result, "{}: in {:?}, got empty input\n\n", i, e), + } + } else { + let prefix = &input.as_bytes()[..offset]; + + // Count the number of newlines in the first `offset` bytes of input + let line_number = prefix.iter().filter(|&&b| b == b'\n').count() + 1; + + // Find the line that includes the subslice: + // Find the *last* newline before the substring starts + let line_begin = prefix + .iter() + .rev() + .position(|&b| b == b'\n') + .map(|pos| offset - pos) + .unwrap_or(0); + + // Find the full line after that newline + let line = input[line_begin..] 
+ .lines() + .next() + .unwrap_or(&input[line_begin..]) + .trim_end(); + + // The (1-indexed) column number is the offset of our substring into that line + let column_number = line.offset(substring) + 1; + + match kind { + VerboseErrorKind::Char(c) => { + if let Some(actual) = substring.chars().next() { + write!( + &mut result, + "{i}: at line {line_number}:\n\ + {line}\n\ + {caret:>column$}\n\ + expected '{expected}', found {actual}\n\n", + i = i, + line_number = line_number, + line = line, + caret = '^', + column = column_number, + expected = c, + actual = actual, + ) + } else { + write!( + &mut result, + "{i}: at line {line_number}:\n\ + {line}\n\ + {caret:>column$}\n\ + expected '{expected}', got end of input\n\n", + i = i, + line_number = line_number, + line = line, + caret = '^', + column = column_number, + expected = c, + ) + } + } + VerboseErrorKind::Context(s) => write!( + &mut result, + "{i}: at line {line_number}, in {context}:\n\ + {line}\n\ + {caret:>column$}\n\n", + i = i, + line_number = line_number, + context = s, + line = line, + caret = '^', + column = column_number, + ), + VerboseErrorKind::Nom(e) => write!( + &mut result, + "{i}: at line {line_number}, in {nom_err:?}:\n\ + {line}\n\ + {caret:>column$}\n\n", + i = i, + line_number = line_number, + nom_err = e, + line = line, + caret = '^', + column = column_number, + ), + } + } + // Because `write!` to a `String` is infallible, this `unwrap` is fine. 
+ .unwrap(); + } + + result +} + +#[test] +fn convert_error_panic() { + use nom::character::complete::char; + use nom::IResult; + + let input = ""; + + let _result: IResult<_, _, VerboseError<&str>> = char('x')(input); +} + +#[test] +fn issue_1027_convert_error_panic_nonempty() { + use nom::character::complete::char; + use nom::sequence::pair; + use nom::Err; + use nom::IResult; + use nom::Parser; + + let input = "a"; + + let result: IResult<_, _, VerboseError<&str>> = pair(char('a'), char('b')).parse(input); + let err = match result.unwrap_err() { + Err::Error(e) => e, + _ => unreachable!(), + }; + + let msg = convert_error(input, err); + assert_eq!( + msg, + "0: at line 1:\na\n ^\nexpected \'b\', got end of input\n\n" + ); +} diff --git a/nom-language/src/lib.rs b/nom-language/src/lib.rs new file mode 100644 index 00000000..c018f319 --- /dev/null +++ b/nom-language/src/lib.rs @@ -0,0 +1,9 @@ +//! # Language parsing combinators for the nom parser combinators library +//! +//! nom is a parser combinator library with a focus on safe parsing, +//! streaming patterns, and zero copy. +//! While nom provides general purpose combinators, this crate is targeted +//! at language parsing. + +pub mod error; +pub mod precedence; diff --git a/nom-language/src/precedence/mod.rs b/nom-language/src/precedence/mod.rs new file mode 100644 index 00000000..3cfc262a --- /dev/null +++ b/nom-language/src/precedence/mod.rs @@ -0,0 +1,504 @@ +//! Combinators to parse expressions with operator precedence. + +#[cfg(test)] +mod tests; + +use nom::error::{ErrorKind, FromExternalError, ParseError}; +use nom::{Check, Err, IResult, Input, Mode, OutputM, OutputMode, Parser}; + +/// A unary operator. +pub struct Unary { + value: V, + precedence: Q, +} + +/// A binary operator. +pub struct Binary { + value: V, + precedence: Q, + assoc: Assoc, +} + +/// A single evaluation step. +pub enum Operation { + /// A prefix operation. + Prefix(P1, O), + /// A postfix operation.
+ Postfix(O, P2), + /// A binary operation. + Binary(O, P3, O), +} + +/// Associativity for binary operators. +#[derive(Copy, Clone, PartialEq, Eq)] +pub enum Assoc { + /// Left associative. + Left, + /// Right associative. + Right, +} + +/// Element for operator stack. +enum Operator { + Prefix(P1, Q), + Postfix(P2, Q), + Binary(P3, Q, Assoc), +} + +impl Operator +where + Q: Ord + Copy, +{ + fn precedence(&self) -> Q { + match self { + Operator::Prefix(_, p) => *p, + Operator::Postfix(_, p) => *p, + Operator::Binary(_, p, _) => *p, + } + } + + fn is_postfix(&self) -> bool { + match self { + Operator::Postfix(_, _) => true, + _ => false, + } + } +} + +/// Runs the inner parser and transforms the result into an unary operator with the given precedence. +/// +/// Intended for use with [precedence]. +/// # Arguments +/// * `precedence` The precedence of the operator. +/// * `parser` The parser to apply. +pub fn unary_op( + precedence: Q, + mut parser: P, +) -> impl FnMut(I) -> IResult, E> +where + P: Parser, + Q: Ord + Copy, +{ + move |input| match parser.parse(input) { + Ok((i, value)) => Ok((i, Unary { value, precedence })), + Err(e) => Err(e), + } +} + +/// Runs the inner parser and transforms the result into a binary operator with the given precedence and associativity. +/// +/// Intended for use with [precedence]. +/// # Arguments +/// * `precedence` The precedence of the operator. +/// * `assoc` The associativity of the operator. +/// * `parser` The parser to apply. +pub fn binary_op( + precedence: Q, + assoc: Assoc, + mut parser: P, +) -> impl FnMut(I) -> IResult, E> +where + P: Parser, + Q: Ord + Copy, +{ + move |input| match parser.parse(input) { + Ok((i, value)) => Ok(( + i, + Binary { + value, + precedence, + assoc, + }, + )), + Err(e) => Err(e), + } +} + +/// Parses an expression with operator precedence. +/// +/// Supports prefix, postfix and binary operators. Operators are applied in ascending precedence. 
+/// +/// The parser will track its current position inside the expression and call the respective +/// operand/operator parsers. The prefix and postfix parsers are called repeatedly until they fail before +/// execution moves on to the operand or binary parser. +/// +/// Expressions are folded as soon as possible. The result will be reused as another operand. After the +/// expression has been read completely any remaining operations are folded and the resulting, single +/// operand is returned as the result. +/// +/// It will return `Err(Err::Error((_, ErrorKind::Precedence)))` if: +/// * the `fold` function returns an `Err`. +/// * more than one or no operands remain after the expression has been evaluated completely. +/// * the input does not match the pattern: `prefix* operand postfix* (binary prefix* operand postfix*)*` +/// +/// # Arguments +/// * `prefix` Parser for prefix unary operators. +/// * `postfix` Parser for postfix unary operators. +/// * `binary` Parser for binary operators. +/// * `operand` Parser for operands. +/// * `fold` Function that evaluates a single operation and returns the result.
+/// +/// # Example +/// ```rust +/// # use nom::{Err, error::{Error, ErrorKind}, IResult}; +/// use nom_language::precedence::{precedence, unary_op, binary_op, Assoc, Operation}; +/// use nom::character::complete::digit1; +/// use nom::combinator::{map_res, fail}; +/// use nom::sequence::delimited; +/// use nom::bytes::complete::tag; +/// use nom::branch::alt; +/// +/// fn parser(i: &str) -> IResult<&str, i64> { +/// precedence( +/// unary_op(1, tag("-")), +/// fail(), +/// alt(( +/// binary_op(2, Assoc::Left, tag("*")), +/// binary_op(2, Assoc::Left, tag("/")), +/// binary_op(3, Assoc::Left, tag("+")), +/// binary_op(3, Assoc::Left, tag("-")), +/// )), +/// alt(( +/// map_res(digit1, |s: &str| s.parse::()), +/// delimited(tag("("), parser, tag(")")), +/// )), +/// |op: Operation<&str, &str, &str, i64>| { +/// use nom_language::precedence::Operation::*; +/// match op { +/// Prefix("-", o) => Ok(-o), +/// Binary(lhs, "*", rhs) => Ok(lhs * rhs), +/// Binary(lhs, "/", rhs) => Ok(lhs / rhs), +/// Binary(lhs, "+", rhs) => Ok(lhs + rhs), +/// Binary(lhs, "-", rhs) => Ok(lhs - rhs), +/// _ => Err("Invalid combination"), +/// } +/// } +/// )(i) +/// } +/// +/// assert_eq!(parser("8-2*2"), Ok(("", 4))); +/// assert_eq!(parser("4-(2+2)"), Ok(("", 0))); +/// assert_eq!(parser("3-(2*3)+7+2*2-(2*(2+4))"), Ok(("", -4))); +/// ``` +/// +/// # Evaluation order +/// This parser reads expressions from left to right and folds operations as soon as possible. This +/// behaviour is only important when using an operator grammar that allows for ambiguous expressions. +/// +/// For example, the expression `-a++**b` is ambiguous with the following precedence. +/// +/// | Operator | Position | Precedence | Associativity | +/// |----------|----------|------------|---------------| +/// | ** | Binary | 1 | Right | +/// | - | Prefix | 2 | N/A | +/// | ++ | Postfix | 3 | N/A | +/// +/// The expression can be parsed in two ways: `-((a++)**b)` or `((-a)++)**b`.
This parser will always +/// parse it as the latter because of how it evaluates expressions: +/// * It reads, left-to-right, the first two operators `-a++`. +/// * Because the minus takes precedence over the increment it is evaluated immediately `(-a)++`. +/// * It then reads the remaining input and evaluates the increment next in order to preserve its +/// position in the expression \ +/// `((-a)++)**b`. +pub fn precedence( + mut prefix: H1, + mut postfix: H2, + mut binary: H3, + mut operand: F, + mut fold: G, +) -> impl FnMut(I) -> IResult +where + I: Clone + PartialEq, + E: ParseError + FromExternalError, + F: Parser, + G: FnMut(Operation) -> Result, + H1: Parser, Error = E>, + H2: Parser, Error = E>, + H3: Parser, Error = E>, + Q: Ord + Copy, +{ + move |mut i| { + let mut operands = Vec::new(); + let mut operators = Vec::new(); + let mut i1 = i.clone(); + + 'main: loop { + 'prefix: loop { + match prefix.parse(i1.clone()) { + Err(Err::Error(_)) => break 'prefix, + Err(e) => return Err(e), + Ok((i2, o)) => { + // infinite loop check: the parser must always consume + if i2 == i1 { + return Err(Err::Error(E::from_error_kind(i1, ErrorKind::Precedence))); + } + i1 = i2; + operators.push(Operator::Prefix(o.value, o.precedence)); + } + } + } + + let (i2, o) = match operand.parse(i1.clone()) { + Ok((i, o)) => (i, o), + Err(Err::Error(e)) => return Err(Err::Error(E::append(i, ErrorKind::Precedence, e))), + Err(e) => return Err(e), + }; + i1 = i2; + operands.push(o); + + 'postfix: loop { + match postfix.parse(i1.clone()) { + Err(Err::Error(_)) => break 'postfix, + Err(e) => return Err(e), + Ok((i2, o)) => { + // infinite loop check: the parser must always consume + if i2 == i1 { + return Err(Err::Error(E::from_error_kind(i1, ErrorKind::Precedence))); + } + + while operators + .last() + .map(|op| op.precedence() <= o.precedence) + .unwrap_or(false) + { + let value = operands.pop().unwrap(); + let operation = match operators.pop().unwrap() { + Operator::Prefix(op, _) => 
Operation::Prefix(op, value), + Operator::Postfix(op, _) => Operation::Postfix(value, op), + Operator::Binary(op, _, _) => match operands.pop() { + Some(lhs) => Operation::Binary(lhs, op, value), + None => return Err(Err::Error(E::from_error_kind(i1, ErrorKind::Precedence))), + }, + }; + let result = match fold(operation) { + Err(e) => { + return Err(Err::Error(E::from_external_error( + i, + ErrorKind::Precedence, + e, + ))) + } + Ok(r) => r, + }; + operands.push(result); + } + i1 = i2; + operators.push(Operator::Postfix(o.value, o.precedence)); + } + } + } + + match binary.parse(i1.clone()) { + Err(Err::Error(_)) => break 'main, + Err(e) => return Err(e), + Ok((i2, o)) => { + while operators + .last() + .map(|op| { + op.precedence() < o.precedence + || (o.assoc == Assoc::Left && op.precedence() == o.precedence) + || (op.is_postfix()) + }) + .unwrap_or(false) + { + let value = operands.pop().unwrap(); + let operation = match operators.pop().unwrap() { + Operator::Prefix(op, _) => Operation::Prefix(op, value), + Operator::Postfix(op, _) => Operation::Postfix(value, op), + Operator::Binary(op, _, _) => match operands.pop() { + Some(lhs) => Operation::Binary(lhs, op, value), + None => return Err(Err::Error(E::from_error_kind(i1, ErrorKind::Precedence))), + }, + }; + let result = match fold(operation) { + Err(e) => { + return Err(Err::Error(E::from_external_error( + i, + ErrorKind::Precedence, + e, + ))) + } + Ok(r) => r, + }; + operands.push(result); + } + operators.push(Operator::Binary(o.value, o.precedence, o.assoc)); + i1 = i2; + } + } + + // infinite loop check: either operand or operator must consume input + if i == i1 { + return Err(Err::Error(E::from_error_kind(i, ErrorKind::Precedence))); + } + i = i1.clone(); + } + + while operators.len() > 0 { + let value = match operands.pop() { + Some(o) => o, + None => return Err(Err::Error(E::from_error_kind(i, ErrorKind::Precedence))), + }; + let operation = match operators.pop().unwrap() { + Operator::Prefix(op, _) => 
Operation::Prefix(op, value), + Operator::Postfix(op, _) => Operation::Postfix(value, op), + Operator::Binary(op, _, _) => match operands.pop() { + Some(lhs) => Operation::Binary(lhs, op, value), + None => return Err(Err::Error(E::from_error_kind(i, ErrorKind::Precedence))), + }, + }; + let result = match fold(operation) { + Ok(r) => r, + Err(e) => { + return Err(Err::Error(E::from_external_error( + i, + ErrorKind::Precedence, + e, + ))) + } + }; + operands.push(result); + } + + if operands.len() == 1 { + return Ok((i1, operands.pop().unwrap())); + } else { + return Err(Err::Error(E::from_error_kind(i, ErrorKind::Precedence))); + } + } +} + +/// Applies a parser multiple times separated by another parser. +/// +/// It is similar to [`separated_list1`][nom::multi::separated_list1] but instead of collecting +/// into a vector, you have a callback to build the output. +/// +/// In a LALR grammar a left recursive operator is usually built with a rule syntax such as: +/// * A := A op B | B +/// +/// If you try to parse that with [`alt`][nom::branch::alt] it will fail with a stack overflow +/// because the recursion is unlimited. This function solves this problem by converting the recursion +/// into an iteration. +/// +/// Compare with a right recursive operator, that in LALR would be: +/// * A := B op A | B +/// Or equivalently: +/// * A := B (op A)? +/// +/// That can be written in `nom` trivially. +/// +/// This stops when either parser returns [`Err::Error`] and returns the last built value. To instead chain an error up, see +/// [`cut`][nom::combinator::cut]. +/// +/// # Arguments +/// * `child` The parser to apply. +/// * `operator` Parses the operator between arguments. +/// * `init` A function returning the initial value. +/// * `fold` The function that combines a result of `f` with +/// the current accumulator.
+/// ```rust +/// # #[macro_use] extern crate nom; +/// # use nom::{Err, error::ErrorKind, Needed, IResult, Parser}; +/// use nom_language::precedence::left_assoc; +/// use nom::branch::alt; +/// use nom::sequence::delimited; +/// use nom::character::complete::{char, digit1}; +/// +/// fn add(i: &str) -> IResult<&str, String> { +/// left_assoc(mult, char('+'), |a, o, b| format!("{o}{a}{b}")).parse(i) +/// } +/// fn mult(i: &str) -> IResult<&str, String> { +/// left_assoc(single, char('*'), |a, o, b| format!("{o}{a}{b}")).parse(i) +/// } +/// fn single(i: &str) -> IResult<&str, String> { +/// alt(( +/// digit1.map(|x: &str| x.to_string()), +/// delimited(char('('), add, char(')')) +/// )).parse(i) +/// } +/// +/// assert_eq!(single("(1+2*3)"), Ok(("", String::from("+1*23")))); +/// assert_eq!(single("((1+2)*3)"), Ok(("", String::from("*+123")))); +/// assert_eq!(single("(1*2+3)"), Ok(("", String::from("+*123")))); +/// assert_eq!(single("((1+2*3)+4)"), Ok(("", String::from("++1*234")))); +/// assert_eq!(single("(1+(2*3+4))"), Ok(("", String::from("+1+*234")))); +/// ``` +pub fn left_assoc( + child: F, + operator: G, + builder: B, +) -> impl Parser +where + I: Clone + Input, + E: ParseError, + F: Parser, + G: Parser, + B: FnMut(O, OP, O) -> O, +{ + LeftAssoc { + child, + operator, + builder, + } +} + +/// Parser implementation for the [left_assoc] combinator +pub struct LeftAssoc { + child: F, + operator: G, + builder: B, +} + +impl Parser for LeftAssoc +where + I: Clone + Input, + E: ParseError, + F: Parser, + G: Parser, + B: FnMut(O, OP, O) -> O, +{ + type Output = O; + type Error = E; + + fn process( + &mut self, + mut i: I, + ) -> nom::PResult { + let (i1, mut res) = self.child.process::(i)?; + i = i1; + + loop { + let len = i.input_len(); + match self + .operator + .process::>(i.clone()) + { + Err(Err::Error(_)) => return Ok((i, res)), + Err(Err::Failure(e)) => return Err(Err::Failure(e)), + Err(Err::Incomplete(e)) => return Err(Err::Incomplete(e)), +
Ok((i1, op)) => { + match self + .child + .process::>(i1.clone()) + { + Err(Err::Error(_)) => return Ok((i, res)), + Err(Err::Failure(e)) => return Err(Err::Failure(e)), + Err(Err::Incomplete(e)) => return Err(Err::Incomplete(e)), + Ok((i2, rhs)) => { + // infinite loop check: the parser must always consume + if i2.input_len() == len { + return Err(Err::Error(OM::Error::bind(|| { + >::Error::from_error_kind(i, ErrorKind::SeparatedList) + }))); + } + // there is no combine() with 3 arguments, fake it with a tuple and two calls + let op_rhs = OM::Output::combine(op, rhs, |op, rhs| (op, rhs)); + res = OM::Output::combine(res, op_rhs, |lhs, (op, rhs)| (self.builder)(lhs, op, rhs)); + i = i2; + } + } + } + } + } + } +} diff --git a/nom-language/src/precedence/tests.rs b/nom-language/src/precedence/tests.rs new file mode 100644 index 00000000..fc8bf63a --- /dev/null +++ b/nom-language/src/precedence/tests.rs @@ -0,0 +1,73 @@ +use crate::precedence::{binary_op, unary_op, Assoc, Operation}; +use nom::{ + branch::alt, + bytes::complete::tag, + character::complete::digit1, + combinator::{fail, map_res}, + error::ErrorKind, + error_node_position, error_position, + sequence::delimited, + Err, IResult, +}; + +use crate::precedence::precedence; + +fn parser(i: &str) -> IResult<&str, i64> { + precedence( + unary_op(1, tag("-")), + fail(), + alt(( + binary_op(2, Assoc::Left, tag("*")), + binary_op(2, Assoc::Left, tag("/")), + binary_op(3, Assoc::Left, tag("+")), + binary_op(3, Assoc::Left, tag("-")), + )), + alt(( + map_res(digit1, |s: &str| s.parse::()), + delimited(tag("("), parser, tag(")")), + )), + |op: Operation<&str, (), &str, i64>| { + use crate::precedence::Operation::*; + match op { + Prefix("-", o) => Ok(-o), + Binary(lhs, "*", rhs) => Ok(lhs * rhs), + Binary(lhs, "/", rhs) => Ok(lhs / rhs), + Binary(lhs, "+", rhs) => Ok(lhs + rhs), + Binary(lhs, "-", rhs) => Ok(lhs - rhs), + _ => Err("Invalid combination"), + } + }, + )(i) +} + +#[test] +fn precedence_test() { + 
assert_eq!(parser("3"), Ok(("", 3))); + assert_eq!(parser("-3"), Ok(("", -3))); + assert_eq!(parser("4-(2*2)"), Ok(("", 0))); + assert_eq!(parser("4-2*2"), Ok(("", 0))); + assert_eq!(parser("(4-2)*2"), Ok(("", 4))); + assert_eq!(parser("2*2/1"), Ok(("", 4))); + + let a = "a"; + + assert_eq!( + parser(a), + Err(Err::Error(error_node_position!( + &a[..], + ErrorKind::Precedence, + error_position!(&a[..], ErrorKind::Tag) + ))) + ); + + let b = "3+b"; + + assert_eq!( + parser(b), + Err(Err::Error(error_node_position!( + &b[2..], + ErrorKind::Precedence, + error_position!(&b[2..], ErrorKind::Tag) + ))) + ); +} diff --git a/src/error.rs b/src/error.rs index f4072c3a..dac6a400 100644 --- a/src/error.rs +++ b/src/error.rs @@ -15,7 +15,7 @@ use crate::internal::IResult; /// This trait must be implemented by the error type of a nom parser. /// /// There are already implementations of it for `(Input, ErrorKind)` -/// and `VerboseError`. +/// and `Error`. /// /// It provides methods to create an error from some combinators, /// and combine existing errors in combinators like `alt`. @@ -212,117 +212,6 @@ pub fn append_error>(input: I, kind: ErrorKind, other: E) -> E::append(input, kind, other) } -/// This error type accumulates errors and their position when backtracking -/// through a parse tree. 
With some post processing (cf `examples/json.rs`), -/// it can be used to display user friendly error messages -#[cfg(feature = "alloc")] -#[cfg_attr(feature = "docsrs", doc(cfg(feature = "alloc")))] -#[derive(Clone, Debug, Eq, PartialEq)] -pub struct VerboseError { - /// List of errors accumulated by `VerboseError`, containing the affected - /// part of input data, and some context - pub errors: crate::lib::std::vec::Vec<(I, VerboseErrorKind)>, -} - -#[cfg(feature = "alloc")] -#[cfg_attr(feature = "docsrs", doc(cfg(feature = "alloc")))] -#[derive(Clone, Debug, Eq, PartialEq)] -/// Error context for `VerboseError` -pub enum VerboseErrorKind { - /// Static string added by the `context` function - Context(&'static str), - /// Indicates which character was expected by the `char` function - Char(char), - /// Error kind given by various nom parsers - Nom(ErrorKind), -} - -#[cfg(feature = "alloc")] -#[cfg_attr(feature = "docsrs", doc(cfg(feature = "alloc")))] -impl ParseError for VerboseError { - fn from_error_kind(input: I, kind: ErrorKind) -> Self { - VerboseError { - errors: vec![(input, VerboseErrorKind::Nom(kind))], - } - } - - fn append(input: I, kind: ErrorKind, mut other: Self) -> Self { - other.errors.push((input, VerboseErrorKind::Nom(kind))); - other - } - - fn from_char(input: I, c: char) -> Self { - VerboseError { - errors: vec![(input, VerboseErrorKind::Char(c))], - } - } -} - -#[cfg(feature = "alloc")] -#[cfg_attr(feature = "docsrs", doc(cfg(feature = "alloc")))] -impl ContextError for VerboseError { - fn add_context(input: I, ctx: &'static str, mut other: Self) -> Self { - other.errors.push((input, VerboseErrorKind::Context(ctx))); - other - } -} - -#[cfg(feature = "alloc")] -#[cfg_attr(feature = "docsrs", doc(cfg(feature = "alloc")))] -impl FromExternalError for VerboseError { - /// Create a new error from an input position and an external error - fn from_external_error(input: I, kind: ErrorKind, _e: E) -> Self { - Self::from_error_kind(input, kind) - } 
-} - -#[cfg(feature = "alloc")] -impl fmt::Display for VerboseError { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - writeln!(f, "Parse error:")?; - for (input, error) in &self.errors { - match error { - VerboseErrorKind::Nom(e) => writeln!(f, "{:?} at: {}", e, input)?, - VerboseErrorKind::Char(c) => writeln!(f, "expected '{}' at: {}", c, input)?, - VerboseErrorKind::Context(s) => writeln!(f, "in section '{}', at: {}", s, input)?, - } - } - - Ok(()) - } -} - -#[cfg(feature = "std")] -impl std::error::Error for VerboseError {} - -#[cfg(feature = "alloc")] -#[cfg_attr(feature = "docsrs", doc(cfg(feature = "alloc")))] -impl From> for VerboseError> { - fn from(value: VerboseError<&[u8]>) -> Self { - VerboseError { - errors: value - .errors - .into_iter() - .map(|(i, e)| (i.to_owned(), e)) - .collect(), - } - } -} - -#[cfg(feature = "alloc")] -#[cfg_attr(feature = "docsrs", doc(cfg(feature = "alloc")))] -impl From> for VerboseError { - fn from(value: VerboseError<&str>) -> Self { - VerboseError { - errors: value - .errors - .into_iter() - .map(|(i, e)| (i.to_owned(), e)) - .collect(), - } - } -} - /// Create a new error from an input position, a static string and an existing error. 
/// This is used mainly in the [context] combinator, to add user friendly information /// to errors when backtracking through a parse tree @@ -360,123 +249,6 @@ where } } -/// Transforms a `VerboseError` into a trace with input position information -/// -/// The errors contain references to input data that must come from `input`, -/// because nom calculates byte offsets between them -#[cfg(feature = "alloc")] -#[cfg_attr(feature = "docsrs", doc(cfg(feature = "alloc")))] -pub fn convert_error>( - input: I, - e: VerboseError, -) -> crate::lib::std::string::String { - use crate::lib::std::fmt::Write; - use crate::traits::Offset; - - let mut result = crate::lib::std::string::String::new(); - - for (i, (substring, kind)) in e.errors.iter().enumerate() { - let offset = input.offset(substring); - - if input.is_empty() { - match kind { - VerboseErrorKind::Char(c) => { - write!(&mut result, "{}: expected '{}', got empty input\n\n", i, c) - } - VerboseErrorKind::Context(s) => write!(&mut result, "{}: in {}, got empty input\n\n", i, s), - VerboseErrorKind::Nom(e) => write!(&mut result, "{}: in {:?}, got empty input\n\n", i, e), - } - } else { - let prefix = &input.as_bytes()[..offset]; - - // Count the number of newlines in the first `offset` bytes of input - let line_number = prefix.iter().filter(|&&b| b == b'\n').count() + 1; - - // Find the line that includes the subslice: - // Find the *last* newline before the substring starts - let line_begin = prefix - .iter() - .rev() - .position(|&b| b == b'\n') - .map(|pos| offset - pos) - .unwrap_or(0); - - // Find the full line after that newline - let line = input[line_begin..] 
- .lines() - .next() - .unwrap_or(&input[line_begin..]) - .trim_end(); - - // The (1-indexed) column number is the offset of our substring into that line - let column_number = line.offset(substring) + 1; - - match kind { - VerboseErrorKind::Char(c) => { - if let Some(actual) = substring.chars().next() { - write!( - &mut result, - "{i}: at line {line_number}:\n\ - {line}\n\ - {caret:>column$}\n\ - expected '{expected}', found {actual}\n\n", - i = i, - line_number = line_number, - line = line, - caret = '^', - column = column_number, - expected = c, - actual = actual, - ) - } else { - write!( - &mut result, - "{i}: at line {line_number}:\n\ - {line}\n\ - {caret:>column$}\n\ - expected '{expected}', got end of input\n\n", - i = i, - line_number = line_number, - line = line, - caret = '^', - column = column_number, - expected = c, - ) - } - } - VerboseErrorKind::Context(s) => write!( - &mut result, - "{i}: at line {line_number}, in {context}:\n\ - {line}\n\ - {caret:>column$}\n\n", - i = i, - line_number = line_number, - context = s, - line = line, - caret = '^', - column = column_number, - ), - VerboseErrorKind::Nom(e) => write!( - &mut result, - "{i}: at line {line_number}, in {nom_err:?}:\n\ - {line}\n\ - {caret:>column$}\n\n", - i = i, - line_number = line_number, - nom_err = e, - line = line, - caret = '^', - column = column_number, - ), - } - } - // Because `write!` to a `String` is infallible, this `unwrap` is fine. 
- .unwrap(); - } - - result -} - /// Indicates which parser returned an error #[rustfmt::skip] #[derive(Debug,PartialEq,Eq,Hash,Clone,Copy)] @@ -538,6 +310,7 @@ pub enum ErrorKind { Fail, Many, Fold, + Precedence, } #[rustfmt::skip] @@ -601,6 +374,7 @@ pub fn error_to_u32(e: &ErrorKind) -> u32 { ErrorKind::Many => 76, ErrorKind::Fold => 77, ErrorKind::BinDigit => 78, + ErrorKind::Precedence => 79, } } @@ -666,6 +440,7 @@ impl ErrorKind { ErrorKind::Fail => "Fail", ErrorKind::Many => "Many", ErrorKind::Fold => "Fold", + ErrorKind::Precedence => "Precedence", } } } @@ -791,17 +566,6 @@ mod tests { ); } - #[cfg(feature = "alloc")] - #[test] - fn convert_error_panic() { - use crate::character::complete::char; - use crate::internal::IResult; - - let input = ""; - - let _result: IResult<_, _, VerboseError<&str>> = char('x')(input); - } - #[cfg(feature = "alloc")] #[test] fn clone_error() { diff --git a/src/traits.rs b/src/traits.rs index 1313953e..f778c360 100644 --- a/src/traits.rs +++ b/src/traits.rs @@ -1309,26 +1309,6 @@ impl ErrorConvert> for error::Error { } } -#[cfg(feature = "alloc")] -#[cfg_attr(feature = "docsrs", doc(cfg(feature = "alloc")))] -impl ErrorConvert> for error::VerboseError<(I, usize)> { - fn convert(self) -> error::VerboseError { - error::VerboseError { - errors: self.errors.into_iter().map(|(i, e)| (i.0, e)).collect(), - } - } -} - -#[cfg(feature = "alloc")] -#[cfg_attr(feature = "docsrs", doc(cfg(feature = "alloc")))] -impl ErrorConvert> for error::VerboseError { - fn convert(self) -> error::VerboseError<(I, usize)> { - error::VerboseError { - errors: self.errors.into_iter().map(|(i, e)| ((i, 0), e)).collect(), - } - } -} - impl ErrorConvert<()> for () { fn convert(self) {} } diff --git a/tests/expression_ast.rs b/tests/expression_ast.rs new file mode 100644 index 00000000..0d04c9bb --- /dev/null +++ b/tests/expression_ast.rs @@ -0,0 +1,169 @@ +use nom::{ + branch::alt, + bytes::complete::tag, + character::complete::{alphanumeric1 as 
alphanumeric, digit1 as digit}, + combinator::{map, map_res}, + multi::separated_list0, + sequence::delimited, + IResult, Parser, +}; +use nom_language::precedence::{binary_op, precedence, unary_op, Assoc, Operation}; + +// Elements of the abstract syntax tree (ast) that represents an expression. +#[derive(Debug)] +pub enum Expr { + // A number literal. + Num(i64), + // An identifier. + Iden(String), + // Arithmetic operations. Each have a left hand side (lhs) and a right hand side (rhs). + Add(Box, Box), + Sub(Box, Box), + Mul(Box, Box), + Div(Box, Box), + // The function call operation. Left is the expression the function is called on, right is the list of parameters. + Call(Box, Vec), + // The ternary operator, the expressions from left to right are: The condition, the true case, the false case. + Tern(Box, Box, Box), +} + +// Prefix operators. +enum PrefixOp { + Identity, // + + Negate, // - +} + +// Postfix operators. +enum PostfixOp { + // The function call operator. In addition to its own representation "()" it carries additional information that we need to keep here. + // Specifically the vector of expressions that make up the parameters. + Call(Vec), // () +} + +// Binary operators. +enum BinaryOp { + Addition, // + + Subtraction, // - + Multiplication, // * + Division, // / + // The ternary operator can contain a single expression. + Ternary(Expr), // ?: +} + +// Parser for function calls. +fn function_call(i: &str) -> IResult<&str, PostfixOp> { + map( + delimited( + tag("("), + // Subexpressions are evaluated by recursing back into the expression parser. + separated_list0(tag(","), expression), + tag(")"), + ), + |v: Vec| PostfixOp::Call(v), + ) + .parse(i) +} + +// The ternary operator is actually just a binary operator that contains another expression. So it can be +// handled similarly to the function call operator except its in a binary position and can only contain +// a single expression. 
+// +// For example the expression "a IResult<&str, BinaryOp> { + map(delimited(tag("?"), expression, tag(":")), |e: Expr| { + BinaryOp::Ternary(e) + }) + .parse(i) +} + +// The actual expression parser . +fn expression(i: &str) -> IResult<&str, Expr> { + precedence( + alt(( + unary_op(2, map(tag("+"), |_| PrefixOp::Identity)), + unary_op(2, map(tag("-"), |_| PrefixOp::Negate)), + )), + // Function calls are implemented as postfix unary operators. + unary_op(1, function_call), + alt(( + binary_op( + 3, + Assoc::Left, + alt(( + map(tag("*"), |_| BinaryOp::Multiplication), + map(tag("/"), |_| BinaryOp::Division), + )), + ), + binary_op( + 4, + Assoc::Left, + alt(( + map(tag("+"), |_| BinaryOp::Addition), + map(tag("-"), |_| BinaryOp::Subtraction), + )), + ), + // Ternary operators are just binary operators with a subexpression. + binary_op(5, Assoc::Right, ternary_operator), + )), + alt(( + map_res(digit, |s: &str| match s.parse::() { + Ok(s) => Ok(Expr::Num(s)), + Err(e) => Err(e), + }), + map(alphanumeric, |s: &str| Expr::Iden(s.to_string())), + delimited(tag("("), expression, tag(")")), + )), + |op: Operation| -> Result { + use nom_language::precedence::Operation::*; + use BinaryOp::*; + use PostfixOp::*; + use PrefixOp::*; + match op { + // The identity operator (prefix +) is ignored. + Prefix(Identity, e) => Ok(e), + + // Unary minus gets evaluated to the same representation as a multiplication with -1. + Prefix(Negate, e) => Ok(Expr::Mul(Expr::Num(-1).into(), e.into())), + + // The list of parameters are taken from the operator and placed into the ast. + Postfix(e, Call(p)) => Ok(Expr::Call(e.into(), p)), + + // Meaning is assigned to the expressions of the ternary operator during evaluation. + // The lhs becomes the condition, the contained expression is the true case, rhs the false case. + Binary(lhs, Ternary(e), rhs) => Ok(Expr::Tern(lhs.into(), e.into(), rhs.into())), + + // Raw operators get turned into their respective ast nodes. 
+ Binary(lhs, Multiplication, rhs) => Ok(Expr::Mul(lhs.into(), rhs.into())), + Binary(lhs, Division, rhs) => Ok(Expr::Div(lhs.into(), rhs.into())), + Binary(lhs, Addition, rhs) => Ok(Expr::Add(lhs.into(), rhs.into())), + Binary(lhs, Subtraction, rhs) => Ok(Expr::Sub(lhs.into(), rhs.into())), + } + }, + )(i) +} + +#[test] +fn expression_test() { + assert_eq!( + expression("-2*max(2,3)-2").map(|(i, x)| (i, format!("{:?}", x))), + Ok(( + "", + String::from("Sub(Mul(Mul(Num(-1), Num(2)), Call(Iden(\"max\"), [Num(2), Num(3)])), Num(2))") + )) + ); + + assert_eq!( + expression("a?2+c:-2*2").map(|(i, x)| (i, format!("{:?}", x))), + Ok(( + "", + String::from( + "Tern(Iden(\"a\"), Add(Num(2), Iden(\"c\")), Mul(Mul(Num(-1), Num(2)), Num(2)))" + ) + )) + ); +} diff --git a/tests/issues.rs b/tests/issues.rs index ecff7b3a..2b1923d6 100644 --- a/tests/issues.rs +++ b/tests/issues.rs @@ -178,27 +178,6 @@ fn issue_many_m_n_with_zeros() { assert_eq!(parser.parse("aaa"), Ok(("aaa", vec!()))); } -#[test] -fn issue_1027_convert_error_panic_nonempty() { - use nom::character::complete::char; - use nom::error::{convert_error, VerboseError}; - use nom::sequence::pair; - - let input = "a"; - - let result: IResult<_, _, VerboseError<&str>> = pair(char('a'), char('b')).parse(input); - let err = match result.unwrap_err() { - Err::Error(e) => e, - _ => unreachable!(), - }; - - let msg = convert_error(input, err); - assert_eq!( - msg, - "0: at line 1:\na\n ^\nexpected \'b\', got end of input\n\n" - ); -} - #[test] fn issue_1231_bits_expect_fn_closure() { use nom::bits::{bits, complete::take};