diff --git a/pomsky-syntax/src/exprs/alternation.rs b/pomsky-syntax/src/exprs/alternation.rs index 9f575dd..eac172d 100644 --- a/pomsky-syntax/src/exprs/alternation.rs +++ b/pomsky-syntax/src/exprs/alternation.rs @@ -3,7 +3,7 @@ use crate::Span; -use super::{Literal, Rule}; +use super::Rule; /// An [alternation](https://www.regular-expressions.info/alternation.html). /// This is a list of alternatives. Each alternative is a [`Rule`]. @@ -20,28 +20,6 @@ pub struct Alternation { } impl Alternation { - pub(crate) fn new_expr(rules: Vec) -> Rule { - rules - .into_iter() - .reduce(|a, b| match (a, b) { - (Rule::Alternation(mut a), Rule::Alternation(b)) => { - a.span = a.span.join(b.span); - a.rules.extend(b.rules); - Rule::Alternation(a) - } - (Rule::Alternation(mut a), b) => { - a.span = a.span.join(b.span()); - a.rules.push(b); - Rule::Alternation(a) - } - (a, b) => { - let span = a.span().join(b.span()); - Rule::Alternation(Alternation { rules: vec![a, b], span }) - } - }) - .unwrap_or_else(|| Rule::Literal(Literal::new("".to_string(), Span::default()))) - } - #[cfg(feature = "dbg")] pub(super) fn pretty_print(&self, buf: &mut crate::PrettyPrinter, needs_parens: bool) { if needs_parens { diff --git a/pomsky-syntax/src/exprs/intersection.rs b/pomsky-syntax/src/exprs/intersection.rs index 0ac0686..2cefe94 100644 --- a/pomsky-syntax/src/exprs/intersection.rs +++ b/pomsky-syntax/src/exprs/intersection.rs @@ -20,33 +20,6 @@ pub struct Intersection { } impl Intersection { - pub(crate) fn new_expr(rules: Vec, start_span: Span) -> Option { - rules - .into_iter() - .reduce(|a, b| match (a, b) { - (Rule::Intersection(mut a), Rule::Intersection(b)) => { - a.span = a.span.join(b.span); - a.rules.extend(b.rules); - Rule::Intersection(a) - } - (Rule::Intersection(mut a), b) => { - a.span = a.span.join(b.span()); - a.rules.push(b); - Rule::Intersection(a) - } - (a, b) => { - let span = a.span().join(b.span()); - Rule::Intersection(Intersection { rules: vec![a, b], span }) - } - }) - .map(|mut rule| { - if let Rule::Intersection(i) = &mut rule { - i.span = i.span.join(start_span) - } - rule - }) - } - #[cfg(feature = "dbg")] pub(super) fn pretty_print(&self, buf: &mut crate::PrettyPrinter, needs_parens: bool) { if needs_parens { diff --git a/pomsky-syntax/src/lexer/token.rs b/pomsky-syntax/src/lexer/token.rs index cb9e7f1..ce61b12 100644 --- a/pomsky-syntax/src/lexer/token.rs +++ b/pomsky-syntax/src/lexer/token.rs @@ -9,7 +9,11 @@ pub enum Token { /// `$` (end boundary) Dollar, /// `%` (`\b` boundary) - BWord, + Percent, + /// `<` (word start) + AngleLeft, + /// `>` (word end) + AngleRight, /// `*` (`*?` repetition) Star, @@ -42,28 +46,19 @@ pub enum Token { /// `[` (open character class) OpenBracket, - - /// `-` (unicode range) - Dash, - /// `]` (close character class) CloseBracket, + /// `-` (unicode range) + Dash, /// `.` (any code point except newline) Dot, /// `>>` (positive lookahead) LookAhead, - /// `<<` (positive lookbehind) LookBehind, - /// `<` (word start) - AngleLeft, - - /// `>` (word end) - AngleRight, - /// `::` (back reference) DoubleColon, @@ -100,7 +95,7 @@ impl core::fmt::Display for Token { f.write_str(match self { Token::Caret => "`^`", Token::Dollar => "`$`", - Token::BWord => "`%`", + Token::Percent => "`%`", Token::Star => "`*`", Token::Plus => "`+`", Token::QuestionMark => "`?`", diff --git a/pomsky-syntax/src/lexer/tokenize.rs b/pomsky-syntax/src/lexer/tokenize.rs index b25e128..7ea527c 100644 --- a/pomsky-syntax/src/lexer/tokenize.rs +++ b/pomsky-syntax/src/lexer/tokenize.rs @@ -42,6 +42,42 @@ macro_rules! reserved_word_pattern { ); } +static SINGLE_TOKEN_LOOKUP: [Option; 127] = const { + let mut table = [const { None }; 127]; + table[b'^' as usize] = Some(Token::Caret); + table[b'$' as usize] = Some(Token::Dollar); + table[b'%' as usize] = Some(Token::Percent); + table[b'<' as usize] = Some(Token::AngleLeft); + table[b'>' as usize] = Some(Token::AngleRight); + table[b'*' as usize] = Some(Token::Star); + table[b'+' as usize] = Some(Token::Plus); + table[b'?' as usize] = Some(Token::QuestionMark); + table[b'|' as usize] = Some(Token::Pipe); + table[b'&' as usize] = Some(Token::Ampersand); + table[b':' as usize] = Some(Token::Colon); + table[b')' as usize] = Some(Token::CloseParen); + table[b'{' as usize] = Some(Token::OpenBrace); + table[b'}' as usize] = Some(Token::CloseBrace); + table[b',' as usize] = Some(Token::Comma); + table[b'!' as usize] = Some(Token::Not); + table[b'[' as usize] = Some(Token::OpenBracket); + table[b']' as usize] = Some(Token::CloseBracket); + table[b'-' as usize] = Some(Token::Dash); + table[b'.' as usize] = Some(Token::Dot); + table[b';' as usize] = Some(Token::Semicolon); + table[b'=' as usize] = Some(Token::Equals); + table +}; + +fn lookup_single(c: char) -> Option { + let c = c as u32; + if c < 128 { + SINGLE_TOKEN_LOOKUP[c as usize] + } else { + None + } +} + pub(crate) fn tokenize(mut input: &str) -> Vec<(Token, Span)> { let mut result = vec![]; let mut offset = 0; @@ -64,28 +100,7 @@ pub(crate) fn tokenize(mut input: &str) -> Vec<(Token, Span)> { if input.starts_with("<<") => (2, Token::LookBehind); if input.starts_with("::") => (2, Token::DoubleColon); - if c == '^' => (1, Token::Caret); - if c == '$' => (1, Token::Dollar); - if c == '<' => (1, Token::AngleLeft); - if c == '>' => (1, Token::AngleRight); - if c == '%' => (1, Token::BWord); - if c == '*' => (1, Token::Star); - if c == '+' => (1, Token::Plus); - if c == '?' => (1, Token::QuestionMark); - if c == '|' => (1, Token::Pipe); - if c == '&' => (1, Token::Ampersand); - if c == ':' => (1, Token::Colon); - if c == ')' => (1, Token::CloseParen); - if c == '{' => (1, Token::OpenBrace); - if c == '}' => (1, Token::CloseBrace); - if c == ',' => (1, Token::Comma); - if c == '!' => (1, Token::Not); - if c == '[' => (1, Token::OpenBracket); - if c == '-' => (1, Token::Dash); - if c == ']' => (1, Token::CloseBracket); - if c == '.' => (1, Token::Dot); - if c == ';' => (1, Token::Semicolon); - if c == '=' => (1, Token::Equals); + if let Some(token) = lookup_single(c) => (1, token); if c == '\'' => match input[1..].find('\'') { Some(len_inner) => (len_inner + 2, Token::String), diff --git a/pomsky-syntax/src/parse/parser_impl.rs b/pomsky-syntax/src/parse/parser_impl.rs index 55a162a..799b866 100644 --- a/pomsky-syntax/src/parse/parser_impl.rs +++ b/pomsky-syntax/src/parse/parser_impl.rs @@ -17,7 +17,7 @@ use super::{helper, Parser}; type PResult = Result; -const MAX_REPETITION: u32 = 65_535; +const MAX_GROUP_NUMBER: u32 = 65_535; impl<'i> Parser<'i> { pub(super) fn parse_modified(&mut self) -> PResult { @@ -86,8 +86,7 @@ impl<'i> Parser<'i> { let span_start = self.last_span(); let setting = if self.consume_reserved("lazy") { BooleanSetting::Lazy - } else if let Some((Token::Identifier, "unicode")) = self.peek() { - self.advance(); + } else if self.consume_contextual_keyword("unicode") { BooleanSetting::Unicode } else { return Err(PEK::Expected("`lazy` or `unicode`").at(self.span())); @@ -154,8 +153,7 @@ impl<'i> Parser<'i> { let mut matches = Vec::new(); let mut literal = None; - if let Some((Token::Identifier, "in")) = self.peek() { - } else { + if self.peek() != Some((Token::Identifier, "in")) { matches.push(self.parse_test_match()?); while self.consume(Token::Comma) { matches.push(self.parse_test_match()?); @@ -246,27 +244,27 @@ impl<'i> Parser<'i> { let leading_pipe = self.consume(Token::Pipe); let mut alts = Vec::new(); - if let Some(first_alt) = self.parse_and()? { - alts.push(first_alt); - - while self.consume(Token::Pipe) { - if let Some(next_alt) = self.parse_and()? { - span = span.join(next_alt.span()); - alts.push(next_alt); - } else { - return Err(PEK::LonePipe.at(self.last_span())); - } - } - - if alts.len() == 1 { - Ok(alts.pop().unwrap()) + let Some(first_alt) = self.parse_and()? else { + if leading_pipe { + return Err(PEK::LonePipe.at(span)); } else { - Ok(Alternation::new_expr(alts)) + return Ok(Rule::Literal(Literal::new("".to_string(), Span::default()))); } - } else if leading_pipe { - Err(PEK::LonePipe.at(span)) + }; + alts.push(first_alt); + + while self.consume(Token::Pipe) { + let Some(next_alt) = self.parse_and()? else { + return Err(PEK::LonePipe.at(self.last_span())); + }; + span = span.join(next_alt.span()); + alts.push(next_alt); + } + + if alts.len() == 1 { + Ok(alts.pop().unwrap()) } else { - Ok(Alternation::new_expr(alts)) + Ok(Rule::Alternation(Alternation { rules: alts, span })) } } @@ -288,9 +286,8 @@ impl<'i> Parser<'i> { rules.push(first_sequence); loop { if !self.consume(Token::Ampersand) { - return Ok(Some( - Intersection::new_expr(rules, span_start).expect("intersection can't be empty"), - )); + let span = span_start.join(self.last_span()); + return Ok(Some(Rule::Intersection(Intersection { rules, span }))); } let Some(next_sequence) = self.parse_sequence()? else { @@ -516,13 +513,10 @@ impl<'i> Parser<'i> { } fn parse_literal(&mut self) -> PResult> { - if let Some(s) = self.consume_as(Token::String) { - let span = self.last_span(); - let content = helper::parse_quoted_text(s).map_err(|k| k.at(span))?; - Ok(Some(Literal::new(content.to_string(), span))) - } else { - Ok(None) - } + let Some(s) = self.consume_as(Token::String) else { return Ok(None) }; + let span = self.last_span(); + let content = helper::parse_quoted_text(s).map_err(|k| k.at(span))?; + Ok(Some(Literal::new(content.to_string(), span))) } /// Parses a char set, surrounded by `[` `]`. This was previously called a @@ -592,29 +586,29 @@ impl<'i> Parser<'i> { /// Parses an identifier or dot in a char set fn parse_char_group_ident(&mut self, negative: bool) -> PResult>> { - if self.consume(Token::Identifier) { - let span = self.last_span(); + if !self.consume(Token::Identifier) { + if let Some(name) = self.consume_as(Token::ReservedName) { + return Err(PEK::UnexpectedKeyword(name.to_owned()).at(self.last_span())); + } + return Ok(None); + } + let span = self.last_span(); - let before_colon = self.source_at(span); - let after_colon = if self.consume(Token::Colon) { - Some(self.expect_as(Token::Identifier)?) - } else { - None - }; - let (kind, name, span) = match after_colon { - Some(name) => (Some(before_colon), name, span.join(self.last_span())), - None => (None, before_colon, span), - }; + let before_colon = self.source_at(span); + let after_colon = if self.consume(Token::Colon) { + Some(self.expect_as(Token::Identifier)?) + } else { + None + }; + let (kind, name, span) = match after_colon { + Some(name) => (Some(before_colon), name, span.join(self.last_span())), + None => (None, before_colon, span), + }; - let item = CharGroup::try_from_group_name(kind, name, negative, span) - .map_err(|e| e.at(span))?; + let item = + CharGroup::try_from_group_name(kind, name, negative, span).map_err(|e| e.at(span))?; - Ok(Some(item)) - } else if let Some(name) = self.consume_as(Token::ReservedName) { - Err(PEK::UnexpectedKeyword(name.to_owned()).at(self.last_span())) - } else { - Ok(None) - } + Ok(Some(item)) } /// Parses a string literal or a character range in a char set, e.g. `"axd"` @@ -677,54 +671,42 @@ impl<'i> Parser<'i> { } fn parse_code_point(&mut self) -> PResult> { - if let Some(cp) = self.consume_as(Token::CodePoint) { - let span = self.last_span(); - let trimmed_u = cp[1..].trim_start(); - if !trimmed_u.starts_with('+') { - let warning = DeprecationWarning::Unicode(cp.into()); - self.add_warning(ParseWarningKind::Deprecation(warning).at(span)) - } + let Some(cp) = self.consume_as(Token::CodePoint) else { return Ok(None) }; + let span = self.last_span(); + let trimmed_u = cp[1..].trim_start(); + if !trimmed_u.starts_with('+') { + let warning = DeprecationWarning::Unicode(cp.into()); + self.add_warning(ParseWarningKind::Deprecation(warning).at(span)) + } - let hex = trimmed_u.trim_start_matches(|c: char| c == '+' || c.is_whitespace()); + let hex = trimmed_u.trim_start_matches(|c: char| c == '+' || c.is_whitespace()); - u32::from_str_radix(hex, 16) - .ok() - .and_then(|n| char::try_from(n).ok()) - .map(|c| Some((c, span))) - .ok_or_else(|| PEK::InvalidCodePoint.at(span)) - } else { - Ok(None) - } + u32::from_str_radix(hex, 16) + .ok() + .and_then(|n| char::try_from(n).ok()) + .map(|c| Some((c, span))) + .ok_or_else(|| PEK::InvalidCodePoint.at(span)) } fn parse_code_point_rule(&mut self) -> PResult> { - if let Some((c, span)) = self.parse_code_point()? { - Ok(Some(Rule::CharClass(CharClass::new( - vec![GroupItem::Char(c)], - span, - self.is_unicode_aware, - )))) - } else { - Ok(None) - } + let Some((c, span)) = self.parse_code_point()? else { return Ok(None) }; + let inner = vec![GroupItem::Char(c)]; + Ok(Some(Rule::CharClass(CharClass::new(inner, span, self.is_unicode_aware)))) } fn parse_special_char(&mut self) -> Option { - if let Some((Token::Identifier, string)) = self.peek() { - let c = match string { - "n" => '\n', - "r" => '\r', - "t" => '\t', - "a" => '\u{07}', - "e" => '\u{1B}', - "f" => '\u{0C}', - _ => return None, - }; - self.advance(); - Some(c) - } else { - None - } + let Some((Token::Identifier, string)) = self.peek() else { return None }; + let c = match string { + "n" => '\n', + "r" => '\r', + "t" => '\t', + "a" => '\u{07}', + "e" => '\u{1B}', + "f" => '\u{0C}', + _ => return None, + }; + self.advance(); + Some(c) } /// Parses a boundary. For start and end, there are two syntaxes: `^` and `$`. @@ -741,7 +723,7 @@ impl<'i> Parser<'i> { BoundaryKind::Start } else if self.consume(Token::Dollar) { BoundaryKind::End - } else if self.consume(Token::BWord) { + } else if self.consume(Token::Percent) { BoundaryKind::Word } else if self.consume(Token::AngleLeft) { BoundaryKind::WordStart @@ -766,7 +748,7 @@ impl<'i> Parser<'i> { let num = self.expect_number::()?; // negating from positive to negative can't overflow, luckily ReferenceTarget::Relative(-num) - } else if let Some(num) = self.consume_number(MAX_REPETITION)? { + } else if let Some(num) = self.consume_number(MAX_GROUP_NUMBER)? { ReferenceTarget::Number(num) } else { // TODO: Better diagnostic for `::let` @@ -857,16 +839,13 @@ impl<'i> Parser<'i> { /// Parses a variable (usage site). fn parse_variable(&mut self) -> PResult> { - if let Some(ident) = self.consume_as(Token::Identifier) { - let span1 = self.last_span(); - let rule = Rule::Variable(Variable::new(ident, span1)); - if let Some((Token::Equals, span2)) = self.peek_pair() { - return Err(PEK::MissingLetKeyword.at(span1.join(span2))); - } - Ok(Some(rule)) - } else { - Ok(None) + let Some(ident) = self.consume_as(Token::Identifier) else { return Ok(None) }; + let span1 = self.last_span(); + let rule = Rule::Variable(Variable::new(ident, span1)); + if let Some((Token::Equals, span2)) = self.peek_pair() { + return Err(PEK::MissingLetKeyword.at(span1.join(span2))); } + Ok(Some(rule)) } /// Parses the dot