Skip to content

Commit

Permalink
feat: character class prefixes
Browse files Browse the repository at this point in the history
  • Loading branch information
Aloso committed Nov 28, 2024
1 parent 5c08286 commit 4073e1a
Show file tree
Hide file tree
Showing 27 changed files with 281 additions and 38 deletions.
4 changes: 4 additions & 0 deletions pomsky-lib/src/diagnose/diagnostic_code.rs
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,8 @@ diagnostic_code! {
MissingKeyword = 120,
MultipleStringsInTestCase = 121,
RangeLeadingZeroesVariableLength = 122,
UnexpectedCharClassPrefix = 123,
WrongCharClassPrefix = 124,

// Currently a parse error, but it should be a compile error
LetBindingExists = 300,
Expand Down Expand Up @@ -160,6 +162,8 @@ impl<'a> From<&'a CharClassError> for DiagnosticCode {
E::Unallowed => Self::CharClassUnallowedCombination,
E::UnknownNamedClass { .. } => Self::CharClassUnknownShorthand,
E::Negative => Self::CharClassIllegalNegation,
E::UnexpectedPrefix => Self::UnexpectedCharClassPrefix,
E::WrongPrefix { .. } => Self::WrongCharClassPrefix,
_ => panic!("Unhandled char class error message {value:?}"),
}
}
Expand Down
3 changes: 3 additions & 0 deletions pomsky-lib/src/diagnose/feature.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ pub enum Feature {
UnicodeWordBoundaries,
/// Word start and word end are not supported in RE2
WordStartEnd,
/// Unicode script extensions, e.g. `[scx:Greek]`
ScriptExtensions,
}

impl Feature {
Expand All @@ -67,6 +69,7 @@ impl Feature {
Feature::Recursion => "recursion",
Feature::UnicodeWordBoundaries => "word boundaries in Unicode mode",
Feature::WordStartEnd => "word start and word end",
Feature::ScriptExtensions => "Unicode script extensions",
}
}
}
36 changes: 33 additions & 3 deletions pomsky-lib/src/exprs/char_class.rs
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,10 @@ use crate::{
};

use pomsky_syntax::{
exprs::{Category, CharClass, CodeBlock, GroupItem, GroupName, OtherProperties, Script},
exprs::{
Category, CharClass, CodeBlock, GroupItem, GroupName, OtherProperties, Script,
ScriptExtension,
},
Span,
};

Expand Down Expand Up @@ -348,7 +351,7 @@ fn named_class_to_regex_unicode(
}
set.add_prop(RegexProperty::Category(c).negative_item(negative));
}
GroupName::Script(s) => {
GroupName::Script(s, e) => {
if flavor == RegexFlavor::DotNet {
return Err(CompileErrorKind::Unsupported(Feature::UnicodeScript, flavor).at(span));
}
Expand All @@ -357,7 +360,33 @@ fn named_class_to_regex_unicode(
{
return Err(CompileErrorKind::unsupported_specific_prop_in(flavor).at(span));
}
set.add_prop(RegexProperty::Script(s).negative_item(negative));

let set_extensions = match e {
ScriptExtension::Yes => match flavor {
RegexFlavor::Rust | RegexFlavor::Pcre | RegexFlavor::JavaScript => {
ScriptExtension::Yes
}
RegexFlavor::Java
| RegexFlavor::DotNet
| RegexFlavor::Ruby
| RegexFlavor::Python
| RegexFlavor::RE2 => {
return Err(CompileErrorKind::Unsupported(
Feature::ScriptExtensions,
flavor,
)
.at(span))
}
},
ScriptExtension::No => match flavor {
// PCRE is currently the only flavor where `\p{Greek}` is the same as `\p{scx=Greek}`,
// so the plain script has to be requested explicitly there
RegexFlavor::Pcre => ScriptExtension::No,
_ => ScriptExtension::Unspecified,
},
_ => ScriptExtension::Unspecified,
};

set.add_prop(RegexProperty::Script(s, set_extensions).negative_item(negative));
}
GroupName::CodeBlock(b) => match flavor {
RegexFlavor::DotNet | RegexFlavor::Java | RegexFlavor::Ruby => {
Expand Down Expand Up @@ -514,6 +543,7 @@ impl fmt::Debug for RegexCharSetItem {
if negative {
f.write_str("!")?;
}
f.write_str(value.prefix_as_str())?;
f.write_str(value.as_str())
}
}
Expand Down
21 changes: 16 additions & 5 deletions pomsky-lib/src/regex/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ use std::borrow::Borrow;
use pomsky_syntax::{
exprs::{
BoundaryKind, Category, CodeBlock, LookaroundKind, OtherProperties, RepetitionKind, Script,
ScriptExtension,
},
Span,
};
Expand Down Expand Up @@ -206,7 +207,7 @@ impl RegexShorthand {
/// A named property that can be placed in a character set
/// (it is wrapped into `RegexCharSetItem::Property` by `negative_item`).
///
/// Each variant holds the syntax-level value it was created from: a general
/// category, a script, a code block, or another property.
#[cfg_attr(feature = "dbg", derive(Debug))]
pub(crate) enum RegexProperty {
Category(Category),
Script(Script),
Script(Script, ScriptExtension),
Block(CodeBlock),
Other(OtherProperties),
}
Expand All @@ -215,12 +216,20 @@ impl RegexProperty {
/// Returns the static string form of the wrapped value.
///
/// Every arm simply delegates to `as_str()` of the inner category, script,
/// block or other property; for scripts the extension marker is ignored here.
pub fn as_str(&self) -> &'static str {
match self {
RegexProperty::Category(c) => c.as_str(),
RegexProperty::Script(s) => s.as_str(),
RegexProperty::Script(s, _) => s.as_str(),
RegexProperty::Block(b) => b.as_str(),
RegexProperty::Other(o) => o.as_str(),
}
}

/// Returns the prefix that belongs in front of this property's name.
///
/// Only scripts have a prefix: `"sc:"` when the script was marked
/// `ScriptExtension::No`, `"scx:"` when it was marked `ScriptExtension::Yes`.
/// Every other property, and a script with any other extension marker,
/// yields the empty string.
pub fn prefix_as_str(&self) -> &'static str {
    if let RegexProperty::Script(_, extension) = self {
        match extension {
            ScriptExtension::No => "sc:",
            ScriptExtension::Yes => "scx:",
            _ => "",
        }
    } else {
        ""
    }
}

/// Consumes this property and wraps it in a `RegexCharSetItem::Property`,
/// storing `negative` alongside it unchanged.
pub(crate) fn negative_item(self, negative: bool) -> RegexCharSetItem {
    let value = self;
    RegexCharSetItem::Property { negative, value }
}
Expand Down Expand Up @@ -427,9 +436,11 @@ impl RegexProperty {
RegexProperty::Category(c) => {
buf.push_str(c.as_str());
}
RegexProperty::Script(s) => {
if let RegexFlavor::JavaScript | RegexFlavor::Java = flavor {
buf.push_str("sc=");
RegexProperty::Script(s, e) => {
if matches!(flavor, RegexFlavor::JavaScript | RegexFlavor::Java)
|| e != ScriptExtension::Unspecified
{
buf.push_str(if let ScriptExtension::Yes = e { "scx=" } else { "sc=" });
}
buf.push_str(s.as_str());
}
Expand Down
21 changes: 16 additions & 5 deletions pomsky-lib/tests/it/diff.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,24 @@ pub fn simple_diff<'a>(left: &'a str, right: &'a str) -> (&'a str, &'a str, &'a
if left == right {
return (left, "", "", "");
}
let ((prefix_len, _), _) =
left.char_indices().zip(right.chars()).find(|&((_, a), b)| a != b).unwrap();

let ((left_last_idx, _), _) =
left.char_indices().rev().zip(right.chars().rev()).find(|&((_, a), b)| a != b).unwrap();
let min = left.len().min(right.len());

let prefix_len = left
.char_indices()
.zip(right.chars())
.find(|&((_, a), b)| a != b)
.map(|((n, _), _)| n)
.unwrap_or(min);

let suffix_len = left
.char_indices()
.rev()
.zip(right.chars().rev())
.find(|&((_, a), b)| a != b)
.map(|((n, _), _)| left.len() - n - 1)
.unwrap_or(min);

let suffix_len = left.len() - left_last_idx - 1;
let suffix_len = suffix_len.min(left.len() - prefix_len).min(right.len() - prefix_len);

let prefix = &left[..prefix_len];
Expand Down
5 changes: 5 additions & 0 deletions pomsky-lib/tests/testcases/prefixes/blk_dotnet.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#! flavor=DotNet
[blk:Greek]

-----
\p{IsGreekandCoptic}
5 changes: 5 additions & 0 deletions pomsky-lib/tests/testcases/prefixes/blk_ruby.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#! flavor=Ruby
[blk:Greek]

-----
\p{InGreek_And_Coptic}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#! expect=error
[gcd:Alphabetic]

-----
ERROR: This character class cannot have a prefix
SPAN: 1..15
7 changes: 7 additions & 0 deletions pomsky-lib/tests/testcases/prefixes/error_wrong_prefix.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#! expect=error
[gc:InBasic_Latin]

-----
ERROR: This character class has the wrong prefix; it should be block (blk),
and the `In` at the start should be removed
SPAN: 1..17
5 changes: 5 additions & 0 deletions pomsky-lib/tests/testcases/prefixes/gc_dotnet.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#! flavor=.NET
[gc:L]

-----
\p{L}
5 changes: 5 additions & 0 deletions pomsky-lib/tests/testcases/prefixes/gc_js.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#! flavor=JavaScript
[gc:L]

-----
\p{L}
5 changes: 5 additions & 0 deletions pomsky-lib/tests/testcases/prefixes/gc_ruby.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#! flavor=Ruby
[gc:L]

-----
\pL
5 changes: 5 additions & 0 deletions pomsky-lib/tests/testcases/prefixes/script_java.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#! flavor=Java
[sc:Greek]

-----
\p{sc=Greek}
5 changes: 5 additions & 0 deletions pomsky-lib/tests/testcases/prefixes/script_js.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#! flavor=JavaScript
[sc:Greek]

-----
\p{sc=Greek}
5 changes: 5 additions & 0 deletions pomsky-lib/tests/testcases/prefixes/script_pcre.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#! flavor=Pcre
[sc:Greek]

-----
\p{sc=Greek}
5 changes: 5 additions & 0 deletions pomsky-lib/tests/testcases/prefixes/script_rs.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#! flavor=Rust
[sc:Greek]

-----
\p{Greek}
6 changes: 6 additions & 0 deletions pomsky-lib/tests/testcases/prefixes/scx_java.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#! expect=error, flavor=Java
[scx:Greek]

-----
ERROR: Unsupported feature `Unicode script extensions` in the `Java` regex flavor
SPAN: 1..10
5 changes: 5 additions & 0 deletions pomsky-lib/tests/testcases/prefixes/scx_pcre.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#! flavor=Pcre
[scx:Greek]

-----
\p{scx=Greek}
5 changes: 5 additions & 0 deletions pomsky-lib/tests/testcases/prefixes/scx_rs.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#! flavor=Rust
[scx:Greek]

-----
\p{scx=Greek}
15 changes: 8 additions & 7 deletions pomsky-syntax/DotNetSupportedBlocks.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,14 @@ BopomofoExtended
BoxDrawing
BraillePatterns
Buhid
Cherokee
CJKCompatibility
CJKCompatibilityForms
CJKCompatibilityIdeographs
CJKRadicalsSupplement
CJKSymbolsandPunctuation
CJKUnifiedIdeographs
CJKUnifiedIdeographsExtensionA
Cherokee
CombiningDiacriticalMarks
CombiningDiacriticalMarksforSymbols
CombiningHalfMarks
Expand All @@ -37,8 +37,8 @@ GeneralPunctuation
GeometricShapes
Georgian
Greek
GreekandCoptic
GreekExtended
GreekandCoptic
Gujarati
Gurmukhi
HalfwidthandFullwidthForms
Expand All @@ -50,8 +50,9 @@ Hebrew
HighPrivateUseSurrogates
HighSurrogates
Hiragana
IdeographicDescriptionCharacters
IPAExtensions
IdeographicDescriptionCharacters
IsPrivateUseArea
Kanbun
KangxiRadicals
Kannada
Expand All @@ -62,8 +63,8 @@ KhmerSymbols
Lao
Latin-1Supplement
LatinExtended-A
LatinExtendedAdditional
LatinExtended-B
LatinExtendedAdditional
LetterlikeSymbols
Limbu
LowSurrogates
Expand All @@ -81,7 +82,7 @@ Ogham
OpticalCharacterRecognition
Oriya
PhoneticExtensions
PrivateUse or IsPrivateUseArea
PrivateUse
Runic
Sinhala
SmallFormVariants
Expand All @@ -102,6 +103,6 @@ Thai
Tibetan
UnifiedCanadianAboriginalSyllabics
VariationSelectors
YijingHexagramSymbols
YiRadicals
YiSyllables
YiSyllables
YijingHexagramSymbols
2 changes: 1 addition & 1 deletion pomsky-syntax/build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ fn generate_unicode_data() {
let canonical = script[1];
for &name in script {
if !distinct_cache.contains(&(name, false)) {
lut.push(format!("(\"{name}\", GroupName::Script(Script::{canonical}))"));
lut.push(format!("(\"{name}\", GroupName::Script(Script::{canonical}, ScriptExtension::Unspecified))"));
distinct_cache.insert((name, false));
}
}
Expand Down
18 changes: 18 additions & 0 deletions pomsky-syntax/src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,10 @@ pub enum CharClassError {
},
/// A character class that can't be negated, e.g. `[!ascii]`
Negative,
/// The character class has a prefix where none is expected, e.g. `[scx:w]`
UnexpectedPrefix,
/// The character class has the wrong prefix, e.g. `[sc:Basic_Latin]` (the correct prefix would be `block:`)
WrongPrefix { expected: &'static str, has_in_prefix: bool },
}

// Empty impl: none of the `Error` trait's methods are overridden, so the defaults apply.
impl std::error::Error for CharClassError {}
Expand All @@ -243,6 +247,20 @@ impl core::fmt::Display for CharClassError {
write!(f, "Unknown character class `{found}`")
}
CharClassError::Negative => write!(f, "This character class can't be negated"),
CharClassError::UnexpectedPrefix => {
write!(f, "This character class cannot have a prefix")
}
&CharClassError::WrongPrefix { expected, has_in_prefix } => {
if has_in_prefix {
write!(
f,
"This character class has the wrong prefix; it should be {expected},\n\
and the `In` at the start should be removed"
)
} else {
write!(f, "This character class has the wrong prefix; it should be {expected}")
}
}
}
}
}
Expand Down
Loading

0 comments on commit 4073e1a

Please sign in to comment.