Skip to content

Commit

Permalink
refactor: simplify range code, fix bug in regex-test
Browse files Browse the repository at this point in the history
  • Loading branch information
Aloso committed Nov 25, 2024
1 parent e5ecc50 commit 25f0a6a
Show file tree
Hide file tree
Showing 17 changed files with 87 additions and 87 deletions.
22 changes: 11 additions & 11 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions pomsky-bin/src/test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ fn check_test_match(regex: &Regex, test_case: TestCaseMatch, errors: &mut Vec<Di
let result = regex.captures(test_case.literal.content.as_bytes());
match result {
Ok(Some(captures)) => {
if captures[0].len() != test_case.literal.content.as_bytes().len() {
if captures[0].len() != test_case.literal.content.len() {
errors.push(Diagnostic::test_failure(
test_case.literal.span,
DiagnosticCode::TestNoExactMatch,
Expand Down Expand Up @@ -177,7 +177,7 @@ fn check_test_reject(regex: &Regex, test_case: TestCaseReject, errors: &mut Vec<
let result = regex.captures(test_case.literal.content.as_bytes());
match result {
Ok(Some(captures)) => {
let is_exact = captures[0].len() == test_case.literal.content.as_bytes().len();
let is_exact = captures[0].len() == test_case.literal.content.len();
if test_case.as_substring || is_exact {
let actual_value;
let (code, actual_value) = if is_exact || !test_case.as_substring {
Expand Down
22 changes: 11 additions & 11 deletions pomsky-lib/afl-fuzz/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

18 changes: 18 additions & 0 deletions pomsky-lib/afl-fuzz/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,21 @@ When you found a crash, you might find it in `errors.txt`. If it's not in `error
## Report the bug

Please report the bug [here](https://github.com/pomsky-lang/pomsky/issues). If you think it could be a security vulnerability, please disclose it directly by email: [email protected].

## Latest findings

### PCRE

- Lookbehind cannot contain unbounded repetitions.
- Bounded repetitions need an upper bound of _at most_ 255. I.e. `(?<=a{4,255})` is ok.
- Nested repetitions reach the limit quicker (TBD)
- Lookbehind cannot contain `\X`

### Ruby

- Lookaround cannot contain capturing groups

### Python

- Lookbehind requires fixed-width pattern
- Cannot refer to open capturing group
26 changes: 3 additions & 23 deletions pomsky-lib/src/exprs/range.rs
Original file line number Diff line number Diff line change
Expand Up @@ -469,34 +469,14 @@ impl Class {
let (a, b) = (self.start, self.end);
let mut set = UnicodeSet::new();

match (a, b, a == b) {
(0..=9, _, true) => return Regex::Char((a + b'0') as char),
(0..=9, 0..=9, _) => {
match (a, b) {
(0..=9, 0..=9) => {
set.add_range_unchecked((a + b'0') as char..=(b + b'0') as char);
}
(10.., _, true) => {
set.add_char_unchecked((a + b'a' - 10) as char);
set.add_char_unchecked((a + b'A' - 10) as char);
}
(10.., 10.., _) => {
(10.., 10..) => {
set.add_range_unchecked((a + b'a' - 10) as char..=(b + b'a' - 10) as char);
set.add_range_unchecked((a + b'A' - 10) as char..=(b + b'A' - 10) as char);
}
(9, 10, _) => {
set.add_char_unchecked('9');
set.add_char_unchecked('a');
set.add_char_unchecked('A');
}
(_, 10, _) => {
set.add_range_unchecked((a + b'0') as char..='9');
set.add_char_unchecked('a');
set.add_char_unchecked('A');
}
(9, _, _) => {
set.add_char_unchecked('9');
set.add_range_unchecked('a'..=(b + b'a' - 10) as char);
set.add_range_unchecked('A'..=(b + b'A' - 10) as char);
}
_ => {
set.add_range_unchecked((a + b'0') as char..='9');
set.add_range_unchecked('a'..=(b + b'a' - 10) as char);
Expand Down
10 changes: 0 additions & 10 deletions pomsky-lib/src/regex/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,6 @@ pub(crate) enum Regex {
Literal(String),
/// A regex string that is inserted verbatim into the output
Unescaped(String),
/// A literal char
Char(char),
/// A character class, delimited with square brackets
CharSet(RegexCharSet),
/// A Unicode grapheme
Expand Down Expand Up @@ -66,7 +64,6 @@ impl Regex {
match self {
Regex::Literal(str) => Ok(Some(str.chars().count() as u32)),
Regex::Unescaped(_) => Ok(None),
Regex::Char(_) => Ok(Some(1)),
Regex::CharSet(_) => Ok(Some(1)),
Regex::Grapheme => Err(CompileErrorKind::UnsupportedInLookbehind {
flavor: RegexFlavor::Python,
Expand Down Expand Up @@ -115,7 +112,6 @@ impl Regex {
match self {
Regex::Literal(_) => Ok(()),
Regex::Unescaped(_) => Ok(()),
Regex::Char(_) => Ok(()),
Regex::CharSet(_) => Ok(()),
Regex::Grapheme => Err(CompileErrorKind::UnsupportedInLookbehind {
flavor: RegexFlavor::Pcre,
Expand Down Expand Up @@ -241,7 +237,6 @@ impl Regex {
}
Ok(Regex::CharSet(RegexCharSet::new(c.into()).negate()))
}
Regex::Char(c) => Ok(Regex::CharSet(RegexCharSet::new(c.into()).negate())),
Regex::CharSet(s) => Ok(Regex::CharSet(s.negate())),
Regex::Boundary(b) => match b {
BoundaryKind::Word => Ok(Regex::Boundary(BoundaryKind::NotWord)),
Expand Down Expand Up @@ -309,9 +304,6 @@ impl Regex {
Regex::Unescaped(u) => {
buf.push_str(u);
}
&Regex::Char(c) => {
literal::codegen_char_esc(c, buf, flavor);
}
Regex::CharSet(c) => c.codegen(buf, flavor),
Regex::Grapheme => buf.push_str("\\X"),
Regex::Dot => buf.push('.'),
Expand All @@ -330,7 +322,6 @@ impl Regex {
Regex::Alternation(_) => true,
Regex::Literal(_)
| Regex::Unescaped(_)
| Regex::Char(_)
| Regex::Group(_)
| Regex::CharSet(_)
| Regex::Grapheme
Expand All @@ -353,7 +344,6 @@ impl Regex {
| Regex::Unescaped(_) => true,
Regex::Lookaround(_) => matches!(flavor, RegexFlavor::JavaScript),
Regex::CharSet(_)
| Regex::Char(_)
| Regex::Grapheme
| Regex::Reference(_)
| Regex::Dot
Expand Down
3 changes: 1 addition & 2 deletions pomsky-lib/src/regex/optimize.rs
Original file line number Diff line number Diff line change
Expand Up @@ -107,8 +107,7 @@ impl Regex {
Count::One
}
Regex::Unescaped(_) => Count::Many,
Regex::Char(_)
| Regex::CharSet(_)
Regex::CharSet(_)
| Regex::Grapheme
| Regex::Dot
| Regex::Boundary(_)
Expand Down
4 changes: 0 additions & 4 deletions pomsky-lib/src/unicode_set.rs
Original file line number Diff line number Diff line change
Expand Up @@ -114,10 +114,6 @@ impl UnicodeSet {
self.ranges.insert(SetRange { first: *range.start() as u32, last: *range.end() as u32 });
}

pub fn add_char_unchecked(&mut self, char: char) {
self.ranges.insert(SetRange::single(char as u32));
}

fn add(&mut self, mut range_new: SetRange) {
let lower = SetRange::single(range_new.first.saturating_sub(1));
let upper = SetRange::single(range_new.last.saturating_add(1));
Expand Down
4 changes: 3 additions & 1 deletion pomsky-syntax/src/exprs/reference.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,9 @@ impl Reference {
ReferenceTarget::Named(n) => buf.write(n),
ReferenceTarget::Number(i) => buf.write_fmt(i),
&ReferenceTarget::Relative(o) => {
buf.push(if o < 0 { '-' } else { '+' });
if o >= 0 {
buf.push('+');
}
buf.write_fmt(o);
}
}
Expand Down
4 changes: 2 additions & 2 deletions regex-test/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,6 @@ publish = false
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
regex = "1.10.2"
pcre2 = "0.2.5"
regex = "1.11.1"
pcre2 = "0.2.9"
onig = "6.4.0"
6 changes: 5 additions & 1 deletion regex-test/dotnet/TesterAsync.cs
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,12 @@ public static void Main(string[] args) {
string testLine;

while ((line = Console.ReadLine()) != null) {
if (!line.StartsWith("REGEX:")) {
continue;
}

try {
var r = new Regex(line, RegexOptions.Compiled);
var r = new Regex(line.Substring(6), RegexOptions.Compiled);
Console.WriteLine("success");

while ((testLine = Console.ReadLine()) != null && testLine.StartsWith("TEST:")) {
Expand Down
6 changes: 5 additions & 1 deletion regex-test/java/TesterAsync.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,12 @@ public static void main(String[] args) {
try (Scanner input = new Scanner(System.in)) {
while (input.hasNext()) {
String regex = input.nextLine();
if (!regex.startsWith("REGEX:")) {
continue;
}

try {
Pattern p = Pattern.compile(regex);
Pattern p = Pattern.compile("(?U)" + regex.substring(6));
System.out.printf("success\n");

while (input.hasNext()) {
Expand Down
10 changes: 6 additions & 4 deletions regex-test/js/tester-deno-async.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,12 @@ let regex

for await (const line of readLines(Deno.stdin)) {
if (regex === undefined) {
if (!line.startsWith('REGEX:')) {
continue
}

try {
regex = new RegExp(line, 'u')
regex = new RegExp(line.slice(6), 'u')
console.log('success')
} catch (e) {
console.log(substituteLf(e.message))
Expand All @@ -17,9 +21,7 @@ for await (const line of readLines(Deno.stdin)) {
if (regex.test(test)) {
console.log('test good')
} else {
console.log(
substituteLf(`Regex '${regex.source}' does not match '${test}'`)
)
console.log(substituteLf(`Regex '${regex.source}' does not match '${test}'`))
regex = undefined
}
} else {
Expand Down
10 changes: 6 additions & 4 deletions regex-test/js/tester-node-async.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,12 @@ let regex

rl.on('line', (line) => {
if (regex === undefined) {
if (!line.startsWith('REGEX:')) {
return
}

try {
regex = new RegExp(line, 'u')
regex = new RegExp(line.slice(6), 'u')
console.log('success')
} catch (e) {
console.log(substituteLf(e.message))
Expand All @@ -23,9 +27,7 @@ rl.on('line', (line) => {
if (regex.test(test)) {
console.log('test good')
} else {
console.log(
substituteLf(`Regex '${regex.source}' does not match '${test}'`)
)
console.log(substituteLf(`Regex '${regex.source}' does not match '${test}'`))
regex = undefined
}
} else {
Expand Down
Loading

0 comments on commit 25f0a6a

Please sign in to comment.