From 25f0a6aa1378f3dfd44626b37ce6a3058a53ce02 Mon Sep 17 00:00:00 2001 From: Ludwig Stecher Date: Tue, 26 Nov 2024 00:44:35 +0100 Subject: [PATCH] refactor: simplify range code, fix bug in regex-test --- Cargo.lock | 22 +++++++++++----------- pomsky-bin/src/test.rs | 4 ++-- pomsky-lib/afl-fuzz/Cargo.lock | 22 +++++++++++----------- pomsky-lib/afl-fuzz/README.md | 18 ++++++++++++++++++ pomsky-lib/src/exprs/range.rs | 26 +++----------------------- pomsky-lib/src/regex/mod.rs | 10 ---------- pomsky-lib/src/regex/optimize.rs | 3 +-- pomsky-lib/src/unicode_set.rs | 4 ---- pomsky-syntax/src/exprs/reference.rs | 4 +++- regex-test/Cargo.toml | 4 ++-- regex-test/dotnet/TesterAsync.cs | 6 +++++- regex-test/java/TesterAsync.java | 6 +++++- regex-test/js/tester-deno-async.js | 10 ++++++---- regex-test/js/tester-node-async.js | 10 ++++++---- regex-test/python/tester_async.py | 5 ++++- regex-test/src/sync/mod.rs | 16 ++++++++-------- regex-test/src/sync/process.rs | 4 ++-- 17 files changed, 87 insertions(+), 87 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 95eaebd..c4605d0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,6 +1,6 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. 
-version = 3 +version = 4 [[package]] name = "aho-corasick" @@ -594,9 +594,9 @@ checksum = "c1b04fb49957986fdce4d6ee7a65027d55d4b6d2265e5848bbb507b58ccfdb6f" [[package]] name = "pcre2" -version = "0.2.5" +version = "0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9deb1d02d6a373ee392128ba86087352a986359f32a106e2e3b08cc90cc659c9" +checksum = "3be55c43ac18044541d58d897e8f4c55157218428953ebd39d86df3ba0286b2b" dependencies = [ "libc", "log", @@ -605,9 +605,9 @@ dependencies = [ [[package]] name = "pcre2-sys" -version = "0.2.6" +version = "0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae234f441970dbd52d4e29bee70f3b56ca83040081cb2b55b7df772b16e0b06e" +checksum = "550f5d18fb1b90c20b87e161852c10cde77858c3900c5059b5ad2a1449f11d8a" dependencies = [ "cc", "libc", @@ -786,9 +786,9 @@ dependencies = [ [[package]] name = "regex" -version = "1.10.2" +version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "380b951a9c5e80ddfd6136919eef32310721aa4aacd4889a8d39124b026ab343" +checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" dependencies = [ "aho-corasick", "memchr", @@ -798,9 +798,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.3" +version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f804c7828047e88b2d32e2d7fe5a105da8ee3264f01902f796c8e067dc2483f" +checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" dependencies = [ "aho-corasick", "memchr", @@ -815,9 +815,9 @@ checksum = "30b661b2f27137bdbc16f00eda72866a92bb28af1753ffbd56744fb6e2e9cd8e" [[package]] name = "regex-syntax" -version = "0.8.2" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" +checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" [[package]] name = 
"regex-test" diff --git a/pomsky-bin/src/test.rs b/pomsky-bin/src/test.rs index 6b16e9d..7857246 100644 --- a/pomsky-bin/src/test.rs +++ b/pomsky-bin/src/test.rs @@ -57,7 +57,7 @@ fn check_test_match(regex: &Regex, test_case: TestCaseMatch, errors: &mut Vec { - if captures[0].len() != test_case.literal.content.as_bytes().len() { + if captures[0].len() != test_case.literal.content.len() { errors.push(Diagnostic::test_failure( test_case.literal.span, DiagnosticCode::TestNoExactMatch, @@ -177,7 +177,7 @@ fn check_test_reject(regex: &Regex, test_case: TestCaseReject, errors: &mut Vec< let result = regex.captures(test_case.literal.content.as_bytes()); match result { Ok(Some(captures)) => { - let is_exact = captures[0].len() == test_case.literal.content.as_bytes().len(); + let is_exact = captures[0].len() == test_case.literal.content.len(); if test_case.as_substring || is_exact { let actual_value; let (code, actual_value) = if is_exact || !test_case.as_substring { diff --git a/pomsky-lib/afl-fuzz/Cargo.lock b/pomsky-lib/afl-fuzz/Cargo.lock index 15e0ea6..40f19b7 100644 --- a/pomsky-lib/afl-fuzz/Cargo.lock +++ b/pomsky-lib/afl-fuzz/Cargo.lock @@ -1,6 +1,6 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. 
-version = 3 +version = 4 [[package]] name = "afl" @@ -233,9 +233,9 @@ dependencies = [ [[package]] name = "pcre2" -version = "0.2.5" +version = "0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9deb1d02d6a373ee392128ba86087352a986359f32a106e2e3b08cc90cc659c9" +checksum = "3be55c43ac18044541d58d897e8f4c55157218428953ebd39d86df3ba0286b2b" dependencies = [ "libc", "log", @@ -244,9 +244,9 @@ dependencies = [ [[package]] name = "pcre2-sys" -version = "0.2.6" +version = "0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae234f441970dbd52d4e29bee70f3b56ca83040081cb2b55b7df772b16e0b06e" +checksum = "550f5d18fb1b90c20b87e161852c10cde77858c3900c5059b5ad2a1449f11d8a" dependencies = [ "cc", "libc", @@ -300,9 +300,9 @@ dependencies = [ [[package]] name = "regex" -version = "1.10.2" +version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "380b951a9c5e80ddfd6136919eef32310721aa4aacd4889a8d39124b026ab343" +checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" dependencies = [ "aho-corasick", "memchr", @@ -312,9 +312,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.3" +version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f804c7828047e88b2d32e2d7fe5a105da8ee3264f01902f796c8e067dc2483f" +checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" dependencies = [ "aho-corasick", "memchr", @@ -323,9 +323,9 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.8.2" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" +checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" [[package]] name = "regex-test" diff --git a/pomsky-lib/afl-fuzz/README.md b/pomsky-lib/afl-fuzz/README.md index 46eb335..4f240dc 100644 --- 
a/pomsky-lib/afl-fuzz/README.md +++ b/pomsky-lib/afl-fuzz/README.md @@ -31,3 +31,21 @@ When you found a crash, you might find it in `errors.txt`. If it's not in `error ## Report the bug Please report the bug [here](https://github.com/pomsky-lang/pomsky/issues). If you think it could be a security vulnerability, please disclose it directly per email: ludwig.stecher@gmx.de. + +## Latest findings + +### PCRE + +- Lookbehind cannot contain unbounded repetitions. + - Bounded repetitions need an upper bound of _at most_ 255. I.e. `(?<=a{4,255})` is ok. + - Nested repetitions reach the limit quicker (TBD) +- Lookbehind cannot contain `\X` + +### Ruby + +- Lookaround cannot contain capturing groups + +### Python + +- Lookbehind requires fixed-width pattern +- Cannot refer to open capturing group diff --git a/pomsky-lib/src/exprs/range.rs b/pomsky-lib/src/exprs/range.rs index 62f7414..f7dee79 100644 --- a/pomsky-lib/src/exprs/range.rs +++ b/pomsky-lib/src/exprs/range.rs @@ -469,34 +469,14 @@ impl Class { let (a, b) = (self.start, self.end); let mut set = UnicodeSet::new(); - match (a, b, a == b) { - (0..=9, _, true) => return Regex::Char((a + b'0') as char), - (0..=9, 0..=9, _) => { + match (a, b) { + (0..=9, 0..=9) => { set.add_range_unchecked((a + b'0') as char..=(b + b'0') as char); } - (10.., _, true) => { - set.add_char_unchecked((a + b'a' - 10) as char); - set.add_char_unchecked((a + b'A' - 10) as char); - } - (10.., 10.., _) => { + (10.., 10..) 
=> { set.add_range_unchecked((a + b'a' - 10) as char..=(b + b'a' - 10) as char); set.add_range_unchecked((a + b'A' - 10) as char..=(b + b'A' - 10) as char); } - (9, 10, _) => { - set.add_char_unchecked('9'); - set.add_char_unchecked('a'); - set.add_char_unchecked('A'); - } - (_, 10, _) => { - set.add_range_unchecked((a + b'0') as char..='9'); - set.add_char_unchecked('a'); - set.add_char_unchecked('A'); - } - (9, _, _) => { - set.add_char_unchecked('9'); - set.add_range_unchecked('a'..=(b + b'a' - 10) as char); - set.add_range_unchecked('A'..=(b + b'A' - 10) as char); - } _ => { set.add_range_unchecked((a + b'0') as char..='9'); set.add_range_unchecked('a'..=(b + b'a' - 10) as char); diff --git a/pomsky-lib/src/regex/mod.rs b/pomsky-lib/src/regex/mod.rs index 91919bb..6b67799 100644 --- a/pomsky-lib/src/regex/mod.rs +++ b/pomsky-lib/src/regex/mod.rs @@ -34,8 +34,6 @@ pub(crate) enum Regex { Literal(String), /// A regex string that is inserted verbatim into the output Unescaped(String), - /// A literal char - Char(char), /// A character class, delimited with square brackets CharSet(RegexCharSet), /// A Unicode grapheme @@ -66,7 +64,6 @@ impl Regex { match self { Regex::Literal(str) => Ok(Some(str.chars().count() as u32)), Regex::Unescaped(_) => Ok(None), - Regex::Char(_) => Ok(Some(1)), Regex::CharSet(_) => Ok(Some(1)), Regex::Grapheme => Err(CompileErrorKind::UnsupportedInLookbehind { flavor: RegexFlavor::Python, @@ -115,7 +112,6 @@ impl Regex { match self { Regex::Literal(_) => Ok(()), Regex::Unescaped(_) => Ok(()), - Regex::Char(_) => Ok(()), Regex::CharSet(_) => Ok(()), Regex::Grapheme => Err(CompileErrorKind::UnsupportedInLookbehind { flavor: RegexFlavor::Pcre, @@ -241,7 +237,6 @@ impl Regex { } Ok(Regex::CharSet(RegexCharSet::new(c.into()).negate())) } - Regex::Char(c) => Ok(Regex::CharSet(RegexCharSet::new(c.into()).negate())), Regex::CharSet(s) => Ok(Regex::CharSet(s.negate())), Regex::Boundary(b) => match b { BoundaryKind::Word => 
Ok(Regex::Boundary(BoundaryKind::NotWord)), @@ -309,9 +304,6 @@ impl Regex { Regex::Unescaped(u) => { buf.push_str(u); } - &Regex::Char(c) => { - literal::codegen_char_esc(c, buf, flavor); - } Regex::CharSet(c) => c.codegen(buf, flavor), Regex::Grapheme => buf.push_str("\\X"), Regex::Dot => buf.push('.'), @@ -330,7 +322,6 @@ impl Regex { Regex::Alternation(_) => true, Regex::Literal(_) | Regex::Unescaped(_) - | Regex::Char(_) | Regex::Group(_) | Regex::CharSet(_) | Regex::Grapheme @@ -353,7 +344,6 @@ impl Regex { | Regex::Unescaped(_) => true, Regex::Lookaround(_) => matches!(flavor, RegexFlavor::JavaScript), Regex::CharSet(_) - | Regex::Char(_) | Regex::Grapheme | Regex::Reference(_) | Regex::Dot diff --git a/pomsky-lib/src/regex/optimize.rs b/pomsky-lib/src/regex/optimize.rs index 2bf2649..46e33ce 100644 --- a/pomsky-lib/src/regex/optimize.rs +++ b/pomsky-lib/src/regex/optimize.rs @@ -107,8 +107,7 @@ impl Regex { Count::One } Regex::Unescaped(_) => Count::Many, - Regex::Char(_) - | Regex::CharSet(_) + Regex::CharSet(_) | Regex::Grapheme | Regex::Dot | Regex::Boundary(_) diff --git a/pomsky-lib/src/unicode_set.rs b/pomsky-lib/src/unicode_set.rs index 604dfb6..b5886f8 100644 --- a/pomsky-lib/src/unicode_set.rs +++ b/pomsky-lib/src/unicode_set.rs @@ -114,10 +114,6 @@ impl UnicodeSet { self.ranges.insert(SetRange { first: *range.start() as u32, last: *range.end() as u32 }); } - pub fn add_char_unchecked(&mut self, char: char) { - self.ranges.insert(SetRange::single(char as u32)); - } - fn add(&mut self, mut range_new: SetRange) { let lower = SetRange::single(range_new.first.saturating_sub(1)); let upper = SetRange::single(range_new.last.saturating_add(1)); diff --git a/pomsky-syntax/src/exprs/reference.rs b/pomsky-syntax/src/exprs/reference.rs index 9711626..7dea01b 100644 --- a/pomsky-syntax/src/exprs/reference.rs +++ b/pomsky-syntax/src/exprs/reference.rs @@ -31,7 +31,9 @@ impl Reference { ReferenceTarget::Named(n) => buf.write(n), ReferenceTarget::Number(i) => 
buf.write_fmt(i), &ReferenceTarget::Relative(o) => { - buf.push(if o < 0 { '-' } else { '+' }); + if o >= 0 { + buf.push('+'); + } buf.write_fmt(o); } } diff --git a/regex-test/Cargo.toml b/regex-test/Cargo.toml index d78ab45..b250b39 100644 --- a/regex-test/Cargo.toml +++ b/regex-test/Cargo.toml @@ -7,6 +7,6 @@ publish = false # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -regex = "1.10.2" -pcre2 = "0.2.5" +regex = "1.11.1" +pcre2 = "0.2.9" onig = "6.4.0" diff --git a/regex-test/dotnet/TesterAsync.cs b/regex-test/dotnet/TesterAsync.cs index a7cb85c..26b0f17 100644 --- a/regex-test/dotnet/TesterAsync.cs +++ b/regex-test/dotnet/TesterAsync.cs @@ -7,8 +7,12 @@ public static void Main(string[] args) { string testLine; while ((line = Console.ReadLine()) != null) { + if (!line.StartsWith("REGEX:")) { + continue; + } + try { - var r = new Regex(line, RegexOptions.Compiled); + var r = new Regex(line.Substring(6), RegexOptions.Compiled); Console.WriteLine("success"); while ((testLine = Console.ReadLine()) != null && testLine.StartsWith("TEST:")) { diff --git a/regex-test/java/TesterAsync.java b/regex-test/java/TesterAsync.java index f47b605..ff0cd8a 100644 --- a/regex-test/java/TesterAsync.java +++ b/regex-test/java/TesterAsync.java @@ -7,8 +7,12 @@ public static void main(String[] args) { try (Scanner input = new Scanner(System.in)) { while (input.hasNext()) { String regex = input.nextLine(); + if (!regex.startsWith("REGEX:")) { + continue; + } + try { - Pattern p = Pattern.compile(regex); + Pattern p = Pattern.compile("(?U)" + regex.substring(6)); System.out.printf("success\n"); while (input.hasNext()) { diff --git a/regex-test/js/tester-deno-async.js b/regex-test/js/tester-deno-async.js index 625602a..c1742c7 100644 --- a/regex-test/js/tester-deno-async.js +++ b/regex-test/js/tester-deno-async.js @@ -5,8 +5,12 @@ let regex for await (const line of readLines(Deno.stdin)) { if (regex === undefined) { + if 
(!line.startsWith('REGEX:')) { + continue + } + try { - regex = new RegExp(line, 'u') + regex = new RegExp(line.slice(6), 'u') console.log('success') } catch (e) { console.log(substituteLf(e.message)) @@ -17,9 +21,7 @@ for await (const line of readLines(Deno.stdin)) { if (regex.test(test)) { console.log('test good') } else { - console.log( - substituteLf(`Regex '${regex.source}' does not match '${test}'`) - ) + console.log(substituteLf(`Regex '${regex.source}' does not match '${test}'`)) regex = undefined } } else { diff --git a/regex-test/js/tester-node-async.js b/regex-test/js/tester-node-async.js index fda5064..5db9070 100644 --- a/regex-test/js/tester-node-async.js +++ b/regex-test/js/tester-node-async.js @@ -11,8 +11,12 @@ let regex rl.on('line', (line) => { if (regex === undefined) { + if (!line.startsWith('REGEX:')) { + return + } + try { - regex = new RegExp(line, 'u') + regex = new RegExp(line.slice(6), 'u') console.log('success') } catch (e) { console.log(substituteLf(e.message)) @@ -23,9 +27,7 @@ rl.on('line', (line) => { if (regex.test(test)) { console.log('test good') } else { - console.log( - substituteLf(`Regex '${regex.source}' does not match '${test}'`) - ) + console.log(substituteLf(`Regex '${regex.source}' does not match '${test}'`)) regex = undefined } } else { diff --git a/regex-test/python/tester_async.py b/regex-test/python/tester_async.py index 5f4eee4..8212cf7 100644 --- a/regex-test/python/tester_async.py +++ b/regex-test/python/tester_async.py @@ -15,8 +15,11 @@ def substituteLf(s): line = line[:-1] if regex == None: + if not line.startswith("REGEX:"): + continue + try: - regex = re.compile(line) + regex = re.compile(line[6:]) print("success") except Exception as e: print(substituteLf(str(e))) diff --git a/regex-test/src/sync/mod.rs b/regex-test/src/sync/mod.rs index ba4ae9d..9066e8e 100644 --- a/regex-test/src/sync/mod.rs +++ b/regex-test/src/sync/mod.rs @@ -80,29 +80,29 @@ impl RegexTest { crate::native::ruby(regex, tests) } - pub fn 
test_js(&self, regex: impl Into) -> Outcome { + pub fn test_js(&self, regex: impl AsRef) -> Outcome { self.test_js_with(regex, &[] as &[&str]) } - pub fn test_js_with(&self, regex: impl Into, tests: &[impl AsRef]) -> Outcome { + pub fn test_js_with(&self, regex: impl AsRef, tests: &[impl AsRef]) -> Outcome { self.js.start("js", "deno", &["run", "tester-deno-async.js"]); self.js.test(regex, tests) } - pub fn test_python(&self, regex: impl Into) -> Outcome { + pub fn test_python(&self, regex: impl AsRef) -> Outcome { self.test_python_with(regex, &[] as &[&str]) } - pub fn test_python_with(&self, regex: impl Into, tests: &[impl AsRef]) -> Outcome { + pub fn test_python_with(&self, regex: impl AsRef, tests: &[impl AsRef]) -> Outcome { self.py.start("python", "python", &["tester_async.py"]); self.py.test(regex, tests) } - pub fn test_java(&self, regex: impl Into) -> Outcome { + pub fn test_java(&self, regex: impl AsRef) -> Outcome { self.test_java_with(regex, &[] as &[&str]) } - pub fn test_java_with(&self, regex: impl Into, tests: &[impl AsRef]) -> Outcome { + pub fn test_java_with(&self, regex: impl AsRef, tests: &[impl AsRef]) -> Outcome { self.java.start_with("java", "java", &["TesterAsync"], || { let compiled = concat!(env!("CARGO_MANIFEST_DIR"), "/java/TesterAsync.class"); if !Path::new(compiled).exists() { @@ -119,12 +119,12 @@ impl RegexTest { } #[cfg(target_os = "linux")] - pub fn test_dotnet(&self, regex: impl Into) -> Outcome { + pub fn test_dotnet(&self, regex: impl AsRef) -> Outcome { self.test_dotnet_with(regex, &[] as &[&str]) } #[cfg(target_os = "linux")] - pub fn test_dotnet_with(&self, regex: impl Into, tests: &[impl AsRef]) -> Outcome { + pub fn test_dotnet_with(&self, regex: impl AsRef, tests: &[impl AsRef]) -> Outcome { self.dotnet.start_with("dotnet", "mono", &["TesterAsync.exe"], || { let compiled = concat!(env!("CARGO_MANIFEST_DIR"), "/dotnet/TesterAsync.exe"); if !Path::new(compiled).exists() { diff --git a/regex-test/src/sync/process.rs 
b/regex-test/src/sync/process.rs index 573a911..f920cb5 100644 --- a/regex-test/src/sync/process.rs +++ b/regex-test/src/sync/process.rs @@ -59,12 +59,12 @@ impl Process { } } - pub(crate) fn test(&self, regex: impl Into, tests: &[impl AsRef]) -> Outcome { + pub(crate) fn test(&self, regex: impl AsRef, tests: &[impl AsRef]) -> Outcome { let mut lock = self.data.lock().unwrap(); let (_, stdin, stdout, count) = (*lock).as_mut().expect("process isn't running"); *count += 1; - stdin.write_all((regex.into() + "\n").as_bytes()).unwrap(); + stdin.write_all(("REGEX:".to_string() + regex.as_ref() + "\n").as_bytes()).unwrap(); let line = stdout.next().expect("child process did not respond").unwrap(); if line != "success" {