Skip to content

Commit

Permalink
Regex and Glob builtins (microsoft#56)
Browse files Browse the repository at this point in the history
Signed-off-by: Anand Krishnamoorthi <[email protected]>
  • Loading branch information
anakrish authored Dec 3, 2023
1 parent ed3492f commit ab968c2
Show file tree
Hide file tree
Showing 7 changed files with 362 additions and 70 deletions.
5 changes: 4 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,10 @@ rand = "0.8.5"
data-encoding = "2.4.0"
regex = "1.10.2"
num = "0.4.1"
rust_decimal = { version = "1.33.1", features = ["serde-with-arbitrary-precision", "serde_json", "maths"] }
rust_decimal = { version = "1.33.1", features = ["serde-with-arbitrary-precision"] }
glob = "0.3.1"
wax = { version = "0.6.0", features = [], default-features = false }


[dev-dependencies]
clap = { version = "4.4.7", features = ["derive"] }
Expand Down
112 changes: 112 additions & 0 deletions src/builtins/glob.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

use crate::ast::{Expr, Ref};
use crate::builtins;
use crate::builtins::utils::{ensure_args_count, ensure_string, ensure_string_collection};
use crate::lexer::Span;
use crate::value::Value;

use std::collections::HashMap;

use anyhow::{bail, Result};
//use glob::{Pattern, MatchOptions};
use wax::{Glob, Pattern};

/// Adds the glob builtins to the builtin lookup table `m`.
///
/// Each entry maps the builtin's name to its implementation together with
/// the number registered alongside it (3 for `glob.match`, 1 for
/// `glob.quote_meta`, matching the argument counts those functions check).
pub fn register(m: &mut HashMap<&'static str, builtins::BuiltinFcn>) {
    let entries: [(&'static str, builtins::BuiltinFcn); 2] = [
        ("glob.match", (glob_match, 3)),
        ("glob.quote_meta", (quote_meta, 1)),
    ];
    m.extend(entries);
}

/// Stand-in text substituted for characters (`/`, and `:` when it is a
/// delimiter) that must not be seen as such by the glob matcher.
/// `make_delimiters_unix_style` rejects inputs that already contain it.
const PLACE_HOLDER: &str = "\0";

/// Returns a copy of `s` in which every `/` has been swapped for
/// `PLACE_HOLDER`, so that glob matching never encounters a unix-style
/// delimiter in the text.
///
/// The function always returns `Ok`.
fn suppress_unix_style_delimiter(s: &str) -> Result<String> {
    // Splitting on '/' and re-joining with the placeholder is equivalent to
    // replacing each '/' individually.
    let pieces: Vec<&str> = s.split('/').collect();
    Ok(pieces.join(PLACE_HOLDER))
}

/// Rewrites `s` so that the caller-chosen `delimiters` are expressed in terms
/// of `/`.
///
/// * If `/` is not one of the delimiters, every `/` in `s` is first replaced
///   by `PLACE_HOLDER`.
/// * A `:` delimiter is replaced by `PLACE_HOLDER`.
/// * A `/` delimiter is left untouched.
/// * Any other delimiter `d` is replaced by `/d/`, i.e. it is kept but
///   surrounded by `/` on both sides.
///
/// Delimiters are applied one after another, in slice order.
///
/// # Errors
///
/// Fails if `s` already contains `PLACE_HOLDER`.
fn make_delimiters_unix_style(s: &str, delimiters: &[char]) -> Result<String> {
    if s.contains(PLACE_HOLDER) {
        bail!("string contains internal glob placeholder");
    }

    let slash_is_delimiter = delimiters.iter().any(|&d| d == '/');

    let mut rewritten = if slash_is_delimiter {
        s.to_string()
    } else {
        suppress_unix_style_delimiter(s)?
    };

    for &delimiter in delimiters {
        rewritten = match delimiter {
            ':' => rewritten.replace(':', PLACE_HOLDER),
            // Already in unix style; nothing to rewrite.
            '/' => continue,
            // Surround each occurrence of the delimiter with '/'.
            other => rewritten.replace(other, format!("/{}/", other).as_str()),
        };
    }

    Ok(rewritten)
}

fn make_glob<'a>(pattern: &'a str, span: &'a Span) -> Result<Glob<'a>> {
Glob::new(pattern).or_else(|_| bail!(span.error("invalid glob")))
}

/// Implements the `glob.match` builtin.
///
/// Arguments (in order): the glob pattern (string), the delimiters (`null`
/// or an array of strings) and the value to match (string). Returns
/// `Value::Bool` telling whether the value matches the pattern.
///
/// * `null` delimiters: `/` is masked in both pattern and value so that it
///   is not treated as a delimiter.
/// * Array delimiters: every entry must be at most one character; empty
///   strings are skipped, and if no delimiter remains `.` is used.
///
/// # Errors
///
/// Fails when the argument count or types are wrong, when the delimiters
/// argument is neither `null` nor an array, when a delimiter has more than
/// one character, when an input contains the internal placeholder, or when
/// the pattern is not a valid glob.
fn glob_match(span: &Span, params: &[Ref<Expr>], args: &[Value]) -> Result<Value> {
    let name = "glob.match";
    ensure_args_count(span, name, params, args, 3)?;

    let pattern = ensure_string(name, &params[0], &args[0])?;
    let value = ensure_string(name, &params[2], &args[2])?;

    let pattern = pattern.as_ref();
    let value = value.as_ref();

    if let Value::Null = &args[1] {
        // Ensure that / is not treated as a delimiter.
        let value = suppress_unix_style_delimiter(value)?;
        let pattern = suppress_unix_style_delimiter(pattern)?;

        let glob = make_glob(&pattern, params[0].span())?;
        return Ok(Value::Bool(glob.is_match(&value[..])));
    }

    let delimiters = if let Value::Array(_) = &args[1] {
        ensure_string_collection(name, &params[1], &args[1])?
    } else {
        bail!(params[1]
            .span()
            .error(format!("{name} requires string array").as_str()));
    };

    // Count characters rather than bytes: a single non-ASCII character such
    // as "é" occupies several bytes but is still a valid one-character
    // delimiter.
    if delimiters.iter().any(|d| d.chars().nth(1).is_some()) {
        bail!(params[1]
            .span()
            .error("delimiters must be single character"));
    }

    // Empty strings yield no character and are therefore dropped here.
    let mut delimiters: Vec<char> = delimiters.iter().filter_map(|d| d.chars().next()).collect();

    if delimiters.is_empty() {
        delimiters.push('.');
    }

    let pattern = make_delimiters_unix_style(pattern, &delimiters)?;
    let value = make_delimiters_unix_style(value, &delimiters)?;

    let glob = make_glob(&pattern, params[0].span())?;
    Ok(Value::Bool(glob.is_match(&value[..])))
}

/// Implements the `glob.quote_meta` builtin.
///
/// Takes one string argument, verifies that it is a valid glob, and returns
/// it as a `Value::String` with every `*` prefixed by a backslash.
///
/// NOTE(review): only `*` is escaped here; other glob metacharacters are
/// passed through unchanged. Confirm whether that is intended.
///
/// # Errors
///
/// Fails when the argument count or type is wrong, or when the argument is
/// not a valid glob.
fn quote_meta(span: &Span, params: &[Ref<Expr>], args: &[Value]) -> Result<Value> {
    let name = "glob.quote_meta";
    ensure_args_count(span, name, params, args, 1)?;

    let raw = ensure_string(name, &params[0], &args[0])?;

    // Compile purely for validation; the compiled glob itself is not needed.
    let _ = make_glob(&raw, params[0].span())?;

    let escaped = raw.as_ref().replace('*', "\\*");
    Ok(Value::String(escaped.into()))
}
3 changes: 2 additions & 1 deletion src/builtins/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ mod debugging;
pub mod deprecated;

mod encoding;
mod glob;
pub mod numbers;
mod objects;
mod regex;
Expand Down Expand Up @@ -48,7 +49,7 @@ lazy_static! {
objects::register(&mut m);
strings::register(&mut m);
regex::register(&mut m);
//glob::register(&mut m);
glob::register(&mut m);
bitwise::register(&mut m);
conversions::register(&mut m);
//units::register(&mut m);
Expand Down
155 changes: 151 additions & 4 deletions src/builtins/regex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

use crate::ast::{Expr, Ref};
use crate::builtins;
use crate::builtins::utils::{ensure_args_count, ensure_string};
use crate::builtins::utils::{ensure_args_count, ensure_numeric, ensure_string};
use crate::lexer::Span;
use crate::value::Value;

Expand All @@ -13,9 +13,89 @@ use anyhow::{bail, Result};
use regex::Regex;

/// Adds the regex builtins to the builtin lookup table `m`.
///
/// Each entry maps the builtin's name to its implementation together with
/// the number registered alongside it, which matches the argument count the
/// implementation checks.
pub fn register(m: &mut HashMap<&'static str, builtins::BuiltinFcn>) {
    // TODO: m.insert("regex.globs_match", (globs_match, 2));
    let entries: [(&'static str, builtins::BuiltinFcn); 7] = [
        (
            "regex.find_all_string_submatch_n",
            (find_all_string_submatch_n, 3),
        ),
        ("regex.find_n", (find_n, 3)),
        ("regex.is_valid", (is_valid, 1)),
        ("regex.match", (regex_match, 2)),
        ("regex.replace", (regex_replace, 3)),
        ("regex.split", (regex_split, 2)),
        ("regex.template_match", (regex_template_match, 4)),
    ];
    m.extend(entries);
}

fn find_all_string_submatch_n(span: &Span, params: &[Ref<Expr>], args: &[Value]) -> Result<Value> {
let name = "regex.find_all_string_submatch_n";
ensure_args_count(span, name, params, args, 3)?;

let pattern = ensure_string(name, &params[0], &args[0])?;
let value = ensure_string(name, &params[1], &args[1])?;
let n = ensure_numeric(name, &params[2], &args[2])?;

let pattern =
Regex::new(&pattern).or_else(|_| bail!(params[0].span().error("invalid regex")))?;

if !n.is_integer() {
bail!(params[2].span().error("n must be an integer"));
}

let n = match n.as_i64() {
Some(n) if n < 0 => usize::MAX,
Some(n) => n as usize,
None => usize::MAX,
};

Ok(Value::from_array(
pattern
.captures_iter(&value)
.map(|capture| {
Value::from_array(
capture
.iter()
.map(|group| {
Value::String(match group {
Some(s) => s.as_str().into(),
_ => "".into(),
})
})
.collect(),
)
})
.take(n)
.collect(),
))
}

fn find_n(span: &Span, params: &[Ref<Expr>], args: &[Value]) -> Result<Value> {
let name = "regex.find_n";
ensure_args_count(span, name, params, args, 3)?;

let pattern = ensure_string(name, &params[0], &args[0])?;
let value = ensure_string(name, &params[1], &args[1])?;
let n = ensure_numeric(name, &params[2], &args[2])?;

let pattern =
Regex::new(&pattern).or_else(|_| bail!(params[0].span().error("invalid regex")))?;

if !n.is_integer() {
bail!(params[2].span().error("n must be an integer"));
}

let n = match n.as_i64() {
Some(n) if n < 0 => usize::MAX,
Some(n) => n as usize,
None => usize::MAX,
};

Ok(Value::from_array(
pattern
.find_iter(&value)
.map(|m| Value::String(m.as_str().into()))
.take(n)
.collect(),
))
}

fn is_valid(span: &Span, params: &[Ref<Expr>], args: &[Value]) -> Result<Value> {
Expand All @@ -31,21 +111,88 @@ pub fn regex_match(span: &Span, params: &[Ref<Expr>], args: &[Value]) -> Result<
let pattern = ensure_string(name, &params[0], &args[0])?;
let value = ensure_string(name, &params[1], &args[1])?;

let pattern = Regex::new(&pattern).or_else(|_| bail!(span.error("invalid regex")))?;
let pattern =
Regex::new(&pattern).or_else(|_| bail!(params[0].span().error("invalid regex")))?;
Ok(Value::Bool(pattern.is_match(&value)))
}

pub fn regex_split(span: &Span, params: &[Ref<Expr>], args: &[Value]) -> Result<Value> {
fn regex_replace(span: &Span, params: &[Ref<Expr>], args: &[Value]) -> Result<Value> {
let name = "regex.replace";
ensure_args_count(span, name, params, args, 3)?;

let s = ensure_string(name, &params[0], &args[0])?;
let pattern = ensure_string(name, &params[1], &args[1])?;
let value = ensure_string(name, &params[2], &args[2])?;

let pattern = match Regex::new(&pattern) {
Ok(p) => p,
// TODO: This behavior is due to OPA test not raising error. Should we raise error?
_ => return Ok(Value::Undefined),
};

Ok(Value::String(
pattern.replace_all(&s, value.as_ref()).into(),
))
}

/// Implements the `regex.split` builtin.
///
/// Arguments (in order): the regex pattern (string) and the text to split
/// (string). Returns an array with the pieces of the text that lie between
/// matches of the pattern.
///
/// # Errors
///
/// Fails when the argument count or types are wrong, or when the pattern is
/// not a valid regex; the latter is reported at the pattern argument's span.
fn regex_split(span: &Span, params: &[Ref<Expr>], args: &[Value]) -> Result<Value> {
    let name = "regex.split";
    ensure_args_count(span, name, params, args, 2)?;
    let pattern = ensure_string(name, &params[0], &args[0])?;
    let value = ensure_string(name, &params[1], &args[1])?;

    // Compile exactly once, attributing failures to the pattern argument.
    let pattern =
        Regex::new(&pattern).or_else(|_| bail!(params[0].span().error("invalid regex")))?;
    Ok(Value::from_array(
        pattern
            .split(&value)
            .map(|s| Value::String(s.into()))
            .collect::<Vec<Value>>(),
    ))
}

fn regex_template_match(span: &Span, params: &[Ref<Expr>], args: &[Value]) -> Result<Value> {
let name = "regex.template_match";
ensure_args_count(span, name, params, args, 4)?;
let template = ensure_string(name, &params[0], &args[0])?;
let value = ensure_string(name, &params[1], &args[1])?;
let delimiter_start = ensure_string(name, &params[2], &args[2])?;
let delimiter_end = ensure_string(name, &params[3], &args[3])?;

let delimiter_start = delimiter_start.as_ref();
let delimiter_end = delimiter_end.as_ref();
let mut template = template.as_ref();
let mut value = value.as_ref();

while let (Some(start), Some(end)) =
(template.find(delimiter_start), template.find(delimiter_end))
{
if start >= end {
return Ok(Value::Undefined);
}
// Match precesing literal (if any)
if template[0..start] != value[0..start] {
return Ok(Value::Bool(false));
}

// Fetch pattern, excluding delimiters.
let pattern = Regex::new(&template[start + delimiter_start.len()..end])
.or_else(|_| bail!(params[0].span().error("invalid regex")))?;

// Skip preceding literal in value.
value = &value[start..];

let m = match pattern.find(value) {
Some(m) if m.start() == 0 => m,
_ => return Ok(Value::Bool(false)),
};
// Skip match in string.
value = &value[m.len()..];

// Skip regex and delimiter in template.
template = &template[end + delimiter_end.len()..];
}

// Ensure that ending literal matches.
Ok(Value::Bool(template == value))
}
Loading

0 comments on commit ab968c2

Please sign in to comment.