Skip to content

Commit

Permalink
Improve documentation on regular expressions
Browse files Browse the repository at this point in the history
  • Loading branch information
justin-tay committed Jun 15, 2024
1 parent 45c0b8f commit 77105a6
Show file tree
Hide file tree
Showing 7 changed files with 144 additions and 40 deletions.
38 changes: 18 additions & 20 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ This implementation is tested against the [JSON Schema Test Suite](https://githu
|-----------------|-------------------------------------------------------------------------|-------------------------------------------------------------------|---------------------------------------------------------------------|--------------------------------------------------------------------|------------------------------------------------------------------------|----------------------------------------------------------------------|------------------------------------------------------------------------|
| NetworkNt | pass: r:4703 (100.0%) o:2369 (100.0%)<br>fail: r:0 (0.0%) o:1 (0.0%) | | pass: r:600 (100.0%) o:251 (100.0%)<br>fail: r:0 (0.0%) o:0 (0.0%) | pass: r:796 (100.0%) o:318 (100.0%)<br>fail: r:0 (0.0%) o:0 (0.0%) | pass: r:880 (100.0%) o:541 (100.0%)<br>fail: r:0 (0.0%) o:0 (0.0%) | pass: r:1201 (100.0%) o:625 (100.0%)<br>fail: r:0 (0.0%) o:0 (0.0%) | pass: r:1226 (100.0%) o:634 (99.8%)<br>fail: r:0 (0.0%) o:1 (0.2%) |

* Note that this uses the ECMA 262 Validator option turned on for the `pattern` tests.
* Note that this uses the `JoniRegularExpressionFactory` for the `pattern` and `format` `regex` tests.

#### Jackson Parser

Expand Down Expand Up @@ -157,25 +157,22 @@ The following are the optional dependencies that may be required for certain opt
These are not automatically included and setting the relevant option without adding the library will result in a `ClassNotFoundException`.

```xml
<!-- Either library is required when setting setEcma262Validator(true) or explicitly via setRegularExpressionFactory() -->
<dependency>
<!-- Used to validate ECMA 262 regular expressions -->
<!-- Approximately 2 MB in dependencies -->
<!-- JoniRegularExpressionFactory -->
<groupId>org.jruby.joni</groupId>
<artifactId>joni</artifactId>
<version>${version.joni}</version>
<optional>true</optional>
</dependency>

<dependency>
<!-- Used to validate ECMA 262 regular expressions -->
<!-- Approximately 50 MB in dependencies -->
<!-- GraalJSRegularExpressionFactory -->
<groupId>org.graalvm.js</groupId>
<artifactId>js</artifactId>
<version>${version.graaljs}</version>
<optional>true</optional>
</dependency>

<dependency>
<!-- Used to validate ECMA 262 regular expressions -->
<!-- Approximately 2 MB in dependencies -->
<!-- JoniRegularExpressionFactory -->
<groupId>org.jruby.joni</groupId>
<artifactId>joni</artifactId>
<version>${version.joni}</version>
</dependency>
```

Expand Down Expand Up @@ -270,8 +267,9 @@ SchemaValidatorsConfig config = new SchemaValidatorsConfig();
// By default JSON Path is used for reporting the instance location and evaluation path
config.setPathType(PathType.JSON_POINTER);
// By default the JDK regular expression implementation which is not ECMA 262 compliant is used
// Note that setting this to true requires including the optional joni or graaljs dependency
// config.setEcma262Validator(true);
// Note that setting this requires including optional dependencies
// config.setRegularExpressionFactory(GraalJSRegularExpressionFactory.getInstance());
// config.setRegularExpressionFactory(JoniRegularExpressionFactory.getInstance());

// Due to the mapping the schema will be retrieved from the classpath at classpath:schema/example-main.json.
// If the schema data does not specify an $id the absolute IRI of the schema location will be used as the $id.
Expand Down Expand Up @@ -305,8 +303,9 @@ SchemaValidatorsConfig config = new SchemaValidatorsConfig();
// By default JSON Path is used for reporting the instance location and evaluation path
config.setPathType(PathType.JSON_POINTER);
// By default the JDK regular expression implementation which is not ECMA 262 compliant is used
// Note that setting this to true requires including the optional joni or graaljs dependency
// config.setEcma262Validator(true);
// Note that setting this requires including optional dependencies
// config.setRegularExpressionFactory(GraalJSRegularExpressionFactory.getInstance());
// config.setRegularExpressionFactory(JoniRegularExpressionFactory.getInstance());

// Due to the mapping the meta-schema will be retrieved from the classpath at classpath:draft/2020-12/schema.
JsonSchema schema = jsonSchemaFactory.getSchema(SchemaLocation.of(SchemaId.V202012), config);
Expand Down Expand Up @@ -529,7 +528,6 @@ The following is sample output from the Hierarchical format.
| Name | Description | Default Value
|---------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------
| `pathType` | The path type to use for reporting the instance location and evaluation path. Set to `PathType.JSON_POINTER` to use JSON Pointer. | `PathType.DEFAULT`
| `ecma262Validator` | Whether to use the ECMA 262 `joni` or `graaljs` library to validate the `pattern` keyword. This requires the dependency to be manually added to the project or a `ClassNotFoundException` will be thrown. | `false`
| `executionContextCustomizer` | This can be used to customize the `ExecutionContext` generated by the `JsonSchema` for each validation run. | `null`
| `schemaIdValidator` | This is used to customize how the `$id` values are validated. Note that the default implementation allows non-empty fragments where no base IRI is specified and also allows non-absolute IRI `$id` values in the root schema. | `JsonSchemaIdValidator.DEFAULT`
| `messageSource` | This is used to retrieve the locale specific messages. | `DefaultMessageSource.getInstance()`
Expand All @@ -539,7 +537,7 @@ The following is sample output from the Hierarchical format.
| `locale` | The locale to use for generating messages in the `ValidationMessage`. | `Locale.getDefault()`
| `failFast` | Whether to return failure immediately when an assertion is generated. | `false`
| `formatAssertionsEnabled` | The default is to generate format assertions from Draft 4 to Draft 7 and to only generate annotations from Draft 2019-09. Setting to `true` or `false` will override the default behavior. | `null`
| `regularExpressionFactory` | The factory to use to create regular expressions for instance `JoniRegularExpressionFactory` or `GraalJSRegularExpressionFactory`. | `JDKRegularExpressionFactory.getInstance()`
| `regularExpressionFactory` | The factory to use to create regular expressions, for instance `JoniRegularExpressionFactory` or `GraalJSRegularExpressionFactory`. Using either of these requires the corresponding dependency to be manually added to the project or a `ClassNotFoundException` will be thrown. | `JDKRegularExpressionFactory.getInstance()`

## Performance Considerations

Expand Down Expand Up @@ -576,7 +574,7 @@ This does not mean that using a schema with a later draft specification will aut

## [JSON Schema Walkers and WalkListeners](doc/walkers.md)

## [ECMA-262 Regex](doc/ecma-262.md)
## [Regular Expressions](doc/ecma-262.md)

## [Custom Message](doc/cust-msg.md)

Expand Down
17 changes: 14 additions & 3 deletions doc/compatibility.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
[![Draft 6](https://img.shields.io/endpoint?url=https%3A%2F%2Fbowtie.report%2Fbadges%2Fjava-com.networknt-json-schema-validator%2Fcompliance%2Fdraft6.json)](https://bowtie.report/#/dialects/draft6)
[![Draft 4](https://img.shields.io/endpoint?url=https%3A%2F%2Fbowtie.report%2Fbadges%2Fjava-com.networknt-json-schema-validator%2Fcompliance%2Fdraft4.json)](https://bowtie.report/#/dialects/draft4)

The `pattern` validator by default uses the JDK regular expression implementation which is not ECMA-262 compliant and is thus not compliant with the JSON Schema specification. The library can however be configured to use a ECMA-262 compliant regular expression implementation.
The `pattern` and `format` `regex` validators by default use the JDK regular expression implementation, which is not ECMA-262 compliant and is thus not compliant with the JSON Schema specification. The library can however be configured to use an ECMA-262 compliant regular expression implementation such as `GraalJS` or `Joni`.

Annotation processing and reporting are implemented. Note that the collection of annotations will have an adverse performance impact.

Expand Down Expand Up @@ -113,13 +113,24 @@ By default the `pattern` keyword uses the JDK regular expression implementation

This is not ECMA-262 compliant and is thus not compliant with the JSON Schema specification. This is however the more likely desired behavior as other logic will most likely be using the default JDK regular expression implementation to perform downstream processing.

The library can be configured to use a ECMA-262 compliant regular expression validator which is implemented using [joni](https://github.com/jruby/joni). This can be configured by setting `setEcma262Validator` to `true`.
The library can be configured to use an ECMA-262 compliant regular expression validator which is implemented using [GraalJS](https://github.com/oracle/graaljs) or [Joni](https://github.com/jruby/joni). This can be configured by passing the respective `GraalJSRegularExpressionFactory` or `JoniRegularExpressionFactory` instance to `setRegularExpressionFactory`.

This also requires adding the `joni` dependency.
This also requires adding the `org.graalvm.js:js` or `org.jruby.joni:joni` dependency.

```xml
<dependency>
<!-- Used to validate ECMA 262 regular expressions -->
<!-- Approximately 50 MB in dependencies -->
<!-- GraalJSRegularExpressionFactory -->
<groupId>org.graalvm.js</groupId>
<artifactId>js</artifactId>
<version>${version.graaljs}</version>
</dependency>

<dependency>
<!-- Used to validate ECMA 262 regular expressions -->
<!-- Approximately 2 MB in dependencies -->
<!-- JoniRegularExpressionFactory -->
<groupId>org.jruby.joni</groupId>
<artifactId>joni</artifactId>
<version>${version.joni}</version>
Expand Down
71 changes: 54 additions & 17 deletions doc/ecma-262.md
Original file line number Diff line number Diff line change
@@ -1,28 +1,65 @@
For the pattern validator, we now have two options for regex in the library. The default one is `java.util.regex`; however, you can use the ECMA-262 standard library `org.jruby.joni` by configuration.
# Regular Expressions

As we know, the JSON schema is designed based on the Javascript language and its regex. The Java internal implementation has some differences which don't comply with the standard. For most users, these edge cases are not the issue as they are not using them anyway. Even when they are using it, they are expecting the Java regex result as the application is built on the Java platform. For users who want to ensure that they are using 100% standard patter validator, we have provided an option to override the default regex library with `org.jruby.joni` that is complying with the ECMA-262 standard.
For the `pattern` and `format` `regex` validators there are 3 built in options in the library.

### Which one to choose?
A custom implementation can be made by implementing `com.networknt.schema.regex.RegularExpressionFactory` to return a custom implementation of `com.networknt.schema.regex.RegularExpression`.

If you want a faster regex lib and don't care about the slight difference between Java and Javascript regex, then you don't need to do anything. The default regex lib is the `java.util.regex`.
| Regular Expression Factory | Description |
|--------------------------------------------------|----------------------------------------------------|
| `JDKRegularExpressionFactory` | Uses Java's standard `java.util.regex` and calls the `find()` method. Note that `matches()` is not called as that attempts to match the entire string, implicitly adding anchors. This is the default implementation and does not require any additional libraries. |
| `JoniRegularExpressionFactory` | Uses `org.joni.Regex` with `Syntax.ECMAScript`. This requires adding the `org.jruby.joni:joni` dependency which will require about 2MB. |
| `GraalJSRegularExpressionFactory` | Uses GraalJS with `new RegExp(pattern, 'u')`. This requires adding the `org.graalvm.js:js` dependency which will require about 50MB. |

If you want to ensure full compliance, use the `org.jruby.joni`. It is 1.5 times slower then `java.util.regex`. Depending on your use case, it might not be an issue.
## Specification

### How to switch?
The use of Regular Expressions is specified in JSON Schema at https://json-schema.org/draft/2020-12/json-schema-core#name-regular-expressions.

Here is the test case that shows how to pass a config object to use the ECMA-262 library.
```
Keywords MAY use regular expressions to express constraints, or constrain the instance value to be a regular expression. These regular expressions SHOULD be valid according to the regular expression dialect described in ECMA-262, section 21.2.1 [ecma262].
Regular expressions SHOULD be built with the "u" flag (or equivalent) to provide Unicode support, or processed in such a way which provides Unicode support as defined by ECMA-262.
Furthermore, given the high disparity in regular expression constructs support, schema authors SHOULD limit themselves to the following regular expression tokens:
individual Unicode characters, as defined by the JSON specification [RFC8259];
simple character classes ([abc]), range character classes ([a-z]);
complemented character classes ([^abc], [^a-z]);
simple quantifiers: "+" (one or more), "*" (zero or more), "?" (zero or one), and their lazy versions ("+?", "*?", "??");
range quantifiers: "{x}" (exactly x occurrences), "{x,y}" (at least x, at most y, occurrences), {x,} (x occurrences or more), and their lazy versions;
the beginning-of-input ("^") and end-of-input ("$") anchors;
simple grouping ("(...)") and alternation ("|").
Finally, implementations MUST NOT take regular expressions to be anchored, neither at the beginning nor at the end. This means, for instance, the pattern "es" matches "expression".
```

## Considerations when selecting implementation

If strict compliance with the regular expression dialect described in ECMA-262 is required, then only the `GraalJS` implementation meets that criterion.

The `Joni` implementation is configured to attempt to match the ECMA-262 regular expression dialect. However this dialect isn't directly maintained by its maintainers as it doesn't come from its upstream `Oniguruma`. The current implementation has known issues matching inputs with newlines and not respecting `^` and `$` anchors.

The `JDK` implementation is the default and uses `java.util.regex` with the `find()` method.

As these implementations are also used when validating regular expressions with `format` `regex`, one consideration is where the regular expression will ultimately be used. For instance, if the system that consumes the input is implemented in JavaScript, then the `GraalJS` implementation will ensure that the regular expression works there. If the system that consumes the input is implemented in Java, then the `JDK` implementation may be a better choice.

## Configuration of implementation

The following test case shows how to pass a config object to use the `GraalJS` factory.

```java
@Test(expected = JsonSchemaException.class)
public void testInvalidPatternPropertiesValidatorECMA262() throws Exception {
SchemaValidatorsConfig config = new SchemaValidatorsConfig();
config.setEcma262Validator(true);
JsonSchemaFactory factory = JsonSchemaFactory.getInstance(SpecVersion.VersionFlag.V4);
JsonSchema schema = factory.getSchema("{\"patternProperties\":6}", config);

JsonNode node = getJsonNodeFromStringContent("");
Set<ValidationMessage> errors = schema.validate(node);
Assert.assertEquals(errors.size(), 0);
public class RegularExpressionTest {
@Test
public void testInvalidRegexValidatorECMA262() throws Exception {
SchemaValidatorsConfig config = new SchemaValidatorsConfig();
config.setRegularExpressionFactory(GraalJSRegularExpressionFactory.getInstance());
JsonSchemaFactory factory = JsonSchemaFactory.getInstance(VersionFlag.V202012);
JsonSchema schema = factory.getSchema("{\r\n"
+ " \"format\": \"regex\"\r\n"
+ "}", config);
Set<ValidationMessage> errors = schema.validate("\"\\\\a\"", InputFormat.JSON, executionContext -> {
executionContext.getExecutionConfig().setFormatAssertionsEnabled(true);
});
assertFalse(errors.isEmpty());
}
}
```

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ class JDKRegularExpression implements RegularExpression {

@Override
public boolean matches(String value) {
    // Deliberately use find() instead of Matcher.matches(): matches() only
    // succeeds when the entire input matches, which behaves as if the pattern
    // were implicitly anchored at both ends.
    final boolean found = this.pattern.matcher(value).find();
    return found;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,23 @@ void namedBackreference() {
assertTrue(regex.matches("title=\"Named capturing groups\\' advantages\""));
}

@Test
void anchorShouldNotMatchMultilineInput() {
    // The input ends with a newline; the explicitly anchored pattern must reject it.
    final String anchoredPattern = "^[a-z]{1,10}$";
    RegularExpression expression = new GraalJSRegularExpression(anchoredPattern, CONTEXT);
    boolean matched = expression.matches("abc\n");
    assertFalse(matched);
}

/**
 * Verifies that a pattern without explicit anchors is not treated as anchored:
 * it must match when only a substring of the input satisfies it. This guards
 * against the behavior of the JDK regex {@code matches} function, which
 * implicitly adds anchors and isn't expected here.
 */
@Test
void noImplicitAnchors() {
    final String unanchoredPattern = "[a-z]{1,10}";
    RegularExpression expression = new GraalJSRegularExpression(unanchoredPattern, CONTEXT);
    boolean matched = expression.matches("1abc1");
    assertTrue(matched);
}


@Test
void concurrency() throws Exception {
RegularExpression regex = new GraalJSRegularExpression("\\d", CONTEXT);
Expand Down
Loading

0 comments on commit 77105a6

Please sign in to comment.