Skip to content

Commit

Permalink
Merge pull request #1 from gnieh/feature/tdfa-regex
Browse files Browse the repository at this point in the history
Add new regex backend based on TDFA.
  • Loading branch information
satabin committed Apr 30, 2016
2 parents 854f8cf + 5df6266 commit 843d05e
Show file tree
Hide file tree
Showing 24 changed files with 1,064 additions and 78 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -20,18 +20,39 @@ import org.scalameter.picklers.noPickler._

class JavaComparisonBenchmark extends Bench.LocalTime {

val gniehRe = "abc|abd|abe|abf".re
val gniehReBC = {
import bytecode._
"abc|abd|abe|abf".re
}

val gniehReDFA = {
import automaton._
"abc|abd|abe|abf".re
}

val stdRe = "abc|abd|abe|abf"
val stdRe = "abc|abd|abe|abf".r

val text = Gen.single("text")("fda f ofdio difm i mfofgroa gaabcdjsaabedsakabflklklabdabcabakldskfsdfbacpabc")

performance of "gnieh regular expressions" in {
performance of "gnieh regular expressions based on bytecode" in {

measure method "findAllIn" in {

using(text) in { t =>
gniehRe.isMatchedBy(t)
gniehReBC.isMatchedBy(t)

}

}

}

performance of "gnieh regular expressions based on TDFA" in {

measure method "findAllIn" in {

using(text) in { t =>
gniehReDFA.isMatchedBy(t)

}

Expand All @@ -44,7 +65,7 @@ class JavaComparisonBenchmark extends Bench.LocalTime {
measure method "findAllIn" in {

using(text) in { t =>
t.matches(stdRe)
stdRe.unapplySeq(t)

}

Expand All @@ -53,4 +74,3 @@ class JavaComparisonBenchmark extends Bench.LocalTime {
}

}

11 changes: 7 additions & 4 deletions benchmarks/src/test/scala/gnieh/regex/SimpleBenchmark.scala
Original file line number Diff line number Diff line change
Expand Up @@ -47,20 +47,23 @@ class SimpleBenchmark extends Bench.ForkedTime {

val textGen = Gen.single("text")(text)

val reGen =
val reGenBC = {
import bytecode._
for(re <- Gen.single("re")("([-A-Za-z0-9_.!~*'();/?:@&=+$,# ]|%[A-Fa-f0-9]{2})+".re))
yield {
// force evaluation to make it compile
re.isMatchedBy("")
re
}
}

val inputs = Gen.crossProduct(textGen, reGen)
val inputs = Gen.crossProduct(textGen, reGenBC)

performance of "New regular expression" in {
performance of "New regular expression based on bytecode" in {
measure method "findFirstIn" in {

using(textGen) in { t =>
import bytecode._
val localRe = "([-A-Za-z0-9_.!~*'();/?:@&=+$,# ]|%[A-Fa-f0-9]{2})+".re

localRe.isMatchedBy(t)
Expand All @@ -69,7 +72,7 @@ class SimpleBenchmark extends Bench.ForkedTime {
}
}

performance of "Reused regular expression" in {
performance of "Reused regular expression based on bytecode" in {
measure method "findFirstIn" in {

using(inputs) in { case (t, re) =>
Expand Down
8 changes: 4 additions & 4 deletions build.sbt
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@ name := "tekstlib"

version := "0.1.0-SNAPSHOT"

scalaVersion := "2.11.7"
scalaVersion := "2.11.8"

crossScalaVersions := Seq("2.11.7", "2.10.4")
crossScalaVersions := Seq("2.11.8")

libraryDependencies += "org.scalatest" %% "scalatest" % "2.2.1" % "test"

Expand Down Expand Up @@ -34,7 +34,7 @@ OsgiKeys.bundleSymbolicName := "org.gnieh.tekstlib"

OsgiKeys.privatePackage := Seq()

defaultScalariformSettings
scalariformSettings

ScalariformKeys.preferences := {
import scalariform.formatter.preferences._
Expand Down Expand Up @@ -86,7 +86,7 @@ pomExtra := (

lazy val benchmarks = project in file("benchmarks") dependsOn(root)

scalaVersion in benchmarks := "2.11.7"
scalaVersion in benchmarks := "2.11.8"

libraryDependencies in benchmarks += "com.storm-enroute" %% "scalameter" % "0.7"

Expand Down
1 change: 1 addition & 0 deletions src/main/scala/gnieh/mustache/MustacheParser.scala
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ package mustache
import string.StringScanner

import regex._
import bytecode._

import scala.util.{
Try,
Expand Down
53 changes: 21 additions & 32 deletions src/main/scala/gnieh/regex/Regex.scala
Original file line number Diff line number Diff line change
Expand Up @@ -15,46 +15,32 @@ package gnieh.regex

import compiler._
import util._
import vm._

import scala.util.Failure

/** This class provides a way to create and use regular expressions. It is a non backtracking implementation
* based on the description from [Russ Cox](http://swtch.com/~rsc/regexp/).
* Following regular expressions are supported:
* - `.` any character, possibly including newline (s=true)
* - `[xyz]` character class
* - `[^xyz]` negated character class
* - `\d` a digit character (equivalent to `[0-9]`)
* - `\D` a non digit character (equivalent to `[^0-9]`)
* - `\w` an alphanumeric character (equivalent to `[A-Za-z0-9_]`)
* - `\W` a non alphanumeric character (equivalent to `[^A-Za-z0-9_]`)
* - `\s` a space character (equivalent to `[ \t\r\n\f]`)
* - `\S` a non space character (equivalent to `[^ \t\r\n\f]`)
* - `xy` `x` followed by `y`
* - `x|y` `x` or `y` (prefer `x`)
* - `x*` zero or more `x` (prefer more)
* - `x+` one or more `x` (prefer more)
* - `x?` zero or one `x` (prefer one)
* - `x*?` zero or more `x` (prefer zero)
* - `x+?` one or more `x` (prefer one)
* - `x??` zero or one `x` (prefer zero)
* - `(re)` numbered capturing group (starting at 1)
/** This class provides a way to create and use regular expressions. The actual implementation depends on the imported backend.
* By default we provide a non backtracking implementation [[gnieh.regex.vm.BytecodeImpl]] and a TDFA implementation [[gnieh.regex.tdfa.TDfaImpl]]. See the documentation of each class for details about the supported features.
*
* @author Lucas Satabin
*/
class Regex(re: ReNode, source: Option[String]) extends Serializable {
class Regex(re: Either[ReNode, String], impl: RegexImpl) extends Serializable {

def this(source: String) =
this(Parser.parse(source).get, Some(source))
def this(source: String, impl: RegexImpl) =
this(Right(source), impl)

private val (saved, compiled) = Compiler.compile(re)
def this(re: ReNode, impl: RegexImpl) =
this(Left(re), impl)

private val (saved, compiled) = re match {
case Left(re) => impl.compile(re)
case Right(source) => impl.compile(Parser.parse(source).get)
}

//println(util.Debug.print(compiled))

/** Tells whether this regular expression is matched by the given input */
def isMatchedBy(input: String): Boolean =
VM.exec(compiled, saved, 0, input) match {
impl.exec(compiled, saved, 0, input) match {
case (-1, -1, _) =>
false
case (start, end, _) =>
Expand All @@ -76,7 +62,7 @@ class Regex(re: ReNode, source: Option[String]) extends Serializable {
*/
def findFirstMatchIn(input: String): Option[Match] = {
def find(startIdx: Int): Option[Match] =
VM.exec(compiled, saved, startIdx, input) match {
impl.exec(compiled, saved, startIdx, input) match {
case (-1, -1, _) if startIdx < input.size =>
find(startIdx + 1)
case (-1, -1, _) =>
Expand All @@ -97,7 +83,7 @@ class Regex(re: ReNode, source: Option[String]) extends Serializable {
/** Finds all matches of this regular expression in the input. */
def findAllMatchIn(input: String): Iterator[Match] = {
def loop(startIdx: Int): Stream[Match] =
VM.exec(compiled, saved, startIdx, input) match {
impl.exec(compiled, saved, startIdx, input) match {
case (-1, -1, _) if startIdx < input.size =>
loop(startIdx + 1)
case (-1, -1, _) =>
Expand All @@ -121,14 +107,17 @@ class Regex(re: ReNode, source: Option[String]) extends Serializable {
} yield m.subgroups

override def toString =
source.getOrElse(re.toString)
re match {
case Left(re) => re.toString
case Right(re) => re
}

}

object Regex {

def apply(str: String): Regex =
new Regex(str)
def apply(str: String)(implicit impl: RegexImpl): Regex =
new Regex(str, impl)

/** Escaped version of this character if it is needed. */
def escape(c: Char): String =
Expand Down
28 changes: 28 additions & 0 deletions src/main/scala/gnieh/regex/RegexImpl.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
/*
* Copyright (c) 2016 Lucas Satabin
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package gnieh.regex

import util.ReNode

/** Backend abstraction used by [[Regex]] to compile and execute regular expressions.
 *
 * `Regex` calls `compile` when it is constructed and hands both components of the
 * result back to `exec` on every match attempt.
 */
trait RegexImpl {

/** Backend-specific compiled form of a regular expression. */
type Compiled

/** Compiles a regular expression tree into this backend's representation.
 *
 * @param re the regular expression tree to compile
 * @return a pair: the first component is what `Regex` later passes to `exec` as `nbSaved`
 *         (presumably the number of save slots needed for capturing groups -- TODO confirm),
 *         the second is the compiled regular expression
 */
def compile(re: ReNode): (Int, Compiled)

/** Runs a compiled regular expression on `string`, starting at index `startIdx`.
 *
 * @param compiled the compiled regular expression, as returned by `compile`
 * @param nbSaved the first component of the pair returned by `compile`
 * @param startIdx the index in `string` at which to start
 * @param string the input to match against
 * @return a triple; `Regex` treats `(-1, -1, _)` as "no match" and otherwise reads the first two
 *         components as the start and end of the match. The vector presumably holds the
 *         positions of the captured groups -- TODO confirm against the backends.
 */
def exec(compiled: Compiled, nbSaved: Int, startIdx: Int, string: String): (Int, Int, Vector[Int])

}
2 changes: 1 addition & 1 deletion src/main/scala/gnieh/regex/compiler/Compiler.scala
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ object Compiler {
// save n + 1
val (currentSave1, idx1, v1) = loop(currentSave + 2, startIdx + 1, e)
(currentSave1, idx1 + 1, Vector(Save(currentSave)) ++ v1 ++ Vector(Save(currentSave + 1)))
case _: Temporary =>
case _ =>
throw new RuntimeException("Should never happen")
}
}
Expand Down
2 changes: 1 addition & 1 deletion src/main/scala/gnieh/regex/dsl/DslRegex.scala
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ import util._
*
* @author Lucas Satabin
*/
class DslRegex(val re: ReNode) extends Regex(re, None) {
class DslRegex(val re: ReNode)(implicit impl: RegexImpl) extends Regex(re, impl) {

/** Matches `this` regular expression followed by `that` regular expression */
def +(that: DslRegex): DslRegex =
Expand Down
6 changes: 3 additions & 3 deletions src/main/scala/gnieh/regex/dsl/greedy.scala
Original file line number Diff line number Diff line change
Expand Up @@ -29,21 +29,21 @@ trait DslGreedyRegex extends DslRegex {

}

private class DslGreedyStar(re: ReNode) extends DslRegex(Star(re, true)) with DslGreedyRegex {
private class DslGreedyStar(re: ReNode)(implicit impl: RegexImpl) extends DslRegex(Star(re, true)) with DslGreedyRegex {

lazy val nonGreedy: DslRegex =
new DslRegex(Star(re, false))

}

private class DslGreedyPlus(re: ReNode) extends DslRegex(Plus(re, true)) with DslGreedyRegex {
private class DslGreedyPlus(re: ReNode)(implicit impl: RegexImpl) extends DslRegex(Plus(re, true)) with DslGreedyRegex {

lazy val nonGreedy: DslRegex =
new DslRegex(Plus(re, false))

}

private class DslGreedyOpt(re: ReNode) extends DslRegex(Opt(re, true)) with DslGreedyRegex {
private class DslGreedyOpt(re: ReNode)(implicit impl: RegexImpl) extends DslRegex(Opt(re, true)) with DslGreedyRegex {

lazy val nonGreedy: DslRegex =
new DslRegex(Opt(re, false))
Expand Down
31 changes: 15 additions & 16 deletions src/main/scala/gnieh/regex/dsl/package.scala
Original file line number Diff line number Diff line change
Expand Up @@ -37,68 +37,67 @@ package object dsl {
CharRange(c)

/** Matches any character */
lazy val any: DslRegex =
def any(implicit impl: RegexImpl): DslRegex =
new DslRegex(AnyChar)

/** Matches any character in the classes. A character class
* is either a single character `c` or a range `a-z`.
*/
def any(classes: CharRange*): DslRegex =
def any(classes: CharRange*)(implicit impl: RegexImpl): DslRegex =
new DslRegex(CharSet(CharRangeSet(classes: _*)))

/** Matches any digit (equivalent to `[0-9]`) */
lazy val digit: DslRegex =
def digit(implicit impl: RegexImpl): DslRegex =
any('0' -- '9')

/** Matches digits (equivalent to `[0-9]+`) */
lazy val digits: DslRegex =
def digits(implicit impl: RegexImpl): DslRegex =
digit.oneOrMore

/** Matches the empty string */
lazy val empty: DslRegex =
def empty(implicit impl: RegexImpl): DslRegex =
new DslRegex(Empty)

/** Matches any hexadecimal digit (equivalent to `[A-Fa-f0-9]`) */
lazy val hexDigit: DslRegex =
def hexDigit(implicit impl: RegexImpl): DslRegex =
any('A' -- 'F', 'a' -- 'f', '0' -- '9')

/** Matches hexadecimal digits (equivalent to `[A-Fa-f0-9]+`) */
lazy val hexDigits: DslRegex =
def hexDigits(implicit impl: RegexImpl): DslRegex =
  // One or more of the single-digit matcher `hexDigit`, mirroring `digits = digit.oneOrMore`.
  // Referring to `hexDigits` here would make the definition call itself without end.
  hexDigit.oneOrMore

/** Matches any character that is not in any of the classes */
def none(classes: CharRange*): DslRegex =
def none(classes: CharRange*)(implicit impl: RegexImpl): DslRegex =
new DslRegex(CharSet(CharRangeSet(classes: _*).negate))

/** Matches any non space character (equivalent to `\S`) */
lazy val nonspace: DslRegex =
def nonspace(implicit impl: RegexImpl): DslRegex =
none(' ', '\t', '\r', '\n', '\f')

/** Matches non space characters (equivalent to `\S+`) */
lazy val nonspaces: DslRegex =
def nonspaces(implicit impl: RegexImpl): DslRegex =
nonspace.oneOrMore

/** Matches the literal characters of the string (special regular expression characters
* are considered as raw characters).
*/
def raw(str: String): DslRegex =
def raw(str: String)(implicit impl: RegexImpl): DslRegex =
new DslRegex(str.map(SomeChar(_)).foldLeft(Empty: ReNode)(Concat(_, _)))

/** Matches any space character (equivalent to `\s`) */
lazy val space: DslRegex =
def space(implicit impl: RegexImpl): DslRegex =
any(' ', '\t', '\r', '\n', '\f')

/** Matches space characters (equivalent to `\s+`) */
lazy val spaces: DslRegex =
def spaces(implicit impl: RegexImpl): DslRegex =
space.oneOrMore

/** Matches any word (equivalent to `\w+`) */
lazy val word: DslRegex =
def word(implicit impl: RegexImpl): DslRegex =
wordChar.oneOrMore

/** Matches any word character (equivalent to `\w`) */
lazy val wordChar: DslRegex =
def wordChar(implicit impl: RegexImpl): DslRegex =
any('A' -- 'Z', 'a' -- 'z', '0' -- '9', '_')

}

Loading

0 comments on commit 843d05e

Please sign in to comment.