Skip to content

Commit

Permalink
bump re2 version
Browse files Browse the repository at this point in the history
  • Loading branch information
qinwf committed Mar 13, 2020
1 parent d511fbf commit 54b6a7c
Show file tree
Hide file tree
Showing 46 changed files with 6,892 additions and 10,575 deletions.
7 changes: 4 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: re2r
Type: Package
Title: RE2 Regular Expression
Version: 0.2.0
Version: 0.3.0
Authors@R: c(
person("Qin Wenfeng", email = "[email protected]", role = c("aut", "cre")),
person("Toby Dylan Hocking", role = "ctb", comment = "benchmarks"),
Expand All @@ -17,7 +17,7 @@ Authors@R: c(
)
Maintainer: Qin Wenfeng <[email protected]>
Description: RE2 <https://github.com/google/re2> is a primarily deterministic finite automaton based regular expression engine from Google that is very fast
at matching large amounts of text.
at matching large amounts of text.
License: BSD_3_clause + file LICENSE
LazyData: TRUE
Depends: R (>= 3.3)
Expand All @@ -33,5 +33,6 @@ URL: https://github.com/qinwf/re2r/
BugReports: https://github.com/qinwf/re2r/issues
VignetteBuilder: knitr
NeedsCompilation: yes
RoxygenNote: 6.0.1
RoxygenNote: 6.1.1
SystemRequirements: GNU make
Encoding: UTF-8
5 changes: 5 additions & 0 deletions inst/include/re2/bitmap256.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,11 @@ namespace re2 {
class Bitmap256 {
public:
Bitmap256() {
Clear();
}

// Clears all of the bits.
void Clear() {
memset(words_, 0, sizeof words_);
}

Expand Down
55 changes: 55 additions & 0 deletions inst/include/re2/pod_array.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
// Copyright 2018 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#ifndef RE2_POD_ARRAY_H_
#define RE2_POD_ARRAY_H_

#include <memory>
#include <type_traits>

namespace re2 {

// PODArray<T> owns a fixed-length heap buffer of POD elements.
//
// Storage is obtained from std::allocator<T>, so no elements are
// constructed: callers are expected to fill the buffer themselves (which
// is why T is required to be trivial and standard-layout). The length is
// remembered inside the deleter so the buffer can be handed back to the
// allocator with the same element count it was allocated with.
//
// Because the buffer is held by a std::unique_ptr, a PODArray can be
// moved but not copied.
template <typename T>
class PODArray {
 private:
  // Deleter used by ptr_; it doubles as the place where the element
  // count is stored.
  struct Deleter {
    Deleter() : len_(0) {}
    explicit Deleter(int len) : len_(len) {}

    // Returns the buffer to std::allocator<T> using the recorded length.
    void operator()(T* ptr) const {
      std::allocator<T> alloc;
      alloc.deallocate(ptr, len_);
    }

    int len_;  // number of elements in the buffer (0 for an empty array)
  };

 public:
  static_assert(std::is_trivial<T>::value && std::is_standard_layout<T>::value,
                "T must be POD");

  // Creates an empty array: data() is null and size() is 0.
  PODArray() : ptr_() {}

  // Allocates room for len elements; the elements are left uninitialised.
  explicit PODArray(int len)
      : ptr_(std::allocator<T>().allocate(len), Deleter(len)) {}

  // Pointer to the first element (null for an empty array).
  T* data() const { return ptr_.get(); }

  // Number of elements the array was created with.
  int size() const {
    const Deleter& deleter = ptr_.get_deleter();
    return deleter.len_;
  }

  // Unchecked element access.
  T& operator[](int pos) const { return ptr_[pos]; }

 private:
  std::unique_ptr<T[], Deleter> ptr_;
};

} // namespace re2

#endif // RE2_POD_ARRAY_H_
105 changes: 60 additions & 45 deletions inst/include/re2/prog.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,14 @@
#include <mutex>
#include <string>
#include <vector>
#include <type_traits>

#include "util/util.h"
#include "util/logging.h"
#include "util/sparse_array.h"
#include "util/sparse_set.h"
#include "re2/pod_array.h"
#include "re2/re2.h"
#include "re2/sparse_array.h"
#include "re2/sparse_set.h"

namespace re2 {

Expand Down Expand Up @@ -59,7 +61,8 @@ class Prog {
// Single instruction in regexp program.
class Inst {
public:
Inst() : out_opcode_(0), out1_(0) {}
// See the assertion below for why this is so.
Inst() = default;

// Copyable.
Inst(const Inst&) = default;
Expand All @@ -75,15 +78,16 @@ class Prog {
void InitFail();

// Getters
int id(Prog* p) { return static_cast<int>(this - p->inst_); }
int id(Prog* p) { return static_cast<int>(this - p->inst_.data()); }
InstOp opcode() { return static_cast<InstOp>(out_opcode_&7); }
int last() { return (out_opcode_>>3)&1; }
int out() { return out_opcode_>>4; }
int out1() { DCHECK(opcode() == kInstAlt || opcode() == kInstAltMatch); return out1_; }
int cap() { DCHECK_EQ(opcode(), kInstCapture); return cap_; }
int lo() { DCHECK_EQ(opcode(), kInstByteRange); return lo_; }
int hi() { DCHECK_EQ(opcode(), kInstByteRange); return hi_; }
int foldcase() { DCHECK_EQ(opcode(), kInstByteRange); return foldcase_; }
int foldcase() { DCHECK_EQ(opcode(), kInstByteRange); return hint_foldcase_&1; }
int hint() { DCHECK_EQ(opcode(), kInstByteRange); return hint_foldcase_>>1; }
int match_id() { DCHECK_EQ(opcode(), kInstMatch); return match_id_; }
EmptyOp empty() { DCHECK_EQ(opcode(), kInstEmptyWidth); return empty_; }

Expand All @@ -97,13 +101,13 @@ class Prog {
// Does this inst (an kInstByteRange) match c?
inline bool Matches(int c) {
DCHECK_EQ(opcode(), kInstByteRange);
if (foldcase_ && 'A' <= c && c <= 'Z')
if (foldcase() && 'A' <= c && c <= 'Z')
c += 'a' - 'A';
return lo_ <= c && c <= hi_;
}

// Returns string representation for debugging.
string Dump();
std::string Dump();

// Maximum instruction id.
// (Must fit in out_opcode_. PatchList/last steal another bit.)
Expand All @@ -126,32 +130,32 @@ class Prog {
out_opcode_ = (out<<4) | (last()<<3) | opcode;
}

uint32_t out_opcode_; // 28 bits: out, 1 bit: last, 3 (low) bits: opcode
union { // additional instruction arguments:
uint32_t out1_; // opcode == kInstAlt
// alternate next instruction

int32_t cap_; // opcode == kInstCapture
// Index of capture register (holds text
// position recorded by capturing parentheses).
// For \n (the submatch for the nth parentheses),
// the left parenthesis captures into register 2*n
// and the right one captures into register 2*n+1.

int32_t match_id_; // opcode == kInstMatch
// Match ID to identify this match (for re2::Set).
#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wpedantic"
#endif
struct { // opcode == kInstByteRange
uint8_t lo_; // byte range is lo_-hi_ inclusive
uint8_t hi_; //
uint8_t foldcase_; // convert A-Z to a-z before checking range.
uint32_t out_opcode_; // 28 bits: out, 1 bit: last, 3 (low) bits: opcode
union { // additional instruction arguments:
uint32_t out1_; // opcode == kInstAlt
// alternate next instruction

int32_t cap_; // opcode == kInstCapture
// Index of capture register (holds text
// position recorded by capturing parentheses).
// For \n (the submatch for the nth parentheses),
// the left parenthesis captures into register 2*n
// and the right one captures into register 2*n+1.

int32_t match_id_; // opcode == kInstMatch
// Match ID to identify this match (for re2::Set).

struct { // opcode == kInstByteRange
uint8_t lo_; // byte range is lo_-hi_ inclusive
uint8_t hi_; //
uint16_t hint_foldcase_; // 15 bits: hint, 1 (low) bit: foldcase
// hint to execution engines: the delta to the
// next instruction (in the current list) worth
// exploring iff this instruction matched; 0
// means there are no remaining possibilities,
// which is most likely for character classes.
// foldcase: A-Z -> a-z before checking range.
};
#ifdef __GNUC__
#pragma GCC diagnostic pop
#endif

EmptyOp empty_; // opcode == kInstEmptyWidth
// empty_ is bitwise OR of kEmpty* flags above.
Expand All @@ -162,6 +166,11 @@ class Prog {
friend class Prog;
};

// Inst must be trivial so that we can freely clear it with memset(3).
// Arrays of Inst are initialised by copying the initial elements with
// memmove(3) and then clearing any remaining elements with memset(3).
static_assert(std::is_trivial<Inst>::value, "Inst must be trivial");

// Whether to anchor the search.
enum Anchor {
kUnanchored, // match anywhere
Expand Down Expand Up @@ -197,6 +206,7 @@ class Prog {
void set_reversed(bool reversed) { reversed_ = reversed; }
int list_count() { return list_count_; }
int inst_count(InstOp op) { return inst_count_[op]; }
uint16_t* list_heads() { return list_heads_.data(); }
void set_dfa_mem(int64_t dfa_mem) { dfa_mem_ = dfa_mem; }
int64_t dfa_mem() { return dfa_mem_; }
int flags() { return flags_; }
Expand All @@ -212,9 +222,9 @@ class Prog {
int first_byte();

// Returns string representation of program for debugging.
string Dump();
string DumpUnanchored();
string DumpByteMap();
std::string Dump();
std::string DumpUnanchored();
std::string DumpByteMap();

// Returns the set of kEmpty flags that are in effect at
// position p within context.
Expand Down Expand Up @@ -261,7 +271,7 @@ class Prog {
// SearchDFA fills matches with the match IDs of the final matching state.
bool SearchDFA(const StringPiece& text, const StringPiece& context,
Anchor anchor, MatchKind kind, StringPiece* match0,
bool* failed, std::vector<int>* matches);
bool* failed, SparseSet* matches);

// The callback issued after building each DFA state with BuildEntireDFA().
// If next is null, then the memory budget has been exhausted and building
Expand Down Expand Up @@ -303,7 +313,8 @@ class Prog {
StringPiece* match, int nmatch);

// Bit-state backtracking. Fast on small cases but uses memory
// proportional to the product of the program size and the text size.
// proportional to the product of the list count and the text size.
bool CanBitState() { return list_heads_.data() != NULL; }
bool SearchBitState(const StringPiece& text, const StringPiece& context,
Anchor anchor, MatchKind kind,
StringPiece* match, int nmatch);
Expand Down Expand Up @@ -335,16 +346,15 @@ class Prog {
// do not compile down to infinite repetitions.
//
// Returns true on success, false on error.
bool PossibleMatchRange(string* min, string* max, int maxlen);
bool PossibleMatchRange(std::string* min, std::string* max, int maxlen);

// EXPERIMENTAL! SUBJECT TO CHANGE!
// Outputs the program fanout into the given sparse array.
void Fanout(SparseArray<int>* fanout);

// Compiles a collection of regexps to Prog. Each regexp will have
// its own Match instruction recording the index in the vector.
static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor,
Regexp* re);
// its own Match instruction recording the index in the output vector.
static Prog* CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem);

// Flattens the Prog from "tree" form to "list" form. This is an in-place
// operation in the sense that the old instructions are lost.
Expand Down Expand Up @@ -373,6 +383,9 @@ class Prog {
std::vector<Inst>* flat,
SparseSet* reachable, std::vector<int>* stk);

// Computes hints for ByteRange instructions in [begin, end).
void ComputeHints(std::vector<Inst>* flat, int begin, int end);

private:
friend class Compiler;

Expand All @@ -392,11 +405,13 @@ class Prog {
int first_byte_; // required first byte for match, or -1 if none
int flags_; // regexp parse flags

int list_count_; // count of lists (see above)
int inst_count_[kNumInst]; // count of instructions by opcode
int list_count_; // count of lists (see above)
int inst_count_[kNumInst]; // count of instructions by opcode
PODArray<uint16_t> list_heads_; // sparse array enumerating list heads
// not populated if size_ is overly large

Inst* inst_; // pointer to instruction array
uint8_t* onepass_nodes_; // data for OnePass nodes
PODArray<Inst> inst_; // pointer to instruction array
PODArray<uint8_t> onepass_nodes_; // data for OnePass nodes

int64_t dfa_mem_; // Maximum memory for DFAs.
DFA* dfa_first_; // DFA cached for kFirstMatch/kManyMatch
Expand Down
Loading

0 comments on commit 54b6a7c

Please sign in to comment.