From 3d4beb1c476fc9111a51f929d410440497e7d0a3 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Mon, 22 Apr 2024 16:55:47 -0400 Subject: [PATCH 1/6] re_strings: add support for endids. This adds an extra parameter to `re_strings_add_str` and `re_strings_add_raw` that (if non-NULL) will associate a single endid with the string being added. When `re_strings_build` constructs the DFA it will produce a separate end state for each end. This needs further testing with multiple overlapping patterns. When multiple literal strings appear in the input only the latest match will be reported. --- include/fsm/fsm.h | 6 ++++++ include/re/strings.h | 6 +++--- src/libfsm/endids.c | 10 ++++++++++ src/libfsm/libfsm.syms | 1 + src/libre/ac.c | 12 ++++++++++-- src/libre/ac.h | 5 ++++- src/libre/re_strings.c | 10 +++++----- 7 files changed, 39 insertions(+), 11 deletions(-) diff --git a/include/fsm/fsm.h b/include/fsm/fsm.h index 7c3883749..2b4c438f1 100644 --- a/include/fsm/fsm.h +++ b/include/fsm/fsm.h @@ -207,6 +207,12 @@ fsm_setend(struct fsm *fsm, fsm_state_t state, int end); int fsm_setendid(struct fsm *fsm, fsm_end_id_t id); +/* Associate a numeric ID with a specific end state in an fsm. + * Returns 1 on success, 0 on error. + * */ +int +fsm_setendidstate(struct fsm *fsm, fsm_state_t end_state, fsm_end_id_t id); + /* Get the end IDs associated with an end state, if any. 
* If id_buf has enough cells to store all the end IDs (according * to id_buf_count) then they are written into id_buf[] and diff --git a/include/re/strings.h b/include/re/strings.h index 06387a54c..fe4f7adc2 100644 --- a/include/re/strings.h +++ b/include/re/strings.h @@ -7,7 +7,7 @@ #ifndef RE_STRINGS_H #define RE_STRINGS_H -struct fsm; +#include <fsm/fsm.h> struct fsm_options; struct re_strings; @@ -42,10 +42,10 @@ void re_strings_free(struct re_strings *g); int -re_strings_add_raw(struct re_strings *g, const void *p, size_t n); +re_strings_add_raw(struct re_strings *g, const void *p, size_t n, const fsm_end_id_t *endid); int -re_strings_add_str(struct re_strings *g, const char *s); +re_strings_add_str(struct re_strings *g, const char *s, const fsm_end_id_t *endid); struct fsm * re_strings_build(struct re_strings *g, diff --git a/src/libfsm/endids.c b/src/libfsm/endids.c index 444ccbc2e..1fc98ae68 100644 --- a/src/libfsm/endids.c +++ b/src/libfsm/endids.c @@ -84,6 +84,16 @@ fsm_setendid(struct fsm *fsm, fsm_end_id_t id) return 1; } +int +fsm_setendidstate(struct fsm *fsm, fsm_state_t end_state, fsm_end_id_t id) +{ + enum fsm_endid_set_res sres = fsm_endid_set(fsm, end_state, id); + if (sres == FSM_ENDID_SET_ERROR_ALLOC_FAIL) { + return 0; + } + return 1; +} + enum fsm_getendids_res fsm_getendids(const struct fsm *fsm, fsm_state_t end_state, size_t id_buf_count, fsm_end_id_t *id_buf, diff --git a/src/libfsm/libfsm.syms b/src/libfsm/libfsm.syms index 415bffbea..a2570b8c9 100644 --- a/src/libfsm/libfsm.syms +++ b/src/libfsm/libfsm.syms @@ -89,6 +89,7 @@ fsm_getendids fsm_setendid fsm_mapendids fsm_increndids +fsm_setendidstate fsm_countedges fsm_countstates diff --git a/src/libre/ac.c b/src/libre/ac.c index 2ebdb4f7e..d5f8e6f4b 100644 --- a/src/libre/ac.c +++ b/src/libre/ac.c @@ -16,6 +16,7 @@ #include "ac.h" +#define ENDID_NONE ((fsm_end_id_t)-1) enum { POOL_BLOCK_SIZE = 256 }; struct trie_state { @@ -25,6 +26,7 @@ struct trie_state { unsigned int index; unsigned int output:1; 
unsigned int have_st:1; + fsm_end_id_t endid; /* or ENDID_NONE */ }; struct trie_pool { @@ -126,7 +128,7 @@ trie_create(void) } struct trie_graph * -trie_add_word(struct trie_graph *g, const char *w, size_t n) +trie_add_word(struct trie_graph *g, const char *w, size_t n, const fsm_end_id_t *endid) { struct trie_state *st; size_t i; @@ -159,6 +161,7 @@ trie_add_word(struct trie_graph *g, const char *w, size_t n) g->depth = n; } + st->endid = (endid == NULL ? ENDID_NONE : *endid); return g; } @@ -278,7 +281,7 @@ trie_to_fsm_state(struct trie_state *ts, struct fsm *fsm, assert(fsm != NULL); assert(q != NULL); - if (ts->output && have_end) { + if (ts->output && have_end && ts->endid == ENDID_NONE) { *q = single_end; return 1; } @@ -315,6 +318,11 @@ trie_to_fsm_state(struct trie_state *ts, struct fsm *fsm, if (ts->output) { fsm_setend(fsm, st, 1); + if (ts->endid != ENDID_NONE) { + if (!fsm_setendidstate(fsm, st, ts->endid)) { + return 0; + } + } } *q = st; diff --git a/src/libre/ac.h b/src/libre/ac.h index 480c76bfb..edabca27c 100644 --- a/src/libre/ac.h +++ b/src/libre/ac.h @@ -7,6 +7,8 @@ #ifndef AC_H #define AC_H +#include "fsm/fsm.h" + struct fsm; struct fsm_state; struct fsm_options; @@ -20,7 +22,8 @@ void trie_free(struct trie_graph *g); struct trie_graph * -trie_add_word(struct trie_graph *g, const char *w, size_t n); +trie_add_word(struct trie_graph *g, const char *w, size_t n, + const fsm_end_id_t *endid); int trie_add_failure_edges(struct trie_graph *g); diff --git a/src/libre/re_strings.c b/src/libre/re_strings.c index 06b7dc772..b2f04fec1 100644 --- a/src/libre/re_strings.c +++ b/src/libre/re_strings.c @@ -32,7 +32,7 @@ re_strings(const struct fsm_options *opt, const char *a[], size_t n, } for (i = 0; i < n; i++) { - if (!re_strings_add_str(g, a[i])) { + if (!re_strings_add_str(g, a[i], NULL)) { goto error; } } @@ -64,20 +64,20 @@ re_strings_free(struct re_strings *g) } int -re_strings_add_raw(struct re_strings *g, const void *p, size_t n) 
+re_strings_add_raw(struct re_strings *g, const void *p, size_t n, const fsm_end_id_t *endid) { assert(p != NULL); assert(n > 0); - return trie_add_word((struct trie_graph *) g, p, n) != NULL; + return trie_add_word((struct trie_graph *) g, p, n, endid) != NULL; } int -re_strings_add_str(struct re_strings *g, const char *s) +re_strings_add_str(struct re_strings *g, const char *s, const fsm_end_id_t *endid) { assert(s != NULL); - return re_strings_add_raw(g, s, strlen(s)); + return re_strings_add_raw(g, s, strlen(s), endid); } struct fsm * From 986144b5a10429f74fe1e99b346bca55569a8e16 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Mon, 29 Apr 2024 10:57:06 -0400 Subject: [PATCH 2/6] Add tests/re_strings. Switch to a set of endids. This uses a `struct state_set` since sizeof(fsm_state_t) == sizeof(fsm_end_id_t), and it's probably not worth making a separate ADT just for these. The second test checks that duplicated strings get all their endids set. The previous implementation (a single endid, or ENDID_NONE) dropped all but the last endid defined. 
--- Makefile | 1 + src/libre/ac.c | 22 ++++++++--- tests/re_strings/Makefile | 26 +++++++++++++ tests/re_strings/re_strings1.c | 21 +++++++++++ tests/re_strings/re_strings2.c | 17 +++++++++ tests/re_strings/testutil.c | 69 ++++++++++++++++++++++++++++++++++ tests/re_strings/testutil.h | 11 ++++++ 7 files changed, 161 insertions(+), 6 deletions(-) create mode 100644 tests/re_strings/Makefile create mode 100644 tests/re_strings/re_strings1.c create mode 100644 tests/re_strings/re_strings2.c create mode 100644 tests/re_strings/testutil.c create mode 100644 tests/re_strings/testutil.h diff --git a/Makefile b/Makefile index f1f4f1396..499239fd9 100644 --- a/Makefile +++ b/Makefile @@ -131,6 +131,7 @@ SUBDIR += tests/pcre-flags SUBDIR += tests/pcre-repeat SUBDIR += tests/pred SUBDIR += tests/re_literal +SUBDIR += tests/re_strings SUBDIR += tests/reverse SUBDIR += tests/trim SUBDIR += tests/union diff --git a/src/libre/ac.c b/src/libre/ac.c index d5f8e6f4b..121eed487 100644 --- a/src/libre/ac.c +++ b/src/libre/ac.c @@ -13,10 +13,10 @@ #include #include +#include #include "ac.h" -#define ENDID_NONE ((fsm_end_id_t)-1) enum { POOL_BLOCK_SIZE = 256 }; struct trie_state { @@ -26,7 +26,9 @@ struct trie_state { unsigned int index; unsigned int output:1; unsigned int have_st:1; - fsm_end_id_t endid; /* or ENDID_NONE */ + + /* use a state set as an endid set */ + struct state_set *endids; }; struct trie_pool { @@ -75,6 +77,7 @@ newstate(struct trie_graph *g) st->index = ++g->nstates; st->output = 0; + st->endids = NULL; return st; } @@ -161,7 +164,9 @@ trie_add_word(struct trie_graph *g, const char *w, size_t n, const fsm_end_id_t g->depth = n; } - st->endid = (endid == NULL ? 
ENDID_NONE : *endid); + if (endid != NULL) { + state_set_add(&st->endids, NULL, (fsm_state_t)*endid); + } return g; } @@ -281,7 +286,7 @@ trie_to_fsm_state(struct trie_state *ts, struct fsm *fsm, assert(fsm != NULL); assert(q != NULL); - if (ts->output && have_end && ts->endid == ENDID_NONE) { + if (ts->output && have_end && state_set_empty(ts->endids)) { *q = single_end; return 1; } @@ -318,8 +323,13 @@ trie_to_fsm_state(struct trie_state *ts, struct fsm *fsm, if (ts->output) { fsm_setend(fsm, st, 1); - if (ts->endid != ENDID_NONE) { - if (!fsm_setendidstate(fsm, st, ts->endid)) { + + struct state_iter si; + fsm_state_t state; + state_set_reset(ts->endids, &si); + while (state_set_next(&si, &state)) { + fsm_end_id_t endid = (fsm_end_id_t)state; + if (!fsm_setendidstate(fsm, st, endid)) { return 0; } } diff --git a/tests/re_strings/Makefile b/tests/re_strings/Makefile new file mode 100644 index 000000000..7fc7f2548 --- /dev/null +++ b/tests/re_strings/Makefile @@ -0,0 +1,26 @@ +.include "../../share/mk/top.mk" + +TEST.tests/re_strings != ls -1 tests/re_strings/re_strings*.c +TEST_SRCDIR.tests/re_strings = tests/re_strings +TEST_OUTDIR.tests/re_strings = ${BUILD}/tests/re_strings + +.for n in ${TEST.tests/re_strings:T:R:C/^re_strings//} +test:: ${TEST_OUTDIR.tests/re_strings}/res${n} +SRC += ${TEST_SRCDIR.tests/re_strings}/re_strings${n}.c +CFLAGS.${TEST_SRCDIR.tests/re_strings}/re_strings${n}.c = -UNDEBUG + +${TEST_OUTDIR.tests/re_strings}/run${n}: ${TEST_OUTDIR.tests/re_strings}/re_strings${n}.o ${TEST_OUTDIR.tests/re_strings}/testutil.o + ${CC} ${CFLAGS} -o ${TEST_OUTDIR.tests/re_strings}/run${n} ${TEST_OUTDIR.tests/re_strings}/re_strings${n}.o ${TEST_OUTDIR.tests/re_strings}/testutil.o ${BUILD}/lib/libfsm.a ${BUILD}/lib/libre.a + +${TEST_OUTDIR.tests/re_strings}/re_strings${n}.o: tests/re_strings/testutil.h + +${TEST_OUTDIR.tests/re_strings}/res${n}: ${TEST_OUTDIR.tests/re_strings}/run${n} + ( ${TEST_OUTDIR.tests/re_strings}/run${n} 1>&2 && echo PASS || echo 
FAIL ) > ${TEST_OUTDIR.tests/re_strings}/res${n} + +.for lib in ${LIB:Mlibfsm} ${LIB:Mlibre} +${TEST_OUTDIR.tests/re_strings}/run${n}: ${BUILD}/lib/${lib:R}.a +.endfor +.endfor + +${TEST_OUTDIR.tests/re_strings}/testutil.o: tests/re_strings/testutil.c + ${CC} ${CFLAGS} -c -o ${TEST_OUTDIR.tests/re_strings}/testutil.o tests/re_strings/testutil.c diff --git a/tests/re_strings/re_strings1.c b/tests/re_strings/re_strings1.c new file mode 100644 index 000000000..44f10d41d --- /dev/null +++ b/tests/re_strings/re_strings1.c @@ -0,0 +1,21 @@ +#include "testutil.h" + +const char *strings[] = { + "aa", + "ab", + "ac", + "ba", + "bb", + "bc", + "ca", + "cb", + "cc", + NULL, +}; + +int main(int argc, char **argv) +{ + (void)argc; + (void)argv; + return run_test(strings); +} diff --git a/tests/re_strings/re_strings2.c b/tests/re_strings/re_strings2.c new file mode 100644 index 000000000..6e4c80459 --- /dev/null +++ b/tests/re_strings/re_strings2.c @@ -0,0 +1,17 @@ +#include "testutil.h" + +const char *strings[] = { + "first", + "duplicate", + "duplicate", + "duplicate", + "last", + NULL, +}; + +int main(int argc, char **argv) +{ + (void)argc; + (void)argv; + return run_test(strings); +} diff --git a/tests/re_strings/testutil.c b/tests/re_strings/testutil.c new file mode 100644 index 000000000..078ca4f4c --- /dev/null +++ b/tests/re_strings/testutil.c @@ -0,0 +1,69 @@ +#include "testutil.h" + +#include +#include + +#include "fsm/fsm.h" +#include "fsm/options.h" + +#include "re/re.h" +#include "re/strings.h" + +static struct fsm_options opt; + +#define MAX_INPUTS 100 +static fsm_end_id_t id_buf[MAX_INPUTS]; + +int +run_test(const char **strings) +{ + struct re_strings *s = re_strings_new(); + assert(s != NULL); + + fsm_end_id_t id = 0; + const char **input = strings; + while (*input != NULL) { + if (!re_strings_add_str(s, *input, &id)) { + assert(!"re_strings_add_str"); + } + + input++; + id++; + assert(id < MAX_INPUTS); + } + + const int flags = 0; /* not anchored */ + + struct 
fsm *fsm = re_strings_build(s, &opt, flags); + assert(fsm != NULL); + + /* Each literal string input should match, and the set of + * matching endids should include the expected one. */ + id = 0; + input = strings; + while (*input != NULL) { + fsm_state_t end; + const char **string = input; + const int res = fsm_exec(fsm, fsm_sgetc, string, &end, NULL); + assert(res > 0); /* match */ + + size_t written; + enum fsm_getendids_res eres = fsm_getendids(fsm, end, + MAX_INPUTS, id_buf, &written); + assert(eres == FSM_GETENDIDS_FOUND); + bool found = false; + for (size_t i = 0; i < written; i++) { + if (id_buf[i] == id) { + found = true; + break; + } + } + assert(found); + + input++; + id++; + } + + re_strings_free(s); + return EXIT_SUCCESS; +} diff --git a/tests/re_strings/testutil.h b/tests/re_strings/testutil.h new file mode 100644 index 000000000..6898200b7 --- /dev/null +++ b/tests/re_strings/testutil.h @@ -0,0 +1,11 @@ +#ifndef TESTUTIL_H +#define TESTUTIL_H + +#include +#include +#include + +int +run_test(const char **strings); + +#endif From b208d59e743e976ca73d8f973b7ccaf361986819 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Mon, 29 Apr 2024 11:03:33 -0400 Subject: [PATCH 3/6] ac: free the endid set. --- src/libre/ac.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/libre/ac.c b/src/libre/ac.c index 121eed487..bca129c2a 100644 --- a/src/libre/ac.c +++ b/src/libre/ac.c @@ -91,6 +91,10 @@ cleanup_pool(struct trie_graph *g) p = g->pool; g->pool = p->next; + for (size_t i = 0; i < p->n; i++) { + state_set_free(p->states[i].endids); + } + free(p->states); free(p); } From 43439c01b7bc7b54ac398dc56eb6c605cf94a6a8 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Mon, 29 Apr 2024 11:06:13 -0400 Subject: [PATCH 4/6] tests/re_strings/testutil.c: Free the fsm after the test. 
--- tests/re_strings/testutil.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/re_strings/testutil.c b/tests/re_strings/testutil.c index 078ca4f4c..4f4678b0d 100644 --- a/tests/re_strings/testutil.c +++ b/tests/re_strings/testutil.c @@ -65,5 +65,7 @@ run_test(const char **strings) } re_strings_free(s); + fsm_free(fsm); + return EXIT_SUCCESS; } From 8b69338585e18bf43784678bc98ab550d631e549 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Tue, 30 Apr 2024 11:38:32 -0400 Subject: [PATCH 5/6] ac: Move field for better struct packing. --- src/libre/ac.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/libre/ac.c b/src/libre/ac.c index bca129c2a..efe13df36 100644 --- a/src/libre/ac.c +++ b/src/libre/ac.c @@ -22,13 +22,13 @@ enum { POOL_BLOCK_SIZE = 256 }; struct trie_state { struct trie_state *children[256]; struct trie_state *fail; + /* use a state set as an endid set */ + struct state_set *endids; + fsm_state_t st; unsigned int index; unsigned int output:1; unsigned int have_st:1; - - /* use a state set as an endid set */ - struct state_set *endids; }; struct trie_pool { From 7d9b8621048fb71e92a2863b5dc738fcd066b8fc Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Wed, 8 May 2024 09:15:56 -0400 Subject: [PATCH 6/6] re_strings: Add tests for ALL duplicates and for the empty set. 
--- tests/re_strings/re_strings3.c | 15 +++++++++++++++ tests/re_strings/re_strings4.c | 13 +++++++++++++ 2 files changed, 28 insertions(+) create mode 100644 tests/re_strings/re_strings3.c create mode 100644 tests/re_strings/re_strings4.c diff --git a/tests/re_strings/re_strings3.c b/tests/re_strings/re_strings3.c new file mode 100644 index 000000000..3f4f1d052 --- /dev/null +++ b/tests/re_strings/re_strings3.c @@ -0,0 +1,15 @@ +#include "testutil.h" + +const char *strings[] = { + "duplicate", + "duplicate", + "duplicate", + NULL, +}; + +int main(int argc, char **argv) +{ + (void)argc; + (void)argv; + return run_test(strings); +} diff --git a/tests/re_strings/re_strings4.c b/tests/re_strings/re_strings4.c new file mode 100644 index 000000000..b1e00e70f --- /dev/null +++ b/tests/re_strings/re_strings4.c @@ -0,0 +1,13 @@ +#include "testutil.h" + +const char *strings[] = { + /* empty */ + NULL, +}; + +int main(int argc, char **argv) +{ + (void)argc; + (void)argv; + return run_test(strings); +}