diff --git a/src/libfsm/minimise.c b/src/libfsm/minimise.c index 60cec48c5..f1ff9fffc 100644 --- a/src/libfsm/minimise.c +++ b/src/libfsm/minimise.c @@ -25,6 +25,7 @@ #include #include #include +#include #include "internal.h" #include "capture.h" @@ -38,6 +39,21 @@ #include "minimise_internal.h" #include "minimise_test_oracle.h" +static int +split_ecs_by_end_metadata(struct min_env *env, const struct fsm *fsm); + +#define DEF_CAPTURE_ID_CEIL 4 +struct end_metadata { + struct end_metadata_end { + unsigned count; + fsm_end_id_t *ids; + } end; +}; + +static int +collect_end_ids(const struct fsm *fsm, fsm_state_t s, + struct end_metadata_end *e); + int fsm_minimise(struct fsm *fsm) { @@ -253,6 +269,12 @@ build_minimised_mapping(const struct fsm *fsm, goto cleanup; } + /* This only needs to be run once, but must run before the main + * fixpoint loop below, because it potentially refines ECs. */ + if (!split_ecs_by_end_metadata(&env, fsm)) { + goto cleanup; + } + #if LOG_INIT for (i = 0; i < env.ec_count; i++) { fprintf(stderr, "# --ec[%lu]: %d\n", i, env.ecs[i]); @@ -646,6 +668,307 @@ populate_initial_ecs(struct min_env *env, const struct fsm *fsm, #endif } +SUPPRESS_EXPECTED_UNSIGNED_INTEGER_OVERFLOW() +static void +incremental_hash_of_ids(uint64_t *accum, fsm_end_id_t id) +{ + (*accum) += hash_id(id); +} + +static int +same_end_metadata(const struct end_metadata *a, const struct end_metadata *b) +{ + if (a->end.count != b->end.count) { + return 0; + } + + /* compare -- these must be sorted */ + + for (size_t i = 0; i < a->end.count; i++) { + if (a->end.ids[i] != b->end.ids[i]) { + return 0; + } + } + + return 1; +} + +static int +split_ecs_by_end_metadata(struct min_env *env, const struct fsm *fsm) +{ + int res = 0; + + struct end_metadata *end_md; + fsm_state_t *htab = NULL; + + const size_t state_count = fsm_countstates(fsm); + +#if EXPENSIVE_INTEGRITY_CHECKS + /* Invariant: For each EC, either all or none of the states + * are end states. 
We only partition the set(s) of end states + * here. */ + all_end_states_are_currently_together(env); +#endif + + /* Use the hash table to assign to new groups. */ + + end_md = f_calloc(fsm->opt->alloc, + state_count, sizeof(end_md[0])); + if (end_md == NULL) { + goto cleanup; + } + + size_t bucket_count = 1; + while (bucket_count < state_count) { + bucket_count *= 2; /* power of 2 ceiling */ + } + const size_t mask = bucket_count - 1; + + htab = f_malloc(fsm->opt->alloc, + bucket_count * sizeof(htab[0])); + if (htab == NULL) { + goto cleanup; + } + + /* First pass: collect end state metadata */ + for (size_t ec_i = 0; ec_i < env->ec_count; ec_i++) { + fsm_state_t s = MASK_EC_HEAD(env->ecs[ec_i]); +#if LOG_ECS + fprintf(stderr, "## EC %zu\n", ec_i); +#endif + while (s != NO_ID) { + struct end_metadata *e = &end_md[s]; + if (!fsm_isend(fsm, s)) { + break; /* this EC has non-end states, skip */ + } + + if (!collect_end_ids(fsm, s, &e->end)) { + goto cleanup; + } + + s = env->jump[s]; + } + } + +#if LOG_ECS + fprintf(stderr, "==== BEFORE PARTITIONING BY END METADATA\n"); + dump_ecs(stderr, env); + fprintf(stderr, "====\n"); +#endif + + /* Second pass: partition ECs into groups with identical end IDs. + * for each group with different end IDs, unlink them. 
*/ + const size_t max_ec = env->ec_count; + for (size_t ec_i = 0; ec_i < max_ec; ec_i++) { + fsm_state_t s = MASK_EC_HEAD(env->ecs[ec_i]); + fsm_state_t prev = NO_ID; + + for (size_t i = 0; i < bucket_count; i++) { + htab[i] = NO_ID; /* reset hash table */ + } + + while (s != NO_ID) { + const struct end_metadata *s_md = &end_md[s]; + + uint64_t hash = 0; + const fsm_state_t next = env->jump[s]; + + for (size_t eid_i = 0; eid_i < s_md->end.count; eid_i++) { + incremental_hash_of_ids(&hash, s_md->end.ids[eid_i]); + } + + for (size_t b_i = 0; b_i < bucket_count; b_i++) { + fsm_state_t *b = &htab[(b_i + hash) & mask]; + const fsm_state_t other = *b; + const struct end_metadata *other_md = &end_md[other]; + + if (other == NO_ID) { /* empty hash bucket */ + *b = s; + if (prev == NO_ID) { + /* keep the first state, along with other states + * with matching end IDs, in this EC. no-op. */ +#if LOG_ECS + fprintf(stderr, " -- keeping state s %d in EC %u\n", + s, env->state_ecs[s]); +#endif + prev = s; + } else { /* not first (prev is set), so it landed somewhere else */ + /* unlink and assign new EC */ +#if LOG_ECS + fprintf(stderr, " -- moving state s %d from EC %u to EC %u\n", + s, env->state_ecs[s], env->ec_count); +#endif + env->jump[prev] = env->jump[s]; /* unlink */ + env->ecs[env->ec_count] = s; /* head of new EC */ + env->state_ecs[s] = env->ec_count; + env->jump[s] = NO_ID; + env->ec_count++; + } + break; + } else if (same_end_metadata(s_md, other_md)) { + if (env->state_ecs[other] == ec_i) { + /* keep in the current EC -- no-op */ +#if LOG_ECS + fprintf(stderr, " -- keeping state s %d in EC %u\n", + s, env->state_ecs[s]); +#endif + prev = s; + } else { + /* unlink and link to other state's EC */ +#if LOG_ECS + fprintf(stderr, " -- appending s %d to EC %u, after state %d, before %d\n", + s, env->state_ecs[other], other, env->jump[other]); +#endif + assert(prev != NO_ID); + env->jump[prev] = env->jump[s]; /* unlink */ + env->state_ecs[s] = env->state_ecs[other]; + 
env->jump[s] = env->jump[other]; + env->jump[other] = s; /* link after other */ + } + break; + } else { + continue; /* collision */ + } + } + + s = next; + } + + /* If this EC only has one entry and it's before the + * done_ec_offset, then set that here so that invariants + * will be restored while sweeping forward after this loop. */ + + if (env->jump[MASK_EC_HEAD(env->ecs[ec_i])] == NO_ID && ec_i < env->done_ec_offset) { + env->done_ec_offset = ec_i; /* will be readjusted later */ + } + +#if LOG_ECS + fprintf(stderr, "==== AFTER PARTITIONING BY END METADATA -- EC %zu\n", ec_i); + dump_ecs(stderr, env); + fprintf(stderr, "==== (done_ec_offset: %d)\n", env->done_ec_offset); +#endif + } + +#if LOG_ECS + fprintf(stderr, "==== AFTER PARTITIONING BY END IDs\n"); + dump_ecs(stderr, env); + fprintf(stderr, "==== (done_ec_offset: %d)\n", env->done_ec_offset); +#endif + + /* Sweep forward and swap ECs as necessary so all single-entry + * ECs are at the end -- they're done. */ + size_t ec_i = env->done_ec_offset; + + while (ec_i < env->ec_count) { + const fsm_state_t head = MASK_EC_HEAD(env->ecs[ec_i]); + if (env->jump[head] == NO_ID) { + /* offset stays where it is */ +#if LOG_ECS + fprintf(stderr, "ec_i: %zu / %u -- branch a\n", ec_i, env->ec_count); +#endif + env->ecs[ec_i] = SET_SMALL_EC_FLAG(head); + } else { + /* this EC has more than one state, but is after + * the done_ec_offset, so swap it with an EC at + * the boundary. */ + const fsm_state_t n_ec_i = env->done_ec_offset; +#if LOG_ECS + fprintf(stderr, "ec_i: %zu / %u -- branch b -- swap %ld and %d\n", + ec_i, env->ec_count, ec_i, n_ec_i); +#endif + + /* swap ec[n_ec_i] and ec[ec_i] */ + const fsm_state_t tmp = env->ecs[ec_i]; + env->ecs[ec_i] = env->ecs[n_ec_i]; + env->ecs[n_ec_i] = tmp; + /* note: this may set the SMALL_EC_FLAG. 
*/ + update_ec_links(env, ec_i); + update_ec_links(env, n_ec_i); + env->done_ec_offset++; + } + ec_i++; + } + +#if LOG_ECS + fprintf(stderr, "==== (done_ec_offset is now: %d, ec_count %u)\n", env->done_ec_offset, env->ec_count); + dump_ecs(stderr, env); +#endif + + /* check that all ECs are before/after done_ec_offset */ + for (size_t ec_i = 0; ec_i < env->ec_count; ec_i++) { + const fsm_state_t s = MASK_EC_HEAD(env->ecs[ec_i]); +#if LOG_ECS + fprintf(stderr, " -- ec_i %zu: s %d\n", ec_i, s); +#endif + if (ec_i < env->done_ec_offset) { + assert(env->jump[s] != NO_ID); + } else { + assert(env->jump[s] == NO_ID); + } + } + + res = 1; + +cleanup: + if (htab != NULL) { + f_free(fsm->opt->alloc, htab); + } + if (end_md != NULL) { + size_t i; + for (i = 0; i < state_count; i++) { + struct end_metadata *e = &end_md[i]; + if (e->end.ids != NULL) { + f_free(fsm->opt->alloc, e->end.ids); + } + } + f_free(fsm->opt->alloc, end_md); + } + + return res; +} + +static int +cmp_end_ids(const void *pa, const void *pb) +{ + const fsm_end_id_t a = *(fsm_end_id_t *)pa; + const fsm_end_id_t b = *(fsm_end_id_t *)pb; + return a < b ? -1 : a > b ? 
1 : 0; +} + +static int +collect_end_ids(const struct fsm *fsm, fsm_state_t s, + struct end_metadata_end *e) +{ + e->count = fsm_getendidcount(fsm, s); + + if (e->count > 0) { + e->ids = f_malloc(fsm->opt->alloc, + e->count * sizeof(e->ids[0])); + if (e->ids == NULL) { + return 0; + } + + size_t written; + enum fsm_getendids_res res = fsm_getendids(fsm, s, + e->count, e->ids, &written); + assert(res == FSM_GETENDIDS_FOUND); + assert(written == e->count); + + /* sort, to make comparison easier later */ + qsort(e->ids, e->count, + sizeof(e->ids[0]), cmp_end_ids); + +#if LOG_ECS + fprintf(stderr, "%d:", s); + for (size_t i = 0; i < written; i++) { + fprintf(stderr, " %u", e->ids[i]); + } + fprintf(stderr, "\n"); +#endif + } + return 1; +} + #if EXPENSIVE_INTEGRITY_CHECKS static void check_done_ec_offset(const struct min_env *env) diff --git a/tests/endids/endids10_minimise_partial_overlap.c b/tests/endids/endids10_minimise_partial_overlap.c new file mode 100644 index 000000000..69b70249d --- /dev/null +++ b/tests/endids/endids10_minimise_partial_overlap.c @@ -0,0 +1,82 @@ +#include +#include +#include +#include +#include + +#include + +#include + +#include + +#include +#include +#include +#include +#include + +#include "endids_utils.h" + +#define ENDID_AB_STAR_C 1 +#define ENDID_ABC 2 + +int main(void) +{ + /* Union /^ab*c$/ and /^abc$/ with distinct endids set on each + * and verify that inputs of "ac", "abc", "abbc" get the endids + * associated with only the regexes that match. + * + * In other words, check that minimisation does not merge them + * and cause false positives. 
*/ + + const char *regex_ab_star_c = "^ab*c$"; + const char *regex_abc = "^abc$"; + + struct fsm *fsm_ab_star_c = re_comp(RE_NATIVE, fsm_sgetc, (void *)&regex_ab_star_c, NULL, 0, NULL); + assert(fsm_ab_star_c != NULL); + if (!fsm_setendid(fsm_ab_star_c, ENDID_AB_STAR_C)) { assert(!"setendid"); } + + if (!fsm_determinise(fsm_ab_star_c)) { assert(!"determinise"); } + if (!fsm_minimise(fsm_ab_star_c)) { assert(!"minimise"); } + + struct fsm *fsm_abc = re_comp(RE_NATIVE, fsm_sgetc, (void *)&regex_abc, NULL, 0, NULL); + assert(fsm_abc != NULL); + if (!fsm_setendid(fsm_abc, ENDID_ABC)) { assert(!"setendid"); } + + if (!fsm_determinise(fsm_abc)) { assert(!"determinise"); } + if (!fsm_minimise(fsm_abc)) { assert(!"minimise"); } + + struct fsm *combined = fsm_union(fsm_ab_star_c, fsm_abc, NULL); + assert(combined != NULL); + + int ret = fsm_determinise(combined); + assert(ret != 0); + + ret = fsm_minimise(combined); + assert(ret != 0); + + size_t written; + fsm_end_id_t *endids = NULL; + if (match_string(combined, "ac", NULL, &endids, &written) != 1) { + assert(!"'ac' should match"); + } + assert(written == 1); + assert(endids[0] == ENDID_AB_STAR_C); + + if (match_string(combined, "abc", NULL, &endids, &written) != 1) { + assert(!"'abc' should match"); + } + assert(written == 2); + /* result is not sorted */ + assert((endids[0] == ENDID_AB_STAR_C && endids[1] == ENDID_ABC) || + (endids[1] == ENDID_AB_STAR_C && endids[0] == ENDID_ABC)); + + if (match_string(combined, "abbc", NULL, &endids, &written) != 1) { + assert(!"'abbc' should match"); + } + assert(written == 1); + assert(endids[0] == ENDID_AB_STAR_C); + + fsm_free(combined); +} diff --git a/tests/endids/endids2_union_many_endids.c b/tests/endids/endids2_union_many_endids.c index 8e39ca93d..46f2a189e 100644 --- a/tests/endids/endids2_union_many_endids.c +++ b/tests/endids/endids2_union_many_endids.c @@ -22,6 +22,7 @@ // to exercise reallocs in the ID list // // Each pattern gets 5 end ids +// Note: These are unanchored and 
therefore partially overlap static const char *patterns[] = { "abc", // 1-5 "def", // 6-10 @@ -175,7 +176,7 @@ int main(void) ret = fsm_determinise(fsm); assert(ret != 0); - // find end states, make sure we have two end states and they each have endids + // find end states, make sure we have multiple end states and they each have endids nstates = fsm_countstates(fsm); for (state_ind = 0; state_ind < nstates; state_ind++) { @@ -238,19 +239,18 @@ int main(void) } } - /* fsm_minimise currently collapses all end states to the same state. This should - * create a single end state with NUM_ENDIDS_TOTAL end ids - */ ret = fsm_minimise(fsm); assert(ret != 0); - assert( fsm_count(fsm, fsm_isend) == 1 ); + /* fsm_minimise should not collapse all the end states to a + * single end state, because they have distinct endids. */ + assert( fsm_count(fsm, fsm_isend) > 1); nstates = fsm_countstates(fsm); for (state_ind = 0; state_ind < nstates; state_ind++) { if (fsm_isend(fsm, state_ind)) { fsm_end_id_t endids[NUM_ENDIDS_TOTAL]; - size_t nwritten, num_endids, j; + size_t nwritten, num_endids; enum fsm_getendids_res ret; memset(&endids[0], 0, sizeof endids); @@ -258,7 +258,7 @@ int main(void) nwritten = 0; num_endids = fsm_getendidcount(fsm, state_ind); - assert(num_endids == NUM_ENDIDS_TOTAL); + assert(num_endids <= NUM_ENDIDS_TOTAL); ret = fsm_getendids( fsm, @@ -269,10 +269,6 @@ int main(void) assert(ret == FSM_GETENDIDS_FOUND); assert(nwritten == num_endids); - - for (j=0; j < num_endids; j++) { - assert( endids[j] == j+1 ); - } } }