Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Prefix needle optimization #148

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
182 changes: 178 additions & 4 deletions include/ctre/evaluation.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -115,11 +115,24 @@ template <typename CharT, typename Iterator, typename EndIterator> constexpr CTR
return false;
}

struct zero_terminated_string_end_iterator;
// Matches the literal `String...` against the input that starts at `current`.
//
// Returns a string_match_result holding the iterator where matching stopped and
// whether the whole literal matched.
//
// Two strategies are used:
//  * random-access iterators with a real end iterator: one length check up
//    front, then an indexed comparison of every character;
//  * everything else (including the zero-terminated sentinel and, when char8_t
//    is available, utf8_iterator): per-character comparison delegated to
//    compare_character.
//
// The previous unconditional `return {current, same};` ahead of the
// `if constexpr` made both strategies unreachable and has been removed.
template <auto... String, size_t... Idx, typename Iterator, typename EndIterator> constexpr CTRE_FORCE_INLINE string_match_result<Iterator> evaluate_match_string(Iterator current, [[maybe_unused]] const EndIterator end, std::index_sequence<Idx...>) noexcept {
#if __cpp_char8_t >= 201811
	if constexpr (sizeof...(String) && !std::is_same_v<Iterator, utf8_iterator> && is_random_accessible(typename std::iterator_traits<Iterator>::iterator_category{}) && !std::is_same_v<EndIterator, ctre::zero_terminated_string_end_iterator>) {
#else
	if constexpr (sizeof...(String) && is_random_accessible(typename std::iterator_traits<Iterator>::iterator_category{}) && !std::is_same_v<EndIterator, ctre::zero_terminated_string_end_iterator>) {
#endif
		// Use the decayed value type: decltype(*current) is a reference, and
		// casting a template-argument prvalue to a non-const lvalue reference
		// does not compile for mutable iterators.
		using char_type = typename std::iterator_traits<Iterator>::value_type;
		// The length check comes first so that `current + Idx` never reads past `end`.
		const bool same = (static_cast<size_t>(std::distance(current, end)) >= sizeof...(String)) && ((static_cast<char_type>(String) == *(current + Idx)) && ...);
		if (same) {
			current += sizeof...(String);
		}
		return { current, same };
	} else {
		bool same = (compare_character(String, current, end) && ... && true);
		return { current, same };
	}
}

template <typename R, typename Iterator, typename EndIterator, auto... String, typename... Tail>
Expand Down Expand Up @@ -522,6 +535,167 @@ constexpr CTRE_FORCE_INLINE R evaluate(const Iterator begin, Iterator current, c
}
}

// Fallback overload of is_string: chosen for every argument type that has no
// more specialized overload, and always answers "no".
template <typename Other>
constexpr auto is_string(Other) noexcept -> bool {
	return false;
}
template <auto... String>
constexpr bool is_string(string<String...>)noexcept {
return true;
}

template <typename T>
constexpr bool is_string_like(T) noexcept {
return false;
}
template <auto... String>
constexpr bool is_string_like(string<String...>) noexcept {
return true;
}
template <typename CharacterLike, typename = std::enable_if_t<MatchesCharacter<CharacterLike>::template value<decltype(*std::declval<std::string_view::iterator>())>>>
constexpr bool is_string_like(CharacterLike) noexcept {
return true;
}

// Base case for a ctll::list: no rewrite rule applies, so the list is handed
// back unchanged.
template <typename... Items>
constexpr auto extract_leading_string(ctll::list<Items...>) noexcept -> ctll::list<Items...> {
	return ctll::list<Items...>{};
}
// Base case for a bare sequence: returned unchanged.
template <typename... Items>
constexpr auto extract_leading_string(sequence<Items...>) noexcept -> sequence<Items...> {
	return sequence<Items...>{};
}

// Concatenation: a single character directly after the leading string is
// appended to that string, then the shortened list is processed again.
template <auto C, auto... String, typename... Content>
constexpr auto extract_leading_string(ctll::list<string<String...>, character<C>, Content...>) noexcept {
	using merged = ctll::list<string<String..., C>, Content...>;
	return extract_leading_string(merged{});
}

// Concatenation: two adjacent strings at the front are joined into one string
// (characters of the first followed by those of the second), then the
// shortened list is processed again.
template <auto... StringA, auto... StringB, typename... Content>
constexpr auto extract_leading_string(ctll::list<string<StringA...>, string<StringB...>, Content...>) noexcept {
	using merged = ctll::list<string<StringA..., StringB...>, Content...>;
	return extract_leading_string(merged{});
}
// Flattening: a sequence at the very front of the list is replaced by its own
// elements, followed by the rest of the list, and the result is processed again.
template <typename... Content, typename... Tail>
constexpr auto extract_leading_string(ctll::list<sequence<Content...>, Tail...>) noexcept {
	using flattened = ctll::list<Content..., Tail...>;
	return extract_leading_string(flattened{});
}

// Flattening: a sequence in second position is replaced by its own elements,
// keeping the first element and the remaining tail in place, and the result is
// processed again.
// NOTE(review): a list whose first AND second elements are both sequences looks
// like it matches this overload and the front-sequence overload equally well;
// confirm whether such an input can reach this function.
template <typename T, typename... Content, typename... Tail>
constexpr auto extract_leading_string(ctll::list<T, sequence<Content...>, Tail...>) noexcept {
	using flattened = ctll::list<T, Content..., Tail...>;
	return extract_leading_string(flattened{});
}

// Rewraps the elements of a ctll::list as a sequence with the same elements in
// the same order.
template <typename... Items>
constexpr auto make_into_sequence(ctll::list<Items...>) noexcept -> sequence<Items...> {
	return sequence<Items...>{};
}
// A sequence is already in the desired shape and is returned unchanged.
template <typename... Items>
constexpr auto make_into_sequence(sequence<Items...>) noexcept -> sequence<Items...> {
	return sequence<Items...>{};
}

// Boyer-Moore utilities

// Tests whether the tail of `word` that starts at index `pos` (and runs to
// `wordlen`) is also a prefix of `word`. An empty tail (pos == wordlen) counts
// as a prefix.
template<typename Ty>
constexpr bool is_prefix(Ty* word, size_t wordlen, ptrdiff_t pos) {
	const ptrdiff_t tail_len = wordlen - pos;
	int offset = 0;
	while (offset < tail_len) {
		if (word[offset] != word[pos + offset]) {
			return false;
		}
		++offset;
	}
	return true;
}

// Length of the longest run of characters that ends at word[pos] and equals
// the run of the same length ending at the last character of `word`. The count
// is capped at `pos`, i.e. it stops once the beginning of the word is reached.
template<typename Ty>
constexpr size_t suffix_length(Ty* word, size_t wordlen, ptrdiff_t pos) {
	size_t matched = 0;
	// Walk backwards from both positions until the characters differ or the
	// start of the word has been reached.
	while ((word[pos - matched] == word[wordlen - 1 - matched]) && (matched < pos)) {
		++matched;
	}
	return matched;
}
// Builds the per-pattern-index shift table ("delta 2") that search_for_string
// uses to advance after a mismatch at a given pattern index.
//
// Ty is the character type the pattern is stored as; the returned std::array
// has one ptrdiff_t entry per pattern character.
//
// MSVC workaround: std::array::operator[] misbehaves in constant evaluation
// there, so elements are accessed through data() pointers instead.
template<typename Ty, auto... String>
constexpr auto make_delta_2(string<String...>) {
	constexpr size_t patlen = sizeof...(String);
	std::array<Ty, patlen> chars{ String... };
	// Value-initialized: before C++20 a constexpr function may not define a
	// variable without initializing it (GCC and Clang reject the bare form).
	std::array<ptrdiff_t, patlen> table{};

	// Guard the empty pattern: `patlen - 1` would wrap around and the second
	// loop would index far outside the (empty) arrays.
	if constexpr (patlen != 0) {
		size_t last_prefix_index = patlen - 1;

		// First pass, from the last pattern index down to 0: remember the most
		// recent position whose tail is also a prefix of the pattern and derive
		// the shift for index p from it.
		for (size_t p = patlen; p-- > 0;) {
			if (is_prefix(chars.data(), patlen, p + 1)) {
				last_prefix_index = p + 1;
			}
			table.data()[p] = static_cast<ptrdiff_t>(last_prefix_index + (patlen - 1 - p));
		}

		// Second pass: where a run ending at p repeats the end of the pattern and
		// is preceded by a different character, overwrite the entry for the
		// index just before that run with the smaller shift.
		for (size_t p = 0; p < patlen - 1; p++) {
			const size_t slen = suffix_length(chars.data(), patlen, p);
			if (chars.data()[p - slen] != chars.data()[patlen - 1 - slen]) {
				table.data()[patlen - 1 - slen] = static_cast<ptrdiff_t>(patlen - 1 - p + slen);
			}
		}
	}

	return table;
}

// Result of search_for_string.
template <typename Iterator>
struct string_search_result {
	// Where the found occurrence starts; on failure, where the search stopped.
	Iterator position;
	// One past the found occurrence; on failure, the same as `position`.
	Iterator end_position;
	// Whether an occurrence was found.
	bool match;
};

// Finds the first occurrence of the literal `String...` in [current, end).
//
// Returns {start of occurrence, one past occurrence, true} on success and
// {stop position, stop position, false} on failure. An empty literal matches
// immediately at `current`.
//
// Strategy:
//  * literals longer than two characters on random-access iterators with a
//    real end iterator: reverse comparison with table-driven shifts
//    (Boyer-Moore style, using make_delta_2);
//  * any other non-empty literal: scan for the first character, then verify
//    the rest with evaluate_match_string.
template <typename Iterator, typename EndIterator, auto... String>
constexpr CTRE_FORCE_INLINE string_search_result<Iterator> search_for_string(Iterator current, const EndIterator end, string<String...>) noexcept {
	// The zero-terminated sentinel is excluded from the table-driven branch, as
	// in evaluate_match_string: std::distance needs two iterators of one type.
#if __cpp_char8_t >= 201811
	if constexpr (sizeof...(String) > 2 && !std::is_same_v<Iterator, utf8_iterator> && is_random_accessible(typename std::iterator_traits<Iterator>::iterator_category{}) && !std::is_same_v<EndIterator, ctre::zero_terminated_string_end_iterator>) {
#else
	if constexpr (sizeof...(String) > 2 && is_random_accessible(typename std::iterator_traits<Iterator>::iterator_category{}) && !std::is_same_v<EndIterator, ctre::zero_terminated_string_end_iterator>) {
#endif
		using char_type = typename ::std::iterator_traits<Iterator>::value_type;
		constexpr std::array<char_type, sizeof...(String)> chars{ String... };
		constexpr std::array<ptrdiff_t, sizeof...(String)> delta_2 = make_delta_2<char_type>(string<String...>());

		const size_t str_size = static_cast<size_t>(std::distance(current, end));
		if (str_size < sizeof...(String)) { // quick exit: input shorter than the literal
			return { current + str_size, current + str_size, false };
		}

		// `i` indexes the input and starts aligned with the literal's last character.
		size_t i = sizeof...(String) - 1;
		while (i < str_size) {
			size_t j = sizeof...(String) - 1;
			const size_t m = i + 1; // one past the candidate occurrence
			// Compare backwards; i >= j holds throughout, so `--i` cannot wrap
			// before j reaches 0 and the function returns.
			for (; *(current + i) == *(chars.data() + j); --i, --j) {
				if (j == 0) {
					return { current + i, current + m, true };
				}
			}
			// A mismatching input character that occurs nowhere in the literal
			// allows a shift by the full literal length; otherwise use the table.
			const size_t shift = enumeration<String...>::match_char(*(current + i)) ? static_cast<size_t>(*(delta_2.data() + j)) : sizeof...(String);
			i += shift;
		}

		return { current + str_size, current + str_size, false };
	} else if constexpr (sizeof...(String)) {
		// Fallback: plain scan for the first character, then verify the literal.
		constexpr std::array<typename ::std::iterator_traits<Iterator>::value_type, sizeof...(String)> chars{ String... };
		constexpr typename ::std::iterator_traits<Iterator>::value_type first_char = chars.data()[0];
		while (current != end) {
			while (current != end && *current != first_char) {
				++current;
			}
			// The scan may have consumed the whole input. Stop here: the old
			// code went on to `++current` and stepped past `end`.
			if (!(current != end)) {
				break;
			}
			auto result = evaluate_match_string<String...>(current, end, std::make_index_sequence<sizeof...(String)>());
			if (result.match) {
				return { current, result.position, result.match };
			}
			++current;
		}
		return { current, current, false };
	} else {
		return { current, current, true };
	}
}

}

Expand Down
39 changes: 31 additions & 8 deletions include/ctre/wrapper.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,19 +62,42 @@ struct match_method {
struct search_method {
// Searches [begin, end) for the first place where RE matches and returns the
// match result (unmatched when there is none). `orig_begin` is forwarded to
// evaluate unchanged.
//
// Three strategies, chosen at compile time from the leading literal that
// extract_leading_string pulls out of RE:
//  1. literal followed by more pattern: locate the literal with
//     search_for_string, then evaluate only the remaining pattern after it;
//  2. RE is nothing but a literal: the search result is the match;
//  3. no leading literal: try evaluate at every position (only once when the
//     pattern starts with an anchor).
//
// The interleaved pre-change loop and trailing return that left this function
// with unbalanced braces have been dropped; the generic loop lives only in
// the final branch.
template <typename Modifier = singleline, typename ResultIterator = void, typename RE, typename IteratorBegin, typename IteratorEnd> constexpr CTRE_FORCE_INLINE static auto exec(IteratorBegin orig_begin, IteratorBegin begin, IteratorEnd end, RE) noexcept {
	using result_iterator = std::conditional_t<std::is_same_v<ResultIterator, void>, IteratorBegin, ResultIterator>;
	using front_re = decltype(pop_and_get_front(extract_leading_string(ctll::list<RE>{})));
	// Only the generic branch consults `fixed`.
	[[maybe_unused]] constexpr bool fixed = starts_with_anchor(Modifier{}, ctll::list<RE>{});

	auto it = begin;

	if constexpr (is_string(front_re{}.front) && size(front_re{}.list)) {
		using tail_re = decltype(make_into_sequence(front_re{}.list));
		auto it2 = search_for_string(it, end, front_re{}.front);
		return_type<result_iterator, RE> result{};
		// Continue only while the literal was actually found. The old loop fell
		// through to one more evaluate at end of input after a failed search,
		// which reported a match whenever the tail can match empty input.
		while (it2.match) {
			result.set_start_mark(it2.position);
			// NOTE(review): the list still begins with start_mark; if evaluating
			// start_mark records the current position, it presumably overrides
			// the mark set just above -- confirm the intended match start.
			result = evaluate(orig_begin, it2.end_position, end, Modifier{}, result, ctll::list<start_mark, tail_re, end_mark, accept>());
			if (result) {
				return result;
			}
			result.unmatch();
			if (!(end != it2.position)) {
				break; // nothing left to advance over
			}
			std::advance(it2.position, 1);
			it2 = search_for_string(it2.position, end, front_re{}.front);
		}
		return result;
	} else if constexpr (is_string(front_re{}.front)) {
		auto it2 = search_for_string(it, end, front_re{}.front);
		return_type<result_iterator, RE> result{};
		result.set_start_mark(it2.position);
		result.set_end_mark(it2.end_position);
		if (it2.match) {
			result.matched();
		}
		return result;
	} else {
		for (; end != it && !fixed; ++it) {
			if (auto out = evaluate(orig_begin, it, end, Modifier{}, return_type<result_iterator, RE>{}, ctll::list<start_mark, RE, end_mark, accept>())) {
				return out;
			}
		}
		// in case the RE is empty or fixed
		return evaluate(orig_begin, it, end, Modifier{}, return_type<result_iterator, RE>{}, ctll::list<start_mark, RE, end_mark, accept>());
	}
}

template <typename Modifier = singleline, typename ResultIterator = void, typename RE, typename IteratorBegin, typename IteratorEnd> constexpr CTRE_FORCE_INLINE static auto exec(IteratorBegin begin, IteratorEnd end, RE) noexcept {
Expand Down