From 9a0ee2e401f67696785bf941519a370dc8396642 Mon Sep 17 00:00:00 2001 From: Alex Mason Date: Fri, 1 Jul 2022 17:53:25 +1000 Subject: [PATCH 01/16] Add C bits --- cbits/codepoint_offset.c | 288 +++++++++++++++++++++++++++++++++++++++ text.cabal | 1 + 2 files changed, 289 insertions(+) create mode 100644 cbits/codepoint_offset.c diff --git a/cbits/codepoint_offset.c b/cbits/codepoint_offset.c new file mode 100644 index 00000000..19d76581 --- /dev/null +++ b/cbits/codepoint_offset.c @@ -0,0 +1,288 @@ + +#include +#include +#include +#ifdef __x86_64__ +#include +#include +#endif +#include +#include + +// The following is from FreeBSD's memmem.c +// https://github.com/freebsd/freebsd-src/blob/9921563f43a924d21c7bf43db4a34e724577db95/lib/libc/string/memmem.c + +/*- + * SPDX-License-Identifier: MIT + * + * Copyright (c) 2005-2014 Rich Felker, et al. + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +static char * +twobyte_memmem(const unsigned char *h, size_t k, const unsigned char *n) +{ + uint16_t nw = n[0] << 8 | n[1], hw = h[0] << 8 | h[1]; + for (h += 2, k -= 2; k; k--, hw = hw << 8 | *h++) + if (hw == nw) + return (char *)h - 2; + return hw == nw ? (char *)h - 2 : 0; +} + +static char * +threebyte_memmem(const unsigned char *h, size_t k, const unsigned char *n) +{ + uint32_t nw = (uint32_t)n[0] << 24 | n[1] << 16 | n[2] << 8; + uint32_t hw = (uint32_t)h[0] << 24 | h[1] << 16 | h[2] << 8; + for (h += 3, k -= 3; k; k--, hw = (hw | *h++) << 8) + if (hw == nw) + return (char *)h - 3; + return hw == nw ? (char *)h - 3 : 0; +} + +static char * +fourbyte_memmem(const unsigned char *h, size_t k, const unsigned char *n) +{ + uint32_t nw = (uint32_t)n[0] << 24 | n[1] << 16 | n[2] << 8 | n[3]; + uint32_t hw = (uint32_t)h[0] << 24 | h[1] << 16 | h[2] << 8 | h[3]; + for (h += 4, k -= 4; k; k--, hw = hw << 8 | *h++) + if (hw == nw) + return (char *)h - 4; + return hw == nw ? (char *)h - 4 : 0; +} + +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define MIN(a, b) ((a) < (b) ? (a) : (b)) + +#define BITOP(a, b, op) \ + ((a)[(size_t)(b) / (8 * sizeof *(a))] op(size_t) 1 << ((size_t)(b) % (8 * sizeof *(a)))) + +/* + * Two Way string search algorithm, with a bad shift table applied to the last + * byte of the window. A bit array marks which entries in the shift table are + * initialized to avoid fully initializing a 1kb/2kb table. + * + * Reference: CROCHEMORE M., PERRIN D., 1991, Two-way string-matching, + * Journal of the ACM 38(3):651-675 + */ +static char * +twoway_memmem(const unsigned char *h, const unsigned char *z, + const unsigned char *n, size_t l) +{ + size_t i, ip, jp, k, p, ms, p0, mem, mem0; + size_t byteset[32 / sizeof(size_t)] = {0}; + size_t shift[256]; + + /* Computing length of needle and fill shift table */ + for (i = 0; i < l; i++) + BITOP(byteset, n[i], |=), shift[n[i]] = i + 1; + + /* Compute maximal suffix */ + ip = -1; + jp = 0; + k = p = 1; + while (jp + k < l) + { + if (n[ip + k] == n[jp + k]) + { + if (k == p) + { + jp += p; + k = 1; + } + else + k++; + } + else if (n[ip + k] > n[jp + k]) + { + jp += k; + k = 1; + p = jp - ip; + } + else + { + ip = jp++; + k = p = 1; + } + } + ms = ip; + p0 = p; + + /* And with the opposite comparison */ + ip = -1; + jp = 0; + k = p = 1; + while (jp + k < l) + { + if (n[ip + k] == n[jp + k]) + { + if (k == p) + { + jp += p; + k = 1; + } + else + k++; + } + else if (n[ip + k] < n[jp + k]) + { + jp += k; + k = 1; + p = jp - ip; + } + else + { + ip = jp++; + k = p = 1; + } + } + if (ip + 1 > ms + 1) + ms = ip; + else + p = p0; + + /* Periodic needle? */ + if (memcmp(n, n + p, ms + 1)) + { + mem0 = 0; + p = MAX(ms, l - ms - 1) + 1; + } + else + mem0 = l - p; + mem = 0; + + /* Search loop */ + for (;;) + { + /* If remainder of haystack is shorter than needle, done */ + if (z - h < l) + return 0; + + /* Check last byte first; advance by shift on mismatch */ + if (BITOP(byteset, h[l - 1], &)) + { + k = l - shift[h[l - 1]]; + if (k) + { + if (k < mem) + k = mem; + h += k; + mem = 0; + continue; + } + } + else + { + h += l; + mem = 0; + continue; + } + + /* Compare right half */ + for (k = MAX(ms + 1, mem); k < l && n[k] == h[k]; k++) + ; + if (k < l) + { + h += k - ms; + mem = 0; + continue; + } + /* Compare left half */ + for (k = ms + 1; k > mem && n[k - 1] == h[k - 1]; k--) + ; + if (k <= mem) + return (char *)h; + h += p; + mem = mem0; + } +} + +void * +_hs_memmem_standard(const void *h0, size_t k, const void *n0, size_t l) +{ + const unsigned char *h = h0, *n = n0; + + /* Return immediately on empty needle */ + if (!l) + return (void *)h; + + /* Return immediately when needle is longer than haystack */ + if (k < l) + return 0; + + /* Use faster algorithms for short needles */ + h = memchr(h0, *n, k); + if (!h || l == 1) + return (void *)h; + k -= h - (const unsigned char *)h0; + if (k < l) + return 0; + if (l == 2) + return twobyte_memmem(h, k, n); + if (l == 3) + return threebyte_memmem(h, k, n); + if (l == 4) + return fourbyte_memmem(h, k, n); + + return twoway_memmem(h, h + k, n, l); +} + +size_t +_hs_text_memmem(const void *h0, size_t hoff, size_t hlen, const void *n0, size_t noff, size_t nlen) +{ + void *res = _hs_memmem_standard(h0 + hoff, hlen, n0 + noff, nlen); + return res == NULL ? -1 : (size_t)(res - h0); +} + +const size_t _hs_offset_of_codepoint(const uint8_t *haystack0, const size_t hoffset, const size_t hlen, size_t needle) +{ + const uint8_t *haystack = haystack0 + hoffset; + uint8_t asUtf8[4]; + size_t codepointLen; + if (needle < 0x80) + { + codepointLen = 1; + asUtf8[0] = needle; + } + else if (needle < 0x0800) + { + codepointLen = 2; + asUtf8[0] = (uint8_t)(((needle >> 6) & 0x1F) | 0xC0); + asUtf8[1] = (uint8_t)(((needle >> 0) & 0x3F) | 0x80); + } + else if (needle < 0x10000) + { + codepointLen = 3; + asUtf8[0] = (uint8_t)(((needle >> 12) & 0x0F) | 0xE0); + asUtf8[1] = (uint8_t)(((needle >> 6) & 0x3F) | 0x80); + asUtf8[2] = (uint8_t)(((needle >> 0) & 0x3F) | 0x80); + } + else + { + codepointLen = 4; + asUtf8[0] = (uint8_t)(((needle >> 18) & 0x07) | 0xF0); + asUtf8[1] = (uint8_t)(((needle >> 12) & 0x3F) | 0x80); + asUtf8[2] = (uint8_t)(((needle >> 6) & 0x3F) | 0x80); + asUtf8[3] = (uint8_t)(((needle >> 0) & 0x3F) | 0x80); + } + const void *res = _hs_memmem_standard(haystack, hlen, asUtf8, codepointLen); + return res == NULL ? -1 : (size_t)((uint8_t *)res - haystack); +} \ No newline at end of file diff --git a/text.cabal b/text.cabal index f6343609..2f9ba03b 100644 --- a/text.cabal +++ b/text.cabal @@ -82,6 +82,7 @@ library cbits/measure_off.c cbits/reverse.c cbits/utils.c + cbits/codepoint_offset.c hs-source-dirs: src if flag(simdutf) From 8aa7155fed05c687e74655473ed605a87dfb4d60 Mon Sep 17 00:00:00 2001 From: Alex Mason Date: Fri, 1 Jul 2022 17:54:24 +1000 Subject: [PATCH 02/16] Initial Text functions and tests --- src/Data/Text.hs | 85 ++++++++++++++++++++++++++++++---- tests/Tests/Properties/Text.hs | 20 +++++++- 2 files changed, 96 insertions(+), 9 deletions(-) diff --git a/src/Data/Text.hs b/src/Data/Text.hs index 2ec5980a..c1338dbe 100644 --- a/src/Data/Text.hs +++ b/src/Data/Text.hs @@ -143,6 +143,7 @@ module Data.Text , stripEnd , splitAt , breakOn + , breakOnChar , breakOnEnd , break , span @@ -156,6 +157,7 @@ module Data.Text -- ** Breaking into many substrings -- $split , splitOn + , splitOn' , split , chunksOf @@ -204,6 +206,7 @@ module Data.Text , unpackCStringAscii# , measureOff + , codepointOffset ) where import Prelude (Char, Bool(..), Int, Maybe(..), String, @@ -258,6 +261,7 @@ import qualified Language.Haskell.TH.Syntax as TH import Text.Printf (PrintfArg, formatArg, formatString) import System.Posix.Types (CSsize(..)) import System.IO.Unsafe (unsafePerformIO) +import Debug.Trace (traceShow) -- $setup -- >>> :set -package transformers @@ -411,7 +415,7 @@ instance Data Text where instance TH.Lift Text where #if MIN_VERSION_template_haskell(2,16,0) lift txt = do - let (ptr, len) = unsafePerformIO $ asForeignPtr txt + let (ptr, len) = unsafePerformIO $ asForeignPtr txt let lenInt = P.fromIntegral len TH.appE (TH.appE (TH.varE 'unpackCStringLen#) (TH.litE . TH.bytesPrimL $ TH.mkBytes ptr 0 lenInt)) (TH.lift lenInt) #else @@ -1300,6 +1304,17 @@ measureOff !n (Text (A.ByteArray arr) off len) = if len == 0 then 0 else foreign import ccall unsafe "_hs_text_measure_off" c_measure_off :: ByteArray# -> CSize -> CSize -> CSize -> IO CSsize +-- | O(n) Finds the offset of the first occurrence of @c@ in the @Text@, or +-- '-1' if if can't be found. +codepointOffset :: Text -> Char -> Int +codepointOffset !(Text (A.ByteArray arr) off len) c = if len == 0 then -1 else + cSsizeToInt $ unsafeDupablePerformIO $ + c_hs_offset_of_codepoint arr (intToCSize off) (intToCSize len) (intToCSize $ ord c) + + +foreign import ccall unsafe "_hs_offset_of_codepoint" c_hs_offset_of_codepoint + ::ByteArray# -> CSize -> CSize -> CSize -> IO CSsize + -- | /O(n)/ 'takeEnd' @n@ @t@ returns the suffix remaining after -- taking @n@ characters from the end of @t@. -- @@ -1528,6 +1543,8 @@ findAIndexOrEnd q t@(Text _arr _off len) = go 0 | otherwise = go (i+d) where Iter c d = iter t i + + -- | /O(n)/ Group characters in a string by equality. group :: Text -> [Text] group = groupBy (==) @@ -1584,7 +1601,7 @@ splitOn :: HasCallStack -> [Text] splitOn pat@(Text _ _ l) src@(Text arr off len) | l <= 0 = emptyError "splitOn" - | isSingleton pat = split (== unsafeHead pat) src + | isSingleton pat = splitOnChar (unsafeHead pat) src | otherwise = go 0 (indices pat src) where go !s (x:xs) = text arr (s+off) (x-s) : go (x+l) xs @@ -1592,19 +1609,42 @@ splitOn pat@(Text _ _ l) src@(Text arr off len) {-# INLINE [1] splitOn #-} {-# RULES -"TEXT splitOn/singleton -> split/==" [~1] forall c t. - splitOn (singleton c) t = split (==c) t +"TEXT splitOn/singleton -> splitOnChar" [~1] forall c t. + splitOn (singleton c) t = splitOnChar c t #-} +splitOn' :: Text -> Text -> [Text] +splitOn' needle@(Text _ _ nlen) (Text harr hoff0 hlen0) = loop hoff0 hlen0 where + -- loop hoff hlen | traceShow ("loop", hoff, hlen, text harr hoff hlen) False = P.undefined + loop !hoff !hlen | hlen < nlen = [text harr hoff hlen] + loop _ hlen | hlen <= 0 = [] + loop hoff hlen = case memmem needle (text harr hoff hlen) of + -- n | traceShow ("memmem", n) False -> P.undefined + (-1) -> [] + n -> text harr hoff (n-hoff) : loop (n + nlen) (hlen - (n-hoff) - nlen) + +memmem :: Text -> Text -> Int +memmem (Text (A.ByteArray narr) noff nlen) (Text (A.ByteArray harr) hoff hlen) = unsafeDupablePerformIO $ + cSsizeToInt + P.<$> text_memmem harr (intToCSize hoff) (intToCSize hlen) + narr (intToCSize noff) (intToCSize nlen) +{-# INLINE memmem #-} + +foreign import ccall unsafe "_hs_text_memmem" text_memmem + :: ByteArray# -> CSize -> CSize -> ByteArray# -> CSize -> CSize -> IO CSsize + + -- | /O(n)/ Splits a 'Text' into components delimited by separators, -- where the predicate returns True for a separator element. The -- resulting components do not contain the separators. Two adjacent --- separators result in an empty component in the output. eg. +-- separators result in an empty component in the output. To split +-- on a specific character, use @splitOnChar@. +-- eg. -- --- >>> split (=='a') "aabbaca" --- ["","","bb","c",""] +-- >>> split isUpper "theQuickBrownFox" +-- ["the","uick","rown","ox"] -- --- >>> split (=='a') "" +-- >>> split isUpper "" -- [""] split :: (Char -> Bool) -> Text -> [Text] split _ t@(Text _off _arr 0) = [t] @@ -1614,6 +1654,27 @@ split p t = loop t where (# l, s' #) = span_ (not . p) s {-# INLINE split #-} + +{- TODO Fix: +Rule "TEXT split/eq1 -> splitOnChar/==" may never fire + because rule "Class op ==" for ‘==’ might fire first +Probable fix: add phase [n] or [~n] to the competing rulecompile(-Winline-rule-shadowing) +-} +{-# RULES +"TEXT split/eq1 -> splitOnChar/==" [~2] forall c t. + split (== c) t = splitOnChar c t +"TEXT split/eq1 -> splitOnChar/==" [~2] forall c t. + split (c ==) t = splitOnChar c t + #-} + + +splitOnChar :: Char -> Text -> [Text] +splitOnChar _ t@(Text _off _arr 0) = [t] +splitOnChar c t = loop t + where loop s | null s' = [l] + | otherwise = l : loop (unsafeTail s') + where ( l, s' ) = breakOnChar c s + -- | /O(n)/ Splits a 'Text' into components of length @k@. The last -- element may be shorter than the other chunks, depending on the -- length of the input. Examples: @@ -1737,6 +1798,8 @@ filter p = go -- is the prefix of @haystack@ before @needle@ is matched. The second -- is the remainder of @haystack@, starting with the match. -- +-- To break on a specific character, use @breakOnChar@ +-- -- Examples: -- -- >>> breakOn "::" "a::b::c" @@ -1764,6 +1827,12 @@ breakOn pat src@(Text arr off len) (x:_) -> (text arr off x, text arr (off+x) (len-x)) {-# INLINE breakOn #-} +breakOnChar :: Char -> Text -> (Text, Text) +breakOnChar c src@(Text arr off len) = case codepointOffset src c of + -1 -> (src, empty) + n -> (text arr off n, text arr (off+n) (len-n) ) + + -- | /O(n+m)/ Similar to 'breakOn', but searches from the end of the -- string. -- diff --git a/tests/Tests/Properties/Text.hs b/tests/Tests/Properties/Text.hs index 3faa7bf3..cdd8e290 100644 --- a/tests/Tests/Properties/Text.hs +++ b/tests/Tests/Properties/Text.hs @@ -23,6 +23,7 @@ import qualified Data.Text.Internal.Fusion.Common as S import qualified Data.Text.Internal.Lazy.Fusion as SL import qualified Data.Text.Internal.Lazy.Search as S (indices) import qualified Data.Text.Internal.Search as T (indices) +import qualified Data.Text.Internal as TI (Text(..)) import qualified Data.Text.Lazy as TL import qualified Tests.SlowFunctions as Slow @@ -267,6 +268,19 @@ tl_indices_char_drop n c pref suff = map fromIntegral (S.indices s t) === Slow.i s = TL.singleton c t = TL.drop n $ pref `TL.append` s `TL.append` suff +t_codepointOffset_exists tPrefix target tSuffix = + let cleanPrefix@(TI.Text _ _ len) = T.filter (/= target) tPrefix + in T.codepointOffset (cleanPrefix <> T.singleton target <> tSuffix) target === len + +t_codepointOffset_missing t target = T.codepointOffset (T.filter (/= target) t) target == -1 + +t_breakOnChar_exists tPrefix target tSuffix = + let cleanPrefix = T.filter (/= target) tPrefix + (before, after) = T.breakOnChar target (cleanPrefix <> T.singleton target <> tSuffix) + in before == cleanPrefix && after == T.singleton target <> tSuffix + +t_breakOnChar_missing t target = T.breakOnChar target (T.filter (/= target) t) == (t,T.empty) + -- Make a stream appear shorter than it really is, to ensure that -- functions that consume inaccurately sized streams behave -- themselves. @@ -374,7 +388,11 @@ testText = testProperty "t_find" t_find, testProperty "tl_find" tl_find, testProperty "t_partition" t_partition, - testProperty "tl_partition" tl_partition + testProperty "tl_partition" tl_partition, + testProperty "t_codepointOffset_exists" t_codepointOffset_exists, + testProperty "t_codepointOffset_missing" t_codepointOffset_missing, + testProperty "t_breakOnChar_exists" t_breakOnChar_exists, + testProperty "t_breakOnChar_missing" t_breakOnChar_missing ], testGroup "indexing" [ From 672d2d79b5f6d1fd12344669718f466c48a5d6c5 Mon Sep 17 00:00:00 2001 From: Alex Mason Date: Fri, 1 Jul 2022 18:15:41 +1000 Subject: [PATCH 03/16] Fix and add tests --- tests/Tests/Properties/Text.hs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/Tests/Properties/Text.hs b/tests/Tests/Properties/Text.hs index cdd8e290..c578a492 100644 --- a/tests/Tests/Properties/Text.hs +++ b/tests/Tests/Properties/Text.hs @@ -279,7 +279,11 @@ t_breakOnChar_exists tPrefix target tSuffix = (before, after) = T.breakOnChar target (cleanPrefix <> T.singleton target <> tSuffix) in before == cleanPrefix && after == T.singleton target <> tSuffix -t_breakOnChar_missing t target = T.breakOnChar target (T.filter (/= target) t) == (t,T.empty) +t_breakOnChar_missing t target = + let filtered = T.filter (/= target) t + in T.breakOnChar target filtered == (filtered,T.empty) + +t_breakOnChar_is_break_eq_char t c = T.breakOnChar c t == T.break (== c) t -- Make a stream appear shorter than it really is, to ensure that -- functions that consume inaccurately sized streams behave @@ -392,7 +396,8 @@ testText = testProperty "t_codepointOffset_exists" t_codepointOffset_exists, testProperty "t_codepointOffset_missing" t_codepointOffset_missing, testProperty "t_breakOnChar_exists" t_breakOnChar_exists, - testProperty "t_breakOnChar_missing" t_breakOnChar_missing + testProperty "t_breakOnChar_missing" t_breakOnChar_missing, + testProperty "t_breakOnChar_is_break_eq_char" t_breakOnChar_is_break_eq_char ], testGroup "indexing" [ From 13bba0d499e7ab0c73e75ba32d23f1418d985de1 Mon Sep 17 00:00:00 2001 From: Alex Mason Date: Fri, 1 Jul 2022 18:54:23 +1000 Subject: [PATCH 04/16] Add haddocs for splitOnChar --- src/Data/Text.hs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/Data/Text.hs b/src/Data/Text.hs index c1338dbe..245c33ef 100644 --- a/src/Data/Text.hs +++ b/src/Data/Text.hs @@ -266,6 +266,7 @@ import Debug.Trace (traceShow) -- $setup -- >>> :set -package transformers -- >>> import Control.Monad.Trans.State +-- >>> import Data.Char (isUpper) -- >>> import Data.Text -- >>> import qualified Data.Text as T -- >>> :seti -XOverloadedStrings @@ -1668,6 +1669,11 @@ Probable fix: add phase [n] or [~n] to the competing rulecompile(-Winline-rule-s #-} +-- | /O(n)/ Splits a 'Text' into components delimited by the given @Char@. +-- The behaviour is the same as @split@ but should be faster than @split (== c)@ +-- +-- >>> split (=='a') "aabbaca" +-- ["","","bb","c",""] splitOnChar :: Char -> Text -> [Text] splitOnChar _ t@(Text _off _arr 0) = [t] splitOnChar c t = loop t From 868d97ae5803b4c7048719e12b474c2903f9b734 Mon Sep 17 00:00:00 2001 From: Alex Mason Date: Fri, 1 Jul 2022 18:57:31 +1000 Subject: [PATCH 05/16] Add haddocs for breakOnChar --- src/Data/Text.hs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/Data/Text.hs b/src/Data/Text.hs index 245c33ef..e9999d95 100644 --- a/src/Data/Text.hs +++ b/src/Data/Text.hs @@ -1833,6 +1833,13 @@ breakOn pat src@(Text arr off len) (x:_) -> (text arr off x, text arr (off+x) (len-x)) {-# INLINE breakOn #-} +-- | /O(n)/ Equivalent to @breakOn (== c)@ but should be faster. +-- +-- >>> breakOnChar '/' "foo/bar/" +-- ("foo","/bar/") +-- +-- >>> breakOnChar '/' "foobar" +-- ("foobar","") breakOnChar :: Char -> Text -> (Text, Text) breakOnChar c src@(Text arr off len) = case codepointOffset src c of -1 -> (src, empty) From 277e1bd56da1fec6fcb658d659c55f66e7b3de3b Mon Sep 17 00:00:00 2001 From: Alex Mason Date: Fri, 1 Jul 2022 18:59:10 +1000 Subject: [PATCH 06/16] Add temporary docs for splitOn' --- src/Data/Text.hs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/Data/Text.hs b/src/Data/Text.hs index e9999d95..d130b3cc 100644 --- a/src/Data/Text.hs +++ b/src/Data/Text.hs @@ -1614,6 +1614,9 @@ splitOn pat@(Text _ _ l) src@(Text arr off len) splitOn (singleton c) t = splitOnChar c t #-} + +-- | TODO: Remove - a test to ensure the behaviour using memmem is equivalent +-- to the version using @indices@ splitOn' :: Text -> Text -> [Text] splitOn' needle@(Text _ _ nlen) (Text harr hoff0 hlen0) = loop hoff0 hlen0 where -- loop hoff hlen | traceShow ("loop", hoff, hlen, text harr hoff hlen) False = P.undefined From bef46dbaa7170fe9fb096ad5512c3276a880620f Mon Sep 17 00:00:00 2001 From: Alex Mason Date: Fri, 1 Jul 2022 19:02:50 +1000 Subject: [PATCH 07/16] Add haddocs for codepointOffset --- src/Data/Text.hs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/Data/Text.hs b/src/Data/Text.hs index d130b3cc..dce4c212 100644 --- a/src/Data/Text.hs +++ b/src/Data/Text.hs @@ -1305,14 +1305,16 @@ measureOff !n (Text (A.ByteArray arr) off len) = if len == 0 then 0 else foreign import ccall unsafe "_hs_text_measure_off" c_measure_off :: ByteArray# -> CSize -> CSize -> CSize -> IO CSsize --- | O(n) Finds the offset of the first occurrence of @c@ in the @Text@, or +-- | O(n) Finds the byte offset of the first occurrence of @c@ in the @Text@, or -- '-1' if if can't be found. codepointOffset :: Text -> Char -> Int codepointOffset !(Text (A.ByteArray arr) off len) c = if len == 0 then -1 else cSsizeToInt $ unsafeDupablePerformIO $ c_hs_offset_of_codepoint arr (intToCSize off) (intToCSize len) (intToCSize $ ord c) - +-- | The input buffer (arr :: ByteArray#, off :: CSize, len :: CSize) +-- must specify a valid UTF-8 sequence, and the character must be less +-- than 0x10FFFF, these conditions are not checked. foreign import ccall unsafe "_hs_offset_of_codepoint" c_hs_offset_of_codepoint ::ByteArray# -> CSize -> CSize -> CSize -> IO CSsize From f885b480e54854ca7a796ebd7622481492f59e38 Mon Sep 17 00:00:00 2001 From: Alex Mason Date: Fri, 1 Jul 2022 19:11:08 +1000 Subject: [PATCH 08/16] Remove Debug.Trace --- src/Data/Text.hs | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/Data/Text.hs b/src/Data/Text.hs index dce4c212..5ec34bdc 100644 --- a/src/Data/Text.hs +++ b/src/Data/Text.hs @@ -261,7 +261,6 @@ import qualified Language.Haskell.TH.Syntax as TH import Text.Printf (PrintfArg, formatArg, formatString) import System.Posix.Types (CSsize(..)) import System.IO.Unsafe (unsafePerformIO) -import Debug.Trace (traceShow) -- $setup -- >>> :set -package transformers @@ -1621,11 +1620,9 @@ splitOn pat@(Text _ _ l) src@(Text arr off len) -- to the version using @indices@ splitOn' :: Text -> Text -> [Text] splitOn' needle@(Text _ _ nlen) (Text harr hoff0 hlen0) = loop hoff0 hlen0 where - -- loop hoff hlen | traceShow ("loop", hoff, hlen, text harr hoff hlen) False = P.undefined loop !hoff !hlen | hlen < nlen = [text harr hoff hlen] loop _ hlen | hlen <= 0 = [] loop hoff hlen = case memmem needle (text harr hoff hlen) of - -- n | traceShow ("memmem", n) False -> P.undefined (-1) -> [] n -> text harr hoff (n-hoff) : loop (n + nlen) (hlen - (n-hoff) - nlen) From 15a111ae19d5bd569c87a946a76ec6dbf940925e Mon Sep 17 00:00:00 2001 From: Alex Mason Date: Fri, 1 Jul 2022 21:50:26 +1000 Subject: [PATCH 09/16] Add type signatures to new tests --- tests/Tests/Properties/Text.hs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/Tests/Properties/Text.hs b/tests/Tests/Properties/Text.hs index c578a492..c0af9232 100644 --- a/tests/Tests/Properties/Text.hs +++ b/tests/Tests/Properties/Text.hs @@ -268,21 +268,26 @@ tl_indices_char_drop n c pref suff = map fromIntegral (S.indices s t) === Slow.i s = TL.singleton c t = TL.drop n $ pref `TL.append` s `TL.append` suff +t_codepointOffset_exists :: T.Text -> Char -> T.Text -> Property t_codepointOffset_exists tPrefix target tSuffix = let cleanPrefix@(TI.Text _ _ len) = T.filter (/= target) tPrefix in T.codepointOffset (cleanPrefix <> T.singleton target <> tSuffix) target === len +t_codepointOffset_missing :: T.Text -> Char -> Bool t_codepointOffset_missing t target = T.codepointOffset (T.filter (/= target) t) target == -1 +t_breakOnChar_exists :: T.Text -> Char -> T.Text -> Bool t_breakOnChar_exists tPrefix target tSuffix = let cleanPrefix = T.filter (/= target) tPrefix (before, after) = T.breakOnChar target (cleanPrefix <> T.singleton target <> tSuffix) in before == cleanPrefix && after == T.singleton target <> tSuffix +t_breakOnChar_missing :: T.Text -> Char -> Bool t_breakOnChar_missing t target = let filtered = T.filter (/= target) t in T.breakOnChar target filtered == (filtered,T.empty) +t_breakOnChar_is_break_eq_char :: T.Text -> Char -> Bool t_breakOnChar_is_break_eq_char t c = T.breakOnChar c t == T.break (== c) t -- Make a stream appear shorter than it really is, to ensure that From 27adcd9a75b444faa76b73c0def40243f93dee25 Mon Sep 17 00:00:00 2001 From: Alex Mason Date: Fri, 1 Jul 2022 22:11:06 +1000 Subject: [PATCH 10/16] less cryptic naming in C functions --- cbits/codepoint_offset.c | 47 ++++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/cbits/codepoint_offset.c b/cbits/codepoint_offset.c index 19d76581..aa4a778f 100644 --- a/cbits/codepoint_offset.c +++ b/cbits/codepoint_offset.c @@ -38,32 +38,32 @@ */ static char * -twobyte_memmem(const unsigned char *h, size_t k, const unsigned char *n) +twobyte_memmem(const uint8_t *h, size_t hlen, const uint8_t *n) { uint16_t nw = n[0] << 8 | n[1], hw = h[0] << 8 | h[1]; - for (h += 2, k -= 2; k; k--, hw = hw << 8 | *h++) + for (h += 2, hlen -= 2; hlen; hlen--, hw = hw << 8 | *h++) if (hw == nw) return (char *)h - 2; return hw == nw ? (char *)h - 2 : 0; } static char * -threebyte_memmem(const unsigned char *h, size_t k, const unsigned char *n) +threebyte_memmem(const uint8_t *h, size_t hlen, const uint8_t *n) { uint32_t nw = (uint32_t)n[0] << 24 | n[1] << 16 | n[2] << 8; uint32_t hw = (uint32_t)h[0] << 24 | h[1] << 16 | h[2] << 8; - for (h += 3, k -= 3; k; k--, hw = (hw | *h++) << 8) + for (h += 3, hlen -= 3; hlen; hlen--, hw = (hw | *h++) << 8) if (hw == nw) return (char *)h - 3; return hw == nw ? (char *)h - 3 : 0; } static char * -fourbyte_memmem(const unsigned char *h, size_t k, const unsigned char *n) +fourbyte_memmem(const uint8_t *h, size_t hlen, const uint8_t *n) { uint32_t nw = (uint32_t)n[0] << 24 | n[1] << 16 | n[2] << 8 | n[3]; uint32_t hw = (uint32_t)h[0] << 24 | h[1] << 16 | h[2] << 8 | h[3]; - for (h += 4, k -= 4; k; k--, hw = hw << 8 | *h++) + for (h += 4, hlen -= 4; hlen; hlen--, hw = hw << 8 | *h++) if (hw == nw) return (char *)h - 4; return hw == nw ? (char *)h - 4 : 0; @@ -84,8 +84,8 @@ fourbyte_memmem(const unsigned char *h, size_t k, const unsigned char *n) * Journal of the ACM 38(3):651-675 */ static char * -twoway_memmem(const unsigned char *h, const unsigned char *z, - const unsigned char *n, size_t l) +twoway_memmem(const uint8_t *h, const uint8_t *z, + const uint8_t *n, size_t l) { size_t i, ip, jp, k, p, ms, p0, mem, mem0; size_t byteset[32 / sizeof(size_t)] = {0}; @@ -215,34 +215,35 @@ twoway_memmem(const unsigned char *h, const unsigned char *z, } } +// TODO: If on a system which provides memmem (macOS, FreeBSD, Linux), use it instead void * -_hs_memmem_standard(const void *h0, size_t k, const void *n0, size_t l) +_hs_memmem_standard(const void *h0, size_t hlen, const void *n0, size_t nlen) { - const unsigned char *h = h0, *n = n0; + const uint8_t *h = h0, *n = n0; /* Return immediately on empty needle */ - if (!l) + if (!nlen) return (void *)h; /* Return immediately when needle is longer than haystack */ - if (k < l) + if (hlen < nlen) return 0; /* Use faster algorithms for short needles */ - h = memchr(h0, *n, k); - if (!h || l == 1) + h = memchr(h0, *n, hlen); + if (!h || nlen == 1) return (void *)h; - k -= h - (const unsigned char *)h0; - if (k < l) + hlen -= h - (const uint8_t *)h0; + if (hlen < nlen) return 0; - if (l == 2) - return twobyte_memmem(h, k, n); - if (l == 3) - return threebyte_memmem(h, k, n); - if (l == 4) - return fourbyte_memmem(h, k, n); + if (nlen == 2) + return twobyte_memmem(h, hlen, n); + if (nlen == 3) + return threebyte_memmem(h, hlen, n); + if (nlen == 4) + return fourbyte_memmem(h, hlen, n); - return twoway_memmem(h, h + k, n, l); + return twoway_memmem(h, h + hlen, n, nlen); } size_t From b25b93c3f956f6cf84c6442a03ac7f9f4872085e Mon Sep 17 00:00:00 2001 From: Alex Mason Date: Fri, 1 Jul 2022 22:11:25 +1000 Subject: [PATCH 11/16] Remove use of <> for older GHCs --- tests/Tests/Properties/Text.hs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/Tests/Properties/Text.hs b/tests/Tests/Properties/Text.hs index c0af9232..b5d6e665 100644 --- a/tests/Tests/Properties/Text.hs +++ b/tests/Tests/Properties/Text.hs @@ -271,7 +271,7 @@ tl_indices_char_drop n c pref suff = map fromIntegral (S.indices s t) === Slow.i t_codepointOffset_exists :: T.Text -> Char -> T.Text -> Property t_codepointOffset_exists tPrefix target tSuffix = let cleanPrefix@(TI.Text _ _ len) = T.filter (/= target) tPrefix - in T.codepointOffset (cleanPrefix <> T.singleton target <> tSuffix) target === len + in T.codepointOffset (T.append cleanPrefix $ T.cons target tSuffix) target === len t_codepointOffset_missing :: T.Text -> Char -> Bool t_codepointOffset_missing t target = T.codepointOffset (T.filter (/= target) t) target == -1 @@ -279,8 +279,8 @@ t_codepointOffset_missing t target = T.codepointOffset (T.filter (/= target) t) t_breakOnChar_exists :: T.Text -> Char -> T.Text -> Bool t_breakOnChar_exists tPrefix target tSuffix = let cleanPrefix = T.filter (/= target) tPrefix - (before, after) = T.breakOnChar target (cleanPrefix <> T.singleton target <> tSuffix) - in before == cleanPrefix && after == T.singleton target <> tSuffix + (before, after) = T.breakOnChar target (T.append cleanPrefix $ T.cons target tSuffix) + in before == cleanPrefix && after == T.cons target tSuffix t_breakOnChar_missing :: T.Text -> Char -> Bool t_breakOnChar_missing t target = From f3ce9dd12a82d60c3600f87692188d3925146a6c Mon Sep 17 00:00:00 2001 From: Alex Mason Date: Sat, 2 Jul 2022 12:28:56 +1000 Subject: [PATCH 12/16] Remove all the memmem code, inline calls to Nbyte_memmem. --- cbits/codepoint_offset.c | 249 ++++++++------------------------------- src/Data/Text.hs | 18 --- 2 files changed, 49 insertions(+), 218 deletions(-) diff --git a/cbits/codepoint_offset.c b/cbits/codepoint_offset.c index aa4a778f..ef1385fd 100644 --- a/cbits/codepoint_offset.c +++ b/cbits/codepoint_offset.c @@ -15,7 +15,7 @@ /*- * SPDX-License-Identifier: MIT * - * Copyright (c) 2005-2014 Rich Felker, et al. + * Copyright (c) 2005-2014 Rich Felker, et al, 2022 Alex Mason. * * Permission is hereby granted, free of charge, to any person obtaining * a copy of this software and associated documentation files (the @@ -69,221 +69,70 @@ fourbyte_memmem(const uint8_t *h, size_t hlen, const uint8_t *n) return hw == nw ? (char *)h - 4 : 0; } -#define MAX(a, b) ((a) > (b) ? (a) : (b)) -#define MIN(a, b) ((a) < (b) ? (a) : (b)) - -#define BITOP(a, b, op) \ - ((a)[(size_t)(b) / (8 * sizeof *(a))] op(size_t) 1 << ((size_t)(b) % (8 * sizeof *(a)))) - -/* - * Two Way string search algorithm, with a bad shift table applied to the last - * byte of the window. A bit array marks which entries in the shift table are - * initialized to avoid fully initializing a 1kb/2kb table. - * - * Reference: CROCHEMORE M., PERRIN D., 1991, Two-way string-matching, - * Journal of the ACM 38(3):651-675 - */ -static char * -twoway_memmem(const uint8_t *h, const uint8_t *z, - const uint8_t *n, size_t l) +static int _hs_codepoint_to_utf8(uint8_t asUtf8[4], uint32_t codepoint) { - size_t i, ip, jp, k, p, ms, p0, mem, mem0; - size_t byteset[32 / sizeof(size_t)] = {0}; - size_t shift[256]; - - /* Computing length of needle and fill shift table */ - for (i = 0; i < l; i++) - BITOP(byteset, n[i], |=), shift[n[i]] = i + 1; - /* Compute maximal suffix */ - ip = -1; - jp = 0; - k = p = 1; - while (jp + k < l) + if (codepoint < 0x80) { - if (n[ip + k] == n[jp + k]) - { - if (k == p) - { - jp += p; - k = 1; - } - else - k++; - } - else if (n[ip + k] > n[jp + k]) - { - jp += k; - k = 1; - p = jp - ip; - } - else - { - ip = jp++; - k = p = 1; - } + asUtf8[0] = codepoint; + return 1; } - ms = ip; - p0 = p; - - /* And with the opposite comparison */ - ip = -1; - jp = 0; - k = p = 1; - while (jp + k < l) + else if (codepoint < 0x0800) { - if (n[ip + k] == n[jp + k]) - { - if (k == p) - { - jp += p; - k = 1; - } - else - k++; - } - else if (n[ip + k] < n[jp + k]) - { - jp += k; - k = 1; - p = jp - ip; - } - else - { - ip = jp++; - k = p = 1; - } + asUtf8[0] = (uint8_t)(((codepoint >> 6) & 0x1F) | 0xC0); + asUtf8[1] = (uint8_t)(((codepoint >> 0) & 0x3F) | 0x80); + return 2; } - if (ip + 1 > ms + 1) - ms = ip; - else - p = p0; - - /* Periodic needle? */ - if (memcmp(n, n + p, ms + 1)) + else if (codepoint < 0x10000) { - mem0 = 0; - p = MAX(ms, l - ms - 1) + 1; + asUtf8[0] = (uint8_t)(((codepoint >> 12) & 0x0F) | 0xE0); + asUtf8[1] = (uint8_t)(((codepoint >> 6) & 0x3F) | 0x80); + asUtf8[2] = (uint8_t)(((codepoint >> 0) & 0x3F) | 0x80); + return 3; } else - mem0 = l - p; - mem = 0; - - /* Search loop */ - for (;;) { - /* If remainder of haystack is shorter than needle, done */ - if (z - h < l) - return 0; - - /* Check last byte first; advance by shift on mismatch */ - if (BITOP(byteset, h[l - 1], &)) - { - k = l - shift[h[l - 1]]; - if (k) - { - if (k < mem) - k = mem; - h += k; - mem = 0; - continue; - } - } - else - { - h += l; - mem = 0; - continue; - } - - /* Compare right half */ - for (k = MAX(ms + 1, mem); k < l && n[k] == h[k]; k++) - ; - if (k < l) - { - h += k - ms; - mem = 0; - continue; - } - /* Compare left half */ - for (k = ms + 1; k > mem && n[k - 1] == h[k - 1]; k--) - ; - if (k <= mem) - return (char *)h; - h += p; - mem = mem0; + asUtf8[0] = (uint8_t)(((codepoint >> 18) & 0x07) | 0xF0); + asUtf8[1] = (uint8_t)(((codepoint >> 12) & 0x3F) | 0x80); + asUtf8[2] = (uint8_t)(((codepoint >> 6) & 0x3F) | 0x80); + asUtf8[3] = (uint8_t)(((codepoint >> 0) & 0x3F) | 0x80); + return 4; } } -// TODO: If on a system which provides memmem (macOS, FreeBSD, Linux), use it instead -void * -_hs_memmem_standard(const void *h0, size_t hlen, const void *n0, size_t nlen) +size_t _hs_offset_of_codepoint(const uint8_t *haystack0, const size_t hoffset, const size_t hlen0, const size_t needle) { - const uint8_t *h = h0, *n = n0; - - /* Return immediately on empty needle */ - if (!nlen) - return (void *)h; - - /* Return immediately when needle is longer than haystack */ - if (hlen < nlen) - return 0; - - /* Use faster algorithms for short needles */ - h = memchr(h0, *n, hlen); - if (!h || nlen == 1) - return (void *)h; - hlen -= h - (const uint8_t *)h0; - if (hlen < nlen) - return 0; - if (nlen == 2) - return twobyte_memmem(h, hlen, n); - if (nlen == 3) - return threebyte_memmem(h, hlen, n); - if (nlen == 4) - return fourbyte_memmem(h, hlen, n); - - return twoway_memmem(h, h + hlen, n, nlen); -} + const uint8_t *haystack = haystack0 + hoffset; + uint8_t *res = NULL; + uint8_t asUtf8[4] = {0}; + const int codepointLen = _hs_codepoint_to_utf8(asUtf8, needle); -size_t -_hs_text_memmem(const void *h0, size_t hoff, size_t hlen, const void *n0, size_t noff, size_t nlen) -{ - void *res = _hs_memmem_standard(h0 + hoff, hlen, n0 + noff, nlen); - return res == NULL ? -1 : (size_t)(res - h0); -} + // Skip to first location that could contain the character. + const uint8_t *haystackFirst = memchr(haystack, asUtf8[0], hlen0); -const size_t _hs_offset_of_codepoint(const uint8_t *haystack0, const size_t hoffset, const size_t hlen, size_t needle) -{ - const uint8_t *haystack = haystack0 + hoffset; - uint8_t asUtf8[4]; - size_t codepointLen; - if (needle < 0x80) - { - codepointLen = 1; - asUtf8[0] = needle; - } - else if (needle < 0x0800) + if (haystackFirst) { - codepointLen = 2; - asUtf8[0] = (uint8_t)(((needle >> 6) & 0x1F) | 0xC0); - asUtf8[1] = (uint8_t)(((needle >> 0) & 0x3F) | 0x80); - } - else if (needle < 0x10000) - { - codepointLen = 3; - asUtf8[0] = (uint8_t)(((needle >> 12) & 0x0F) | 0xE0); - asUtf8[1] = (uint8_t)(((needle >> 6) & 0x3F) | 0x80); - asUtf8[2] = (uint8_t)(((needle >> 0) & 0x3F) | 0x80); - } - else - { - codepointLen = 4; - asUtf8[0] = (uint8_t)(((needle >> 18) & 0x07) | 0xF0); - asUtf8[1] = (uint8_t)(((needle >> 12) & 0x3F) | 0x80); - asUtf8[2] = (uint8_t)(((needle >> 6) & 0x3F) | 0x80); - asUtf8[3] = (uint8_t)(((needle >> 0) & 0x3F) | 0x80); + const size_t hlen = hlen0 - (haystackFirst - haystack); + + switch (codepointLen) + { + case 1: + res = haystackFirst; + break; + case 2: + res = twobyte_memmem(haystackFirst, hlen, asUtf8); + break; + case 3: + res = threebyte_memmem(haystackFirst, hlen, asUtf8); + break; + case 4: + res = fourbyte_memmem(haystackFirst, hlen, asUtf8); + break; + default: + res = NULL; + break; + } } - const void *res = _hs_memmem_standard(haystack, hlen, asUtf8, codepointLen); + return res == NULL ? -1 : (size_t)((uint8_t *)res - haystack); -} \ No newline at end of file +} diff --git a/src/Data/Text.hs b/src/Data/Text.hs index 5ec34bdc..c5527019 100644 --- a/src/Data/Text.hs +++ b/src/Data/Text.hs @@ -157,7 +157,6 @@ module Data.Text -- ** Breaking into many substrings -- $split , splitOn - , splitOn' , split , chunksOf @@ -1616,23 +1615,6 @@ splitOn pat@(Text _ _ l) src@(Text arr off len) #-} --- | TODO: Remove - a test to ensure the behaviour using memmem is equivalent --- to the version using @indices@ -splitOn' :: Text -> Text -> [Text] -splitOn' needle@(Text _ _ nlen) (Text harr hoff0 hlen0) = loop hoff0 hlen0 where - loop !hoff !hlen | hlen < nlen = [text harr hoff hlen] - loop _ hlen | hlen <= 0 = [] - loop hoff hlen = case memmem needle (text harr hoff hlen) of - (-1) -> [] - n -> text harr hoff (n-hoff) : loop (n + nlen) (hlen - (n-hoff) - nlen) - -memmem :: Text -> Text -> Int -memmem (Text (A.ByteArray narr) noff nlen) (Text (A.ByteArray harr) hoff hlen) = unsafeDupablePerformIO $ - cSsizeToInt - P.<$> text_memmem harr (intToCSize hoff) (intToCSize hlen) - narr (intToCSize noff) (intToCSize nlen) -{-# INLINE memmem #-} - foreign import ccall unsafe "_hs_text_memmem" text_memmem :: ByteArray# -> CSize -> CSize -> ByteArray# -> CSize -> CSize -> IO CSsize From 842f9510abd407c27476ccf1c2394ad7769c414c Mon Sep 17 00:00:00 2001 From: Alex Mason Date: Sat, 2 Jul 2022 13:01:11 +1000 Subject: [PATCH 13/16] Make types consistent (thanks to godbolt.org) --- cbits/codepoint_offset.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/cbits/codepoint_offset.c b/cbits/codepoint_offset.c index ef1385fd..bcfed52c 100644 --- a/cbits/codepoint_offset.c +++ b/cbits/codepoint_offset.c @@ -37,36 +37,36 @@ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -static char * +static uint8_t * twobyte_memmem(const uint8_t *h, size_t hlen, const uint8_t *n) { uint16_t nw = n[0] << 8 | n[1], hw = h[0] << 8 | h[1]; for (h += 2, hlen -= 2; hlen; hlen--, hw = hw << 8 | *h++) if (hw == nw) - return (char *)h - 2; - return hw == nw ? (char *)h - 2 : 0; + return (uint8_t *)h - 2; + return hw == nw ? (uint8_t *)h - 2 : 0; } -static char * +static uint8_t * threebyte_memmem(const uint8_t *h, size_t hlen, const uint8_t *n) { uint32_t nw = (uint32_t)n[0] << 24 | n[1] << 16 | n[2] << 8; uint32_t hw = (uint32_t)h[0] << 24 | h[1] << 16 | h[2] << 8; for (h += 3, hlen -= 3; hlen; hlen--, hw = (hw | *h++) << 8) if (hw == nw) - return (char *)h - 3; - return hw == nw ? (char *)h - 3 : 0; + return (uint8_t *)h - 3; + return hw == nw ? (uint8_t *)h - 3 : 0; } -static char * +static uint8_t * fourbyte_memmem(const uint8_t *h, size_t hlen, const uint8_t *n) { uint32_t nw = (uint32_t)n[0] << 24 | n[1] << 16 | n[2] << 8 | n[3]; uint32_t hw = (uint32_t)h[0] << 24 | h[1] << 16 | h[2] << 8 | h[3]; for (h += 4, hlen -= 4; hlen; hlen--, hw = hw << 8 | *h++) if (hw == nw) - return (char *)h - 4; - return hw == nw ? (char *)h - 4 : 0; + return (uint8_t *)h - 4; + return hw == nw ? (uint8_t *)h - 4 : 0; } static int _hs_codepoint_to_utf8(uint8_t asUtf8[4], uint32_t codepoint) @@ -108,7 +108,7 @@ size_t _hs_offset_of_codepoint(const uint8_t *haystack0, const size_t hoffset, c const int codepointLen = _hs_codepoint_to_utf8(asUtf8, needle); // Skip to first location that could contain the character. - const uint8_t *haystackFirst = memchr(haystack, asUtf8[0], hlen0); + uint8_t *haystackFirst = (uint8_t *)memchr(haystack, asUtf8[0], hlen0); if (haystackFirst) { From 94eb8e870c7b27ca4fa9d079954d1383140ba40d Mon Sep 17 00:00:00 2001 From: Alex Mason Date: Sat, 2 Jul 2022 13:04:34 +1000 Subject: [PATCH 14/16] Export splitOnChar --- src/Data/Text.hs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Data/Text.hs b/src/Data/Text.hs index c5527019..7d89d709 100644 --- a/src/Data/Text.hs +++ b/src/Data/Text.hs @@ -158,6 +158,7 @@ module Data.Text -- $split , splitOn , split + , splitOnChar , chunksOf -- ** Breaking into lines and words From 880bdafd964879617dd433a2f9cf87ca9d2cbf39 Mon Sep 17 00:00:00 2001 From: Alex Mason Date: Sat, 2 Jul 2022 13:10:23 +1000 Subject: [PATCH 15/16] Clean up --- src/Data/Text.hs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/Data/Text.hs b/src/Data/Text.hs index 7d89d709..eb7e8786 100644 --- a/src/Data/Text.hs +++ b/src/Data/Text.hs @@ -1304,7 +1304,7 @@ measureOff !n (Text (A.ByteArray arr) off len) = if len == 0 then 0 else foreign import ccall unsafe "_hs_text_measure_off" c_measure_off :: ByteArray# -> CSize -> CSize -> CSize -> IO CSsize --- | O(n) Finds the byte offset of the first occurrence of @c@ in the @Text@, or +-- | /O(n)/ Finds the byte offset of the first occurrence of @c@ in the @Text@, or -- '-1' if if can't be found. codepointOffset :: Text -> Char -> Int codepointOffset !(Text (A.ByteArray arr) off len) c = if len == 0 then -1 else @@ -1545,8 +1545,6 @@ findAIndexOrEnd q t@(Text _arr _off len) = go 0 | otherwise = go (i+d) where Iter c d = iter t i - - -- | /O(n)/ Group characters in a string by equality. group :: Text -> [Text] group = groupBy (==) From 5f07e3a5450aa3651c3f52b05d09d37105ddf130 Mon Sep 17 00:00:00 2001 From: Alex Mason Date: Sat, 2 Jul 2022 17:40:48 +1000 Subject: [PATCH 16/16] Remove memmem import --- src/Data/Text.hs | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/Data/Text.hs b/src/Data/Text.hs index eb7e8786..16364aff 100644 --- a/src/Data/Text.hs +++ b/src/Data/Text.hs @@ -1614,10 +1614,6 @@ splitOn pat@(Text _ _ l) src@(Text arr off len) #-} -foreign import ccall unsafe "_hs_text_memmem" text_memmem - :: ByteArray# -> CSize -> CSize -> ByteArray# -> CSize -> CSize -> IO CSsize - - -- | /O(n)/ Splits a 'Text' into components delimited by separators, -- where the predicate returns True for a separator element. The -- resulting components do not contain the separators. Two adjacent