From 9a0ee2e401f67696785bf941519a370dc8396642 Mon Sep 17 00:00:00 2001
From: Alex Mason <Axman6@gmail.com>
Date: Fri, 1 Jul 2022 17:53:25 +1000
Subject: [PATCH 01/16] Add C bits

---
 cbits/codepoint_offset.c | 288 +++++++++++++++++++++++++++++++++++++++
 text.cabal               |   1 +
 2 files changed, 289 insertions(+)
 create mode 100644 cbits/codepoint_offset.c

diff --git a/cbits/codepoint_offset.c b/cbits/codepoint_offset.c
new file mode 100644
index 00000000..19d76581
--- /dev/null
+++ b/cbits/codepoint_offset.c
@@ -0,0 +1,288 @@
+
+#include <string.h>
+#include <stdint.h>
+#include <sys/types.h>
+#ifdef __x86_64__
+#include <emmintrin.h>
+#include <xmmintrin.h>
+#endif
+#include <stdbool.h>
+#include <sys/cdefs.h>
+
+// The following is from FreeBSD's memmem.c
+// https://github.com/freebsd/freebsd-src/blob/9921563f43a924d21c7bf43db4a34e724577db95/lib/libc/string/memmem.c
+
+/*-
+ * SPDX-License-Identifier: MIT
+ *
+ * Copyright (c) 2005-2014 Rich Felker, et al.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+static char *
+twobyte_memmem(const unsigned char *h, size_t k, const unsigned char *n)
+{
+	uint16_t nw = n[0] << 8 | n[1], hw = h[0] << 8 | h[1];
+	for (h += 2, k -= 2; k; k--, hw = hw << 8 | *h++)
+		if (hw == nw)
+			return (char *)h - 2;
+	return hw == nw ? (char *)h - 2 : 0;
+}
+
+static char *
+threebyte_memmem(const unsigned char *h, size_t k, const unsigned char *n)
+{
+	uint32_t nw = (uint32_t)n[0] << 24 | n[1] << 16 | n[2] << 8;
+	uint32_t hw = (uint32_t)h[0] << 24 | h[1] << 16 | h[2] << 8;
+	for (h += 3, k -= 3; k; k--, hw = (hw | *h++) << 8)
+		if (hw == nw)
+			return (char *)h - 3;
+	return hw == nw ? (char *)h - 3 : 0;
+}
+
+static char *
+fourbyte_memmem(const unsigned char *h, size_t k, const unsigned char *n)
+{
+	uint32_t nw = (uint32_t)n[0] << 24 | n[1] << 16 | n[2] << 8 | n[3];
+	uint32_t hw = (uint32_t)h[0] << 24 | h[1] << 16 | h[2] << 8 | h[3];
+	for (h += 4, k -= 4; k; k--, hw = hw << 8 | *h++)
+		if (hw == nw)
+			return (char *)h - 4;
+	return hw == nw ? (char *)h - 4 : 0;
+}
+
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+
+#define BITOP(a, b, op) \
+	((a)[(size_t)(b) / (8 * sizeof *(a))] op(size_t) 1 << ((size_t)(b) % (8 * sizeof *(a))))
+
+/*
+ * Two Way string search algorithm, with a bad shift table applied to the last
+ * byte of the window. A bit array marks which entries in the shift table are
+ * initialized to avoid fully initializing a 1kb/2kb table.
+ *
+ * Reference: CROCHEMORE M., PERRIN D., 1991, Two-way string-matching,
+ * Journal of the ACM 38(3):651-675
+ */
+static char *
+twoway_memmem(const unsigned char *h, const unsigned char *z,
+							const unsigned char *n, size_t l)
+{
+	size_t i, ip, jp, k, p, ms, p0, mem, mem0;
+	size_t byteset[32 / sizeof(size_t)] = {0};
+	size_t shift[256];
+
+	/* Computing length of needle and fill shift table */
+	for (i = 0; i < l; i++)
+		BITOP(byteset, n[i], |=), shift[n[i]] = i + 1;
+
+	/* Compute maximal suffix */
+	ip = -1;
+	jp = 0;
+	k = p = 1;
+	while (jp + k < l)
+	{
+		if (n[ip + k] == n[jp + k])
+		{
+			if (k == p)
+			{
+				jp += p;
+				k = 1;
+			}
+			else
+				k++;
+		}
+		else if (n[ip + k] > n[jp + k])
+		{
+			jp += k;
+			k = 1;
+			p = jp - ip;
+		}
+		else
+		{
+			ip = jp++;
+			k = p = 1;
+		}
+	}
+	ms = ip;
+	p0 = p;
+
+	/* And with the opposite comparison */
+	ip = -1;
+	jp = 0;
+	k = p = 1;
+	while (jp + k < l)
+	{
+		if (n[ip + k] == n[jp + k])
+		{
+			if (k == p)
+			{
+				jp += p;
+				k = 1;
+			}
+			else
+				k++;
+		}
+		else if (n[ip + k] < n[jp + k])
+		{
+			jp += k;
+			k = 1;
+			p = jp - ip;
+		}
+		else
+		{
+			ip = jp++;
+			k = p = 1;
+		}
+	}
+	if (ip + 1 > ms + 1)
+		ms = ip;
+	else
+		p = p0;
+
+	/* Periodic needle? */
+	if (memcmp(n, n + p, ms + 1))
+	{
+		mem0 = 0;
+		p = MAX(ms, l - ms - 1) + 1;
+	}
+	else
+		mem0 = l - p;
+	mem = 0;
+
+	/* Search loop */
+	for (;;)
+	{
+		/* If remainder of haystack is shorter than needle, done */
+		if (z - h < l)
+			return 0;
+
+		/* Check last byte first; advance by shift on mismatch */
+		if (BITOP(byteset, h[l - 1], &))
+		{
+			k = l - shift[h[l - 1]];
+			if (k)
+			{
+				if (k < mem)
+					k = mem;
+				h += k;
+				mem = 0;
+				continue;
+			}
+		}
+		else
+		{
+			h += l;
+			mem = 0;
+			continue;
+		}
+
+		/* Compare right half */
+		for (k = MAX(ms + 1, mem); k < l && n[k] == h[k]; k++)
+			;
+		if (k < l)
+		{
+			h += k - ms;
+			mem = 0;
+			continue;
+		}
+		/* Compare left half */
+		for (k = ms + 1; k > mem && n[k - 1] == h[k - 1]; k--)
+			;
+		if (k <= mem)
+			return (char *)h;
+		h += p;
+		mem = mem0;
+	}
+}
+
+void *
+_hs_memmem_standard(const void *h0, size_t k, const void *n0, size_t l)
+{
+	const unsigned char *h = h0, *n = n0;
+
+	/* Return immediately on empty needle */
+	if (!l)
+		return (void *)h;
+
+	/* Return immediately when needle is longer than haystack */
+	if (k < l)
+		return 0;
+
+	/* Use faster algorithms for short needles */
+	h = memchr(h0, *n, k);
+	if (!h || l == 1)
+		return (void *)h;
+	k -= h - (const unsigned char *)h0;
+	if (k < l)
+		return 0;
+	if (l == 2)
+		return twobyte_memmem(h, k, n);
+	if (l == 3)
+		return threebyte_memmem(h, k, n);
+	if (l == 4)
+		return fourbyte_memmem(h, k, n);
+
+	return twoway_memmem(h, h + k, n, l);
+}
+
+size_t
+_hs_text_memmem(const void *h0, size_t hoff, size_t hlen, const void *n0, size_t noff, size_t nlen)
+{
+	void *res = _hs_memmem_standard(h0 + hoff, hlen, n0 + noff, nlen);
+	return res == NULL ? -1 : (size_t)(res - h0);
+}
+
+const size_t _hs_offset_of_codepoint(const uint8_t *haystack0, const size_t hoffset, const size_t hlen, size_t needle)
+{
+	const uint8_t *haystack = haystack0 + hoffset;
+	uint8_t asUtf8[4];
+	size_t codepointLen;
+	if (needle < 0x80)
+	{
+		codepointLen = 1;
+		asUtf8[0] = needle;
+	}
+	else if (needle < 0x0800)
+	{
+		codepointLen = 2;
+		asUtf8[0] = (uint8_t)(((needle >> 6) & 0x1F) | 0xC0);
+		asUtf8[1] = (uint8_t)(((needle >> 0) & 0x3F) | 0x80);
+	}
+	else if (needle < 0x10000)
+	{
+		codepointLen = 3;
+		asUtf8[0] = (uint8_t)(((needle >> 12) & 0x0F) | 0xE0);
+		asUtf8[1] = (uint8_t)(((needle >> 6) & 0x3F) | 0x80);
+		asUtf8[2] = (uint8_t)(((needle >> 0) & 0x3F) | 0x80);
+	}
+	else
+	{
+		codepointLen = 4;
+		asUtf8[0] = (uint8_t)(((needle >> 18) & 0x07) | 0xF0);
+		asUtf8[1] = (uint8_t)(((needle >> 12) & 0x3F) | 0x80);
+		asUtf8[2] = (uint8_t)(((needle >> 6) & 0x3F) | 0x80);
+		asUtf8[3] = (uint8_t)(((needle >> 0) & 0x3F) | 0x80);
+	}
+	const void *res = _hs_memmem_standard(haystack, hlen, asUtf8, codepointLen);
+	return res == NULL ? -1 : (size_t)((uint8_t *)res - haystack);
+}
\ No newline at end of file
diff --git a/text.cabal b/text.cabal
index f6343609..2f9ba03b 100644
--- a/text.cabal
+++ b/text.cabal
@@ -82,6 +82,7 @@ library
                 cbits/measure_off.c
                 cbits/reverse.c
                 cbits/utils.c
+                cbits/codepoint_offset.c
   hs-source-dirs: src
 
   if flag(simdutf)

From 8aa7155fed05c687e74655473ed605a87dfb4d60 Mon Sep 17 00:00:00 2001
From: Alex Mason <Axman6@gmail.com>
Date: Fri, 1 Jul 2022 17:54:24 +1000
Subject: [PATCH 02/16] Initial Text functions and tests

---
 src/Data/Text.hs               | 85 ++++++++++++++++++++++++++++++----
 tests/Tests/Properties/Text.hs | 20 +++++++-
 2 files changed, 96 insertions(+), 9 deletions(-)

diff --git a/src/Data/Text.hs b/src/Data/Text.hs
index 2ec5980a..c1338dbe 100644
--- a/src/Data/Text.hs
+++ b/src/Data/Text.hs
@@ -143,6 +143,7 @@ module Data.Text
     , stripEnd
     , splitAt
     , breakOn
+    , breakOnChar
     , breakOnEnd
     , break
     , span
@@ -156,6 +157,7 @@ module Data.Text
     -- ** Breaking into many substrings
     -- $split
     , splitOn
+    , splitOn'
     , split
     , chunksOf
 
@@ -204,6 +206,7 @@ module Data.Text
     , unpackCStringAscii#
 
     , measureOff
+    , codepointOffset
     ) where
 
 import Prelude (Char, Bool(..), Int, Maybe(..), String,
@@ -258,6 +261,7 @@ import qualified Language.Haskell.TH.Syntax as TH
 import Text.Printf (PrintfArg, formatArg, formatString)
 import System.Posix.Types (CSsize(..))
 import System.IO.Unsafe (unsafePerformIO)
+import Debug.Trace (traceShow)
 
 -- $setup
 -- >>> :set -package transformers
@@ -411,7 +415,7 @@ instance Data Text where
 instance TH.Lift Text where
 #if MIN_VERSION_template_haskell(2,16,0)
   lift txt = do
-    let (ptr, len) = unsafePerformIO $ asForeignPtr txt 
+    let (ptr, len) = unsafePerformIO $ asForeignPtr txt
     let lenInt = P.fromIntegral len
     TH.appE (TH.appE (TH.varE 'unpackCStringLen#) (TH.litE . TH.bytesPrimL $ TH.mkBytes ptr 0 lenInt)) (TH.lift lenInt)
 #else
@@ -1300,6 +1304,17 @@ measureOff !n (Text (A.ByteArray arr) off len) = if len == 0 then 0 else
 foreign import ccall unsafe "_hs_text_measure_off" c_measure_off
     :: ByteArray# -> CSize -> CSize -> CSize -> IO CSsize
 
+-- | O(n) Finds the offset of the first occurrence of @c@ in the @Text@, or
+-- '-1' if if can't be found.
+codepointOffset :: Text -> Char -> Int
+codepointOffset !(Text (A.ByteArray arr) off len) c = if len == 0 then -1 else
+   cSsizeToInt $ unsafeDupablePerformIO $
+    c_hs_offset_of_codepoint arr (intToCSize off) (intToCSize len) (intToCSize $ ord c)
+
+
+foreign import ccall unsafe "_hs_offset_of_codepoint" c_hs_offset_of_codepoint
+  ::ByteArray# -> CSize -> CSize -> CSize -> IO CSsize
+
 -- | /O(n)/ 'takeEnd' @n@ @t@ returns the suffix remaining after
 -- taking @n@ characters from the end of @t@.
 --
@@ -1528,6 +1543,8 @@ findAIndexOrEnd q t@(Text _arr _off len) = go 0
                 | otherwise             = go (i+d)
                 where Iter c d          = iter t i
 
+
+
 -- | /O(n)/ Group characters in a string by equality.
 group :: Text -> [Text]
 group = groupBy (==)
@@ -1584,7 +1601,7 @@ splitOn :: HasCallStack
         -> [Text]
 splitOn pat@(Text _ _ l) src@(Text arr off len)
     | l <= 0          = emptyError "splitOn"
-    | isSingleton pat = split (== unsafeHead pat) src
+    | isSingleton pat = splitOnChar (unsafeHead pat) src
     | otherwise       = go 0 (indices pat src)
   where
     go !s (x:xs) =  text arr (s+off) (x-s) : go (x+l) xs
@@ -1592,19 +1609,42 @@ splitOn pat@(Text _ _ l) src@(Text arr off len)
 {-# INLINE [1] splitOn #-}
 
 {-# RULES
-"TEXT splitOn/singleton -> split/==" [~1] forall c t.
-    splitOn (singleton c) t = split (==c) t
+"TEXT splitOn/singleton -> splitOnChar" [~1] forall c t.
+    splitOn (singleton c) t = splitOnChar c t
   #-}
 
+splitOn' :: Text -> Text -> [Text]
+splitOn' needle@(Text _ _ nlen) (Text harr hoff0 hlen0) = loop hoff0 hlen0 where
+    -- loop hoff hlen | traceShow ("loop", hoff, hlen, text harr hoff hlen) False = P.undefined
+    loop !hoff !hlen | hlen < nlen = [text harr hoff hlen]
+    loop _    hlen | hlen <= 0 = []
+    loop hoff hlen = case memmem needle (text harr hoff hlen) of
+      -- n | traceShow ("memmem", n) False -> P.undefined
+      (-1) -> []
+      n    -> text harr hoff (n-hoff) : loop (n + nlen) (hlen - (n-hoff) - nlen)
+
+memmem :: Text -> Text -> Int
+memmem (Text (A.ByteArray narr) noff nlen) (Text (A.ByteArray harr) hoff hlen) = unsafeDupablePerformIO $
+  cSsizeToInt
+    P.<$> text_memmem harr (intToCSize hoff) (intToCSize hlen)
+                      narr (intToCSize noff) (intToCSize nlen)
+{-# INLINE memmem #-}
+
+foreign import ccall unsafe "_hs_text_memmem" text_memmem
+    :: ByteArray# -> CSize -> CSize -> ByteArray# -> CSize -> CSize -> IO CSsize
+
+
 -- | /O(n)/ Splits a 'Text' into components delimited by separators,
 -- where the predicate returns True for a separator element.  The
 -- resulting components do not contain the separators.  Two adjacent
--- separators result in an empty component in the output.  eg.
+-- separators result in an empty component in the output.  To split
+-- on a specific character, use @splitOnChar@.
+-- eg.
 --
--- >>> split (=='a') "aabbaca"
--- ["","","bb","c",""]
+-- >>> split isUpper "theQuickBrownFox"
+-- ["the","uick","rown","ox"]
 --
--- >>> split (=='a') ""
+-- >>> split isUpper ""
 -- [""]
 split :: (Char -> Bool) -> Text -> [Text]
 split _ t@(Text _off _arr 0) = [t]
@@ -1614,6 +1654,27 @@ split p t = loop t
               where (# l, s' #) = span_ (not . p) s
 {-# INLINE split #-}
 
+
+{- TODO Fix:
+Rule "TEXT split/eq1 -> splitOnChar/==" may never fire
+  because rule "Class op ==" for ‘==’ might fire first
+Probable fix: add phase [n] or [~n] to the competing rulecompile(-Winline-rule-shadowing)
+-}
+{-# RULES
+"TEXT split/eq1 -> splitOnChar/==" [~2] forall c t.
+    split (== c) t = splitOnChar c t
+"TEXT split/eq1 -> splitOnChar/==" [~2] forall c t.
+    split (c ==) t = splitOnChar c t
+  #-}
+
+
+splitOnChar :: Char -> Text -> [Text]
+splitOnChar _ t@(Text _off _arr 0) = [t]
+splitOnChar c t = loop t
+      where loop s | null s'   = [l]
+                 | otherwise = l : loop (unsafeTail s')
+              where ( l, s' ) = breakOnChar c s
+
 -- | /O(n)/ Splits a 'Text' into components of length @k@.  The last
 -- element may be shorter than the other chunks, depending on the
 -- length of the input. Examples:
@@ -1737,6 +1798,8 @@ filter p = go
 -- is the prefix of @haystack@ before @needle@ is matched.  The second
 -- is the remainder of @haystack@, starting with the match.
 --
+-- To break on a specific character, use @breakOnChar@
+--
 -- Examples:
 --
 -- >>> breakOn "::" "a::b::c"
@@ -1764,6 +1827,12 @@ breakOn pat src@(Text arr off len)
                     (x:_) -> (text arr off x, text arr (off+x) (len-x))
 {-# INLINE breakOn #-}
 
+breakOnChar :: Char -> Text -> (Text, Text)
+breakOnChar c src@(Text arr off len) = case codepointOffset src c of
+  -1 -> (src, empty)
+  n  -> (text arr off n, text arr (off+n) (len-n) )
+
+
 -- | /O(n+m)/ Similar to 'breakOn', but searches from the end of the
 -- string.
 --
diff --git a/tests/Tests/Properties/Text.hs b/tests/Tests/Properties/Text.hs
index 3faa7bf3..cdd8e290 100644
--- a/tests/Tests/Properties/Text.hs
+++ b/tests/Tests/Properties/Text.hs
@@ -23,6 +23,7 @@ import qualified Data.Text.Internal.Fusion.Common as S
 import qualified Data.Text.Internal.Lazy.Fusion as SL
 import qualified Data.Text.Internal.Lazy.Search as S (indices)
 import qualified Data.Text.Internal.Search as T (indices)
+import qualified Data.Text.Internal as TI (Text(..))
 import qualified Data.Text.Lazy as TL
 import qualified Tests.SlowFunctions as Slow
 
@@ -267,6 +268,19 @@ tl_indices_char_drop n c pref suff = map fromIntegral (S.indices s t) === Slow.i
     s = TL.singleton c
     t = TL.drop n $ pref `TL.append` s `TL.append` suff
 
+t_codepointOffset_exists tPrefix target tSuffix =
+  let cleanPrefix@(TI.Text _ _ len) = T.filter (/= target) tPrefix
+  in T.codepointOffset (cleanPrefix <> T.singleton target <> tSuffix) target === len
+
+t_codepointOffset_missing t target = T.codepointOffset (T.filter (/= target) t) target == -1
+
+t_breakOnChar_exists tPrefix target tSuffix =
+  let cleanPrefix = T.filter (/= target) tPrefix
+      (before, after) = T.breakOnChar target (cleanPrefix <> T.singleton target <> tSuffix)
+  in before == cleanPrefix && after == T.singleton target <> tSuffix
+
+t_breakOnChar_missing t target = T.breakOnChar target (T.filter (/= target) t) == (t,T.empty)
+
 -- Make a stream appear shorter than it really is, to ensure that
 -- functions that consume inaccurately sized streams behave
 -- themselves.
@@ -374,7 +388,11 @@ testText =
       testProperty "t_find" t_find,
       testProperty "tl_find" tl_find,
       testProperty "t_partition" t_partition,
-      testProperty "tl_partition" tl_partition
+      testProperty "tl_partition" tl_partition,
+      testProperty "t_codepointOffset_exists" t_codepointOffset_exists,
+      testProperty "t_codepointOffset_missing" t_codepointOffset_missing,
+      testProperty "t_breakOnChar_exists" t_breakOnChar_exists,
+      testProperty "t_breakOnChar_missing" t_breakOnChar_missing
     ],
 
     testGroup "indexing" [

From 672d2d79b5f6d1fd12344669718f466c48a5d6c5 Mon Sep 17 00:00:00 2001
From: Alex Mason <Axman6@gmail.com>
Date: Fri, 1 Jul 2022 18:15:41 +1000
Subject: [PATCH 03/16] Fix and add tests

---
 tests/Tests/Properties/Text.hs | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/tests/Tests/Properties/Text.hs b/tests/Tests/Properties/Text.hs
index cdd8e290..c578a492 100644
--- a/tests/Tests/Properties/Text.hs
+++ b/tests/Tests/Properties/Text.hs
@@ -279,7 +279,11 @@ t_breakOnChar_exists tPrefix target tSuffix =
       (before, after) = T.breakOnChar target (cleanPrefix <> T.singleton target <> tSuffix)
   in before == cleanPrefix && after == T.singleton target <> tSuffix
 
-t_breakOnChar_missing t target = T.breakOnChar target (T.filter (/= target) t) == (t,T.empty)
+t_breakOnChar_missing t target =
+  let filtered = T.filter (/= target) t
+  in T.breakOnChar target filtered == (filtered,T.empty)
+
+t_breakOnChar_is_break_eq_char t c = T.breakOnChar c t == T.break (== c) t
 
 -- Make a stream appear shorter than it really is, to ensure that
 -- functions that consume inaccurately sized streams behave
@@ -392,7 +396,8 @@ testText =
       testProperty "t_codepointOffset_exists" t_codepointOffset_exists,
       testProperty "t_codepointOffset_missing" t_codepointOffset_missing,
       testProperty "t_breakOnChar_exists" t_breakOnChar_exists,
-      testProperty "t_breakOnChar_missing" t_breakOnChar_missing
+      testProperty "t_breakOnChar_missing" t_breakOnChar_missing,
+      testProperty "t_breakOnChar_is_break_eq_char" t_breakOnChar_is_break_eq_char
     ],
 
     testGroup "indexing" [

From 13bba0d499e7ab0c73e75ba32d23f1418d985de1 Mon Sep 17 00:00:00 2001
From: Alex Mason <Axman6@gmail.com>
Date: Fri, 1 Jul 2022 18:54:23 +1000
Subject: [PATCH 04/16] Add haddocs for splitOnChar

---
 src/Data/Text.hs | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/Data/Text.hs b/src/Data/Text.hs
index c1338dbe..245c33ef 100644
--- a/src/Data/Text.hs
+++ b/src/Data/Text.hs
@@ -266,6 +266,7 @@ import Debug.Trace (traceShow)
 -- $setup
 -- >>> :set -package transformers
 -- >>> import Control.Monad.Trans.State
+-- >>> import Data.Char (isUpper)
 -- >>> import Data.Text
 -- >>> import qualified Data.Text as T
 -- >>> :seti -XOverloadedStrings
@@ -1668,6 +1669,11 @@ Probable fix: add phase [n] or [~n] to the competing rulecompile(-Winline-rule-s
   #-}
 
 
+-- | /O(n)/ Splits a 'Text' into components delimited by the given @Char@.
+-- The behaviour is the same as @split@ but should be faster than @split (== c)@
+--
+-- >>> split (=='a') "aabbaca"
+-- ["","","bb","c",""]
 splitOnChar :: Char -> Text -> [Text]
 splitOnChar _ t@(Text _off _arr 0) = [t]
 splitOnChar c t = loop t

From 868d97ae5803b4c7048719e12b474c2903f9b734 Mon Sep 17 00:00:00 2001
From: Alex Mason <Axman6@gmail.com>
Date: Fri, 1 Jul 2022 18:57:31 +1000
Subject: [PATCH 05/16] Add haddocs for breakOnChar

---
 src/Data/Text.hs | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/Data/Text.hs b/src/Data/Text.hs
index 245c33ef..e9999d95 100644
--- a/src/Data/Text.hs
+++ b/src/Data/Text.hs
@@ -1833,6 +1833,13 @@ breakOn pat src@(Text arr off len)
                     (x:_) -> (text arr off x, text arr (off+x) (len-x))
 {-# INLINE breakOn #-}
 
+-- | /O(n)/ Equivalent to @breakOn (== c)@ but should be faster.
+--
+-- >>> breakOnChar '/' "foo/bar/"
+-- ("foo","/bar/")
+--
+-- >>> breakOnChar '/' "foobar"
+-- ("foobar","")
 breakOnChar :: Char -> Text -> (Text, Text)
 breakOnChar c src@(Text arr off len) = case codepointOffset src c of
   -1 -> (src, empty)

From 277e1bd56da1fec6fcb658d659c55f66e7b3de3b Mon Sep 17 00:00:00 2001
From: Alex Mason <Axman6@gmail.com>
Date: Fri, 1 Jul 2022 18:59:10 +1000
Subject: [PATCH 06/16] Add temporary docs for splitOn'

---
 src/Data/Text.hs | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/Data/Text.hs b/src/Data/Text.hs
index e9999d95..d130b3cc 100644
--- a/src/Data/Text.hs
+++ b/src/Data/Text.hs
@@ -1614,6 +1614,9 @@ splitOn pat@(Text _ _ l) src@(Text arr off len)
     splitOn (singleton c) t = splitOnChar c t
   #-}
 
+
+-- | TODO: Remove - a test to ensure the behaviour using memmem is equivalent
+-- to the version using @indices@
 splitOn' :: Text -> Text -> [Text]
 splitOn' needle@(Text _ _ nlen) (Text harr hoff0 hlen0) = loop hoff0 hlen0 where
     -- loop hoff hlen | traceShow ("loop", hoff, hlen, text harr hoff hlen) False = P.undefined

From bef46dbaa7170fe9fb096ad5512c3276a880620f Mon Sep 17 00:00:00 2001
From: Alex Mason <Axman6@gmail.com>
Date: Fri, 1 Jul 2022 19:02:50 +1000
Subject: [PATCH 07/16] Add haddocs for codepointOffset

---
 src/Data/Text.hs | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/Data/Text.hs b/src/Data/Text.hs
index d130b3cc..dce4c212 100644
--- a/src/Data/Text.hs
+++ b/src/Data/Text.hs
@@ -1305,14 +1305,16 @@ measureOff !n (Text (A.ByteArray arr) off len) = if len == 0 then 0 else
 foreign import ccall unsafe "_hs_text_measure_off" c_measure_off
     :: ByteArray# -> CSize -> CSize -> CSize -> IO CSsize
 
--- | O(n) Finds the offset of the first occurrence of @c@ in the @Text@, or
+-- | O(n) Finds the byte offset of the first occurrence of @c@ in the @Text@, or
 -- '-1' if if can't be found.
 codepointOffset :: Text -> Char -> Int
 codepointOffset !(Text (A.ByteArray arr) off len) c = if len == 0 then -1 else
    cSsizeToInt $ unsafeDupablePerformIO $
     c_hs_offset_of_codepoint arr (intToCSize off) (intToCSize len) (intToCSize $ ord c)
 
-
+-- | The input buffer (arr :: ByteArray#, off :: CSize, len :: CSize)
+-- must specify a valid UTF-8 sequence, and the character must be less
+-- than 0x10FFFF, these conditions are not checked.
 foreign import ccall unsafe "_hs_offset_of_codepoint" c_hs_offset_of_codepoint
   ::ByteArray# -> CSize -> CSize -> CSize -> IO CSsize
 

From f885b480e54854ca7a796ebd7622481492f59e38 Mon Sep 17 00:00:00 2001
From: Alex Mason <Axman6@gmail.com>
Date: Fri, 1 Jul 2022 19:11:08 +1000
Subject: [PATCH 08/16] Remove Debug.Trace

---
 src/Data/Text.hs | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/Data/Text.hs b/src/Data/Text.hs
index dce4c212..5ec34bdc 100644
--- a/src/Data/Text.hs
+++ b/src/Data/Text.hs
@@ -261,7 +261,6 @@ import qualified Language.Haskell.TH.Syntax as TH
 import Text.Printf (PrintfArg, formatArg, formatString)
 import System.Posix.Types (CSsize(..))
 import System.IO.Unsafe (unsafePerformIO)
-import Debug.Trace (traceShow)
 
 -- $setup
 -- >>> :set -package transformers
@@ -1621,11 +1620,9 @@ splitOn pat@(Text _ _ l) src@(Text arr off len)
 -- to the version using @indices@
 splitOn' :: Text -> Text -> [Text]
 splitOn' needle@(Text _ _ nlen) (Text harr hoff0 hlen0) = loop hoff0 hlen0 where
-    -- loop hoff hlen | traceShow ("loop", hoff, hlen, text harr hoff hlen) False = P.undefined
     loop !hoff !hlen | hlen < nlen = [text harr hoff hlen]
     loop _    hlen | hlen <= 0 = []
     loop hoff hlen = case memmem needle (text harr hoff hlen) of
-      -- n | traceShow ("memmem", n) False -> P.undefined
       (-1) -> []
       n    -> text harr hoff (n-hoff) : loop (n + nlen) (hlen - (n-hoff) - nlen)
 

From 15a111ae19d5bd569c87a946a76ec6dbf940925e Mon Sep 17 00:00:00 2001
From: Alex Mason <Axman6@gmail.com>
Date: Fri, 1 Jul 2022 21:50:26 +1000
Subject: [PATCH 09/16] Add type signatures to new tests

---
 tests/Tests/Properties/Text.hs | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tests/Tests/Properties/Text.hs b/tests/Tests/Properties/Text.hs
index c578a492..c0af9232 100644
--- a/tests/Tests/Properties/Text.hs
+++ b/tests/Tests/Properties/Text.hs
@@ -268,21 +268,26 @@ tl_indices_char_drop n c pref suff = map fromIntegral (S.indices s t) === Slow.i
     s = TL.singleton c
     t = TL.drop n $ pref `TL.append` s `TL.append` suff
 
+t_codepointOffset_exists :: T.Text -> Char -> T.Text -> Property
 t_codepointOffset_exists tPrefix target tSuffix =
   let cleanPrefix@(TI.Text _ _ len) = T.filter (/= target) tPrefix
   in T.codepointOffset (cleanPrefix <> T.singleton target <> tSuffix) target === len
 
+t_codepointOffset_missing :: T.Text -> Char -> Bool
 t_codepointOffset_missing t target = T.codepointOffset (T.filter (/= target) t) target == -1
 
+t_breakOnChar_exists :: T.Text -> Char -> T.Text -> Bool
 t_breakOnChar_exists tPrefix target tSuffix =
   let cleanPrefix = T.filter (/= target) tPrefix
       (before, after) = T.breakOnChar target (cleanPrefix <> T.singleton target <> tSuffix)
   in before == cleanPrefix && after == T.singleton target <> tSuffix
 
+t_breakOnChar_missing :: T.Text -> Char -> Bool
 t_breakOnChar_missing t target =
   let filtered = T.filter (/= target) t
   in T.breakOnChar target filtered == (filtered,T.empty)
 
+t_breakOnChar_is_break_eq_char :: T.Text -> Char -> Bool
 t_breakOnChar_is_break_eq_char t c = T.breakOnChar c t == T.break (== c) t
 
 -- Make a stream appear shorter than it really is, to ensure that

From 27adcd9a75b444faa76b73c0def40243f93dee25 Mon Sep 17 00:00:00 2001
From: Alex Mason <Axman6@gmail.com>
Date: Fri, 1 Jul 2022 22:11:06 +1000
Subject: [PATCH 10/16] less cryptic naming in C functions

---
 cbits/codepoint_offset.c | 47 ++++++++++++++++++++--------------------
 1 file changed, 24 insertions(+), 23 deletions(-)

diff --git a/cbits/codepoint_offset.c b/cbits/codepoint_offset.c
index 19d76581..aa4a778f 100644
--- a/cbits/codepoint_offset.c
+++ b/cbits/codepoint_offset.c
@@ -38,32 +38,32 @@
  */
 
 static char *
-twobyte_memmem(const unsigned char *h, size_t k, const unsigned char *n)
+twobyte_memmem(const uint8_t *h, size_t hlen, const uint8_t *n)
 {
 	uint16_t nw = n[0] << 8 | n[1], hw = h[0] << 8 | h[1];
-	for (h += 2, k -= 2; k; k--, hw = hw << 8 | *h++)
+	for (h += 2, hlen -= 2; hlen; hlen--, hw = hw << 8 | *h++)
 		if (hw == nw)
 			return (char *)h - 2;
 	return hw == nw ? (char *)h - 2 : 0;
 }
 
 static char *
-threebyte_memmem(const unsigned char *h, size_t k, const unsigned char *n)
+threebyte_memmem(const uint8_t *h, size_t hlen, const uint8_t *n)
 {
 	uint32_t nw = (uint32_t)n[0] << 24 | n[1] << 16 | n[2] << 8;
 	uint32_t hw = (uint32_t)h[0] << 24 | h[1] << 16 | h[2] << 8;
-	for (h += 3, k -= 3; k; k--, hw = (hw | *h++) << 8)
+	for (h += 3, hlen -= 3; hlen; hlen--, hw = (hw | *h++) << 8)
 		if (hw == nw)
 			return (char *)h - 3;
 	return hw == nw ? (char *)h - 3 : 0;
 }
 
 static char *
-fourbyte_memmem(const unsigned char *h, size_t k, const unsigned char *n)
+fourbyte_memmem(const uint8_t *h, size_t hlen, const uint8_t *n)
 {
 	uint32_t nw = (uint32_t)n[0] << 24 | n[1] << 16 | n[2] << 8 | n[3];
 	uint32_t hw = (uint32_t)h[0] << 24 | h[1] << 16 | h[2] << 8 | h[3];
-	for (h += 4, k -= 4; k; k--, hw = hw << 8 | *h++)
+	for (h += 4, hlen -= 4; hlen; hlen--, hw = hw << 8 | *h++)
 		if (hw == nw)
 			return (char *)h - 4;
 	return hw == nw ? (char *)h - 4 : 0;
@@ -84,8 +84,8 @@ fourbyte_memmem(const unsigned char *h, size_t k, const unsigned char *n)
  * Journal of the ACM 38(3):651-675
  */
 static char *
-twoway_memmem(const unsigned char *h, const unsigned char *z,
-							const unsigned char *n, size_t l)
+twoway_memmem(const uint8_t *h, const uint8_t *z,
+							const uint8_t *n, size_t l)
 {
 	size_t i, ip, jp, k, p, ms, p0, mem, mem0;
 	size_t byteset[32 / sizeof(size_t)] = {0};
@@ -215,34 +215,35 @@ twoway_memmem(const unsigned char *h, const unsigned char *z,
 	}
 }
 
+// TODO: If on a system which provides memmem (macOS, FreeBSD, Linux), use it instead
 void *
-_hs_memmem_standard(const void *h0, size_t k, const void *n0, size_t l)
+_hs_memmem_standard(const void *h0, size_t hlen, const void *n0, size_t nlen)
 {
-	const unsigned char *h = h0, *n = n0;
+	const uint8_t *h = h0, *n = n0;
 
 	/* Return immediately on empty needle */
-	if (!l)
+	if (!nlen)
 		return (void *)h;
 
 	/* Return immediately when needle is longer than haystack */
-	if (k < l)
+	if (hlen < nlen)
 		return 0;
 
 	/* Use faster algorithms for short needles */
-	h = memchr(h0, *n, k);
-	if (!h || l == 1)
+	h = memchr(h0, *n, hlen);
+	if (!h || nlen == 1)
 		return (void *)h;
-	k -= h - (const unsigned char *)h0;
-	if (k < l)
+	hlen -= h - (const uint8_t *)h0;
+	if (hlen < nlen)
 		return 0;
-	if (l == 2)
-		return twobyte_memmem(h, k, n);
-	if (l == 3)
-		return threebyte_memmem(h, k, n);
-	if (l == 4)
-		return fourbyte_memmem(h, k, n);
+	if (nlen == 2)
+		return twobyte_memmem(h, hlen, n);
+	if (nlen == 3)
+		return threebyte_memmem(h, hlen, n);
+	if (nlen == 4)
+		return fourbyte_memmem(h, hlen, n);
 
-	return twoway_memmem(h, h + k, n, l);
+	return twoway_memmem(h, h + hlen, n, nlen);
 }
 
 size_t

From b25b93c3f956f6cf84c6442a03ac7f9f4872085e Mon Sep 17 00:00:00 2001
From: Alex Mason <Axman6@gmail.com>
Date: Fri, 1 Jul 2022 22:11:25 +1000
Subject: [PATCH 11/16] Remove use of <> for older GHCs

---
 tests/Tests/Properties/Text.hs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/Tests/Properties/Text.hs b/tests/Tests/Properties/Text.hs
index c0af9232..b5d6e665 100644
--- a/tests/Tests/Properties/Text.hs
+++ b/tests/Tests/Properties/Text.hs
@@ -271,7 +271,7 @@ tl_indices_char_drop n c pref suff = map fromIntegral (S.indices s t) === Slow.i
 t_codepointOffset_exists :: T.Text -> Char -> T.Text -> Property
 t_codepointOffset_exists tPrefix target tSuffix =
   let cleanPrefix@(TI.Text _ _ len) = T.filter (/= target) tPrefix
-  in T.codepointOffset (cleanPrefix <> T.singleton target <> tSuffix) target === len
+  in T.codepointOffset (T.append cleanPrefix $ T.cons target tSuffix) target === len
 
 t_codepointOffset_missing :: T.Text -> Char -> Bool
 t_codepointOffset_missing t target = T.codepointOffset (T.filter (/= target) t) target == -1
@@ -279,8 +279,8 @@ t_codepointOffset_missing t target = T.codepointOffset (T.filter (/= target) t)
 t_breakOnChar_exists :: T.Text -> Char -> T.Text -> Bool
 t_breakOnChar_exists tPrefix target tSuffix =
   let cleanPrefix = T.filter (/= target) tPrefix
-      (before, after) = T.breakOnChar target (cleanPrefix <> T.singleton target <> tSuffix)
-  in before == cleanPrefix && after == T.singleton target <> tSuffix
+      (before, after) = T.breakOnChar target (T.append cleanPrefix $ T.cons target tSuffix)
+  in before == cleanPrefix && after == T.cons target tSuffix
 
 t_breakOnChar_missing :: T.Text -> Char -> Bool
 t_breakOnChar_missing t target =

From f3ce9dd12a82d60c3600f87692188d3925146a6c Mon Sep 17 00:00:00 2001
From: Alex Mason <Axman6@gmail.com>
Date: Sat, 2 Jul 2022 12:28:56 +1000
Subject: [PATCH 12/16] Remove all the memmem code, inline calls to
 Nbyte_memmem.

---
 cbits/codepoint_offset.c | 249 ++++++++-------------------------------
 src/Data/Text.hs         |  18 ---
 2 files changed, 49 insertions(+), 218 deletions(-)

diff --git a/cbits/codepoint_offset.c b/cbits/codepoint_offset.c
index aa4a778f..ef1385fd 100644
--- a/cbits/codepoint_offset.c
+++ b/cbits/codepoint_offset.c
@@ -15,7 +15,7 @@
 /*-
  * SPDX-License-Identifier: MIT
  *
- * Copyright (c) 2005-2014 Rich Felker, et al.
+ * Copyright (c) 2005-2014 Rich Felker, et al, 2022 Alex Mason.
  *
  * Permission is hereby granted, free of charge, to any person obtaining
  * a copy of this software and associated documentation files (the
@@ -69,221 +69,70 @@ fourbyte_memmem(const uint8_t *h, size_t hlen, const uint8_t *n)
 	return hw == nw ? (char *)h - 4 : 0;
 }
 
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-
-#define BITOP(a, b, op) \
-	((a)[(size_t)(b) / (8 * sizeof *(a))] op(size_t) 1 << ((size_t)(b) % (8 * sizeof *(a))))
-
-/*
- * Two Way string search algorithm, with a bad shift table applied to the last
- * byte of the window. A bit array marks which entries in the shift table are
- * initialized to avoid fully initializing a 1kb/2kb table.
- *
- * Reference: CROCHEMORE M., PERRIN D., 1991, Two-way string-matching,
- * Journal of the ACM 38(3):651-675
- */
-static char *
-twoway_memmem(const uint8_t *h, const uint8_t *z,
-							const uint8_t *n, size_t l)
+static int _hs_codepoint_to_utf8(uint8_t asUtf8[4], uint32_t codepoint)
 {
-	size_t i, ip, jp, k, p, ms, p0, mem, mem0;
-	size_t byteset[32 / sizeof(size_t)] = {0};
-	size_t shift[256];
-
-	/* Computing length of needle and fill shift table */
-	for (i = 0; i < l; i++)
-		BITOP(byteset, n[i], |=), shift[n[i]] = i + 1;
 
-	/* Compute maximal suffix */
-	ip = -1;
-	jp = 0;
-	k = p = 1;
-	while (jp + k < l)
+	if (codepoint < 0x80)
 	{
-		if (n[ip + k] == n[jp + k])
-		{
-			if (k == p)
-			{
-				jp += p;
-				k = 1;
-			}
-			else
-				k++;
-		}
-		else if (n[ip + k] > n[jp + k])
-		{
-			jp += k;
-			k = 1;
-			p = jp - ip;
-		}
-		else
-		{
-			ip = jp++;
-			k = p = 1;
-		}
+		asUtf8[0] = codepoint;
+		return 1;
 	}
-	ms = ip;
-	p0 = p;
-
-	/* And with the opposite comparison */
-	ip = -1;
-	jp = 0;
-	k = p = 1;
-	while (jp + k < l)
+	else if (codepoint < 0x0800)
 	{
-		if (n[ip + k] == n[jp + k])
-		{
-			if (k == p)
-			{
-				jp += p;
-				k = 1;
-			}
-			else
-				k++;
-		}
-		else if (n[ip + k] < n[jp + k])
-		{
-			jp += k;
-			k = 1;
-			p = jp - ip;
-		}
-		else
-		{
-			ip = jp++;
-			k = p = 1;
-		}
+		asUtf8[0] = (uint8_t)(((codepoint >> 6) & 0x1F) | 0xC0);
+		asUtf8[1] = (uint8_t)(((codepoint >> 0) & 0x3F) | 0x80);
+		return 2;
 	}
-	if (ip + 1 > ms + 1)
-		ms = ip;
-	else
-		p = p0;
-
-	/* Periodic needle? */
-	if (memcmp(n, n + p, ms + 1))
+	else if (codepoint < 0x10000)
 	{
-		mem0 = 0;
-		p = MAX(ms, l - ms - 1) + 1;
+		asUtf8[0] = (uint8_t)(((codepoint >> 12) & 0x0F) | 0xE0);
+		asUtf8[1] = (uint8_t)(((codepoint >> 6) & 0x3F) | 0x80);
+		asUtf8[2] = (uint8_t)(((codepoint >> 0) & 0x3F) | 0x80);
+		return 3;
 	}
 	else
-		mem0 = l - p;
-	mem = 0;
-
-	/* Search loop */
-	for (;;)
 	{
-		/* If remainder of haystack is shorter than needle, done */
-		if (z - h < l)
-			return 0;
-
-		/* Check last byte first; advance by shift on mismatch */
-		if (BITOP(byteset, h[l - 1], &))
-		{
-			k = l - shift[h[l - 1]];
-			if (k)
-			{
-				if (k < mem)
-					k = mem;
-				h += k;
-				mem = 0;
-				continue;
-			}
-		}
-		else
-		{
-			h += l;
-			mem = 0;
-			continue;
-		}
-
-		/* Compare right half */
-		for (k = MAX(ms + 1, mem); k < l && n[k] == h[k]; k++)
-			;
-		if (k < l)
-		{
-			h += k - ms;
-			mem = 0;
-			continue;
-		}
-		/* Compare left half */
-		for (k = ms + 1; k > mem && n[k - 1] == h[k - 1]; k--)
-			;
-		if (k <= mem)
-			return (char *)h;
-		h += p;
-		mem = mem0;
+		asUtf8[0] = (uint8_t)(((codepoint >> 18) & 0x07) | 0xF0);
+		asUtf8[1] = (uint8_t)(((codepoint >> 12) & 0x3F) | 0x80);
+		asUtf8[2] = (uint8_t)(((codepoint >> 6) & 0x3F) | 0x80);
+		asUtf8[3] = (uint8_t)(((codepoint >> 0) & 0x3F) | 0x80);
+		return 4;
 	}
 }
 
-// TODO: If on a system which provides memmem (macOS, FreeBSD, Linux), use it instead
-void *
-_hs_memmem_standard(const void *h0, size_t hlen, const void *n0, size_t nlen)
+size_t _hs_offset_of_codepoint(const uint8_t *haystack0, const size_t hoffset, const size_t hlen0, const size_t needle)
 {
-	const uint8_t *h = h0, *n = n0;
-
-	/* Return immediately on empty needle */
-	if (!nlen)
-		return (void *)h;
-
-	/* Return immediately when needle is longer than haystack */
-	if (hlen < nlen)
-		return 0;
-
-	/* Use faster algorithms for short needles */
-	h = memchr(h0, *n, hlen);
-	if (!h || nlen == 1)
-		return (void *)h;
-	hlen -= h - (const uint8_t *)h0;
-	if (hlen < nlen)
-		return 0;
-	if (nlen == 2)
-		return twobyte_memmem(h, hlen, n);
-	if (nlen == 3)
-		return threebyte_memmem(h, hlen, n);
-	if (nlen == 4)
-		return fourbyte_memmem(h, hlen, n);
-
-	return twoway_memmem(h, h + hlen, n, nlen);
-}
+	const uint8_t *haystack = haystack0 + hoffset;
+	uint8_t *res = NULL;
+	uint8_t asUtf8[4] = {0};
+	const int codepointLen = _hs_codepoint_to_utf8(asUtf8, needle);
 
-size_t
-_hs_text_memmem(const void *h0, size_t hoff, size_t hlen, const void *n0, size_t noff, size_t nlen)
-{
-	void *res = _hs_memmem_standard(h0 + hoff, hlen, n0 + noff, nlen);
-	return res == NULL ? -1 : (size_t)(res - h0);
-}
+	// Skip to first location that could contain the character.
+	const uint8_t *haystackFirst = memchr(haystack, asUtf8[0], hlen0);
 
-const size_t _hs_offset_of_codepoint(const uint8_t *haystack0, const size_t hoffset, const size_t hlen, size_t needle)
-{
-	const uint8_t *haystack = haystack0 + hoffset;
-	uint8_t asUtf8[4];
-	size_t codepointLen;
-	if (needle < 0x80)
-	{
-		codepointLen = 1;
-		asUtf8[0] = needle;
-	}
-	else if (needle < 0x0800)
+	if (haystackFirst)
 	{
-		codepointLen = 2;
-		asUtf8[0] = (uint8_t)(((needle >> 6) & 0x1F) | 0xC0);
-		asUtf8[1] = (uint8_t)(((needle >> 0) & 0x3F) | 0x80);
-	}
-	else if (needle < 0x10000)
-	{
-		codepointLen = 3;
-		asUtf8[0] = (uint8_t)(((needle >> 12) & 0x0F) | 0xE0);
-		asUtf8[1] = (uint8_t)(((needle >> 6) & 0x3F) | 0x80);
-		asUtf8[2] = (uint8_t)(((needle >> 0) & 0x3F) | 0x80);
-	}
-	else
-	{
-		codepointLen = 4;
-		asUtf8[0] = (uint8_t)(((needle >> 18) & 0x07) | 0xF0);
-		asUtf8[1] = (uint8_t)(((needle >> 12) & 0x3F) | 0x80);
-		asUtf8[2] = (uint8_t)(((needle >> 6) & 0x3F) | 0x80);
-		asUtf8[3] = (uint8_t)(((needle >> 0) & 0x3F) | 0x80);
+		const size_t hlen = hlen0 - (haystackFirst - haystack);
+
+		switch (codepointLen)
+		{
+		case 1:
+			res = haystackFirst;
+			break;
+		case 2:
+			res = twobyte_memmem(haystackFirst, hlen, asUtf8);
+			break;
+		case 3:
+			res = threebyte_memmem(haystackFirst, hlen, asUtf8);
+			break;
+		case 4:
+			res = fourbyte_memmem(haystackFirst, hlen, asUtf8);
+			break;
+		default:
+			res = NULL;
+			break;
+		}
 	}
-	const void *res = _hs_memmem_standard(haystack, hlen, asUtf8, codepointLen);
+
 	return res == NULL ? -1 : (size_t)((uint8_t *)res - haystack);
-}
\ No newline at end of file
+}
diff --git a/src/Data/Text.hs b/src/Data/Text.hs
index 5ec34bdc..c5527019 100644
--- a/src/Data/Text.hs
+++ b/src/Data/Text.hs
@@ -157,7 +157,6 @@ module Data.Text
     -- ** Breaking into many substrings
     -- $split
     , splitOn
-    , splitOn'
     , split
     , chunksOf
 
@@ -1616,23 +1615,6 @@ splitOn pat@(Text _ _ l) src@(Text arr off len)
   #-}
 
 
--- | TODO: Remove - a test to ensure the behaviour using memmem is equivalent
--- to the version using @indices@
-splitOn' :: Text -> Text -> [Text]
-splitOn' needle@(Text _ _ nlen) (Text harr hoff0 hlen0) = loop hoff0 hlen0 where
-    loop !hoff !hlen | hlen < nlen = [text harr hoff hlen]
-    loop _    hlen | hlen <= 0 = []
-    loop hoff hlen = case memmem needle (text harr hoff hlen) of
-      (-1) -> []
-      n    -> text harr hoff (n-hoff) : loop (n + nlen) (hlen - (n-hoff) - nlen)
-
-memmem :: Text -> Text -> Int
-memmem (Text (A.ByteArray narr) noff nlen) (Text (A.ByteArray harr) hoff hlen) = unsafeDupablePerformIO $
-  cSsizeToInt
-    P.<$> text_memmem harr (intToCSize hoff) (intToCSize hlen)
-                      narr (intToCSize noff) (intToCSize nlen)
-{-# INLINE memmem #-}
-
 foreign import ccall unsafe "_hs_text_memmem" text_memmem
     :: ByteArray# -> CSize -> CSize -> ByteArray# -> CSize -> CSize -> IO CSsize
 

From 842f9510abd407c27476ccf1c2394ad7769c414c Mon Sep 17 00:00:00 2001
From: Alex Mason <Axman6@gmail.com>
Date: Sat, 2 Jul 2022 13:01:11 +1000
Subject: [PATCH 13/16] Make types consistent (thanks to godbolt.org)

---
 cbits/codepoint_offset.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/cbits/codepoint_offset.c b/cbits/codepoint_offset.c
index ef1385fd..bcfed52c 100644
--- a/cbits/codepoint_offset.c
+++ b/cbits/codepoint_offset.c
@@ -37,36 +37,36 @@
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 
-static char *
+static uint8_t *
 twobyte_memmem(const uint8_t *h, size_t hlen, const uint8_t *n)
 {
 	uint16_t nw = n[0] << 8 | n[1], hw = h[0] << 8 | h[1];
 	for (h += 2, hlen -= 2; hlen; hlen--, hw = hw << 8 | *h++)
 		if (hw == nw)
-			return (char *)h - 2;
-	return hw == nw ? (char *)h - 2 : 0;
+			return (uint8_t *)h - 2;
+	return hw == nw ? (uint8_t *)h - 2 : 0;
 }
 
-static char *
+static uint8_t *
 threebyte_memmem(const uint8_t *h, size_t hlen, const uint8_t *n)
 {
 	uint32_t nw = (uint32_t)n[0] << 24 | n[1] << 16 | n[2] << 8;
 	uint32_t hw = (uint32_t)h[0] << 24 | h[1] << 16 | h[2] << 8;
 	for (h += 3, hlen -= 3; hlen; hlen--, hw = (hw | *h++) << 8)
 		if (hw == nw)
-			return (char *)h - 3;
-	return hw == nw ? (char *)h - 3 : 0;
+			return (uint8_t *)h - 3;
+	return hw == nw ? (uint8_t *)h - 3 : 0;
 }
 
-static char *
+static uint8_t *
 fourbyte_memmem(const uint8_t *h, size_t hlen, const uint8_t *n)
 {
 	uint32_t nw = (uint32_t)n[0] << 24 | n[1] << 16 | n[2] << 8 | n[3];
 	uint32_t hw = (uint32_t)h[0] << 24 | h[1] << 16 | h[2] << 8 | h[3];
 	for (h += 4, hlen -= 4; hlen; hlen--, hw = hw << 8 | *h++)
 		if (hw == nw)
-			return (char *)h - 4;
-	return hw == nw ? (char *)h - 4 : 0;
+			return (uint8_t *)h - 4;
+	return hw == nw ? (uint8_t *)h - 4 : 0;
 }
 
 static int _hs_codepoint_to_utf8(uint8_t asUtf8[4], uint32_t codepoint)
@@ -108,7 +108,7 @@ size_t _hs_offset_of_codepoint(const uint8_t *haystack0, const size_t hoffset, c
 	const int codepointLen = _hs_codepoint_to_utf8(asUtf8, needle);
 
 	// Skip to first location that could contain the character.
-	const uint8_t *haystackFirst = memchr(haystack, asUtf8[0], hlen0);
+	uint8_t *haystackFirst = (uint8_t *)memchr(haystack, asUtf8[0], hlen0);
 
 	if (haystackFirst)
 	{

From 94eb8e870c7b27ca4fa9d079954d1383140ba40d Mon Sep 17 00:00:00 2001
From: Alex Mason <Axman6@gmail.com>
Date: Sat, 2 Jul 2022 13:04:34 +1000
Subject: [PATCH 14/16] Export splitOnChar

---
 src/Data/Text.hs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/Data/Text.hs b/src/Data/Text.hs
index c5527019..7d89d709 100644
--- a/src/Data/Text.hs
+++ b/src/Data/Text.hs
@@ -158,6 +158,7 @@ module Data.Text
     -- $split
     , splitOn
     , split
+    , splitOnChar
     , chunksOf
 
     -- ** Breaking into lines and words

From 880bdafd964879617dd433a2f9cf87ca9d2cbf39 Mon Sep 17 00:00:00 2001
From: Alex Mason <Axman6@gmail.com>
Date: Sat, 2 Jul 2022 13:10:23 +1000
Subject: [PATCH 15/16] Clean up

---
 src/Data/Text.hs | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/Data/Text.hs b/src/Data/Text.hs
index 7d89d709..eb7e8786 100644
--- a/src/Data/Text.hs
+++ b/src/Data/Text.hs
@@ -1304,7 +1304,7 @@ measureOff !n (Text (A.ByteArray arr) off len) = if len == 0 then 0 else
 foreign import ccall unsafe "_hs_text_measure_off" c_measure_off
     :: ByteArray# -> CSize -> CSize -> CSize -> IO CSsize
 
--- | O(n) Finds the byte offset of the first occurrence of @c@ in the @Text@, or
+-- | /O(n)/ Finds the byte offset of the first occurrence of @c@ in the @Text@, or
 -- '-1' if if can't be found.
 codepointOffset :: Text -> Char -> Int
 codepointOffset !(Text (A.ByteArray arr) off len) c = if len == 0 then -1 else
@@ -1545,8 +1545,6 @@ findAIndexOrEnd q t@(Text _arr _off len) = go 0
                 | otherwise             = go (i+d)
                 where Iter c d          = iter t i
 
-
-
 -- | /O(n)/ Group characters in a string by equality.
 group :: Text -> [Text]
 group = groupBy (==)

From 5f07e3a5450aa3651c3f52b05d09d37105ddf130 Mon Sep 17 00:00:00 2001
From: Alex Mason <Axman6@gmail.com>
Date: Sat, 2 Jul 2022 17:40:48 +1000
Subject: [PATCH 16/16] Remove memmem import

---
 src/Data/Text.hs | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/Data/Text.hs b/src/Data/Text.hs
index eb7e8786..16364aff 100644
--- a/src/Data/Text.hs
+++ b/src/Data/Text.hs
@@ -1614,10 +1614,6 @@ splitOn pat@(Text _ _ l) src@(Text arr off len)
   #-}
 
 
-foreign import ccall unsafe "_hs_text_memmem" text_memmem
-    :: ByteArray# -> CSize -> CSize -> ByteArray# -> CSize -> CSize -> IO CSsize
-
-
 -- | /O(n)/ Splits a 'Text' into components delimited by separators,
 -- where the predicate returns True for a separator element.  The
 -- resulting components do not contain the separators.  Two adjacent