Fixes for review:

Add benchmark result in README. Fix stack config. Move ambiguous functions to compatibility module: Unicode.Char.General.Compat. Fix benchmark for old GHC. Improve doc about isSpace and isWhiteSpace. Improve benchmark by using bcompare. Add .editorconfig to .packcheck.ignore. Fixes for review. Add .editorconfig. Fixes for review. appveyor: lts-18.17 → lts-18.18. Revert "Add support for big-endian architectures". This reverts commit 137f201. Update Changelog.md. Simplify tests. Add support for big-endian architectures.
composewell · Dec 20, 2021 · 50a9ed5 · 50a9ed5
1 parent 2ace84e
commit 50a9ed5
Show file tree

Hide file tree

Showing 16 changed files with 521 additions and 356 deletions.
diff --git a/.editorconfig b/.editorconfig
@@ -0,0 +1,12 @@
+# EditorConfig is awesome: https://EditorConfig.org
+
+# top-most EditorConfig file
+root = true
+
+[*]
+indent_style = space
+indent_size = 4
+end_of_line = lf
+charset = utf-8
+trim_trailing_whitespace = true
+insert_final_newline = false
diff --git a/.packcheck.ignore b/.packcheck.ignore
@@ -1,4 +1,5 @@
 .packcheck.ignore
+.editorconfig
 .github/workflows/haskell.yml
 appveyor.yml
 stack.yaml

diff --git a/Changelog.md b/Changelog.md
@@ -1,5 +1,22 @@
 # Changelog
 
+## Next
+
+- Support for big-endian architectures.
+- Added `GeneralCategory` data type and corresponding `generalCategoryAbbr`,
+  `generalCategory` functions.
+- Added the following functions to `Unicode.Char.General`:
+  `isAlphabetic`, `isAlphaNum`,
+  `isControl`, `isMark`, `isPrint`, `isPunctuation`, `isSeparator`,
+  `isSymbol` and `isWhiteSpace`.
+- Added the module `Unicode.Char.Numeric`.
+- **Breaking change:** Changed the behavior of `isLetter` and `isSpace` to match
+  `base`’s `Data.Char` behavior. Moved these functions to the compatibility module
+  `Unicode.Char.General.Compat`. The previous behavior is obtained using
+  `isAlphabetic` and `isWhiteSpace` respectively.
+- Re-exported some functions from `Data.Char` in order to make `Unicode.Char`
+  a drop-in replacement.
+
 ## 0.2.0 (November 2021)
 
 * Update to [Unicode 14.0.0](https://www.unicode.org/versions/Unicode14.0.0/).

diff --git a/README.md b/README.md
@@ -15,6 +15,92 @@ any other packages or use cases.
 
 Please see the haddock documentation for reference documentation.
 
+## Performance
+
+`unicode-data` is up to _5 times faster_ than `base`.
+
+The following benchmark compares the time taken in milliseconds to process all
+the Unicode code points for `base-4.16` and this package (v0.3).
+Machine: 8 × AMD Ryzen 5 2500U on Linux.
+
+```
+All
+  Unicode.Char.Case
+    isLower
+      base:           OK (6.59s)
+         26 ms ± 238 μs
+      unicode-data:   OK (1.16s)
+        4.5 ms ±  83 μs, 0.17x
+    isUpper
+      base:           OK (1.69s)
+         27 ms ± 459 μs
+      unicode-data:   OK (1.21s)
+        4.8 ms ±  77 μs, 0.18x
+  Unicode.Char.General
+    generalCategory
+      base:           OK (0.92s)
+        131 ms ± 1.5 ms
+      unicode-data:   OK (1.62s)
+        108 ms ± 1.2 ms, 0.82x
+    isAlphaNum
+      base:           OK (3.28s)
+         26 ms ± 300 μs
+      unicode-data:   OK (20.60s)
+        5.0 ms ±  59 μs, 0.19x
+    isControl
+      base:           OK (1.61s)
+         26 ms ± 463 μs
+      unicode-data:   OK (1.22s)
+        4.8 ms ±  53 μs, 0.19x
+    isMark
+      base:           OK (0.80s)
+         26 ms ± 339 μs
+      unicode-data:   OK (1.33s)
+        5.2 ms ±  77 μs, 0.20x
+    isPrint
+      base:           OK (3.32s)
+         26 ms ± 498 μs
+      unicode-data:   OK (1.33s)
+        5.2 ms ±  55 μs, 0.20x
+    isPunctuation
+      base:           OK (3.41s)
+         27 ms ± 497 μs
+      unicode-data:   OK (2.67s)
+        5.3 ms ±  28 μs, 0.20x
+    isSeparator
+      base:           OK (0.84s)
+         27 ms ± 422 μs
+      unicode-data:   OK (1.41s)
+        5.5 ms ±  52 μs, 0.21x
+    isSymbol
+      base:           OK (1.72s)
+         27 ms ± 443 μs
+      unicode-data:   OK (1.45s)
+        5.7 ms ± 112 μs, 0.21x
+  Unicode.Char.General.Compat
+    isAlpha
+      base:           OK (3.26s)
+         26 ms ± 254 μs
+      unicode-data:   OK (2.66s)
+        5.2 ms ±  48 μs, 0.20x
+    isLetter
+      base:           OK (1.70s)
+         27 ms ± 453 μs
+      unicode-data:   OK (1.33s)
+        5.2 ms ±  69 μs, 0.19x
+    isSpace
+      base:           OK (0.85s)
+         13 ms ± 237 μs
+      unicode-data:   OK (1.69s)
+        6.7 ms ±  61 μs, 0.49x
+  Unicode.Char.Numeric
+    isNumber
+      base:           OK (1.67s)
+         26 ms ± 316 μs
+      unicode-data:   OK (1.32s)
+        5.2 ms ±  91 μs, 0.20x
+```
+
 ## Unicode database version update
 
 To update the Unicode version please update the version number in

diff --git a/appveyor.yml b/appveyor.yml
@@ -33,7 +33,7 @@ environment:
     # version.
     #STACKVER: "1.6.5"
     STACK_UPGRADE: "y"
-    RESOLVER: "lts-18.17"
+    RESOLVER: "lts-18.18"
     STACK_ROOT: "c:\\sr"
 
     # ------------------------------------------------------------------------

diff --git a/bench/Main.hs b/bench/Main.hs
@@ -1,154 +1,180 @@
-import Data.Ix (Ix(..))
 import Control.DeepSeq (NFData, deepseq)
-import Test.Tasty.Bench
+import Data.Ix (Ix(..))
+import Test.Tasty.Bench (Benchmark, bgroup, bench, bcompare, nf, defaultMain)
 
 import qualified Data.Char as B
 import qualified Unicode.Char.Case as C
 import qualified Unicode.Char.General as G
+import qualified Unicode.Char.General.Compat as GC
 import qualified Unicode.Char.Identifiers as I
 import qualified Unicode.Char.Normalization as N
+import qualified Unicode.Char.Numeric as Num
+
+-- | A single named benchmark: a label paired with the character function to
+-- measure. Lists of these are turned into benchmark groups by the @bgroup'@
+-- helper in 'main'.
+data Bench a = Bench
+  { _title :: !String  -- ^ Benchmark name; @"base"@ marks the baseline entry
+  , _func :: Char -> a -- ^ Function to benchmark
+  }
 
 main :: IO ()
 main = defaultMain
   [ bgroup "Unicode.Char.Case"
-    [ bgroup "isLower"
-      [ bench "base"         $ nf (fold_ B.isLower) (minBound, maxBound)
-      , bench "unicode-data" $ nf (fold_ C.isLower) (minBound, maxBound)
+    [ bgroup' "isLower"
+      [ Bench "base"         B.isLower
+      , Bench "unicode-data" C.isLower
       ]
-    , bgroup "isUpper"
-      [ bench "base"         $ nf (fold_ B.isUpper) (minBound, maxBound)
-      , bench "unicode-data" $ nf (fold_ C.isUpper) (minBound, maxBound)
+    , bgroup' "isUpper"
+      [ Bench "base"         B.isUpper
+      , Bench "unicode-data" C.isUpper
       ]
     ]
   , bgroup "Unicode.Char.General"
     -- Character classification
-    [ bgroup "generalCategory"
-      [ bench "base"         $ nf (fold_ (show . B.generalCategory)) (minBound, maxBound)
-      , bench "unicode-data" $ nf (fold_ (show . G.generalCategory)) (minBound, maxBound)
-      ]
-    , bgroup "isAlpa"
-      [ bench "base"         $ nf (fold_ B.isAlpha) (minBound, maxBound)
-      , bench "unicode-data" $ nf (fold_ G.isAlpha) (minBound, maxBound)
-      ]
-    , bgroup "isAlpabetic"
-      [ bench "unicode-data" $ nf (fold_ G.isAlphabetic) (minBound, maxBound)
+    [ bgroup' "generalCategory"
+      [ Bench "base"          (show . B.generalCategory)
+      , Bench "unicode-data"  (show . G.generalCategory)
       ]
-    , bgroup "isAlpaNum"
-      [ bench "base"         $ nf (fold_ B.isAlphaNum) (minBound, maxBound)
-      , bench "unicode-data" $ nf (fold_ G.isAlphaNum) (minBound, maxBound)
+    , bgroup "isAlphabetic"
+      [ benchNF "unicode-data"  G.isAlphabetic
       ]
-    , bgroup "isControl"
-      [ bench "base"         $ nf (fold_ B.isControl) (minBound, maxBound)
-      , bench "unicode-data" $ nf (fold_ G.isControl) (minBound, maxBound)
+    , bgroup' "isAlphaNum"
+      [ Bench "base"          B.isAlphaNum
+      , Bench "unicode-data"  G.isAlphaNum
       ]
-    , bgroup "isLetter"
-      [ bench "base"         $ nf (fold_ G.isLetter) (minBound, maxBound)
-      , bench "unicode-data" $ nf (fold_ G.isLetter) (minBound, maxBound)
+    , bgroup' "isControl"
+      [ Bench "base"          B.isControl
+      , Bench "unicode-data"  G.isControl
       ]
-    , bgroup "isMark"
-      [ bench "base"         $ nf (fold_ B.isMark) (minBound, maxBound)
-      , bench "unicode-data" $ nf (fold_ G.isMark) (minBound, maxBound)
+    , bgroup' "isMark"
+      [ Bench "base"          B.isMark
+      , Bench "unicode-data"  G.isMark
       ]
-    , bgroup "isNumber"
-      [ bench "base"         $ nf (fold_ B.isNumber) (minBound, maxBound)
-      , bench "unicode-data" $ nf (fold_ G.isNumber) (minBound, maxBound)
+    , bgroup' "isPrint"
+      [ Bench "base"          B.isPrint
+      , Bench "unicode-data"  G.isPrint
       ]
-    , bgroup "isPrint"
-      [ bench "base"         $ nf (fold_ B.isPrint) (minBound, maxBound)
-      , bench "unicode-data" $ nf (fold_ G.isPrint) (minBound, maxBound)
+    , bgroup' "isPunctuation"
+      [ Bench "base"          B.isPunctuation
+      , Bench "unicode-data"  G.isPunctuation
       ]
-    , bgroup "isPunctuation"
-      [ bench "base"         $ nf (fold_ B.isPunctuation) (minBound, maxBound)
-      , bench "unicode-data" $ nf (fold_ G.isPunctuation) (minBound, maxBound)
+    , bgroup' "isSeparator"
+      [ Bench "base"          B.isSeparator
+      , Bench "unicode-data"  G.isSeparator
       ]
-    , bgroup "isSeparator"
-      [ bench "base"         $ nf (fold_ B.isSeparator) (minBound, maxBound)
-      , bench "unicode-data" $ nf (fold_ G.isSeparator) (minBound, maxBound)
-      ]
-    , bgroup "isSpace"
-      [ bench "base"         $ nf (fold_ G.isSpace) (minBound, maxBound)
-      , bench "unicode-data" $ nf (fold_ G.isSpace) (minBound, maxBound)
-      ]
-    , bgroup "isSymbol"
-      [ bench "base"         $ nf (fold_ B.isSymbol) (minBound, maxBound)
-      , bench "unicode-data" $ nf (fold_ G.isSymbol) (minBound, maxBound)
+    , bgroup' "isSymbol"
+      [ Bench "base"          B.isSymbol
+      , Bench "unicode-data"  G.isSymbol
       ]
     , bgroup "isWhiteSpace"
-      [ bench "unicode-data" $ nf (fold_ G.isWhiteSpace) (minBound, maxBound)
+      [ benchNF "unicode-data"  G.isWhiteSpace
       ]
     -- Korean Hangul Characters
     , bgroup "isHangul"
-      [ bench "unicode-data" $ nf (fold_ G.isHangul) (minBound, maxBound)
+      [ benchNF "unicode-data"  G.isHangul
       ]
     , bgroup "isHangulLV"
-      [ bench "unicode-data" $ nf (fold_ G.isHangul) (minBound, maxBound)
+      -- Benchmark the LV-specific predicate, matching the group name.
+      [ benchNF "unicode-data"  G.isHangulLV
       ]
     , bgroup "isJamo"
-      [ bench "unicode-data" $ nf (fold_ G.isJamo) (minBound, maxBound)
+      [ benchNF "unicode-data"  G.isJamo
       ]
     , bgroup "jamoLIndex"
-      [ bench "unicode-data" $ nf (fold_ G.jamoLIndex) (minBound, maxBound)
+      [ benchNF "unicode-data"  G.jamoLIndex
       ]
     , bgroup "jamoVIndex"
-      [ bench "unicode-data" $ nf (fold_ G.jamoVIndex) (minBound, maxBound)
+      [ benchNF "unicode-data"  G.jamoVIndex
       ]
     , bgroup "jamoTIndex"
-      [ bench "unicode-data" $ nf (fold_ G.jamoTIndex) (minBound, maxBound)
+      [ benchNF "unicode-data"  G.jamoTIndex
+      ]
+    ]
+  , bgroup "Unicode.Char.General.Compat"
+    [ bgroup' "isAlpha"
+      [ Bench "base"          B.isAlpha
+      , Bench "unicode-data"  GC.isAlpha
+      ]
+    , bgroup' "isLetter"
+      [ Bench "base"          B.isLetter
+      , Bench "unicode-data"  GC.isLetter
+      ]
+    , bgroup' "isSpace"
+      [ Bench "base"          B.isSpace
+      , Bench "unicode-data"  GC.isSpace
       ]
     ]
   , bgroup "Unicode.Char.Identifiers"
     [ bgroup "isIDContinue"
-      [ bench "unicode-data" $ nf (fold_ I.isIDContinue) (minBound, maxBound)
+      [ benchNF "unicode-data"  I.isIDContinue
       ]
     , bgroup "isIDStart"
-      [ bench "unicode-data" $ nf (fold_ I.isIDStart) (minBound, maxBound)
+      [ benchNF "unicode-data"  I.isIDStart
       ]
     , bgroup "isXIDContinue"
-      [ bench "unicode-data" $ nf (fold_ I.isXIDContinue) (minBound, maxBound)
+      [ benchNF "unicode-data"  I.isXIDContinue
       ]
     , bgroup "isXIDStart"
-      [ bench "unicode-data" $ nf (fold_ I.isXIDStart) (minBound, maxBound)
+      [ benchNF "unicode-data"  I.isXIDStart
       ]
     , bgroup "isPatternSyntax"
-      [ bench "unicode-data" $ nf (fold_ I.isPatternSyntax) (minBound, maxBound)
+      [ benchNF "unicode-data"  I.isPatternSyntax
       ]
     , bgroup "isPatternWhitespace"
-      [ bench "unicode-data" $ nf (fold_ I.isPatternWhitespace) (minBound, maxBound)
+      [ benchNF "unicode-data"  I.isPatternWhitespace
       ]
     ]
   , bgroup "Unicode.Char.Normalization"
     [ bgroup "isCombining"
-      [ bench "unicode-data" $ nf (fold_ N.isCombining) (minBound, maxBound)
+      [ benchNF "unicode-data"  N.isCombining
       ]
     , bgroup "combiningClass"
-      [ bench "unicode-data" $ nf (fold_ N.combiningClass) (minBound, maxBound)
+      [ benchNF "unicode-data"  N.combiningClass
       ]
     , bgroup "isCombiningStarter"
-      [ bench "unicode-data" $ nf (fold_ N.isCombiningStarter) (minBound, maxBound)
+      [ benchNF "unicode-data"  N.isCombiningStarter
       ]
     -- [TODO] compose, composeStarters
     , bgroup "isDecomposable"
       [ bgroup "Canonical"
-        [ bench "unicode-data" $ nf (fold_ (N.isDecomposable N.Canonical)) (minBound, maxBound)
+        [ benchNF "unicode-data" (N.isDecomposable N.Canonical)
         ]
       , bgroup "Kompat"
-        [ bench "unicode-data" $ nf (fold_ (N.isDecomposable N.Kompat)) (minBound, maxBound)
+        [ benchNF "unicode-data" (N.isDecomposable N.Kompat)
         ]
       ]
     -- [FIXME] Fail due to non-exhaustive pattern matching
     -- , bgroup "decompose"
     --   [ bgroup "Canonical"
-    --     [ bench "unicode-data" $ nf (fold_ (N.decompose N.Canonical)) (minBound, maxBound)
+    --     [ benchNF "unicode-data" (N.decompose N.Canonical)
     --     ]
     --   , bgroup "Kompat"
-    --     [ bench "unicode-data" $ nf (fold_ (N.decompose N.Kompat)) (minBound, maxBound)
+    --     [ benchNF "unicode-data" (N.decompose N.Kompat)
     --     ]
     --   ]
     , bgroup "decomposeHangul"
-      [ bench "unicode-data" $ nf (fold_ N.decomposeHangul) (minBound, maxBound)
+      [ benchNF "unicode-data" N.decomposeHangul
+      ]
+    ]
+  , bgroup "Unicode.Char.Numeric"
+    [ bgroup' "isNumber"
+      [ Bench "base"          B.isNumber
+      , Bench "unicode-data"  Num.isNumber
       ]
     ]
   ]
   where
+    -- Build a named group from 'Bench' entries, delegating each entry to
+    -- benchNF' together with the title of the group it belongs to.
+    bgroup' groupTitle = bgroup groupTitle . map toBenchmark
+      where
+        toBenchmark (Bench name fn) = benchNF' groupTitle name fn
+
+    -- Benchmark one entry of a group. The "base" entry is the baseline; any
+    -- other entry is reported relative to it via 'bcompare'.
+    -- [NOTE] The pattern selects the benchmark named "base" whose parent group
+    -- is groupTitle, so groupTitle must uniquely identify the benchmark group.
+    benchNF' groupTitle title func
+      | title == "base" = plain
+      | otherwise       = bcompare baseline plain
+      where
+        plain    = benchNF title func
+        baseline = "$NF == \"base\" && $(NF-1) == \"" ++ groupTitle ++ "\""
+
+    -- Benchmark a function over every 'Char' from 'minBound' to 'maxBound',
+    -- evaluating the fold of its results to normal form.
+    benchNF :: forall a. (NFData a) => String -> (Char -> a) -> Benchmark
+    benchNF name func = bench name (nf (fold_ func) (minBound, maxBound))
+
+    -- Apply the function to every 'Char' in the given inclusive range and
+    -- deep-force each result with 'deepseq', discarding the values. The unit
+    -- result exists only so that the whole traversal can be forced.
+    fold_ :: forall a. (NFData a) => (Char -> a) -> (Char, Char) -> ()
+    fold_ f = foldr (deepseq . f) () . range