lojjic · talltyler · Jun 4, 2024 · Jun 4, 2024 · Jun 26, 2024 · 1ec5
diff --git a/src/embeddingLevels.js b/src/embeddingLevels.js
@@ -8,6 +8,7 @@ import {
   TYPES
 } from './charTypes.js'
 import { closingToOpeningBracket, getCanonicalBracket, openingToClosingBracket } from './brackets.js'
+import { stringToArray } from './util/stringToArray.js'
 
 // Local type aliases
 const {
@@ -42,22 +43,36 @@ const {
  */
 
 /**
- * This function applies the Bidirectional Algorithm to a string, returning the resolved embedding levels
+ * This function applies the Bidirectional Algorithm to a string of characters, returning the resolved embedding levels
  * in a single Uint8Array plus a list of objects holding each paragraph's start and end indices and resolved
  * base embedding level.
  *
- * @param {string} string - The input string
+ * @param {string} string - the input string; it is split into characters with stringToArray before processing
  * @param {"ltr"|"rtl"|"auto"} [baseDirection] - Use "ltr" or "rtl" to force a base paragraph direction,
  *        otherwise a direction will be chosen automatically from each paragraph's contents.
  * @return {GetEmbeddingLevelsResult}
  */
 export function getEmbeddingLevels (string, baseDirection) {
+  return getEmbeddingLevelsForCharacters(stringToArray(string), baseDirection);
+}
+
+/**
+ * This function applies the Bidirectional Algorithm to an array of characters, returning the resolved embedding levels
+ * in a single Uint8Array plus a list of objects holding each paragraph's start and end indices and resolved
+ * base embedding level.
+ *
+ * @param {string[]} characters - an array of character strings
+ * @param {"ltr"|"rtl"|"auto"} [baseDirection] - Use "ltr" or "rtl" to force a base paragraph direction,
+ *        otherwise a direction will be chosen automatically from each paragraph's contents.
+ * @return {GetEmbeddingLevelsResult}
+ */
+export function getEmbeddingLevelsForCharacters (characters, baseDirection) {
   const MAX_DEPTH = 125
 
   // Start by mapping all characters to their unicode type, as a bitmask integer
-  const charTypes = new Uint32Array(string.length)
-  for (let i = 0; i < string.length; i++) {
-    charTypes[i] = getBidiCharType(string[i])
+  const charTypes = new Uint32Array(characters.length)
+  for (let i = 0; i < characters.length; i++) {
+    charTypes[i] = getBidiCharType(characters[i])
   }
 
   const charTypeCounts = new Map() //will be cleared at start of each paragraph
@@ -74,18 +89,18 @@ export function getEmbeddingLevels (string, baseDirection) {
     }
   }
 
-  const embedLevels = new Uint8Array(string.length)
+  const embedLevels = new Uint8Array(characters.length)
   const isolationPairs = new Map() //init->pdi and pdi->init
 
   // === 3.3.1 The Paragraph Level ===
   // 3.3.1 P1: Split the text into paragraphs
   const paragraphs = [] // [{start, end, level}, ...]
   let paragraph = null
-  for (let i = 0; i < string.length; i++) {
+  for (let i = 0; i < characters.length; i++) {
     if (!paragraph) {
       paragraphs.push(paragraph = {
         start: i,
-        end: string.length - 1,
+        end: characters.length - 1,
         // 3.3.1 P2-P3: Determine the paragraph level
         level: baseDirection === 'rtl' ? 1 : baseDirection === 'ltr' ? 0 : determineAutoEmbedLevel(i, false)
       })
@@ -477,7 +492,7 @@ export function getEmbeddingLevels (string, baseDirection) {
             // type, as that may have been changed earlier. This doesn't seem to be explicitly
             // called out in the spec, but is required for passage of certain tests.
             if (charTypes[seqIndices[si]] & NEUTRAL_ISOLATE_TYPES) {
-              const char = string[seqIndices[si]]
+              const char = characters[seqIndices[si]]
               let oppositeBracket
               // Opening bracket
               if (openingToClosingBracket(char) !== null) {
@@ -553,7 +568,7 @@ export function getEmbeddingLevels (string, baseDirection) {
             if (useStrongType !== embedDirection) {
               for (let si = openSeqIdx + 1; si < seqIndices.length; si++) {
                 if (!(charTypes[seqIndices[si]] & BN_LIKE_TYPES)) {
-                  if (getBidiCharType(string[seqIndices[si]]) & TYPE_NSM) {
+                  if (getBidiCharType(characters[seqIndices[si]]) & TYPE_NSM) {
                     charTypes[seqIndices[si]] = useStrongType
                   }
                   break
@@ -563,7 +578,7 @@ export function getEmbeddingLevels (string, baseDirection) {
             if (useStrongType !== embedDirection) {
               for (let si = closeSeqIdx + 1; si < seqIndices.length; si++) {
                 if (!(charTypes[seqIndices[si]] & BN_LIKE_TYPES)) {
-                  if (getBidiCharType(string[seqIndices[si]]) & TYPE_NSM) {
+                  if (getBidiCharType(characters[seqIndices[si]]) & TYPE_NSM) {
                     charTypes[seqIndices[si]] = useStrongType
                   }
                   break
@@ -636,8 +651,8 @@ export function getEmbeddingLevels (string, baseDirection) {
       // 3.4 L1.1-4: Reset the embedding level of segment/paragraph separators, and any sequence of whitespace or
       // isolate formatting characters preceding them or the end of the paragraph, to the paragraph level.
       // NOTE: this will also need to be applied to each individual line ending after line wrapping occurs.
-      if (i === paragraph.end || getBidiCharType(string[i]) & (TYPE_S | TYPE_B)) {
-        for (let j = i; j >= 0 && (getBidiCharType(string[j]) & TRAILING_TYPES); j--) {
+      if (i === paragraph.end || getBidiCharType(characters[i]) & (TYPE_S | TYPE_B)) {
+        for (let j = i; j >= 0 && (getBidiCharType(characters[j]) & TRAILING_TYPES); j--) {
           embedLevels[j] = paragraph.level
         }
       }
@@ -653,7 +668,7 @@ export function getEmbeddingLevels (string, baseDirection) {
 
   function determineAutoEmbedLevel (start, isFSI) {
     // 3.3.1 P2 - P3
-    for (let i = start; i < string.length; i++) {
+    for (let i = start; i < characters.length; i++) {
       const charType = charTypes[i]
       if (charType & (TYPE_R | TYPE_AL)) {
         return 1
@@ -663,7 +678,7 @@ export function getEmbeddingLevels (string, baseDirection) {
       }
       if (charType & ISOLATE_INIT_TYPES) {
         const pdi = indexOfMatchingPDI(i)
-        i = pdi === -1 ? string.length : pdi
+        i = pdi === -1 ? characters.length : pdi
       }
     }
     return 0
@@ -672,7 +687,7 @@ export function getEmbeddingLevels (string, baseDirection) {
   function indexOfMatchingPDI (isolateStart) {
     // 3.1.2 BD9
     let isolationLevel = 1
-    for (let i = isolateStart + 1; i < string.length; i++) {
+    for (let i = isolateStart + 1; i < characters.length; i++) {
       const charType = charTypes[i]
       if (charType & TYPE_B) {
         break

diff --git a/src/index.js b/src/index.js
@@ -1,5 +1,6 @@
-export { getEmbeddingLevels } from './embeddingLevels.js'
-export { getReorderSegments, getReorderedIndices, getReorderedString } from './reordering.js'
+export { getEmbeddingLevels, getEmbeddingLevelsForCharacters } from './embeddingLevels.js'
+export { getReorderSegments, getReorderedCharacters, getReorderedIndices, getReorderedString } from './reordering.js'
 export { getBidiCharType, getBidiCharTypeName } from './charTypes.js'
 export { getMirroredCharacter, getMirroredCharactersMap } from './mirroring.js'
 export { closingToOpeningBracket, openingToClosingBracket, getCanonicalBracket } from './brackets.js'
+export { stringToArray } from './util/stringToArray.js'
diff --git a/src/mirroring.js b/src/mirroring.js
@@ -22,23 +22,23 @@ export function getMirroredCharacter (char) {
 }
 
 /**
- * Given a string and its resolved embedding levels, build a map of indices to replacement chars
+ * Given a character array and its resolved embedding levels, build a map of indices to replacement chars
  * for any characters in right-to-left segments that have defined mirrored characters.
- * @param string
+ * @param {string[]} characters - an array of character strings
  * @param embeddingLevels
  * @param [start]
  * @param [end]
  * @return {Map<number, string>}
  */
-export function getMirroredCharactersMap(string, embeddingLevels, start, end) {
-  let strLen = string.length
+export function getMirroredCharactersMap(characters, embeddingLevels, start, end) {
+  let strLen = characters.length
   start = Math.max(0, start == null ? 0 : +start)
   end = Math.min(strLen - 1, end == null ? strLen - 1 : +end)
 
   const map = new Map()
   for (let i = start; i <= end; i++) {
     if (embeddingLevels[i] & 1) { //only odd (rtl) levels
-      const mirror = getMirroredCharacter(string[i])
+      const mirror = getMirroredCharacter(characters[i])
       if (mirror !== null) {
         map.set(i, mirror)
       }

diff --git a/src/reordering.js b/src/reordering.js
@@ -1,17 +1,18 @@
 import { getBidiCharType, TRAILING_TYPES } from './charTypes.js'
 import { getMirroredCharacter } from './mirroring.js'
+import { stringToArray } from './util/stringToArray.js'
 
 /**
  * Given a start and end denoting a single line within a string, and a set of precalculated
  * bidi embedding levels, produce a list of segments whose ordering should be flipped, in sequence.
- * @param {string} string - the full input string
+ * @param {string[]} characters - an array of character strings
  * @param {GetEmbeddingLevelsResult} embeddingLevelsResult - the result object from getEmbeddingLevels
- * @param {number} [start] - first character in a subset of the full string
- * @param {number} [end] - last character in a subset of the full string
+ * @param {number} [start] - first character in a subset of the full characters array
+ * @param {number} [end] - last character in a subset of the full characters array
  * @return {number[][]} - the list of start/end segments that should be flipped, in order.
  */
-export function getReorderSegments(string, embeddingLevelsResult, start, end) {
-  let strLen = string.length
+export function getReorderSegments(characters, embeddingLevelsResult, start, end) {
+  let strLen = characters.length
   start = Math.max(0, start == null ? 0 : +start)
   end = Math.min(strLen - 1, end == null ? strLen - 1 : +end)
 
@@ -25,7 +26,7 @@ export function getReorderSegments(string, embeddingLevelsResult, start, end) {
 
       // 3.4 L1.4: Reset any sequence of whitespace characters and/or isolate formatting characters at the
       // end of the line to the paragraph level.
-      for (let i = lineEnd; i >= lineStart && (getBidiCharType(string[i]) & TRAILING_TYPES); i--) {
+      for (let i = lineEnd; i >= lineStart && (getBidiCharType(characters[i]) & TRAILING_TYPES); i--) {
         lineLevels[i] = paragraph.level
       }
 
@@ -57,35 +58,46 @@ export function getReorderSegments(string, embeddingLevelsResult, start, end) {
 }
 
 /**
- * @param {string} string
+ * @param {string[]} characters - an array of character strings
  * @param {GetEmbeddingLevelsResult} embedLevelsResult
  * @param {number} [start]
  * @param {number} [end]
- * @return {string} the new string with bidi segments reordered
+ * @return {string[]} a new array of characters with bidi segments reordered and odd-level (RTL) characters mirrored
  */
-export function getReorderedString(string, embedLevelsResult, start, end) {
-  const indices = getReorderedIndices(string, embedLevelsResult, start, end)
-  const chars = [...string]
+export function getReorderedCharacters(characters, embedLevelsResult, start, end) {
+  const indices = getReorderedIndices(characters, embedLevelsResult, start, end)
+  let result = [];
   indices.forEach((charIndex, i) => {
-    chars[i] = (
-      (embedLevelsResult.levels[charIndex] & 1) ? getMirroredCharacter(string[charIndex]) : null
-    ) || string[charIndex]
+    result[i] = (
+      (embedLevelsResult.levels[charIndex] & 1) ? getMirroredCharacter(characters[charIndex]) : null
+    ) || characters[charIndex]
   })
-  return chars.join('')
+  return result
 }
 
 /**
  * @param {string} string
  * @param {GetEmbeddingLevelsResult} embedLevelsResult
  * @param {number} [start]
  * @param {number} [end]
+ * @return {string} the new string with bidi segments reordered
+ */
+export function getReorderedString(string, embedLevelsResult, start, end) {
+  return getReorderedCharacters(stringToArray(string), embedLevelsResult, start, end).join('')
+}
+
+/**
+ * @param {string[]} characters - an array of character strings
+ * @param {GetEmbeddingLevelsResult} embedLevelsResult
+ * @param {number} [start]
+ * @param {number} [end]
  * @return {number[]} an array with character indices in their new bidi order
  */
-export function getReorderedIndices(string, embedLevelsResult, start, end) {
-  const segments = getReorderSegments(string, embedLevelsResult, start, end)
+export function getReorderedIndices(characters, embedLevelsResult, start, end) {
+  const segments = getReorderSegments(characters, embedLevelsResult, start, end)
   // Fill an array with indices
   const indices = []
-  for (let i = 0; i < string.length; i++) {
+  for (let i = 0; i < characters.length; i++) {
     indices[i] = i
   }
   // Reverse each segment in order

diff --git a/src/util/stringToArray.js b/src/util/stringToArray.js
@@ -0,0 +1,10 @@
+/**
+ * Break a string into rendered characters (graphemes) using `Intl.Segmenter`.
+ * Simpler ways of splitting a string do not account for characters made up of multiple UTF-16 code units.
+ * For instance `'👱🏽‍♂️'.length === 7`
+ * @param {string} string - input string
+ * @return {string[]} - the string broken down into an array of characters.
+ */
+export function stringToArray (string) {
+  return [...new Intl.Segmenter().segment(string)].map(x => x.segment);
+}
diff --git a/test/BidiCharacterTest.js b/test/BidiCharacterTest.js
@@ -30,9 +30,11 @@ module.exports.runBidiCharacterTest = function (bidi) {
       expectedOrder = expectedOrder.split(' ').map(s => parseInt(s, 10))
 
       const start = performance.now()
-      const embedLevelsResult = bidi.getEmbeddingLevels(input, paraDir)
+      const characters = bidi.stringToArray(input)
+      const embedLevelsResult = bidi.getEmbeddingLevelsForCharacters(characters.slice(), paraDir)
       const {levels, paragraphs} = embedLevelsResult
-      let reordered = bidi.getReorderedIndices(input, embedLevelsResult)
+      let reordered = bidi.getReorderedIndices(characters.slice(), embedLevelsResult)
+
       totalTime += performance.now() - start
 
       reordered = reordered.filter(i => expectedLevels[i] !== 'x') //those with indeterminate level are ommitted

diff --git a/test/BidiCharacterTest.txt b/test/BidiCharacterTest.txt
@@ -80,12 +80,12 @@
 202E 202D 05D0 202C 0028 005B 202C 202B 0061 005D 0029 0062;2;1;x x 4 x 3 3 x x 4 4 4 4;8 9 10 11 5 4 2
 
 # Nonspacing marks applied to paired brackets
-0061 0028 0062 0029 0331;1;1;2 2 2 2 2;0 1 2 3 4
-0061 0028 0332 0062 0029 0333;1;1;2 2 2 2 2 2;0 1 2 3 4 5
-05D0 0028 05D1 0029 0331;0;0;1 1 1 1 1;4 3 2 1 0
-05D0 0028 0332 05D1 0029 0333;0;0;1 1 1 1 1 1;5 4 3 2 1 0
-0661 0028 0662 0029 0331;0;0;2 1 2 1 1;4 3 2 1 0
-0661 0028 0332 0662 0029 0333;0;0;2 1 1 2 1 1;5 4 3 2 1 0
+0061 0028 0062 0029 0331;1;1;2 2 2 1;3 0 1 2
+0061 0028 0332 0062 0029 0333;1;1;2 2 2 1;3 0 1 2
+05D0 0028 05D1 0029 0331;0;0;1 1 1 0;2 1 0 3
+05D0 0028 0332 05D1 0029 0333;0;0;1 1 1 0;2 1 0 3
+0661 0028 0662 0029 0331;0;0;2 1 2 0;2 1 0 3
+0661 0028 0332 0662 0029 0333;0;0;2 1 2 0;2 1 0 3
 
 # Nested bracket pairs that reach and exceed the fixed capacity of the bracket stack
 # a ( ( ... ( b ) ) ... ) with 62, 63, and 64 nested bracket pairs
@@ -228,8 +228,8 @@
 0661 0009 0028 0662 0029;2;0;2 0 1 2 1;0 1 4 3 2
 0661 0020 0028 0662 0029;2;0;2 1 1 2 1;4 3 2 1 0
 05D0 0029 0020 0028 0661 0029;0;0;1 1 1 1 2 1;5 4 3 2 1 0
-05D0 0029 0028 0301 0031 0029;0;0;1 1 1 1 2 1;5 4 3 2 1 0
-05D0 0029 0028 0301 0661 0029;0;0;1 1 1 1 2 1;5 4 3 2 1 0
+05D0 0029 0028 0301 0031 0029;0;0;1 1 1 2 0;3 2 1 0 4
+05D0 0029 0028 0301 0661 0029;0;0;1 1 1 2 0;3 2 1 0 4
 0627 0028 0661 003F 0020 0029 005D;0;0;1 1 2 1 1 1 0;5 4 3 2 1 0 6
 
 # Combinations of paired brackets, numbers, and directional formatting characters

diff --git a/test/BidiTest.js b/test/BidiTest.js
@@ -72,9 +72,10 @@ module.exports.runBidiTest = function (bidi) {
         if (testFilter && testFilter(lineIdx + 1, paraDir) === false) continue
 
         const start = performance.now()
-        const embedLevelsResult = bidi.getEmbeddingLevels(inputString, paraDir)
+        const characters = inputString.split(''); //bidi.stringToArray(inputString)
+        const embedLevelsResult = bidi.getEmbeddingLevelsForCharacters(characters.slice(), paraDir)
         const {levels, paragraphs} = embedLevelsResult
-        let reordered = bidi.getReorderedIndices(inputString, embedLevelsResult)
+        let reordered = bidi.getReorderedIndices(characters.slice(), embedLevelsResult)
         totalTime += performance.now() - start
         reordered = reordered.filter(i => expectedLevels[i] !== 'x') //those with indeterminate level are ommitted