From 18fdc9de278c32936d649737d059073773e00164 Mon Sep 17 00:00:00 2001 From: Tyler Larson Date: Tue, 4 Jun 2024 08:53:07 -0400 Subject: [PATCH 1/3] This adds support for multibyte characters, all functions now operate based on character arrays --- src/embeddingLevels.js | 47 +++++++++++++++++++++++++------------- src/index.js | 5 ++-- src/mirroring.js | 10 ++++---- src/reordering.js | 48 ++++++++++++++++++++++++--------------- src/util/stringToArray.js | 10 ++++++++ 5 files changed, 79 insertions(+), 41 deletions(-) create mode 100644 src/util/stringToArray.js diff --git a/src/embeddingLevels.js b/src/embeddingLevels.js index 3815393..1ebb02e 100644 --- a/src/embeddingLevels.js +++ b/src/embeddingLevels.js @@ -8,6 +8,7 @@ import { TYPES } from './charTypes.js' import { closingToOpeningBracket, getCanonicalBracket, openingToClosingBracket } from './brackets.js' +import { stringToArray } from './util/stringToArray.js' // Local type aliases const { @@ -42,22 +43,36 @@ const { */ /** - * This function applies the Bidirectional Algorithm to a string, returning the resolved embedding levels + * This function applies the Bidirectional Algorithm to a string of characters, returning the resolved embedding levels * in a single Uint8Array plus a list of objects holding each paragraph's start and end indices and resolved * base embedding level. * - * @param {string} string - The input string + * @param {string} string - a string of characters * @param {"ltr"|"rtl"|"auto"} [baseDirection] - Use "ltr" or "rtl" to force a base paragraph direction, * otherwise a direction will be chosen automatically from each paragraph's contents. * @return {GetEmbeddingLevelsResult} */ export function getEmbeddingLevels (string, baseDirection) { + return getEmbeddingLevelsForCharacters(stringToArray(string), baseDirection); +} + +/** + * This function applies the Bidirectional Algorithm to an array of characters, returning the resolved embedding levels + * in a single Uint8Array plus a list of objects holding each paragraph's start and end indices and resolved + * base embedding level. + * + * @param {string[]} characters - an array of character strings + * @param {"ltr"|"rtl"|"auto"} [baseDirection] - Use "ltr" or "rtl" to force a base paragraph direction, + * otherwise a direction will be chosen automatically from each paragraph's contents. + * @return {GetEmbeddingLevelsResult} + */ +export function getEmbeddingLevelsForCharacters (characters, baseDirection) { const MAX_DEPTH = 125 // Start by mapping all characters to their unicode type, as a bitmask integer - const charTypes = new Uint32Array(string.length) - for (let i = 0; i < string.length; i++) { - charTypes[i] = getBidiCharType(string[i]) + const charTypes = new Uint32Array(characters.length) + for (let i = 0; i < characters.length; i++) { + charTypes[i] = getBidiCharType(characters[i]) } const charTypeCounts = new Map() //will be cleared at start of each paragraph @@ -74,18 +89,18 @@ export function getEmbeddingLevels (string, baseDirection) { } } - const embedLevels = new Uint8Array(string.length) + const embedLevels = new Uint8Array(characters.length) const isolationPairs = new Map() //init->pdi and pdi->init // === 3.3.1 The Paragraph Level === // 3.3.1 P1: Split the text into paragraphs const paragraphs = [] // [{start, end, level}, ...] let paragraph = null - for (let i = 0; i < string.length; i++) { + for (let i = 0; i < characters.length; i++) { if (!paragraph) { paragraphs.push(paragraph = { start: i, - end: string.length - 1, + end: characters.length - 1, // 3.3.1 P2-P3: Determine the paragraph level level: baseDirection === 'rtl' ? 1 : baseDirection === 'ltr' ? 0 : determineAutoEmbedLevel(i, false) }) @@ -477,7 +492,7 @@ export function getEmbeddingLevels (string, baseDirection) { // type, as that may have been changed earlier. This doesn't seem to be explicitly // called out in the spec, but is required for passage of certain tests. if (charTypes[seqIndices[si]] & NEUTRAL_ISOLATE_TYPES) { - const char = string[seqIndices[si]] + const char = characters[seqIndices[si]] let oppositeBracket // Opening bracket if (openingToClosingBracket(char) !== null) { @@ -553,7 +568,7 @@ export function getEmbeddingLevels (string, baseDirection) { if (useStrongType !== embedDirection) { for (let si = openSeqIdx + 1; si < seqIndices.length; si++) { if (!(charTypes[seqIndices[si]] & BN_LIKE_TYPES)) { - if (getBidiCharType(string[seqIndices[si]]) & TYPE_NSM) { + if (getBidiCharType(characters[seqIndices[si]]) & TYPE_NSM) { charTypes[seqIndices[si]] = useStrongType } break @@ -563,7 +578,7 @@ export function getEmbeddingLevels (string, baseDirection) { if (useStrongType !== embedDirection) { for (let si = closeSeqIdx + 1; si < seqIndices.length; si++) { if (!(charTypes[seqIndices[si]] & BN_LIKE_TYPES)) { - if (getBidiCharType(string[seqIndices[si]]) & TYPE_NSM) { + if (getBidiCharType(characters[seqIndices[si]]) & TYPE_NSM) { charTypes[seqIndices[si]] = useStrongType } break @@ -636,8 +651,8 @@ export function getEmbeddingLevels (string, baseDirection) { // 3.4 L1.1-4: Reset the embedding level of segment/paragraph separators, and any sequence of whitespace or // isolate formatting characters preceding them or the end of the paragraph, to the paragraph level. // NOTE: this will also need to be applied to each individual line ending after line wrapping occurs. - if (i === paragraph.end || getBidiCharType(string[i]) & (TYPE_S | TYPE_B)) { - for (let j = i; j >= 0 && (getBidiCharType(string[j]) & TRAILING_TYPES); j--) { + if (i === paragraph.end || getBidiCharType(characters[i]) & (TYPE_S | TYPE_B)) { + for (let j = i; j >= 0 && (getBidiCharType(characters[j]) & TRAILING_TYPES); j--) { embedLevels[j] = paragraph.level } } @@ -653,7 +668,7 @@ export function getEmbeddingLevels (string, baseDirection) { function determineAutoEmbedLevel (start, isFSI) { // 3.3.1 P2 - P3 - for (let i = start; i < string.length; i++) { + for (let i = start; i < characters.length; i++) { const charType = charTypes[i] if (charType & (TYPE_R | TYPE_AL)) { return 1 @@ -663,7 +678,7 @@ export function getEmbeddingLevels (string, baseDirection) { } if (charType & ISOLATE_INIT_TYPES) { const pdi = indexOfMatchingPDI(i) - i = pdi === -1 ? string.length : pdi + i = pdi === -1 ? characters.length : pdi } } return 0 @@ -672,7 +687,7 @@ export function getEmbeddingLevels (string, baseDirection) { function indexOfMatchingPDI (isolateStart) { // 3.1.2 BD9 let isolationLevel = 1 - for (let i = isolateStart + 1; i < string.length; i++) { + for (let i = isolateStart + 1; i < characters.length; i++) { const charType = charTypes[i] if (charType & TYPE_B) { break diff --git a/src/index.js b/src/index.js index d146bb0..5a466fb 100644 --- a/src/index.js +++ b/src/index.js @@ -1,5 +1,6 @@ -export { getEmbeddingLevels } from './embeddingLevels.js' -export { getReorderSegments, getReorderedIndices, getReorderedString } from './reordering.js' +export { getEmbeddingLevels, getEmbeddingLevelsForCharacters } from './embeddingLevels.js' +export { getReorderSegments, getReorderedCharacters, getReorderedIndices, getReorderedString } from './reordering.js' export { getBidiCharType, getBidiCharTypeName } from './charTypes.js' export { getMirroredCharacter, getMirroredCharactersMap } from './mirroring.js' export { closingToOpeningBracket, openingToClosingBracket, getCanonicalBracket } from './brackets.js' +export { stringToArray } from './util/stringToArray.js' diff --git a/src/mirroring.js b/src/mirroring.js index c214b04..e11f9be 100644 --- a/src/mirroring.js +++ b/src/mirroring.js @@ -22,23 +22,23 @@ export function getMirroredCharacter (char) { } /** - * Given a string and its resolved embedding levels, build a map of indices to replacement chars + * Given a character array and its resolved embedding levels, build a map of indices to replacement chars * for any characters in right-to-left segments that have defined mirrored characters. - * @param string + * @param {string[]} characters - an array of character strings * @param embeddingLevels * @param [start] * @param [end] * @return {Map} */ -export function getMirroredCharactersMap(string, embeddingLevels, start, end) { - let strLen = string.length +export function getMirroredCharactersMap(characters, embeddingLevels, start, end) { + let strLen = characters.length start = Math.max(0, start == null ? 0 : +start) end = Math.min(strLen - 1, end == null ? strLen - 1 : +end) const map = new Map() for (let i = start; i <= end; i++) { if (embeddingLevels[i] & 1) { //only odd (rtl) levels - const mirror = getMirroredCharacter(string[i]) + const mirror = getMirroredCharacter(characters[i]) if (mirror !== null) { map.set(i, mirror) } diff --git a/src/reordering.js b/src/reordering.js index 94a42ed..c4f033f 100644 --- a/src/reordering.js +++ b/src/reordering.js @@ -1,17 +1,18 @@ import { getBidiCharType, TRAILING_TYPES } from './charTypes.js' import { getMirroredCharacter } from './mirroring.js' +import { stringToArray } from './utils/stringToArray.js' /** * Given a start and end denoting a single line within a string, and a set of precalculated * bidi embedding levels, produce a list of segments whose ordering should be flipped, in sequence. - * @param {string} string - the full input string + * @param {string[]} characters - an array of character strings * @param {GetEmbeddingLevelsResult} embeddingLevelsResult - the result object from getEmbeddingLevels - * @param {number} [start] - first character in a subset of the full string - * @param {number} [end] - last character in a subset of the full string + * @param {number} [start] - first character in a subset of the full characters array + * @param {number} [end] - last character in a subset of the full characters array * @return {number[][]} - the list of start/end segments that should be flipped, in order. */ -export function getReorderSegments(string, embeddingLevelsResult, start, end) { - let strLen = string.length +export function getReorderSegments(characters, embeddingLevelsResult, start, end) { + let strLen = characters.length start = Math.max(0, start == null ? 0 : +start) end = Math.min(strLen - 1, end == null ? strLen - 1 : +end) @@ -25,7 +26,7 @@ export function getReorderSegments(string, embeddingLevelsResult, start, end) { // 3.4 L1.4: Reset any sequence of whitespace characters and/or isolate formatting characters at the // end of the line to the paragraph level. - for (let i = lineEnd; i >= lineStart && (getBidiCharType(string[i]) & TRAILING_TYPES); i--) { + for (let i = lineEnd; i >= lineStart && (getBidiCharType(characters[i]) & TRAILING_TYPES); i--) { lineLevels[i] = paragraph.level } @@ -57,21 +58,21 @@ export function getReorderSegments(string, embeddingLevelsResult, start, end) { } /** - * @param {string} string + * @param {array} characters * @param {GetEmbeddingLevelsResult} embedLevelsResult * @param {number} [start] * @param {number} [end] - * @return {string} the new string with bidi segments reordered + * @return {array} a new array with bidi segments reordered */ -export function getReorderedString(string, embedLevelsResult, start, end) { - const indices = getReorderedIndices(string, embedLevelsResult, start, end) - const chars = [...string] +export function getReorderedCharacters(characters, embedLevelsResult, start, end) { + const indices = getReorderedIndices(characters, embedLevelsResult, start, end) + let result = []; indices.forEach((charIndex, i) => { - chars[i] = ( - (embedLevelsResult.levels[charIndex] & 1) ? getMirroredCharacter(string[charIndex]) : null - ) || string[charIndex] + result[i] = ( + (embedLevelsResult.levels[charIndex] & 1) ? getMirroredCharacter(characters[charIndex]) : null + ) || characters[charIndex] }) - return chars.join('') + return result } /** @@ -79,13 +80,24 @@ export function getReorderedString(string, embedLevelsResult, start, end) { * @param {GetEmbeddingLevelsResult} embedLevelsResult * @param {number} [start] * @param {number} [end] + * @return {string} the new string with bidi segments reordered + */ +export function getReorderedString(string, embedLevelsResult, start, end) { + return getReorderedCharacters(stringToArray(string), embedLevelsResult, start, end).join('') +} + +/** + * @param {string[]} characters - an array of character strings + * @param {GetEmbeddingLevelsResult} embedLevelsResult + * @param {number} [start] + * @param {number} [end] * @return {number[]} an array with character indices in their new bidi order */ -export function getReorderedIndices(string, embedLevelsResult, start, end) { - const segments = getReorderSegments(string, embedLevelsResult, start, end) +export function getReorderedIndices(characters, embedLevelsResult, start, end) { + const segments = getReorderSegments(characters, embedLevelsResult, start, end) // Fill an array with indices const indices = [] - for (let i = 0; i < string.length; i++) { + for (let i = 0; i < characters.length; i++) { indices[i] = i } // Reverse each segment in order diff --git a/src/util/stringToArray.js b/src/util/stringToArray.js new file mode 100644 index 0000000..3efa7ee --- /dev/null +++ b/src/util/stringToArray.js @@ -0,0 +1,10 @@ +/** + * Break a string into rendered characters (graphemes), + * using simpler methods of breaking strings apart doesn't take into account characters with multiple bytes. + * For instance `'👱🏽‍♂️'.length === 7` + * @param {string} string - input string + * @return {string[]} - the string broken down into an array of characters. + */ +export function stringToArray (string) { + return [...new Intl.Segmenter().segment(string)].map(x => x.segment); +} \ No newline at end of file From c696aab32582b68e2b2f6522e9fb8245055c5a96 Mon Sep 17 00:00:00 2001 From: Tyler Larson Date: Tue, 4 Jun 2024 09:01:29 -0400 Subject: [PATCH 2/3] misspelling --- src/reordering.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/reordering.js b/src/reordering.js index c4f033f..e363ac8 100644 --- a/src/reordering.js +++ b/src/reordering.js @@ -1,6 +1,6 @@ import { getBidiCharType, TRAILING_TYPES } from './charTypes.js' import { getMirroredCharacter } from './mirroring.js' -import { stringToArray } from './utils/stringToArray.js' +import { stringToArray } from './util/stringToArray.js' /** * Given a start and end denoting a single line within a string, and a set of precalculated From 0416b8a0c80364da8342a99b48ac360a32d995bb Mon Sep 17 00:00:00 2001 From: Tyler Larson Date: Wed, 26 Jun 2024 14:15:13 -0400 Subject: [PATCH 3/3] trying to fix the unit tests --- test/BidiCharacterTest.js | 6 ++++-- test/BidiCharacterTest.txt | 16 ++++++++-------- test/BidiTest.js | 5 +++-- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/test/BidiCharacterTest.js b/test/BidiCharacterTest.js index a16fa77..cdfa844 100644 --- a/test/BidiCharacterTest.js +++ b/test/BidiCharacterTest.js @@ -30,9 +30,11 @@ module.exports.runBidiCharacterTest = function (bidi) { expectedOrder = expectedOrder.split(' ').map(s => parseInt(s, 10)) const start = performance.now() - const embedLevelsResult = bidi.getEmbeddingLevels(input, paraDir) + const characters = bidi.stringToArray(input) + const embedLevelsResult = bidi.getEmbeddingLevelsForCharacters(characters.slice(), paraDir) const {levels, paragraphs} = embedLevelsResult - let reordered = bidi.getReorderedIndices(input, embedLevelsResult) + let reordered = bidi.getReorderedIndices(characters.slice(), embedLevelsResult) + totalTime += performance.now() - start reordered = reordered.filter(i => expectedLevels[i] !== 'x') //those with indeterminate level are ommitted diff --git a/test/BidiCharacterTest.txt b/test/BidiCharacterTest.txt index 02e2083..5ac0a33 100644 --- a/test/BidiCharacterTest.txt +++ b/test/BidiCharacterTest.txt @@ -80,12 +80,12 @@ 202E 202D 05D0 202C 0028 005B 202C 202B 0061 005D 0029 0062;2;1;x x 4 x 3 3 x x 4 4 4 4;8 9 10 11 5 4 2 # Nonspacing marks applied to paired brackets -0061 0028 0062 0029 0331;1;1;2 2 2 2 2;0 1 2 3 4 -0061 0028 0332 0062 0029 0333;1;1;2 2 2 2 2 2;0 1 2 3 4 5 -05D0 0028 05D1 0029 0331;0;0;1 1 1 1 1;4 3 2 1 0 -05D0 0028 0332 05D1 0029 0333;0;0;1 1 1 1 1 1;5 4 3 2 1 0 -0661 0028 0662 0029 0331;0;0;2 1 2 1 1;4 3 2 1 0 -0661 0028 0332 0662 0029 0333;0;0;2 1 1 2 1 1;5 4 3 2 1 0 +0061 0028 0062 0029 0331;1;1;2 2 2 1;3 0 1 2 +0061 0028 0332 0062 0029 0333;1;1;2 2 2 1;3 0 1 2 +05D0 0028 05D1 0029 0331;0;0;1 1 1 0;2 1 0 3 +05D0 0028 0332 05D1 0029 0333;0;0;1 1 1 0;2 1 0 3 +0661 0028 0662 0029 0331;0;0;2 1 2 0;2 1 0 3 +0661 0028 0332 0662 0029 0333;0;0;2 1 2 0;2 1 0 3 # Nested bracket pairs that reach and exceed the fixed capacity of the bracket stack # a ( ( ... ( b ) ) ... ) with 62, 63, and 64 nested bracket pairs @@ -228,8 +228,8 @@ 0661 0009 0028 0662 0029;2;0;2 0 1 2 1;0 1 4 3 2 0661 0020 0028 0662 0029;2;0;2 1 1 2 1;4 3 2 1 0 05D0 0029 0020 0028 0661 0029;0;0;1 1 1 1 2 1;5 4 3 2 1 0 -05D0 0029 0028 0301 0031 0029;0;0;1 1 1 1 2 1;5 4 3 2 1 0 -05D0 0029 0028 0301 0661 0029;0;0;1 1 1 1 2 1;5 4 3 2 1 0 +05D0 0029 0028 0301 0031 0029;0;0;1 1 1 2 0;3 2 1 0 4 +05D0 0029 0028 0301 0661 0029;0;0;1 1 1 2 0;3 2 1 0 4 0627 0028 0661 003F 0020 0029 005D;0;0;1 1 2 1 1 1 0;5 4 3 2 1 0 6 # Combinations of paired brackets, numbers, and directional formatting characters diff --git a/test/BidiTest.js b/test/BidiTest.js index ac9adfc..c3ad8a5 100644 --- a/test/BidiTest.js +++ b/test/BidiTest.js @@ -72,9 +72,10 @@ module.exports.runBidiTest = function (bidi) { if (testFilter && testFilter(lineIdx + 1, paraDir) === false) continue const start = performance.now() - const embedLevelsResult = bidi.getEmbeddingLevels(inputString, paraDir) + const characters = inputString.split(''); //bidi.stringToArray(inputString) + const embedLevelsResult = bidi.getEmbeddingLevelsForCharacters(characters.slice(), paraDir) const {levels, paragraphs} = embedLevelsResult - let reordered = bidi.getReorderedIndices(inputString, embedLevelsResult) + let reordered = bidi.getReorderedIndices(characters.slice(), embedLevelsResult) totalTime += performance.now() - start reordered = reordered.filter(i => expectedLevels[i] !== 'x') //those with indeterminate level are ommitted