Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for multibyte characters #10

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 31 additions & 16 deletions src/embeddingLevels.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import {
TYPES
} from './charTypes.js'
import { closingToOpeningBracket, getCanonicalBracket, openingToClosingBracket } from './brackets.js'
import { stringToArray } from './util/stringToArray.js'

// Local type aliases
const {
Expand Down Expand Up @@ -42,22 +43,36 @@ const {
*/

/**
* This function applies the Bidirectional Algorithm to a string, returning the resolved embedding levels
* This function applies the Bidirectional Algorithm to a string of characters, returning the resolved embedding levels
* in a single Uint8Array plus a list of objects holding each paragraph's start and end indices and resolved
* base embedding level.
*
* @param {string} string - The input string
* @param {string} string - a string of characters
* @param {"ltr"|"rtl"|"auto"} [baseDirection] - Use "ltr" or "rtl" to force a base paragraph direction,
* otherwise a direction will be chosen automatically from each paragraph's contents.
* @return {GetEmbeddingLevelsResult}
*/
export function getEmbeddingLevels (string, baseDirection) {
return getEmbeddingLevelsForCharacters(stringToArray(string), baseDirection);
}

/**
* This function applies the Bidirectional Algorithm to an array of characters, returning the resolved embedding levels
* in a single Uint8Array plus a list of objects holding each paragraph's start and end indices and resolved
* base embedding level.
*
* @param {string[]} characters - an array of character strings
* @param {"ltr"|"rtl"|"auto"} [baseDirection] - Use "ltr" or "rtl" to force a base paragraph direction,
* otherwise a direction will be chosen automatically from each paragraph's contents.
* @return {GetEmbeddingLevelsResult}
*/
export function getEmbeddingLevelsForCharacters (characters, baseDirection) {
const MAX_DEPTH = 125

// Start by mapping all characters to their unicode type, as a bitmask integer
const charTypes = new Uint32Array(string.length)
for (let i = 0; i < string.length; i++) {
charTypes[i] = getBidiCharType(string[i])
const charTypes = new Uint32Array(characters.length)
for (let i = 0; i < characters.length; i++) {
charTypes[i] = getBidiCharType(characters[i])
}

const charTypeCounts = new Map() //will be cleared at start of each paragraph
Expand All @@ -74,18 +89,18 @@ export function getEmbeddingLevels (string, baseDirection) {
}
}

const embedLevels = new Uint8Array(string.length)
const embedLevels = new Uint8Array(characters.length)
const isolationPairs = new Map() //init->pdi and pdi->init

// === 3.3.1 The Paragraph Level ===
// 3.3.1 P1: Split the text into paragraphs
const paragraphs = [] // [{start, end, level}, ...]
let paragraph = null
for (let i = 0; i < string.length; i++) {
for (let i = 0; i < characters.length; i++) {
if (!paragraph) {
paragraphs.push(paragraph = {
start: i,
end: string.length - 1,
end: characters.length - 1,
// 3.3.1 P2-P3: Determine the paragraph level
level: baseDirection === 'rtl' ? 1 : baseDirection === 'ltr' ? 0 : determineAutoEmbedLevel(i, false)
})
Expand Down Expand Up @@ -477,7 +492,7 @@ export function getEmbeddingLevels (string, baseDirection) {
// type, as that may have been changed earlier. This doesn't seem to be explicitly
// called out in the spec, but is required for passage of certain tests.
if (charTypes[seqIndices[si]] & NEUTRAL_ISOLATE_TYPES) {
const char = string[seqIndices[si]]
const char = characters[seqIndices[si]]
let oppositeBracket
// Opening bracket
if (openingToClosingBracket(char) !== null) {
Expand Down Expand Up @@ -553,7 +568,7 @@ export function getEmbeddingLevels (string, baseDirection) {
if (useStrongType !== embedDirection) {
for (let si = openSeqIdx + 1; si < seqIndices.length; si++) {
if (!(charTypes[seqIndices[si]] & BN_LIKE_TYPES)) {
if (getBidiCharType(string[seqIndices[si]]) & TYPE_NSM) {
if (getBidiCharType(characters[seqIndices[si]]) & TYPE_NSM) {
charTypes[seqIndices[si]] = useStrongType
}
break
Expand All @@ -563,7 +578,7 @@ export function getEmbeddingLevels (string, baseDirection) {
if (useStrongType !== embedDirection) {
for (let si = closeSeqIdx + 1; si < seqIndices.length; si++) {
if (!(charTypes[seqIndices[si]] & BN_LIKE_TYPES)) {
if (getBidiCharType(string[seqIndices[si]]) & TYPE_NSM) {
if (getBidiCharType(characters[seqIndices[si]]) & TYPE_NSM) {
charTypes[seqIndices[si]] = useStrongType
}
break
Expand Down Expand Up @@ -636,8 +651,8 @@ export function getEmbeddingLevels (string, baseDirection) {
// 3.4 L1.1-4: Reset the embedding level of segment/paragraph separators, and any sequence of whitespace or
// isolate formatting characters preceding them or the end of the paragraph, to the paragraph level.
// NOTE: this will also need to be applied to each individual line ending after line wrapping occurs.
if (i === paragraph.end || getBidiCharType(string[i]) & (TYPE_S | TYPE_B)) {
for (let j = i; j >= 0 && (getBidiCharType(string[j]) & TRAILING_TYPES); j--) {
if (i === paragraph.end || getBidiCharType(characters[i]) & (TYPE_S | TYPE_B)) {
for (let j = i; j >= 0 && (getBidiCharType(characters[j]) & TRAILING_TYPES); j--) {
embedLevels[j] = paragraph.level
}
}
Expand All @@ -653,7 +668,7 @@ export function getEmbeddingLevels (string, baseDirection) {

function determineAutoEmbedLevel (start, isFSI) {
// 3.3.1 P2 - P3
for (let i = start; i < string.length; i++) {
for (let i = start; i < characters.length; i++) {
const charType = charTypes[i]
if (charType & (TYPE_R | TYPE_AL)) {
return 1
Expand All @@ -663,7 +678,7 @@ export function getEmbeddingLevels (string, baseDirection) {
}
if (charType & ISOLATE_INIT_TYPES) {
const pdi = indexOfMatchingPDI(i)
i = pdi === -1 ? string.length : pdi
i = pdi === -1 ? characters.length : pdi
}
}
return 0
Expand All @@ -672,7 +687,7 @@ export function getEmbeddingLevels (string, baseDirection) {
function indexOfMatchingPDI (isolateStart) {
// 3.1.2 BD9
let isolationLevel = 1
for (let i = isolateStart + 1; i < string.length; i++) {
for (let i = isolateStart + 1; i < characters.length; i++) {
const charType = charTypes[i]
if (charType & TYPE_B) {
break
Expand Down
5 changes: 3 additions & 2 deletions src/index.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
export { getEmbeddingLevels } from './embeddingLevels.js'
export { getReorderSegments, getReorderedIndices, getReorderedString } from './reordering.js'
export { getEmbeddingLevels, getEmbeddingLevelsForCharacters } from './embeddingLevels.js'
export { getReorderSegments, getReorderedCharacters, getReorderedIndices, getReorderedString } from './reordering.js'
export { getBidiCharType, getBidiCharTypeName } from './charTypes.js'
export { getMirroredCharacter, getMirroredCharactersMap } from './mirroring.js'
export { closingToOpeningBracket, openingToClosingBracket, getCanonicalBracket } from './brackets.js'
export { stringToArray } from './util/stringToArray.js'
10 changes: 5 additions & 5 deletions src/mirroring.js
Original file line number Diff line number Diff line change
Expand Up @@ -22,23 +22,23 @@ export function getMirroredCharacter (char) {
}

/**
* Given a string and its resolved embedding levels, build a map of indices to replacement chars
* Given a character array and its resolved embedding levels, build a map of indices to replacement chars
* for any characters in right-to-left segments that have defined mirrored characters.
* @param string
* @param {string[]} characters - an array of character strings
* @param embeddingLevels
* @param [start]
* @param [end]
* @return {Map<number, string>}
*/
export function getMirroredCharactersMap(string, embeddingLevels, start, end) {
let strLen = string.length
export function getMirroredCharactersMap(characters, embeddingLevels, start, end) {
let strLen = characters.length
start = Math.max(0, start == null ? 0 : +start)
end = Math.min(strLen - 1, end == null ? strLen - 1 : +end)

const map = new Map()
for (let i = start; i <= end; i++) {
if (embeddingLevels[i] & 1) { //only odd (rtl) levels
const mirror = getMirroredCharacter(string[i])
const mirror = getMirroredCharacter(characters[i])
if (mirror !== null) {
map.set(i, mirror)
}
Expand Down
48 changes: 30 additions & 18 deletions src/reordering.js
Original file line number Diff line number Diff line change
@@ -1,17 +1,18 @@
import { getBidiCharType, TRAILING_TYPES } from './charTypes.js'
import { getMirroredCharacter } from './mirroring.js'
import { stringToArray } from './util/stringToArray.js'

/**
* Given a start and end denoting a single line within a string, and a set of precalculated
* bidi embedding levels, produce a list of segments whose ordering should be flipped, in sequence.
* @param {string} string - the full input string
* @param {string[]} characters - an array of character strings
* @param {GetEmbeddingLevelsResult} embeddingLevelsResult - the result object from getEmbeddingLevels
* @param {number} [start] - first character in a subset of the full string
* @param {number} [end] - last character in a subset of the full string
* @param {number} [start] - first character in a subset of the full characters array
* @param {number} [end] - last character in a subset of the full characters array
* @return {number[][]} - the list of start/end segments that should be flipped, in order.
*/
export function getReorderSegments(string, embeddingLevelsResult, start, end) {
let strLen = string.length
export function getReorderSegments(characters, embeddingLevelsResult, start, end) {
let strLen = characters.length
start = Math.max(0, start == null ? 0 : +start)
end = Math.min(strLen - 1, end == null ? strLen - 1 : +end)

Expand All @@ -25,7 +26,7 @@ export function getReorderSegments(string, embeddingLevelsResult, start, end) {

// 3.4 L1.4: Reset any sequence of whitespace characters and/or isolate formatting characters at the
// end of the line to the paragraph level.
for (let i = lineEnd; i >= lineStart && (getBidiCharType(string[i]) & TRAILING_TYPES); i--) {
for (let i = lineEnd; i >= lineStart && (getBidiCharType(characters[i]) & TRAILING_TYPES); i--) {
lineLevels[i] = paragraph.level
}

Expand Down Expand Up @@ -57,35 +58,46 @@ export function getReorderSegments(string, embeddingLevelsResult, start, end) {
}

/**
* @param {string} string
* @param {array} characters
* @param {GetEmbeddingLevelsResult} embedLevelsResult
* @param {number} [start]
* @param {number} [end]
* @return {string} the new string with bidi segments reordered
* @return {array} a new array with bidi segments reordered
*/
export function getReorderedString(string, embedLevelsResult, start, end) {
const indices = getReorderedIndices(string, embedLevelsResult, start, end)
const chars = [...string]
export function getReorderedCharacters(characters, embedLevelsResult, start, end) {
const indices = getReorderedIndices(characters, embedLevelsResult, start, end)
let result = [];
indices.forEach((charIndex, i) => {
chars[i] = (
(embedLevelsResult.levels[charIndex] & 1) ? getMirroredCharacter(string[charIndex]) : null
) || string[charIndex]
result[i] = (
(embedLevelsResult.levels[charIndex] & 1) ? getMirroredCharacter(characters[charIndex]) : null
) || characters[charIndex]
})
return chars.join('')
return result
}

/**
* @param {string} string
* @param {GetEmbeddingLevelsResult} embedLevelsResult
* @param {number} [start]
* @param {number} [end]
* @return {string} the new string with bidi segments reordered
*/
export function getReorderedString(string, embedLevelsResult, start, end) {
return getReorderedCharacters(stringToArray(string), embedLevelsResult, start, end).join('')
}

/**
* @param {string[]} characters - an array of character strings
* @param {GetEmbeddingLevelsResult} embedLevelsResult
* @param {number} [start]
* @param {number} [end]
* @return {number[]} an array with character indices in their new bidi order
*/
export function getReorderedIndices(string, embedLevelsResult, start, end) {
const segments = getReorderSegments(string, embedLevelsResult, start, end)
export function getReorderedIndices(characters, embedLevelsResult, start, end) {
const segments = getReorderSegments(characters, embedLevelsResult, start, end)
// Fill an array with indices
const indices = []
for (let i = 0; i < string.length; i++) {
for (let i = 0; i < characters.length; i++) {
indices[i] = i
}
// Reverse each segment in order
Expand Down
10 changes: 10 additions & 0 deletions src/util/stringToArray.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
/**
* Break a string into rendered characters (graphemes),
* using simpler methods of breaking strings apart doesn't take into account characters with multiple bytes.
* For instance `'πŸ‘±πŸ½β€β™‚οΈ'.length === 7`
* @param {string} string - input string
* @return {string[]} - the string broken down into an array of characters.
*/
export function stringToArray (string) {
return [...new Intl.Segmenter().segment(string)].map(x => x.segment);
Copy link

@1ec5 1ec5 Aug 24, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Intl.Segmenter segments by grapheme clusters by default. This is possibly overkill for fixing #9. In some scripts, it will group whole syllables of multiple characters. If the issue is just that surrogate pairs are getting split up, then use the spread operator ([...string]) or the string[Symbol.iterator]() iterator directly on the string you want to split, as described in this documentation:

return [...string];

The standard iterator will also get you whole Unicode codepoints:

for (const char of string) {
  const codePoint = char.codePointAt(0); // never a lone surrogate
}

The string iterator is much more widely supported than Intl.Segmenter, which only landed in Firefox a few months ago.

}
6 changes: 4 additions & 2 deletions test/BidiCharacterTest.js
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,11 @@ module.exports.runBidiCharacterTest = function (bidi) {
expectedOrder = expectedOrder.split(' ').map(s => parseInt(s, 10))

const start = performance.now()
const embedLevelsResult = bidi.getEmbeddingLevels(input, paraDir)
const characters = bidi.stringToArray(input)
const embedLevelsResult = bidi.getEmbeddingLevelsForCharacters(characters.slice(), paraDir)
const {levels, paragraphs} = embedLevelsResult
let reordered = bidi.getReorderedIndices(input, embedLevelsResult)
let reordered = bidi.getReorderedIndices(characters.slice(), embedLevelsResult)

totalTime += performance.now() - start

reordered = reordered.filter(i => expectedLevels[i] !== 'x') //those with indeterminate level are ommitted
Expand Down
16 changes: 8 additions & 8 deletions test/BidiCharacterTest.txt
Original file line number Diff line number Diff line change
Expand Up @@ -80,12 +80,12 @@
202E 202D 05D0 202C 0028 005B 202C 202B 0061 005D 0029 0062;2;1;x x 4 x 3 3 x x 4 4 4 4;8 9 10 11 5 4 2

# Nonspacing marks applied to paired brackets
0061 0028 0062 0029 0331;1;1;2 2 2 2 2;0 1 2 3 4
0061 0028 0332 0062 0029 0333;1;1;2 2 2 2 2 2;0 1 2 3 4 5
05D0 0028 05D1 0029 0331;0;0;1 1 1 1 1;4 3 2 1 0
05D0 0028 0332 05D1 0029 0333;0;0;1 1 1 1 1 1;5 4 3 2 1 0
0661 0028 0662 0029 0331;0;0;2 1 2 1 1;4 3 2 1 0
0661 0028 0332 0662 0029 0333;0;0;2 1 1 2 1 1;5 4 3 2 1 0
0061 0028 0062 0029 0331;1;1;2 2 2 1;3 0 1 2

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe you are not supposed to touch this file it is imported directly from the Unicode bidi spec

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you think BidiCharacterTest.js should also use string.split('')

0061 0028 0332 0062 0029 0333;1;1;2 2 2 1;3 0 1 2
05D0 0028 05D1 0029 0331;0;0;1 1 1 0;2 1 0 3
05D0 0028 0332 05D1 0029 0333;0;0;1 1 1 0;2 1 0 3
0661 0028 0662 0029 0331;0;0;2 1 2 0;2 1 0 3
0661 0028 0332 0662 0029 0333;0;0;2 1 2 0;2 1 0 3

# Nested bracket pairs that reach and exceed the fixed capacity of the bracket stack
# a ( ( ... ( b ) ) ... ) with 62, 63, and 64 nested bracket pairs
Expand Down Expand Up @@ -228,8 +228,8 @@
0661 0009 0028 0662 0029;2;0;2 0 1 2 1;0 1 4 3 2
0661 0020 0028 0662 0029;2;0;2 1 1 2 1;4 3 2 1 0
05D0 0029 0020 0028 0661 0029;0;0;1 1 1 1 2 1;5 4 3 2 1 0
05D0 0029 0028 0301 0031 0029;0;0;1 1 1 1 2 1;5 4 3 2 1 0
05D0 0029 0028 0301 0661 0029;0;0;1 1 1 1 2 1;5 4 3 2 1 0
05D0 0029 0028 0301 0031 0029;0;0;1 1 1 2 0;3 2 1 0 4
05D0 0029 0028 0301 0661 0029;0;0;1 1 1 2 0;3 2 1 0 4
0627 0028 0661 003F 0020 0029 005D;0;0;1 1 2 1 1 1 0;5 4 3 2 1 0 6

# Combinations of paired brackets, numbers, and directional formatting characters
Expand Down
5 changes: 3 additions & 2 deletions test/BidiTest.js
Original file line number Diff line number Diff line change
Expand Up @@ -72,9 +72,10 @@ module.exports.runBidiTest = function (bidi) {
if (testFilter && testFilter(lineIdx + 1, paraDir) === false) continue

const start = performance.now()
const embedLevelsResult = bidi.getEmbeddingLevels(inputString, paraDir)
const characters = inputString.split(''); //bidi.stringToArray(inputString)
const embedLevelsResult = bidi.getEmbeddingLevelsForCharacters(characters.slice(), paraDir)
const {levels, paragraphs} = embedLevelsResult
let reordered = bidi.getReorderedIndices(inputString, embedLevelsResult)
let reordered = bidi.getReorderedIndices(characters.slice(), embedLevelsResult)
totalTime += performance.now() - start
reordered = reordered.filter(i => expectedLevels[i] !== 'x') //those with indeterminate level are ommitted

Expand Down