feat(transformers): introduce matchAlgorithm option for new matchin…

…g algorithm (#835) Co-authored-by: Anthony Fu <[email protected]>
shikijs · Jan 20, 2025 · ceca984 · ceca984
1 parent 4ed7fa3
commit ceca984
Show file tree

Hide file tree

Showing 21 changed files with 448 additions and 105 deletions.
diff --git a/docs/packages/transformers.md b/docs/packages/transformers.md
@@ -41,6 +41,44 @@ const html = await codeToHtml(code, {
 
 Transformers only applies classes and does not come with styles; you can provide your own CSS rules to style them properly.
 
+## Matching Algorithm
+
+We found that the algorithm for matching comments in v1 is sometimes counterintuitive, and we are fixing it progressively. Since v1.29.0, most of the transformers accept a new `matchAlgorithm` option that lets you toggle between matching algorithms. Right now, the default is `v1`, which is the old algorithm, and `v3` is the new algorithm. When Shiki v3 lands, the default will be `v3`.
+
+```ts
+const html = await codeToHtml(code, {
+  lang: 'ts',
+  theme: 'nord',
+  transformers: [
+    transformerNotationDiff({
+      matchAlgorithm: 'v3', // [!code hl]
+    }),
+  ],
+})
+```
+
+### `matchAlgorithm: 'v1'`
+
+The matching algorithm mostly affects single-line comment matching. In `v1`, the comment line itself is counted as the first line, while in `v3`, counting starts from the line below the comment:
+
+```ts
+// [\!code highlight:3]
+console.log('highlighted') // [!code hl]
+console.log('highlighted') // [!code hl]
+console.log('not highlighted')
+```
+
+### `matchAlgorithm: 'v3'`
+
+In `v3`, the matching algorithm will start counting from the line below the comment:
+
+```ts
+// [\!code highlight:2]
+console.log('highlighted') // [!code hl]
+console.log('highlighted') // [!code hl]
+console.log('not highlighted')
+```
+
 ## Transformers
 
 ### `transformerNotationDiff`

diff --git a/packages/transformers/src/index.ts b/packages/transformers/src/index.ts
@@ -11,4 +11,3 @@ export * from './transformers/remove-line-breaks'
 export * from './transformers/remove-notation-escape'
 export * from './transformers/render-whitespace'
 export * from './transformers/style-to-class'
-export * from './utils'
diff --git a/packages/transformers/src/shared/highlight-word.ts b/packages/transformers/src/shared/highlight-word.ts
@@ -24,6 +24,7 @@ function getTextContent(element: ElementContent): string {
  * @param ignoredElement
  * @param index highlight beginning index
  * @param len highlight length
+ * @param className class name to add to highlighted nodes
  */
 function highlightRange(
   this: ShikiTransformerContext,
@@ -64,14 +65,14 @@ function highlightRange(
   }
 }
 
-function hasOverlap(range1: [number, number], range2: [ number, number]): boolean {
+function hasOverlap(range1: [number, number], range2: [number, number]): boolean {
   return (range1[0] <= range2[1]) && (range1[1]) >= range2[0]
 }
 
 function separateToken(span: Element, textNode: Text, index: number, len: number): [
-  before: Element | undefined,
-  med: Element,
-  after: Element | undefined,
+    before: Element | undefined,
+    med: Element,
+    after: Element | undefined,
 ] {
   const text = textNode.value
 

diff --git a/packages/transformers/src/shared/notation-transformer.ts b/packages/transformers/src/shared/notation-transformer.ts
@@ -0,0 +1,97 @@
+import type { Element, Text } from 'hast'
+import type { ShikiTransformer, ShikiTransformerContext } from 'shiki'
+import { parseComments, type ParsedComments, v1ClearEndCommentPrefix } from './parse-comments'
+
+/**
+ * Identifier of the comment matching algorithm used by the notation transformers.
+ */
+export type MatchAlgorithm = 'v1' | 'v3'
+
+/**
+ * Option mixin for transformers that let the caller pick the matching algorithm.
+ */
+export interface MatchAlgorithmOptions {
+  /**
+   * Match algorithm to use
+   *
+   * @see https://shiki.style/packages/transformers#matching-algorithm
+   * @default 'v1'
+   */
+  matchAlgorithm?: MatchAlgorithm
+}
+
+/**
+ * Create a transformer that detects notations written inside code comments.
+ *
+ * Comments are parsed once per `code` node and cached on `code.data._shiki_notation`;
+ * every transformer built by this factory that runs on the same node reuses that cache.
+ *
+ * NOTE(review): the cache is filled using the `matchAlgorithm` of whichever transformer
+ * runs first, so mixing algorithms on one code block looks unsupported - confirm.
+ *
+ * @param name name of the returned transformer
+ * @param regex pattern run against the content part of each parsed comment
+ * @param onMatch callback invoked for each regex match; returning `true` removes the matched text from the comment
+ * @param matchAlgorithm matching algorithm, `'v1'` when omitted
+ */
+export function createCommentNotationTransformer(
+  name: string,
+  regex: RegExp,
+  onMatch: (
+    this: ShikiTransformerContext,
+    match: string[],
+    line: Element,
+    commentNode: Element,
+    lines: Element[],
+    index: number
+  ) => boolean,
+  matchAlgorithm: MatchAlgorithm = 'v1',
+): ShikiTransformer {
+  return {
+    name,
+    code(code) {
+      // only element children are treated as lines, other children are ignored
+      const lines = code.children.filter(i => i.type === 'element')
+      const linesToRemove: (Element | Text)[] = []
+
+      code.data ??= {} as any
+      const data = code.data as {
+        _shiki_notation?: ParsedComments
+      }
+
+      // parse lazily, later runs on the same node reuse (and mutate) the same entries
+      data._shiki_notation ??= parseComments(lines, ['jsx', 'tsx'].includes(this.options.lang), matchAlgorithm)
+      const parsed = data._shiki_notation
+
+      for (const comment of parsed) {
+        // skip comments without content, including those emptied further below
+        if (comment.info[1].length === 0)
+          continue
+
+        // the line holds nothing but the comment (JSX style: the comment plus one token on each side)
+        const isLineCommentOnly = comment.line.children.length === (comment.isJsxStyle ? 3 : 1)
+        let lineIdx = lines.indexOf(comment.line)
+        // outside of v1, a comment-only line reports the index of the following line
+        if (isLineCommentOnly && matchAlgorithm !== 'v1')
+          lineIdx++
+
+        let replaced = false
+        // a match is dropped from the comment only when `onMatch` accepts it
+        comment.info[1] = comment.info[1].replace(regex, (...match) => {
+          if (onMatch.call(this, match, comment.line, comment.token, lines, lineIdx)) {
+            replaced = true
+            return ''
+          }
+
+          return match[0]
+        })
+
+        // nothing was consumed, leave the comment untouched
+        if (!replaced)
+          continue
+
+        if (matchAlgorithm === 'v1') {
+          comment.info[1] = v1ClearEndCommentPrefix(comment.info[1])
+        }
+
+        const isEmpty = comment.info[1].trim().length === 0
+        // ignore comment node
+        if (isEmpty)
+          comment.info[1] = ''
+
+        if (isEmpty && isLineCommentOnly) {
+          // removal is deferred until every comment has been processed
+          linesToRemove.push(comment.line)
+        }
+        else if (isEmpty && comment.isJsxStyle) {
+          // drop the comment token together with the token before and after it
+          comment.line.children.splice(comment.line.children.indexOf(comment.token) - 1, 3)
+        }
+        else if (isEmpty) {
+          comment.line.children.splice(comment.line.children.indexOf(comment.token), 1)
+        }
+        else {
+          // write the remaining comment text back into the first child of the token
+          const head = comment.token.children[0]
+
+          if (head.type === 'text') {
+            head.value = comment.info.join('')
+          }
+        }
+      }
+
+      // NOTE(review): exactly one child is spliced per line, so a sibling text node
+      // (presumably the line break) following a removed line stays - confirm intended
+      for (const line of linesToRemove)
+        code.children.splice(code.children.indexOf(line), 1)
+    },
+  }
+}
diff --git a/packages/transformers/src/shared/parse-comments.ts b/packages/transformers/src/shared/parse-comments.ts
@@ -0,0 +1,134 @@
+import type { Element, ElementContent } from 'hast'
+import type { MatchAlgorithm } from './notation-transformer'
+
+/**
+ * List of comments found by `parseComments`.
+ *
+ * - `line`: line element containing the comment
+ * - `token`: element of the comment itself
+ * - `info`: comment text split into prefix, content and optional suffix
+ * - `isJsxStyle`: whether the comment token sits between a `{` token and a `}` token
+ */
+export type ParsedComments = {
+  line: Element
+  token: Element
+  info: [prefix: string, content: string, suffix?: string]
+  isJsxStyle: boolean
+}[]
+
+/**
+ * Comment matchers, tried in order. Each entry pairs a pattern with an `endOfLine` flag.
+ *
+ * Some comment formats have to be located at the end of the line (flag `true`),
+ * hence we can skip matching them for other tokens.
+ */
+const matchers: [re: RegExp, endOfLine: boolean][] = [
+  // `<!-- ... -->`
+  [/^(<!--)(.+)(-->)$/, false],
+  // `/* ... */`
+  [/^(\/\*)(.+)(\*\/)$/, false],
+  // prefix-only comments: `//`, `"`, `'`, `#`, `;` or `;;`, `%` or `%%`, `--`
+  [/^(\/\/|["'#]|;{1,2}|%{1,2}|--)(.*)$/, true],
+  /**
+   * for continuation lines of a multi-line comment, which start with `*` like this one
+   */
+  [/^(\*)(.+)$/, true],
+]
+
+/**
+ * Collect the comment tokens found in the given lines.
+ *
+ * With `v1` every token of a line is inspected; with any other algorithm only the
+ * last token is inspected, or the last two tokens when `jsx` is enabled.
+ *
+ * @param lines line tokens
+ * @param jsx enable JSX parsing
+ * @param matchAlgorithm matching algorithm
+ * @returns one entry per matched comment token, in line order then token order
+ */
+export function parseComments(
+  lines: Element[],
+  jsx: boolean,
+  matchAlgorithm: MatchAlgorithm,
+): ParsedComments {
+  const out: ParsedComments = []
+
+  for (const line of lines) {
+    const elements = line.children
+    // index of the first token to inspect, by default only the last one
+    let start = elements.length - 1
+    if (matchAlgorithm === 'v1')
+      start = 0
+    else if (jsx)
+      // one step further for JSX as comment is inside curly brackets
+      start = elements.length - 2
+
+    // clamp to 0 for lines with fewer tokens than the look-back
+    for (let i = Math.max(start, 0); i < elements.length; i++) {
+      const token = elements[i]
+      if (token.type !== 'element')
+        continue
+      // only tokens whose first child is a text node can be matched
+      const head = token.children.at(0)
+      if (head?.type !== 'text')
+        continue
+
+      const isLast = i === elements.length - 1
+      const match = matchToken(head.value, isLast)
+      if (!match)
+        continue
+
+      // a JSX-style comment needs a neighbour on both sides: `{` before and `}` after
+      if (jsx && !isLast && i !== 0) {
+        out.push({
+          info: match,
+          line,
+          token,
+          isJsxStyle: isValue(elements[i - 1], '{') && isValue(elements[i + 1], '}'),
+        })
+      }
+      else {
+        out.push({
+          info: match,
+          line,
+          token,
+          isJsxStyle: false,
+        })
+      }
+    }
+  }
+
+  return out
+}
+
+/**
+ * Check whether the first child of an element is a text node whose trimmed value equals `value`.
+ *
+ * NOTE(review): `element.children[0]` is `undefined` for an element without children,
+ * in which case reading `text.type` throws - confirm tokens always have a child.
+ *
+ * @param element node to inspect, anything other than an element yields `false`
+ * @param value expected text, compared after trimming the node text
+ */
+function isValue(element: ElementContent, value: string): boolean {
+  if (element.type !== 'element')
+    return false
+  const text = element.children[0]
+  if (text.type !== 'text')
+    return false
+
+  return text.value.trim() === value
+}
+
+/**
+ * Try to split the text of a token into comment prefix, content and optional suffix.
+ *
+ * Leading whitespace is kept in front of the returned prefix. Trailing whitespace is
+ * appended to the suffix; matchers without a suffix group return no suffix, so the
+ * trailing whitespace is not returned for them.
+ *
+ * @param text text value of comment node
+ * @param isLast whether the token is located at the end of line
+ * @returns the parts from the first matcher accepting the text, `undefined` when none does
+ */
+function matchToken(text: string, isLast: boolean): [prefix: string, content: string, suffix?: string] | undefined {
+  // no leading and trailing spaces allowed for matchers
+  // we extract the spaces
+  let trimmed = text.trimStart()
+  const spaceFront = text.length - trimmed.length
+
+  trimmed = trimmed.trimEnd()
+  const spaceEnd = text.length - trimmed.length - spaceFront
+
+  for (const [matcher, endOfLine] of matchers) {
+    // end-of-line formats are only tried on the last token of a line
+    if (endOfLine && !isLast)
+      continue
+
+    const result = matcher.exec(trimmed)
+    if (!result)
+      continue
+
+    // the whitespace counts are re-emitted as plain spaces
+    return [
+      ' '.repeat(spaceFront) + result[1],
+      result[2],
+      result[3] ? result[3] + ' '.repeat(spaceEnd) : undefined,
+    ]
+  }
+}
+
+/**
+ * Remove empty comment prefixes at line end, e.g. `// `
+ *
+ * For matchAlgorithm v1
+ *
+ * @param text comment content to clean up
+ * @returns `text` cut right before the matched prefix when only whitespace follows that prefix, otherwise `text` unchanged
+ */
+export function v1ClearEndCommentPrefix(text: string): string {
+  // not anchored at the start: matches the first comment prefix found anywhere in the text
+  const regex = /(?:\/\/|["'#]|;{1,2}|%{1,2}|--)(.*)$/
+  const result = regex.exec(text)
+
+  if (result && result[1].trim().length === 0) {
+    return text.slice(0, result.index)
+  }
+
+  return text
+}
diff --git a/packages/transformers/src/transformers/meta-highlight.ts b/packages/transformers/src/transformers/meta-highlight.ts
@@ -12,8 +12,7 @@ export function parseMetaHighlightString(meta: string): number[] | null {
       const num = v.split('-').map(v => Number.parseInt(v, 10))
       if (num.length === 1)
         return [num[0]]
-      else
-        return Array.from({ length: num[1] - num[0] + 1 }, (_, i) => i + num[0])
+      return Array.from({ length: num[1] - num[0] + 1 }, (_, i) => i + num[0])
     })
   return lines
 }
@@ -45,8 +44,13 @@ export function transformerMetaHighlight(
       if (!this.options.meta?.__raw) {
         return
       }
-      ;(this.meta as any)[symbol] ||= parseMetaHighlightString(this.options.meta.__raw)
-      const lines: number[] = (this.meta as any)[symbol] || []
+      const meta = this.meta as {
+        [symbol]: number[] | null
+      }
+
+      meta[symbol] ??= parseMetaHighlightString(this.options.meta.__raw)
+      const lines: number[] = meta[symbol] ?? []
+
       if (lines.includes(line))
         this.addClassToHast(node, className)
       return node

diff --git a/packages/transformers/src/transformers/notation-diff.ts b/packages/transformers/src/transformers/notation-diff.ts
@@ -1,7 +1,8 @@
 import type { ShikiTransformer } from 'shiki'
+import type { MatchAlgorithmOptions } from '../shared/notation-transformer'
 import { transformerNotationMap } from './notation-map'
 
-export interface TransformerNotationDiffOptions {
+export interface TransformerNotationDiffOptions extends MatchAlgorithmOptions {
   /**
    * Class for added lines
    */
@@ -35,6 +36,7 @@ export function transformerNotationDiff(
         '--': classLineRemove,
       },
       classActivePre,
+      matchAlgorithm: options.matchAlgorithm,
     },
     '@shikijs/transformers:notation-diff',
   )

diff --git a/packages/transformers/src/transformers/notation-error-level.ts b/packages/transformers/src/transformers/notation-error-level.ts
@@ -1,7 +1,8 @@
 import type { ShikiTransformer } from 'shiki'
+import type { MatchAlgorithmOptions } from '../shared/notation-transformer'
 import { transformerNotationMap } from './notation-map'
 
-export interface TransformerNotationErrorLevelOptions {
+export interface TransformerNotationErrorLevelOptions extends MatchAlgorithmOptions {
   classMap?: Record<string, string | string[]>
   /**
    * Class added to the <pre> element when the current code has diff
@@ -27,6 +28,7 @@ export function transformerNotationErrorLevel(
     {
       classMap,
       classActivePre,
+      matchAlgorithm: options.matchAlgorithm,
     },
     '@shikijs/transformers:notation-error-level',
   )

diff --git a/packages/transformers/src/transformers/notation-focus.ts b/packages/transformers/src/transformers/notation-focus.ts
@@ -1,7 +1,8 @@
 import type { ShikiTransformer } from 'shiki'
+import type { MatchAlgorithmOptions } from '../shared/notation-transformer'
 import { transformerNotationMap } from './notation-map'
 
-export interface TransformerNotationFocusOptions {
+export interface TransformerNotationFocusOptions extends MatchAlgorithmOptions {
   /**
    * Class for focused lines
    */
@@ -29,6 +30,7 @@ export function transformerNotationFocus(
         focus: classActiveLine,
       },
       classActivePre,
+      matchAlgorithm: options.matchAlgorithm,
     },
     '@shikijs/transformers:notation-focus',
   )