Mr0grog · iloveitaly · Aug 12, 2023 · Aug 12, 2023 · Aug 12, 2023 · Aug 12, 2023
diff --git a/README.md b/README.md
@@ -27,18 +27,44 @@ First make sure you have Node.js installed. Then:
     ```sh
     > npm run build
     ```
-    
+
     …and the built output will be in the `dist` folder.
-    
+
     To start a server with live rebuilding, run:
-    
+
     ```sh
     > npm start
     ```
-    
+
     Then point your browser to `http://localhost:9000` to see the site. It will automatically rebuild whenever you change any files.
 
 
+## CLI Usage
+
+```shell
+echo "<b>some html</b>" | npx google-docs-to-markdown
+```
+
+However, what you really want is to run this after copying text from Google Docs. To do this, you'll need to extract the HTML
+from the clipboard. Here's a script for macOS to do this:
+
+```shell
+swift - <<EOF | npx google-docs-to-markdown | pbcopy
+import Cocoa
+
+let type = NSPasteboard.PasteboardType.html
+
+guard let string = NSPasteboard.general.string(forType:type) else {
+  fputs("Could not find string data of type '\(type)' on the system pasteboard\n", stderr)
+  exit(1)
+}
+
+print(string)
+EOF
+```
+
+You can then tie this script to a keyboard shortcut if using something like Raycast or another launchbar.
+
 ## Contributors
 
 This project is open source, and gets better with the hard work and collaboration of multiple people. Thanks to the following for their contributions:

diff --git a/cmd.js b/cmd.js
@@ -0,0 +1,23 @@
+#!/usr/bin/env node
+
+import {convertDocsHtmlToMarkdown} from './lib/convert.js';
+import { buffer as readBuffer } from 'node:stream/consumers';
+
+// Command-line entry point: reads HTML from stdin, converts it to Markdown,
+// and writes the result to stdout.
+//
+// To debug this tool from the command line, you can swap the stdin read for a
+// local file containing the problematic input,
+// ex: fs.readFileSync('./local-file.html', 'utf8');
+
+const rawInput = await readBuffer(process.stdin);
+const inputHTML = rawInput.toString('utf-8');
+
+if (!inputHTML) {
+  console.error('no HTML provided over stdin');
+  process.exit(1);
+}
+
+try {
+  // Await the conversion so a rejected promise lands in the `catch` below;
+  // a bare `.then()` inside `try` would let rejections escape unhandled.
+  const markdown = await convertDocsHtmlToMarkdown(inputHTML);
+  process.stdout.write(markdown);
+} catch (error) {
+  console.error(`Error converting HTML to markdown: ${error.message}`);
+  // Signal failure to the shell without cutting off any pending output.
+  process.exitCode = 1;
+}
diff --git a/lib/convert.js b/lib/convert.js
@@ -1,12 +1,14 @@
 import fixGoogleHtml from './fix-google-html.js';
 // rehype-dom-parse is a lightweight version of rehype-parse that leverages
 // browser APIs -- reduces bundle size by ~200 kB!
-import parse from 'rehype-dom-parse';
+import {default as rehypeDom} from 'rehype-dom-parse';
+import {default as rehypeNode} from 'rehype-parse';
 import { all } from 'rehype-remark';
 import rehype2remarkWithSpaces from './rehype-to-remark-with-spaces.js';
 import remarkGfm from 'remark-gfm';
 import stringify from 'remark-stringify';
 import { unified } from 'unified';
+import { default as logTree } from './log-tree.js';
 
 /** @typedef {import("mdast-util-to-markdown").State} as MdastState */
 /** @typedef {import("unist").Node} UnistNode */
@@ -35,10 +37,14 @@ function doubleBlankLinesBeforeHeadings (previous, next, _parent, _state) {
   return undefined;
 }
 
+// Pick the HTML parser for the current runtime: `rehype-parse` when a Node.js
+// version is reported by `process.versions`, otherwise the DOM-based
+// `rehype-dom-parse`.
+const isNode = Boolean(typeof process !== 'undefined' && process.versions?.node);
+const rehypeParse = isNode ? rehypeNode : rehypeDom;
+// Options are only passed to the Node.js parser; the DOM parser gets none.
+const rehypeParseOptions = isNode ? {fragment: true, verbose: true, emitParseErrors: true} : {};
+
 const processor = unified()
-  .use(parse)
+  .use(rehypeParse, rehypeParseOptions)
   .use(fixGoogleHtml)
-  // .use(require('./lib/log-tree').default)
+  // .use(logTree)
   .use(rehype2remarkWithSpaces, {
     handlers: {
       // Preserve sup/sub markup; most Markdowns have no markup for it.

diff --git a/lib/fix-google-html.js b/lib/fix-google-html.js
@@ -123,6 +123,24 @@ export function fixNestedLists (node) {
   });
 }
 
+/**
+ * Unwrap `<b>` elements that sit directly under the root of the tree,
+ * replacing each one with its own children.
+ *
+ * Google Docs wraps the entire document in a `<b>` tag, which is not removed
+ * by the parsing library. This seems to only occur when running via Node.js
+ * (not the browser). This function is only ever called once, with the root
+ * of the tree as the input node.
+ *
+ * @param {RehypeNode} node Root node whose `children` list is rewritten.
+ */
+function removeRootBoldWrapper(node) {
+  // There are some cases, like translating `<b>something something</b>`, where
+  // the bold element is the actual content and must not be removed.
+  if (node.children.length === 1 && node.children[0].tagName === 'b') {
+    return;
+  }
+
+  // Build the new list in a single pass rather than splicing while indexing:
+  // splicing shifts later siblings, so a `<b>` following an empty `<b>` would
+  // be skipped and the contents of an unwrapped `<b>` would be partially
+  // re-examined. Only root-level `<b>` elements are unwrapped, one level deep.
+  node.children = node.children.flatMap(
+    (child) => (child.tagName === 'b' ? child.children : [child])
+  );
+}
+
 /**
  * Google Docs does italics/bolds/etc on `<span>`s with style attributes, but
  * rehype-remark does not pick up on those. Instead, transform them into
@@ -259,7 +277,7 @@ export function unwrapLineBreaks (node) {
 /**
  * Moves linebreaks outside of anchor elements,
  * if the linebreak is the first and/or last child of the anchor.
- * @param {RehypeNode} node 
+ * @param {RehypeNode} node
  */
 export function moveLinebreaksOutsideOfAnchors (node) {
   visit(node, isAnchor, (node, index, parent) => {
@@ -579,6 +597,7 @@ function fixChecklists (node) {
  */
 export default function fixGoogleHtml () {
   return (tree, _file) => {
+    removeRootBoldWrapper(tree);
     unInlineStyles(tree);
     createCodeBlocks(tree);
     moveSpaceOutsideSensitiveChildren(tree);