Skip to content

Commit

Permalink
perf(readability): use happy-dom
Browse files Browse the repository at this point in the history
  • Loading branch information
Kikobeats committed Jan 11, 2025
1 parent 2c11034 commit 8cbaec0
Show file tree
Hide file tree
Showing 6 changed files with 1,427 additions and 10 deletions.
13 changes: 13 additions & 0 deletions packages/metascraper-readability/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,19 @@
$ npm install metascraper-readability --save
```

## API

### metascraper-readability([options])

#### options

##### getDocument

Type: `function`<br>
Default: [source code](https://github.com/microlinkhq/metascraper/blob/master/packages/metascraper-readability/src/index.js#L14-L20)

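The function called to parse the HTML markup into a DOM document. It receives an object with `url` and `html` properties and must return the document to be passed to Readability.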

## License

**metascraper-readability** © [Microlink](https://microlink.io), released under the [MIT](https://github.com/microlinkhq/metascraper/blob/master/LICENSE.md) License.<br>
Expand Down
1,345 changes: 1,345 additions & 0 deletions packages/metascraper-readability/benchmark/fixture.html

Large diffs are not rendered by default.

42 changes: 42 additions & 0 deletions packages/metascraper-readability/benchmark/index.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
'use strict'

const { readFileSync } = require('fs')

// URL handed to both DOM implementations when they build the document.
const url = 'https://arxiv.org/pdf/2412.06592'

// Resolve the fixture relative to this file rather than the current working
// directory, so the benchmark can be launched from any folder.
const html = readFileSync(
  require('path').join(__dirname, 'fixture.html'),
  'utf8'
)

/**
 * Build a document from the fixture markup using jsdom.
 *
 * jsdom is required lazily, inside the function, so loading it happens when
 * the function is called rather than when this file is loaded.
 *
 * @returns The `document` of the jsdom window created from `html` and `url`.
 */
const jsdom = () => {
  const { JSDOM, VirtualConsole } = require('jsdom')
  // A fresh virtual console is supplied instead of jsdom's default one;
  // presumably to keep page console output out of the benchmark output.
  const virtualConsole = new VirtualConsole()
  const { window } = new JSDOM(html, { url, virtualConsole })
  return window.document
}

/**
 * Build a document from the fixture markup using happy-dom.
 *
 * happy-dom is required lazily, inside the function, so loading it happens
 * when the function is called rather than when this file is loaded.
 *
 * @returns The `document` of a happy-dom window created with `url`, after the
 *   fixture markup has been assigned to its root element.
 */
const happydom = () => {
  const { Window } = require('happy-dom')
  const { document } = new Window({ url })
  // The markup is injected through the root element's `innerHTML`.
  document.documentElement.innerHTML = html
  return document
}

const { Readability } = require('@mozilla/readability')

/**
 * Build a document with `fn`, run Readability over it and time the whole step.
 *
 * The timer starts before `fn()` is called, so the reported duration covers
 * both creating the document and parsing it.
 *
 * @param {Function} fn - Zero-argument factory returning the document to parse.
 * @returns {{ parsed: *, duration: number }} The value returned by
 *   `Readability#parse()` and the elapsed time in whole milliseconds.
 */
const measure = fn => {
  // `performance.now()` is monotonic, unlike `Date.now()`, so a wall-clock
  // adjustment during the run cannot skew (or negate) the measurement.
  const start = performance.now()
  const parsed = new Readability(fn()).parse()
  // Rounded so the duration stays an integer number of milliseconds.
  const duration = Math.round(performance.now() - start)
  return { parsed, duration }
}

// Run both document factories through `measure`, jsdom first and then
// happy-dom, keeping each `{ parsed, duration }` result for the comparison
// and report.
const [jsdomResult, happydomResult] = [jsdom, happydom].map(buildDocument =>
  measure(buildDocument)
)

/**
 * Compare two values by their JSON serialization.
 *
 * Because the comparison is made on the serialized strings, it is sensitive
 * to property order and ignores anything `JSON.stringify` drops.
 *
 * @param {*} value1 - First value to compare.
 * @param {*} value2 - Second value to compare.
 * @returns {boolean} `true` when both values serialize to the same JSON.
 */
const isEqual = (value1, value2) => {
  const [left, right] = [value1, value2].map(value => JSON.stringify(value))
  return left === right
}

// The timings are only reported when both DOM implementations produce the
// same Readability output; otherwise the script exits with status 1.
const sameOutput = isEqual(jsdomResult.parsed, happydomResult.parsed)

if (!sameOutput) {
  console.error('Results are different')
  process.exit(1)
}

// Report the elapsed time of each implementation.
const { duration: jsdomDuration } = jsdomResult
const { duration: happydomDuration } = happydomResult

console.log(` jsdom: ${jsdomDuration}ms`)
console.log(`happydom: ${happydomDuration}ms`)
9 changes: 9 additions & 0 deletions packages/metascraper-readability/benchmark/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{
"name": "@metascraper-readability/benchmark",
"private": true,
"version": "1.0.0",
"devDependencies": {
"dom-parser": "latest",
"happy-dom": "latest"
}
}
2 changes: 1 addition & 1 deletion packages/metascraper-readability/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
"dependencies": {
"@metascraper/helpers": "workspace:*",
"@mozilla/readability": "~0.5.0",
"jsdom": "~25.0.1"
"happy-dom": "~16.5.3"
},
"devDependencies": {
"ava": "5",
Expand Down
26 changes: 17 additions & 9 deletions packages/metascraper-readability/src/index.js
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
'use strict'

const { memoizeOne, composeRule } = require('@metascraper/helpers')

const { Readability } = require('@mozilla/readability')
const { JSDOM, VirtualConsole } = require('jsdom')

const parseReader = reader => {
try {
Expand All @@ -13,15 +11,25 @@ const parseReader = reader => {
}
}

// Parse `html` with jsdom and run Readability over the resulting document.
// The function is wrapped with `memoizeOne` using `EqualityFirstArgument`;
// presumably this means the cached result is reused while the first argument
// (`url`) stays the same — confirm against @metascraper/helpers.
const readability = memoizeOne((url, html) => {
  const virtualConsole = new VirtualConsole()
  const { window } = new JSDOM(html, { url, virtualConsole })
  return parseReader(new Readability(window.document))
}, memoizeOne.EqualityFirstArgument)
/**
 * Default `getDocument` implementation, backed by happy-dom.
 *
 * happy-dom is required lazily, inside the function, so it is only loaded
 * when this default is actually called.
 *
 * @param {object} input
 * @param input.url - URL passed to the happy-dom `Window` constructor.
 * @param input.html - Markup assigned to the root element's `innerHTML`.
 * @returns The `document` of the happy-dom window, populated with `html`.
 */
const defaultGetDocument = ({ url, html }) => {
  const { Window } = require('happy-dom')
  const { document } = new Window({ url })
  document.documentElement.innerHTML = html
  return document
}

module.exports = ({ getDocument = defaultGetDocument } = {}) => {
const readability = memoizeOne((url, html, getDocument) => {
const document = getDocument({ url, html })
const reader = new Readability(document)
return parseReader(reader)
}, memoizeOne.EqualityFirstArgument)

const getReadbility = composeRule(($, url) => readability(url, $.html()))
const getReadbility = composeRule(($, url) =>
readability(url, $.html(), getDocument)
)

module.exports = () => {
return {
author: getReadbility({ from: 'byline', to: 'author' }),
description: getReadbility({ from: 'excerpt', to: 'description' }),
Expand Down

0 comments on commit 8cbaec0

Please sign in to comment.