Skip to content

Commit

Permalink
Replace lxml with BeautifulSoup.
Browse files Browse the repository at this point in the history
  • Loading branch information
mscavnicky committed Aug 15, 2020
1 parent fee525a commit 554cb4b
Show file tree
Hide file tree
Showing 4 changed files with 31 additions and 40 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,13 @@

Fast asynchronous web scraper with a minimalist API, inspired by the awesome [node-osmosis](https://github.com/rchipka/node-osmosis).

Hypotonic provides SQLAlchemy-like command chaining DSL to define HTML scrapers. Everything is executed asynchronously via `asyncio` and is ultra-fast thanks to `lxml` parser. Supports querying by XPath or CSS selectors.
Hypotonic provides a SQLAlchemy-like command-chaining DSL for defining HTML scrapers. Everything is executed asynchronously via `asyncio`, and all dependencies are pure Python. Supports querying by CSS selectors with Scrapy's `::text` and `::attr(...)` pseudo-attributes. XPath is not supported because it would require the `libxml2` C library.

Hypotonic does not natively execute JavaScript on websites and it is recommended to use [prerender](https://prerender.com).

## Installing

Hypotonic requires Python 3.6+ and `libxml2` C library.
Hypotonic requires Python 3.6+.

`pip install hypotonic`

Expand Down
50 changes: 27 additions & 23 deletions hypotonic/context.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import unicodedata
from lxml import html
from parsel import csstranslator
from cssselect import SelectorError
import urllib.parse
import bs4


class StringContext:
Expand All @@ -28,38 +27,43 @@ def __init__(self, url, str_or_element):
self.element = self.parse(self.url, self.element)

def select(self, selector):
# Determine whether selector is using ::attr or ::text pseudo-attribute.
selector_type = None
if selector.endswith('::text'):
selector_type = 'text'
selector = selector.rstrip('::text')
elif selector.find('::attr') > 0:
# Use attribute as a selector type.
selector_type = selector[selector.find('::attr') + 7:-1]
selector = selector[:selector.find('::attr')]

selected = []
for element in self.element.xpath(self.to_xpath(selector)):
# XPath query can return both element, or lxml.etree._ElementUnicodeResult.
if isinstance(element, str):
selected.append(StringContext(self.url, element))
else:
for element in self.element.select(selector):
if selector_type is None:
selected.append(HtmlContext(self.url, element))
elif selector_type == 'text':
selected.append(
StringContext(self.url, HtmlContext(self.url, element).text()))
else:
selected.append(StringContext(self.url, element[selector_type]))
return selected

def text(self):
"""Convenience method to extract text from HTML element tree. Performs
stripping and canonicalization of UTF-8 characters (e.g. '/xa0' to ' ')."""
return unicodedata.normalize('NFKC', self.element.text_content().strip())
return unicodedata.normalize('NFKC', self.element.text.strip())

def __str__(self):
"""Convert HTML element to raw html string including element tags."""
return html.tostring(self.element).decode('utf-8')

@staticmethod
def to_xpath(selector):
"""Attempt to convert CSS selector to XPath."""
try:
return csstranslator.css2xpath(selector)
except SelectorError:
return selector
return str(self.element)

@staticmethod
def parse(url, html_string):
doc = html.fromstring(html_string)
doc = bs4.BeautifulSoup(html_string, features='html5lib')
# Making links absolute is required to allow following.
doc.make_links_absolute(url)
# Replacing <br> tags with \n, prevents text contatenating.
for br in doc.xpath('*//br'):
br.tail = '\n' + br.tail if br.tail else '\n'
for tag in doc.findAll('a', href=True):
tag['href'] = urllib.parse.urljoin(url, tag['href'])
# Replacing <br> tags with \n, prevents text concatenating.
for br in doc.findAll('br'):
br.replace_with("\n")
return doc
5 changes: 2 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,9 @@ classifiers = [

[tool.poetry.dependencies]
python = "^3.6"
parsel = ">=1.2.0"
lxml = ">=4.1.1"
cssselect = ">=1.0.3"
aiohttp = ">=3.0.9"
beautifulsoup4 = ">=4.7.1"
html5lib = ">=1.0.1"

[tool.poetry.dev-dependencies]
bumpversion = ">=0.5.0"
Expand Down
12 changes: 0 additions & 12 deletions tests/test_hypotonic.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,18 +75,6 @@ def test_find_with_invalid_selector(self, _):
self.assertFalse(data)
self.assertEqual(1, len(errors))

def test_find_with_xpath(self):
data, errors = (
Hypotonic('http://books.toscrape.com/')
.find(
'//*[contains(concat( " ", @class, " " ), concat( " ", "nav-list", " " ))]//ul//a')
.set('category')
.data())

self.assertFalse(errors)
self.assertEqual(50, len(data))
self.assertTrue({'category': 'Romance'} in data)

def test_find_with_css(self):
data, errors = (
Hypotonic('http://books.toscrape.com/')
Expand Down

0 comments on commit 554cb4b

Please sign in to comment.