From 554cb4b8906cf316fae5a2707c95c9b5bd555fcb Mon Sep 17 00:00:00 2001 From: Martin Scavnicky Date: Sat, 15 Aug 2020 14:25:44 +0200 Subject: [PATCH] Replace lxml with BeautifulSoup. --- README.md | 4 ++-- hypotonic/context.py | 50 ++++++++++++++++++++++------------------- pyproject.toml | 5 ++--- tests/test_hypotonic.py | 12 ---------- 4 files changed, 31 insertions(+), 40 deletions(-) diff --git a/README.md b/README.md index dcfef92..4135dca 100644 --- a/README.md +++ b/README.md @@ -2,13 +2,13 @@ Fast asynchronous web scraper with minimalist API inspired by awesome [node-osmosis](https://github.com/rchipka/node-osmosis). -Hypotonic provides SQLAlchemy-like command chaining DSL to define HTML scrapers. Everything is executed asynchronously via `asyncio` and is ultra-fast thanks to `lxml` parser. Supports querying by XPath or CSS selectors. +Hypotonic provides SQLAlchemy-like command chaining DSL to define HTML scrapers. Everything is executed asynchronously via `asyncio` and all dependencies are pure Python. Supports querying by CSS selectors with Scrapy's pseudo-attributes. XPath is not supported due to `libxml` requirement. Hypotonic does not natively execute JavaScript on websites and it is recommended to use [prerender](https://prerender.com). ## Installing -Hypotonic requires Python 3.6+ and `libxml2` C library. +Hypotonic requires Python 3.6+. `pip install hypotonic` diff --git a/hypotonic/context.py b/hypotonic/context.py index 121ad3b..df06158 100644 --- a/hypotonic/context.py +++ b/hypotonic/context.py @@ -1,7 +1,6 @@ import unicodedata -from lxml import html -from parsel import csstranslator -from cssselect import SelectorError +import urllib.parse +import bs4 class StringContext: @@ -28,38 +27,43 @@ def __init__(self, url, str_or_element): self.element = self.parse(self.url, self.element) def select(self, selector): + # Determine whether selector is using ::attr or ::text pseudo-attribute. 
+ selector_type = None + if selector.endswith('::text'): + selector_type = 'text' + selector = selector.rstrip('::text') + elif selector.find('::attr') > 0: + # Use attribute as a selector type. + selector_type = selector[selector.find('::attr') + 7:-1] + selector = selector[:selector.find('::attr')] + selected = [] - for element in self.element.xpath(self.to_xpath(selector)): - # XPath query can return both element, or lxml.etree._ElementUnicodeResult. - if isinstance(element, str): - selected.append(StringContext(self.url, element)) - else: + for element in self.element.select(selector): + if selector_type is None: selected.append(HtmlContext(self.url, element)) + elif selector_type == 'text': + selected.append( + StringContext(self.url, HtmlContext(self.url, element).text())) + else: + selected.append(StringContext(self.url, element[selector_type])) return selected def text(self): """Convenience method to extract text from HTML element tree. Performs stripping and canonicalization of UTF-8 characters (e.g. '/xa0' to ' ').""" - return unicodedata.normalize('NFKC', self.element.text_content().strip()) + return unicodedata.normalize('NFKC', self.element.text.strip()) def __str__(self): """Convert HTML element to raw html string including element tags.""" - return html.tostring(self.element).decode('utf-8') - - @staticmethod - def to_xpath(selector): - """Attempt to convert CSS selector to XPath.""" - try: - return csstranslator.css2xpath(selector) - except SelectorError: - return selector + return str(self.element) @staticmethod def parse(url, html_string): - doc = html.fromstring(html_string) + doc = bs4.BeautifulSoup(html_string, features='html5lib') # Making links absolute is required to allow following. - doc.make_links_absolute(url) - # Replacing
<br> tags with \n, prevents text contatenating. - for br in doc.xpath('*//br'): - br.tail = '\n' + br.tail if br.tail else '\n' + for tag in doc.findAll('a', href=True): + tag['href'] = urllib.parse.urljoin(url, tag['href']) + # Replacing <br>
tags with \n, prevents text concatenating. + for br in doc.findAll('br'): + br.replace_with("\n") return doc diff --git a/pyproject.toml b/pyproject.toml index 9dadba8..ed307a0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,10 +16,9 @@ classifiers = [ [tool.poetry.dependencies] python = "^3.6" -parsel = ">=1.2.0" -lxml = ">=4.1.1" -cssselect = ">=1.0.3" aiohttp = ">=3.0.9" +beautifulsoup4 = ">=4.7.1" +html5lib = ">=1.0.1" [tool.poetry.dev-dependencies] bumpversion = ">=0.5.0" diff --git a/tests/test_hypotonic.py b/tests/test_hypotonic.py index addf998..e41cd0a 100644 --- a/tests/test_hypotonic.py +++ b/tests/test_hypotonic.py @@ -75,18 +75,6 @@ def test_find_with_invalid_selector(self, _): self.assertFalse(data) self.assertEqual(1, len(errors)) - def test_find_with_xpath(self): - data, errors = ( - Hypotonic('http://books.toscrape.com/') - .find( - '//*[contains(concat( " ", @class, " " ), concat( " ", "nav-list", " " ))]//ul//a') - .set('category') - .data()) - - self.assertFalse(errors) - self.assertEqual(50, len(data)) - self.assertTrue({'category': 'Romance'} in data) - def test_find_with_css(self): data, errors = ( Hypotonic('http://books.toscrape.com/')