Replace lxml with BeautifulSoup.

mscavnicky · Aug 15, 2020 · 554cb4b · 554cb4b
1 parent fee525a
commit 554cb4b
Show file tree

Hide file tree

Showing 4 changed files with 31 additions and 40 deletions.
diff --git a/README.md b/README.md
@@ -2,13 +2,13 @@
 
 Fast asynchronous web scraper with minimalist API inspired by awesome [node-osmosis](https://github.com/rchipka/node-osmosis).
 
-Hypotonic provides SQLAlchemy-like command chaining DSL to define HTML scrapers. Everything is executed asynchronously via `asyncio` and is ultra-fast thanks to `lxml` parser. Supports querying by XPath or CSS selectors.
+Hypotonic provides a SQLAlchemy-like command-chaining DSL to define HTML scrapers. Everything is executed asynchronously via `asyncio`, and all dependencies are pure Python. Supports querying by CSS selectors with Scrapy-style `::text` and `::attr(name)` pseudo-elements. XPath is not supported because it would require the `libxml2` C library.
 
 Hypotonic does not natively execute JavaScript on websites and it is recommended to use [prerender](https://prerender.com).
 
 ## Installing
 
-Hypotonic requires Python 3.6+ and `libxml2` C library.
+Hypotonic requires Python 3.6+.
 
 `pip install hypotonic`
 

diff --git a/hypotonic/context.py b/hypotonic/context.py
@@ -1,7 +1,6 @@
 import unicodedata
-from lxml import html
-from parsel import csstranslator
-from cssselect import SelectorError
+import urllib.parse
+import bs4
 
 
 class StringContext:
@@ -28,38 +27,43 @@ def __init__(self, url, str_or_element):
       self.element = self.parse(self.url, self.element)
 
   def select(self, selector):
+    # Determine whether selector is using ::attr or ::text pseudo-attribute.
+    selector_type = None
+    if selector.endswith('::text'):
+      selector_type = 'text'
+      selector = selector[:-len('::text')]
+    elif selector.find('::attr') > 0:
+      # Use attribute as a selector type.
+      selector_type = selector[selector.find('::attr') + 7:-1]
+      selector = selector[:selector.find('::attr')]
+
     selected = []
-    for element in self.element.xpath(self.to_xpath(selector)):
-      # XPath query can return both element, or lxml.etree._ElementUnicodeResult.
-      if isinstance(element, str):
-        selected.append(StringContext(self.url, element))
-      else:
+    for element in self.element.select(selector):
+      if selector_type is None:
         selected.append(HtmlContext(self.url, element))
+      elif selector_type == 'text':
+        selected.append(
+          StringContext(self.url, HtmlContext(self.url, element).text()))
+      else:
+        selected.append(StringContext(self.url, element[selector_type]))
     return selected
 
   def text(self):
     """Convenience method to extract text from HTML element tree. Performs
       stripping and canonicalization of UTF-8 characters (e.g. '/xa0' to ' ')."""
-    return unicodedata.normalize('NFKC', self.element.text_content().strip())
+    return unicodedata.normalize('NFKC', self.element.text.strip())
 
   def __str__(self):
     """Convert HTML element to raw html string including element tags."""
-    return html.tostring(self.element).decode('utf-8')
-
-  @staticmethod
-  def to_xpath(selector):
-    """Attempt to convert CSS selector to XPath."""
-    try:
-      return csstranslator.css2xpath(selector)
-    except SelectorError:
-      return selector
+    return str(self.element)
 
   @staticmethod
   def parse(url, html_string):
-    doc = html.fromstring(html_string)
+    doc = bs4.BeautifulSoup(html_string, features='html5lib')
     # Making links absolute is required to allow following.
-    doc.make_links_absolute(url)
-    # Replacing <br> tags with \n, prevents text contatenating.
-    for br in doc.xpath('*//br'):
-      br.tail = '\n' + br.tail if br.tail else '\n'
+    for tag in doc.findAll('a', href=True):
+      tag['href'] = urllib.parse.urljoin(url, tag['href'])
+    # Replacing <br> tags with \n, prevents text concatenating.
+    for br in doc.findAll('br'):
+      br.replace_with("\n")
     return doc
diff --git a/pyproject.toml b/pyproject.toml
@@ -16,10 +16,9 @@ classifiers = [
 
 [tool.poetry.dependencies]
 python = "^3.6"
-parsel = ">=1.2.0"
-lxml = ">=4.1.1"
-cssselect = ">=1.0.3"
 aiohttp = ">=3.0.9"
+beautifulsoup4 = ">=4.7.1"
+html5lib = ">=1.0.1"
 
 [tool.poetry.dev-dependencies]
 bumpversion = ">=0.5.0"

diff --git a/tests/test_hypotonic.py b/tests/test_hypotonic.py
@@ -75,18 +75,6 @@ def test_find_with_invalid_selector(self, _):
     self.assertFalse(data)
     self.assertEqual(1, len(errors))
 
-  def test_find_with_xpath(self):
-    data, errors = (
-      Hypotonic('http://books.toscrape.com/')
-        .find(
-        '//*[contains(concat( " ", @class, " " ), concat( " ", "nav-list", " " ))]//ul//a')
-        .set('category')
-        .data())
-
-    self.assertFalse(errors)
-    self.assertEqual(50, len(data))
-    self.assertTrue({'category': 'Romance'} in data)
-
   def test_find_with_css(self):
     data, errors = (
       Hypotonic('http://books.toscrape.com/')