-
Notifications
You must be signed in to change notification settings - Fork 38
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Issue #87 Make sanitization happen in Post.htmlize() #88
Changes from 1 commit
1507ffc
8b4ef79
bfafd43
00415a3
6eb8e58
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
import bleach | ||
from readability.readability import Document | ||
|
||
from baleen.exceptions import ExportError | ||
|
||
|
||
def _get_raw_html(html): | ||
""" | ||
:param html: html content | ||
:return: the unmodified html | ||
""" | ||
return html | ||
|
||
|
||
def _get_safe_html(html): | ||
""" | ||
Applies Readability's sanitize() method to content. | ||
:param html: the content to sanitize | ||
:return: the body of the html content minus html tags | ||
""" | ||
if html is None: | ||
return None | ||
return Document(html).summary() | ||
|
||
|
||
def _get_text_from_html(html): | ||
""" | ||
Applies the 'safe' level of sanitization, removes newlines, | ||
and converts the html entity for ampersand into the ampersand character. | ||
:param html: the content to sanitize | ||
:return: sanitized content | ||
""" | ||
if html is None: | ||
return html | ||
|
||
text = _get_safe_html(html) | ||
text = bleach.clean(text, tags=[], strip=True) | ||
text = text.strip() | ||
text = text.replace("\n", "") | ||
text = text.replace("&amp;", "&") | ||
return text | ||
|
||
|
||
def sanitize_html(html, level): | ||
""" | ||
Return a sanitized version of html content | ||
:param html: the content to sanitized | ||
:param level: the type of sanitization - one of ['raw', 'safe', 'text', None] | ||
:return: sanitized content | ||
""" | ||
if level == SAFE: | ||
return _get_safe_html(html) | ||
elif level == RAW: | ||
return _get_raw_html(html) | ||
elif level == TEXT: | ||
return _get_text_from_html(html) | ||
elif level is None: | ||
return html | ||
|
||
raise ExportError( | ||
"{level} is not a supported sanitize_html level.".format( | ||
level=level | ||
) | ||
) | ||
|
||
|
||
RAW = 'raw' | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. These module constants should be hoisted to the top of the file. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Done. |
||
SAFE = 'safe' | ||
TEXT = 'text' | ||
SANITIZE_LEVELS = (RAW, SAFE, TEXT) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,20 +18,13 @@ | |
########################################################################## | ||
|
||
import unittest | ||
import logging | ||
from unittest.mock import MagicMock | ||
|
||
from mongomock import MongoClient as MockMongoClient | ||
from unittest import mock | ||
from unittest.mock import MagicMock | ||
|
||
from baleen.export import * | ||
from baleen.feed import * | ||
from baleen.models import connect | ||
from baleen.exceptions import ExportError | ||
|
||
########################################################################## | ||
## Fixtures | ||
########################################################################## | ||
|
||
BOOKS_FEED = Feed( | ||
title='The Rumpus.net', | ||
|
@@ -254,57 +247,3 @@ def test_export_with_category_path_failure(self): | |
exporter.export() | ||
|
||
|
||
class SanitizeHtmlTests(unittest.TestCase): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It absolutely makes sense to remove these tests; should we have a test for `Post.htmlize()`? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It's verging on overkill but I like making percentages go up. Let me look at it this weekend. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Hehe, ok -- well do you want me to merge this PR and make a new one, or would you rather keep this PR open until you write the tests? BTW - do you see a button at the bottom entitled "squash and merge"? Once everything is reviewed and CI approved, please feel free to merge yourself if you do see that button. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'll merge this and take up the test in another feature, possibly for #89. |
||
""" Tests the exporter's HTML sanitize methods """ | ||
|
||
@classmethod | ||
def setUpClass(cls): | ||
cls.sample_html = ('<html>' | ||
'<head><script>javascript here</script></head>' | ||
'<body><b>body &amp;\n mind</b></body>' | ||
'</html>') | ||
|
||
cls.conn = connect(host='mongomock://localhost') | ||
assert isinstance(cls.conn, MockMongoClient) | ||
root_path = '/tmp/corpus' | ||
cls.exporter = MongoExporter(root_path, categories=CATEGORIES_IN_DB) | ||
|
||
@classmethod | ||
def tearDownClass(self): | ||
""" | ||
Drop the mongomock connection | ||
""" | ||
assert isinstance(self.conn, MockMongoClient) | ||
self.conn = None | ||
|
||
def test_sanitize_requires_a_valid_level(self): | ||
""" Sanitize_html requires a supported level """ | ||
with self.assertRaises(ExportError): | ||
self.exporter.sanitize_html(self.sample_html, "bogus") | ||
|
||
def test_sanitize_returns_input_for_level_none(self): | ||
""" sanitize_html returns unmodified input for level None """ | ||
self.assertEqual(self.exporter.sanitize_html(self.sample_html, None), self.sample_html) | ||
|
||
def test_sanitize_raw(self): | ||
""" Sanitize level raw returns the content as submitted """ | ||
self.assertEqual(self.exporter.sanitize_html(self.sample_html, RAW), self.sample_html) | ||
|
||
def test_sanitize_safe(self): | ||
""" Sanitize level safe applies Readability and returns the body """ | ||
|
||
# Give Readability a simpler HTML sample to keep its parse strategy simple | ||
sample_html = ('<html>' | ||
'<head><script>javascript here</script></head>' | ||
'<body>body</body>' | ||
'</html>') | ||
expected = '<body id="readabilityBody">body</body>' | ||
self.assertEqual(self.exporter.sanitize_html(sample_html, SAFE), expected) | ||
|
||
def test_sanitize_text(self): | ||
""" | ||
Sanitize level text strips HTML tags, removes newlines, | ||
and converts the html entity ampersand into an ampersand character | ||
""" | ||
expected = 'body & mind' | ||
self.assertEqual(self.exporter.sanitize_html(self.sample_html, TEXT), expected) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
# test.utils_tests.test_timez | ||
# Testing for the timez time helpers library. | ||
# | ||
# Author: Benjamin Bengfort <[email protected]> | ||
# Created: Sun Feb 21 15:33:18 2016 -0500 | ||
# | ||
# Copyright (C) 2016 Bengfort.com | ||
# For license information, see LICENSE.txt | ||
# | ||
# ID: test_timez.py [df0c71b] [email protected] $ | ||
|
||
""" | ||
Testing for the timez time helpers library. | ||
""" | ||
|
||
########################################################################## | ||
## Imports | ||
########################################################################## | ||
|
||
import unittest | ||
|
||
from mongomock import MongoClient as MockMongoClient | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do you need MockMongoClient here? |
||
|
||
from baleen.exceptions import ExportError | ||
from baleen.export import MongoExporter | ||
from baleen.models import connect | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. These imports are not needed. |
||
from baleen.utils.text import sanitize_html, RAW, SAFE, TEXT | ||
from tests.test_export import CATEGORIES_IN_DB | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This import is not needed. |
||
|
||
|
||
class SanitizeHtmlTests(unittest.TestCase): | ||
""" Tests the exporter's HTML sanitize methods """ | ||
|
||
@classmethod | ||
def setUpClass(cls): | ||
cls.sample_html = ('<html>' | ||
'<head><script>javascript here</script></head>' | ||
'<body><b>body &amp;\n mind</b></body>' | ||
'</html>') | ||
|
||
@classmethod | ||
def tearDownClass(self): | ||
""" | ||
Drop the mongomock connection | ||
""" | ||
pass | ||
|
||
def test_sanitize_requires_a_valid_level(self): | ||
""" Sanitize_html requires a supported level """ | ||
with self.assertRaises(ExportError): | ||
sanitize_html(self.sample_html, "bogus") | ||
|
||
def test_sanitize_returns_input_for_level_none(self): | ||
""" sanitize_html returns unmodified input for level None """ | ||
self.assertEqual(sanitize_html(self.sample_html, None), self.sample_html) | ||
|
||
def test_sanitize_raw(self): | ||
""" Sanitize level raw returns the content as submitted """ | ||
self.assertEqual(sanitize_html(self.sample_html, RAW), self.sample_html) | ||
|
||
def test_sanitize_raw_handles_none(self): | ||
""" | ||
Sanitize level raw accepts None gracefully | ||
""" | ||
self.assertEqual(sanitize_html(None, RAW), None) | ||
|
||
def test_sanitize_safe(self): | ||
""" Sanitize level safe applies Readability and returns the body """ | ||
|
||
# Give Readability a simpler HTML sample to keep its parse strategy simple | ||
sample_html = ('<html>' | ||
'<head><script>javascript here</script></head>' | ||
'<body>body</body>' | ||
'</html>') | ||
expected = '<body id="readabilityBody">body</body>' | ||
self.assertEqual(sanitize_html(sample_html, SAFE), expected) | ||
|
||
def test_sanitize_safe_handles_none(self): | ||
""" | ||
Sanitize level safe accepts None gracefully | ||
""" | ||
self.assertEqual(sanitize_html(None, SAFE), None) | ||
|
||
def test_sanitize_text(self): | ||
""" | ||
Sanitize level text strips HTML tags, removes newlines, | ||
and converts the html entity ampersand into an ampersand character | ||
""" | ||
expected = 'body & mind' | ||
self.assertEqual(sanitize_html(self.sample_html, TEXT), expected) | ||
|
||
def test_sanitize_text_handles_none(self): | ||
""" | ||
Sanitize level text accepts None gracefully | ||
""" | ||
self.assertEqual(sanitize_html(None, TEXT), None) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I really like what you've done here; it is extremely sensible. However, I'd ask for one minor change so that this is not a complete passthrough to the sanitize function. What would you think about changing the signature to accept `None` as the argument to level and changing the level argument to sanitize? I think this makes it a bit more understandable to users; what do you think?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You betcha.