Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Issue #87 Make sanitization happen in Post.htmlize() #88

Merged
merged 5 commits into from
Jun 12, 2017
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 10 additions & 69 deletions baleen/export.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,30 +17,22 @@
## Imports
##########################################################################

import os
import codecs
import bleach

from enum import Enum
import os
from collections import Counter
from datetime import datetime
from baleen.models import Feed, Post
from enum import Enum
from operator import itemgetter

from tqdm import tqdm
from collections import Counter
from operator import itemgetter
from baleen.exceptions import ExportError
from readability.readability import Document

##########################################################################
## Module Constants
##########################################################################
from baleen.exceptions import ExportError
from baleen.models import Feed, Post
from baleen.utils.text import sanitize_html, SAFE, SANITIZE_LEVELS

DTFMT = "%b %d, %Y at %H:%M"
RAW = 'raw'
SAFE = 'safe'
TEXT = 'text'
SANITIZE_LEVELS = (RAW, SAFE, TEXT)
SCHEMES = ('json', 'html') + SANITIZE_LEVELS
EXPORT_FORMATS = ('json', 'html')
SCHEMES = EXPORT_FORMATS + SANITIZE_LEVELS
State = Enum('State', 'Init, Started, Finished')


Expand Down Expand Up @@ -222,7 +214,7 @@ def export(self, root=None, categories=None, level=SAFE):
with codecs.open(path, 'w', encoding='utf-8') as f:
action = {
'json': lambda: post.to_json(indent=2),
'html': lambda: self.sanitize_html(post.htmlize(), level),
'html': lambda: post.htmlize(level=level)
}[self.scheme]

f.write(action())
Expand All @@ -232,57 +224,6 @@ def export(self, root=None, categories=None, level=SAFE):
self.readme(os.path.join(self.root, "README"))
self.feedinfo(os.path.join(self.root, "feeds.json"))

def sanitize_html(self, html, level):
"""
Return a sanitized version of html content
:param html: the content to sanitized
:param level: the type of sanitization - one of ['raw', 'safe', 'text', None]
:return: sanitized content
"""
if level == SAFE:
return self._get_safe_html(html)
elif level == RAW:
return self._get_raw_html(html)
elif level == TEXT:
return self._get_text_from_html(html)
elif level is None:
return html

raise ExportError(
"{level} is not a supported sanitize_html level.".format(
level=level
)
)

def _get_raw_html(self, html):
"""
:param html: html content
:return: the unmodified html
"""
return html

def _get_safe_html(self, html):
"""
Applies Readability's sanitize() method to content.
:param html: the content to sanitize
:return: the body of the html content minus html tags
"""
return Document(html).summary()

def _get_text_from_html(self, html):
"""
Applies the 'safe' level of sanitization, removes newlines,
and converts the html entity for ampersand into the ampersand character.
:param html: the content to sanitize
:return: sanitized content
"""
text = self._get_safe_html(html)
text = bleach.clean(text, tags=[], strip=True)
text = text.strip()
text = text.replace("\n", "")
text = text.replace("&", "&")
return text


if __name__ == '__main__':
import baleen.models as db
Expand Down
14 changes: 8 additions & 6 deletions baleen/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@
from baleen.config import settings
from baleen.utils.cryptography import hash_string
from baleen.utils.timez import humanizedelta
from baleen.utils.text import sanitize_html, SAFE, RAW, SANITIZE_LEVELS


##########################################################################
## Module Constants
Expand Down Expand Up @@ -134,14 +136,14 @@ def hash(self):
"""
return hash_string(self.content)

def htmlize(self):
def htmlize(self, level=RAW):
"""
Returns an HTML string of the content of the Post.
In the future we may use bleach to do sanitization or other simple
sanity checks to ensure that things are going ok, which is why this
method stub exists.
Returns the content of the Post with html sanitized
:param level: the level of sanitizing, default to RAW
:return:
"""
return self.content

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I really like what you've done here, it is extremely sensible. However, I'd ask for one minor change so that this is not a complete passthrough to the sanitize function. What would you think about changing the signature to accept None as the argument to level and changing the level argument to sanitize, e.g.:

def htmlize(self, sanitize=None):
    """
    ... 
    
    :param sanitize: str or None, default None 
        Specify a level to sanitize the HTML. If ``None``, then no HTML sanitization will 
        be conducted. Otherwise, specify ``'raw'``, ``'safe'``, or ``'text'`` to sanitize 
        the HTML at different levels. See ref::``sanitize_html`` for more. 
    """
    if sanitize:
        return sanitize_html(html=self.content, level=sanitize)
    return self.content 

I think this makes it a bit more understandable to users, what do you think?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You betcha.

return sanitize_html(html=self.content, level=level)

def __unicode__(self):
return self.title if self.title else self.url
Expand Down
70 changes: 70 additions & 0 deletions baleen/utils/text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import bleach
from readability.readability import Document

from baleen.exceptions import ExportError


def _get_raw_html(html):
"""
:param html: html content
:return: the unmodified html
"""
return html


def _get_safe_html(html):
"""
Applies Readability's sanitize() method to content.
:param html: the content to sanitize
:return: the body of the html content minus html tags
"""
if html is None:
return None
return Document(html).summary()


def _get_text_from_html(html):
"""
Applies the 'safe' level of sanitization, removes newlines,
and converts the html entity for ampersand into the ampersand character.
:param html: the content to sanitize
:return: sanitized content
"""
if html is None:
return html

text = _get_safe_html(html)
text = bleach.clean(text, tags=[], strip=True)
text = text.strip()
text = text.replace("\n", "")
text = text.replace("&", "&")
return text


def sanitize_html(html, level):
"""
Return a sanitized version of html content
:param html: the content to sanitized
:param level: the type of sanitization - one of ['raw', 'safe', 'text', None]
:return: sanitized content
"""
if level == SAFE:
return _get_safe_html(html)
elif level == RAW:
return _get_raw_html(html)
elif level == TEXT:
return _get_text_from_html(html)
elif level is None:
return html

raise ExportError(
"{level} is not a supported sanitize_html level.".format(
level=level
)
)


RAW = 'raw'
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These module constants should be hoisted to the top of the file.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

SAFE = 'safe'
TEXT = 'text'
SANITIZE_LEVELS = (RAW, SAFE, TEXT)
63 changes: 1 addition & 62 deletions tests/test_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,20 +18,13 @@
##########################################################################

import unittest
import logging
from unittest.mock import MagicMock

from mongomock import MongoClient as MockMongoClient
from unittest import mock
from unittest.mock import MagicMock

from baleen.export import *
from baleen.feed import *
from baleen.models import connect
from baleen.exceptions import ExportError

##########################################################################
## Fixtures
##########################################################################

BOOKS_FEED = Feed(
title='The Rumpus.net',
Expand Down Expand Up @@ -254,57 +247,3 @@ def test_export_with_category_path_failure(self):
exporter.export()


class SanitizeHtmlTests(unittest.TestCase):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It absolutely makes sense to remove these tests, should we have a test for Post.htmlize()? It will just be mocking sanitize_html and making sure it is called with the right arguments. If you think that's overkill though I understand.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's verging on overkill but I like making percentages go up. Let me look at it this weekend.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hehe, ok -- well do you want me to merge this PR and make a new one, or would you rather keep this PR open until you write the tests?

BTW - do you see a button at the bottom entitled "squash and merge", once everything is reviewed and CI approved, please feel free to merge yourself if you do see that button.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll merge this and take up the test in another feature, possibly for #89 .

""" Tests the exporter's HTML sanitize methods """

@classmethod
def setUpClass(cls):
cls.sample_html = ('<html>'
'<head><script>javascript here</script></head>'
'<body><b>body &amp;\n mind</b></body>'
'</html>')

cls.conn = connect(host='mongomock://localhost')
assert isinstance(cls.conn, MockMongoClient)
root_path = '/tmp/corpus'
cls.exporter = MongoExporter(root_path, categories=CATEGORIES_IN_DB)

@classmethod
def tearDownClass(self):
"""
Drop the mongomock connection
"""
assert isinstance(self.conn, MockMongoClient)
self.conn = None

def test_sanitize_requires_a_valid_level(self):
""" Sanitize_html requires a supported level """
with self.assertRaises(ExportError):
self.exporter.sanitize_html(self.sample_html, "bogus")

def test_sanitize_returns_input_for_level_none(self):
""" sanitize_html returns unmodified input for level None """
self.assertEqual(self.exporter.sanitize_html(self.sample_html, None), self.sample_html)

def test_sanitize_raw(self):
""" Sanitize level raw returns the content as submitted """
self.assertEqual(self.exporter.sanitize_html(self.sample_html, RAW), self.sample_html)

def test_sanitize_safe(self):
""" Sanitize level safe applies Readability and returns the body """

# Give Readability a simpler HTML sample to keep its parse strategy simple
sample_html = ('<html>'
'<head><script>javascript here</script></head>'
'<body>body</body>'
'</html>')
expected = '<body id="readabilityBody">body</body>'
self.assertEqual(self.exporter.sanitize_html(sample_html, SAFE), expected)

def test_sanitize_text(self):
"""
Sanitize level text strips HTML tags, removes newlines,
and converts the html entity ampersand into an ampersand character
"""
expected = 'body & mind'
self.assertEqual(self.exporter.sanitize_html(self.sample_html, TEXT), expected)
96 changes: 96 additions & 0 deletions tests/utils_tests/test_text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
# test.utils_tests.test_timez
# Testing for the timez time helpers library.
#
# Author: Benjamin Bengfort <[email protected]>
# Created: Sun Feb 21 15:33:18 2016 -0500
#
# Copyright (C) 2016 Bengfort.com
# For license information, see LICENSE.txt
#
# ID: test_timez.py [df0c71b] [email protected] $

"""
Testing for the timez time helpers library.
"""

##########################################################################
## Imports
##########################################################################

import unittest

from mongomock import MongoClient as MockMongoClient
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do you need MockMongoClient here?


from baleen.exceptions import ExportError
from baleen.export import MongoExporter
from baleen.models import connect
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These imports are not needed.

from baleen.utils.text import sanitize_html, RAW, SAFE, TEXT
from tests.test_export import CATEGORIES_IN_DB
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This import is not needed.



class SanitizeHtmlTests(unittest.TestCase):
""" Tests the exporter's HTML sanitize methods """

@classmethod
def setUpClass(cls):
cls.sample_html = ('<html>'
'<head><script>javascript here</script></head>'
'<body><b>body &amp;\n mind</b></body>'
'</html>')

@classmethod
def tearDownClass(self):
"""
Drop the mongomock connection
"""
pass

def test_sanitize_requires_a_valid_level(self):
""" Sanitize_html requires a supported level """
with self.assertRaises(ExportError):
sanitize_html(self.sample_html, "bogus")

def test_sanitize_returns_input_for_level_none(self):
""" sanitize_html returns unmodified input for level None """
self.assertEqual(sanitize_html(self.sample_html, None), self.sample_html)

def test_sanitize_raw(self):
""" Sanitize level raw returns the content as submitted """
self.assertEqual(sanitize_html(self.sample_html, RAW), self.sample_html)

def test_sanitize_raw_handles_none(self):
"""
Sanitize level raw accepts None gracefully
"""
self.assertEqual(sanitize_html(None, RAW), None)

def test_sanitize_safe(self):
""" Sanitize level safe applies Readability and returns the body """

# Give Readability a simpler HTML sample to keep its parse strategy simple
sample_html = ('<html>'
'<head><script>javascript here</script></head>'
'<body>body</body>'
'</html>')
expected = '<body id="readabilityBody">body</body>'
self.assertEqual(sanitize_html(sample_html, SAFE), expected)

def test_sanitize_safe_handles_none(self):
"""
Sanitize level safe accepts None gracefully
"""
self.assertEqual(sanitize_html(None, SAFE), None)

def test_sanitize_text(self):
"""
Sanitize level text strips HTML tags, removes newlines,
and converts the html entity ampersand into an ampersand character
"""
expected = 'body & mind'
self.assertEqual(sanitize_html(self.sample_html, TEXT), expected)

def test_sanitize_text_handles_none(self):
"""
Sanitize level text accepts None gracefully
"""
self.assertEqual(sanitize_html(None, TEXT), None)