forked from akkana/scripts
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprettysoup.py
executable file
·85 lines (63 loc) · 2.5 KB
/
prettysoup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
#!/usr/bin/env python3
# Prettyprint an HTML document (from BeautifulSoup)
# in a customizable way. You can control which tags get newlines
# before them, after them or both.
# BS4's prettify() changes the HTML, possibly adding extra whitespace:
# See https://bugs.launchpad.net/beautifulsoup/+bug/1697296
# Copyright 2018 by Akkana Peck. Share and enjoy under the GPLv2 or later.
import re
#
# Default formatting rules:
#
# Tags to be printed on a line by themselves.
tags_separate_line = ["html", "head", "body", "p", "br", "ul", "ol", "div",
                      "table", "tr", "title", "meta", "link"]

# Tags that define a line: they get a newline before start and after end.
#   <h1>Here is the page header</h1>
# ("tr" is also in the list above; the separate-line rules are applied
# first, so for "tr" the rules from this list add nothing further.)
tags_define_line = ["li", "tr", "h1", "h2", "h3", "h4", "h5", "h6"]

# Any tag not specified in one of those two lists will be printed inline,
# like <b>bold text</b> and <a href="#">links</a>.


def _tag_patterns(tag):
    """Return (opening, closing) regex source strings for one tag name.

    Each pattern has a single capture group holding the whole tag.
    """
    name = re.escape(tag)
    # The lookahead requires the tag name to END here (whitespace, "/" or
    # ">" must follow), so "p" does not also match <pre>, "head" does not
    # match <header>, "li" does not match <link>, and so on.
    # [^>]* rather than .*? so that attributes wrapped over several lines
    # are still recognised as part of the tag.
    opening = r"(<%s(?=[\s/>])[^>]*>)" % name
    closing = r"(</%s>)" % name
    return opening, closing


def prettyprint(soup,
                separate_line=tags_separate_line,
                define_line=tags_define_line,
                remove_original_newlines=False):
    """Simple prettyprinter. Add newlines around specified tags.

    No indentation, etc.
    Will preserve all original newlines and only add new ones
    unless remove_original_newlines is specified.

    soup is the parsed document: it must provide find_all() and
        convert to HTML text with str().
    separate_line is a list of tag names whose start and end tags
        are each put on a line of their own.
    define_line is a list of tag names that get a newline before
        the start tag and after the end tag.
    remove_original_newlines, if true, deletes every newline in the
        document text before the new ones are inserted.

    Elements whose text is empty are extracted from soup first,
    so the soup that was passed in is modified.

    Returns a string.
    """
    # Extract empty tags.
    # NOTE(review): elements that never contain text (<br>, <img>, <meta>,
    # <link> ...) presumably have empty .text too and so get dropped here
    # as well -- confirm that this is intended.
    for element in soup.find_all():
        if not element.text:
            element.extract()

    # Let BeautifulSoup convert to text, then do regexp on the text.
    s = str(soup)

    if remove_original_newlines:
        # Newlines are deleted outright, not replaced with spaces, so
        # words separated only by a line break end up joined.
        s = s.replace('\n', '')

    # Replace non-breaking spaces in the unicode string
    # (note: this assumes Python3, so str is unicode):
    s = s.replace("\u00A0", " ")

    # Start and end tags each go on a line by themselves.
    for tag in separate_line:
        opening, closing = _tag_patterns(tag)
        s = re.sub(opening, r"\n\1\n", s)
        s = re.sub(closing, r"\n\1\n", s)

    # Newline before the start tag and after the end tag only.
    for tag in define_line:
        opening, closing = _tag_patterns(tag)
        s = re.sub(opening, r"\n\1", s)
        s = re.sub(closing, r"\1\n", s)

    # Now we will have some multiple newlines, so clean those up.
    s = re.sub(r'\n+', '\n', s)

    # If there's no doctype, we probably added a newline before <html>.
    if s.startswith('\n'):
        s = s[1:]

    return s
if __name__ == '__main__':
    # Command-line use: prettyprint each HTML file named in the arguments
    # and write the result to standard output.
    from bs4 import BeautifulSoup
    import sys

    for f in sys.argv[1:]:
        with open(f) as fp:
            soup = BeautifulSoup(fp, "lxml")

        pp = prettyprint(soup, remove_original_newlines=True)

        # The formatted text was previously computed and then discarded,
        # so running the script showed nothing; actually emit it.
        print(pp)