# AbstractSite.py

from datetime import date
import hashlib
import logging.handlers
import re
from urlparse import urlsplit, urlunsplit, urljoin

import requests
from lxml import html

from juriscraper.lib.string_utils import harmonize, clean_string, trunc
from juriscraper.tests import MockRequest
try:
    # Use cchardet for performance to detect the character encoding.
    import cchardet as chardet
except ImportError:
    import chardet
LOG_FILENAME = '/var/log/juriscraper/debug.log'
# Set up a specific logger with our desired output level
logger = logging.getLogger('Logger')
logger.setLevel(logging.DEBUG)
# make a formatter
formatter = logging.Formatter('%(asctime)s - %(levelname)s: %(message)s')
# Create a handler, and attach it to the logger
handler = logging.handlers.RotatingFileHandler(LOG_FILENAME,
                                               maxBytes=5120000,
                                               backupCount=7)
logger.addHandler(handler)
handler.setFormatter(formatter)
# logging.basicConfig(format='%(asctime)s - %(levelname)s: %(message)s',
#                     level=logging.DEBUG)

class InsanityException(Exception):
    def __init__(self, message):
        Exception.__init__(self, message)

class AbstractSite(object):
    """Contains generic methods for scraping data. Should be extended by all
    scrapers.

    Should not contain lists that can't be sorted by the _date_sort function.
    """
    def __init__(self):
        super(AbstractSite, self).__init__()

        # Computed metadata
        self.hash = None
        self.html = None
        self.method = 'GET'
        self.use_sessions = False
        self.status = None
        self.back_scrape_iterable = None
        self.cookies = {}

        # Upstream metadata
        self.court_id = None
        self.url = None
        self.parameters = None
        self._opt_attrs = []
        self._req_attrs = []
        self._all_attrs = []

    def __str__(self):
        out = []
        for attr, val in self.__dict__.iteritems():
            out.append('%s: %s' % (attr, val))
        return '\n'.join(out)
    def parse(self):
        if self.status is None:
            # Run the downloader if it hasn't been run already
            self.html = self._download()

        # Set the attribute to the return value from _get_foo()
        # e.g., this does self.case_names = _get_case_names()
        for attr in self._all_attrs:
            self.__setattr__(attr, getattr(self, '_get_%s' % attr)())

        self._clean_attributes()
        self._post_parse()
        self._check_sanity()
        self._date_sort()
        self._make_hash()
        return self
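
    # A minimal sketch of how a caller typically drives a concrete scraper
    # (illustrative only; the module path and printed fields are examples,
    # not guaranteed to exist in this fork):
    #
    #     from juriscraper.opinions.united_states.federal_appellate import ca1
    #     site = ca1.Site()
    #     site.parse()              # download, run the _get_*() getters, clean,
    #                               # sanity-check, sort by date, compute hash
    #     for name, url in zip(site.case_names, site.download_urls):
    #         print name, url
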
    def tweak_request_object(self, r):
        """
        Does nothing, but provides a hook that allows inheriting objects to
        tweak the requests object if necessary.
        """
        pass
    def _clean_text(self, text):
        """Cleans up text before we make it into an HTML tree:
            1. Nukes <![CDATA stuff.
            2. Nukes XML encoding declarations.
            3. Replaces </br> with <br/>.
            4. Nukes invalid bytes in input.
            5. ?
        """
        # Remove <![CDATA because it causes breakage in lxml.
        text = re.sub(r'<!\[CDATA\[', '', text)
        text = re.sub(r'\]\]>', '', text)

        # Remove <?xml> declaration in Unicode objects, because it causes an
        # error: "ValueError: Unicode strings with encoding declaration are
        # not supported." Note that the error only occurs if the <?xml> tag
        # has an "encoding" attribute, but we remove it in all cases, as
        # there's no downside to removing it. This moves our encoding
        # detection to chardet, rather than lxml.
        if isinstance(text, unicode):
            text = re.sub(r'^\s*<\?xml\s+.*?\?>', '', text)

        # Fix </br>
        text = re.sub('</br>', '<br/>', text)

        # Fix invalid bytes
        # (http://stackoverflow.com/questions/8733233/filtering-out-certain-bytes-in-python)
        text = re.sub(u'[^\u0020-\uD7FF\u0009\u000A\u000D\uE000-\uFFFD\u10000-\u10FFFF]+',
                      '', text)
        return text
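
    # For example (illustrative input, not taken from any particular court):
    #   self._clean_text('<![CDATA[<doc></br></doc>]]>')  ->  '<doc><br/></doc>'
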
    def _clean_attributes(self):
        """Iterate over attribute values and clean them"""
        for attr in self._all_attrs:
            item = getattr(self, attr)
            if item is not None:
                cleaned_item = []
                for sub_item in item:
                    if attr == 'download_urls':
                        sub_item = sub_item.strip()
                    else:
                        if isinstance(sub_item, basestring):
                            sub_item = clean_string(sub_item)
                        if attr in ['case_names', 'docket_numbers']:
                            sub_item = harmonize(sub_item)
                    cleaned_item.append(sub_item)
                self.__setattr__(attr, cleaned_item)
    def _post_parse(self):
        """Provides a hook for subclasses to do custom work on the data after
        the parsing is complete.
        """
        pass
    def _check_sanity(self):
        """Check that the object's attributes make sense:
            1. Do all the attributes have the same length?
            2. Do we have any content at all?
            3. Is there a bare minimum of metadata?
            4. Are the dates datetime objects, not strings?
            5. Are any dates from the 22nd century? (01-01-2104)
            6. Are case_names more than just empty whitespace?
            7. Has the `cookies` attribute been normalized to a dict?
            8. ?

        The signature of this method is subject to change as additional checks
        become convenient.

        Inheriting classes should override this method, calling super() to
        give it the necessary parameters.

        If sanity is OK, no return value. If not, throw InsanityException or
        warnings, as appropriate.
        """
        lengths = {}
        for attr in self._all_attrs:
            if self.__getattribute__(attr) is not None:
                lengths[attr] = len(self.__getattribute__(attr))
        values = lengths.values()
        if values.count(values[0]) != len(values):
            # Are all elements equal?
            raise InsanityException("%s: Scraped meta data fields have differing"
                                    " lengths: %s" % (self.court_id, lengths))
        if len(self.case_names) == 0:
            logger.warning('%s: Returned with zero items.' % self.court_id)
        else:
            for field in self._req_attrs:
                if self.__getattribute__(field) is None:
                    raise InsanityException(
                        '%s: Required fields do not contain any data: %s'
                        % (self.court_id, field))
            i = 0
            for name in self.case_names:
                if not name.strip():
                    raise InsanityException(
                        "Item with index %s has an empty case name." % i)
                i += 1
            for d in self.case_dates:
                if not isinstance(d, date):
                    raise InsanityException(
                        '%s: member of case_dates list is not a valid date '
                        'object. Instead it is: %s with value: %s'
                        % (self.court_id, type(d), d))
                if d.year > 2100:
                    raise InsanityException(
                        '%s: member of case_dates list is from the 22nd '
                        'century, with value %s' % (self.court_id, d.year))

        # Is cookies a dict?
        if type(self.cookies) != dict:
            raise InsanityException('self.cookies not set to be a dict by '
                                    'scraper.')

        logger.info("%s: Successfully found %s items." % (self.court_id,
                                                          len(self.case_names)))
    def _date_sort(self):
        """Sort the object by date."""
        if len(self.case_names) > 0:
            obj_list_attrs = [self.__getattribute__(attr)
                              for attr in self._all_attrs
                              if isinstance(self.__getattribute__(attr), list)]
            zipped = zip(*obj_list_attrs)
            zipped.sort(reverse=True)
            i = 0
            obj_list_attrs = zip(*zipped)
            for attr in self._all_attrs:
                if isinstance(self.__getattribute__(attr), list):
                    self.__setattr__(attr, obj_list_attrs[i][:])
                    i += 1
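
    # The zip/sort/unzip above keeps the parallel attribute lists aligned:
    # tuples compare element by element, so whichever list attribute comes
    # first in _all_attrs (typically case_dates) drives the reverse ordering.
    # A rough sketch with made-up values:
    #   case_dates = [date(2014, 1, 2), date(2014, 3, 4)]
    #   case_names = ['Smith v. Jones', 'Doe v. Roe']
    #   zip(...)   -> [(date(2014, 1, 2), 'Smith v. Jones'),
    #                  (date(2014, 3, 4), 'Doe v. Roe')]
    #   after sort(reverse=True) and zip(*zipped), case_dates is newest-first
    #   and case_names stays in step with it.
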
    def _make_hash(self):
        """Make a unique ID. ETag and Last-Modified from courts cannot be
        trusted.
        """
        self.hash = hashlib.sha1(str(self.case_names)).hexdigest()
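
    # e.g. (illustrative): hashlib.sha1(str([u'Smith v. Jones'])).hexdigest()
    # yields a 40-character hex digest; any change in the scraped case names
    # changes the hash, so callers can use it to detect new or modified content.
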
    def _link_repl(self, href):
        """Makes links absolute, working around buggy URLs and nuking anchors.

        Some URLs, like the following, make no sense:
          - https://www.appeals2.az.gov/../Decisions/CR20130096OPN.pdf.
                                       ^^^^ -- This makes no sense!

        The fix is to remove any extra '/..' patterns at the beginning of the
        path.

        Others have annoying anchors on the end, like:
          - http://example.com/path/#anchor

        Note that lxml has a method generally for this purpose called
        make_links_absolute, but we cannot use it because it does not work
        around invalid relative URLs, nor remove anchors. This is a limitation
        of Python's urljoin that will be fixed in Python 3.5 according to a bug
        we filed: http://bugs.python.org/issue22118
        """
        url_parts = urlsplit(urljoin(self.url, href))
        url = urlunsplit(
            url_parts[:2] +
            (re.sub('^(/\.\.)+', '', url_parts.path),) +
            url_parts[3:]
        )
        return url.split('#')[0]
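
    # For example, with a hypothetical base URL of
    # self.url = 'https://www.appeals2.az.gov/':
    #   self._link_repl('../Decisions/CR20130096OPN.pdf#page=2')
    # urljoin yields 'https://www.appeals2.az.gov/../Decisions/CR20130096OPN.pdf#page=2';
    # the regex strips the leading '/..' and the final split drops the anchor,
    # giving 'https://www.appeals2.az.gov/Decisions/CR20130096OPN.pdf'.
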
    def _download(self, request_dict={}):
        """Method for downloading the latest version of Site."""
        if self.method == 'POST':
            truncated_params = {}
            for k, v in self.parameters.iteritems():
                truncated_params[k] = trunc(v, 50, ellipsis='...[truncated]')
            logger.info("Now downloading case page at: %s (params: %s)"
                        % (self.url, truncated_params))
        else:
            logger.info("Now downloading case page at: %s" % self.url)

        # Get the response. Disallow redirects so they throw an error
        s = requests.session()
        if self.method == 'GET':
            r = s.get(self.url,
                      headers={'User-Agent': 'Juriscraper'},
                      **request_dict)
        elif self.method == 'POST':
            r = s.post(self.url,
                       headers={'User-Agent': 'Juriscraper'},
                       data=self.parameters,
                       **request_dict)
        elif self.method == 'LOCAL':
            mr = MockRequest(url=self.url)
            r = mr.get()

        # Provides a hook for inheriting objects to tweak the request object.
        self.tweak_request_object(r)

        # Throw an error if a bad status code is returned.
        r.raise_for_status()

        # If the encoding is iso-8859-1, switch it to cp1252 (a superset)
        if r.encoding == 'ISO-8859-1':
            r.encoding = 'cp1252'

        # Provide the response in the Site object
        self.r = r
        self.status = r.status_code

        if r.encoding is None:
            # Requests detects the encoding when the item is GET'ed using
            # HTTP headers, and then when r.text is accessed, if the encoding
            # hasn't been set by that point. By setting the encoding here, we
            # ensure that it's done by cchardet, if it hasn't been done with
            # HTTP headers. This way it is done before r.text is accessed
            # (which would do it with vanilla chardet). This is a big
            # performance boon, and can be removed once requests is upgraded
            # (https://github.com/kennethreitz/requests/pull/814/)
            r.encoding = chardet.detect(r.content)['encoding']

        # Grab the content
        text = self._clean_text(r.text)
        html_tree = html.fromstring(text)
        html_tree.rewrite_links(self._link_repl)
        return html_tree
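
    # request_dict is passed straight through to requests, so subclasses can
    # supply extra keyword arguments such as (illustrative):
    #   self._download(request_dict={'verify': False, 'timeout': 60})
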
    def _download_backwards(self):
        # Method for downloading the entire Site
        pass
    @staticmethod
    def _cleanup_content(content):
        """
        Given the HTML from a page, the binary PDF file, or similar, do any
        last-minute cleaning.

        This method should be called as the last step by any caller, and it
        does any cleanup that is necessary. Usually this is needed on HTML
        pages in jurisdictions that post their content in an HTML page with
        headers, footers and other material that must be stripped after the
        page has been downloaded by the caller.
        """
        return content
    def _get_cookies(self):
        """
        Some websites require cookies in order to be scraped. This method
        provides a hook where cookies can be retrieved by calling functions.
        Generally the cookies will be set by the _download() method.

        self.cookies is a list of dicts of the form:
            [
                {
                    u'domain': u'www.search.txcourts.gov',
                    u'httponly': True,
                    u'name': u'ASP.NET_SessionId',
                    u'path': u'/',
                    u'secure': False,
                    u'value': u'hryku05534xhgr45yxvusuux'
                },
            ]
        """
        return self._cookies
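

# A minimal subclass sketch showing how the _get_<attr>() convention plugs
# into parse() above. Everything here is illustrative: the URL, the XPath
# expressions and the date format are assumptions, not part of this module.
#
#     class Site(AbstractSite):
#         def __init__(self):
#             super(Site, self).__init__()
#             self.court_id = self.__module__
#             self.url = 'http://www.example.com/opinions'  # hypothetical
#             self._req_attrs = ['case_dates', 'case_names', 'download_urls']
#             self._all_attrs = self._req_attrs + self._opt_attrs
#
#         def _get_case_names(self):
#             return [s.strip() for s in self.html.xpath('//table//td[1]/text()')]
#
#         def _get_case_dates(self):
#             from datetime import datetime
#             return [datetime.strptime(s.strip(), '%m/%d/%Y').date()
#                     for s in self.html.xpath('//table//td[2]/text()')]
#
#         def _get_download_urls(self):
#             return list(self.html.xpath('//table//td[3]/a/@href'))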