-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtorscrape.py
149 lines (116 loc) · 4.19 KB
/
torscrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
#!/usr/bin/env python
#-*- coding:utf-8 -*-
"""
URL scraper with TOR support, to dynamically change IPs while scraping.
This is useful in scenarios where you are ratelimited by IP.
To run this you need:
- torctl
- pycurl
- TOR running on 127.0.0.1:9050 with controlport 127.0.0.1:9051
EXAMPLE:
    import torscrape
    urls = ["http://icanhazip.com/"]*10
    def my_handler(url, data):
        print url, data
    torscrape.process(urls, my_handler, refresh_ip=1, verbose=True)
RANT:
This would have been a perfect example to put in a class object.
However, Python and multiprocessing do not play well with
stuff that you put in classes, making it completely impossible to
make a nice OO-design.
Also multiprocessing and keyboard interrupts make me cry.
"""
class _get_options:
def __init__(self, url, handler, user_agent, tor_host, tor_port):
self.url = url
self.handler = handler
self.user_agent = user_agent
self.tor_host = tor_host
self.tor_port = tor_port
def get(url, handler, user_agent="Mozilla/5.0", tor_host="127.0.0.1", tor_port=9050):
    """Fetch one URL through the TOR SOCKS proxy and pass the body to handler.

    url: the url string to fetch
    handler: a function called as handler(url, data), where data is the
             raw response body collected from curl
    user_agent: value sent in the User-agent request header
    tor_host: host of the SOCKS proxy
    tor_port: port of the SOCKS proxy

    Returns whatever handler returns, or None when the fetch is cut short
    by a KeyboardInterrupt.
    """
    import io
    import pycurl

    try:
        # curl delivers the body as byte chunks; BytesIO collects them and
        # getvalue() hands them back as one byte string.
        body = io.BytesIO()
        curl = pycurl.Curl()
        try:
            curl.setopt(pycurl.URL, url)
            curl.setopt(pycurl.PROXY, tor_host)
            curl.setopt(pycurl.PROXYPORT, tor_port)
            curl.setopt(pycurl.HTTPHEADER, ['User-agent: %s' % user_agent])
            # NOTE(review): with plain SOCKS4 curl resolves host names
            # locally instead of through the proxy -- confirm whether DNS
            # lookups are meant to go through TOR as well.
            curl.setopt(pycurl.PROXYTYPE, pycurl.PROXYTYPE_SOCKS4)
            curl.setopt(pycurl.WRITEFUNCTION, body.write)
            curl.perform()
        finally:
            # Release the handle even when setopt/perform raises.
            curl.close()
        return handler(url, body.getvalue())
    except KeyboardInterrupt:
        # Deliberately swallowed: an interrupted fetch simply yields None.
        return None
def _get_with_args(a):
    """Unpack a ``_get_options`` bundle and forward its fields to ``get``.

    Exists so that ``Pool.map``, which passes a single item per call, can
    drive ``get``. Returns whatever ``get`` returns.
    """
    positional = (a.url, a.handler, a.user_agent, a.tor_host, a.tor_port)
    return get(*positional)
def change_ip(verbose=False):
    """Ask the TOR controller for a new route by sending "signal newnym".

    verbose: when true, write progress messages to stdout

    Raises TorNotReachableException when TorCtl.connect() returns None.
    """
    import time
    import os
    import sys
    from TorCtl import TorCtl

    if verbose:
        sys.stdout.write("Renewing TOR route: ")
    sys.stdout.flush()

    torcontrol = TorCtl.connect()
    if torcontrol is None:
        raise TorNotReachableException()

    try:
        torcontrol.sendAndRecv("signal newnym\r\n")
        # Presumably gives TOR time to switch to the new route before the
        # next batch of fetches starts.
        time.sleep(5)
        if verbose:
            sys.stdout.write("done\n")
        sys.stdout.flush()
    finally:
        # torcontrol.close() writes to stdout, so stdout is pointed at the
        # null device while it runs.  The nested cleanup guarantees that the
        # real stdout is restored and the null-device handle is closed even
        # when close() raises, and that the control connection is closed
        # even when the signal could not be sent.
        saved_stdout = sys.stdout
        with open(os.devnull, "w") as devnull:
            sys.stdout = devnull
            try:
                torcontrol.close()
            finally:
                sys.stdout = saved_stdout
def process(urls, handler, user_agent="Mozilla/5.0", tor_host="127.0.0.1",
            tor_port=9050, refresh_ip=10, threads=None, verbose=False):
    """ Processes a set of urls and dispatches them to a handler

    urls: a list of url strings
    handler: a function called as handler(url, data) for every fetched url,
             where data is the returned body
    user_agent: the user agent to use
    tor_host: tor host
    tor_port: tor port
    refresh_ip: refresh ip every NUM url fetches; a false value (0/None)
                fetches all urls in one go without refreshing the ip
    threads: number of parallel worker processes in the pool; a false
             value uses the multiprocessing.Pool default
    verbose: be verbose

    Returns None.  A KeyboardInterrupt during fetching terminates the
    workers and stops processing the remaining urls.
    """
    from multiprocessing import Pool
    import sys

    if threads:
        pool = Pool(threads, _init_worker)
    else:
        pool = Pool(initializer=_init_worker)

    # Without ip refreshing the whole input is treated as a single batch.
    if refresh_ip:
        batches = _paginate(urls, refresh_ip)
    else:
        batches = [urls]

    try:
        for batch in batches:
            if refresh_ip:
                change_ip(verbose)
                if verbose:
                    sys.stdout.write("Processing: %s\n" % (batch,))
            packed_args = [
                _get_options(url, handler, user_agent, tor_host, tor_port)
                for url in batch
            ]
            try:
                pool.map(_get_with_args, packed_args)
            except KeyboardInterrupt:
                sys.stdout.write("Wait for termination: ")
                sys.stdout.flush()
                pool.terminate()
                sys.stdout.write("done\n")
                sys.stdout.flush()
                break
    finally:
        # Always reap the worker processes, on normal completion as well as
        # when change_ip or a handler raises.  pool.map has returned (or the
        # pool is already terminated) by now, so no queued work is lost.
        pool.terminate()
        pool.join()
def _init_worker():
import signal
signal.signal(signal.SIGINT, signal.SIG_IGN)
def _paginate(seq, rowlen):
for start in xrange(0, len(seq), rowlen):
yield seq[start:start+rowlen]
class TorNotReachableException(Exception):
    """Raised by change_ip when no connection to the TOR controller is obtained."""