crawler.py
# -*- coding: utf-8 -*-
import json
from datetime import datetime

import requests
from bs4 import BeautifulSoup


class Crawler(object):

    def generate_sql(self, database, table, data):
        """
        Builds a MySQL dump from a non-empty list of dicts and writes it to a
        file under /tmp. Column names come from the keys of the first dict and
        every column is created as longtext. Returns the generated file name.
        """
        sql = "UNLOCK TABLES;"
        sql += "\nCREATE DATABASE IF NOT EXISTS {};".format(database)
        sql += "\nUSE {};".format(database)
        sql += "\nDROP TABLE IF EXISTS `{}`;\nCREATE TABLE `{}` (".format(table, table)
        sql += "`id` int(11) NOT NULL AUTO_INCREMENT, "
        for key in data[0]:
            sql += "`{}` longtext, ".format(key)
        sql += "PRIMARY KEY(`id`)) ENGINE=InnoDB DEFAULT CHARSET=utf8;"
        sql += "\nLOCK TABLES `{}` WRITE;".format(table)
        if data:
            sql += "\nINSERT INTO `{}` VALUES ".format(table)
        for i, item in enumerate(data):
            sql += "({}, ".format(i + 1)
            # Values are written as-is; they are not escaped for SQL.
            for ii, value in enumerate(item.values()):
                if ii == len(item) - 1:
                    sql += "'{}'".format(value)
                else:
                    sql += "'{}',".format(value)
            if i == len(data) - 1:
                sql += ");"
            else:
                sql += "),"
        sql += "\nUNLOCK TABLES;"
        file_name = '/tmp/crawler-sql-{}-{}.sql'.format(table, datetime.now().strftime('%d-%m-%Y_%H-%M'))
        with open(file_name, 'w') as sql_file:
            sql_file.write(sql)
        return file_name
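
    # Illustrative only (hypothetical values, not from the original project):
    # for data = [{'name': 'Ada'}, {'name': 'Bob'}], calling
    # generate_sql('mydb', 'people', data) would write roughly this dump:
    #
    #   UNLOCK TABLES;
    #   CREATE DATABASE IF NOT EXISTS mydb;
    #   USE mydb;
    #   DROP TABLE IF EXISTS `people`;
    #   CREATE TABLE `people` (`id` int(11) NOT NULL AUTO_INCREMENT, `name` longtext, PRIMARY KEY(`id`)) ENGINE=InnoDB DEFAULT CHARSET=utf8;
    #   LOCK TABLES `people` WRITE;
    #   INSERT INTO `people` VALUES (1, 'Ada'),(2, 'Bob');
    #   UNLOCK TABLES;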

    def get(self, url, headers=None):
        """
        Makes a GET request and returns the response parsed with BeautifulSoup,
        or raises an exception with the HTTP status code.
        """
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return BeautifulSoup(response.text, 'html.parser')
        raise Exception('GET request failed! HTTP status code is {}'.format(response.status_code))

    def load_data_file(self, data_file):
        """
        Opens a JSON data file (e.g. .json or .txt containing JSON) and returns
        it as a Python object.
        """
        with open(data_file) as data:
            return json.load(data)

    def post(self, url, data=None, headers=None, session=False):
        """
        Makes a POST request, optionally through a requests.Session, and
        returns the response, or raises an exception with the HTTP status code.
        """
        request = requests
        if session:
            # A fresh Session is created for each call.
            request = requests.Session()
        response = request.post(url, data=data, headers=headers)
        if response.status_code == 200:
            return response
        raise Exception('POST request failed! HTTP status code is {}'.format(response.status_code))

    def post_form(self, url=None, header=None, data=None, data_file=None, links=None, form=None, method=None):
        """
        Opens the url and follows links (looked up by HTML id) until the page
        containing the form with the given id is reached, then posts each item
        of data/data_file to that page and prints the resulting HTML.
        """
        if not data and data_file:
            data = self.load_data_file(data_file)
        if not links:
            links = []
        for link in links:
            # Keep following this link until the target form shows up, or until
            # neither the form nor the link can be found on the current page.
            while True:
                html = self.get(url)
                if html.find(id=form):
                    break
                if html.find(id=link):
                    url = html.find(id=link).get('href')
                    continue
                break
        for item in data:
            result = self.post(url, headers=header, data=item, session=True)
            print(BeautifulSoup(result.text, 'html.parser').prettify())
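

# Minimal usage sketch (hypothetical values): the URL, element ids, and file
# name below are placeholders and not part of the original project.
if __name__ == '__main__':
    crawler = Crawler()

    # Follow the link with id "next-step" from the start page until a form
    # with id "contact-form" is found, then post every record from the JSON
    # file to it.
    crawler.post_form(
        url='https://example.com/start',
        data_file='records.json',   # expected to contain a list of dicts
        links=['next-step'],
        form='contact-form',
    )

    # Dump the same records to a MySQL-flavoured .sql file under /tmp.
    records = crawler.load_data_file('records.json')
    print(crawler.generate_sql('crawler_db', 'records', records))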