-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathphp.py
executable file
·77 lines (61 loc) · 2.01 KB
/
php.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#!/usr/bin/env python
#coding=utf-8
import re
from bs4 import BeautifulSoup
import requests
import os
import time
import threading
import ee
# Links accepted by ``ee.cache``; get_all_url consults this list so the same
# href is not handled twice.
urls = []

# href values that get_all_url skips outright (``None`` = no href value).
UNUSED_PATH = [None, "/", "#", "javascript:void(0)"]

# One shared session so every request sends the same headers. The User-Agent
# is an iPhone / MicroMessenger string.
HTTP_SESSION = requests.session()
HTTP_SESSION.headers.update({
    "User-Agent": (
        "Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) "
        "AppleWebKit/603.1.30 (KHTML, like Gecko) Mobile/14E304 "
        "MicroMessenger/6.5.12 NetType/WIFI Language/zh_CN"
    ),
})
def http_get(url, params=None, timeout=30):
    """Fetch *url* through the shared session; return the response on HTTP 200.

    Args:
        url: Absolute URL to request.
        params: Optional mapping of query-string parameters. Defaults to
            ``None`` (no extra parameters) rather than a shared mutable dict.
        timeout: Seconds to wait on the server before giving up. Without a
            timeout a single stalled connection would block the crawl
            indefinitely.

    Returns:
        The ``requests`` response object when the status code is 200,
        otherwise ``None``. Both failure modes (non-200 status, request
        error) are reported on stdout.
    """
    try:
        r = HTTP_SESSION.get(url, params=params, timeout=timeout)
    except requests.RequestException:
        # Only request-level failures (connection errors, timeouts, invalid
        # URLs, ...) are swallowed. KeyboardInterrupt and SystemExit now
        # propagate, so the crawl can actually be stopped with Ctrl-C.
        print("%s request except" % url)
        return None

    if r.status_code == 200:
        return r

    print("%s request failed %d " % (url, r.status_code))
    return None
def dispatch_web_process(soup, url):
    """Forward a parsed page and its url to ``ee.get_video``.

    Args:
        soup: The parsed page (a BeautifulSoup document in get_all_url).
        url: The href that led to this page.
    """
    # Disabled alternative handler, kept for reference: lutu.get_video(soup)
    handler = ee.get_video
    handler(soup, url)
def dispatch_web_db(title, url):
    """Forward a page title and a matched url to ``ee.save_to_db``.

    Args:
        title: Title string of the page on which the link was found.
        url: The matched link.
    """
    store = ee.save_to_db
    store(title, url)
def dispatch_url_cache(url):
    """Append *url* to the module-level ``urls`` list if ``ee.cache`` accepts it.

    Nothing is recorded when ``ee.cache(url)`` returns a falsy value.
    """
    if not ee.cache(url):
        return
    urls.append(url)
def get_all_url(uri, url=''):
    """Recursively crawl all URLs reachable from the page at *url* on *uri*.

    The page is fetched, decoded as gb18030, parsed, and handed to
    ``dispatch_web_process``. Each ``<a>`` href on the page is then either
    followed recursively (same-site or relative links) or, when it looks
    like an absolute link to an mp4, passed to ``dispatch_web_db`` together
    with the page title.

    Args:
        uri: Site root, e.g. ``'http://www.431ee.com'``.
        url: href to crawl. Relative hrefs are appended to *uri*; hrefs that
            are already absolute are requested as-is. The empty string
            means the site root itself.

    Returns:
        None. Results are delivered through the ``dispatch_*`` hooks.

    NOTE(review): the crawl is depth-first via plain recursion, so a long
    chain of pages can hit the interpreter's recursion limit.
    """
    if not ee.filter(url):
        return
    print('process... %s' % url)

    # An absolute href must not be glued onto the site root: uri + url would
    # yield a malformed address such as "http://sitehttp://site/page".
    if url.startswith(('http://', 'https://')):
        target = url
    else:
        target = uri + url

    r = http_get(target)
    if r is None:
        print("get_all_url in %s with response none" % url)
        return

    r.encoding = 'gb18030'
    soup = BeautifulSoup(r.text, "html.parser")
    anchors = soup.find_all('a')
    if not url and not anchors:
        print("get a empty <a> tag from %s " % uri)

    dispatch_web_process(soup, url)

    # Pages without a <title> tag used to raise AttributeError further down.
    title = soup.title.string if soup.title is not None else None

    for anchor in anchors:
        # .get() yields None for anchors lacking an href (UNUSED_PATH already
        # lists None); indexing with ['href'] raised KeyError instead.
        href = anchor.get('href')
        if href in UNUSED_PATH:
            continue
        # Both branches below only ever apply to links not seen before.
        if href in urls:
            continue

        if uri in href or 'http' not in href:
            dispatch_url_cache(href)
            get_all_url(uri, href)
        elif re.search('http.*mp4', href):
            dispatch_url_cache(href)
            dispatch_web_db(title, href)
            print('fuck:' + href)
            print(title)
# Candidate start sites; only the one passed to get_all_url below is crawled.
#http://www.lutu6.com
#http://www.431ee.com
# NOTE(review): this call sits at module top level, so the crawl starts on
# import as well as when the file is executed as a script.
get_all_url('http://www.431ee.com')