-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgetencoding.py
28 lines (21 loc) · 995 Bytes
/
getencoding.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
#!/usr/bin/python
import urllib2
import lxml.html
import re
# Extracts the charset parameter from a Content-Type style value such as
# "text/html; charset=GBK" or "text/html; Charset = 'utf-8'; foo=bar".
# Only the encoding token is captured: surrounding quotes, whitespace and
# any trailing parameters are excluded.
_META_CHARSET_RE = re.compile(r'''charset\s*=\s*["']?\s*([^\s"';,]+)''', re.I)


def getencoding(html):
    """Guess the character encoding of a raw HTML document.

    The encodings declared inside ``<head>`` are collected from both
    ``<meta charset="...">`` and
    ``<meta http-equiv="Content-Type" content="...; charset=...">``.
    If the page declares exactly one encoding (after normalisation), that
    name is returned in lower case.  If it declares none, or several
    conflicting ones, detection is delegated to ``chardet``.

    Args:
        html: the undecoded page content as a byte string.

    Returns:
        The declared encoding name in lower case, or whatever
        ``chardet.detect(html)['encoding']`` yields on the fallback path.
        ``None`` is returned for empty input.

    Raises:
        ImportError: if the fallback is needed and ``chardet`` is not
            installed.
    """
    # lxml refuses to parse an empty document, so bail out early instead of
    # surfacing a parser error for input that has nothing to detect.
    if not html or not html.strip():
        return None

    parser = lxml.html.HTMLParser(remove_comments=True)
    # Decoding is only needed to locate the ASCII <meta> declarations, so
    # undecodable bytes are simply dropped.
    dom = lxml.html.fromstring(html.decode('utf8', 'ignore'), parser=parser)

    declared = list(dom.xpath('.//head/meta[@charset]/@charset'))
    for meta in dom.xpath('.//head/meta[@http-equiv][@content]'):
        if meta.get('http-equiv').strip().lower() != 'content-type':
            continue
        found = _META_CHARSET_RE.findall(meta.get('content'))
        # A content value naming several charsets is ambiguous; skip it.
        if len(found) == 1:
            declared.extend(found)

    encs = {name.strip().lower() for name in declared}
    encs.discard('')  # an empty charset attribute declares nothing
    # GBK is a superset of GB2312, so treat both declarations as GBK.
    encs = {'gbk' if name == 'gb2312' else name for name in encs}
    if len(encs) == 1:
        return encs.pop()

    # No encoding or multiple encodings (e.g. web-cache sites): sniff the
    # bytes instead.  Imported lazily so chardet stays an optional dependency.
    import chardet
    return chardet.detect(html)['encoding']
# Demo entry point: fetch a page and print its detected encoding.  Guarded so
# that importing this module does not trigger a network request.
if __name__ == '__main__':
    response = urllib2.urlopen('http://www.sina.com.cn')
    try:
        page = response.read()
    finally:
        # Release the underlying connection even if the read fails.
        response.close()
    print(getencoding(page))