forked from CloudBotIRC/CloudBot
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request CloudBotIRC#210 from linuxdaemon/gonzobot+link-announce-emoji
Fix decoding page title in link announcer
- Loading branch information
Showing 1 changed file with 22 additions and 40 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,59 +1,41 @@ | ||
import requests | ||
import re | ||
from bs4 import BeautifulSoup | ||
from contextlib import closing | ||
from cloudbot import hook | ||
|
||
import requests | ||
from bs4 import BeautifulSoup | ||
|
||
from cloudbot import hook | ||
from cloudbot.hook import Priority, Action | ||
|
||
# This will match any URL, blacklist removed and abstracted to a priority/halting system | ||
url_re = re.compile(r'https?://(?:[a-zA-Z]|[0-9]|[$-_@.&+~]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', re.I) | ||
|
||
opt_out = [] | ||
|
||
traditional = [ | ||
(1024 ** 5, 'PB'), | ||
(1024 ** 4, 'TB'), | ||
(1024 ** 3, 'GB'), | ||
(1024 ** 2, 'MB'), | ||
(1024 ** 1, 'KB'), | ||
(1024 ** 0, 'B'), | ||
] | ||
HEADERS = { | ||
'Accept-Language': 'en-US,en;q=0.5', | ||
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36' | ||
} | ||
|
||
|
||
def bytesto(bytes, system = traditional): | ||
""" converts bytes to something """ | ||
bytes = int(bytes) | ||
for factor, suffix in system: | ||
if bytes >= factor: | ||
break | ||
amount = int(bytes/factor) | ||
return str(amount) + suffix | ||
MAX_RECV = 1000000 | ||
|
||
|
||
@hook.regex(url_re, priority=Priority.LOW, action=Action.HALTTYPE, only_no_match=True) | ||
def print_url_title(message, match, chan): | ||
if chan in opt_out: | ||
return | ||
HEADERS = { | ||
'Accept-Language': 'en-US,en;q=0.5', | ||
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36' | ||
} | ||
with closing(requests.get(match.group(), headers = HEADERS, stream = True, timeout=3)) as r: | ||
|
||
with closing(requests.get(match.group(), headers=HEADERS, stream=True, timeout=3)) as r: | ||
if not r.encoding: | ||
# remove the content type and size from output for now | ||
r.close() | ||
return | ||
#content = r.headers['content-type'] | ||
#size = bytesto(r.headers['content-length']) | ||
#out = "Content Type: \x02{}\x02 Size: \x02{}\x02".format(content, size) | ||
#return out | ||
content = r.raw.read(1000000+1, decode_content=True) | ||
if len(content) > 1000000: | ||
r.close() | ||
return | ||
html = BeautifulSoup(content) | ||
r.close() | ||
title = " ".join(html.title.text.strip().splitlines()) | ||
out = "Title: \x02{}\x02".format(title) | ||
message(out, chan) | ||
|
||
content = r.raw.read(MAX_RECV + 1, decode_content=True) | ||
encoding = r.encoding | ||
|
||
if len(content) > MAX_RECV: | ||
return | ||
|
||
html = BeautifulSoup(content, "lxml", from_encoding=encoding) | ||
title = " ".join(html.title.text.strip().splitlines()) | ||
out = "Title: \x02{}\x02".format(title) | ||
message(out, chan) |