From e8572d7ba0226c15447d07769e84c72db56a7784 Mon Sep 17 00:00:00 2001
From: linuxdaemon
Date: Fri, 3 Nov 2017 17:43:07 -0500
Subject: [PATCH 1/2] Fix decoding page title in link announcer

---
 plugins/link_announcer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/plugins/link_announcer.py b/plugins/link_announcer.py
index 48bfb8294..0d35bcb34 100644
--- a/plugins/link_announcer.py
+++ b/plugins/link_announcer.py
@@ -52,7 +52,7 @@ def print_url_title(message, match, chan):
         if len(content) > 1000000:
             r.close()
             return
-        html = BeautifulSoup(content)
+        html = BeautifulSoup(content, "lxml", from_encoding=r.encoding)
         r.close()
         title = " ".join(html.title.text.strip().splitlines())
         out = "Title: \x02{}\x02".format(title)

From 53a5e94dcb4c4c5352daaf51a8241f66fd907584 Mon Sep 17 00:00:00 2001
From: linuxdaemon
Date: Fri, 3 Nov 2017 18:38:17 -0500
Subject: [PATCH 2/2] Clean up link announcer

---
 plugins/link_announcer.py | 62 ++++++++++++++------------------------
 1 file changed, 22 insertions(+), 40 deletions(-)

diff --git a/plugins/link_announcer.py b/plugins/link_announcer.py
index 0d35bcb34..ffe370920 100644
--- a/plugins/link_announcer.py
+++ b/plugins/link_announcer.py
@@ -1,9 +1,10 @@
-import requests
 import re
-from bs4 import BeautifulSoup
 from contextlib import closing
 
-from cloudbot import hook
+import requests
+from bs4 import BeautifulSoup
+
+from cloudbot import hook
 from cloudbot.hook import Priority, Action
 
 # This will match any URL, blacklist removed and abstracted to a priority/halting system
@@ -11,49 +12,30 @@
 
 opt_out = []
 
-traditional = [
-    (1024 ** 5, 'PB'),
-    (1024 ** 4, 'TB'),
-    (1024 ** 3, 'GB'),
-    (1024 ** 2, 'MB'),
-    (1024 ** 1, 'KB'),
-    (1024 ** 0, 'B'),
-    ]
+HEADERS = {
+    'Accept-Language': 'en-US,en;q=0.5',
+    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36'
+}
 
-
-def bytesto(bytes, system = traditional):
-    """ converts bytes to something """
-    bytes = int(bytes)
-    for factor, suffix in system:
-        if bytes >= factor:
-            break
-    amount = int(bytes/factor)
-    return str(amount) + suffix
+MAX_RECV = 1000000
 
 
 @hook.regex(url_re, priority=Priority.LOW, action=Action.HALTTYPE, only_no_match=True)
 def print_url_title(message, match, chan):
     if chan in opt_out:
         return
-    HEADERS = {
-        'Accept-Language': 'en-US,en;q=0.5',
-        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36'
-    }
-    with closing(requests.get(match.group(), headers = HEADERS, stream = True, timeout=3)) as r:
+
+    with closing(requests.get(match.group(), headers=HEADERS, stream=True, timeout=3)) as r:
         if not r.encoding:
-            # remove the content type and size from output for now
-            r.close()
             return
-        #content = r.headers['content-type']
-        #size = bytesto(r.headers['content-length'])
-        #out = "Content Type: \x02{}\x02 Size: \x02{}\x02".format(content, size)
-        #return out
-        content = r.raw.read(1000000+1, decode_content=True)
-        if len(content) > 1000000:
-            r.close()
-            return
-        html = BeautifulSoup(content, "lxml", from_encoding=r.encoding)
-        r.close()
-        title = " ".join(html.title.text.strip().splitlines())
-        out = "Title: \x02{}\x02".format(title)
-        message(out, chan)
+
+        content = r.raw.read(MAX_RECV + 1, decode_content=True)
+        encoding = r.encoding
+
+        if len(content) > MAX_RECV:
+            return
+
+        html = BeautifulSoup(content, "lxml", from_encoding=encoding)
+        title = " ".join(html.title.text.strip().splitlines())
+        out = "Title: \x02{}\x02".format(title)
+        message(out, chan)
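
Note on the resulting pattern (commentary, not part of either commit): the series converges on a size-capped raw read followed by an encoding-aware parse. Below is a minimal standalone sketch of that pattern, assuming requests, beautifulsoup4, and lxml are installed; fetch_title and its None-on-failure convention are illustrative only, not part of the plugin.

    from contextlib import closing

    import requests
    from bs4 import BeautifulSoup

    MAX_RECV = 1000000  # same 1 MB cap the plugin uses


    def fetch_title(url, timeout=3):
        # Illustrative helper: mirrors print_url_title without the IRC plumbing.
        with closing(requests.get(url, stream=True, timeout=timeout)) as r:
            if not r.encoding:
                # No usable charset on the response; the plugin bails out here too
                return None
            # Read one byte past the cap so an oversized body is detectable
            content = r.raw.read(MAX_RECV + 1, decode_content=True)
            if len(content) > MAX_RECV:
                return None
            # from_encoding hands the server-declared charset to BeautifulSoup
            # instead of letting it guess from the raw bytes
            html = BeautifulSoup(content, "lxml", from_encoding=r.encoding)
            if html.title is None:
                # Extra guard not in the plugin: a page with no <title>
                # would otherwise raise AttributeError
                return None
            return " ".join(html.title.text.strip().splitlines())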
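Two design points worth calling out in the cleaned-up version: reading MAX_RECV + 1 bytes rather than MAX_RECV is what makes the oversize guard meaningful, since a body of exactly MAX_RECV bytes still parses while anything longer trips len(content) > MAX_RECV before BeautifulSoup sees it; and the scattered r.close() calls from the old code are gone because closing() already releases the connection on every exit path, including the early returns.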