Skip to content

Commit

Permalink
Merge pull request CloudBotIRC#210 from linuxdaemon/gonzobot+link-announce-emoji
Browse files Browse the repository at this point in the history

Fix decoding page title in link announcer
  • Loading branch information
edwardslabs authored Nov 14, 2017
2 parents 5fa9847 + 53a5e94 commit 0c6c16b
Showing 1 changed file with 22 additions and 40 deletions.
62 changes: 22 additions & 40 deletions plugins/link_announcer.py
Original file line number Diff line number Diff line change
@@ -1,59 +1,41 @@
import requests
import re
from bs4 import BeautifulSoup
from contextlib import closing
from cloudbot import hook

import requests
from bs4 import BeautifulSoup

from cloudbot import hook
from cloudbot.hook import Priority, Action

# This will match any URL, blacklist removed and abstracted to a priority/halting system
url_re = re.compile(r'https?://(?:[a-zA-Z]|[0-9]|[$-_@.&+~]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', re.I)

# Channel names that have opted out of link announcing; checked before any fetch.
opt_out = []

# Byte-size thresholds paired with their unit suffixes, largest first.
# bytesto() scans this table top-down and uses the first threshold that fits.
traditional = [
    (1024 ** 5, 'PB'),
    (1024 ** 4, 'TB'),
    (1024 ** 3, 'GB'),
    (1024 ** 2, 'MB'),
    (1024 ** 1, 'KB'),
    (1024 ** 0, 'B'),
]

# Headers sent with every page fetch: prefer English content and present a
# mainstream desktop browser user agent so sites serve normal HTML.
HEADERS = {
    'Accept-Language': 'en-US,en;q=0.5',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36'
}


def bytesto(bytes, system=traditional):
    """Render a byte count as a human-readable string, e.g. 1536 -> '1KB'."""
    amount = int(bytes)
    # Pick the largest unit whose threshold fits; fall back to the table's
    # last (smallest) entry, mirroring the original loop's fall-through.
    factor, suffix = next(
        ((f, s) for f, s in system if amount >= f),
        system[-1],
    )
    return str(int(amount / factor)) + suffix


MAX_RECV = 1000000  # cap on how many response bytes we will read per page


@hook.regex(url_re, priority=Priority.LOW, action=Action.HALTTYPE, only_no_match=True)
def print_url_title(message, match, chan):
    """Fetch the URL matched in chat and announce its HTML <title> to the channel.

    Skips channels listed in opt_out, responses without a declared text
    encoding (likely binary content), pages larger than MAX_RECV bytes,
    and pages that have no <title> element.
    """
    if chan in opt_out:
        return

    # Stream the response so oversized pages can be abandoned early;
    # closing() guarantees the connection is released on every exit path.
    with closing(requests.get(match.group(), headers=HEADERS, stream=True, timeout=3)) as r:
        if not r.encoding:
            # No declared encoding -> probably not an HTML page; skip it.
            return

        # Read one byte past the limit so "exactly MAX_RECV" can be told
        # apart from "too large".
        content = r.raw.read(MAX_RECV + 1, decode_content=True)
        encoding = r.encoding

        if len(content) > MAX_RECV:
            return

        html = BeautifulSoup(content, "lxml", from_encoding=encoding)
        if html.title is None:
            # Page has no <title> tag; nothing to announce.
            return
        title = " ".join(html.title.text.strip().splitlines())
        out = "Title: \x02{}\x02".format(title)
        message(out, chan)

0 comments on commit 0c6c16b

Please sign in to comment.