Skip to content

Commit

Permalink
Merge pull request CloudBotIRC#210 from linuxdaemon/gonzobot+link-announce-emoji
Browse files Browse the repository at this point in the history

Fix decoding page title in link announcer
  • Loading branch information
edwardslabs authored Nov 14, 2017
2 parents 5fa9847 + 53a5e94 commit 0c6c16b
Showing 1 changed file with 22 additions and 40 deletions.
62 changes: 22 additions & 40 deletions plugins/link_announcer.py
Original file line number Diff line number Diff line change
@@ -1,59 +1,41 @@
import requests
import re
from bs4 import BeautifulSoup
from contextlib import closing
from cloudbot import hook

import requests
from bs4 import BeautifulSoup

from cloudbot import hook
from cloudbot.hook import Priority, Action

# This will match any URL, blacklist removed and abstracted to a priority/halting system
url_re = re.compile(r'https?://(?:[a-zA-Z]|[0-9]|[$-_@.&+~]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', re.I)

# Channel names that have opted out of link announcing; checked before any fetch.
opt_out = []

# Byte-size thresholds paired with their unit suffixes, largest first.
# bytesto() scans this table top-down and uses the first threshold that fits.
traditional = [
    (1024 ** 5, 'PB'),
    (1024 ** 4, 'TB'),
    (1024 ** 3, 'GB'),
    (1024 ** 2, 'MB'),
    (1024 ** 1, 'KB'),
    (1024 ** 0, 'B'),
]

# Headers sent with every page fetch: prefer English content and present a
# mainstream desktop browser user agent so sites serve normal HTML.
HEADERS = {
    'Accept-Language': 'en-US,en;q=0.5',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36'
}


def bytesto(bytes, system=traditional):
    """Render a byte count as a human-readable string, e.g. 1536 -> '1KB'."""
    amount = int(bytes)
    # Pick the largest unit whose threshold fits; fall back to the table's
    # last (smallest) entry, mirroring the original loop's fall-through.
    factor, suffix = next(
        ((f, s) for f, s in system if amount >= f),
        system[-1],
    )
    return str(int(amount / factor)) + suffix


MAX_RECV = 1000000  # cap on how many response bytes we will read per page


@hook.regex(url_re, priority=Priority.LOW, action=Action.HALTTYPE, only_no_match=True)
def print_url_title(message, match, chan):
    """Fetch the URL matched in chat and announce its HTML <title> to the channel.

    Skips channels listed in opt_out, responses without a declared text
    encoding (likely binary content), pages larger than MAX_RECV bytes,
    and pages that have no <title> element.
    """
    if chan in opt_out:
        return

    # Stream the response so oversized pages can be abandoned early;
    # closing() guarantees the connection is released on every exit path.
    with closing(requests.get(match.group(), headers=HEADERS, stream=True, timeout=3)) as r:
        if not r.encoding:
            # No declared encoding -> probably not an HTML page; skip it.
            return

        # Read one byte past the limit so "exactly MAX_RECV" can be told
        # apart from "too large".
        content = r.raw.read(MAX_RECV + 1, decode_content=True)
        encoding = r.encoding

        if len(content) > MAX_RECV:
            return

        html = BeautifulSoup(content, "lxml", from_encoding=encoding)
        if html.title is None:
            # Page has no <title> tag; nothing to announce.
            return
        title = " ".join(html.title.text.strip().splitlines())
        out = "Title: \x02{}\x02".format(title)
        message(out, chan)

0 comments on commit 0c6c16b

Please sign in to comment.