Skip to content

Commit

Permalink
Clean up link announcer
Browse files Browse the repository at this point in the history
  • Loading branch information
linuxdaemon committed Nov 3, 2017
1 parent e8572d7 commit 53a5e94
Showing 1 changed file with 22 additions and 40 deletions.
62 changes: 22 additions & 40 deletions plugins/link_announcer.py
Original file line number Diff line number Diff line change
@@ -1,59 +1,41 @@
import requests
import re
from bs4 import BeautifulSoup
from contextlib import closing
from cloudbot import hook

import requests
from bs4 import BeautifulSoup

from cloudbot import hook
from cloudbot.hook import Priority, Action

# This will match any URL, blacklist removed and abstracted to a priority/halting system
url_re = re.compile(r'https?://(?:[a-zA-Z]|[0-9]|[$-_@.&+~]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', re.I)

# Channels in which URL titles should not be announced.
# NOTE(review): always empty here — presumably meant to be populated from
# config or at runtime; confirm against the rest of the bot.
opt_out = []

# (factor, suffix) pairs for binary (1024-based) size units, ordered
# largest-first so the first factor <= the value is the right unit.
traditional = [
    (1024 ** 5, 'PB'),
    (1024 ** 4, 'TB'),
    (1024 ** 3, 'GB'),
    (1024 ** 2, 'MB'),
    (1024 ** 1, 'KB'),
    (1024 ** 0, 'B'),
]
# HTTP headers sent with every title fetch; the browser-like User-Agent
# helps avoid bot-blocking on some sites.
HEADERS = {
    'Accept-Language': 'en-US,en;q=0.5',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36'
}


def bytesto(bytes, system=None):
    """Convert a byte count to a human-readable size string.

    Args:
        bytes: The size in bytes (anything ``int()`` accepts).
        system: Optional list of ``(factor, suffix)`` pairs, ordered
            largest factor first. Defaults to binary (1024-based) units.

    Returns:
        The truncated amount followed by its unit suffix, e.g. ``'3MB'``.
    """
    if system is None:
        # Default table lives inside the function so the default argument
        # is not a shared mutable module-level list.
        system = [
            (1024 ** 5, 'PB'),
            (1024 ** 4, 'TB'),
            (1024 ** 3, 'GB'),
            (1024 ** 2, 'MB'),
            (1024 ** 1, 'KB'),
            (1024 ** 0, 'B'),
        ]
    bytes = int(bytes)
    # Pick the largest factor that fits; if none fits (value < 1) the loop
    # falls through with the smallest unit ('B'), matching the original.
    for factor, suffix in system:
        if bytes >= factor:
            break
    # Truncate rather than round, preserving the original behaviour.
    amount = int(bytes / factor)
    return str(amount) + suffix


# Maximum number of response bytes read when fetching a page title.
MAX_RECV = 1000000


@hook.regex(url_re, priority=Priority.LOW, action=Action.HALTTYPE, only_no_match=True)
def print_url_title(message, match, chan):
    """Fetch the page behind a posted URL and announce its <title>.

    Args:
        message: Callable used to send the reply to the channel.
        match: Regex match for url_re; ``match.group()`` is the full URL.
        chan: Name of the channel the URL was posted in.
    """
    if chan in opt_out:
        return

    # Stream the response so we can bail out early on non-text or
    # oversized content; closing() releases the connection in all paths,
    # so no explicit r.close() calls are needed.
    with closing(requests.get(match.group(), headers=HEADERS, stream=True, timeout=3)) as r:
        if not r.encoding:
            # No text encoding advertised -> almost certainly not HTML.
            return

        # Read one byte past the limit so oversized pages are detectable.
        content = r.raw.read(MAX_RECV + 1, decode_content=True)
        encoding = r.encoding

        if len(content) > MAX_RECV:
            return

        html = BeautifulSoup(content, "lxml", from_encoding=encoding)

        # Guard against pages with no <title> tag, which would otherwise
        # raise AttributeError on html.title.text.
        if html.title is None:
            return

        title = " ".join(html.title.text.strip().splitlines())
        out = "Title: \x02{}\x02".format(title)
        message(out, chan)

0 comments on commit 53a5e94

Please sign in to comment.