metrics.py
import urllib
import re
from pattern.web import Wikipedia, Google, NEWS, Newsfeed
from pattern.en import split, parse, wordnet

def sentiment(content):
    """Score the sentiment of a text by summing weighted word-level sentiment."""
    if len(wordnet.sentiment) == 0:
        wordnet.sentiment.load()
    relevant_types = ['JJ', 'VB', 'RB']  # adjectives, verbs, adverbs
    score = 0
    sentences = split(parse(content))
    for sentence in sentences:
        for index, word in enumerate(sentence.words):
            if word.string != '' and word.type in relevant_types:
                try:
                    synset = wordnet.synsets(word.string, word.type)
                except KeyError:
                    # incorrect part-of-speech tag or word not in WordNet; skip it
                    continue
                if not synset:
                    # no synsets returned for this word/POS pair; skip it
                    continue
                pos, neg, obj = synset[0].weight
                # weights concluding statements more heavily
                # (idea from [Ohana, Tierney '09])
                documentpos = index / float(len(sentence.words))
                # weights more subjective statements more heavily
                subjscore = (pos - neg) * (1 - obj)
                score = score + subjscore * documentpos
    return score
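
# Rough intuition for the score above (restating the loop, not new math): each
# adjective/verb/adverb contributes (pos - neg) * (1 - obj), scaled by its
# relative position index / len(sentence.words), so subjective words near the
# end of a sentence move the total the most. Illustrative values (not real
# SentiWordNet weights): a word with pos=0.5, neg=0.125, obj=0.375 at index 6
# of an 8-word sentence adds (0.5 - 0.125) * (1 - 0.375) * (6 / 8.0) ~= 0.176.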

def heuristic_scrape(article):
    """Heuristically extract article text: the plaintext of every <p> tag."""
    from pattern.web import URL, Document, HTTP404NotFound, URLError, plaintext
    try:
        content = URL(article).download(timeout=120)
    except (URLError, HTTP404NotFound):
        print "Error downloading", article
        return None
    dom = Document(content)
    text = ''
    for node in dom.by_tag('p'):
        for c in node:
            if c.type == 'text':
                text = text + ' ' + plaintext(c.source())
    return text.strip()

## Wikipedia, the poor man's ontology
def isnews(topic):
    """Guess whether a topic is newsworthy from its Wikipedia categories."""
    engine = Wikipedia()
    result = engine.search(topic)
    if not result:
        return False
    if topic.lower() not in result.title.lower():
        return False
    newsthings = ['places', 'cities', 'capitals', 'countries', 'people', 'wars']
    for category in result.categories:
        for thing in newsthings:
            if thing in category.lower():
                return True
    return False

def gnews_hits(topic):
    """Total number of Google News results for a topic."""
    engine = Google()
    results = engine.search(topic, type=NEWS)
    return results.total

def gnews_polarity(topic):
    """Average sentiment of the Google News articles we can scrape for a topic."""
    engine = Google()
    results = engine.search(topic, type=NEWS)
    # only 8 results without using paging/cursor
    score = 0
    scored = 0
    for result in results:
        content = heuristic_scrape(urllib.unquote(result.url))
        if content:
            score = score + sentiment(content)
            scored = scored + 1
    if scored == 0:
        return 0
    return score / float(scored)  # avg sentiment over the articles that scraped cleanly
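
# Minimal usage sketch. The __main__ guard and the topic string are additions
# for illustration, not part of the original module; running it requires the
# pattern library plus network access for the Wikipedia and Google News queries.
if __name__ == '__main__':
    topic = 'example topic'  # hypothetical query
    if isnews(topic):
        print topic, 'looks newsworthy'
        print 'Google News hits:', gnews_hits(topic)
        print 'Average news sentiment:', gnews_polarity(topic)
    else:
        print topic, 'does not look like a news topic'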