tweet_scorer.py
#!/usr/bin/env python3
import glob
import json
def hasKeywords(text, keys):
    """
    Checks a string for certain keywords. Returns True if ANY are found.

    Arguments:
    ---------------
    text - The string to be checked
    keys - a list of keywords for which text will be searched

    Returns:
    ---------------
    True - if ANY of the strings in keys are found within text
    False - if NONE of the strings in keys are found within text

    Example:
    ---------------
    ex_string = "the quick brown fox jumped over the lazy dog"
    hasKeywords(ex_string, ["brown", "elephant"]) --> True
    hasKeywords(ex_string, ["BROWN", "elephant"]) --> False
    hasKeywords(ex_string, ["cat", "pear"]) --> False
    """
    for key in keys:
        #str.find returns -1 when the key is absent; any hit means the text matches
        if text.find(key) != -1:
            return True
    return False
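
#note: hasKeywords uses str.find, so matching is case-sensitive and substring-based;
#e.g. hasKeywords("REPUBLICANS RULE", ["REPUBLICAN"]) --> True, which is why the main
#script below upper-cases tweet text before calling it with the all-caps keyword lists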
def getSentiments(file):
    """
    Reads in a file of word sentiment scores and stores them in a dictionary

    Arguments:
    ---------------
    file - The filepath to a csv containing words and sentiments in [word, sentiment] format.
           Assumes that data will be a list of strings, and that the "\n" character is present.

    Returns:
    ---------------
    sentiments - A dictionary where the words from file are keys, and the scores are values.
                 NOTE: all word keys are stored in all caps ("hey" --> "HEY") to avoid mismatches
                 on capitalization

    Example:
    ---------------
    path = "example.csv"
    sentiments = getSentiments(path)
    print(sentiments["HELLO"]) --> 0.5
    """
    sentiments = {}
    #the with block makes sure the file is closed when done reading
    with open(file, "r") as sentiments_file:
        for line in sentiments_file:
            s = line.split(",")
            #strip punctuation from the word, uppercase it, and parse the score as a float
            sentiments[s[0].strip(".,:;'-!?").upper()] = float(s[1].strip("\n"))
    return sentiments
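
#a minimal sketch of the assumed sentiments.csv layout (one [word, sentiment] pair per line),
#with hypothetical values:
#  good,0.4
#  bad,-0.5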
def score(tweet, sentiments, sign=1):
    """
    Takes a list of words, assigns a score to all words in the list which have sentiment scores,
    and applies an optional sign so that two sides of a spectrum can be compared (for example,
    Republican-focused tweets vs. Democratic-focused tweets).

    Arguments:
    ---------------
    tweet - A list of words to be scored (called tweet because this is usually a split tweet)
    sentiments - A dictionary keyed by word with sentiment scores as values (sentiments["hello"] --> 0.5)
    sign - optional value if intending to compare two things on a spectrum (Republican <--> Democratic)

    Returns:
    ---------------
    score - A float containing the sum total of word sentiment scores for the list overall.

    Example:
    ---------------
    tweet_1 = ["YOU", "ARE", "A", "BAD", "DOG"]
    tweet_2 = ["YOU", "ARE", "A", "BAD", "BAD", "DOG"]
    tweet_3 = ["YOU", "ARE", "A", "GOOD", "DOG"]
    sentiments = getSentiments("example.csv")
    print(score(tweet_1, sentiments)) --> -0.5
    print(score(tweet_2, sentiments)) --> -1.0
    print(score(tweet_3, sentiments)) --> 0.4
    print(score(tweet_3, sentiments, sign=-1)) --> -0.4
    """
    total = 0.0
    for word in tweet:
        try:
            total = total + sign*sentiments[word]
        except KeyError:
            #words without a sentiment score contribute nothing
            pass
    return total
#grab all the json files which store tweets. These are not all the same size, but that is fine
files = glob.glob("*.json")
#remove the file which is active currently while we continue to collect tweets
#files.remove("tweets_7.json")
#set up two lists of keywords. "Left" corresponds to keywords focused on the democratic party candidate, "Right" the republican
leftKeys = ["DEMOCRAT", "DNC", "BIDEN", "@JOEBIDEN", "KAMALA"]
rightKeys = ["REPUBLICAN","TRUMP", "@REALDONALDTRUMP", "POTUS", "PENCE", "RNC"]
#import the sentiment word scores
sentiments = getSentiments("sentiments.csv")
#make a csv file to save the scores, times, and locations of all tweets, then add the header
write_to = open("scored_tweets.csv", "x")
write_to.write("Score,Time,Latitude,Longitude\n")
#score all tweets file by file (each line in the files is a separate json formatted tweet gathered from twitter)
for file in files:
    f = open(file, "r")
    for line in f:
        data = json.loads(line)
        #store all the components of the json line in variables. Note that .upper() is used to match the format of sentiments
        text = data["text"].upper()
        time = data["time"]
        latitude = data["Latitude"]
        longitude = data["Longitude"]
        #check for left or right keywords and assign the appropriate sign
        if hasKeywords(text, rightKeys):
            split_text = text.split(" ")
            s = score(split_text, sentiments, sign=-1)
            if s != 0:
                write_to.write(str(s) + "," + str(time) + "," + str(latitude) + "," + str(longitude) + "\n")
        elif hasKeywords(text, leftKeys):
            split_text = text.split(" ")
            s = score(split_text, sentiments)
            if s != 0:
                write_to.write(str(s) + "," + str(time) + "," + str(latitude) + "," + str(longitude) + "\n")
    #close the current tweet file before moving on to the next one
    f.close()
#close the output file now that every tweet has been scored
write_to.close()