-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathassn1_imdb_nbayes.jl
144 lines (125 loc) · 3.93 KB
/
assn1_imdb_nbayes.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
# NOTE: The entire script takes about 6-7 minutes on its first run and less than a minute on subsequent runs.
# List the review file names of every data split; the names are iterated over later.
# The directories are resolved relative to the current working directory, and `joinpath`
# is used so that the platform's own path separator is applied.
trn_pos = readdir(joinpath(pwd(), "train", "pos"))
trn_neg = readdir(joinpath(pwd(), "train", "neg"))
test_pos = readdir(joinpath(pwd(), "test", "pos"))
test_neg = readdir(joinpath(pwd(), "test", "neg"))
"""
    id_count(id, count)

Value type stored in the vocabulary dictionary.

# Fields
- `id`: integer id assigned to the word when it is first registered (currently unused).
- `count`: integer array holding the word's frequency per class; the script indexes it
  with `POS` and `NEG`. The struct is immutable, but the array itself is updated in place.

The fields are concretely typed (the array type is a type parameter) so that accesses to
`count` do not go through `Any`.
"""
struct id_count{C<:AbstractArray{<:Integer}}
    id::Int
    count::C
end
"""
    str_process_pad(review, fix_size)

Turn the raw text of a review into exactly `fix_size` tokens.

The text is lowercased, HTML line breaks (`<br />`) are replaced by a space so that the
words on either side of a break stay separate tokens, the punctuation characters
`, . : ; ? ! ( )` and double quotes are removed, and the result is split on whitespace.
Reviews longer than `fix_size` tokens are cropped; shorter ones are right-padded with
the `"<pad>"` token.

# Arguments
- `review`: raw review text.
- `fix_size`: number of tokens to return.

# Returns
A `Vector{String}` of length `fix_size`.
"""
function str_process_pad(review, fix_size)
    text = lowercase(review)
    # Replace (rather than delete) line breaks: deleting them would fuse the last word
    # before the break with the first word after it.
    text = replace(text, "<br />" => " ")
    text = replace(text, r"""[,.:;?!()]""" => "")
    text = replace(text, '"' => "")
    # Materialise the tokens as `String`s so the result has one concrete element type
    # regardless of whether padding is appended.
    tokens = String[token for token in split(text)]
    num_tokens = length(tokens)
    if num_tokens >= fix_size
        return tokens[1:fix_size]
    end
    return append!(tokens, fill("<pad>", fix_size - num_tokens))
end
# Vocabulary: maps each word to its `id_count` entry (id plus per-class frequencies).
wdict = Dict{String,id_count}()

# Return the vocabulary entry for `x`, registering it first when it is not yet known.
# The do-block form of `get!` builds the new entry (and its count array) only when the
# word is actually missing, instead of on every lookup.
w2idcount(x) = get!(wdict, x) do
    id_count(1 + length(wdict), [0 0])
end

# Dummy placeholders for unknown words and end-of-sentence padding
UNK = w2idcount("<unk>")
PAD = w2idcount("<pad>")

# Constants: the fixed review length (in tokens), the minimum number of appearances a
# word needs to stay in the vocabulary, and the indices of the two classes inside the
# `count` array of an entry.
fix_size = 300
min_num_appearences = 5
POS = 1
NEG = 2
"""
    count_reviews!(dir, file_names, label)

Read every file of `file_names` inside directory `dir`, tokenize it with
`str_process_pad` (using the global `fix_size`) and add one to the `label` slot of the
vocabulary entry of every token. Tokens that are not in the vocabulary yet are
registered through `w2idcount`, which returns the (possibly new) entry.

# Arguments
- `dir`: directory that contains the review files.
- `file_names`: names of the review files inside `dir`.
- `label`: index into the `count` array of an entry (`POS` or `NEG`).
"""
function count_reviews!(dir, file_names, label)
    for file_name in file_names
        raw_text = read(joinpath(dir, file_name), String)
        for token in str_process_pad(raw_text, fix_size)
            w2idcount(token).count[label] += 1
        end
    end
    return nothing
end

# Read and process the training reviews and count their words for further training.
count_reviews!(joinpath(pwd(), "train", "pos"), trn_pos, POS)
count_reviews!(joinpath(pwd(), "train", "neg"), trn_neg, NEG)
# Drop the words seen fewer than `min_num_appearences` times during training and add
# their counts to the "<unk>" entry. The rare keys are collected before anything is
# deleted so the dictionary is not modified while it is being iterated, and "<unk>"
# itself is never dropped because every later lookup of an unknown word falls back to it.
rare_words = [w for (w, c) in wdict if w != "<unk>" && sum(c.count) < min_num_appearences]
for word in rare_words
    dropped = pop!(wdict, word)
    wdict["<unk>"].count[POS] += dropped.count[POS]
    wdict["<unk>"].count[NEG] += dropped.count[NEG]
end

"""
    pred_review(review, word_freq)

Classify a tokenized review as `"Positive"` or `"Negative"` with naive Bayes.

Log probabilities are summed instead of multiplying raw probabilities, to avoid the
problems that arise from multiplying many small numbers. Word likelihoods use add-one
smoothing: one is added to each word count and the vocabulary size is added to the
class token total, so the smoothed likelihoods of a class sum to one.

Reads the globals `trn_pos`, `trn_neg`, `fix_size`, `POS` and `NEG`.

# Arguments
- `review`: iterable of tokens, as returned by `str_process_pad`.
- `word_freq`: vocabulary dictionary mapping a word to an entry with a `count` field;
  it must contain the key `"<unk>"`, which is used for words it does not know.

# Returns
`"Positive"` when the positive log probability is greater than or equal to the negative
one, `"Negative"` otherwise.
"""
function pred_review(review, word_freq)
    num_pos_docs = length(trn_pos)
    num_neg_docs = length(trn_neg)
    num_docs = num_pos_docs + num_neg_docs
    # Start from the class priors.
    logprob_pos = log(num_pos_docs / num_docs)
    logprob_neg = log(num_neg_docs / num_docs)
    # Every training review holds exactly `fix_size` tokens, which gives the class token
    # totals; the vocabulary size accounts for the +1 given to every word.
    vocab_size = length(word_freq)
    denom_pos = fix_size * num_pos_docs + vocab_size
    denom_neg = fix_size * num_neg_docs + vocab_size
    unknown = word_freq["<unk>"]
    for word in review
        counts = get(word_freq, word, unknown).count
        logprob_pos += log((counts[POS] + 1) / denom_pos)
        logprob_neg += log((counts[NEG] + 1) / denom_neg)
    end
    return logprob_pos >= logprob_neg ? "Positive" : "Negative"
end

"""
    count_positive_preds(dir, file_names)

Read and tokenize every file of `file_names` inside directory `dir` and return how many
of them `pred_review` labels `"Positive"` using the global vocabulary `wdict`.
"""
function count_positive_preds(dir, file_names)
    return count(file_names) do file_name
        raw_text = read(joinpath(dir, file_name), String)
        pred_review(str_process_pad(raw_text, fix_size), wdict) == "Positive"
    end
end

# Read, process, predict and evaluate the predictions for the test set
# (about 3 minutes according to the original note).
# The tallies are computed inside a function and assigned once at top level, so they are
# ordinary globals for the metric computation that follows.
# Positive test samples
TP = count_positive_preds(joinpath(pwd(), "test", "pos"), test_pos)
FN = length(test_pos) - TP
# Negative test samples
FP = count_positive_preds(joinpath(pwd(), "test", "neg"), test_neg)
TN = length(test_neg) - FP
# Summarise the test-set performance from the confusion-matrix tallies.
prec = TP / sum((TP, FP))    # share of predicted positives that are truly positive
rec = TP / sum((TP, FN))     # share of truly positive reviews that were recognised
acc = sum((TP, TN)) / sum((TP, TN, FP, FN))    # share of all reviews labelled correctly
println("Test accuracy: ", acc)
println("Test precision: ", prec)
# The last line is written without a trailing newline.
print("Test recall: ", rec)