-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathrunthis.py
67 lines (52 loc) · 2.54 KB
/
runthis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
from embeddings import WordEmbeddings, WhiskyEmbeddings
from dataclass import WhiskyClass as WC
import csv
# Read every row of the scraped reviews CSV into a list of lists.
# The first row is kept here; run_it_all() drops it via dat[1:]
# (presumably a header row -- confirm against reviews.csv).
# newline='' is required by the csv module so that line breaks embedded in
# quoted fields are handed to the parser intact instead of being translated.
with open('reviews.csv', 'rt', newline='') as rv:
    data = list(csv.reader(rv, delimiter=','))
def run_it_all(dat, tok, rm_s, size, window, skipgram, workers, min_count):
    """
    Build a WhiskyEmbeddings object from raw review rows in one go.

    The returned object offers helpers such as finding similar whiskies,
    describing a whisky and finding similar wordings ('synonyms') in the
    whisky-tasting vocabulary.

    The pipeline has three stages:
        1) Wrap each scraped review row in a well-structured object.
        2) Build a corpus from all reviews and train a whisky-specific
           word2vec model on it.
        3) Combine the reviews with the word embeddings to create whisky
           embeddings.

    The whole run takes approx. 30-60 seconds.

    :param dat: input data; the first element is skipped (presumably the
                CSV header row -- confirm against the caller)
    :param tok: (bool) use tokenize and gensim preprocessing or not?
    :param rm_s: (bool) remove stopwords or not?
    :param size: (int) the number of word2vec dimensions
    :param window: (int) window size of the context while training word2vec
    :param skipgram: (bool) use skipgram model or CBOW?
    :param workers: (int) number of workers to train word2vec
    :param min_count: (int) min. number of occurrences in corpus. Words with
                      fewer occurrences will be deleted.
    :return: (WhiskyEmbeddings instance)
    """
    # Stage 1: one review object per data row, leaving out the first row.
    reviews = [
        WC(row, tokenize=tok, rm_stopwords=rm_s)
        for row in dat[1:]
    ]

    # Stage 2: corpus construction and word2vec training.
    vectors = WordEmbeddings(reviews).train(
        size, window, skipgram, workers, min_count
    )

    # Stage 3: whisky embeddings derived from the reviews and word vectors.
    return WhiskyEmbeddings(reviews, vectors)
# Fit three different models and check out the differences between them.
we1 = run_it_all(data, tok=True, rm_s=True, size=50, window=3,
                 skipgram=True, workers=5, min_count=30)
we2 = run_it_all(data, tok=True, rm_s=True, size=100, window=3,
                 skipgram=True, workers=5, min_count=30)
we3 = run_it_all(data, tok=False, rm_s=True, size=100, window=3,
                 skipgram=False, workers=5, min_count=50)

# NOTE(review): the results below are neither printed nor stored; presumably
# this script is meant to be stepped through in an interactive session.

# Same whisky queried against each model. The second call previously queried
# we3 twice, so we2 was never compared.
we1.most_similar_whiskies('makers mark')
we2.most_similar_whiskies('makers mark')
we3.most_similar_whiskies('makers mark')

# Same whisky described by each model.
we1.describe_whisky('lagavulin 16', n=20)
we2.describe_whisky('lagavulin 16', n=20)
we3.describe_whisky('lagavulin 16', n=20)