train_model.py
import json
import numpy as np
import pandas as pd
import spacy
from spacy.tokens import DocBin
from sklearn.ensemble import RandomForestClassifier
from joblib import dump
from sklearn.model_selection import GridSearchCV
nlp = spacy.load("en_core_web_lg")


# Helper functions
def lemmatize_text(text, preprocessed=True):
    return process_text(text, "lemmatize", preprocessed)


def tokenize_text(text, preprocessed=True):
    return process_text(text, "tokenize", preprocessed)


def process_text(text, mode: str, preprocessed=True):
    if not preprocessed:
        text = nlp(text)
    if mode == "tokenize":
        # tokenize and embed must use the same processing; spaCy provides embeddings for punctuation
        processed_text = [token.text for token in text]
    elif mode == "embed":
        # tokenize and embed must use the same processing
        processed_text = [token.vector for token in text]
    elif mode == "lemmatize":
        processed_text = [token.lemma_ for token in text
                          if not token.is_punct and not token.is_space
                          and not token.like_url and not token.like_email]
    else:
        raise ValueError("Mode not supported")
    return processed_text


def save_preprocessed(raw_text, save_path):
    doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=True)
    for doc in nlp.pipe(raw_text):
        doc_bin.add(doc)
    # save the DocBin to a file on disk
    doc_bin.to_disk(save_path)


if __name__ == '__main__':
    # Load and preprocess data using spaCy
    with open('intents.json') as data_file:
        intents = json.load(data_file)
    data = []
    for intent in intents['intents']:
        for pattern in intent['patterns']:
            data.append([pattern, intent['tag']])
    df_json = pd.DataFrame(data, columns=['text', 'intent'])
    df_csv = pd.read_csv("sentences/full.csv")
    df = pd.concat([df_json, df_csv], axis=0)

    file_name_spacy = 'preprocessed_dataset_chatbot.spacy'
    save_preprocessed(raw_text=df["text"], save_path=file_name_spacy)

    # Load the DocBin at a later time or on a different system from disk or a bytes object
    doc_bin = DocBin().from_disk(file_name_spacy)
    df["doc"] = list(doc_bin.get_docs(nlp.vocab))

    X_train = df["doc"].reset_index(drop=True)
    y_train = df["intent"].reset_index(drop=True)

    # Embed each document, then average its token vectors into one fixed-length feature vector
    X_train_embedded = df["doc"].apply(process_text, args=("embed", True))
    X_train_embedded_avg = X_train_embedded.apply(np.mean, axis=0).apply(pd.Series)

    # Grid-search a random forest over the averaged embeddings
    clf = RandomForestClassifier()
    params_grid = {
        'n_estimators': [200],
        'max_features': [None, 'sqrt', 'log2'],
        'max_depth': [8, 10, 12],
        'criterion': ['gini', 'entropy', 'log_loss']
    }
    gs = GridSearchCV(clf, params_grid, cv=3, n_jobs=1,
                      refit="balanced_accuracy",
                      scoring=["balanced_accuracy", "f1_macro"],
                      verbose=2)
    gs.fit(X_train_embedded_avg, y=y_train)
    print(gs.best_params_)
    print(gs.best_score_)
    dump(gs.best_estimator_, filename="clf_chatbot")
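
    # Usage sketch (not part of training): how the saved classifier could be loaded
    # elsewhere and applied to a new utterance. This assumes the same spaCy model and
    # the same averaged-embedding features built above; the example sentence and the
    # "clf_chatbot" path simply mirror this script and can be adjusted.
    #
    #   from joblib import load
    #   clf = load("clf_chatbot")
    #   doc = nlp("what are your opening hours?")
    #   features = np.mean([token.vector for token in doc], axis=0).reshape(1, -1)
    #   print(clf.predict(features))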