-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtext.py
41 lines (30 loc) · 1.15 KB
/
text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import gensim
import nltk
# The 'english' stemmer is better than the original 'porter' stemmer.
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.corpus import wordnet
import numpy as np
# Shared NLP resources, built once at import time so every call reuses them.
# English Snowball stemmer (see comment above: preferred over the plain Porter stemmer).
stemmer = SnowballStemmer("english")
# WordNet lemmatizer used by lemmatize_stemming() below.
lemmatizer = WordNetLemmatizer()
def get_wordnet_pos(word):
    """Return the WordNet POS constant that lemmatize() accepts for *word*.

    The word is POS-tagged in isolation with nltk.pos_tag; the first
    letter of the resulting Penn Treebank tag selects the WordNet POS.
    Anything unrecognized falls back to NOUN.
    """
    treebank_initial = nltk.pos_tag([word])[0][1][0].upper()
    if treebank_initial == "J":
        return wordnet.ADJ
    if treebank_initial == "V":
        return wordnet.VERB
    if treebank_initial == "R":
        return wordnet.ADV
    return wordnet.NOUN
def lemmatize_stemming(text):
    """Lemmatize *text* using its inferred part of speech.

    Only lemmatization is applied; the stemming step was abandoned
    because the stemmer produced too many invalid words.
    """
    pos = get_wordnet_pos(text)
    return lemmatizer.lemmatize(text, pos=pos)
# Tokenize and lemmatize
def preprocess(text):
    """Tokenize *text* and return its informative tokens, lemmatized.

    gensim's simple_preprocess() lowercases and tokenizes the input
    (with optional de-accenting). Tokens that are gensim stop words or
    4 characters or shorter are discarded; the rest are lemmatized via
    lemmatize_stemming().

    Returns a list of lemmatized token strings.
    """
    # Hoist the attribute-chain lookup out of the loop; STOPWORDS is a
    # frozenset, so membership tests are O(1).
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    # Cheap length check first, then the set lookup (same net filter as
    # the original append loop, expressed as a comprehension).
    return [
        lemmatize_stemming(token)
        for token in gensim.utils.simple_preprocess(text)
        if len(token) > 3 and token not in stopwords
    ]