Step_1.py
import string
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
# nltk.download('all')  # run once if a LookupError is raised for missing NLTK data

sample_text = "This is Surendra's with ?/// Punctuations ;' = + code running"
def preprocess(text):
    # Remove punctuation, character by character.
    text_without_punc = ""
    for char in text:
        if char not in string.punctuation:
            text_without_punc += char
    # Lowercase the cleaned string.
    text_without_punc = text_without_punc.lower()
    # Tokenize into words.
    token_list = nltk.word_tokenize(text_without_punc)
    # Drop English stopwords (loaded once into a set for speed).
    stop_words = set(stopwords.words("english"))
    tokens_without_sw = [word for word in token_list if word not in stop_words]
    # Lemmatize each remaining token (WordNetLemmatizer defaults to noun lemmas).
    lemmatizer = WordNetLemmatizer()
    lemma_tokens = [lemmatizer.lemmatize(token) for token in tokens_without_sw]
    return lemma_tokens

print(preprocess(sample_text))
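
The nltk.download('all') fallback mentioned in the comment works, but it fetches far more data than this script needs. A lighter setup sketch, assuming a reasonably recent NLTK release (older releases ship the tokenizer models as 'punkt', newer ones as 'punkt_tab'), is to download only the resources preprocess() actually uses:

import nltk

# Minimal one-time setup: fetch only the corpora/models used above.
# The resource names are assumptions about the installed NLTK version;
# requesting both tokenizer packages covers old and new releases.
for resource in ("punkt", "punkt_tab", "stopwords", "wordnet", "omw-1.4"):
    nltk.download(resource, quiet=True)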