-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocess_text.py
40 lines (33 loc) · 1.08 KB
/
preprocess_text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import ray
import re
import nltk
from NLPyPort.FullPipeline import *
# Options handed to NLPyPort's ``new_full_pipe`` by ``preprocess``.
# NOTE(review): the meaning of each flag is defined by NLPyPort, not by this
# file; the grouping comments below are inferred from the key names only --
# confirm against the NLPyPort documentation.
NLPYPORT_OPTIONS = {
    # Stages switched on.
    "tokenizer": True,
    "pos_tagger": True,
    "lemmatizer": True,
    # Stages switched off.
    "entity_recognition": False,
    "np_chunking": False,
    # Loading / input-format switches.
    "pre_load": False,
    "string_or_array": True,
}
def preprocess_texts(texts):
    """Preprocess every text in *texts* using parallel Ray tasks.

    Initialises Ray with 4 CPUs (``ignore_reinit_error=True`` makes a repeated
    call tolerate an already-initialised runtime), downloads the NLTK
    resources listed below, submits one ``preprocess`` task per text and
    blocks until all of them have finished.

    Args:
        texts: Iterable of raw text strings.

    Returns:
        A list with one preprocessed string per input text, in input order.
    """
    ray.init(num_cpus=4, ignore_reinit_error=True)

    # 'stopwords' is read by ``preprocess``; the other resources are
    # presumably required by NLPyPort itself -- TODO confirm.
    for resource in ('stopwords', 'punkt', 'floresta', 'rslp'):
        nltk.download(resource)

    # Submit everything first so the tasks can run concurrently ...
    pending = [preprocess.remote(text) for text in texts]
    # ... then fetch all results with a single batched call; ``ray.get``
    # accepts a list of object refs and returns the values in the same order.
    return ray.get(pending)
@ray.remote
def preprocess(text):
    """Lemmatise *text* with NLPyPort and return the cleaned lemmas as a string.

    This is a Ray remote function: invoke it as ``preprocess.remote(text)``
    and resolve the returned object ref with ``ray.get``.

    A lemma is discarded when it is:
      * the ``'EOS'`` marker or an empty string,
      * an NLTK Portuguese stopword,
      * a token whose first character is not alphanumeric (e.g. punctuation),
      * a token containing any digit.

    Args:
        text: Raw input text.

    Returns:
        The surviving lemmas joined by single spaces (``''`` if none survive).
    """
    doc = new_full_pipe(text, options=NLPYPORT_OPTIONS)

    # Build the stopword set once per call. Evaluating
    # ``stopwords.words(...)`` inside the filter re-read the corpus and did a
    # linear list scan for every single token.
    stopwords = frozenset(nltk.corpus.stopwords.words('portuguese'))

    kept = []
    for lemma in doc.lemas:
        if lemma in ('EOS', ''):
            continue
        if lemma in stopwords:
            continue
        # ``[\W_]`` is the Unicode-aware form of "not alphanumeric". The
        # previous ASCII-only class ``[^A-Za-z0-9]`` also matched accented
        # letters, so Portuguese words such as "água" or "época" were
        # silently dropped. ASCII-initial tokens are treated exactly as before.
        if re.match(r'[\W_]', lemma):
            continue
        if any(char.isdigit() for char in lemma):
            continue
        kept.append(lemma)

    return ' '.join(kept)