preprocessing.py
#!/usr/bin/env python3
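"""Preprocessing helpers for Spanish text: tokenization, punctuation and
stopword removal, lemmatization via spaCy, and a rank lookup over a
tab-separated word-frequency list."""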
import re

import nltk
from nltk.corpus import stopwords
import spacy

class FilePreprocesser:
    def __init__(self):
        # Load the Spanish stopword list, downloading it on first use.
        try:
            self.stopwords = stopwords.words('spanish')
        except LookupError:
            nltk.download('stopwords')
            self.stopwords = stopwords.words('spanish')
        # The spaCy model is loaded lazily by lemmatize(), so importing
        # this module stays cheap.
        self.nlp = None

    def file2text(self, file):
        # Split the raw file contents into non-empty lines (paragraphs).
        return [line for line in file.split('\n') if line]

    def text2sentences(self, text, remove_punct=False, remove_sw=False, lower=False):
        # Split each paragraph on '.' and then on whitespace, skipping
        # empty or whitespace-only segments.
        sentences = [s.split() for p in text for s in p.split('.') if s.strip()]
        if remove_punct:
            sentences = [self.remove_punctuation(s) for s in sentences]
        if remove_sw:
            sentences = [self.remove_stopwords(s) for s in sentences]
        if lower:
            sentences = [[tok.lower() for tok in s] for s in sentences]
        return sentences

    def text2sentences_types(self, text, per_sentence=True, remove_punct=True, remove_sw=True, lower=True):
        # Reduce tokens to types (unique tokens), preserving
        # first-occurrence order via dict.fromkeys.
        tokens = self.text2tokens(text, per_sentence, remove_punct, remove_sw, lower)
        if not per_sentence:
            # text2tokens returned one flat token list, not a list per sentence.
            return list(dict.fromkeys(tokens))
        return [list(dict.fromkeys(s)) for s in tokens]

    def text2tokens(self, text, per_sentence=True, remove_punct=True, remove_sw=True, lower=True):
        sentences = self.text2sentences(text)
        tokens = []
        for s in sentences:
            tokens_s = s[:]  # copy so the original sentence list is untouched
            if remove_punct:
                tokens_s = self.remove_punctuation(tokens_s)
            if remove_sw:
                tokens_s = self.remove_stopwords(tokens_s)
            if lower:
                tokens_s = [tok.lower() for tok in tokens_s]
            tokens.append(tokens_s)
        if not per_sentence:
            # Flatten the per-sentence lists into one token list.
            tokens = [tok for toks in tokens for tok in toks]
        return tokens

    def clean_text(self, text):
        # Replace digit runs and em dashes with spaces.
        return re.sub(r'\d+|—', ' ', text)

    def remove_punctuation(self, tokens):
        # Strip non-word characters and drop tokens that become empty
        # (e.g. tokens that were pure punctuation).
        stripped = [re.sub(r'\W+', '', tok) for tok in tokens]
        return [tok for tok in stripped if tok]

    def remove_stopwords(self, tokens):
        return [tok for tok in tokens if tok not in self.stopwords]

    def lemmatize(self, vocab):
        # Load and cache the spaCy model on first use; reloading it on
        # every call is very slow.
        if self.nlp is None:
            self.nlp = spacy.load("es_core_news_sm", disable=['parser', 'ner'])
        doc = self.nlp(vocab)
        return doc[0].lemma_
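
# A minimal usage sketch for FilePreprocesser, invoked from the __main__
# block at the end of the file. The sample text is made up purely for
# illustration, and lemmatize() assumes the es_core_news_sm model is
# installed (python -m spacy download es_core_news_sm).
def _demo_preprocesser():
    prep = FilePreprocesser()
    text = prep.file2text('El niño juega en el parque.\nLos 3 niños juegan allí.')
    # Per-sentence tokens with punctuation and stopwords removed, lowercased.
    print(prep.text2tokens(text))
    # Unique types over the whole text.
    print(prep.text2sentences_types(text, per_sentence=False))
    # Lemma of a single form.
    print(prep.lemmatize('niños'))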

class Formas:
    """Rank lookup over a tab-separated word-frequency list with columns
    [Orden, Forma, Frec.absoluta, Frec.normalizada]."""
    def __init__(self, formas_file):
        self.formas = self.read_file(formas_file)

    def read_file(self, file):
        formas = {}
        with open(file, 'r', encoding='latin-1') as f:
            lines = f.readlines()
        for line in lines[1:]:  # skip the header row
            # A row looks like ['10000.', 'normalización', '1,182', '7.74'].
            row = re.split(r'[\t\n]', line)
            row = [r.strip() for r in row if r]
            # Map each form to its rank, the integer part of 'Orden'.
            formas[row[1]] = int(re.search(r'\d+', row[0]).group(0))
        return formas
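
# A self-contained demo of both classes. The two-row frequency table
# below is invented solely to illustrate the column layout that
# Formas.read_file expects; it is not real frequency data.
if __name__ == '__main__':
    import os
    import tempfile
    _demo_preprocesser()
    sample = ('Orden\tForma\tFrec.absoluta\tFrec.normalizada\n'
              '1.\tde\t1,000\t10.00\n'
              '2.\tla\t900\t9.00\n')
    with tempfile.NamedTemporaryFile('w', suffix='.txt', delete=False,
                                     encoding='latin-1') as tmp:
        tmp.write(sample)
    try:
        formas = Formas(tmp.name)
        print(formas.formas)  # expected: {'de': 1, 'la': 2}
    finally:
        os.remove(tmp.name)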