-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconjutils.py
129 lines (108 loc) · 4.11 KB
/
conjutils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
""""
Conjugator Utilities
Provides functions to ease setting up the conjugator proper
"""
import re
import verbs as v
from enum import IntEnum
import pandas as pd
import numpy as np
import os
class IrregularIdx(IntEnum):
    """Positions of the fields inside one irregular-verb tuple.

    Members are plain integers (``IntEnum``), so they can be used directly
    as indices into a verb tuple, e.g. ``verb[IrregularIdx.INFINITIVE]``.
    """

    # Verb category.
    CATEGORY = 0
    # Infinitive form.
    INFINITIVE = 1
    # Past stem.
    PAST_STEM = 2
    # Participle.
    PARTICIPLE = 3
    # Present tense only, for irregular and preterite-present verbs.
    CONJUGATION = 4
def get_irregular_verbs() -> list:
    """
    Retrieve the irregular verb constructions from ``verbs.txt``.

    The file is looked up inside the directory named by the ``VERB_DATA_DIR``
    environment variable. Every non-blank line is split on commas and stored
    as one tuple; blank lines are skipped so they cannot produce one-element
    tuples that break later indexing. The elements of each tuple are:
        0 (IrregularIdx.CATEGORY): The verb's category
            - Numbered categories refer to its classification as a strong verb
            - M is for mixed
            - I is for irregular
            - PP is for preterite-present
        1 (IrregularIdx.INFINITIVE)
        2 (IrregularIdx.PAST_STEM)
        3 (IrregularIdx.PARTICIPLE)
        4 (IrregularIdx.CONJUGATION): present tense conjugation (indicative)
            for irregular and preterite-present verbs

    Return:
        list[tuple[str, ...]] --> one tuple per non-blank line, with as many
                                  elements as the line has comma-separated fields.

    Raises:
        KeyError: if ``VERB_DATA_DIR`` is not set.
        OSError: if the file cannot be opened or read.
    """
    path = os.path.join(os.environ["VERB_DATA_DIR"], "verbs.txt")
    # The context manager closes the file even if reading fails part-way.
    with open(path, "r") as file:
        return [
            tuple(line.rstrip("\n").split(","))  # drop the newline, keep the fields
            for line in file
            if line.strip()  # ignore blank / whitespace-only lines
        ]
def find_verb_matches(word: str, verbs: list) -> list:
    """
    Return all verb-tuples whose infinitive <word> ends with.

    Each entry of <verbs> is kept when <word> ends with that entry's
    infinitive (``verb[IrregularIdx.INFINITIVE]``). The comparison is a
    literal suffix test, so characters in the infinitive are never
    interpreted as regex syntax.

    Parameters:
        word: str --> string to compare the irregular verbs to.
        verbs: list[tuple[str, str, str, str, str]] --> list holding the irregular verb information.

    Return:
        list[tuple[str, str, str, str, str]] --> the verb-tuples, in their original order,
                                                whose infinitive is a suffix of <word>.
    """
    return [verb for verb in verbs if word.endswith(verb[IrregularIdx.INFINITIVE])]
def construct_verb(word: str, match: tuple) -> v.Verb:
    """
    Construct a Verb object manually by overwriting stem values from __init__().

    The part of <word> that precedes the matched infinitive is kept and
    joined with the match's past stem to form the new verb's ``past_stem``.

    Parameters:
        word: str --> the word being conjugated; expected to end with the matched infinitive.
        match: tuple --> irregular verb-tuple, indexed with IrregularIdx.

    Return:
        v.Verb --> a default-constructed Verb whose ``past_stem`` has been overwritten.
    """
    infinitive = match[IrregularIdx.INFINITIVE]
    # Use an explicit end index: word[:-len(infinitive)] would be word[:-0],
    # i.e. "", when the infinitive is empty, dropping the whole word.
    remainder = word[:len(word) - len(infinitive)]
    verb = v.Verb()
    verb.past_stem = remainder + match[IrregularIdx.PAST_STEM]
    # TODO: fill with the other values
    return verb
def get_prefixes() -> str:
    """
    Retrieve the prefixes from ``prefix.txt`` as a single regex expression.

    The file is looked up inside the directory named by the ``VERB_DATA_DIR``
    environment variable, one prefix per line. The result has the form
    ``^((p1)|(p2)|...)``: anchored at the start of the word, with every
    prefix in its own capture group. Prefix text is inserted verbatim, so any
    regex syntax in the file is interpreted as such.

    Return:
        str --> the combined pattern. If the file contains no prefixes, a
                pattern that can never match is returned.

    Raises:
        KeyError: if ``VERB_DATA_DIR`` is not set.
        OSError: if the file cannot be opened or read.
    """
    path = os.path.join(os.environ["VERB_DATA_DIR"], "prefix.txt")
    with open(path, "r") as file:
        # rstrip("\n") rather than line[:-1]: the last line may have no
        # trailing newline, and slicing would cut off its final character.
        entries = [line.rstrip("\n") for line in file]

    # A blank line would become "()", an alternative that matches the empty
    # string at the start of every word.
    entries = [entry for entry in entries if entry.strip()]
    if not entries:
        # "(?!)" always fails, so callers simply find no prefix.
        return "^((?!))"

    return "^(" + "|".join("(" + entry + ")" for entry in entries) + ")"
def get_last_prefix(matches : tuple):
    """
    Return the last non-empty element in tuple <matches>.

    Elements are tested for truthiness; if none is truthy (or <matches> is
    empty) the empty string is returned.
    """
    non_empty = [group for group in matches if group]
    return non_empty[-1] if non_empty else ""
def get_prefix(word: str, prefixes_expr: str) -> tuple:
    """
    Extract the prefixes from provided <word>.

    Repeatedly matches <prefixes_expr> against <word>, takes the last
    non-empty captured group of the first match as the prefix, appends it to
    the accumulated prefix string and removes that many characters from the
    front of <word>. The loop ends when the pattern no longer matches, or
    when the match captures nothing (which would otherwise never shorten
    <word> and loop forever).

    Parameters:
        word : str --> word to extract the prefixes from.
        prefixes_expr : str --> regex expression containing all valid verbal prefixes;
                                expected to be anchored at the start of the word and to
                                capture each prefix in a group.

    Return:
        tuple[str, str] --> [0]: The resulting string from appending all of the consecutively found prefixes in <word>.
                            [1]: The resulting string from removing the found prefixes from <word> (the root).

    Raises:
        re.error: if <prefixes_expr> is not a valid regular expression.
    """
    pattern = re.compile(prefixes_expr)  # compiled once, reused every iteration
    prefixes = ""
    while True:
        found = pattern.findall(word)
        if not found:
            break
        # Only the first match is used; each match is a tuple of groups.
        prefix = get_last_prefix(found[0])
        if not prefix:
            # Zero-length match: nothing to strip, so stop rather than spin.
            break
        prefixes += prefix
        word = word[len(prefix):]
    # Whatever is left of <word> after stripping is the unprefixed root.
    return (prefixes, word)