# create_dictionary.py
# Generates per-dataset .lab transcript files and a pinyin-to-IPA dictionary.
import glob
import os
from typing import Set, List, Tuple
from utils import get_processor
from pinyin_to_ipa import pinyin_to_ipa
# Directory scanned by main(); every sub-directory is processed as one dataset.
DATASET_PATH = "datasets"
# Used by main() to build the output file name "<dataset>_<DICTIONARY_NAME>.txt".
DICTIONARY_NAME = "pinyin_dictionary"
# "dianr" can be transcribed as [tjɐʵ] or [tjɐɻ], so a rhotacized vowel such as
# ɐʵ can be treated as either one phone or two.
# Separating the phones can make it harder to find the start and stop times,
# which is why no split rule is enabled here.
# Each rule is an (old, new) pair applied with str.replace() to the
# space-joined IPA string in generate_dictionary_entries(). Example rules:
#   [("ʵ", " ɻ"), ("˞", " ɻ"), ("ɚ", "ə ɻ"), ("aə", "a")]
SPLIT_PHONEMES: List[Tuple[str, str]] = []
# Dictionary for erhua transformations
# https://en.wikipedia.org/wiki/Erhua#Standard_rules
#
# Maps a pinyin ending (final followed by "r") to one or more alternative IPA
# endings; every alternative is itself a list of phones.
# NOTE: order matters. apply_erhua() walks this dict in insertion order and
# stops at the first key the pinyin ends with, so a longer suffix must appear
# before any shorter suffix it ends with (e.g. "uangr" before "angr").
# NOTE(review): "v" presumably stands for pinyin "ü" -- confirm against the
# dataset processors.
ERHUA_SUFFIX_TO_IPA = {
    "uangr": [["w", "ɑ̃ʵ"]],
    "iangr": [["j", "ɑ̃ʵ"]],
    "iongr": [["j", "ʊ̃ʵ"]],
    "vanr": [["ɥ", "ɐʵ"]],
    "uair": [["w", "ɐʵ"]],
    "ianr": [["j", "ɐʵ"]],
    "iaor": [["j", "ɑu̯˞"]],
    "uanr": [["w", "ɐʵ"]],
    "engr": [["ɤ̃ʵ"]],
    "angr": [["ɑ̃ʵ"]],
    "ongr": [["w", "ɤ̃ʵ"], ["ʊ̃˞"]],
    "ingr": [["j", "ɤ̃ʵ"]],
    "ver": [["ɥ", "œʵ"]],
    "uar": [["w", "äʵ"], ["w", "ɐʵ"]],
    "uor": [["w", "ɔʵ"]],
    "air": [["ɐʵ"]],
    "eir": [["ɚ"]],
    "aor": [["ɑu̯˞"]],
    "our": [["ou̯˞"]],
    "anr": [["ɐʵ"]],
    "enr": [["ɚ"]],
    "iar": [["j", "äʵ"], ["j", "ɐʵ"]],
    "ier": [["j", "ɛʵ"]],
    "iur": [["j", "ou̯ʵ"]],
    "inr": [["j", "ɚ"]],
    "uir": [["w", "ɚ"]],
    "unr": [["w", "ɚ"]],
    "vnr": [["ɥ", "ɚ"]],
    "ar": [["äʵ"], ["ɐʵ"]],
    "or": [["ɔʵ"]],
    "er": [["ɤʵ"]],
    "ur": [["u˞"]],
    "vr": [["ɥ", "ɚ"]],
    "ir": [["ɚ"]],
}
def apply_erhua(pinyin: str, ipa: List[str]) -> List[List[str]]:
    """
    Apply erhua transformation to the given IPA representation.

    Args:
        pinyin: Pinyin syllable including its trailing "r". Only its ending is
            inspected, to select an entry of ERHUA_SUFFIX_TO_IPA.
        ipa: Phones of the syllable without erhua. The caller's list is never
            mutated; only the local name is rebound to new lists.

    Returns:
        One phone list per alternative ending of the first suffix that
        matches, or an empty list when no suffix matches. A pinyin that is
        exactly equal to a suffix does not count as a match for that suffix.

    Raises:
        IndexError: if ``ipa`` is empty, or if it is reduced to an empty list
            by the nasal stripping and a suffix then matches.
    """
    # Remove final 'n' or 'ŋ' if present
    if ipa[-1] in ["ŋ", "n"]:
        ipa = ipa[:-1]
    # Drop the last remaining phone when more than one phone is left; the
    # erhua ending from the table takes its place.
    if len(ipa) > 1:
        ipa = ipa[:-1]
    result = []
    # Dict insertion order decides which suffix wins (first match only).
    for pinyin_ending, ipa_endings in ERHUA_SUFFIX_TO_IPA.items():
        if pinyin.endswith(pinyin_ending) and pinyin != pinyin_ending:
            for ipa_ending in ipa_endings:
                if ipa[-1] == ipa_ending[0]:
                    # The kept phone already is the ending's first phone
                    # (e.g. a glide): do not emit it twice.
                    ipa_ending = ipa_ending[1:]
                elif ipa[-1] + "˞" == ipa_ending[0]:  # wur
                    # The ending is the rhotacized form of the only kept
                    # phone, so it replaces that phone entirely. This
                    # rebinding stays in effect for any later alternative.
                    ipa = []
                result.append(ipa + ipa_ending)
            break
    return result
def convert_pinyin_to_ipa(pinyin: str) -> List[List[str]]:
    """
    Return the IPA transcription(s) of a pinyin syllable as lists of phones.

    A syllable ending in "r" (other than "er" itself) is treated as erhua:
    the base syllable without the "r" is transcribed with pinyin_to_ipa() and
    each transcription is then passed through apply_erhua(). Erhua results
    already collected from an earlier transcription are not added again.
    """
    # Check for erhua
    is_erhua = pinyin != "er" and pinyin.endswith("r")
    base = pinyin[:-1] if is_erhua else pinyin

    transcriptions = []
    for phones in map(list, pinyin_to_ipa(base)):
        # Special case for "yo"
        if base == "yo" and phones == ["w", "o"]:
            phones = ["j", "ɔ"]

        if not is_erhua:
            transcriptions.append(phones)
            continue

        # For an erhua syllable, `pinyin` is exactly `base + "r"`.
        candidates = apply_erhua(pinyin, phones)
        # Filter against what was collected before this batch, then add.
        unseen = [cand for cand in candidates if cand not in transcriptions]
        transcriptions += unseen
    return transcriptions
def write_read_pinyins(dataset_path: str, dataset_name: str) -> Set[str]:
    """
    Write one ".lab" transcript per sentence and collect the pinyins used.

    The processor returned by ``get_processor(dataset_name)`` is run on
    ``dataset_path``. Each sentence it yields is read through its "word",
    "wav_path" and "id" keys: "word" is iterated as pairs whose first item is
    the pinyin (the second item is ignored here). The space-joined pinyins are
    written to ``<dataset_path>/<wav_path>/<id>.lab``; missing directories are
    created.

    Args:
        dataset_path: Root directory of the dataset.
        dataset_name: Name used to select the processor.

    Returns:
        The set of distinct pinyins seen across all sentences.
    """
    processor = get_processor(dataset_name)
    pinyins = set()
    for sentence in processor.process(dataset_path):
        # Materialize once so the set and the transcript always agree, even
        # if sentence["word"] can only be iterated a single time.
        sentence_pinyins = [pinyin for pinyin, _ in sentence["word"]]
        pinyins.update(sentence_pinyins)
        processed_text = " ".join(sentence_pinyins)
        output_path = os.path.join(
            dataset_path, sentence["wav_path"], f"{sentence['id']}.lab"
        )
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        # Explicit UTF-8: the default encoding of open() depends on the
        # platform locale and may fail on non-ASCII text.
        with open(output_path, "w", encoding="utf-8") as text_output:
            text_output.write(processed_text.strip())
    print(f"Number of pinyins: {len(pinyins)}")
    return pinyins
def generate_dictionary_entries(pinyins: Set[str]) -> List[Tuple[str, str]]:
    """
    Build the sorted, de-duplicated (pinyin, IPA) pronunciation entries.

    Every transcription returned by convert_pinyin_to_ipa() is joined with
    single spaces, and the (old, new) replacement rules in SPLIT_PHONEMES are
    then applied to that string in order. The number of distinct phones
    (space-separated tokens) across all entries is printed.

    Args:
        pinyins: Pinyin syllables to transcribe.

    Returns:
        Sorted list of unique ``(pinyin, space-separated IPA)`` tuples. A
        pinyin with several transcriptions yields several entries; one with
        no transcription yields none.
    """
    entries = set()
    for pinyin in pinyins:
        for ipa in convert_pinyin_to_ipa(pinyin):
            str_ipa = " ".join(ipa)
            for s0, s1 in SPLIT_PHONEMES:
                str_ipa = str_ipa.replace(s0, s1)
            # Store the pair as ONE entry. list.extend() with a tuple would
            # add the pinyin and the IPA string as two separate items.
            entries.add((pinyin, str_ipa))
    phones = {phone for _, str_ipa in entries for phone in str_ipa.split()}
    print(f"Number of phones: {len(phones)}")
    return sorted(entries)
def write_dictionary(entries: List[Tuple[str, str]], filename: str) -> None:
    """
    Write the dictionary entries to ``filename``, one per line.

    Each line has the form ``<pinyin>\\t<ipa>``. An existing file is
    overwritten; an empty ``entries`` list produces an empty file.

    Args:
        entries: ``(pinyin, ipa)`` pairs, written in the given order.
        filename: Path of the file to create.
    """
    # Explicit UTF-8: the IPA symbols are non-ASCII and the default encoding
    # of open() depends on the platform locale, where it may raise
    # UnicodeEncodeError.
    with open(filename, "w", encoding="utf-8") as f:
        f.writelines(f"{pinyin}\t{ipa}\n" for pinyin, ipa in entries)
def main():
    """
    Process every dataset directory found directly under DATASET_PATH.

    Directories are visited in sorted order; non-directory entries are
    skipped. For each dataset the .lab files are written first, then the
    dictionary is generated and saved in the current working directory as
    "<dataset_name>_<DICTIONARY_NAME>.txt".
    """
    candidates = sorted(glob.glob(os.path.join(DATASET_PATH, "*")))
    for dataset_path in filter(os.path.isdir, candidates):
        dataset_name = os.path.basename(dataset_path)
        print(f"Processing dataset: {dataset_name}")

        print("Generating lab files...")
        pinyins = write_read_pinyins(dataset_path, dataset_name)

        print("Generating dictionary...")
        entries = generate_dictionary_entries(pinyins)
        dictionary_file = f"{dataset_name}_{DICTIONARY_NAME}.txt"
        write_dictionary(entries, dictionary_file)


# Run only when executed as a script, not when imported.
if __name__ == "__main__":
    main()