forked from contr4l/SimilarCharacter
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathJudgeSimilarity.py
70 lines (57 loc) · 2.19 KB
/
JudgeSimilarity.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# coding=gbk
from Dict import TrAngle as dict1 # 四码字典
import Character # 包含所有汉字的列表
from tqdm import tqdm # 进度条
from writenumDict import write_num_dict as dict2 # 笔画数字典
from Dict import structure_dict as dict3 # 结构字典
from Pronunciation import pronunciation_index
# Characters to compare against each other, as returned by Character.Symbol_lst().
lst = Character.Symbol_lst()
# Output file, opened for writing as soon as this module is loaded; main()
# writes the result rows to it and closes it.
# NOTE(review): no explicit encoding is passed, so the platform default is
# used — confirm that this is the encoding downstream readers expect.
file1 = open('D:/py/Shape.txt', 'w')
def get_similar(char1, char2):  # char1 and char2 are Chinese characters
    """Score how similar two characters are in written shape and pronunciation.

    The shape score is a weighted sum of three components:

    * four-corner code (weight 0.4): fraction of the first four code
      positions on which the two codes agree;
    * stroke count (weight 0.3): 1 minus the relative difference between
      the two stroke counts;
    * structure (weight 0.3): 1 when both characters have the same known
      structure, otherwise 0.

    The pronunciation score is the value returned by
    ``pronunciation_index(char1, char2)``.

    Args:
        char1: First character; looked up in the code, stroke-count and
            structure dictionaries.
        char2: Second character; looked up in the same dictionaries.

    Returns:
        A tuple ``(write_index, voice_index, similarity_index)``: the shape
        score, the pronunciation score, and their equal-weight average.

    Raises:
        KeyError: If either character is missing from the four-corner code
            dictionary or the stroke-count dictionary.
    """
    # Look up the four-corner codes and the stroke counts.
    code1 = dict1[char1]
    code2 = dict1[char2]
    write_num1 = int(dict2[char1])
    write_num2 = int(dict2[char2])

    # Use .get() rather than .setdefault(): a plain lookup must not insert
    # None entries into the shared structure dictionary.
    structure1 = dict3.get(char1)
    structure2 = dict3.get(char2)

    # Structure similarity: only counts when both structures are known.
    if structure1 and structure2 and structure1 == structure2:
        structure_index = 1
    else:
        structure_index = 0

    # Four-corner similarity: one point per matching position among the
    # first four, divided by 4. zip() stops at the shorter code, so a code
    # with fewer than four positions scores no match for the missing ones
    # instead of raising IndexError.
    # NOTE(review): relies on true division (Python 3).
    matching = sum(1 for a, b in zip(code1[:4], code2[:4]) if a == b)
    code_index = matching / 4

    # Pronunciation similarity.
    voice_index = pronunciation_index(char1, char2)

    # Stroke-count similarity via relative deviation. Equal counts are a
    # perfect match; handling them first also avoids dividing by zero when
    # both counts are 0.
    if write_num1 == write_num2:
        write_num_index = 1.0
    else:
        write_num_index = 1 - abs(
            (write_num1 - write_num2) / max(write_num1, write_num2)
        )

    # Weights: four-corner code 0.4, stroke count 0.3, structure 0.3.
    write_index = code_index * 0.4 + write_num_index * 0.3 + structure_index * 0.3
    similarity_index_ = write_index * 0.5 + voice_index * 0.5
    return write_index, voice_index, similarity_index_
def main():
    """Write one line per character listing the characters similar to it.

    For every character ``i`` in ``lst`` a line is written to ``file1``:
    the character, a space, then every other character ``j`` of ``lst``
    (in ``lst`` order) that passes the similarity thresholds, followed by a
    newline. A character ``j`` passes when any of the following holds:

    * the pronunciation score is exactly 1 and the shape score is >= 0.6;
    * the shape score is >= 0.9;
    * the combined score is >= 0.85.

    ``file1`` is closed when the function finishes, whether it completes
    normally or an exception propagates out of it.
    """
    print('形近字判断写入中...')
    try:
        for i in tqdm(lst):
            similar = []
            for j in lst:
                if i == j:
                    continue
                # Weighted sound/shape similarity; characters above the
                # thresholds are recorded as similar to ``i``.
                write_index, voice_index, similarity_index = get_similar(i, j)
                if (
                    (voice_index == 1 and write_index >= 0.6)
                    or write_index >= 0.9
                    or similarity_index >= 0.85
                ):
                    similar.append(j)
            # One write per row instead of one write per matching character.
            file1.write(i + ' ' + ''.join(similar) + '\n')
    finally:
        # Always release the handle so buffered rows are flushed even when
        # a lookup fails part-way through.
        file1.close()
# Script entry point: generate the similar-character file only when this
# module is executed directly, not when it is imported.
if __name__ == '__main__':
    main()