-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtext-similar-verify.py
53 lines (44 loc) · 1.47 KB
/
text-similar-verify.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
# -*- coding: utf-8 -*-
import numpy as np
import jieba
class CountVectorizer:
    """Minimal bag-of-words vectorizer over whitespace-separated tokens.

    The vocabulary grows in first-seen order and persists on the instance,
    so repeated ``fit_transform`` calls keep extending the same mapping.
    """

    def __init__(self):
        # token -> column index, assigned in first-seen order
        self.word2id = {}
        # column index -> token (inverse of word2id)
        self.id2word = {}
        # number of distinct tokens registered so far
        self.vocab_size = 0

    def fit_transform(self, texts) -> np.ndarray:
        """Extend the vocabulary with *texts* and return their count matrix.

        Args:
            texts: Iterable of strings; each is tokenised with ``str.split()``.
                Any iterable is accepted (including a generator), because
                the texts are tokenised once up front.

        Returns:
            A float array of shape ``(number of texts, vocab_size)`` where
            entry ``[i, j]`` is how many times token ``j`` occurs in text ``i``.
        """
        # Tokenise once: avoids splitting every text twice and removes the
        # need for ``texts`` to be sized or re-iterable.
        tokenized = [text.split() for text in texts]

        # Build the vocabulary.
        for tokens in tokenized:
            for word in tokens:
                if word not in self.word2id:
                    self.word2id[word] = self.vocab_size
                    self.id2word[self.vocab_size] = word
                    self.vocab_size += 1

        # Build the term-frequency matrix.
        X = np.zeros((len(tokenized), self.vocab_size))
        for row, tokens in enumerate(tokenized):
            for word in tokens:
                X[row, self.word2id[word]] += 1
        return X
def cosine_similarity(text1: str, text2: str) -> float:
    """Return the cosine similarity of two whitespace-tokenised texts.

    Both texts are turned into term-count vectors over their joint
    vocabulary using ``CountVectorizer``, and the cosine of the angle
    between those vectors is returned.

    Args:
        text1: First text, tokens separated by whitespace.
        text2: Second text, tokens separated by whitespace.

    Returns:
        The cosine similarity of the two count vectors. If either text
        contains no tokens its vector has zero length and the cosine is
        undefined; 0.0 is returned in that case instead of ``nan``.
    """
    # Convert both texts into rows of a shared term-frequency matrix.
    vectorizer = CountVectorizer()
    counts = vectorizer.fit_transform([text1, text2])
    vec1, vec2 = counts[0], counts[1]

    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if denominator == 0:
        # A zero vector would make this 0/0 (nan plus a RuntimeWarning).
        return 0.0
    return np.dot(vec1, vec2) / denominator
def cut_word(text):
    """Segment Chinese *text* with jieba and join the pieces with spaces."""
    segments = jieba.cut(text)
    separator = " "
    return separator.join(segments)
# Similarity threshold: scores at or above this count as "similar".
threshold = 0.8

# Read the two texts to compare.
text1 = input('Please enter the first text: ')
text2 = input('Please enter the second text: ')

# Segment both texts, score them, and report the verdict.
similarity = cosine_similarity(cut_word(text1), cut_word(text2))
print(similarity)
verdict = (
    'The texts are similar.'
    if similarity >= threshold
    else 'The texts are not similar.'
)
print(verdict)