Skip to content

Commit

Permalink
Update to Python 3.9, upgrade scipy, and migrate gensim 3 to 4 (#5)
Browse files: browse the repository at this point in the history
* update python version, gensim, scipy

* migrate gensim 3 to 4

* update requirements.txt

* update version
  • Loading branch information
shihono authored Nov 3, 2023
1 parent 13e99c5 commit 3595e01
Show file tree
Hide file tree
Showing 6 changed files with 185 additions and 233 deletions.
2 changes: 1 addition & 1 deletion eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def main():
column_indexes = args.col

wv = load_keyvector(model_path)
logger.info('Word vector {} dim, Vocab size {}'.format(wv.vector_size, len(wv.vocab)))
logger.info('Word vector {} dim, Vocab size {}'.format(wv.vector_size, len(wv)))

# set tokenizer : mecab or sudachipy
tokenizer = None
Expand Down
384 changes: 169 additions & 215 deletions poetry.lock

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
[tool.poetry]
name = "evaluate_japanese_w2v"
version = "0.1.1"
version = "0.2.0"
description = ""
authors = ["shihono <[email protected]>"]

[tool.poetry.dependencies]
python = "^3.8"
python = ">=3.9,<3.13"
numpy = "^1.22.3"
gensim = "<=3.8.3"
scipy = "<1.8.0"
gensim = "^4.1"
scipy = "^1.10.0"
chardet = "^4.0.0"
mecab-python3 = "^1.0.5"
SudachiPy = "^0.6.3"
Expand Down
18 changes: 8 additions & 10 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
chardet==4.0.0; (python_version >= "2.7" and python_full_version < "3.0.0") or (python_full_version >= "3.5.0")
cython==0.29.14; python_version >= "2.6" and python_full_version < "3.0.0" or python_full_version >= "3.3.0"
gensim==3.8.3
mecab-python3==1.0.5
numpy==1.22.3; python_version >= "3.8"
scipy==1.6.1; python_version >= "3.7"
six==1.16.0; python_version >= "2.7" and python_full_version < "3.0.0" or python_full_version >= "3.3.0"
smart-open==5.2.1; python_version >= "3.6" and python_version < "4.0"
sudachidict-core==20211220
sudachipy==0.6.3
chardet==4.0.0 ; python_version >= "3.9" and python_version < "3.13"
gensim==4.3.2 ; python_version >= "3.9" and python_version < "3.13"
mecab-python3==1.0.8 ; python_version >= "3.9" and python_version < "3.13"
numpy==1.25.2 ; python_version >= "3.9" and python_version < "3.13"
scipy==1.11.3 ; python_version >= "3.9" and python_version < "3.13"
smart-open==6.4.0 ; python_version >= "3.9" and python_version < "3.13"
sudachidict-core==20211220 ; python_version >= "3.9" and python_version < "3.13"
sudachipy==0.6.7 ; python_version >= "3.9" and python_version < "3.13"
4 changes: 2 additions & 2 deletions src/ja_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,15 +56,15 @@ def divide_word(self, word):


def get_divided_wv(word, wv, ja_tokenizer):
if word in wv.vocab:
if word in wv:
return wv.get_vector(word)

div_words = ja_tokenizer.divide_word(word)
res_vectors = np.zeros((wv.vector_size, len(div_words)))
logger.debug('{} divide into {}'.format(word, div_words))
oov_cnt = 0
for idx, w in enumerate(div_words):
if w in wv.vocab:
if w in wv:
res_vectors[:, idx] = wv.get_vector(w)
else:
oov_cnt += 1
Expand Down
2 changes: 1 addition & 1 deletion src/similarity.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ def cal_wv_similarity(dataset, wv, oov_score=-1, tokenizer=None):
oov_cnt = 0
for i, d in enumerate(dataset.gold_data):
word1, word2 = d.word1, d.word2
if (word1 in wv.vocab) and (word2 in wv.vocab):
if (word1 in wv.key_to_index) and (word2 in wv.key_to_index):
sim = wv.similarity(word1, word2)
else:
if tokenizer is not None:
Expand Down

0 comments on commit 3595e01

Please sign in to comment.