Update to python3.9, scipy and migrate gensim 3 to 4 (#5)

* update python version, gensim, scipy
* migrate gensim 3 to 4
* update requirements.txt
* update version
shihono · Nov 3, 2023 · 3595e01
1 parent 13e99c5
commit 3595e01
Show file tree

Hide file tree

Showing 6 changed files with 185 additions and 233 deletions.
diff --git a/eval.py b/eval.py
@@ -64,7 +64,7 @@ def main():
     column_indexes = args.col
 
     wv = load_keyvector(model_path)
-    logger.info('Word vector {} dim, Vocab size {}'.format(wv.vector_size, len(wv.vocab)))
+    logger.info('Word vector {} dim, Vocab size {}'.format(wv.vector_size, len(wv)))
 
     # set tokenizer : mecab or sudachipy
     tokenizer = None

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,14 +1,14 @@
 [tool.poetry]
 name = "evaluate_japanese_w2v"
-version = "0.1.1"
+version = "0.2.0"
 description = ""
 authors = ["shihono <[email protected]>"]
 
 [tool.poetry.dependencies]
-python = "^3.8"
+python = ">=3.9,<3.13"
 numpy = "^1.22.3"
-gensim = "<=3.8.3"
-scipy = "<1.8.0"
+gensim = "^4.1"
+scipy = "^1.10.0"
 chardet = "^4.0.0"
 mecab-python3 = "^1.0.5"
 SudachiPy = "^0.6.3"

diff --git a/requirements.txt b/requirements.txt
@@ -1,10 +1,8 @@
-chardet==4.0.0; (python_version >= "2.7" and python_full_version < "3.0.0") or (python_full_version >= "3.5.0")
-cython==0.29.14; python_version >= "2.6" and python_full_version < "3.0.0" or python_full_version >= "3.3.0"
-gensim==3.8.3
-mecab-python3==1.0.5
-numpy==1.22.3; python_version >= "3.8"
-scipy==1.6.1; python_version >= "3.7"
-six==1.16.0; python_version >= "2.7" and python_full_version < "3.0.0" or python_full_version >= "3.3.0"
-smart-open==5.2.1; python_version >= "3.6" and python_version < "4.0"
-sudachidict-core==20211220
-sudachipy==0.6.3
+chardet==4.0.0 ; python_version >= "3.9" and python_version < "3.13"
+gensim==4.3.2 ; python_version >= "3.9" and python_version < "3.13"
+mecab-python3==1.0.8 ; python_version >= "3.9" and python_version < "3.13"
+numpy==1.25.2 ; python_version >= "3.9" and python_version < "3.13"
+scipy==1.11.3 ; python_version >= "3.9" and python_version < "3.13"
+smart-open==6.4.0 ; python_version >= "3.9" and python_version < "3.13"
+sudachidict-core==20211220 ; python_version >= "3.9" and python_version < "3.13"
+sudachipy==0.6.7 ; python_version >= "3.9" and python_version < "3.13"
diff --git a/src/ja_tokenizer.py b/src/ja_tokenizer.py
@@ -56,15 +56,15 @@ def divide_word(self, word):
 
 
 def get_divided_wv(word, wv, ja_tokenizer):
-    if word in wv.vocab:
+    if word in wv:
         return wv.get_vector(word)
 
     div_words = ja_tokenizer.divide_word(word)
     res_vectors = np.zeros((wv.vector_size, len(div_words)))
     logger.debug('{} divide into {}'.format(word, div_words))
     oov_cnt = 0
     for idx, w in enumerate(div_words):
-        if w in wv.vocab:
+        if w in wv:
             res_vectors[:, idx] = wv.get_vector(w)
         else:
             oov_cnt += 1

diff --git a/src/similarity.py b/src/similarity.py
@@ -139,7 +139,7 @@ def cal_wv_similarity(dataset, wv, oov_score=-1, tokenizer=None):
     oov_cnt = 0
     for i, d in enumerate(dataset.gold_data):
         word1, word2 = d.word1, d.word2
-        if (word1 in wv.vocab) and (word2 in wv.vocab):
+        if (word1 in wv.key_to_index) and (word2 in wv.key_to_index):
             sim = wv.similarity(word1, word2)
         else:
             if tokenizer is not None: