Skip to content

Commit

Permalink
Merge pull request #246 from WorksApplications/feature/243-rename-tokenizer-create
Browse files Browse the repository at this point in the history

Rename `Dictionary.create` to `Dictionary.tokenizer`
  • Loading branch information
mh-northlander authored Nov 26, 2024
2 parents dcb86fd + 3aedcef commit b5d8753
Show file tree
Hide file tree
Showing 14 changed files with 50 additions and 28 deletions.
10 changes: 10 additions & 0 deletions src/main/java/com/worksap/nlp/sudachi/Dictionary.java
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,16 @@ public interface Dictionary extends AutoCloseable {
*
* @return a tokenizer
*/
public Tokenizer tokenizer();

/**
 * Creates a tokenizer instance.
 *
 * @return a tokenizer
 *
 * @deprecated renamed to {@link #tokenizer()}
 */
@Deprecated
public Tokenizer create();

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ public void close() throws IOException {
}

@Override
public Tokenizer create() {
public Tokenizer tokenizer() {
if (grammar == null || lexicon == null) {
throw new IllegalStateException("trying to use closed dictionary");
}
Expand All @@ -140,6 +140,11 @@ public Tokenizer create() {
return tokenizer;
}

/**
 * Creates a tokenizer instance.
 *
 * @return a tokenizer
 * @deprecated renamed to {@link #tokenizer()}; this method only delegates to it.
 */
// @Deprecated added explicitly: Java does not inherit the annotation from the
// overridden interface method, so without it callers of this concrete type
// would get no deprecation warning.
@Deprecated
@Override
public Tokenizer create() {
    return tokenizer();
}

@Override
public int getPartOfSpeechSize() {
return grammar.getPartOfSpeechSize();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -291,7 +291,7 @@ public static void main(String[] args) throws IOException {
try (PrintStream output = outputFileName == null ? new FileOrStdoutPrintStream()
: new FileOrStdoutPrintStream(outputFileName);
Dictionary dict = new DictionaryFactory().create(config)) {
Tokenizer tokenizer = dict.create();
Tokenizer tokenizer = dict.tokenizer();
if (isEnableDump) {
tokenizer.setDumpOutput(output);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,16 @@ public void tearDown() throws IOException {
}

// Verifies the deprecated Dictionary.create() entry point still yields a
// Tokenizer. The test method is itself marked @Deprecated so that javac does
// not emit a deprecation warning for the create() call (deprecated code may
// reference deprecated members without warning).
@Test
@Deprecated
public void create() {
assertThat(dict.create(), isA(Tokenizer.class));
}

// Verifies the replacement API, Dictionary.tokenizer(), yields a Tokenizer.
@Test
public void createTokenizer() {
assertThat(dict.tokenizer(), isA(Tokenizer.class));
}

@Test
public void getPartOfSpeechSize() {
assertThat(dict.getPartOfSpeechSize(), is(8));
Expand All @@ -64,7 +70,7 @@ public void instantiateConfigWithoutCharDef() throws IOException {
cfg.systemDictionary(TestDictionary.INSTANCE.getSystemDict());
try (JapaneseDictionary jd = (JapaneseDictionary) new DictionaryFactory().create(cfg)) {
assertThat(jd, notNullValue());
assertThat(jd.create(), notNullValue());
assertThat(jd.tokenizer(), notNullValue());
}
}

Expand All @@ -79,14 +85,14 @@ private JapaneseDictionary makeDictionaryIncorrectly() throws IOException {
// makeDictionaryIncorrectly() returns a dictionary that has already been
// closed; requesting a tokenizer from it must raise IllegalStateException
// rather than fail later in some undefined way.
@Test(expected = IllegalStateException.class)
public void throwExceptionOnDictionaryUsageAfterClose() throws IOException {
JapaneseDictionary dic = makeDictionaryIncorrectly();
Tokenizer ignored = dic.tokenizer();
}

// Deliberately broken fixture: the tokenizer is created inside
// try-with-resources and escapes the block, so by the time the caller
// receives it the backing dictionary has been closed. Used to test
// use-after-close error handling.
private Tokenizer makeTokenizerIncorrectly() throws IOException {
Config cfg = Config.fromClasspath("sudachi_minimum.json");
cfg.systemDictionary(TestDictionary.INSTANCE.getSystemDict());
try (JapaneseDictionary jd = (JapaneseDictionary) new DictionaryFactory().create(cfg)) {
return jd.tokenizer();
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ class JapaneseTokenizerMaskTest {
cfg0.addOovProviderPlugin(SimpleOovProviderPlugin::class.java)
val cfg = cfg0.withFallback(TestDictionary.user0Cfg())
val dic = DictionaryFactory().create(cfg) as JapaneseDictionary
val tokenizer = dic.create()
val tokenizer = dic.tokenizer()

assertEquals(2, dic.oovProviderPlugins.size)
assertIs<CaptureOtherWords>(dic.oovProviderPlugins[0])
Expand All @@ -62,7 +62,7 @@ class JapaneseTokenizerMaskTest {
val cfg = TestDictionary.user0Cfg()
cfg.addOovProviderPlugin(CaptureOtherWords::class.java)
val dic = DictionaryFactory().create(cfg) as JapaneseDictionary
val tokenizer = dic.create()
val tokenizer = dic.tokenizer()

assertIs<SimpleOovProviderPlugin>(dic.oovProviderPlugins[0])
assertIs<CaptureOtherWords>(dic.oovProviderPlugins[1])
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ import kotlin.test.assertEquals
import kotlin.test.assertFailsWith

class JapaneseTokenizerStreamingTest {
private val tokenizer = TestDictionary.user0().create()
private val tokenizer = TestDictionary.user0().tokenizer()

class BadReader(private val data: String, private val window: Int = 512) : Reader() {

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ public class JapaneseTokenizerTest {
@Before
public void setUp() {
dict = TestDictionary.INSTANCE.user1();
tokenizer = (JapaneseTokenizer) dict.create();
tokenizer = (JapaneseTokenizer) dict.tokenizer();
}

private static Matcher<Morpheme> morpheme(String surface, int begin, int end) {
Expand Down Expand Up @@ -353,7 +353,7 @@ public void zeroLengthMorpheme() {
public void disableEmptyMorpheme() throws IOException {
Config config = TestDictionary.INSTANCE.user1Cfg();
dict = new DictionaryFactory().create(Config.empty().withFallback(config).allowEmptyMorpheme(false));
tokenizer = (JapaneseTokenizer) dict.create();
tokenizer = (JapaneseTokenizer) dict.tokenizer();

List<Morpheme> s = tokenizer.tokenize("…");
assertThat(s.size(), is(3));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ public class JoinKatakanaOovPluginTest {
@Before
public void setUp() throws IOException {
Dictionary dict = TestDictionary.INSTANCE.user1();
tokenizer = (JapaneseTokenizer) dict.create();
tokenizer = (JapaneseTokenizer) dict.tokenizer();
plugin = new JoinKatakanaOovPlugin();
plugin.setOovFactory((short) -1);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ public void setUp() throws IOException {
Config config = TestDictionary.INSTANCE.user0Cfg()
.characterDefinition(getClass().getClassLoader().getResource("joinnumeric/char.def"));
Dictionary dict = new DictionaryFactory().create(config);
tokenizer = (JapaneseTokenizer) dict.create();
tokenizer = (JapaneseTokenizer) dict.tokenizer();

plugin = new JoinNumericPlugin();
plugin.setSettings(Settings.parse("{}", PathAnchor.none()));
Expand Down
10 changes: 5 additions & 5 deletions src/test/java/com/worksap/nlp/sudachi/MorphemeImplTest.kt
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ class MorphemeImplTest {
fun useToString() {
val dic = TestDictionary.user0()
// should be split into す/だ/ち, all of them are OOV
val sudachi = dic.create().tokenize("すだち")
val sudachi = dic.tokenizer().tokenize("すだち")
// wid of OOV is (0xf, posId)
assertEquals(
"MorphemeImpl{begin=0, end=1, surface=す, pos=4/名詞,普通名詞,一般,*,*,*, wid=(15,4)}",
Expand All @@ -36,20 +36,20 @@ class MorphemeImplTest {
fun userdata() {
// A word from the system dictionary carries no user data.
val systemDic = TestDictionary.user0()
assertTrue(systemDic.tokenizer().tokenize("東京")[0].getUserData().isEmpty())

// An OOV morpheme carries no user data either.
assertTrue(systemDic.tokenizer().tokenize("すだち")[0].getUserData().isEmpty())

// A user-dictionary entry that defines user data exposes it verbatim.
val userDic = TestDictionary.user1()
assertEquals("徳島県産", userDic.tokenizer().tokenize("すだち")[0].getUserData())

// A user-dictionary entry without user data reports it as empty.
assertTrue(userDic.tokenizer().tokenize("ぴらる")[0].getUserData().isEmpty())
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ class OovProviderPluginTest {
val dict = DictionaryFactory().create(cfg) as JapaneseDictionary
val plugin = assertIs<FakeOovProvider>(dict.oovProviderPlugins.last())
assertEquals(8, plugin.posId)
val tokinzer = dict.create()
val tokinzer = dict.tokenizer()
val tokens = tokinzer.tokenize("すだちかぼす")
assertEquals("スダチ", tokens[0].partOfSpeech()[5])
assertEquals("カボス", tokens[1].partOfSpeech()[5])
Expand Down
11 changes: 6 additions & 5 deletions src/test/java/com/worksap/nlp/sudachi/PosMatcherTest.kt
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,12 @@ import kotlin.test.*
class PosMatcherTest {

private val dic = DictionaryFactory().create(TestDictionary.user2Cfg()) as JapaneseDictionary
private val tok = dic.tokenizer()

@Test
fun basic() {
val nouns = dic.posMatcher(PartialPOS("名詞"))
val morphs = dic.create().tokenize("京都に行った")
val morphs = tok.tokenize("京都に行った")
assertEquals(4, morphs.size)
assertTrue(nouns.test(morphs[0]))
assertFalse(nouns.test(morphs[1]))
Expand All @@ -37,7 +38,7 @@ class PosMatcherTest {
@Test
fun userDic() {
val filter = dic.posMatcher { it[3] == "ミカン科" }
val morphs = dic.create().tokenize("すだちにかぼす")
val morphs = tok.tokenize("すだちにかぼす")
assertEquals(3, morphs.size)
assertTrue(filter.test(morphs[0]))
assertFalse(filter.test(morphs[1]))
Expand All @@ -49,7 +50,7 @@ class PosMatcherTest {
val f1 = dic.posMatcher { it[5] == "スダチ" }
val f2 = dic.posMatcher { it[5] == "カボス" }
val filter = f1.union(f2)
val morphs = dic.create().tokenize("すだちにかぼす")
val morphs = tok.tokenize("すだちにかぼす")
assertEquals(3, morphs.size)
assertTrue(filter.test(morphs[0]))
assertFalse(filter.test(morphs[1]))
Expand All @@ -61,7 +62,7 @@ class PosMatcherTest {
val f1 = dic.posMatcher { it[5] == "終止形-一般" }
val f2 = dic.posMatcher { it[0] == "動詞" }
val filter = f1.intersection(f2)
val morphs = dic.create().tokenize("いった東京行く")
val morphs = tok.tokenize("いった東京行く")
assertEquals(4, morphs.size)
assertFalse(filter.test(morphs[0]))
assertFalse(filter.test(morphs[1]))
Expand All @@ -72,7 +73,7 @@ class PosMatcherTest {
@Test
fun invert() {
val filter = dic.posMatcher { it[3] == "ミカン科" }.invert()
val morphs = dic.create().tokenize("すだちにかぼす")
val morphs = tok.tokenize("すだちにかぼす")
assertEquals(3, morphs.size)
assertFalse(filter.test(morphs[0]))
assertTrue(filter.test(morphs[1]))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ class RegexOovProviderTest {
.addList("pos", "名詞", "普通名詞", "一般", "*", "*", "*")
@Suppress("UNCHECKED_CAST") block(cfg, pluginCfg as Config.PluginConf<RegexOovProvider>)
// prepend our OOV configuration to the main configuration
return DictionaryFactory().create(cfg.withFallback(TestDictionary.user0Cfg())).create()
return DictionaryFactory().create(cfg.withFallback(TestDictionary.user0Cfg())).tokenizer()
}

@Test
Expand Down
8 changes: 4 additions & 4 deletions src/test/java/com/worksap/nlp/sudachi/UserDictionaryTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ public void fullUserDict() throws IOException {
config.addUserDictionary(instance.getUserDict2());

try (Dictionary dict = new DictionaryFactory().create(config)) {
Tokenizer tokenizer = dict.create();
Tokenizer tokenizer = dict.tokenizer();
List<Morpheme> morphs = tokenizer.tokenize("ぴさる");
assertThat(morphs.size(), is(1));
Morpheme m = morphs.get(0);
Expand All @@ -62,7 +62,7 @@ public void splitForUserDict() throws IOException {
TestDictionary td = TestDictionary.INSTANCE;
Config config = td.user0Cfg().addUserDictionary(td.getUserDict2()).addUserDictionary(td.getUserDict1());
try (Dictionary dict = new DictionaryFactory().create(config)) {
Tokenizer tokenizer = dict.create();
Tokenizer tokenizer = dict.tokenizer();
List<Morpheme> morphs = tokenizer.tokenize("東京府");
assertThat(morphs.size(), is(1));
Morpheme m = morphs.get(0);
Expand All @@ -77,7 +77,7 @@ public void splitForUserDict() throws IOException {
public void userDefinedPos() throws IOException {
Config config = TestDictionary.INSTANCE.user2Cfg();
try (Dictionary dict = new DictionaryFactory().create(config)) {
Tokenizer tokenizer = dict.create();
Tokenizer tokenizer = dict.tokenizer();
List<Morpheme> morphs = tokenizer.tokenize("すだちかぼす");
assertThat(morphs.size(), is(2));
Morpheme m = morphs.get(0);
Expand All @@ -89,7 +89,7 @@ public void userDefinedPos() throws IOException {
TestDictionary td = TestDictionary.INSTANCE;
config = td.user0Cfg().addUserDictionary(td.getUserDict2()).addUserDictionary(td.getUserDict1());
try (Dictionary dict = new DictionaryFactory().create(config)) {
Tokenizer tokenizer = dict.create();
Tokenizer tokenizer = dict.tokenizer();
List<Morpheme> morphs = tokenizer.tokenize("すだちかぼす");
assertThat(morphs.size(), is(2));
Morpheme m = morphs.get(0);
Expand Down

0 comments on commit b5d8753

Please sign in to comment.