From 4f660956d9dbc5be50b4d89722c3a65fce3b19ce Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Wed, 20 Nov 2024 17:14:55 +0900 Subject: [PATCH 1/3] make Dictionary.create deprecated and add Dictionary.tokenizer instead --- src/main/java/com/worksap/nlp/sudachi/Dictionary.java | 10 ++++++++++ .../com/worksap/nlp/sudachi/JapaneseDictionary.java | 7 ++++++- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/src/main/java/com/worksap/nlp/sudachi/Dictionary.java b/src/main/java/com/worksap/nlp/sudachi/Dictionary.java index e0dc39c7..ff16e414 100644 --- a/src/main/java/com/worksap/nlp/sudachi/Dictionary.java +++ b/src/main/java/com/worksap/nlp/sudachi/Dictionary.java @@ -41,6 +41,16 @@ public interface Dictionary extends AutoCloseable { * * @return a tokenizer */ + public Tokenizer tokenizer(); + + /** + * Creates a tokenizer instance. + * + * @return a tokenizer + * + * @deprecated renamed to {@link #tokenizer()} + */ + @Deprecated public Tokenizer create(); @Override diff --git a/src/main/java/com/worksap/nlp/sudachi/JapaneseDictionary.java b/src/main/java/com/worksap/nlp/sudachi/JapaneseDictionary.java index 2da590d1..c563cfb7 100644 --- a/src/main/java/com/worksap/nlp/sudachi/JapaneseDictionary.java +++ b/src/main/java/com/worksap/nlp/sudachi/JapaneseDictionary.java @@ -128,7 +128,7 @@ public void close() throws IOException { } @Override - public Tokenizer create() { + public Tokenizer tokenizer() { if (grammar == null || lexicon == null) { throw new IllegalStateException("trying to use closed dictionary"); } @@ -140,6 +140,11 @@ public Tokenizer create() { return tokenizer; } + @Override + public Tokenizer create() { + return tokenizer(); + } + @Override public int getPartOfSpeechSize() { return grammar.getPartOfSpeechSize(); From a349c5445bcd7060ff35d4e772e203f70977a545 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Wed, 20 Nov 2024 17:15:16 +0900 Subject: [PATCH 2/3] update the use of dict.create -> dict.tokenizer --- 
.../com/worksap/nlp/sudachi/SudachiCommandLine.java | 2 +- .../worksap/nlp/sudachi/JapaneseDictionaryTest.java | 8 ++++---- .../worksap/nlp/sudachi/JapaneseTokenizerMaskTest.kt | 4 ++-- .../nlp/sudachi/JapaneseTokenizerStreamingTest.kt | 2 +- .../worksap/nlp/sudachi/JapaneseTokenizerTest.java | 4 ++-- .../nlp/sudachi/JoinKatakanaOovPluginTest.java | 2 +- .../worksap/nlp/sudachi/JoinNumericPluginTest.java | 2 +- .../java/com/worksap/nlp/sudachi/MorphemeImplTest.kt | 10 +++++----- .../com/worksap/nlp/sudachi/OovProviderPluginTest.kt | 2 +- .../java/com/worksap/nlp/sudachi/PosMatcherTest.kt | 11 ++++++----- .../com/worksap/nlp/sudachi/RegexOovProviderTest.kt | 2 +- .../com/worksap/nlp/sudachi/UserDictionaryTest.java | 8 ++++---- 12 files changed, 29 insertions(+), 28 deletions(-) diff --git a/src/main/java/com/worksap/nlp/sudachi/SudachiCommandLine.java b/src/main/java/com/worksap/nlp/sudachi/SudachiCommandLine.java index 462269e2..477f8a25 100644 --- a/src/main/java/com/worksap/nlp/sudachi/SudachiCommandLine.java +++ b/src/main/java/com/worksap/nlp/sudachi/SudachiCommandLine.java @@ -291,7 +291,7 @@ public static void main(String[] args) throws IOException { try (PrintStream output = outputFileName == null ? 
new FileOrStdoutPrintStream() : new FileOrStdoutPrintStream(outputFileName); Dictionary dict = new DictionaryFactory().create(config)) { - Tokenizer tokenizer = dict.create(); + Tokenizer tokenizer = dict.tokenizer(); if (isEnableDump) { tokenizer.setDumpOutput(output); } diff --git a/src/test/java/com/worksap/nlp/sudachi/JapaneseDictionaryTest.java b/src/test/java/com/worksap/nlp/sudachi/JapaneseDictionaryTest.java index d00c577a..b118f8d4 100644 --- a/src/test/java/com/worksap/nlp/sudachi/JapaneseDictionaryTest.java +++ b/src/test/java/com/worksap/nlp/sudachi/JapaneseDictionaryTest.java @@ -43,7 +43,7 @@ public void tearDown() throws IOException { @Test public void create() { - assertThat(dict.create(), isA(Tokenizer.class)); + assertThat(dict.tokenizer(), isA(Tokenizer.class)); } @Test @@ -64,7 +64,7 @@ public void instantiateConfigWithoutCharDef() throws IOException { cfg.systemDictionary(TestDictionary.INSTANCE.getSystemDict()); try (JapaneseDictionary jd = (JapaneseDictionary) new DictionaryFactory().create(cfg)) { assertThat(jd, notNullValue()); - assertThat(jd.create(), notNullValue()); + assertThat(jd.tokenizer(), notNullValue()); } } @@ -79,14 +79,14 @@ private JapaneseDictionary makeDictionaryIncorrectly() throws IOException { @Test(expected = IllegalStateException.class) public void throwExceptionOnDictionaryUsageAfterClose() throws IOException { JapaneseDictionary dic = makeDictionaryIncorrectly(); - Tokenizer ignored = dic.create(); + Tokenizer ignored = dic.tokenizer(); } private Tokenizer makeTokenizerIncorrectly() throws IOException { Config cfg = Config.fromClasspath("sudachi_minimum.json"); cfg.systemDictionary(TestDictionary.INSTANCE.getSystemDict()); try (JapaneseDictionary jd = (JapaneseDictionary) new DictionaryFactory().create(cfg)) { - return jd.create(); + return jd.tokenizer(); } } diff --git a/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerMaskTest.kt b/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerMaskTest.kt index 
ff845dab..5fd6fbf7 100644 --- a/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerMaskTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerMaskTest.kt @@ -41,7 +41,7 @@ class JapaneseTokenizerMaskTest { cfg0.addOovProviderPlugin(SimpleOovProviderPlugin::class.java) val cfg = cfg0.withFallback(TestDictionary.user0Cfg()) val dic = DictionaryFactory().create(cfg) as JapaneseDictionary - val tokenizer = dic.create() + val tokenizer = dic.tokenizer() assertEquals(2, dic.oovProviderPlugins.size) assertIs(dic.oovProviderPlugins[0]) @@ -62,7 +62,7 @@ class JapaneseTokenizerMaskTest { val cfg = TestDictionary.user0Cfg() cfg.addOovProviderPlugin(CaptureOtherWords::class.java) val dic = DictionaryFactory().create(cfg) as JapaneseDictionary - val tokenizer = dic.create() + val tokenizer = dic.tokenizer() assertIs(dic.oovProviderPlugins[0]) assertIs(dic.oovProviderPlugins[1]) diff --git a/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerStreamingTest.kt b/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerStreamingTest.kt index b5f3d54a..85aebbae 100644 --- a/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerStreamingTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerStreamingTest.kt @@ -24,7 +24,7 @@ import kotlin.test.assertEquals import kotlin.test.assertFailsWith class JapaneseTokenizerStreamingTest { - private val tokenizer = TestDictionary.user0().create() + private val tokenizer = TestDictionary.user0().tokenizer() class BadReader(private val data: String, private val window: Int = 512) : Reader() { diff --git a/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java b/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java index ad9321ad..50494e85 100644 --- a/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java +++ b/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java @@ -48,7 +48,7 @@ public class JapaneseTokenizerTest { @Before public void setUp() { dict = 
TestDictionary.INSTANCE.user1(); - tokenizer = (JapaneseTokenizer) dict.create(); + tokenizer = (JapaneseTokenizer) dict.tokenizer(); } private static Matcher morpheme(String surface, int begin, int end) { @@ -353,7 +353,7 @@ public void zeroLengthMorpheme() { public void disableEmptyMorpheme() throws IOException { Config config = TestDictionary.INSTANCE.user1Cfg(); dict = new DictionaryFactory().create(Config.empty().withFallback(config).allowEmptyMorpheme(false)); - tokenizer = (JapaneseTokenizer) dict.create(); + tokenizer = (JapaneseTokenizer) dict.tokenizer(); List s = tokenizer.tokenize("…"); assertThat(s.size(), is(3)); diff --git a/src/test/java/com/worksap/nlp/sudachi/JoinKatakanaOovPluginTest.java b/src/test/java/com/worksap/nlp/sudachi/JoinKatakanaOovPluginTest.java index e7705314..e8c8c0e6 100644 --- a/src/test/java/com/worksap/nlp/sudachi/JoinKatakanaOovPluginTest.java +++ b/src/test/java/com/worksap/nlp/sudachi/JoinKatakanaOovPluginTest.java @@ -32,7 +32,7 @@ public class JoinKatakanaOovPluginTest { @Before public void setUp() throws IOException { Dictionary dict = TestDictionary.INSTANCE.user1(); - tokenizer = (JapaneseTokenizer) dict.create(); + tokenizer = (JapaneseTokenizer) dict.tokenizer(); plugin = new JoinKatakanaOovPlugin(); plugin.setOovFactory((short) -1); } diff --git a/src/test/java/com/worksap/nlp/sudachi/JoinNumericPluginTest.java b/src/test/java/com/worksap/nlp/sudachi/JoinNumericPluginTest.java index cce4c843..384d9436 100644 --- a/src/test/java/com/worksap/nlp/sudachi/JoinNumericPluginTest.java +++ b/src/test/java/com/worksap/nlp/sudachi/JoinNumericPluginTest.java @@ -33,7 +33,7 @@ public void setUp() throws IOException { Config config = TestDictionary.INSTANCE.user0Cfg() .characterDefinition(getClass().getClassLoader().getResource("joinnumeric/char.def")); Dictionary dict = new DictionaryFactory().create(config); - tokenizer = (JapaneseTokenizer) dict.create(); + tokenizer = (JapaneseTokenizer) dict.tokenizer(); plugin = new 
JoinNumericPlugin(); plugin.setSettings(Settings.parse("{}", PathAnchor.none())); diff --git a/src/test/java/com/worksap/nlp/sudachi/MorphemeImplTest.kt b/src/test/java/com/worksap/nlp/sudachi/MorphemeImplTest.kt index 4b744239..c959aeda 100644 --- a/src/test/java/com/worksap/nlp/sudachi/MorphemeImplTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/MorphemeImplTest.kt @@ -25,7 +25,7 @@ class MorphemeImplTest { fun useToString() { val dic = TestDictionary.user0() // should be split into す/だ/ち, all of them are OOV - val sudachi = dic.create().tokenize("すだち") + val sudachi = dic.tokenizer().tokenize("すだち") // wid of OOV is (0xf, posId) assertEquals( "MorphemeImpl{begin=0, end=1, surface=す, pos=4/名詞,普通名詞,一般,*,*,*, wid=(15,4)}", @@ -36,20 +36,20 @@ class MorphemeImplTest { fun userdata() { // system val sdic = TestDictionary.user0() - val tokyo = sdic.create().tokenize("東京") + val tokyo = sdic.tokenizer().tokenize("東京") assertTrue(tokyo[0].getUserData().isEmpty()) // oov - val oovs = sdic.create().tokenize("すだち") + val oovs = sdic.tokenizer().tokenize("すだち") assertTrue(oovs[0].getUserData().isEmpty()) // user with data val udic = TestDictionary.user1() - val sudachi = udic.create().tokenize("すだち") + val sudachi = udic.tokenizer().tokenize("すだち") assertEquals("徳島県産", sudachi[0].getUserData()) // user without data - val piraru = udic.create().tokenize("ぴらる") + val piraru = udic.tokenizer().tokenize("ぴらる") assertTrue(piraru[0].getUserData().isEmpty()) } } diff --git a/src/test/java/com/worksap/nlp/sudachi/OovProviderPluginTest.kt b/src/test/java/com/worksap/nlp/sudachi/OovProviderPluginTest.kt index 681d0d74..bc865397 100644 --- a/src/test/java/com/worksap/nlp/sudachi/OovProviderPluginTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/OovProviderPluginTest.kt @@ -108,7 +108,7 @@ class OovProviderPluginTest { val dict = DictionaryFactory().create(cfg) as JapaneseDictionary val plugin = assertIs(dict.oovProviderPlugins.last()) assertEquals(8, plugin.posId) - val tokinzer 
= dict.create() + val tokinzer = dict.tokenizer() val tokens = tokinzer.tokenize("すだちかぼす") assertEquals("スダチ", tokens[0].partOfSpeech()[5]) assertEquals("カボス", tokens[1].partOfSpeech()[5]) diff --git a/src/test/java/com/worksap/nlp/sudachi/PosMatcherTest.kt b/src/test/java/com/worksap/nlp/sudachi/PosMatcherTest.kt index cdca9810..2dc1d931 100644 --- a/src/test/java/com/worksap/nlp/sudachi/PosMatcherTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/PosMatcherTest.kt @@ -22,11 +22,12 @@ import kotlin.test.* class PosMatcherTest { private val dic = DictionaryFactory().create(TestDictionary.user2Cfg()) as JapaneseDictionary + private val tok = dic.tokenizer() @Test fun basic() { val nouns = dic.posMatcher(PartialPOS("名詞")) - val morphs = dic.create().tokenize("京都に行った") + val morphs = tok.tokenize("京都に行った") assertEquals(4, morphs.size) assertTrue(nouns.test(morphs[0])) assertFalse(nouns.test(morphs[1])) @@ -37,7 +38,7 @@ class PosMatcherTest { @Test fun userDic() { val filter = dic.posMatcher { it[3] == "ミカン科" } - val morphs = dic.create().tokenize("すだちにかぼす") + val morphs = tok.tokenize("すだちにかぼす") assertEquals(3, morphs.size) assertTrue(filter.test(morphs[0])) assertFalse(filter.test(morphs[1])) @@ -49,7 +50,7 @@ class PosMatcherTest { val f1 = dic.posMatcher { it[5] == "スダチ" } val f2 = dic.posMatcher { it[5] == "カボス" } val filter = f1.union(f2) - val morphs = dic.create().tokenize("すだちにかぼす") + val morphs = tok.tokenize("すだちにかぼす") assertEquals(3, morphs.size) assertTrue(filter.test(morphs[0])) assertFalse(filter.test(morphs[1])) @@ -61,7 +62,7 @@ class PosMatcherTest { val f1 = dic.posMatcher { it[5] == "終止形-一般" } val f2 = dic.posMatcher { it[0] == "動詞" } val filter = f1.intersection(f2) - val morphs = dic.create().tokenize("いった東京行く") + val morphs = tok.tokenize("いった東京行く") assertEquals(4, morphs.size) assertFalse(filter.test(morphs[0])) assertFalse(filter.test(morphs[1])) @@ -72,7 +73,7 @@ class PosMatcherTest { @Test fun invert() { val filter = dic.posMatcher { it[3] 
== "ミカン科" }.invert() - val morphs = dic.create().tokenize("すだちにかぼす") + val morphs = tok.tokenize("すだちにかぼす") assertEquals(3, morphs.size) assertFalse(filter.test(morphs[0])) assertTrue(filter.test(morphs[1])) diff --git a/src/test/java/com/worksap/nlp/sudachi/RegexOovProviderTest.kt b/src/test/java/com/worksap/nlp/sudachi/RegexOovProviderTest.kt index f53de5b6..660a226d 100644 --- a/src/test/java/com/worksap/nlp/sudachi/RegexOovProviderTest.kt +++ b/src/test/java/com/worksap/nlp/sudachi/RegexOovProviderTest.kt @@ -34,7 +34,7 @@ class RegexOovProviderTest { .addList("pos", "名詞", "普通名詞", "一般", "*", "*", "*") @Suppress("UNCHECKED_CAST") block(cfg, pluginCfg as Config.PluginConf) // prepend our OOV configuration to the main configuration - return DictionaryFactory().create(cfg.withFallback(TestDictionary.user0Cfg())).create() + return DictionaryFactory().create(cfg.withFallback(TestDictionary.user0Cfg())).tokenizer() } @Test diff --git a/src/test/java/com/worksap/nlp/sudachi/UserDictionaryTest.java b/src/test/java/com/worksap/nlp/sudachi/UserDictionaryTest.java index 320eabb4..82563cc6 100644 --- a/src/test/java/com/worksap/nlp/sudachi/UserDictionaryTest.java +++ b/src/test/java/com/worksap/nlp/sudachi/UserDictionaryTest.java @@ -38,7 +38,7 @@ public void fullUserDict() throws IOException { config.addUserDictionary(instance.getUserDict2()); try (Dictionary dict = new DictionaryFactory().create(config)) { - Tokenizer tokenizer = dict.create(); + Tokenizer tokenizer = dict.tokenizer(); List morphs = tokenizer.tokenize("ぴさる"); assertThat(morphs.size(), is(1)); Morpheme m = morphs.get(0); @@ -62,7 +62,7 @@ public void splitForUserDict() throws IOException { TestDictionary td = TestDictionary.INSTANCE; Config config = td.user0Cfg().addUserDictionary(td.getUserDict2()).addUserDictionary(td.getUserDict1()); try (Dictionary dict = new DictionaryFactory().create(config)) { - Tokenizer tokenizer = dict.create(); + Tokenizer tokenizer = dict.tokenizer(); List morphs = 
tokenizer.tokenize("東京府"); assertThat(morphs.size(), is(1)); Morpheme m = morphs.get(0); @@ -77,7 +77,7 @@ public void splitForUserDict() throws IOException { public void userDefinedPos() throws IOException { Config config = TestDictionary.INSTANCE.user2Cfg(); try (Dictionary dict = new DictionaryFactory().create(config)) { - Tokenizer tokenizer = dict.create(); + Tokenizer tokenizer = dict.tokenizer(); List morphs = tokenizer.tokenize("すだちかぼす"); assertThat(morphs.size(), is(2)); Morpheme m = morphs.get(0); @@ -89,7 +89,7 @@ public void userDefinedPos() throws IOException { TestDictionary td = TestDictionary.INSTANCE; config = td.user0Cfg().addUserDictionary(td.getUserDict2()).addUserDictionary(td.getUserDict1()); try (Dictionary dict = new DictionaryFactory().create(config)) { - Tokenizer tokenizer = dict.create(); + Tokenizer tokenizer = dict.tokenizer(); List morphs = tokenizer.tokenize("すだちかぼす"); assertThat(morphs.size(), is(2)); Morpheme m = morphs.get(0); From 3aedcefb80ae485be0b499210ce7bba51675b8ce Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Tue, 26 Nov 2024 09:12:45 +0900 Subject: [PATCH 3/3] add test for deprecated dict.create for now --- .../com/worksap/nlp/sudachi/JapaneseDictionaryTest.java | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/test/java/com/worksap/nlp/sudachi/JapaneseDictionaryTest.java b/src/test/java/com/worksap/nlp/sudachi/JapaneseDictionaryTest.java index b118f8d4..fc861f9d 100644 --- a/src/test/java/com/worksap/nlp/sudachi/JapaneseDictionaryTest.java +++ b/src/test/java/com/worksap/nlp/sudachi/JapaneseDictionaryTest.java @@ -42,7 +42,13 @@ public void tearDown() throws IOException { } @Test + @Deprecated public void create() { + assertThat(dict.create(), isA(Tokenizer.class)); + } + + @Test + public void createTokenizer() { assertThat(dict.tokenizer(), isA(Tokenizer.class)); }