Rename Dictionary.create to Dictionary.tokenizer #246

Merged: 3 commits, Nov 26, 2024
Changes from all commits
10 changes: 10 additions & 0 deletions src/main/java/com/worksap/nlp/sudachi/Dictionary.java
@@ -41,6 +41,16 @@ public interface Dictionary extends AutoCloseable {
 *
 * @return a tokenizer
 */
+public Tokenizer tokenizer();
+
+/**
+ * Creates a tokenizer instance.
+ *
+ * @return a tokenizer
+ *
+ * @deprecated renamed to {@link #tokenizer()}
+ */
+@Deprecated
public Tokenizer create();

@Override
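For callers, the migration is a one-line rename: tokenizer() is the new name, and the old create() keeps compiling but is flagged deprecated. A minimal usage sketch, assuming a sudachi.json configuration and its dictionaries are available on the classpath (the resource name and class name are illustrative, not part of this PR):

import com.worksap.nlp.sudachi.Config;
import com.worksap.nlp.sudachi.Dictionary;
import com.worksap.nlp.sudachi.DictionaryFactory;
import com.worksap.nlp.sudachi.Morpheme;
import com.worksap.nlp.sudachi.Tokenizer;
import java.util.List;

public class TokenizerRenameExample {
    public static void main(String[] args) throws Exception {
        Config config = Config.fromClasspath("sudachi.json");
        try (Dictionary dict = new DictionaryFactory().create(config)) {
            // Before this PR: Tokenizer tokenizer = dict.create();
            Tokenizer tokenizer = dict.tokenizer();
            List<Morpheme> morphemes = tokenizer.tokenize("すだちとかぼす");
            for (Morpheme m : morphemes) {
                System.out.println(m.surface() + "\t" + m.partOfSpeech());
            }
        }
    }
}

The new name also ends the confusion visible throughout the tests below, where new DictionaryFactory().create(config) builds a Dictionary and dict.create() built a Tokenizer: two different operations sharing one name.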
@@ -128,7 +128,7 @@ public void close() throws IOException {
}

@Override
-public Tokenizer create() {
+public Tokenizer tokenizer() {
if (grammar == null || lexicon == null) {
throw new IllegalStateException("trying to use closed dictionary");
}
@@ -140,6 +140,11 @@ public Tokenizer create() {
return tokenizer;
}

+@Override
+public Tokenizer create() {
+    return tokenizer();
+}
+
@Override
public int getPartOfSpeechSize() {
return grammar.getPartOfSpeechSize();
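The implementation keeps create() as a thin delegate to tokenizer(), so existing callers keep working unchanged and only pick up a deprecation warning at compile time. A sketch of what downstream code sees (dict is any open Dictionary):

Tokenizer deprecatedStyle = dict.create();   // delegates to tokenizer(), warns on compile
Tokenizer preferredStyle = dict.tokenizer(); // new canonical entry point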
@@ -291,7 +291,7 @@ public static void main(String[] args) throws IOException {
try (PrintStream output = outputFileName == null ? new FileOrStdoutPrintStream()
: new FileOrStdoutPrintStream(outputFileName);
Dictionary dict = new DictionaryFactory().create(config)) {
-Tokenizer tokenizer = dict.create();
+Tokenizer tokenizer = dict.tokenizer();
if (isEnableDump) {
tokenizer.setDumpOutput(output);
}
@@ -42,10 +42,16 @@ public void tearDown() throws IOException {
}

@Test
+@Deprecated
public void create() {
assertThat(dict.create(), isA(Tokenizer.class));
}

+@Test
+public void createTokenizer() {
+    assertThat(dict.tokenizer(), isA(Tokenizer.class));
+}
+
@Test
public void getPartOfSpeechSize() {
assertThat(dict.getPartOfSpeechSize(), is(8));
@@ -64,7 +70,7 @@ public void instantiateConfigWithoutCharDef() throws IOException {
cfg.systemDictionary(TestDictionary.INSTANCE.getSystemDict());
try (JapaneseDictionary jd = (JapaneseDictionary) new DictionaryFactory().create(cfg)) {
assertThat(jd, notNullValue());
-assertThat(jd.create(), notNullValue());
+assertThat(jd.tokenizer(), notNullValue());
}
}

@@ -79,14 +85,14 @@ private JapaneseDictionary makeDictionaryIncorrectly() throws IOException {
@Test(expected = IllegalStateException.class)
public void throwExceptionOnDictionaryUsageAfterClose() throws IOException {
JapaneseDictionary dic = makeDictionaryIncorrectly();
-Tokenizer ignored = dic.create();
+Tokenizer ignored = dic.tokenizer();
}

private Tokenizer makeTokenizerIncorrectly() throws IOException {
Config cfg = Config.fromClasspath("sudachi_minimum.json");
cfg.systemDictionary(TestDictionary.INSTANCE.getSystemDict());
try (JapaneseDictionary jd = (JapaneseDictionary) new DictionaryFactory().create(cfg)) {
-return jd.create();
+return jd.tokenizer();
}
}

@@ -41,7 +41,7 @@ class JapaneseTokenizerMaskTest {
cfg0.addOovProviderPlugin(SimpleOovProviderPlugin::class.java)
val cfg = cfg0.withFallback(TestDictionary.user0Cfg())
val dic = DictionaryFactory().create(cfg) as JapaneseDictionary
-val tokenizer = dic.create()
+val tokenizer = dic.tokenizer()

assertEquals(2, dic.oovProviderPlugins.size)
assertIs<CaptureOtherWords>(dic.oovProviderPlugins[0])
@@ -62,7 +62,7 @@ class JapaneseTokenizerMaskTest {
val cfg = TestDictionary.user0Cfg()
cfg.addOovProviderPlugin(CaptureOtherWords::class.java)
val dic = DictionaryFactory().create(cfg) as JapaneseDictionary
-val tokenizer = dic.create()
+val tokenizer = dic.tokenizer()

assertIs<SimpleOovProviderPlugin>(dic.oovProviderPlugins[0])
assertIs<CaptureOtherWords>(dic.oovProviderPlugins[1])
@@ -24,7 +24,7 @@ import kotlin.test.assertEquals
import kotlin.test.assertFailsWith

class JapaneseTokenizerStreamingTest {
-private val tokenizer = TestDictionary.user0().create()
+private val tokenizer = TestDictionary.user0().tokenizer()

class BadReader(private val data: String, private val window: Int = 512) : Reader() {

@@ -48,7 +48,7 @@ public class JapaneseTokenizerTest {
@Before
public void setUp() {
dict = TestDictionary.INSTANCE.user1();
-tokenizer = (JapaneseTokenizer) dict.create();
+tokenizer = (JapaneseTokenizer) dict.tokenizer();
}

private static Matcher<Morpheme> morpheme(String surface, int begin, int end) {
@@ -353,7 +353,7 @@ public void zeroLengthMorpheme() {
public void disableEmptyMorpheme() throws IOException {
Config config = TestDictionary.INSTANCE.user1Cfg();
dict = new DictionaryFactory().create(Config.empty().withFallback(config).allowEmptyMorpheme(false));
-tokenizer = (JapaneseTokenizer) dict.create();
+tokenizer = (JapaneseTokenizer) dict.tokenizer();

List<Morpheme> s = tokenizer.tokenize("…");
assertThat(s.size(), is(3));
@@ -32,7 +32,7 @@ public class JoinKatakanaOovPluginTest {
@Before
public void setUp() throws IOException {
Dictionary dict = TestDictionary.INSTANCE.user1();
-tokenizer = (JapaneseTokenizer) dict.create();
+tokenizer = (JapaneseTokenizer) dict.tokenizer();
plugin = new JoinKatakanaOovPlugin();
plugin.setOovFactory((short) -1);
}
@@ -33,7 +33,7 @@ public void setUp() throws IOException {
Config config = TestDictionary.INSTANCE.user0Cfg()
.characterDefinition(getClass().getClassLoader().getResource("joinnumeric/char.def"));
Dictionary dict = new DictionaryFactory().create(config);
-tokenizer = (JapaneseTokenizer) dict.create();
+tokenizer = (JapaneseTokenizer) dict.tokenizer();

plugin = new JoinNumericPlugin();
plugin.setSettings(Settings.parse("{}", PathAnchor.none()));
10 changes: 5 additions & 5 deletions src/test/java/com/worksap/nlp/sudachi/MorphemeImplTest.kt
@@ -25,7 +25,7 @@ class MorphemeImplTest {
fun useToString() {
val dic = TestDictionary.user0()
// should be split into す/だ/ち, all of them are OOV
-val sudachi = dic.create().tokenize("すだち")
+val sudachi = dic.tokenizer().tokenize("すだち")
// wid of OOV is (0xf, posId)
assertEquals(
"MorphemeImpl{begin=0, end=1, surface=す, pos=4/名詞,普通名詞,一般,*,*,*, wid=(15,4)}",
@@ -36,20 +36,20 @@ class MorphemeImplTest {
fun userdata() {
// system
val sdic = TestDictionary.user0()
-val tokyo = sdic.create().tokenize("東京")
+val tokyo = sdic.tokenizer().tokenize("東京")
assertTrue(tokyo[0].getUserData().isEmpty())

// oov
-val oovs = sdic.create().tokenize("すだち")
+val oovs = sdic.tokenizer().tokenize("すだち")
assertTrue(oovs[0].getUserData().isEmpty())

// user with data
val udic = TestDictionary.user1()
-val sudachi = udic.create().tokenize("すだち")
+val sudachi = udic.tokenizer().tokenize("すだち")
assertEquals("徳島県産", sudachi[0].getUserData())

// user without data
-val piraru = udic.create().tokenize("ぴらる")
+val piraru = udic.tokenizer().tokenize("ぴらる")
assertTrue(piraru[0].getUserData().isEmpty())
}
}
@@ -108,7 +108,7 @@ class OovProviderPluginTest {
val dict = DictionaryFactory().create(cfg) as JapaneseDictionary
val plugin = assertIs<FakeOovProvider>(dict.oovProviderPlugins.last())
assertEquals(8, plugin.posId)
-val tokinzer = dict.create()
+val tokinzer = dict.tokenizer()
val tokens = tokinzer.tokenize("すだちかぼす")
assertEquals("スダチ", tokens[0].partOfSpeech()[5])
assertEquals("カボス", tokens[1].partOfSpeech()[5])
11 changes: 6 additions & 5 deletions src/test/java/com/worksap/nlp/sudachi/PosMatcherTest.kt
@@ -22,11 +22,12 @@ import kotlin.test.*
class PosMatcherTest {

private val dic = DictionaryFactory().create(TestDictionary.user2Cfg()) as JapaneseDictionary
+private val tok = dic.tokenizer()

@Test
fun basic() {
val nouns = dic.posMatcher(PartialPOS("名詞"))
-val morphs = dic.create().tokenize("京都に行った")
+val morphs = tok.tokenize("京都に行った")
assertEquals(4, morphs.size)
assertTrue(nouns.test(morphs[0]))
assertFalse(nouns.test(morphs[1]))
@@ -37,7 +38,7 @@ class PosMatcherTest {
@Test
fun userDic() {
val filter = dic.posMatcher { it[3] == "ミカン科" }
-val morphs = dic.create().tokenize("すだちにかぼす")
+val morphs = tok.tokenize("すだちにかぼす")
assertEquals(3, morphs.size)
assertTrue(filter.test(morphs[0]))
assertFalse(filter.test(morphs[1]))
@@ -49,7 +50,7 @@ class PosMatcherTest {
val f1 = dic.posMatcher { it[5] == "スダチ" }
val f2 = dic.posMatcher { it[5] == "カボス" }
val filter = f1.union(f2)
-val morphs = dic.create().tokenize("すだちにかぼす")
+val morphs = tok.tokenize("すだちにかぼす")
assertEquals(3, morphs.size)
assertTrue(filter.test(morphs[0]))
assertFalse(filter.test(morphs[1]))
@@ -61,7 +62,7 @@ class PosMatcherTest {
val f1 = dic.posMatcher { it[5] == "終止形-一般" }
val f2 = dic.posMatcher { it[0] == "動詞" }
val filter = f1.intersection(f2)
-val morphs = dic.create().tokenize("いった東京行く")
+val morphs = tok.tokenize("いった東京行く")
assertEquals(4, morphs.size)
assertFalse(filter.test(morphs[0]))
assertFalse(filter.test(morphs[1]))
@@ -72,7 +73,7 @@ class PosMatcherTest {
@Test
fun invert() {
val filter = dic.posMatcher { it[3] == "ミカン科" }.invert()
-val morphs = dic.create().tokenize("すだちにかぼす")
+val morphs = tok.tokenize("すだちにかぼす")
assertEquals(3, morphs.size)
assertFalse(filter.test(morphs[0]))
assertTrue(filter.test(morphs[1]))
@@ -34,7 +34,7 @@ class RegexOovProviderTest {
.addList("pos", "名詞", "普通名詞", "一般", "*", "*", "*")
@Suppress("UNCHECKED_CAST") block(cfg, pluginCfg as Config.PluginConf<RegexOovProvider>)
// prepend our OOV configuration to the main configuration
-return DictionaryFactory().create(cfg.withFallback(TestDictionary.user0Cfg())).create()
+return DictionaryFactory().create(cfg.withFallback(TestDictionary.user0Cfg())).tokenizer()
}

@Test
8 changes: 4 additions & 4 deletions src/test/java/com/worksap/nlp/sudachi/UserDictionaryTest.java
@@ -38,7 +38,7 @@ public void fullUserDict() throws IOException {
config.addUserDictionary(instance.getUserDict2());

try (Dictionary dict = new DictionaryFactory().create(config)) {
-Tokenizer tokenizer = dict.create();
+Tokenizer tokenizer = dict.tokenizer();
List<Morpheme> morphs = tokenizer.tokenize("ぴさる");
assertThat(morphs.size(), is(1));
Morpheme m = morphs.get(0);
@@ -62,7 +62,7 @@ public void splitForUserDict() throws IOException {
TestDictionary td = TestDictionary.INSTANCE;
Config config = td.user0Cfg().addUserDictionary(td.getUserDict2()).addUserDictionary(td.getUserDict1());
try (Dictionary dict = new DictionaryFactory().create(config)) {
-Tokenizer tokenizer = dict.create();
+Tokenizer tokenizer = dict.tokenizer();
List<Morpheme> morphs = tokenizer.tokenize("東京府");
assertThat(morphs.size(), is(1));
Morpheme m = morphs.get(0);
@@ -77,7 +77,7 @@ public void userDefinedPos() throws IOException {
public void userDefinedPos() throws IOException {
Config config = TestDictionary.INSTANCE.user2Cfg();
try (Dictionary dict = new DictionaryFactory().create(config)) {
-Tokenizer tokenizer = dict.create();
+Tokenizer tokenizer = dict.tokenizer();
List<Morpheme> morphs = tokenizer.tokenize("すだちかぼす");
assertThat(morphs.size(), is(2));
Morpheme m = morphs.get(0);
@@ -89,7 +89,7 @@ public void userDefinedPos() throws IOException {
TestDictionary td = TestDictionary.INSTANCE;
config = td.user0Cfg().addUserDictionary(td.getUserDict2()).addUserDictionary(td.getUserDict1());
try (Dictionary dict = new DictionaryFactory().create(config)) {
-Tokenizer tokenizer = dict.create();
+Tokenizer tokenizer = dict.tokenizer();
List<Morpheme> morphs = tokenizer.tokenize("すだちかぼす");
assertThat(morphs.size(), is(2));
Morpheme m = morphs.get(0);