Skip to content

Commit

Permalink
Merge pull request #246 from WorksApplications/feature/243-rename-tokenizer-create
Browse files Browse the repository at this point in the history

Rename `Dictionary.create` to `Dictionary.tokenizer`
  • Loading branch information
mh-northlander authored Nov 26, 2024
2 parents dcb86fd + 3aedcef commit b5d8753
Show file tree
Hide file tree
Showing 14 changed files with 50 additions and 28 deletions.
10 changes: 10 additions & 0 deletions src/main/java/com/worksap/nlp/sudachi/Dictionary.java
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,16 @@ public interface Dictionary extends AutoCloseable {
*
* @return a tokenizer
*/
public Tokenizer tokenizer();

/**
 * Creates a tokenizer instance.
 *
 * @return a tokenizer
 *
 * @deprecated renamed to {@link #tokenizer()}
 */
@Deprecated
public Tokenizer create();

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ public void close() throws IOException {
}

@Override
public Tokenizer create() {
public Tokenizer tokenizer() {
if (grammar == null || lexicon == null) {
throw new IllegalStateException("trying to use closed dictionary");
}
Expand All @@ -140,6 +140,11 @@ public Tokenizer create() {
return tokenizer;
}

/**
 * Creates a tokenizer instance.
 *
 * @return a tokenizer
 * @deprecated renamed to {@link #tokenizer()}; this method only delegates to it.
 */
// @Deprecated added explicitly: Java does not inherit the annotation from the
// overridden interface method, so without it callers of this concrete type
// would get no deprecation warning.
@Deprecated
@Override
public Tokenizer create() {
    return tokenizer();
}

@Override
public int getPartOfSpeechSize() {
return grammar.getPartOfSpeechSize();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -291,7 +291,7 @@ public static void main(String[] args) throws IOException {
try (PrintStream output = outputFileName == null ? new FileOrStdoutPrintStream()
: new FileOrStdoutPrintStream(outputFileName);
Dictionary dict = new DictionaryFactory().create(config)) {
Tokenizer tokenizer = dict.create();
Tokenizer tokenizer = dict.tokenizer();
if (isEnableDump) {
tokenizer.setDumpOutput(output);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,16 @@ public void tearDown() throws IOException {
}

// Verifies the deprecated Dictionary.create() entry point still yields a
// Tokenizer. The test method is itself marked @Deprecated so that javac does
// not emit a deprecation warning for the create() call (deprecated code may
// reference deprecated members without warning).
@Test
@Deprecated
public void create() {
assertThat(dict.create(), isA(Tokenizer.class));
}

// Verifies the replacement API, Dictionary.tokenizer(), yields a Tokenizer.
@Test
public void createTokenizer() {
assertThat(dict.tokenizer(), isA(Tokenizer.class));
}

@Test
public void getPartOfSpeechSize() {
assertThat(dict.getPartOfSpeechSize(), is(8));
Expand All @@ -64,7 +70,7 @@ public void instantiateConfigWithoutCharDef() throws IOException {
cfg.systemDictionary(TestDictionary.INSTANCE.getSystemDict());
try (JapaneseDictionary jd = (JapaneseDictionary) new DictionaryFactory().create(cfg)) {
assertThat(jd, notNullValue());
assertThat(jd.create(), notNullValue());
assertThat(jd.tokenizer(), notNullValue());
}
}

Expand All @@ -79,14 +85,14 @@ private JapaneseDictionary makeDictionaryIncorrectly() throws IOException {
// makeDictionaryIncorrectly() returns a dictionary that has already been
// closed; requesting a tokenizer from it must raise IllegalStateException
// rather than fail later in some undefined way.
@Test(expected = IllegalStateException.class)
public void throwExceptionOnDictionaryUsageAfterClose() throws IOException {
JapaneseDictionary dic = makeDictionaryIncorrectly();
Tokenizer ignored = dic.tokenizer();
}

// Deliberately broken fixture: the tokenizer is created inside
// try-with-resources and escapes the block, so by the time the caller
// receives it the backing dictionary has been closed. Used to test
// use-after-close error handling.
private Tokenizer makeTokenizerIncorrectly() throws IOException {
Config cfg = Config.fromClasspath("sudachi_minimum.json");
cfg.systemDictionary(TestDictionary.INSTANCE.getSystemDict());
try (JapaneseDictionary jd = (JapaneseDictionary) new DictionaryFactory().create(cfg)) {
return jd.tokenizer();
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ class JapaneseTokenizerMaskTest {
cfg0.addOovProviderPlugin(SimpleOovProviderPlugin::class.java)
val cfg = cfg0.withFallback(TestDictionary.user0Cfg())
val dic = DictionaryFactory().create(cfg) as JapaneseDictionary
val tokenizer = dic.create()
val tokenizer = dic.tokenizer()

assertEquals(2, dic.oovProviderPlugins.size)
assertIs<CaptureOtherWords>(dic.oovProviderPlugins[0])
Expand All @@ -62,7 +62,7 @@ class JapaneseTokenizerMaskTest {
val cfg = TestDictionary.user0Cfg()
cfg.addOovProviderPlugin(CaptureOtherWords::class.java)
val dic = DictionaryFactory().create(cfg) as JapaneseDictionary
val tokenizer = dic.create()
val tokenizer = dic.tokenizer()

assertIs<SimpleOovProviderPlugin>(dic.oovProviderPlugins[0])
assertIs<CaptureOtherWords>(dic.oovProviderPlugins[1])
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ import kotlin.test.assertEquals
import kotlin.test.assertFailsWith

class JapaneseTokenizerStreamingTest {
private val tokenizer = TestDictionary.user0().create()
private val tokenizer = TestDictionary.user0().tokenizer()

class BadReader(private val data: String, private val window: Int = 512) : Reader() {

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ public class JapaneseTokenizerTest {
@Before
public void setUp() {
dict = TestDictionary.INSTANCE.user1();
tokenizer = (JapaneseTokenizer) dict.create();
tokenizer = (JapaneseTokenizer) dict.tokenizer();
}

private static Matcher<Morpheme> morpheme(String surface, int begin, int end) {
Expand Down Expand Up @@ -353,7 +353,7 @@ public void zeroLengthMorpheme() {
public void disableEmptyMorpheme() throws IOException {
Config config = TestDictionary.INSTANCE.user1Cfg();
dict = new DictionaryFactory().create(Config.empty().withFallback(config).allowEmptyMorpheme(false));
tokenizer = (JapaneseTokenizer) dict.create();
tokenizer = (JapaneseTokenizer) dict.tokenizer();

List<Morpheme> s = tokenizer.tokenize("…");
assertThat(s.size(), is(3));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ public class JoinKatakanaOovPluginTest {
@Before
public void setUp() throws IOException {
Dictionary dict = TestDictionary.INSTANCE.user1();
tokenizer = (JapaneseTokenizer) dict.create();
tokenizer = (JapaneseTokenizer) dict.tokenizer();
plugin = new JoinKatakanaOovPlugin();
plugin.setOovFactory((short) -1);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ public void setUp() throws IOException {
Config config = TestDictionary.INSTANCE.user0Cfg()
.characterDefinition(getClass().getClassLoader().getResource("joinnumeric/char.def"));
Dictionary dict = new DictionaryFactory().create(config);
tokenizer = (JapaneseTokenizer) dict.create();
tokenizer = (JapaneseTokenizer) dict.tokenizer();

plugin = new JoinNumericPlugin();
plugin.setSettings(Settings.parse("{}", PathAnchor.none()));
Expand Down
10 changes: 5 additions & 5 deletions src/test/java/com/worksap/nlp/sudachi/MorphemeImplTest.kt
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ class MorphemeImplTest {
fun useToString() {
val dic = TestDictionary.user0()
// should be split into す/だ/ち, all of them are OOV
val sudachi = dic.create().tokenize("すだち")
val sudachi = dic.tokenizer().tokenize("すだち")
// wid of OOV is (0xf, posId)
assertEquals(
"MorphemeImpl{begin=0, end=1, surface=す, pos=4/名詞,普通名詞,一般,*,*,*, wid=(15,4)}",
Expand All @@ -36,20 +36,20 @@ class MorphemeImplTest {
fun userdata() {
// A word from the system dictionary carries no user data.
val systemDic = TestDictionary.user0()
assertTrue(systemDic.tokenizer().tokenize("東京")[0].getUserData().isEmpty())

// An OOV morpheme carries no user data either.
assertTrue(systemDic.tokenizer().tokenize("すだち")[0].getUserData().isEmpty())

// A user-dictionary entry that defines user data exposes it verbatim.
val userDic = TestDictionary.user1()
assertEquals("徳島県産", userDic.tokenizer().tokenize("すだち")[0].getUserData())

// A user-dictionary entry without user data reports it as empty.
assertTrue(userDic.tokenizer().tokenize("ぴらる")[0].getUserData().isEmpty())
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ class OovProviderPluginTest {
val dict = DictionaryFactory().create(cfg) as JapaneseDictionary
val plugin = assertIs<FakeOovProvider>(dict.oovProviderPlugins.last())
assertEquals(8, plugin.posId)
val tokinzer = dict.create()
val tokinzer = dict.tokenizer()
val tokens = tokinzer.tokenize("すだちかぼす")
assertEquals("スダチ", tokens[0].partOfSpeech()[5])
assertEquals("カボス", tokens[1].partOfSpeech()[5])
Expand Down
11 changes: 6 additions & 5 deletions src/test/java/com/worksap/nlp/sudachi/PosMatcherTest.kt
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,12 @@ import kotlin.test.*
class PosMatcherTest {

private val dic = DictionaryFactory().create(TestDictionary.user2Cfg()) as JapaneseDictionary
private val tok = dic.tokenizer()

@Test
fun basic() {
val nouns = dic.posMatcher(PartialPOS("名詞"))
val morphs = dic.create().tokenize("京都に行った")
val morphs = tok.tokenize("京都に行った")
assertEquals(4, morphs.size)
assertTrue(nouns.test(morphs[0]))
assertFalse(nouns.test(morphs[1]))
Expand All @@ -37,7 +38,7 @@ class PosMatcherTest {
@Test
fun userDic() {
val filter = dic.posMatcher { it[3] == "ミカン科" }
val morphs = dic.create().tokenize("すだちにかぼす")
val morphs = tok.tokenize("すだちにかぼす")
assertEquals(3, morphs.size)
assertTrue(filter.test(morphs[0]))
assertFalse(filter.test(morphs[1]))
Expand All @@ -49,7 +50,7 @@ class PosMatcherTest {
val f1 = dic.posMatcher { it[5] == "スダチ" }
val f2 = dic.posMatcher { it[5] == "カボス" }
val filter = f1.union(f2)
val morphs = dic.create().tokenize("すだちにかぼす")
val morphs = tok.tokenize("すだちにかぼす")
assertEquals(3, morphs.size)
assertTrue(filter.test(morphs[0]))
assertFalse(filter.test(morphs[1]))
Expand All @@ -61,7 +62,7 @@ class PosMatcherTest {
val f1 = dic.posMatcher { it[5] == "終止形-一般" }
val f2 = dic.posMatcher { it[0] == "動詞" }
val filter = f1.intersection(f2)
val morphs = dic.create().tokenize("いった東京行く")
val morphs = tok.tokenize("いった東京行く")
assertEquals(4, morphs.size)
assertFalse(filter.test(morphs[0]))
assertFalse(filter.test(morphs[1]))
Expand All @@ -72,7 +73,7 @@ class PosMatcherTest {
@Test
fun invert() {
val filter = dic.posMatcher { it[3] == "ミカン科" }.invert()
val morphs = dic.create().tokenize("すだちにかぼす")
val morphs = tok.tokenize("すだちにかぼす")
assertEquals(3, morphs.size)
assertFalse(filter.test(morphs[0]))
assertTrue(filter.test(morphs[1]))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ class RegexOovProviderTest {
.addList("pos", "名詞", "普通名詞", "一般", "*", "*", "*")
@Suppress("UNCHECKED_CAST") block(cfg, pluginCfg as Config.PluginConf<RegexOovProvider>)
// prepend our OOV configuration to the main configuration
return DictionaryFactory().create(cfg.withFallback(TestDictionary.user0Cfg())).create()
return DictionaryFactory().create(cfg.withFallback(TestDictionary.user0Cfg())).tokenizer()
}

@Test
Expand Down
8 changes: 4 additions & 4 deletions src/test/java/com/worksap/nlp/sudachi/UserDictionaryTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ public void fullUserDict() throws IOException {
config.addUserDictionary(instance.getUserDict2());

try (Dictionary dict = new DictionaryFactory().create(config)) {
Tokenizer tokenizer = dict.create();
Tokenizer tokenizer = dict.tokenizer();
List<Morpheme> morphs = tokenizer.tokenize("ぴさる");
assertThat(morphs.size(), is(1));
Morpheme m = morphs.get(0);
Expand All @@ -62,7 +62,7 @@ public void splitForUserDict() throws IOException {
TestDictionary td = TestDictionary.INSTANCE;
Config config = td.user0Cfg().addUserDictionary(td.getUserDict2()).addUserDictionary(td.getUserDict1());
try (Dictionary dict = new DictionaryFactory().create(config)) {
Tokenizer tokenizer = dict.create();
Tokenizer tokenizer = dict.tokenizer();
List<Morpheme> morphs = tokenizer.tokenize("東京府");
assertThat(morphs.size(), is(1));
Morpheme m = morphs.get(0);
Expand All @@ -77,7 +77,7 @@ public void splitForUserDict() throws IOException {
public void userDefinedPos() throws IOException {
Config config = TestDictionary.INSTANCE.user2Cfg();
try (Dictionary dict = new DictionaryFactory().create(config)) {
Tokenizer tokenizer = dict.create();
Tokenizer tokenizer = dict.tokenizer();
List<Morpheme> morphs = tokenizer.tokenize("すだちかぼす");
assertThat(morphs.size(), is(2));
Morpheme m = morphs.get(0);
Expand All @@ -89,7 +89,7 @@ public void userDefinedPos() throws IOException {
TestDictionary td = TestDictionary.INSTANCE;
config = td.user0Cfg().addUserDictionary(td.getUserDict2()).addUserDictionary(td.getUserDict1());
try (Dictionary dict = new DictionaryFactory().create(config)) {
Tokenizer tokenizer = dict.create();
Tokenizer tokenizer = dict.tokenizer();
List<Morpheme> morphs = tokenizer.tokenize("すだちかぼす");
assertThat(morphs.size(), is(2));
Morpheme m = morphs.get(0);
Expand Down

0 comments on commit b5d8753

Please sign in to comment.