Skip to content

Commit

Permalink
add slowLookupAllEntries method
Browse files Browse the repository at this point in the history
  • Loading branch information
mh-northlander committed Dec 3, 2024
1 parent 239d02b commit e5bdab3
Show file tree
Hide file tree
Showing 5 changed files with 81 additions and 15 deletions.
24 changes: 20 additions & 4 deletions src/main/java/com/worksap/nlp/sudachi/Dictionary.java
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ public interface Dictionary extends AutoCloseable {
* Create a parallel stream of all words in the dictionary as morphemes.
*
* Corresponds to the lines in the lexicon csv, i.e. it includes entries that
* appear only when refered from other words (e.g. as constitution) during an
* appear only when referred from other words (e.g. as constitution) during an
* analysis and excludes entries that are automatically added to store a
* normalization form of another word. Entries in the stream are not sorted.
*
Expand All @@ -72,17 +72,33 @@ public interface Dictionary extends AutoCloseable {
/**
* Lookup entries in the dictionary without performing an analysis.
*
* Specified surface will be normalized. This will work like performing analysis
* on the given headword and find paths with a single morpheme, but returns all
* Specified surface will be normalized. This works like performing analysis on
* the given headword and finding paths with a single morpheme, but returns all
* paths instead of the lowest cost one.
*
* @param surface
* to lookup. Will be normalized beforehand.
* surface to lookup. Will be normalized beforehand.
* @return a list of morphemes that match the surface. Their begin/end will be
* 0/length of their headword.
*/
public List<Morpheme> lookup(CharSequence surface);

/**
* Lookup the given surface among all entries in the dictionary.
*
* The specified surface will be normalized before matching. Unlike
* {@link Dictionary#lookup(CharSequence)}, this can also find entries that are
* not indexed and appear only when referred from other words (e.g. as
* constitution), at the cost of being VERY slow.
* {@link Dictionary#lookup(CharSequence)} should be used for most cases.
*
* @param surface
* surface to lookup. Will be normalized beforehand.
* @return a list of morphemes that match the surface. Their begin/end will be
* 0/length of their headword.
* @see Dictionary#lookup(CharSequence)
*/
public List<Morpheme> slowLookupAllEntries(CharSequence surface);

/**
* Create an out-of-vocabulary morpheme from the pos id and string forms.
*
Expand Down
20 changes: 14 additions & 6 deletions src/main/java/com/worksap/nlp/sudachi/JapaneseDictionary.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.stream.Collectors;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
Expand Down Expand Up @@ -174,12 +176,8 @@ public Stream<Morpheme> entries() {

@Override
public List<Morpheme> lookup(CharSequence surface) {
UTF8InputTextBuilder builder = new UTF8InputTextBuilder(surface, grammar);
for (InputTextPlugin plugin : inputTextPlugins) {
plugin.rewrite(builder);
}
UTF8InputText input = builder.build();
byte[] bytes = input.getByteText();
TextNormalizer textNormalizer = textNormalizer();
byte[] bytes = textNormalizer.normalizedInputText(surface).getByteText();

List<Morpheme> morphemes = new ArrayList<>();
WordLookup wordLookup = lexicon.makeLookup();
Expand All @@ -200,6 +198,16 @@ public List<Morpheme> lookup(CharSequence surface) {
return morphemes;
}

/**
 * Scan every entry of the dictionary and keep those whose surface matches the
 * given one after normalization.
 *
 * Both the query and each entry surface are passed through the same
 * {@link TextNormalizer}; two surfaces match when their normalized byte texts
 * are equal.
 *
 * @param surface
 *            surface to lookup. Will be normalized beforehand.
 * @return all entries whose normalized surface equals the normalized query.
 */
@Override
public List<Morpheme> slowLookupAllEntries(CharSequence surface) {
    TextNormalizer normalizer = textNormalizer();
    // Normalize the query once; every entry is compared against these bytes.
    byte[] queryBytes = normalizer.normalizedInputText(surface).getByteText();

    Stream<Morpheme> allEntries = entries();
    Stream<Morpheme> matched = allEntries.filter(entry -> {
        byte[] entryBytes = normalizer.normalizedInputText(entry.surface()).getByteText();
        return Arrays.equals(queryBytes, entryBytes);
    });
    return matched.collect(Collectors.toList());
}

@Override
public Morpheme oovMorpheme(short posId, String surface, String reading, String normalizedForm,
String dictionaryForm) {
Expand Down
17 changes: 14 additions & 3 deletions src/main/java/com/worksap/nlp/sudachi/TextNormalizer.java
Original file line number Diff line number Diff line change
Expand Up @@ -96,13 +96,24 @@ private static List<InputTextPlugin> setupDefaultInputTextPlugins(Grammar gramma
return plugins;
}

/** Normalize given text */
public String normalize(CharSequence text) {
/**
* Build {@link InputText} for the text and apply InputTextPlugins.
*
* @param text
* text to normalize
* @return Normalized text as InputText
*/
/* internal */ InputText normalizedInputText(CharSequence text) {
UTF8InputTextBuilder builder = new UTF8InputTextBuilder(text, grammar);
for (InputTextPlugin plugin : inputTextPlugins) {
plugin.rewrite(builder);
}
UTF8InputText input = builder.build();
return builder.build();
}

/**
 * Normalize the text.
 *
 * @param text
 *            text to normalize
 * @return the text of the InputText produced by
 *         {@link #normalizedInputText(CharSequence)}
 */
public String normalize(CharSequence text) {
    return normalizedInputText(text).getText();
}
}
32 changes: 32 additions & 0 deletions src/test/java/com/worksap/nlp/sudachi/JapaneseDictionaryTest.kt
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,38 @@ abc,1,1,4675,AbC,名詞,普通名詞,一般,*,*,*,エービーシー,,,,,""")
assertEquals("abc", found.get(3).surface())
}

@Test
fun slowLookup() {
    // a surface that matches no entry yields an empty list
    val missing = dict.slowLookupAllEntries("存在しない語")
    assertTrue(missing.isEmpty())

    // entry coming from the system dictionary
    val systemHits = dict.slowLookupAllEntries("東京都")
    assertEquals(1, systemHits.size)
    assertEquals("トウキョウト", systemHits.first().readingForm())

    // entry coming from the user dictionary
    val userDict = TestDictionary.user1()
    val userHits = userDict.slowLookupAllEntries("すだち")
    assertEquals(1, userHits.size)
    assertEquals("徳島県産", userHits.first().getUserData())

    // an entry with -1 conjunction cost CAN be found
    val hiddenHits = dict.slowLookupAllEntries("隠し")
    assertEquals(1, hiddenHits.size)
    assertEquals("隠し", hiddenHits.first().surface())

    // the query surface is normalized before matching
    val normalizedHits = dict.slowLookupAllEntries("特A")
    assertEquals(1, normalizedHits.size)
    assertEquals("特A", normalizedHits.first().normalizedForm())

    // input text plugins are applied to the query
    val yomiganaHits = dict.slowLookupAllEntries("京都(キョウト)")
    assertEquals(1, yomiganaHits.size)
    assertEquals("京都", yomiganaHits.first().normalizedForm())
}

@Test
fun oovMorpheme() {
val m1 = dict.oovMorpheme(1, "OOV")
Expand Down
3 changes: 1 addition & 2 deletions src/test/java/com/worksap/nlp/sudachi/TextNormalizerTest.kt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022 Works Applications Co., Ltd.
* Copyright (c) 2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -54,7 +54,6 @@ class TextNormalizerTest {
// will use default config, which has InputTextPlugins of
// [Default, ProlongedSoundMark, IgnoreYomigana]
val tn = dic.textNormalizer()
print(dic.inputTextPlugins)

assertEquals("âbγд(株)ガヴ⼼ⅲ", tn.normalize("ÂBΓД㈱ガウ゛⼼Ⅲ")) // default
assertEquals("うわーい", tn.normalize("うわーーーい")) // prolonged sound mark
Expand Down

0 comments on commit e5bdab3

Please sign in to comment.