add slowLookupAllEntries method

WorksApplications · Dec 3, 2024 · e5bdab3 · e5bdab3
1 parent 239d02b
commit e5bdab3
Show file tree

Hide file tree

Showing 5 changed files with 81 additions and 15 deletions.
diff --git a/src/main/java/com/worksap/nlp/sudachi/Dictionary.java b/src/main/java/com/worksap/nlp/sudachi/Dictionary.java
@@ -61,7 +61,7 @@ public interface Dictionary extends AutoCloseable {
      * Create a parallel stream of all words in the dictionary as morphemes.
      *
      * Corresponds to the lines in the lexicon csv, i.e. it includes entries that
-     * appear only when refered from other words (e.g. as constitution) during an
+     * appear only when referred from other words (e.g. as constitution) during an
      * analysis and excludes entries that automatically added to store a
      * normalization form of another word. Entries in the stream are not sorted.
      *
@@ -72,17 +72,33 @@ public interface Dictionary extends AutoCloseable {
     /**
      * Lookup entries in the dictionary without performing an analysis.
      * 
-     * Specified surface will be normalized. This will work like performing analysis
-     * on the given headword and find paths with a single morpheme, but returns all
+     * Specified surface will be normalized. This works like performing analysis on
+     * the given headword and finding paths with a single morpheme, but returns all
      * paths instead of the lowest cost one.
      * 
      * @param surface
-     *            to lookup. Will be normalized beforehand.
+     *            surface to lookup. Will be normalized beforehand.
      * @return a list of morphemes that match the surface. Their begin/end will be
      *         0/length of their headword.
      */
     public List<Morpheme> lookup(CharSequence surface);
 
+    /**
+     * Lookup from all entries in the dictionary.
+     * 
+     * Specified surface will be normalized. This can find entries that are not
+     * indexed and appear only when referred from other words (e.g. constitution),
+     * but it is VERY slow. {@link Dictionary#lookup(CharSequence)} should be
+     * used for most cases.
+     * 
+     * @param surface
+     *            surface to lookup. Will be normalized beforehand.
+     * @return a list of morphemes that match the surface. Their begin/end will be
+     *         0/length of their headword.
+     * @see Dictionary#lookup(CharSequence)
+     */
+    public List<Morpheme> slowLookupAllEntries(CharSequence surface);
+
     /**
      * Create an out-of-vocabulary morpheme from the pos id and string forms.
      * 

diff --git a/src/main/java/com/worksap/nlp/sudachi/JapaneseDictionary.java b/src/main/java/com/worksap/nlp/sudachi/JapaneseDictionary.java
@@ -24,6 +24,8 @@
 import java.io.InputStreamReader;
 import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.stream.Collectors;
 import java.util.Collections;
 import java.util.Iterator;
 import java.util.List;
@@ -174,12 +176,8 @@ public Stream<Morpheme> entries() {
 
     @Override
     public List<Morpheme> lookup(CharSequence surface) {
-        UTF8InputTextBuilder builder = new UTF8InputTextBuilder(surface, grammar);
-        for (InputTextPlugin plugin : inputTextPlugins) {
-            plugin.rewrite(builder);
-        }
-        UTF8InputText input = builder.build();
-        byte[] bytes = input.getByteText();
+        TextNormalizer textNormalizer = textNormalizer();
+        byte[] bytes = textNormalizer.normalizedInputText(surface).getByteText();
 
         List<Morpheme> morphemes = new ArrayList<>();
         WordLookup wordLookup = lexicon.makeLookup();
@@ -200,6 +198,16 @@ public List<Morpheme> lookup(CharSequence surface) {
         return morphemes;
     }
 
+    @Override
+    public List<Morpheme> slowLookupAllEntries(CharSequence surface) {
+        TextNormalizer textNormalizer = textNormalizer();
+        byte[] bytes = textNormalizer.normalizedInputText(surface).getByteText();
+
+        return entries()
+                .filter(m -> Arrays.equals(bytes, textNormalizer.normalizedInputText(m.surface()).getByteText()))
+                .collect(Collectors.toList());
+    }
+
     @Override
     public Morpheme oovMorpheme(short posId, String surface, String reading, String normalizedForm,
             String dictionaryForm) {

diff --git a/src/main/java/com/worksap/nlp/sudachi/TextNormalizer.java b/src/main/java/com/worksap/nlp/sudachi/TextNormalizer.java
@@ -96,13 +96,24 @@ private static List<InputTextPlugin> setupDefaultInputTextPlugins(Grammar gramma
         return plugins;
     }
 
-    /** Normalize given text */
-    public String normalize(CharSequence text) {
+    /**
+     * Build {@link InputText} for the text and apply InputTextPlugins.
+     * 
+     * @param text
+     *            text to normalize
+     * @return Normalized text as InputText
+     */
+    /* internal */ InputText normalizedInputText(CharSequence text) {
         UTF8InputTextBuilder builder = new UTF8InputTextBuilder(text, grammar);
         for (InputTextPlugin plugin : inputTextPlugins) {
             plugin.rewrite(builder);
         }
-        UTF8InputText input = builder.build();
+        return builder.build();
+    }
+
+    /** Normalize the text */
+    public String normalize(CharSequence text) {
+        InputText input = normalizedInputText(text);
         return input.getText();
     }
 }
diff --git a/src/test/java/com/worksap/nlp/sudachi/JapaneseDictionaryTest.kt b/src/test/java/com/worksap/nlp/sudachi/JapaneseDictionaryTest.kt
@@ -201,6 +201,38 @@ abc,1,1,4675,AbC,名詞,普通名詞,一般,*,*,*,エービーシー,,,,,""")
     assertEquals("abc", found.get(3).surface())
   }
 
+  @Test
+  fun slowLookup() {
+    // nothing
+    val nothing = dict.slowLookupAllEntries("存在しない語")
+    assertTrue(nothing.isEmpty())
+
+    // system
+    val tokyo = dict.slowLookupAllEntries("東京都")
+    assertEquals(1, tokyo.size)
+    assertEquals("トウキョウト", tokyo[0].readingForm())
+
+    // user
+    val sudachi = TestDictionary.user1().slowLookupAllEntries("すだち")
+    assertEquals(1, sudachi.size)
+    assertEquals("徳島県産", sudachi[0].getUserData())
+
+    // CAN find entry with -1 conjunction cost
+    val hidden = dict.slowLookupAllEntries("隠し")
+    assertEquals(1, hidden.size)
+    assertEquals("隠し", hidden[0].surface())
+
+    // will be normalized
+    val norm = dict.slowLookupAllEntries("特A")
+    assertEquals(1, norm.size)
+    assertEquals("特A", norm[0].normalizedForm())
+
+    // inputTextPlugin
+    val yomi = dict.slowLookupAllEntries("京都（キョウト）")
+    assertEquals(1, yomi.size)
+    assertEquals("京都", yomi[0].normalizedForm())
+  }
+
   @Test
   fun oovMorpheme() {
     val m1 = dict.oovMorpheme(1, "OOV")

diff --git a/src/test/java/com/worksap/nlp/sudachi/TextNormalizerTest.kt b/src/test/java/com/worksap/nlp/sudachi/TextNormalizerTest.kt
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Works Applications Co., Ltd.
+ * Copyright (c) 2024 Works Applications Co., Ltd.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -54,7 +54,6 @@ class TextNormalizerTest {
     // will use default config, which has InputTextPlugins of
     // [Default, ProlongedSoundMark, IgnoreYomigana]
     val tn = dic.textNormalizer()
-    print(dic.inputTextPlugins)
 
     assertEquals("âbγд(株)ガヴ⼼ⅲ", tn.normalize("ÂＢΓД㈱ｶﾞウ゛⼼Ⅲ")) // default
     assertEquals("うわーい", tn.normalize("うわーーーい")) // prolonged sound mark