Add CharacterCategory.loadDefault and more TextNormalizer constructors

WorksApplications · Nov 18, 2024 · 69eb3dd · 69eb3dd
1 parent 1419ad1
commit 69eb3dd
Show file tree

Hide file tree

Showing 3 changed files with 65 additions and 34 deletions.
diff --git a/src/main/java/com/worksap/nlp/sudachi/TextNormalizer.java b/src/main/java/com/worksap/nlp/sudachi/TextNormalizer.java
@@ -17,32 +17,77 @@
 package com.worksap.nlp.sudachi;
 
 import java.util.List;
+import java.io.IOException;
+import java.util.ArrayList;
 
+import com.worksap.nlp.sudachi.dictionary.CharacterCategory;
 import com.worksap.nlp.sudachi.dictionary.Grammar;
+import com.worksap.nlp.sudachi.dictionary.GrammarImpl;
 
 /**
- * Text normalizer that is equivalent to the one applied in the
- * JapaneseTokenizer.
+ * A text normalizer.
  */
 public class TextNormalizer {
-    Grammar grammar;
-    List<InputTextPlugin> inputTextPlugins;
+    private final Grammar grammar;
+    private final List<InputTextPlugin> inputTextPlugins;
 
     /**
-     * Create TextNormalizer based on the JapaneseDictionary.
+     * Create a TextNormalizer from a grammar and input text plugins.
+     * 
+     * Grammar must have a
+     * {@link com.worksap.nlp.sudachi.dictionary.CharacterCategory}.
      */
-    public TextNormalizer(JapaneseDictionary dictionary) {
-        this(dictionary.getGrammar(), dictionary.inputTextPlugins);
+    public TextNormalizer(Grammar grammar, List<InputTextPlugin> inputTextPlugins) {
+        this.grammar = grammar;
+        this.inputTextPlugins = inputTextPlugins;
     }
 
     /**
-     * Create TextNormalizer from a grammar and input text plugins.
+     * Create a TextNormalizer from a grammar.
      * 
-     * Grammar must have CharCategory.
+     * Grammar must have a
+     * {@link com.worksap.nlp.sudachi.dictionary.CharacterCategory}.
+     * {@link DefaultInputTextPlugin} will be used.
      */
-    public TextNormalizer(Grammar grammar, List<InputTextPlugin> inputTextPlugins) {
-        this.grammar = grammar;
-        this.inputTextPlugins = inputTextPlugins;
+    public TextNormalizer(Grammar grammar) throws IOException {
+        this(grammar, setupDefaultInputTextPlugins(grammar));
+    }
+
+    /**
+     * Create a default TextNormalizer that uses the default
+     * {@link com.worksap.nlp.sudachi.dictionary.CharacterCategory} and
+     * {@link DefaultInputTextPlugin}.
+     */
+    public static TextNormalizer defaultTextNormalizer() throws IOException {
+        Grammar grammar = new GrammarImpl();
+        grammar.setCharacterCategory(CharacterCategory.loadDefault());
+        return new TextNormalizer(grammar);
+    }
+
+    /**
+     * Create TextNormalizer based on the {@link JapaneseDictionary}.
+     */
+    public static TextNormalizer fromDictionary(JapaneseDictionary dictionary) {
+        return new TextNormalizer(dictionary.getGrammar(), dictionary.inputTextPlugins);
+    }
+
+    /**
+     * Set up {@link DefaultInputTextPlugin} using a grammar.
+     */
+    private static List<InputTextPlugin> setupDefaultInputTextPlugins(Grammar grammar) throws IOException {
+        PathAnchor anchor = PathAnchor.classpath();
+        List<Config.PluginConf<InputTextPlugin>> pconfs = Config.fromJsonString(
+                "{\"inputTextPlugin\":[{\"class\":\"com.worksap.nlp.sudachi.DefaultInputTextPlugin\"}]}", anchor)
+                .getInputTextPlugins();
+
+        List<InputTextPlugin> plugins = new ArrayList<>();
+        for (Config.PluginConf<InputTextPlugin> pconf : pconfs) {
+            InputTextPlugin p = pconf.instantiate(anchor);
+            p.setUp(grammar);
+            plugins.add(p);
+        }
+
+        return plugins;
     }
 
     /** Normalize given text */

diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/CharacterCategory.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/CharacterCategory.java
@@ -17,6 +17,7 @@
 package com.worksap.nlp.sudachi.dictionary;
 
 import com.worksap.nlp.sudachi.Config;
+import com.worksap.nlp.sudachi.PathAnchor;
 
 import java.io.*;
 import java.nio.charset.StandardCharsets;
@@ -157,4 +158,9 @@ public static CharacterCategory load(Config.Resource<CharacterCategory> resource
             return result;
         });
     }
+
+    public static CharacterCategory loadDefault() throws IOException {
+        Config.Resource<CharacterCategory> defaultResource = PathAnchor.classpath().resource("char.def");
+        return load(defaultResource);
+    }
 }
diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java
@@ -26,9 +26,6 @@
 
 import com.worksap.nlp.sudachi.WordId;
 import com.worksap.nlp.sudachi.TextNormalizer;
-import com.worksap.nlp.sudachi.Config;
-import com.worksap.nlp.sudachi.InputTextPlugin;
-import com.worksap.nlp.sudachi.PathAnchor;
 import com.worksap.nlp.sudachi.SudachiCommandLine.FileOrStdoutPrintStream;
 
 public class DictionaryPrinter {
@@ -63,8 +60,8 @@ public class DictionaryPrinter {
         }
 
         // set default char category for text normalizer
-        grammar.setCharacterCategory(CharacterCategory.load(PathAnchor.classpath().resource("char.def")));
-        textNormalizer = setupTextNormalizer(grammar);
+        grammar.setCharacterCategory(CharacterCategory.loadDefault());
+        textNormalizer = new TextNormalizer(grammar);
 
         List<String> poss = new ArrayList<>();
         for (short pid = 0; pid < grammar.getPartOfSpeechSize(); pid++) {
@@ -75,23 +72,6 @@ public class DictionaryPrinter {
         this.entrySize = dic.getLexicon().size();
     }
 
-    /** Setup TextNormalizer with given grammar and DefaultInputTextPlugin. */
-    private TextNormalizer setupTextNormalizer(Grammar grammar) throws IOException {
-        PathAnchor anchor = PathAnchor.classpath();
-        List<Config.PluginConf<InputTextPlugin>> pconfs = Config.fromJsonString(
-                "{\"inputTextPlugin\":[{\"class\":\"com.worksap.nlp.sudachi.DefaultInputTextPlugin\"}]}", anchor)
-                .getInputTextPlugins();
-
-        List<InputTextPlugin> plugins = new ArrayList<>();
-        for (Config.PluginConf<InputTextPlugin> pconf : pconfs) {
-            InputTextPlugin p = pconf.instantiate(anchor);
-            p.setUp(grammar);
-            plugins.add(p);
-        }
-
-        return new TextNormalizer(grammar, plugins);
-    }
-
     private static void printUsage() {
         Console console = System.console();
         console.printf("usage: PrintDictionary [-o file] [-s file] file\n");