From 69eb3dd1b267c67c41929b10f476f226cac16f9e Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Mon, 18 Nov 2024 14:07:05 +0900 Subject: [PATCH] add CharCategory.loadDefault and more TextNomalizer constructor --- .../worksap/nlp/sudachi/TextNormalizer.java | 69 +++++++++++++++---- .../sudachi/dictionary/CharacterCategory.java | 6 ++ .../sudachi/dictionary/DictionaryPrinter.java | 24 +------ 3 files changed, 65 insertions(+), 34 deletions(-) diff --git a/src/main/java/com/worksap/nlp/sudachi/TextNormalizer.java b/src/main/java/com/worksap/nlp/sudachi/TextNormalizer.java index aeef70cf..01a67025 100644 --- a/src/main/java/com/worksap/nlp/sudachi/TextNormalizer.java +++ b/src/main/java/com/worksap/nlp/sudachi/TextNormalizer.java @@ -17,32 +17,77 @@ package com.worksap.nlp.sudachi; import java.util.List; +import java.io.IOException; +import java.util.ArrayList; +import com.worksap.nlp.sudachi.dictionary.CharacterCategory; import com.worksap.nlp.sudachi.dictionary.Grammar; +import com.worksap.nlp.sudachi.dictionary.GrammarImpl; /** - * Text normalizer that is equivalent to the one applied in the - * JapaneseTokenizer. + * A text normalizer. */ public class TextNormalizer { - Grammar grammar; - List inputTextPlugins; + private final Grammar grammar; + private final List inputTextPlugins; /** - * Create TextNormalizer based on the JapaneseDictionary. + * Create a TextNormalizer from a grammar and input text plugins. + * + * Grammar must have + * {@link com.worksap.nlp.sudachi.dictionary.CharacterCategory}. */ - public TextNormalizer(JapaneseDictionary dictionary) { - this(dictionary.getGrammar(), dictionary.inputTextPlugins); + public TextNormalizer(Grammar grammar, List inputTextPlugins) { + this.grammar = grammar; + this.inputTextPlugins = inputTextPlugins; } /** - * Create TextNormalizer from a grammar and input text plugins. + * Create a TextNormalizer from a grammar. * - * Grammar must have CharCategory. + * Grammar must have a + * {@link com.worksap.nlp.sudachi.dictionary.CharacterCategory}. + * {@link DefaultInputTextPlugin} will be used. */ - public TextNormalizer(Grammar grammar, List inputTextPlugins) { - this.grammar = grammar; - this.inputTextPlugins = inputTextPlugins; + public TextNormalizer(Grammar grammar) throws IOException { + this(grammar, setupDefaultInputTextPlugins(grammar)); + } + + /** + * Create a default TextNormalizer that uses default + * {@link com.worksap.nlp.sudachi.dictionary.CharacterCategory} and + * {@link DefaultInputTextPlugin}. + */ + public static TextNormalizer defaultTextNormalizer() throws IOException { + Grammar grammar = new GrammarImpl(); + grammar.setCharacterCategory(CharacterCategory.loadDefault()); + return new TextNormalizer(grammar); + } + + /** + * Create TextNormalizer based on the {@link JapaneseDictionary}. + */ + public static TextNormalizer fromDictionary(JapaneseDictionary dictionary) { + return new TextNormalizer(dictionary.getGrammar(), dictionary.inputTextPlugins); + } + + /** + * Setup {@link DefaultInputTextPlugin} using a grammar. + */ + private static List setupDefaultInputTextPlugins(Grammar grammar) throws IOException { + PathAnchor anchor = PathAnchor.classpath(); + List> pconfs = Config.fromJsonString( + "{\"inputTextPlugin\":[{\"class\":\"com.worksap.nlp.sudachi.DefaultInputTextPlugin\"}]}", anchor) + .getInputTextPlugins(); + + List plugins = new ArrayList<>(); + for (Config.PluginConf pconf : pconfs) { + InputTextPlugin p = pconf.instantiate(anchor); + p.setUp(grammar); + plugins.add(p); + } + + return plugins; } /** Normalize given text */ diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/CharacterCategory.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/CharacterCategory.java index 92f8a5bd..46ccd7fd 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/CharacterCategory.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/CharacterCategory.java @@ -17,6 +17,7 @@ package com.worksap.nlp.sudachi.dictionary; import com.worksap.nlp.sudachi.Config; +import com.worksap.nlp.sudachi.PathAnchor; import java.io.*; import java.nio.charset.StandardCharsets; @@ -157,4 +158,9 @@ public static CharacterCategory load(Config.Resource resource return result; }); } + + public static CharacterCategory loadDefault() throws IOException { + Config.Resource defaultResource = PathAnchor.classpath().resource("char.def"); + return load(defaultResource); + } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java index 1a4e7ae3..06db1d59 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java @@ -26,9 +26,6 @@ import com.worksap.nlp.sudachi.WordId; import com.worksap.nlp.sudachi.TextNormalizer; -import com.worksap.nlp.sudachi.Config; -import com.worksap.nlp.sudachi.InputTextPlugin; -import com.worksap.nlp.sudachi.PathAnchor; import com.worksap.nlp.sudachi.SudachiCommandLine.FileOrStdoutPrintStream; public class DictionaryPrinter { @@ -63,8 +60,8 @@ public class DictionaryPrinter { } // set default char category for text normalizer - grammar.setCharacterCategory(CharacterCategory.load(PathAnchor.classpath().resource("char.def"))); - textNormalizer = setupTextNormalizer(grammar); + grammar.setCharacterCategory(CharacterCategory.loadDefault()); + textNormalizer = new TextNormalizer(grammar); List poss = new ArrayList<>(); for (short pid = 0; pid < grammar.getPartOfSpeechSize(); pid++) { @@ -75,23 +72,6 @@ public class DictionaryPrinter { this.entrySize = dic.getLexicon().size(); } - /** Setup TextNormalizer with given grammar and DefaultInputTextPlugin. */ - private TextNormalizer setupTextNormalizer(Grammar grammar) throws IOException { - PathAnchor anchor = PathAnchor.classpath(); - List> pconfs = Config.fromJsonString( - "{\"inputTextPlugin\":[{\"class\":\"com.worksap.nlp.sudachi.DefaultInputTextPlugin\"}]}", anchor) - .getInputTextPlugins(); - - List plugins = new ArrayList<>(); - for (Config.PluginConf pconf : pconfs) { - InputTextPlugin p = pconf.instantiate(anchor); - p.setUp(grammar); - plugins.add(p); - } - - return new TextNormalizer(grammar, plugins); - } - private static void printUsage() { Console console = System.console(); console.printf("usage: PrintDictionary [-o file] [-s file] file\n");