Skip to content

Commit

Permalink
Add CharacterCategory.loadDefault and more TextNormalizer constructors
Browse files Browse the repository at this point in the history
  • Loading branch information
mh-northlander committed Nov 18, 2024
1 parent 1419ad1 commit 69eb3dd
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 34 deletions.
69 changes: 57 additions & 12 deletions src/main/java/com/worksap/nlp/sudachi/TextNormalizer.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,32 +17,77 @@
package com.worksap.nlp.sudachi;

import java.util.List;
import java.io.IOException;
import java.util.ArrayList;

import com.worksap.nlp.sudachi.dictionary.CharacterCategory;
import com.worksap.nlp.sudachi.dictionary.Grammar;
import com.worksap.nlp.sudachi.dictionary.GrammarImpl;

/**
* Text normalizer that is equivalent to the one applied in the
* JapaneseTokenizer.
* A text normalizer.
*/
public class TextNormalizer {
Grammar grammar;
List<InputTextPlugin> inputTextPlugins;
private final Grammar grammar;
private final List<InputTextPlugin> inputTextPlugins;

/**
* Create TextNormalizer based on the JapaneseDictionary.
* Create a TextNormalizer from a grammar and input text plugins.
*
* Grammar must have a
* {@link com.worksap.nlp.sudachi.dictionary.CharacterCategory}.
*/
public TextNormalizer(JapaneseDictionary dictionary) {
this(dictionary.getGrammar(), dictionary.inputTextPlugins);
public TextNormalizer(Grammar grammar, List<InputTextPlugin> inputTextPlugins) {
    // Both references are stored exactly as given; the plugin list is not copied.
    this.inputTextPlugins = inputTextPlugins;
    this.grammar = grammar;
}

/**
* Create TextNormalizer from a grammar and input text plugins.
* Create a TextNormalizer from a grammar.
*
* Grammar must have CharCategory.
* Grammar must have a
* {@link com.worksap.nlp.sudachi.dictionary.CharacterCategory}.
* {@link DefaultInputTextPlugin} will be used.
*/
public TextNormalizer(Grammar grammar, List<InputTextPlugin> inputTextPlugins) {
this.grammar = grammar;
this.inputTextPlugins = inputTextPlugins;
public TextNormalizer(Grammar grammar) throws IOException {
// Delegate to the (grammar, plugins) constructor, passing the plugin list
// that setupDefaultInputTextPlugins builds and sets up for this grammar.
this(grammar, setupDefaultInputTextPlugins(grammar));
}

/**
 * Build a TextNormalizer without taking a dictionary.
 *
 * A new {@link GrammarImpl} receives the character category returned by
 * {@link com.worksap.nlp.sudachi.dictionary.CharacterCategory#loadDefault()},
 * and the normalizer is then created through the grammar-only constructor,
 * which uses {@link DefaultInputTextPlugin}.
 *
 * @return a new TextNormalizer backed by the freshly created grammar
 * @throws IOException
 *             propagated from loading the default character category or from
 *             the grammar-only constructor
 */
public static TextNormalizer defaultTextNormalizer() throws IOException {
    Grammar defaultGrammar = new GrammarImpl();
    CharacterCategory defaultCategory = CharacterCategory.loadDefault();
    defaultGrammar.setCharacterCategory(defaultCategory);
    return new TextNormalizer(defaultGrammar);
}

/**
 * Create a TextNormalizer that uses the grammar and the input text plugins
 * of the given {@link JapaneseDictionary}.
 *
 * @param dictionary
 *            the dictionary whose grammar and input text plugins are passed
 *            to the new normalizer
 * @return a new TextNormalizer
 */
public static TextNormalizer fromDictionary(JapaneseDictionary dictionary) {
    Grammar dictionaryGrammar = dictionary.getGrammar();
    List<InputTextPlugin> dictionaryPlugins = dictionary.inputTextPlugins;
    return new TextNormalizer(dictionaryGrammar, dictionaryPlugins);
}

/**
 * Instantiate the input text plugins described by a one-entry JSON
 * configuration naming {@link DefaultInputTextPlugin}, and set each of them
 * up with the given grammar.
 *
 * @param grammar
 *            the grammar handed to every plugin's {@code setUp}
 * @return the plugins, in the order the configuration returned them, after
 *         {@code setUp} has been called on each
 * @throws IOException
 *             if any of the underlying configuration or plugin calls throws it
 */
private static List<InputTextPlugin> setupDefaultInputTextPlugins(Grammar grammar) throws IOException {
    PathAnchor classpathAnchor = PathAnchor.classpath();
    String defaultPluginJson = "{\"inputTextPlugin\":[{\"class\":\"com.worksap.nlp.sudachi.DefaultInputTextPlugin\"}]}";
    List<Config.PluginConf<InputTextPlugin>> pluginConfs = Config
            .fromJsonString(defaultPluginJson, classpathAnchor)
            .getInputTextPlugins();

    List<InputTextPlugin> readyPlugins = new ArrayList<>();
    for (int i = 0; i < pluginConfs.size(); i++) {
        InputTextPlugin plugin = pluginConfs.get(i).instantiate(classpathAnchor);
        // A plugin is only added to the result once it has been set up.
        plugin.setUp(grammar);
        readyPlugins.add(plugin);
    }
    return readyPlugins;
}

/** Normalize the given text. */
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
package com.worksap.nlp.sudachi.dictionary;

import com.worksap.nlp.sudachi.Config;
import com.worksap.nlp.sudachi.PathAnchor;

import java.io.*;
import java.nio.charset.StandardCharsets;
Expand Down Expand Up @@ -157,4 +158,9 @@ public static CharacterCategory load(Config.Resource<CharacterCategory> resource
return result;
});
}

/**
 * Load the character category from the {@code char.def} resource resolved
 * through the classpath {@link PathAnchor}.
 *
 * @return the character category produced by {@code load} for that resource
 * @throws IOException
 *             if {@code load} throws it
 */
public static CharacterCategory loadDefault() throws IOException {
    return load(PathAnchor.classpath().resource("char.def"));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,6 @@

import com.worksap.nlp.sudachi.WordId;
import com.worksap.nlp.sudachi.TextNormalizer;
import com.worksap.nlp.sudachi.Config;
import com.worksap.nlp.sudachi.InputTextPlugin;
import com.worksap.nlp.sudachi.PathAnchor;
import com.worksap.nlp.sudachi.SudachiCommandLine.FileOrStdoutPrintStream;

public class DictionaryPrinter {
Expand Down Expand Up @@ -63,8 +60,8 @@ public class DictionaryPrinter {
}

// set default char category for text normalizer
grammar.setCharacterCategory(CharacterCategory.load(PathAnchor.classpath().resource("char.def")));
textNormalizer = setupTextNormalizer(grammar);
grammar.setCharacterCategory(CharacterCategory.loadDefault());
textNormalizer = new TextNormalizer(grammar);

List<String> poss = new ArrayList<>();
for (short pid = 0; pid < grammar.getPartOfSpeechSize(); pid++) {
Expand All @@ -75,23 +72,6 @@ public class DictionaryPrinter {
this.entrySize = dic.getLexicon().size();
}

/**
 * Build a TextNormalizer for the given grammar, using the plugins
 * instantiated from a JSON configuration that names DefaultInputTextPlugin.
 *
 * @param grammar
 *            the grammar passed to each plugin's {@code setUp} and to the
 *            resulting normalizer
 * @return a new TextNormalizer holding the grammar and the set-up plugins
 * @throws IOException
 *             if any of the underlying configuration or plugin calls throws it
 */
private TextNormalizer setupTextNormalizer(Grammar grammar) throws IOException {
    PathAnchor classpath = PathAnchor.classpath();
    String pluginJson = "{\"inputTextPlugin\":[{\"class\":\"com.worksap.nlp.sudachi.DefaultInputTextPlugin\"}]}";

    List<InputTextPlugin> configured = new ArrayList<>();
    for (Config.PluginConf<InputTextPlugin> conf : Config.fromJsonString(pluginJson, classpath)
            .getInputTextPlugins()) {
        InputTextPlugin plugin = conf.instantiate(classpath);
        plugin.setUp(grammar);
        configured.add(plugin);
    }
    return new TextNormalizer(grammar, configured);
}

private static void printUsage() {
Console console = System.console();
console.printf("usage: PrintDictionary [-o file] [-s file] file\n");
Expand Down

0 comments on commit 69eb3dd

Please sign in to comment.