Skip to content

Commit

Permalink
add Lingua as language detection library
Browse files Browse the repository at this point in the history
  • Loading branch information
lfoppiano committed Jan 13, 2025
1 parent ed420e4 commit fe475f0
Show file tree
Hide file tree
Showing 5 changed files with 63 additions and 3 deletions.
2 changes: 1 addition & 1 deletion build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,6 @@ subprojects {
// treating them separately, these jars will be flattened into grobid-core.jar on installing,
// to avoid missing dependencies from the projects that include grobid-core (see 'jar' task in grobid-core)
localLibs = ['crfpp-1.0.2.jar',
'langdetect-1.1-20120112.jar',
'wipo-analysers-0.0.2.jar',
'imageio-pnm-1.0.jar',
'wapiti-1.5.0.jar']
Expand Down Expand Up @@ -140,6 +139,7 @@ subprojects {
implementation "com.google.guava:guava:31.0.1-jre"
implementation "org.apache.httpcomponents:httpclient:4.5.3"
implementation "black.ninia:jep:4.0.2"
implementation 'com.github.pemistahl:lingua:1.2.2'

implementation "com.fasterxml.jackson.core:jackson-core:2.14.3"
implementation "com.fasterxml.jackson.core:jackson-databind:2.14.3"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ public static LanguageUtilities getInstance() {
if (instance == null) {
synchronized (LanguageUtilities.class) {
if (instance == null) {
LOGGER.debug("synchronized getNewInstance");
instance = new LanguageUtilities();
}
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
package org.grobid.core.lang.impl

import com.github.pemistahl.lingua.api.LanguageDetectorBuilder
import org.grobid.core.lang.Language
import org.grobid.core.lang.LanguageDetector
import org.slf4j.Logger
import org.slf4j.LoggerFactory

/**
 * [LanguageDetector] implementation that delegates to the Lingua library.
 *
 * One Lingua detector, configured via `fromAllLanguages()`, is built when this
 * object is constructed and reused for every [detect] call.
 */
class LinguaLanguageDetector : LanguageDetector {
    // NOTE(review): preloading of the language models is left disabled here; presumably
    // Lingua then loads models on demand — confirm against the Lingua documentation.
    private val detector: com.github.pemistahl.lingua.api.LanguageDetector = LanguageDetectorBuilder
        .fromAllLanguages()
        // .withPreloadedLanguageModels()
        .build()

    /**
     * Identifies the language of [text].
     *
     * The result is built from the first entry of the sorted confidence map returned by
     * Lingua (presumably the candidate with the highest confidence — confirm against the
     * Lingua documentation).
     *
     * @param text the text to analyse
     * @return a [Language] carrying the ISO 639-1 code of the selected candidate and its confidence value
     * @throws NoSuchElementException if Lingua reports no candidate at all for [text]
     */
    override fun detect(text: String): Language {
        val confidences = detector.computeLanguageConfidenceValues(text = text)

        if (LOGGER.isDebugEnabled) {
            LOGGER.debug(confidences.toString())
        }

        // An empty map previously surfaced as a message-less NoSuchElementException from
        // firstKey(); keep the exception type but say what actually went wrong.
        val best = confidences.entries.firstOrNull()
            ?: throw NoSuchElementException("Lingua returned no language candidates for the given text")

        return Language(best.key.isoCode639_1.toString(), best.value)
    }

    companion object {
        private val LOGGER: Logger = LoggerFactory.getLogger(LinguaLanguageDetector::class.java)
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
package org.grobid.core.lang.impl;

import org.grobid.core.lang.LanguageDetector;
import org.grobid.core.lang.LanguageDetectorFactory;

/**
 * Implementation of a language detector factory with Lingua language identifier.
 *
 * The detector is created lazily on the first call to {@link #getInstance()} and the
 * same object is then returned by every factory instance, because it is held in a
 * static field.
 */
public class LinguaLanguageDetectorFactory implements LanguageDetectorFactory {
    // Shared across all factory instances; volatile so the unsynchronized read in getInstance() is safe.
    private static volatile LanguageDetector instance = null;

    /**
     * Returns the shared Lingua-backed detector, creating it on first use.
     *
     * @return the single {@link LanguageDetector} shared by all factories of this class
     */
    public LanguageDetector getInstance() {
        LanguageDetector current = instance;
        if (current == null) {
            // The field is static, so the lock must be class-wide as well: locking on
            // "this" would let two different factory objects each build a detector.
            synchronized (LinguaLanguageDetectorFactory.class) {
                current = instance;
                if (current == null) {
                    current = new LinguaLanguageDetector();
                    instance = current;
                }
            }
        }
        return current;
    }

}
3 changes: 2 additions & 1 deletion grobid-home/config/grobid.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,8 @@ grobid:
corsAllowedHeaders: "X-Requested-With,Content-Type,Accept,Origin"

# the actual implementation for language recognition to be used
languageDetectorFactory: "org.grobid.core.lang.impl.CybozuLanguageDetectorFactory"
# languageDetectorFactory: "org.grobid.core.lang.impl.CybozuLanguageDetectorFactory"
languageDetectorFactory: "org.grobid.core.lang.impl.LinguaLanguageDetectorFactory"

# the actual implementation for optional sentence segmentation to be used (PragmaticSegmenter or OpenNLP)
#sentenceDetectorFactory: "org.grobid.core.lang.impl.PragmaticSentenceDetectorFactory"
Expand Down

0 comments on commit fe475f0

Please sign in to comment.