Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for English #41

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions bench/bench.iml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="JAVA_MODULE" version="4">
<component name="NewModuleRootManager" inherit-compiler-output="true">
<exclude-output />
<content url="file://$MODULE_DIR$">
<sourceFolder url="file://$MODULE_DIR$/src/main/java" isTestSource="false" />
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>
11 changes: 11 additions & 0 deletions lib/lib.iml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="JAVA_MODULE" version="4">
<component name="NewModuleRootManager" inherit-compiler-output="true">
<exclude-output />
<content url="file://$MODULE_DIR$">
<sourceFolder url="file://$MODULE_DIR$/src/main/scala" isTestSource="false" />
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@ package com.worksap.nlp.uzushio.lib.lang

import com.optimaize.langdetect.LanguageDetectorBuilder
import com.optimaize.langdetect.ngram.NgramExtractor

import java.nio.charset.{Charset, CodingErrorAction}
import java.nio.{ByteBuffer, CharBuffer}
import java.util.regex.{Matcher, Pattern}

sealed trait EstimationResult {
def str: String = "unk"
Expand All @@ -20,26 +20,71 @@ class LangEstimation(private val minBytes: Int = 256) {
private val decodeBuffer = CharBuffer.allocate(4 * 1024)
private def langDetector = LangEstimation.cachedDetector

/** Copy non-ASCII characters into detection buffer
/** Strips markup from the second half of `input`, leaving only text content.
  *
  * Only the substring starting at `input.length / 2` is examined; the first half of the
  * document is discarded. Within that substring, `<script>` elements, `<style>` elements and
  * HTML comments are removed together with their contents, and then every remaining tag is
  * dropped.
  *
  * NOTE(review): the cut at the midpoint is positional, so it can land inside a tag or a script
  * body; the dangling remainder is then not matched by the element patterns. Confirm this is
  * acceptable for language detection.
  *
  * @param input
  *   document text as a string
  * @return
  *   the second half of `input` with scripts, styles, comments and tags removed
  */
private def cleanHtmlContent(input: String): String = {
  // (?s) lets '.' match line breaks so multi-line elements are removed as a whole.
  val scriptPattern = Pattern.compile("(?s)<script.*?>.*?</script>")
  val stylePattern = Pattern.compile("(?s)<style.*?>.*?</style>")
  val commentPattern = Pattern.compile("(?s)<!--.*?-->")
  val htmlTagPattern = Pattern.compile("<[^>]+>")

  // Order matters: whole-element removals must run before the generic tag pattern, otherwise
  // the tags would be stripped first and the script/style bodies would survive as plain text.
  val removalOrder = Seq(scriptPattern, stylePattern, commentPattern, htmlTagPattern)

  // Process only the second half of the document.
  val secondHalf = input.substring(input.length / 2)

  // No debug output here: this runs for every document, so printing would flood stdout.
  removalOrder.foldLeft(secondHalf)((text, pattern) => removePattern(text, pattern))
}

/** Deletes every match of `pattern` from `input`.
  *
  * @param input
  *   text to search
  * @param pattern
  *   compiled regular expression whose matches are dropped
  * @return
  *   `input` with each match replaced by the empty string
  */
private def removePattern(input: String, pattern: Pattern): String =
  pattern.matcher(input).replaceAll("")

/** Copy meaningful content into detection buffer, removing HTML, JavaScript, and retaining text.
* Retains both ASCII and non-ASCII characters, focusing on meaningful language content.
*
* @param input
* input buffer
* input CharBuffer
* @param output
* output buffer
* output CharBuffer
*/
private def copyNonAscii(input: CharBuffer, output: CharBuffer): Unit = {
var prevWhitespace = false
while (input.hasRemaining && output.remaining() > 1) {
val char = input.get()
if ((char & 0xffff) >= 128) {
if (prevWhitespace) {
output.put(' ')
prevWhitespace = false
}
output.put(char)
private def copyMeaningfulContent(input: CharBuffer, output: CharBuffer): Unit = {
// Convert the input to a string
val content = input.toString
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can avoid creating this string entirely; it is not needed.
`Pattern.matcher` accepts any `CharSequence`, and `CharBuffer` implements that interface, so the buffer can be passed to the matcher directly: https://docs.oracle.com/en/java/javase/17/docs/api/java.base/java/util/regex/Pattern.html#matcher(java.lang.CharSequence)


// Use regex to remove HTML tags and JavaScript
val cleanedContent = cleanHtmlContent(content)

// Filter and clean the remaining text, retaining letters, digits, whitespace, and non-ASCII characters
val meaningfulContent = cleanedContent.flatMap { char =>
if (char.isLetterOrDigit || char.isWhitespace || char >= 128) {
Some(char)
} else {
prevWhitespace = true
None
}
}

// Put the cleaned content into the output buffer
val result = meaningfulContent.mkString.trim
println(s"Meaningful content: $result") // Print the meaningful content
// Copy meaningful content to the output buffer
output.put(result)
}

private def prepareBuffer(
Expand All @@ -60,43 +105,48 @@ class LangEstimation(private val minBytes: Int = 256) {
return None
}
decBuf.flip()
copyNonAscii(decBuf, buf)
copyMeaningfulContent(decBuf, buf)
decBuf.clear()
}

buf.flip()
Some(buf.limit())
}

/** Estimate language by taking at most 5k characters from first 20kb of text. This detector
* ignores all ASCII characters, so languages which use such scripts are not detectable. Returns
* [[BadEncoding]] if there exist non-mappable characters using the passed encoding.
/** Estimate the language by taking at most 5k characters from the first 20kb of text.
* Retains both ASCII and non-ASCII characters, but removes HTML and JavaScript tags.
* Returns [[BadEncoding]] if there are unmappable characters using the provided encoding.
*
* @param data
* text to detect language from
* the text to detect language from
* @param offset
* offset from the array start
* the offset from the start of the array
* @param charset
* charset to use for converting byte stream to characters
* the charset to use for converting byte stream to characters
* @return
* child classes of [[EstimationResult]]
* a subclass of [[EstimationResult]]
*/
def estimateLang(
data: Array[Byte],
offset: Int,
charset: Charset
): EstimationResult = {
val bufferStatus = prepareBuffer(data, offset, charset)
val internalBufferString = internalBuffer.toString
println(s"internalBuffer: $internalBufferString") // Print the content of the internal buffer
if (bufferStatus.isEmpty) {
return BadEncoding
}
val ncopied = bufferStatus.get
println(s"Copied characters: $ncopied") // Print the number of copied characters
if (ncopied > minBytes) {
val language = langDetector.detect(internalBuffer)
println(s"Detected language: ${language}") // Print the detected language
if (!language.isPresent) {
EstimationFailure
} else {
val code = language.get().getLanguage
println(s"Detected language code: $code") // Print the detected language code
ProbableLanguage(code)
}
} else {
Expand All @@ -114,4 +164,4 @@ object LangEstimation {
LanguageDetectorBuilder.create(NgramExtractor.gramLengths(1, 2)).withProfiles(profiles).build()
}

}
}
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ final case class DuplicateCandidateRow(
}

object DuplicateCandidateRow {
final val NGRAM_SIG_LEN = 128
final val NGRAM_SIG_LEN = 256 // for en, increase to > 128.
final val BITS_IN_LONG = 64
final val BIT_MASK = BITS_IN_LONG - 1
final val BYTE_MASK = (NGRAM_SIG_LEN * BITS_IN_LONG - 1) ^ BIT_MASK
Expand All @@ -159,7 +159,7 @@ object DuplicateCandidateRow {
/** size of JVM object/array header */
final val HEADER_SIZE = 16
final val MAX_MATCHING_LENGTH = 50
private val ngrams = new NgramHashExtractor(3, 4)
private val ngrams = new NgramHashExtractor(2, 5)// en n-grams, 2-5 can capture most of the cases
}

class CandidateRowProcessor(
Expand Down Expand Up @@ -964,8 +964,8 @@ object DeduplicateParagraphs {
cache = cache.toOption,
partitions = partitions(),
simHashSize = 128,
minNgramSize = 2,
maxNgramSize = 4,
minNgramSize = 3, // for en, change to 3, ja is 2
maxNgramSize = 8, // for en, change to 8-9, ja is 4
numShifts = numShifts(),
propagatePartitions = propagatePartitions(),
execution = execution(),
Expand Down
Original file line number Diff line number Diff line change
@@ -1,15 +1,59 @@
package com.worksap.nlp.uzushio.lib.lang

import com.worksap.nlp.uzushio.lib.utils.ClasspathAccess
import java.nio.charset.{Charset, StandardCharsets}
import org.scalatest.freespec.AnyFreeSpec

class LangEstimationSpec extends AnyFreeSpec with ClasspathAccess {
class LangEstimationSpec extends AnyFreeSpec {

"LangEstimation" - {
val sniffer = new LangTagSniffer()
"sniffs charset shift_jis fragment" in {
val data = classpathBytes("lang/shift_jis.txt")
val tags = sniffer.sniffTags(data, 0, data.length)
assert("Shift-JIS" == tags.charset)
val estimator = new LangEstimation()

"detects Japanese language from a simulated Wikipedia page about Japan" in {
// Simulate a Wikipedia HTML page about Japan, written in Japanese and encoded as Shift-JIS
val htmlContent = """
<html>
<head>
<title>日本 - Wikipedia</title>
</head>
<body>
<h1>日本</h1>
<p>日本(にっぽん、にほん)は、東アジアに位置する島国で、太平洋に面しています。日本は北海道、本州、四国、九州の四つの主要な島から構成されています。</p>
<p>日本の首都は東京で、人口は世界でも有数の規模を誇ります。日本は高度に発展した国であり、技術、経済、文化など多くの分野で世界に影響を与えています。</p>
<p>日本の歴史は古く、何世紀にもわたる様々な変革と発展を遂げてきました。現代の日本は、明治維新後に急速に産業化され、世界的な経済大国となりました。</p>
<p>第二次世界大戦後、日本は驚異的な復興を遂げ、現在では世界で最も強力な経済の一つとして知られています。</p>
</body>
</html>
"""
val data = htmlContent.getBytes("Shift_JIS")
val result = estimator.estimateLang(data, 0, Charset.forName("Shift_JIS"))

// Assert that the detected language is Japanese
assert(result.isInstanceOf[ProbableLanguage])
assert(result.asInstanceOf[ProbableLanguage].lang == "ja") // 期待的结果是日语
}

"detects English language from a simulated Wikipedia page about Japan" in {
// Simulate the English Wikipedia page about Japan, encoded as UTF-8
val htmlContent = """
<html>
<head>
<title>Japan - Wikipedia</title>
</head>
<body>
<h1>Japan</h1>
<p>Japan is an island country in East Asia, located in the northwest Pacific Ocean. It borders the Sea of Japan to the west, and extends from the Sea of Okhotsk in the north to the East China Sea and Taiwan in the south.</p>
<p>Japan is a highly developed country, known for its advanced technology, strong economy, and rich culture. With a population of over 125 million, Japan is the world's eleventh most populous country, and Tokyo, its capital, is one of the most populous cities in the world.</p>
<p>The country's history dates back to the 14th century BC, and over the centuries, it has evolved through various dynasties and periods. Modern Japan emerged in the late 19th century during the Meiji Restoration, which transformed it into an industrial and economic power.</p>
<p>After World War II, Japan experienced rapid recovery and became one of the world's leading economies. Today, Japan is known for its influence in global technology, culture, and economy.</p>
</body>
</html>
"""
val data = htmlContent.getBytes(StandardCharsets.UTF_8)
val result = estimator.estimateLang(data, 0, StandardCharsets.UTF_8)

// Assert that the detected language is English
assert(result.isInstanceOf[ProbableLanguage])
assert(result.asInstanceOf[ProbableLanguage].lang == "en") // 期待的结果是英语
}
}
}
}