WorksApplications · ZeekYin · Oct 6, 2024 · Oct 7, 2024 · Oct 9, 2024 · Oct 9, 2024
diff --git a/bench/bench.iml b/bench/bench.iml
@@ -0,0 +1,11 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="JAVA_MODULE" version="4">
+  <component name="NewModuleRootManager" inherit-compiler-output="true">
+    <exclude-output />
+    <content url="file://$MODULE_DIR$">
+      <sourceFolder url="file://$MODULE_DIR$/src/main/java" isTestSource="false" />
+    </content>
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+</module>
diff --git a/lib/lib.iml b/lib/lib.iml
@@ -0,0 +1,11 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="JAVA_MODULE" version="4">
+  <component name="NewModuleRootManager" inherit-compiler-output="true">
+    <exclude-output />
+    <content url="file://$MODULE_DIR$">
+      <sourceFolder url="file://$MODULE_DIR$/src/main/scala" isTestSource="false" />
+    </content>
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+</module>
diff --git a/lib/src/main/scala/com/worksap/nlp/uzushio/lib/lang/LangEstimation.scala b/lib/src/main/scala/com/worksap/nlp/uzushio/lib/lang/LangEstimation.scala
@@ -2,9 +2,9 @@ package com.worksap.nlp.uzushio.lib.lang
 
 import com.optimaize.langdetect.LanguageDetectorBuilder
 import com.optimaize.langdetect.ngram.NgramExtractor
-
 import java.nio.charset.{Charset, CodingErrorAction}
 import java.nio.{ByteBuffer, CharBuffer}
+import java.util.regex.{Matcher, Pattern}
 
 sealed trait EstimationResult {
   def str: String = "unk"
@@ -20,26 +20,71 @@ class LangEstimation(private val minBytes: Int = 256) {
   private val decodeBuffer = CharBuffer.allocate(4 * 1024)
   private def langDetector = LangEstimation.cachedDetector
 
-  /** Copy non-ASCII characters into detection buffer
+  /** Markup patterns removed before language detection, in application order: script blocks,
+    * style blocks, comments, then any remaining tag. Element names are matched
+    * case-insensitively because HTML tag names are not case-sensitive.
+    */
+  private val htmlNoisePatterns: Seq[Pattern] = Seq(
+    Pattern.compile("(?is)<script.*?>.*?</script>"),
+    Pattern.compile("(?is)<style.*?>.*?</style>"),
+    Pattern.compile("(?s)<!--.*?-->"),
+    Pattern.compile("<[^>]+>")
+  )
+
+  /** Remove script/style blocks, HTML comments and HTML tags from the second half of the input.
+    *
+    * The first half of the input is discarded before cleaning. A script or style block whose
+    * opening tag falls into the discarded half is therefore not removed as a block; only its
+    * closing tag is stripped by the generic tag pattern.
+    *
+    * @param input
+    *   input buffer as a string
+    * @return
+    *   second half of the input with the markup removed
+    */
+  private def cleanHtmlContent(input: String): String = {
+    // Process from 50% of the input onwards.
+    val secondHalf = input.substring(input.length / 2)
+    htmlNoisePatterns.foldLeft(secondHalf) { (text, pattern) =>
+      removePattern(text, pattern)
+    }
+  }
+
+  /** Strips every match of `pattern` from `input`.
+    * Matching follows java.util.regex; the empty replacement deletes each match.
+    */
+  private def removePattern(input: String, pattern: Pattern): String =
+    (pattern.matcher(input): Matcher).replaceAll("")
+
+  /** Copy meaningful content into detection buffer, removing HTML, JavaScript, and retaining text.
+    * Retains both ASCII and non-ASCII characters, focusing on meaningful language content.
+    *
     * @param input
-    *   input buffer
+    *   input CharBuffer
     * @param output
-    *   output buffer
+    *   output CharBuffer
     */
-  private def copyNonAscii(input: CharBuffer, output: CharBuffer): Unit = {
-    var prevWhitespace = false
-    while (input.hasRemaining && output.remaining() > 1) {
-      val char = input.get()
-      if ((char & 0xffff) >= 128) {
-        if (prevWhitespace) {
-          output.put(' ')
-          prevWhitespace = false
-        }
-        output.put(char)
+  private def copyMeaningfulContent(input: CharBuffer, output: CharBuffer): Unit = {
+    // CharBuffer.toString returns the remaining characters without advancing the position.
+    val cleaned = cleanHtmlContent(input.toString)
+    // Keep letters, digits, whitespace and every non-ASCII character; drop ASCII punctuation.
+    val text = cleaned.filter(c => c.isLetterOrDigit || c.isWhitespace || c >= 128).trim
+
+    // The output buffer has a fixed capacity, so copy only as many characters as still fit;
+    // an unbounded put would throw BufferOverflowException on long inputs.
+    val writable = {
+      if (text.length <= output.remaining()) {
+        text.length
       } else {
-        prevWhitespace = true
+        output.remaining()
       }
     }
+
+    // put(String, start, end) appends text[start, end) and advances the output position;
+    // with end <= remaining() it cannot overflow.
+    if (writable > 0) {
+      output.put(text, 0, writable)
+    }
   }
 
   private def prepareBuffer(
@@ -60,43 +105,48 @@ class LangEstimation(private val minBytes: Int = 256) {
         return None
       }
       decBuf.flip()
-      copyNonAscii(decBuf, buf)
+      copyMeaningfulContent(decBuf, buf)
       decBuf.clear()
     }
 
     buf.flip()
     Some(buf.limit())
   }
 
-  /** Estimate language by taking at most 5k characters from first 20kb of text. This detector
-    * ignores all ASCII characters, so languages which use such scripts are not detectable. Returns
-    * [[BadEncoding]] if there exist non-mappable characters using the passed encoding.
+  /** Estimate the language by taking at most 5k characters from the first 20kb of text.
+    * Keeps letters, digits, whitespace and non-ASCII characters from the second half of each
+    * decoded buffer (HTML markup is removed). Returns [[BadEncoding]] on unmappable characters.
     *
     * @param data
-    *   text to detect language from
+    *   the text to detect language from
     * @param offset
-    *   offset from the array start
+    *   the offset from the start of the array
     * @param charset
-    *   charset to use for converting byte stream to characters
+    *   the charset to use for converting byte stream to characters
     * @return
-    *   child classes of [[EstimationResult]]
+    *   a subclass of [[EstimationResult]]
     */
   def estimateLang(
       data: Array[Byte],
       offset: Int,
       charset: Charset
   ): EstimationResult = {
     val bufferStatus = prepareBuffer(data, offset, charset)
+    val internalBufferString = internalBuffer.toString
+    println(s"internalBuffer: $internalBufferString") // Print the content of the internal buffer
     if (bufferStatus.isEmpty) {
       return BadEncoding
     }
     val ncopied = bufferStatus.get
+    println(s"Copied characters: $ncopied") // Print the number of copied characters
     if (ncopied > minBytes) {
       val language = langDetector.detect(internalBuffer)
+      println(s"Detected language: ${language}") // Print the detected language
       if (!language.isPresent) {
         EstimationFailure
       } else {
         val code = language.get().getLanguage
+        println(s"Detected language code: $code") // Print the detected language code
         ProbableLanguage(code)
       }
     } else {
@@ -114,4 +164,4 @@ object LangEstimation {
     LanguageDetectorBuilder.create(NgramExtractor.gramLengths(1, 2)).withProfiles(profiles).build()
   }
 
-}
+}
diff --git a/lib/src/main/scala/com/worksap/nlp/uzushio/lib/runners/DeduplicateParagraphs.scala b/lib/src/main/scala/com/worksap/nlp/uzushio/lib/runners/DeduplicateParagraphs.scala
@@ -149,7 +149,7 @@ final case class DuplicateCandidateRow(
 }
 
 object DuplicateCandidateRow {
-  final val NGRAM_SIG_LEN = 128
+  final val NGRAM_SIG_LEN = 256 // for en, increase to > 128.
   final val BITS_IN_LONG = 64
   final val BIT_MASK = BITS_IN_LONG - 1
   final val BYTE_MASK = (NGRAM_SIG_LEN * BITS_IN_LONG - 1) ^ BIT_MASK
@@ -159,7 +159,7 @@ object DuplicateCandidateRow {
   /** size of JVM object/array header */
   final val HEADER_SIZE = 16
   final val MAX_MATCHING_LENGTH = 50
-  private val ngrams = new NgramHashExtractor(3, 4)
+  private val ngrams = new NgramHashExtractor(2, 5)// en n-grams, 2-5 can capture most of the cases
 }
 
 class CandidateRowProcessor(
@@ -964,8 +964,8 @@ object DeduplicateParagraphs {
       cache = cache.toOption,
       partitions = partitions(),
       simHashSize = 128,
-      minNgramSize = 2,
-      maxNgramSize = 4,
+      minNgramSize = 3, // for en, change to 3, ja is 2
+      maxNgramSize = 8, // for en, change to 8-9, ja is 4
       numShifts = numShifts(),
       propagatePartitions = propagatePartitions(),
       execution = execution(),

diff --git a/lib/src/test/scala/com/worksap/nlp/uzushio/lib/lang/LangEstimationSpec.scala b/lib/src/test/scala/com/worksap/nlp/uzushio/lib/lang/LangEstimationSpec.scala
@@ -1,15 +1,59 @@
 package com.worksap.nlp.uzushio.lib.lang
 
-import com.worksap.nlp.uzushio.lib.utils.ClasspathAccess
+import java.nio.charset.{Charset, StandardCharsets}
 import org.scalatest.freespec.AnyFreeSpec
 
-class LangEstimationSpec extends AnyFreeSpec with ClasspathAccess {
+class LangEstimationSpec extends AnyFreeSpec {
+
   "LangEstimation" - {
-    val sniffer = new LangTagSniffer()
-    "sniffs charset shift_jis fragment" in {
-      val data = classpathBytes("lang/shift_jis.txt")
-      val tags = sniffer.sniffTags(data, 0, data.length)
-      assert("Shift-JIS" == tags.charset)
+    val estimator = new LangEstimation()
+
+    "detects Japanese language from a simulated Wikipedia page about Japan" in {
+      // Simulated Wikipedia HTML page about Japan, written in Japanese and encoded as Shift-JIS
+      val htmlContent = """
+        <html>
+          <head>
+            <title>日本 - Wikipedia</title>
+          </head>
+          <body>
+            <h1>日本</h1>
+            <p>日本（にっぽん、にほん）は、東アジアに位置する島国で、太平洋に面しています。日本は北海道、本州、四国、九州の四つの主要な島から構成されています。</p>
+            <p>日本の首都は東京で、人口は世界でも有数の規模を誇ります。日本は高度に発展した国であり、技術、経済、文化など多くの分野で世界に影響を与えています。</p>
+            <p>日本の歴史は古く、何世紀にもわたる様々な変革と発展を遂げてきました。現代の日本は、明治維新後に急速に産業化され、世界的な経済大国となりました。</p>
+            <p>第二次世界大戦後、日本は驚異的な復興を遂げ、現在では世界で最も強力な経済の一つとして知られています。</p>
+          </body>
+        </html>
+      """
+      val data = htmlContent.getBytes("Shift_JIS")
+      val result = estimator.estimateLang(data, 0, Charset.forName("Shift_JIS"))
+
+      // The detected language must be Japanese; any other result surfaces through its `str`.
+      val detected = result match { case p: ProbableLanguage => p.lang; case other => other.str }
+      assert(detected == "ja")
+    }
+
+    "detects English language from a simulated Wikipedia page about Japan" in {
+      // Simulated English Wikipedia page about Japan, encoded as UTF-8
+      val htmlContent = """
+        <html>
+          <head>
+            <title>Japan - Wikipedia</title>
+          </head>
+          <body>
+            <h1>Japan</h1>
+            <p>Japan is an island country in East Asia, located in the northwest Pacific Ocean. It borders the Sea of Japan to the west, and extends from the Sea of Okhotsk in the north to the East China Sea and Taiwan in the south.</p>
+            <p>Japan is a highly developed country, known for its advanced technology, strong economy, and rich culture. With a population of over 125 million, Japan is the world's eleventh most populous country, and Tokyo, its capital, is one of the most populous cities in the world.</p>
+            <p>The country's history dates back to the 14th century BC, and over the centuries, it has evolved through various dynasties and periods. Modern Japan emerged in the late 19th century during the Meiji Restoration, which transformed it into an industrial and economic power.</p>
+            <p>After World War II, Japan experienced rapid recovery and became one of the world's leading economies. Today, Japan is known for its influence in global technology, culture, and economy.</p>
+          </body>
+        </html>
+      """
+      val data = htmlContent.getBytes(StandardCharsets.UTF_8)
+      val result = estimator.estimateLang(data, 0, StandardCharsets.UTF_8)
+
+      // The detected language must be English; any other result surfaces through its `str`.
+      val detected = result match { case p: ProbableLanguage => p.lang; case other => other.str }
+      assert(detected == "en")
     }
   }
-}
+}