From 15c9a69f524fcbcee51acf19e4c412ed14b6296d Mon Sep 17 00:00:00 2001
From: ZeekYin <44542418+ZeekYin@users.noreply.github.com>
Date: Sun, 6 Oct 2024 20:44:27 +0900
Subject: [PATCH 1/7] params changed for English

---
 .../com/worksap/nlp/uzushio/lib/lang/LangEstimation.scala | 3 ++-
 .../nlp/uzushio/lib/runners/DeduplicateParagraphs.scala   | 8 ++++----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/lib/src/main/scala/com/worksap/nlp/uzushio/lib/lang/LangEstimation.scala b/lib/src/main/scala/com/worksap/nlp/uzushio/lib/lang/LangEstimation.scala
index f72dc7b..f4d9637 100644
--- a/lib/src/main/scala/com/worksap/nlp/uzushio/lib/lang/LangEstimation.scala
+++ b/lib/src/main/scala/com/worksap/nlp/uzushio/lib/lang/LangEstimation.scala
@@ -28,6 +28,7 @@ class LangEstimation(private val minBytes: Int = 256) {
     */
   private def copyNonAscii(input: CharBuffer, output: CharBuffer): Unit = {
     var prevWhitespace = false
+    //
     while (input.hasRemaining && output.remaining() > 1) {
       val char = input.get()
       if ((char & 0xffff) >= 128) {
@@ -53,7 +54,7 @@ class LangEstimation(private val minBytes: Int = 256) {
     val decoder = charset.newDecoder().onUnmappableCharacter(CodingErrorAction.REPORT)
     decBuf.clear()
     buf.clear()
-
+    //
     while (inputData.remaining() > 0 && buf.remaining() > 0) {
       val result = decoder.decode(inputData, decBuf, true)
       if (result.isUnmappable || result.isError || result.isMalformed) {
diff --git a/lib/src/main/scala/com/worksap/nlp/uzushio/lib/runners/DeduplicateParagraphs.scala b/lib/src/main/scala/com/worksap/nlp/uzushio/lib/runners/DeduplicateParagraphs.scala
index 02e17f6..520ef23 100644
--- a/lib/src/main/scala/com/worksap/nlp/uzushio/lib/runners/DeduplicateParagraphs.scala
+++ b/lib/src/main/scala/com/worksap/nlp/uzushio/lib/runners/DeduplicateParagraphs.scala
@@ -149,7 +149,7 @@ final case class DuplicateCandidateRow(
 }
 
 object DuplicateCandidateRow {
-  final val NGRAM_SIG_LEN = 128
+  final val NGRAM_SIG_LEN = 256 // for en, increase to > 128.
   final val BITS_IN_LONG = 64
   final val BIT_MASK = BITS_IN_LONG - 1
   final val BYTE_MASK = (NGRAM_SIG_LEN * BITS_IN_LONG - 1) ^ BIT_MASK
@@ -159,7 +159,7 @@ object DuplicateCandidateRow {
   /** size of JVM object/array header */
   final val HEADER_SIZE = 16
   final val MAX_MATCHING_LENGTH = 50
-  private val ngrams = new NgramHashExtractor(3, 4)
+  private val ngrams = new NgramHashExtractor(2, 5)// en n-grams, 2-5 can capture most of the cases
 }
 
 class CandidateRowProcessor(
@@ -964,8 +964,8 @@ object DeduplicateParagraphs {
       cache = cache.toOption,
       partitions = partitions(),
       simHashSize = 128,
-      minNgramSize = 2,
-      maxNgramSize = 4,
+      minNgramSize = 3, // for en, change to 3, ja is 2
+      maxNgramSize = 8, // for en, change to 8-9, ja is 4
       numShifts = numShifts(),
       propagatePartitions = propagatePartitions(),
       execution = execution(),

From ddc2454950c691b97e76d90490b6ee10d1470a07 Mon Sep 17 00:00:00 2001
From: ZeekYin <44542418+ZeekYin@users.noreply.github.com>
Date: Mon, 7 Oct 2024 13:58:56 +0900
Subject: [PATCH 2/7] add support for ascii char

---
 .../nlp/uzushio/lib/lang/LangEstimation.scala | 62 ++++++++++++++++++-
 1 file changed, 61 insertions(+), 1 deletion(-)

diff --git a/lib/src/main/scala/com/worksap/nlp/uzushio/lib/lang/LangEstimation.scala b/lib/src/main/scala/com/worksap/nlp/uzushio/lib/lang/LangEstimation.scala
index f4d9637..5e2bcd5 100644
--- a/lib/src/main/scala/com/worksap/nlp/uzushio/lib/lang/LangEstimation.scala
+++ b/lib/src/main/scala/com/worksap/nlp/uzushio/lib/lang/LangEstimation.scala
@@ -26,10 +26,63 @@ class LangEstimation(private val minBytes: Int = 256) {
     * @param output
     *   output buffer
     */
+
+  private def cleanHtmlAndScripts(input: CharBuffer): CharBuffer = {
+    val output = CharBuffer.allocate(input.capacity())
+    var inTag = false
+    var inScript = false
+    var inStyle = false
+    var prevChar: Char = 0
+
+    while (input.hasRemaining) {
+      val char = input.get()
+
+      // Check for start of HTML tag
+      if (char == '<') {
+        prevChar = char
+        inTag = true
+        // Check for script or style tags
+        if (input.remaining() > 6) {
+          val nextTag = input.subSequence(input.position(), input.position() + 6).toString().toLowerCase()
+          if (nextTag.startsWith("script")) {
+            inScript = true
+          } else if (nextTag.startsWith("style")) {
+            inStyle = true
+          }
+        }
+      }
+
+      // Skip content inside <script> or <style> tags
+      if (inScript || inStyle) {
+        if (char == '>' && prevChar == '/') {
+          inScript = false
+          inStyle = false
+        }
+        prevChar = char
+        continue
+      }
+
+      // Skip HTML tags
+      if (inTag && char == '>') {
+        inTag = false
+        prevChar = char
+        continue
+      }
+
+      // If not in a tag, script or style, add to output
+      if (!inTag && !inScript && !inStyle) {
+        output.put(char)
+      }
+    }
+
+    output.flip() // Flip the output buffer to make it readable
+    output
+  }
+
   private def copyNonAscii(input: CharBuffer, output: CharBuffer): Unit = {
     var prevWhitespace = false
     //
-    while (input.hasRemaining && output.remaining() > 1) {
+    /*while (input.hasRemaining && output.remaining() > 1) {
       val char = input.get()
       if ((char & 0xffff) >= 128) {
         if (prevWhitespace) {
@@ -40,6 +93,13 @@ class LangEstimation(private val minBytes: Int = 256) {
       } else {
         prevWhitespace = true
       }
+    }*/
+    val cleaned = cleanHtmlAndScripts(input)
+    // copy cleaned buffer to output
+    while (cleaned.hasRemaining && output.remaining() > 1) {
+      val char = cleaned.get()
+      //copy all characters
+      output.put(char)
     }
   }
 

From 2d2a167ed40324a2e2b3ad9d47abaf4aa1747521 Mon Sep 17 00:00:00 2001
From: ZeekYin <44542418+ZeekYin@users.noreply.github.com>
Date: Wed, 9 Oct 2024 10:09:27 +0900
Subject: [PATCH 3/7] Remove continue statements from cleanHtmlAndScripts

---
 .../com/worksap/nlp/uzushio/lib/lang/LangEstimation.scala      | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/lib/src/main/scala/com/worksap/nlp/uzushio/lib/lang/LangEstimation.scala b/lib/src/main/scala/com/worksap/nlp/uzushio/lib/lang/LangEstimation.scala
index 5e2bcd5..76a818b 100644
--- a/lib/src/main/scala/com/worksap/nlp/uzushio/lib/lang/LangEstimation.scala
+++ b/lib/src/main/scala/com/worksap/nlp/uzushio/lib/lang/LangEstimation.scala
@@ -59,14 +59,13 @@ class LangEstimation(private val minBytes: Int = 256) {
           inStyle = false
         }
         prevChar = char
-        continue
+        
       }
 
       // Skip HTML tags
       if (inTag && char == '>') {
         inTag = false
         prevChar = char
-        continue
       }
 
       // If not in a tag, script or style, add to output

From c6058f8c35677ffdfd9b50c320eab8e949d6587c Mon Sep 17 00:00:00 2001
From: ZeekYin <44542418+ZeekYin@users.noreply.github.com>
Date: Wed, 9 Oct 2024 10:43:49 +0900
Subject: [PATCH 4/7] Revert custom HTML cleaning in LangEstimation; add IntelliJ module files

---
 bench/bench.iml                               | 11 +++
 lib/lib.iml                                   | 11 +++
 .../nlp/uzushio/lib/lang/LangEstimation.scala | 74 +------------------
 3 files changed, 24 insertions(+), 72 deletions(-)
 create mode 100644 bench/bench.iml
 create mode 100644 lib/lib.iml

diff --git a/bench/bench.iml b/bench/bench.iml
new file mode 100644
index 0000000..f76df2b
--- /dev/null
+++ b/bench/bench.iml
@@ -0,0 +1,11 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="JAVA_MODULE" version="4">
+  <component name="NewModuleRootManager" inherit-compiler-output="true">
+    <exclude-output />
+    <content url="file://$MODULE_DIR$">
+      <sourceFolder url="file://$MODULE_DIR$/src/main/java" isTestSource="false" />
+    </content>
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+</module>
\ No newline at end of file
diff --git a/lib/lib.iml b/lib/lib.iml
new file mode 100644
index 0000000..3624719
--- /dev/null
+++ b/lib/lib.iml
@@ -0,0 +1,11 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="JAVA_MODULE" version="4">
+  <component name="NewModuleRootManager" inherit-compiler-output="true">
+    <exclude-output />
+    <content url="file://$MODULE_DIR$">
+      <sourceFolder url="file://$MODULE_DIR$/src/main/scala" isTestSource="false" />
+    </content>
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+</module>
\ No newline at end of file
diff --git a/lib/src/main/scala/com/worksap/nlp/uzushio/lib/lang/LangEstimation.scala b/lib/src/main/scala/com/worksap/nlp/uzushio/lib/lang/LangEstimation.scala
index 76a818b..e09438b 100644
--- a/lib/src/main/scala/com/worksap/nlp/uzushio/lib/lang/LangEstimation.scala
+++ b/lib/src/main/scala/com/worksap/nlp/uzushio/lib/lang/LangEstimation.scala
@@ -26,79 +26,10 @@ class LangEstimation(private val minBytes: Int = 256) {
     * @param output
     *   output buffer
     */
-
-  private def cleanHtmlAndScripts(input: CharBuffer): CharBuffer = {
-    val output = CharBuffer.allocate(input.capacity())
-    var inTag = false
-    var inScript = false
-    var inStyle = false
-    var prevChar: Char = 0
-
-    while (input.hasRemaining) {
-      val char = input.get()
-
-      // Check for start of HTML tag
-      if (char == '<') {
-        prevChar = char
-        inTag = true
-        // Check for script or style tags
-        if (input.remaining() > 6) {
-          val nextTag = input.subSequence(input.position(), input.position() + 6).toString().toLowerCase()
-          if (nextTag.startsWith("script")) {
-            inScript = true
-          } else if (nextTag.startsWith("style")) {
-            inStyle = true
-          }
-        }
-      }
-
-      // Skip content inside <script> or <style> tags
-      if (inScript || inStyle) {
-        if (char == '>' && prevChar == '/') {
-          inScript = false
-          inStyle = false
-        }
-        prevChar = char
-        
-      }
-
-      // Skip HTML tags
-      if (inTag && char == '>') {
-        inTag = false
-        prevChar = char
-      }
-
-      // If not in a tag, script or style, add to output
-      if (!inTag && !inScript && !inStyle) {
-        output.put(char)
-      }
-    }
-
-    output.flip() // Flip the output buffer to make it readable
-    output
-  }
-
   private def copyNonAscii(input: CharBuffer, output: CharBuffer): Unit = {
     var prevWhitespace = false
-    //
-    /*while (input.hasRemaining && output.remaining() > 1) {
-      val char = input.get()
-      if ((char & 0xffff) >= 128) {
-        if (prevWhitespace) {
-          output.put(' ')
-          prevWhitespace = false
-        }
-        output.put(char)
-      } else {
-        prevWhitespace = true
-      }
-    }*/
-    val cleaned = cleanHtmlAndScripts(input)
-    // copy cleaned buffer to output
-    while (cleaned.hasRemaining && output.remaining() > 1) {
-      val char = cleaned.get()
-      //copy all characters
-      output.put(char)
+    while (input.hasRemaining && output.remaining() > 1) {
+      output.put(input.get())
     }
   }
 
@@ -113,7 +44,6 @@ class LangEstimation(private val minBytes: Int = 256) {
     val decoder = charset.newDecoder().onUnmappableCharacter(CodingErrorAction.REPORT)
     decBuf.clear()
     buf.clear()
-    //
     while (inputData.remaining() > 0 && buf.remaining() > 0) {
       val result = decoder.decode(inputData, decBuf, true)
       if (result.isUnmappable || result.isError || result.isMalformed) {

From 99887b0a0cbea981e0adb9cb064cfcca1ae80a18 Mon Sep 17 00:00:00 2001
From: ZeekYin <44542418+ZeekYin@users.noreply.github.com>
Date: Wed, 9 Oct 2024 15:31:40 +0900
Subject: [PATCH 5/7] english detectable

---
 .../nlp/uzushio/lib/lang/LangEstimation.scala | 77 +++++++++++++++----
 .../uzushio/lib/lang/LangEstimationSpec.scala | 60 +++++++++++++--
 2 files changed, 112 insertions(+), 25 deletions(-)

diff --git a/lib/src/main/scala/com/worksap/nlp/uzushio/lib/lang/LangEstimation.scala b/lib/src/main/scala/com/worksap/nlp/uzushio/lib/lang/LangEstimation.scala
index e09438b..8053723 100644
--- a/lib/src/main/scala/com/worksap/nlp/uzushio/lib/lang/LangEstimation.scala
+++ b/lib/src/main/scala/com/worksap/nlp/uzushio/lib/lang/LangEstimation.scala
@@ -2,9 +2,9 @@ package com.worksap.nlp.uzushio.lib.lang
 
 import com.optimaize.langdetect.LanguageDetectorBuilder
 import com.optimaize.langdetect.ngram.NgramExtractor
-
 import java.nio.charset.{Charset, CodingErrorAction}
 import java.nio.{ByteBuffer, CharBuffer}
+import scala.util.matching.Regex
 
 sealed trait EstimationResult {
   def str: String = "unk"
@@ -20,17 +20,54 @@ class LangEstimation(private val minBytes: Int = 256) {
   private val decodeBuffer = CharBuffer.allocate(4 * 1024)
   private def langDetector = LangEstimation.cachedDetector
 
-  /** Copy non-ASCII characters into detection buffer
+  /** Remove HTML tags and inline JavaScript from the input.
+    * @param input
+    *   input buffer as a string
+    * @return
+    *   cleaned string with HTML and JavaScript removed
+    */
+  private def cleanHtmlContent(input: String): String = {
+    // Use regex to remove HTML tags and content inside <script> tags
+    val htmlTagPattern: Regex = """<[^>]+>""".r
+    val scriptPattern: Regex = """(?s)<script.*?>.*?</script>""".r
+
+    // First, remove the <script> block
+    val noScriptContent = scriptPattern.replaceAllIn(input, "")
+    // Then, remove all HTML tags
+    val cleaned = htmlTagPattern.replaceAllIn(noScriptContent, "")
+    println(s"Cleaned content: $cleaned") // Print the cleaned content
+    cleaned
+  }
+
+  /** Copy meaningful content into detection buffer, removing HTML, JavaScript, and retaining text.
+    * Retains both ASCII and non-ASCII characters, focusing on meaningful language content.
+    *
     * @param input
-    *   input buffer
+    *   input CharBuffer
     * @param output
-    *   output buffer
+    *   output CharBuffer
     */
-  private def copyNonAscii(input: CharBuffer, output: CharBuffer): Unit = {
-    var prevWhitespace = false
-    while (input.hasRemaining && output.remaining() > 1) {
-      output.put(input.get())
+  private def copyMeaningfulContent(input: CharBuffer, output: CharBuffer): Unit = {
+    // Convert the input to a string
+    val content = input.toString
+
+    // Use regex to remove HTML tags and JavaScript
+    val cleanedContent = cleanHtmlContent(content)
+
+    // Filter and clean the remaining text, retaining letters, digits, whitespace, and non-ASCII characters
+    val meaningfulContent = cleanedContent.flatMap { char =>
+      if (char.isLetterOrDigit || char.isWhitespace || char >= 128) {
+        Some(char)
+      } else {
+        None
+      }
     }
+
+    // Put the cleaned content into the output buffer
+    val result = meaningfulContent.mkString.trim
+    println(s"Meaningful content: $result") // Print the meaningful content
+    // Copy meaningful content to the output buffer
+    output.put(result)
   }
 
   private def prepareBuffer(
@@ -44,13 +81,14 @@ class LangEstimation(private val minBytes: Int = 256) {
     val decoder = charset.newDecoder().onUnmappableCharacter(CodingErrorAction.REPORT)
     decBuf.clear()
     buf.clear()
+
     while (inputData.remaining() > 0 && buf.remaining() > 0) {
       val result = decoder.decode(inputData, decBuf, true)
       if (result.isUnmappable || result.isError || result.isMalformed) {
         return None
       }
       decBuf.flip()
-      copyNonAscii(decBuf, buf)
+      copyMeaningfulContent(decBuf, buf)
       decBuf.clear()
     }
 
@@ -58,18 +96,18 @@ class LangEstimation(private val minBytes: Int = 256) {
     Some(buf.limit())
   }
 
-  /** Estimate language by taking at most 5k characters from first 20kb of text. This detector
-    * ignores all ASCII characters, so languages which use such scripts are not detectable. Returns
-    * [[BadEncoding]] if there exist non-mappable characters using the passed encoding.
+  /** Estimate the language by taking at most 5k characters from the first 20kb of text.
+    * Retains both ASCII and non-ASCII characters, but removes HTML and JavaScript tags.
+    * Returns [[BadEncoding]] if there are unmappable characters using the provided encoding.
     *
     * @param data
-    *   text to detect language from
+    *   the text to detect language from
     * @param offset
-    *   offset from the array start
+    *   the offset from the start of the array
     * @param charset
-    *   charset to use for converting byte stream to characters
+    *   the charset to use for converting byte stream to characters
     * @return
-    *   child classes of [[EstimationResult]]
+    *   a subclass of [[EstimationResult]]
     */
   def estimateLang(
       data: Array[Byte],
@@ -77,16 +115,21 @@ class LangEstimation(private val minBytes: Int = 256) {
       charset: Charset
   ): EstimationResult = {
     val bufferStatus = prepareBuffer(data, offset, charset)
+    val internalBufferString = internalBuffer.toString
+    println(s"internalBuffer: $internalBufferString") // Print the content of the internal buffer
     if (bufferStatus.isEmpty) {
       return BadEncoding
     }
     val ncopied = bufferStatus.get
+    println(s"Copied characters: $ncopied") // Print the number of copied characters
     if (ncopied > minBytes) {
       val language = langDetector.detect(internalBuffer)
+      println(s"Detected language: ${language}") // Print the detected language
       if (!language.isPresent) {
         EstimationFailure
       } else {
         val code = language.get().getLanguage
+        println(s"Detected language code: $code") // Print the detected language code
         ProbableLanguage(code)
       }
     } else {
@@ -104,4 +147,4 @@ object LangEstimation {
     LanguageDetectorBuilder.create(NgramExtractor.gramLengths(1, 2)).withProfiles(profiles).build()
   }
 
-}
+}
\ No newline at end of file
diff --git a/lib/src/test/scala/com/worksap/nlp/uzushio/lib/lang/LangEstimationSpec.scala b/lib/src/test/scala/com/worksap/nlp/uzushio/lib/lang/LangEstimationSpec.scala
index 9bd94ad..2865f86 100644
--- a/lib/src/test/scala/com/worksap/nlp/uzushio/lib/lang/LangEstimationSpec.scala
+++ b/lib/src/test/scala/com/worksap/nlp/uzushio/lib/lang/LangEstimationSpec.scala
@@ -1,15 +1,59 @@
 package com.worksap.nlp.uzushio.lib.lang
 
-import com.worksap.nlp.uzushio.lib.utils.ClasspathAccess
+import java.nio.charset.{Charset, StandardCharsets}
 import org.scalatest.freespec.AnyFreeSpec
 
-class LangEstimationSpec extends AnyFreeSpec with ClasspathAccess {
+class LangEstimationSpec extends AnyFreeSpec {
+
   "LangEstimation" - {
-    val sniffer = new LangTagSniffer()
-    "sniffs charset shift_jis fragment" in {
-      val data = classpathBytes("lang/shift_jis.txt")
-      val tags = sniffer.sniffTags(data, 0, data.length)
-      assert("Shift-JIS" == tags.charset)
+    val estimator = new LangEstimation()
+
+    "detects Japanese language from a simulated Wikipedia page about Japan" in {
+      // Simulated Wikipedia HTML page about Japan, written in Japanese and encoded as Shift-JIS
+      val htmlContent = """
+        <html>
+          <head>
+            <title>日本 - Wikipedia</title>
+          </head>
+          <body>
+            <h1>日本</h1>
+            <p>日本（にっぽん、にほん）は、東アジアに位置する島国で、太平洋に面しています。日本は北海道、本州、四国、九州の四つの主要な島から構成されています。</p>
+            <p>日本の首都は東京で、人口は世界でも有数の規模を誇ります。日本は高度に発展した国であり、技術、経済、文化など多くの分野で世界に影響を与えています。</p>
+            <p>日本の歴史は古く、何世紀にもわたる様々な変革と発展を遂げてきました。現代の日本は、明治維新後に急速に産業化され、世界的な経済大国となりました。</p>
+            <p>第二次世界大戦後、日本は驚異的な復興を遂げ、現在では世界で最も強力な経済の一つとして知られています。</p>
+          </body>
+        </html>
+      """
+      val data = htmlContent.getBytes("Shift_JIS")
+      val result = estimator.estimateLang(data, 0, Charset.forName("Shift_JIS"))
+      
+      // The detection result should be Japanese
+      assert(result.isInstanceOf[ProbableLanguage])
+      assert(result.asInstanceOf[ProbableLanguage].lang == "ja") // expected result is Japanese
+    }
+
+    "detects English language from a simulated Wikipedia page about Japan" in {
+      // Simulated English Wikipedia page about Japan, encoded as UTF-8
+      val htmlContent = """
+        <html>
+          <head>
+            <title>Japan - Wikipedia</title>
+          </head>
+          <body>
+            <h1>Japan</h1>
+            <p>Japan is an island country in East Asia, located in the northwest Pacific Ocean. It borders the Sea of Japan to the west, and extends from the Sea of Okhotsk in the north to the East China Sea and Taiwan in the south.</p>
+            <p>Japan is a highly developed country, known for its advanced technology, strong economy, and rich culture. With a population of over 125 million, Japan is the world's eleventh most populous country, and Tokyo, its capital, is one of the most populous cities in the world.</p>
+            <p>The country's history dates back to the 14th century BC, and over the centuries, it has evolved through various dynasties and periods. Modern Japan emerged in the late 19th century during the Meiji Restoration, which transformed it into an industrial and economic power.</p>
+            <p>After World War II, Japan experienced rapid recovery and became one of the world's leading economies. Today, Japan is known for its influence in global technology, culture, and economy.</p>
+          </body>
+        </html>
+      """
+      val data = htmlContent.getBytes(StandardCharsets.UTF_8)
+      val result = estimator.estimateLang(data, 0, StandardCharsets.UTF_8)
+      
+      // The detection result should be English
+      assert(result.isInstanceOf[ProbableLanguage])
+      assert(result.asInstanceOf[ProbableLanguage].lang == "en") // expected result is English
     }
   }
-}
+}
\ No newline at end of file

From 572d0a6be89fd0547119c2f4c4fb2903858890bc Mon Sep 17 00:00:00 2001
From: ZeekYin <44542418+ZeekYin@users.noreply.github.com>
Date: Fri, 18 Oct 2024 16:06:24 +0900
Subject: [PATCH 6/7] start judge from 50%~

---
 .../nlp/uzushio/lib/lang/LangEstimation.scala | 32 ++++++++++++++-----
 1 file changed, 24 insertions(+), 8 deletions(-)

diff --git a/lib/src/main/scala/com/worksap/nlp/uzushio/lib/lang/LangEstimation.scala b/lib/src/main/scala/com/worksap/nlp/uzushio/lib/lang/LangEstimation.scala
index 8053723..3b8355b 100644
--- a/lib/src/main/scala/com/worksap/nlp/uzushio/lib/lang/LangEstimation.scala
+++ b/lib/src/main/scala/com/worksap/nlp/uzushio/lib/lang/LangEstimation.scala
@@ -4,7 +4,7 @@ import com.optimaize.langdetect.LanguageDetectorBuilder
 import com.optimaize.langdetect.ngram.NgramExtractor
 import java.nio.charset.{Charset, CodingErrorAction}
 import java.nio.{ByteBuffer, CharBuffer}
-import scala.util.matching.Regex
+import java.util.regex.{Matcher, Pattern}
 
 sealed trait EstimationResult {
   def str: String = "unk"
@@ -28,15 +28,31 @@ class LangEstimation(private val minBytes: Int = 256) {
     */
   private def cleanHtmlContent(input: String): String = {
     // Use regex to remove HTML tags and content inside <script> tags
-    val htmlTagPattern: Regex = """<[^>]+>""".r
-    val scriptPattern: Regex = """(?s)<script.*?>.*?</script>""".r
+    val scriptPattern = Pattern.compile("(?s)<script.*?>.*?</script>")
+    val stylePattern = Pattern.compile("(?s)<style.*?>.*?</style>")
+    val commentPattern = Pattern.compile("(?s)<!--.*?-->")
+    val htmlTagPattern = Pattern.compile("<[^>]+>")
+
+    
+    // 1. 从前 50% 开始处理
+    val startPos = input.length / 2
+    var cleanedContent = input.substring(startPos)
 
     // First, remove the <script> block
-    val noScriptContent = scriptPattern.replaceAllIn(input, "")
-    // Then, remove all HTML tags
-    val cleaned = htmlTagPattern.replaceAllIn(noScriptContent, "")
-    println(s"Cleaned content: $cleaned") // Print the cleaned content
-    cleaned
+    cleanedContent = removePattern(cleanedContent, scriptPattern)
+    cleanedContent = removePattern(cleanedContent, stylePattern)
+    cleanedContent = removePattern(cleanedContent, commentPattern)
+
+    // 3. 去除 HTML 标签
+    cleanedContent = removePattern(cleanedContent, htmlTagPattern)
+
+    println(s"Cleaned content: ${cleanedContent.take(100)}...") // 打印部分清理后的内容
+    cleanedContent
+  }
+  /** Helper function to remove pattern from string using JVM's regex */
+  private def removePattern(input: String, pattern: Pattern): String = {
+    val matcher: Matcher = pattern.matcher(input)
+    matcher.replaceAll("")
   }
 
   /** Copy meaningful content into detection buffer, removing HTML, JavaScript, and retaining text.

From c1f92140269bd7201932486a9bdcda0360f8e86c Mon Sep 17 00:00:00 2001
From: ZeekYin <44542418+ZeekYin@users.noreply.github.com>
Date: Fri, 18 Oct 2024 16:16:47 +0900
Subject: [PATCH 7/7] Changed estimation method

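1. Use java.util.regex.Pattern instead of scala.util.matching.Regex
2. Estimate the language from the second half (50% onward) of the input
3. Discard <style> blocks and HTML comments in addition to <script> blocks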
---
 .../worksap/nlp/uzushio/lib/lang/LangEstimation.scala    | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/lib/src/main/scala/com/worksap/nlp/uzushio/lib/lang/LangEstimation.scala b/lib/src/main/scala/com/worksap/nlp/uzushio/lib/lang/LangEstimation.scala
index 3b8355b..e555cb4 100644
--- a/lib/src/main/scala/com/worksap/nlp/uzushio/lib/lang/LangEstimation.scala
+++ b/lib/src/main/scala/com/worksap/nlp/uzushio/lib/lang/LangEstimation.scala
@@ -34,21 +34,22 @@ class LangEstimation(private val minBytes: Int = 256) {
     val htmlTagPattern = Pattern.compile("<[^>]+>")
 
     
-    // 1. 从前 50% 开始处理
+    // Only process the second half of the input (from the 50% mark onward)
     val startPos = input.length / 2
     var cleanedContent = input.substring(startPos)
 
-    // First, remove the <script> block
+    // First, remove <script> blocks, <style> blocks, and HTML comments
     cleanedContent = removePattern(cleanedContent, scriptPattern)
     cleanedContent = removePattern(cleanedContent, stylePattern)
     cleanedContent = removePattern(cleanedContent, commentPattern)
 
-    // 3. 去除 HTML 标签
+    // remove html tags
     cleanedContent = removePattern(cleanedContent, htmlTagPattern)
 
-    println(s"Cleaned content: ${cleanedContent.take(100)}...") // 打印部分清理后的内容
+    println(s"Cleaned content: ${cleanedContent.take(100)}...") // debug print
     cleanedContent
   }
+  
   /** Helper function to remove pattern from string using JVM's regex */
   private def removePattern(input: String, pattern: Pattern): String = {
     val matcher: Matcher = pattern.matcher(input)