diff --git a/bench/bench.iml b/bench/bench.iml
new file mode 100644
index 0000000..f76df2b
--- /dev/null
+++ b/bench/bench.iml
@@ -0,0 +1,11 @@
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/lib/lib.iml b/lib/lib.iml
new file mode 100644
index 0000000..3624719
--- /dev/null
+++ b/lib/lib.iml
@@ -0,0 +1,11 @@
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/lib/src/main/scala/com/worksap/nlp/uzushio/lib/lang/LangEstimation.scala b/lib/src/main/scala/com/worksap/nlp/uzushio/lib/lang/LangEstimation.scala
index f72dc7b..e555cb4 100644
--- a/lib/src/main/scala/com/worksap/nlp/uzushio/lib/lang/LangEstimation.scala
+++ b/lib/src/main/scala/com/worksap/nlp/uzushio/lib/lang/LangEstimation.scala
@@ -2,9 +2,9 @@ package com.worksap.nlp.uzushio.lib.lang
import com.optimaize.langdetect.LanguageDetectorBuilder
import com.optimaize.langdetect.ngram.NgramExtractor
-
import java.nio.charset.{Charset, CodingErrorAction}
import java.nio.{ByteBuffer, CharBuffer}
+import java.util.regex.{Matcher, Pattern}
sealed trait EstimationResult {
def str: String = "unk"
@@ -20,26 +20,71 @@ class LangEstimation(private val minBytes: Int = 256) {
private val decodeBuffer = CharBuffer.allocate(4 * 1024)
private def langDetector = LangEstimation.cachedDetector
- /** Copy non-ASCII characters into detection buffer
+ /** Remove HTML tags and inline JavaScript from the input.
+ * @param input
+ * input buffer as a string
+ * @return
+ * cleaned string with HTML and JavaScript removed
+ */
+ private def cleanHtmlContent(input: String): String = {
+ // Use regex to remove HTML tags and content inside ")
+ val stylePattern = Pattern.compile("(?s).*?")
+ val commentPattern = Pattern.compile("(?s)")
+ val htmlTagPattern = Pattern.compile("<[^>]+>")
+
+
+ // Start at the midpoint: keep only the second half of the input (from 50% of its length onward)
+ val startPos = input.length / 2
+ var cleanedContent = input.substring(startPos)
+
+ // First, remove the