diff --git a/bench/bench.iml b/bench/bench.iml new file mode 100644 index 0000000..f76df2b --- /dev/null +++ b/bench/bench.iml @@ -0,0 +1,11 @@ + + + + + + + + + + + \ No newline at end of file diff --git a/lib/lib.iml b/lib/lib.iml new file mode 100644 index 0000000..3624719 --- /dev/null +++ b/lib/lib.iml @@ -0,0 +1,11 @@ + + + + + + + + + + + \ No newline at end of file diff --git a/lib/src/main/scala/com/worksap/nlp/uzushio/lib/lang/LangEstimation.scala b/lib/src/main/scala/com/worksap/nlp/uzushio/lib/lang/LangEstimation.scala index f72dc7b..e555cb4 100644 --- a/lib/src/main/scala/com/worksap/nlp/uzushio/lib/lang/LangEstimation.scala +++ b/lib/src/main/scala/com/worksap/nlp/uzushio/lib/lang/LangEstimation.scala @@ -2,9 +2,9 @@ package com.worksap.nlp.uzushio.lib.lang import com.optimaize.langdetect.LanguageDetectorBuilder import com.optimaize.langdetect.ngram.NgramExtractor - import java.nio.charset.{Charset, CodingErrorAction} import java.nio.{ByteBuffer, CharBuffer} +import java.util.regex.{Matcher, Pattern} sealed trait EstimationResult { def str: String = "unk" @@ -20,26 +20,71 @@ class LangEstimation(private val minBytes: Int = 256) { private val decodeBuffer = CharBuffer.allocate(4 * 1024) private def langDetector = LangEstimation.cachedDetector - /** Copy non-ASCII characters into detection buffer + /** Remove HTML tags and inline JavaScript from the input. + * @param input + * input buffer as a string + * @return + * cleaned string with HTML and JavaScript removed + */ + private def cleanHtmlContent(input: String): String = { + // Use regex to remove HTML tags and content inside ") + val stylePattern = Pattern.compile("(?s).*?") + val commentPattern = Pattern.compile("(?s)") + val htmlTagPattern = Pattern.compile("<[^>]+>") + + + // Process from 50% ~ + val startPos = input.length / 2 + var cleanedContent = input.substring(startPos) + + // First, remove the