From 15c9a69f524fcbcee51acf19e4c412ed14b6296d Mon Sep 17 00:00:00 2001
From: ZeekYin <44542418+ZeekYin@users.noreply.github.com>
Date: Sun, 6 Oct 2024 20:44:27 +0900
Subject: [PATCH 1/7] params changed for English
---
.../com/worksap/nlp/uzushio/lib/lang/LangEstimation.scala | 3 ++-
.../nlp/uzushio/lib/runners/DeduplicateParagraphs.scala | 8 ++++----
2 files changed, 6 insertions(+), 5 deletions(-)
diff --git a/lib/src/main/scala/com/worksap/nlp/uzushio/lib/lang/LangEstimation.scala b/lib/src/main/scala/com/worksap/nlp/uzushio/lib/lang/LangEstimation.scala
index f72dc7b..f4d9637 100644
--- a/lib/src/main/scala/com/worksap/nlp/uzushio/lib/lang/LangEstimation.scala
+++ b/lib/src/main/scala/com/worksap/nlp/uzushio/lib/lang/LangEstimation.scala
@@ -28,6 +28,7 @@ class LangEstimation(private val minBytes: Int = 256) {
*/
private def copyNonAscii(input: CharBuffer, output: CharBuffer): Unit = {
var prevWhitespace = false
+ //
while (input.hasRemaining && output.remaining() > 1) {
val char = input.get()
if ((char & 0xffff) >= 128) {
@@ -53,7 +54,7 @@ class LangEstimation(private val minBytes: Int = 256) {
val decoder = charset.newDecoder().onUnmappableCharacter(CodingErrorAction.REPORT)
decBuf.clear()
buf.clear()
-
+ //
while (inputData.remaining() > 0 && buf.remaining() > 0) {
val result = decoder.decode(inputData, decBuf, true)
if (result.isUnmappable || result.isError || result.isMalformed) {
diff --git a/lib/src/main/scala/com/worksap/nlp/uzushio/lib/runners/DeduplicateParagraphs.scala b/lib/src/main/scala/com/worksap/nlp/uzushio/lib/runners/DeduplicateParagraphs.scala
index 02e17f6..520ef23 100644
--- a/lib/src/main/scala/com/worksap/nlp/uzushio/lib/runners/DeduplicateParagraphs.scala
+++ b/lib/src/main/scala/com/worksap/nlp/uzushio/lib/runners/DeduplicateParagraphs.scala
@@ -149,7 +149,7 @@ final case class DuplicateCandidateRow(
}
object DuplicateCandidateRow {
- final val NGRAM_SIG_LEN = 128
+ final val NGRAM_SIG_LEN = 256 // for English, use a value larger than 128
final val BITS_IN_LONG = 64
final val BIT_MASK = BITS_IN_LONG - 1
final val BYTE_MASK = (NGRAM_SIG_LEN * BITS_IN_LONG - 1) ^ BIT_MASK
@@ -159,7 +159,7 @@ object DuplicateCandidateRow {
/** size of JVM object/array header */
final val HEADER_SIZE = 16
final val MAX_MATCHING_LENGTH = 50
- private val ngrams = new NgramHashExtractor(3, 4)
+ private val ngrams = new NgramHashExtractor(2, 5) // for English: n-grams of length 2-5 capture most cases
}
class CandidateRowProcessor(
@@ -964,8 +964,8 @@ object DeduplicateParagraphs {
cache = cache.toOption,
partitions = partitions(),
simHashSize = 128,
- minNgramSize = 2,
- maxNgramSize = 4,
+ minNgramSize = 3, // for English use 3; Japanese used 2
+ maxNgramSize = 8, // for English use 8-9; Japanese used 4
numShifts = numShifts(),
propagatePartitions = propagatePartitions(),
execution = execution(),
From ddc2454950c691b97e76d90490b6ee10d1470a07 Mon Sep 17 00:00:00 2001
From: ZeekYin <44542418+ZeekYin@users.noreply.github.com>
Date: Mon, 7 Oct 2024 13:58:56 +0900
Subject: [PATCH 2/7] add support for ascii char
---
.../nlp/uzushio/lib/lang/LangEstimation.scala | 62 ++++++++++++++++++-
1 file changed, 61 insertions(+), 1 deletion(-)
diff --git a/lib/src/main/scala/com/worksap/nlp/uzushio/lib/lang/LangEstimation.scala b/lib/src/main/scala/com/worksap/nlp/uzushio/lib/lang/LangEstimation.scala
index f4d9637..5e2bcd5 100644
--- a/lib/src/main/scala/com/worksap/nlp/uzushio/lib/lang/LangEstimation.scala
+++ b/lib/src/main/scala/com/worksap/nlp/uzushio/lib/lang/LangEstimation.scala
@@ -26,10 +26,63 @@ class LangEstimation(private val minBytes: Int = 256) {
* @param output
* output buffer
*/
+
+ private def cleanHtmlAndScripts(input: CharBuffer): CharBuffer = {
+ val output = CharBuffer.allocate(input.capacity())
+ var inTag = false
+ var inScript = false
+ var inStyle = false
+ var prevChar: Char = 0
+
+ while (input.hasRemaining) {
+ val char = input.get()
+
+ // Check for start of HTML tag
+ if (char == '<') {
+ prevChar = char
+ inTag = true
+ // Check for script or style tags
+ if (input.remaining() > 6) {
+ val nextTag = input.subSequence(input.position(), input.position() + 6).toString().toLowerCase()
+ if (nextTag.startsWith("script")) {
+ inScript = true
+ } else if (nextTag.startsWith("style")) {
+ inStyle = true
+ }
+ }
+ }
+
+ // Skip content inside """.r
+
+ // First, remove the """.r
+ val scriptPattern = Pattern.compile("(?s).*?")
+ val stylePattern = Pattern.compile("(?s).*?")
+ val commentPattern = Pattern.compile("(?s)")
+ val htmlTagPattern = Pattern.compile("<[^>]+>")
+
+
+ // 1. 从前 50% 开始处理
+ val startPos = input.length / 2
+ var cleanedContent = input.substring(startPos)
// First, remove the