From 15c9a69f524fcbcee51acf19e4c412ed14b6296d Mon Sep 17 00:00:00 2001 From: ZeekYin <44542418+ZeekYin@users.noreply.github.com> Date: Sun, 6 Oct 2024 20:44:27 +0900 Subject: [PATCH 1/7] params changed for English --- .../com/worksap/nlp/uzushio/lib/lang/LangEstimation.scala | 3 ++- .../nlp/uzushio/lib/runners/DeduplicateParagraphs.scala | 8 ++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/lib/src/main/scala/com/worksap/nlp/uzushio/lib/lang/LangEstimation.scala b/lib/src/main/scala/com/worksap/nlp/uzushio/lib/lang/LangEstimation.scala index f72dc7b..f4d9637 100644 --- a/lib/src/main/scala/com/worksap/nlp/uzushio/lib/lang/LangEstimation.scala +++ b/lib/src/main/scala/com/worksap/nlp/uzushio/lib/lang/LangEstimation.scala @@ -28,6 +28,7 @@ class LangEstimation(private val minBytes: Int = 256) { */ private def copyNonAscii(input: CharBuffer, output: CharBuffer): Unit = { var prevWhitespace = false + // while (input.hasRemaining && output.remaining() > 1) { val char = input.get() if ((char & 0xffff) >= 128) { @@ -53,7 +54,7 @@ class LangEstimation(private val minBytes: Int = 256) { val decoder = charset.newDecoder().onUnmappableCharacter(CodingErrorAction.REPORT) decBuf.clear() buf.clear() - + // while (inputData.remaining() > 0 && buf.remaining() > 0) { val result = decoder.decode(inputData, decBuf, true) if (result.isUnmappable || result.isError || result.isMalformed) { diff --git a/lib/src/main/scala/com/worksap/nlp/uzushio/lib/runners/DeduplicateParagraphs.scala b/lib/src/main/scala/com/worksap/nlp/uzushio/lib/runners/DeduplicateParagraphs.scala index 02e17f6..520ef23 100644 --- a/lib/src/main/scala/com/worksap/nlp/uzushio/lib/runners/DeduplicateParagraphs.scala +++ b/lib/src/main/scala/com/worksap/nlp/uzushio/lib/runners/DeduplicateParagraphs.scala @@ -149,7 +149,7 @@ final case class DuplicateCandidateRow( } object DuplicateCandidateRow { - final val NGRAM_SIG_LEN = 128 + final val NGRAM_SIG_LEN = 256 // for en, increase to > 128. final val BITS_IN_LONG = 64 final val BIT_MASK = BITS_IN_LONG - 1 final val BYTE_MASK = (NGRAM_SIG_LEN * BITS_IN_LONG - 1) ^ BIT_MASK @@ -159,7 +159,7 @@ object DuplicateCandidateRow { /** size of JVM object/array header */ final val HEADER_SIZE = 16 final val MAX_MATCHING_LENGTH = 50 - private val ngrams = new NgramHashExtractor(3, 4) + private val ngrams = new NgramHashExtractor(2, 5)// en n-grams, 2-5 can capture most of the cases } class CandidateRowProcessor( @@ -964,8 +964,8 @@ object DeduplicateParagraphs { cache = cache.toOption, partitions = partitions(), simHashSize = 128, - minNgramSize = 2, - maxNgramSize = 4, + minNgramSize = 3, // for en, change to 3, ja is 2 + maxNgramSize = 8, // for en, change to 8-9, ja is 4 numShifts = numShifts(), propagatePartitions = propagatePartitions(), execution = execution(), From ddc2454950c691b97e76d90490b6ee10d1470a07 Mon Sep 17 00:00:00 2001 From: ZeekYin <44542418+ZeekYin@users.noreply.github.com> Date: Mon, 7 Oct 2024 13:58:56 +0900 Subject: [PATCH 2/7] add support for ascii char --- .../nlp/uzushio/lib/lang/LangEstimation.scala | 62 ++++++++++++++++++- 1 file changed, 61 insertions(+), 1 deletion(-) diff --git a/lib/src/main/scala/com/worksap/nlp/uzushio/lib/lang/LangEstimation.scala b/lib/src/main/scala/com/worksap/nlp/uzushio/lib/lang/LangEstimation.scala index f4d9637..5e2bcd5 100644 --- a/lib/src/main/scala/com/worksap/nlp/uzushio/lib/lang/LangEstimation.scala +++ b/lib/src/main/scala/com/worksap/nlp/uzushio/lib/lang/LangEstimation.scala @@ -26,10 +26,63 @@ class LangEstimation(private val minBytes: Int = 256) { * @param output * output buffer */ + + private def cleanHtmlAndScripts(input: CharBuffer): CharBuffer = { + val output = CharBuffer.allocate(input.capacity()) + var inTag = false + var inScript = false + var inStyle = false + var prevChar: Char = 0 + + while (input.hasRemaining) { + val char = input.get() + + // Check for start of HTML tag + if (char == '<') { + prevChar = char + inTag = true + // Check for script or style tags + if (input.remaining() > 6) { + val nextTag = input.subSequence(input.position(), input.position() + 6).toString().toLowerCase() + if (nextTag.startsWith("script")) { + inScript = true + } else if (nextTag.startsWith("style")) { + inStyle = true + } + } + } + + // Skip content inside """.r + + // First, remove the """.r + val scriptPattern = Pattern.compile("(?s).*?") + val stylePattern = Pattern.compile("(?s).*?") + val commentPattern = Pattern.compile("(?s)") + val htmlTagPattern = Pattern.compile("<[^>]+>") + + + // 1. 从前 50% 开始处理 + val startPos = input.length / 2 + var cleanedContent = input.substring(startPos) // First, remove the