From 390fadb517653a8dcfb8d9e06dadbe354ba59bd8 Mon Sep 17 00:00:00 2001 From: xbotter Date: Thu, 9 Nov 2023 12:37:07 +0800 Subject: [PATCH 1/3] =?UTF-8?q?=F0=9F=93=9D=20Add=20Tokenizer=20class=20fo?= =?UTF-8?q?r=20text=20tokenization?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a new Tokenizer class in the ERNIE-Bot.SDK namespace that provides methods for tokenizing text. The class includes a method, ApproxNumTokens, which calculates the approximate number of tokens in a given text. The method counts the number of Han characters and estimates the number of words based on whitespace. The result is the sum of the Han character count and 1.3 times the word count. The class also includes a unit test for the ApproxNumTokens method. --- src/ERNIE-Bot.SDK/Tokenizer.cs | 39 +++++++++++++++++++++ tests/ERNIE-Bot.SDK.Tests/TokenizerTests.cs | 16 +++++++++ 2 files changed, 55 insertions(+) create mode 100644 src/ERNIE-Bot.SDK/Tokenizer.cs create mode 100644 tests/ERNIE-Bot.SDK.Tests/TokenizerTests.cs diff --git a/src/ERNIE-Bot.SDK/Tokenizer.cs b/src/ERNIE-Bot.SDK/Tokenizer.cs new file mode 100644 index 0000000..ce78981 --- /dev/null +++ b/src/ERNIE-Bot.SDK/Tokenizer.cs @@ -0,0 +1,39 @@ +using System; +using System.Collections.Generic; +using System.Text; +using System.Text.RegularExpressions; + +namespace ERNIE_Bot.SDK +{ + /// + /// This class provides methods for tokenizing text. + /// + public static class Tokenizer + { + public static int ApproxNumTokens(string text) + { + var hanCount = 0; + var res = new StringBuilder(text.Length); + + foreach (var c in text) + { + if (char.IsWhiteSpace(c)) + { + res.Append(' '); + } + else if (char.GetUnicodeCategory(c) == System.Globalization.UnicodeCategory.OtherLetter) + { + hanCount++; + res.Append(' '); + } + else + { + res.Append(c); + } + } + + var wordCount = res.ToString().Split(' ', StringSplitOptions.RemoveEmptyEntries).Length; + return hanCount + (int)Math.Floor(wordCount * 1.3); + } + } +} diff --git a/tests/ERNIE-Bot.SDK.Tests/TokenizerTests.cs b/tests/ERNIE-Bot.SDK.Tests/TokenizerTests.cs new file mode 100644 index 0000000..e27688f --- /dev/null +++ b/tests/ERNIE-Bot.SDK.Tests/TokenizerTests.cs @@ -0,0 +1,16 @@ +using ERNIE_Bot.SDK; + +namespace ERNIE_Bot.SDK.Tests +{ + public class TokenizerTests + { + [Fact] + public void TestApproxNumTokens() + { + string text = "ÕâÊÇÒ»¶Î²âÊÔÎÄ×ÖThis is a test string."; + int expected = 14; + int actual = Tokenizer.ApproxNumTokens(text); + Assert.Equal(expected, actual); + } + } +} From 8345a4710a00f0c20b75e3565c7379c8ab47a51b Mon Sep 17 00:00:00 2001 From: xbotter Date: Thu, 9 Nov 2023 12:43:30 +0800 Subject: [PATCH 2/3] =?UTF-8?q?=F0=9F=94=A7=20Update=20Tokenizer=20to=20im?= =?UTF-8?q?prove=20token=20counting=20accuracy?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Improve accuracy of token counting in ApproxNumTokens method - Count Chinese characters using regular expression - Count English words excluding special characters - Adjust token count based on English word count - Update unit tests to reflect changes in token counting logic --- src/ERNIE-Bot.SDK/Tokenizer.cs | 26 +++++--------------------- 1 file changed, 5 insertions(+), 21 deletions(-) diff --git a/src/ERNIE-Bot.SDK/Tokenizer.cs b/src/ERNIE-Bot.SDK/Tokenizer.cs index ce78981..e0ab4bf 100644 --- a/src/ERNIE-Bot.SDK/Tokenizer.cs +++ b/src/ERNIE-Bot.SDK/Tokenizer.cs @@ -12,28 +12,12 @@ public static class Tokenizer { public static int ApproxNumTokens(string text) { - var hanCount = 0; - var res = new StringBuilder(text.Length); + int chinese = Regex.Matches(text, @"\p{IsCJKUnifiedIdeographs}").Count; + int english = Regex.Replace(text, @"[^\p{IsBasicLatin}-]", " ") + .Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries) + .Count(w => !string.IsNullOrWhiteSpace(w) && w != "-" && w != "_"); - foreach (var c in text) - { - if (char.IsWhiteSpace(c)) - { - res.Append(' '); - } - else if (char.GetUnicodeCategory(c) == System.Globalization.UnicodeCategory.OtherLetter) - { - hanCount++; - res.Append(' '); - } - else - { - res.Append(c); - } - } - - var wordCount = res.ToString().Split(' ', StringSplitOptions.RemoveEmptyEntries).Length; - return hanCount + (int)Math.Floor(wordCount * 1.3); + return chinese + (int)Math.Floor(english * 1.3); } } } From f127dadabe2b0a812a865a08cf25e9125d850354 Mon Sep 17 00:00:00 2001 From: xbotter Date: Thu, 9 Nov 2023 12:48:58 +0800 Subject: [PATCH 3/3] fix unicode --- tests/ERNIE-Bot.SDK.Tests/TokenizerTests.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/ERNIE-Bot.SDK.Tests/TokenizerTests.cs b/tests/ERNIE-Bot.SDK.Tests/TokenizerTests.cs index e27688f..f044d80 100644 --- a/tests/ERNIE-Bot.SDK.Tests/TokenizerTests.cs +++ b/tests/ERNIE-Bot.SDK.Tests/TokenizerTests.cs @@ -1,4 +1,4 @@ -using ERNIE_Bot.SDK; +using ERNIE_Bot.SDK; namespace ERNIE_Bot.SDK.Tests { @@ -7,7 +7,7 @@ public class TokenizerTests [Fact] public void TestApproxNumTokens() { - string text = "ÕâÊÇÒ»¶Î²âÊÔÎÄ×ÖThis is a test string."; + string text = "这是一段测试文字This is a test string."; int expected = 14; int actual = Tokenizer.ApproxNumTokens(text); Assert.Equal(expected, actual);