From 0e7f5c032207a4d2b64e498a179ba57f54d8edcb Mon Sep 17 00:00:00 2001
From: xbotter
Date: Thu, 9 Nov 2023 12:50:37 +0800
Subject: [PATCH] Add Tokenizer Helper class to ERNIE-Bot.SDK (#61)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* 📝 Add Tokenizer class for text tokenization

Add a new Tokenizer class in the ERNIE-Bot.SDK namespace that provides
methods for tokenizing text. The class includes a method,
ApproxNumTokens, which calculates the approximate number of tokens in a
given text. The method counts the number of Han characters and estimates
the number of words based on whitespace. The result is the sum of the
Han character count and 1.3 times the word count. The class also
includes a unit test for the ApproxNumTokens method.

* 🔧 Update Tokenizer to improve token counting accuracy

- Improve accuracy of token counting in ApproxNumTokens method
- Count Chinese characters using regular expression
- Count English words excluding special characters
- Adjust token count based on English word count
- Update unit tests to reflect changes in token counting logic

* fix unicode
---
Review notes (not part of the commit message):
- ApproxNumTokens now returns 0 for null or empty input, splits words on
  any white-space character instead of only ' ', and reuses two cached
  Regex instances instead of rebuilding them on every call.
- Added `using System.Linq;` for the Count(predicate) call.
- Restored the <summary> tags of the class XML doc comment.
- Added edge-case tests; the original test is unchanged.
- The blob ids on the `index` lines still refer to the original revision.

 src/ERNIE-Bot.SDK/Tokenizer.cs              | 56 +++++++++++++++++++++
 tests/ERNIE-Bot.SDK.Tests/TokenizerTests.cs | 27 ++++++++++
 2 files changed, 83 insertions(+)
 create mode 100644 src/ERNIE-Bot.SDK/Tokenizer.cs
 create mode 100644 tests/ERNIE-Bot.SDK.Tests/TokenizerTests.cs

diff --git a/src/ERNIE-Bot.SDK/Tokenizer.cs b/src/ERNIE-Bot.SDK/Tokenizer.cs
new file mode 100644
index 0000000..e0ab4bf
--- /dev/null
+++ b/src/ERNIE-Bot.SDK/Tokenizer.cs
@@ -0,0 +1,56 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Text.RegularExpressions;
+
+namespace ERNIE_Bot.SDK
+{
+    /// <summary>
+    /// This class provides methods for tokenizing text.
+    /// </summary>
+    public static class Tokenizer
+    {
+        // Heuristic weight: estimated tokens per Basic Latin word.
+        private const double TokensPerWord = 1.3;
+
+        // Matches a single character of the CJK Unified Ideographs block.
+        private static readonly Regex HanCharacterRegex =
+            new Regex(@"\p{IsCJKUnifiedIdeographs}", RegexOptions.Compiled);
+
+        // Matches any character outside the Basic Latin block (U+0000-U+007F).
+        private static readonly Regex NonBasicLatinRegex =
+            new Regex(@"\P{IsBasicLatin}", RegexOptions.Compiled);
+
+        /// <summary>
+        /// Estimates the number of tokens in <paramref name="text"/>.
+        /// </summary>
+        /// <remarks>
+        /// Every CJK unified ideograph counts as one token. The remaining Basic
+        /// Latin text is split on whitespace into words (a lone "-" or "_" is
+        /// ignored) and contributes 1.3 tokens per word, rounded down.
+        /// </remarks>
+        /// <param name="text">The text to measure.</param>
+        /// <returns>
+        /// The approximate token count; 0 when <paramref name="text"/> is null or empty.
+        /// </returns>
+        public static int ApproxNumTokens(string text)
+        {
+            if (string.IsNullOrEmpty(text))
+            {
+                return 0;
+            }
+
+            int chinese = HanCharacterRegex.Matches(text).Count;
+
+            // Blank out everything that is not Basic Latin, then split on any
+            // white-space character (an empty separator array selects them all),
+            // so tabs and line breaks delimit words just like spaces do.
+            int english = NonBasicLatinRegex.Replace(text, " ")
+                .Split(Array.Empty<char>(), StringSplitOptions.RemoveEmptyEntries)
+                .Count(word => word != "-" && word != "_");
+
+            return chinese + (int)Math.Floor(english * TokensPerWord);
+        }
+    }
+}
diff --git a/tests/ERNIE-Bot.SDK.Tests/TokenizerTests.cs b/tests/ERNIE-Bot.SDK.Tests/TokenizerTests.cs
new file mode 100644
index 0000000..f044d80
--- /dev/null
+++ b/tests/ERNIE-Bot.SDK.Tests/TokenizerTests.cs
@@ -0,0 +1,27 @@
+using ERNIE_Bot.SDK;
+
+namespace ERNIE_Bot.SDK.Tests
+{
+    public class TokenizerTests
+    {
+        [Fact]
+        public void TestApproxNumTokens()
+        {
+            string text = "这是一段测试文字This is a test string.";
+            int expected = 14;
+            int actual = Tokenizer.ApproxNumTokens(text);
+            Assert.Equal(expected, actual);
+        }
+
+        [Theory]
+        [InlineData(null, 0)]
+        [InlineData("", 0)]
+        [InlineData("- _", 0)]
+        [InlineData("Hello\tbig\nworld", 3)]
+        [InlineData("测试", 2)]
+        public void ApproxNumTokens_EdgeCases_ReturnsExpectedCount(string text, int expected)
+        {
+            Assert.Equal(expected, Tokenizer.ApproxNumTokens(text));
+        }
+    }
+}