From 1b9baba6120aea47135d6e2478478568f81edb22 Mon Sep 17 00:00:00 2001 From: Antonio Cheong Date: Fri, 29 Nov 2024 00:33:03 +0000 Subject: [PATCH] fix: invalid utf-8 parsing --- src/lib.rs | 9 +++++---- ...re-0.2.2-1.rockspec => tiktoken_core-0.2.3-1.rockspec | 4 ++-- 2 files changed, 7 insertions(+), 6 deletions(-) rename tiktoken_core-0.2.2-1.rockspec => tiktoken_core-0.2.3-1.rockspec (92%) diff --git a/src/lib.rs b/src/lib.rs index 0f372db..e8ab04c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,3 +1,4 @@ +use base64; use fancy_regex::Regex; use mlua::prelude::*; use rustc_hash::FxHashMap as HashMap; @@ -6,7 +7,6 @@ use std::fs::File; use std::io::{BufRead, BufReader}; use std::sync::{Arc, Mutex}; use std::thread; -use base64; #[cfg(feature = "multithreading")] const MAX_NUM_THREADS: usize = 128; @@ -203,7 +203,7 @@ pub fn tiktoken_core(lua: &mlua::Lua) -> LuaResult { Ok(()) }, )?; - let _encode = lua.create_function(move |_, text: String| encode(&*state2, text))?; + let _encode = lua.create_function(move |_, text: mlua::String| encode(&*state2, text))?; let exports = lua.create_table()?; exports.set("new", _new)?; @@ -261,7 +261,8 @@ fn new( }); } -fn encode(state: &State, text: String) -> LuaResult<(Vec, usize, usize)> { +fn encode(state: &State, text: mlua::String) -> LuaResult<(Vec, usize, usize)> { + let encoded_str = String::from_utf8_lossy(text.as_bytes()); let allowed_special = HashSet::new(); let max_tokens = None; Ok(state @@ -270,7 +271,7 @@ fn encode(state: &State, text: String) -> LuaResult<(Vec, usize, usize)> .unwrap() .as_ref() .unwrap() - ._encode_native(&text, &allowed_special, max_tokens)) + ._encode_native(&encoded_str, &allowed_special, max_tokens)) } pub struct CoreBPENative { diff --git a/tiktoken_core-0.2.2-1.rockspec b/tiktoken_core-0.2.3-1.rockspec similarity index 92% rename from tiktoken_core-0.2.2-1.rockspec rename to tiktoken_core-0.2.3-1.rockspec index eaa52db..131ce1a 100644 --- a/tiktoken_core-0.2.2-1.rockspec +++ b/tiktoken_core-0.2.3-1.rockspec @@ -1,9 +1,9 @@ package = "tiktoken_core" -version = "0.2.2-1" +version = "0.2.3-1" source = { url = "git+https://github.com/gptlang/lua-tiktoken", - tag = "v0.2.2", + tag = "v0.2.3", } description = {