From 541270adb32514bb9f65bb57a90282633e598496 Mon Sep 17 00:00:00 2001
From: Rick Hennigan
Date: Fri, 1 Dec 2023 12:41:26 -0500
Subject: [PATCH] Avoid serializing giant tokenizers into tagging rules of chat outputs

---
 Scripts/Common.wl               |   2 +-
 Source/Chatbook/ChatMessages.wl | 113 ++++++++++++++++++++++++++------
 Source/Chatbook/Main.wl         |   1 +
 Source/Chatbook/SendChat.wl     |  32 ++++++---
 4 files changed, 119 insertions(+), 29 deletions(-)

diff --git a/Scripts/Common.wl b/Scripts/Common.wl
index b2ad9cc8..60216f26 100644
--- a/Scripts/Common.wl
+++ b/Scripts/Common.wl
@@ -141,7 +141,7 @@ releaseID[ dir_ ] := FirstCase[
 (* ::Subsection::Closed:: *)
 (*releaseURL*)
 releaseURL[ file_ ] := Enclose[
-    Enclose @ Module[ { pac, repo, ver },
+    Module[ { pac, repo, ver },
         pac = PacletObject @ Flatten @ File @ file;
         repo = ConfirmBy[ Environment[ "GITHUB_REPOSITORY" ], StringQ ];
         ver = ConfirmBy[ pac[ "Version" ], StringQ ];
diff --git a/Source/Chatbook/ChatMessages.wl b/Source/Chatbook/ChatMessages.wl
index 0ec16383..9a4887a1 100644
--- a/Source/Chatbook/ChatMessages.wl
+++ b/Source/Chatbook/ChatMessages.wl
@@ -17,6 +17,7 @@ Wolfram`Chatbook`CellToChatMessage;
 `constructMessages;
 `expandMultimodalString;
 `getTokenizer;
+`getTokenizerName;
 `resizeMultimodalImage;
 
 Begin[ "`Private`" ];
@@ -70,6 +71,10 @@ $styleRoles = <|
     "ChatSystemInput" -> "System"
 |>;
 
+$cachedTokenizerNames = { "chat-bison", "claude", "gpt-2", "gpt-3.5", "gpt-4-vision", "gpt-4" };
+$cachedTokenizers = <| |>;
+$fallbackTokenizer = "gpt-2";
+
 (* ::**************************************************************************************************************:: *)
 (* ::Section::Closed:: *)
 (*CellToChatMessage*)
@@ -1079,58 +1084,120 @@ argumentTokenToString // endDefinition;
 (* ::**************************************************************************************************************:: *)
 (* ::Section::Closed:: *)
 (*Tokenization*)
-$tokenizer := gpt2Tokenizer;
+$tokenizer := $gpt2Tokenizer;
+
+(* ::**************************************************************************************************************:: *)
+(* ::Subsection::Closed:: *)
+(*getTokenizerName*)
+getTokenizerName // beginDefinition;
+
+getTokenizerName[ KeyValuePattern[ "TokenizerName"|"Tokenizer" -> name_String ] ] :=
+    tokenizerName @ name;
+
+getTokenizerName[ KeyValuePattern[ "Tokenizer" -> Except[ $$unspecified ] ] ] :=
+    "Custom";
+
+getTokenizerName[ KeyValuePattern[ "Model" -> model_ ] ] :=
+    With[ { name = tokenizerName @ toModelName @ model },
+        If[ MemberQ[ $cachedTokenizerNames, name ],
+            name,
+            $fallbackTokenizer
+        ]
+    ];
+
+getTokenizerName // endDefinition;
 
 (* ::**************************************************************************************************************:: *)
 (* ::Subsection::Closed:: *)
 (*getTokenizer*)
 getTokenizer // beginDefinition;
 getTokenizer[ KeyValuePattern[ "Tokenizer" -> tokenizer: Except[ $$unspecified ] ] ] := tokenizer;
-getTokenizer[ KeyValuePattern[ "Model" -> model_ ] ] := getTokenizer @ model;
-getTokenizer[ model_ ] := cachedTokenizer @ toModelName @ model;
+getTokenizer[ KeyValuePattern[ "TokenizerName" -> name_String ] ] := cachedTokenizer @ name;
+getTokenizer[ KeyValuePattern[ "Model" -> model_ ] ] := cachedTokenizer @ toModelName @ model;
 getTokenizer // endDefinition;
 
 (* ::**************************************************************************************************************:: *)
 (* ::Subsubsection::Closed:: *)
 (*cachedTokenizer*)
 cachedTokenizer // beginDefinition;
-cachedTokenizer[ All ] := AssociationMap[ cachedTokenizer, $cachedTokenizerNames ];
-cachedTokenizer[ name_String ] := cachedTokenizer0 @ tokenizerName @ toModelName @ name;
+
+cachedTokenizer[ All ] :=
+    AssociationMap[ cachedTokenizer, $cachedTokenizerNames ];
+
+cachedTokenizer[ id_String ] :=
+    With[ { tokenizer = $cachedTokenizers[ tokenizerName @ toModelName @ id ] },
+        tokenizer /; ! MatchQ[ tokenizer, $$unspecified ]
+    ];
+
+cachedTokenizer[ id_String ] := Enclose[
+    Module[ { name, tokenizer },
+        name = ConfirmBy[ tokenizerName @ toModelName @ id, StringQ, "Name" ];
+        tokenizer = findTokenizer @ name;
+        If[ MissingQ @ tokenizer,
+            (* Fallback to the GPT-2 tokenizer: *)
+            tokenizer = ConfirmMatch[ $gpt2Tokenizer, Except[ $$unspecified ], "GPT2Tokenizer" ];
+            If[ TrueQ @ Wolfram`ChatbookInternal`$BuildingMX,
+                tokenizer, (* Avoid caching fallback values into MX definitions *)
+                cacheTokenizer[ name, tokenizer ]
+            ],
+            cacheTokenizer[ name, ConfirmMatch[ tokenizer, Except[ $$unspecified ], "Tokenizer" ] ]
+        ]
+    ],
+    throwInternalFailure
+];
+
 cachedTokenizer // endDefinition;
 
+(* ::**************************************************************************************************************:: *)
+(* ::Subsubsubsection::Closed:: *)
+(*cacheTokenizer*)
+cacheTokenizer // beginDefinition;
-cachedTokenizer0 // beginDefinition;
+
+cacheTokenizer[ name_String, tokenizer: Except[ $$unspecified ] ] := (
+    $cachedTokenizerNames = Union[ $cachedTokenizerNames, { name } ];
+    $cachedTokenizers[ name ] = tokenizer
+);
-cachedTokenizer0[ "chat-bison" ] = ToCharacterCode[ #, "UTF8" ] &;
+
+cacheTokenizer // endDefinition;
 
-cachedTokenizer0[ "gpt-4-vision" ] :=
-    If[ graphicsQ[ # ],
-        gpt4ImageTokenizer[ # ],
-        cachedTokenizer[ "gpt-4" ][ # ]
-    ] &;
+(* ::**************************************************************************************************************:: *)
+(* ::Subsubsubsection::Closed:: *)
+(*findTokenizer*)
+findTokenizer // beginDefinition;
 
-cachedTokenizer0[ model_String ] := Enclose[
+findTokenizer[ model_String ] := Enclose[
     Quiet @ Module[ { name, tokenizer },
         initTools[ ];
         Quiet @ Needs[ "Wolfram`LLMFunctions`Utilities`Tokenization`" -> None ];
         name = ConfirmBy[ tokens`FindTokenizer @ model, StringQ, "Name" ];
         tokenizer = ConfirmMatch[ tokens`LLMTokenizer[ Method -> name ], Except[ _tokens`LLMTokenizer ], "Tokenizer" ];
         ConfirmMatch[ tokenizer[ "test" ], _List, "TokenizerTest" ];
-        cachedTokenizer0[ model ] = tokenizer
+        tokenizer
     ],
-    gpt2Tokenizer &
+    Missing[ "NotFound" ] &
 ];
 
-cachedTokenizer0 // endDefinition;
+findTokenizer // endDefinition;
+
+(* ::**************************************************************************************************************:: *)
+(* ::Subsubsubsection::Closed:: *)
+(*Pre-cached small tokenizer functions*)
+$cachedTokenizers[ "chat-bison" ] = ToCharacterCode[ #, "UTF8" ] &;
+$cachedTokenizers[ "gpt-4-vision" ] = If[ graphicsQ[ # ], gpt4ImageTokenizer[ # ], cachedTokenizer[ "gpt-4" ][ # ] ] &;
 
 (* ::**************************************************************************************************************:: *)
 (* ::Subsubsection::Closed:: *)
 (*tokenizerName*)
 tokenizerName // beginDefinition;
-tokenizerName[ name_String ] := SelectFirst[ $cachedTokenizerNames, StringContainsQ[ name, # ] &, name ];
-tokenizerName // endDefinition;
-$cachedTokenizerNames = { "gpt-4-vision", "gpt-4", "gpt-3.5", "gpt-2", "claude-2", "claude-instant-1", "chat-bison" };
+tokenizerName[ name_String ] :=
+    SelectFirst[
+        $cachedTokenizerNames,
+        StringContainsQ[ name, #, IgnoreCase -> True ] &,
+        name
+    ];
+
+tokenizerName // endDefinition;
 
 (* ::**************************************************************************************************************:: *)
 (* ::Subsection::Closed:: *)
@@ -1182,13 +1249,19 @@ gpt4ImageTokenCount0 // endDefinition;
 (* ::**************************************************************************************************************:: *)
 (* ::Subsection::Closed:: *)
 (*Fallback Tokenizer*)
-gpt2Tokenizer := gpt2Tokenizer = ResourceFunction[ "GPTTokenizer" ][ ];
+$gpt2Tokenizer := $gpt2Tokenizer = gpt2Tokenizer[ ];
+
+(* https://resources.wolframcloud.com/FunctionRepository/resources/GPTTokenizer *)
+importResourceFunction[ gpt2Tokenizer, "GPTTokenizer" ];
 
 (* ::**************************************************************************************************************:: *)
 (* ::Section::Closed:: *)
 (*Package Footer*)
 If[ Wolfram`ChatbookInternal`$BuildingMX,
     cachedTokenizer[ All ];
+    $gpt2Tokenizer;
+    (* This is only needed to generate $gpt2Tokenizer once, so it can be removed to reduce MX file size: *)
+    Remove[ "Wolfram`Chatbook`ResourceFunctions`GPTTokenizer`GPTTokenizer" ];
 ];
 
 (* :!CodeAnalysis::EndBlock:: *)
diff --git a/Source/Chatbook/Main.wl b/Source/Chatbook/Main.wl
index f527f1a9..2a62801c 100644
--- a/Source/Chatbook/Main.wl
+++ b/Source/Chatbook/Main.wl
@@ -6,6 +6,7 @@ BeginPackage[ "Wolfram`Chatbook`" ];
 (* ::**************************************************************************************************************:: *)
 (* ::Subsection::Closed:: *)
 (*Declare Symbols*)
+`$AvailableTools;
 `$ChatHandlerData;
 `$ChatPost;
 `$ChatPre;
diff --git a/Source/Chatbook/SendChat.wl b/Source/Chatbook/SendChat.wl
index 3eef60e9..85402828 100644
--- a/Source/Chatbook/SendChat.wl
+++ b/Source/Chatbook/SendChat.wl
@@ -1212,7 +1212,14 @@ resolveAutoSettings[ settings_Association ] := resolveAutoSettings0 @ <|
     settings,
     "HandlerFunctions" -> getHandlerFunctions @ settings,
     "LLMEvaluator" -> getLLMEvaluator @ settings,
-    "ProcessingFunctions" -> getProcessingFunctions @ settings
+    "ProcessingFunctions" -> getProcessingFunctions @ settings,
+    If[ StringQ @ settings[ "Tokenizer" ],
+        <|
+            "TokenizerName" -> getTokenizerName @ settings,
+            "Tokenizer" -> Automatic
+        |>,
+        "TokenizerName" -> Automatic
+    ]
 |>;
 
 resolveAutoSettings // endDefinition;
@@ -1253,6 +1260,7 @@ resolveAutoSetting0[ as_, "NotebookWriteMethod" ] := "PreemptiveLink";
 resolveAutoSetting0[ as_, "ShowMinimized" ] := Automatic;
 resolveAutoSetting0[ as_, "StreamingOutputMethod" ] := "PartialDynamic";
 resolveAutoSetting0[ as_, "Tokenizer" ] := getTokenizer @ as;
+resolveAutoSetting0[ as_, "TokenizerName" ] := getTokenizerName @ as;
 resolveAutoSetting0[ as_, "ToolCallFrequency" ] := Automatic;
 resolveAutoSetting0[ as_, "ToolsEnabled" ] := toolsEnabledQ @ as;
 resolveAutoSetting0[ as_, "TrackScrollingWhenPlaced" ] := scrollOutputQ @ as;
@@ -1267,7 +1275,8 @@ $autoSettingKeyDependencies = <|
     "MaxOutputCellStringLength" -> "MaxCellStringLength",
     "MaxTokens" -> "Model",
     "Multimodal" -> { "EnableLLMServices", "Model" },
-    "Tokenizer" -> "Model",
+    "Tokenizer" -> "TokenizerName",
+    "TokenizerName" -> "Model",
     "Tools" -> { "LLMEvaluator", "ToolsEnabled" },
     "ToolsEnabled" -> { "Model", "ToolCallFrequency" }
 |>;
@@ -2201,7 +2210,7 @@ makeCompactChatData[
     BaseEncode @ BinarySerialize[
         DeleteCases[
             Association[
-                smallSettings @ KeyDrop[ as, "OpenAIKey" ],
+                smallSettings @ as,
                 "MessageTag" -> tag,
                 "Data" -> Association[
                     data,
@@ -2219,20 +2228,27 @@ makeCompactChatData // endDefinition;
 (* ::Subsubsection::Closed:: *)
 (*smallSettings*)
 smallSettings // beginDefinition;
+smallSettings[ as_Association ] := smallSettings0 @ KeyDrop[ as, { "OpenAIKey", "Tokenizer" } ] /. $exprToNameRules;
+smallSettings // endDefinition;
 
-smallSettings[ as_Association ] :=
-    smallSettings[ as, as[ "LLMEvaluator" ] ];
+smallSettings0 // beginDefinition;
 
-smallSettings[ as_, KeyValuePattern[ "LLMEvaluatorName" -> name_String ] ] :=
+smallSettings0[ as_Association ] :=
+    smallSettings0[ as, as[ "LLMEvaluator" ] ];
+
+smallSettings0[ as_, KeyValuePattern[ "LLMEvaluatorName" -> name_String ] ] :=
     If[ AssociationQ @ GetCachedPersonaData @ name,
         Append[ as, "LLMEvaluator" -> name ],
        as
     ];
 
-smallSettings[ as_, _ ] :=
+smallSettings0[ as_, _ ] :=
     as;
 
-smallSettings // endDefinition;
+smallSettings0 // endDefinition;
+
+
+$exprToNameRules := AssociationMap[ Reverse, $AvailableTools ];
 
 (* ::**************************************************************************************************************:: *)
 (* ::Subsubsection::Closed:: *)