Avoid serializing giant tokenizers into tagging rules of chat outputs
rhennigan committed Dec 1, 2023
1 parent 0419565 commit 541270a
Showing 4 changed files with 119 additions and 29 deletions.
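
For orientation: before this change the fully resolved chat settings carried the tokenizer itself under "Tokenizer", and that function (which can close over large vocabulary data) was serialized into the tagging rules of chat output cells. After this change the settings also carry a short "TokenizerName" string, only the name is serialized, and the tokenizer function is re-resolved from a cache when it is needed again. A rough sketch of the round trip using the functions defined in the diffs below; the model name and annotated results are illustrative, not actual output:

    (* A short tokenizer name is derived from the model (or from a user-supplied tokenizer name): *)
    getTokenizerName @ <| "Model" -> "gpt-4-0613" |>      (* "gpt-4" *)

    (* The name alone is enough to recover the tokenizer function later: *)
    getTokenizer @ <| "TokenizerName" -> "gpt-4" |>       (* same as cachedTokenizer[ "gpt-4" ] *)

    (* So the compact chat data can store "TokenizerName" -> "gpt-4" instead of the function itself. *)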
2 changes: 1 addition & 1 deletion Scripts/Common.wl
@@ -141,7 +141,7 @@ releaseID[ dir_ ] := FirstCase[
(* ::Subsection::Closed:: *)
(*releaseURL*)
releaseURL[ file_ ] := Enclose[
Enclose @ Module[ { pac, repo, ver },
Module[ { pac, repo, ver },
pac = PacletObject @ Flatten @ File @ file;
repo = ConfirmBy[ Environment[ "GITHUB_REPOSITORY" ], StringQ ];
ver = ConfirmBy[ pac[ "Version" ], StringQ ];
113 changes: 93 additions & 20 deletions Source/Chatbook/ChatMessages.wl
@@ -17,6 +17,7 @@ Wolfram`Chatbook`CellToChatMessage;
`constructMessages;
`expandMultimodalString;
`getTokenizer;
`getTokenizerName;
`resizeMultimodalImage;

Begin[ "`Private`" ];
@@ -70,6 +71,10 @@ $styleRoles = <|
"ChatSystemInput" -> "System"
|>;

$cachedTokenizerNames = { "chat-bison", "claude", "gpt-2", "gpt-3.5", "gpt-4-vision", "gpt-4" };
$cachedTokenizers = <| |>;
$fallbackTokenizer = "gpt-2";

(* ::**************************************************************************************************************:: *)
(* ::Section::Closed:: *)
(*CellToChatMessage*)
@@ -1079,58 +1084,120 @@ argumentTokenToString // endDefinition;
(* ::**************************************************************************************************************:: *)
(* ::Section::Closed:: *)
(*Tokenization*)
$tokenizer := gpt2Tokenizer;
$tokenizer := $gpt2Tokenizer;

(* ::**************************************************************************************************************:: *)
(* ::Subsection::Closed:: *)
(*getTokenizerName*)
getTokenizerName // beginDefinition;

getTokenizerName[ KeyValuePattern[ "TokenizerName"|"Tokenizer" -> name_String ] ] :=
tokenizerName @ name;

getTokenizerName[ KeyValuePattern[ "Tokenizer" -> Except[ $$unspecified ] ] ] :=
"Custom";

getTokenizerName[ KeyValuePattern[ "Model" -> model_ ] ] :=
With[ { name = tokenizerName @ toModelName @ model },
If[ MemberQ[ $cachedTokenizerNames, name ],
name,
$fallbackTokenizer
]
];

getTokenizerName // endDefinition;

(* ::**************************************************************************************************************:: *)
(* ::Subsection::Closed:: *)
(*getTokenizer*)
getTokenizer // beginDefinition;
getTokenizer[ KeyValuePattern[ "Tokenizer" -> tokenizer: Except[ $$unspecified ] ] ] := tokenizer;
getTokenizer[ KeyValuePattern[ "Model" -> model_ ] ] := getTokenizer @ model;
getTokenizer[ model_ ] := cachedTokenizer @ toModelName @ model;
getTokenizer[ KeyValuePattern[ "TokenizerName" -> name_String ] ] := cachedTokenizer @ name;
getTokenizer[ KeyValuePattern[ "Model" -> model_ ] ] := cachedTokenizer @ toModelName @ model;
getTokenizer // endDefinition;

(* ::**************************************************************************************************************:: *)
(* ::Subsubsection::Closed:: *)
(*cachedTokenizer*)
cachedTokenizer // beginDefinition;
cachedTokenizer[ All ] := AssociationMap[ cachedTokenizer, $cachedTokenizerNames ];
cachedTokenizer[ name_String ] := cachedTokenizer0 @ tokenizerName @ toModelName @ name;

cachedTokenizer[ All ] :=
AssociationMap[ cachedTokenizer, $cachedTokenizerNames ];

cachedTokenizer[ id_String ] :=
With[ { tokenizer = $cachedTokenizers[ tokenizerName @ toModelName @ id ] },
tokenizer /; ! MatchQ[ tokenizer, $$unspecified ]
];

cachedTokenizer[ id_String ] := Enclose[
Module[ { name, tokenizer },
name = ConfirmBy[ tokenizerName @ toModelName @ id, StringQ, "Name" ];
tokenizer = findTokenizer @ name;
If[ MissingQ @ tokenizer,
(* Fallback to the GPT-2 tokenizer: *)
tokenizer = ConfirmMatch[ $gpt2Tokenizer, Except[ $$unspecified ], "GPT2Tokenizer" ];
If[ TrueQ @ Wolfram`ChatbookInternal`$BuildingMX,
tokenizer, (* Avoid caching fallback values into MX definitions *)
cacheTokenizer[ name, tokenizer ]
],
cacheTokenizer[ name, ConfirmMatch[ tokenizer, Except[ $$unspecified ], "Tokenizer" ] ]
]
],
throwInternalFailure
];

cachedTokenizer // endDefinition;

(* ::**************************************************************************************************************:: *)
(* ::Subsubsubsection::Closed:: *)
(*cacheTokenizer*)
cacheTokenizer // beginDefinition;

cachedTokenizer0 // beginDefinition;
cacheTokenizer[ name_String, tokenizer: Except[ $$unspecified ] ] := (
$cachedTokenizerNames = Union[ $cachedTokenizerNames, { name } ];
$cachedTokenizers[ name ] = tokenizer
);

cachedTokenizer0[ "chat-bison" ] = ToCharacterCode[ #, "UTF8" ] &;
cacheTokenizer // endDefinition;

cachedTokenizer0[ "gpt-4-vision" ] :=
If[ graphicsQ[ # ],
gpt4ImageTokenizer[ # ],
cachedTokenizer[ "gpt-4" ][ # ]
] &;
(* ::**************************************************************************************************************:: *)
(* ::Subsubsubsection::Closed:: *)
(*findTokenizer*)
findTokenizer // beginDefinition;

cachedTokenizer0[ model_String ] := Enclose[
findTokenizer[ model_String ] := Enclose[
Quiet @ Module[ { name, tokenizer },
initTools[ ];
Quiet @ Needs[ "Wolfram`LLMFunctions`Utilities`Tokenization`" -> None ];
name = ConfirmBy[ tokens`FindTokenizer @ model, StringQ, "Name" ];
tokenizer = ConfirmMatch[ tokens`LLMTokenizer[ Method -> name ], Except[ _tokens`LLMTokenizer ], "Tokenizer" ];
ConfirmMatch[ tokenizer[ "test" ], _List, "TokenizerTest" ];
cachedTokenizer0[ model ] = tokenizer
tokenizer
],
gpt2Tokenizer &
Missing[ "NotFound" ] &
];

cachedTokenizer0 // endDefinition;
findTokenizer // endDefinition;

(* ::**************************************************************************************************************:: *)
(* ::Subsubsubsection::Closed:: *)
(*Pre-cached small tokenizer functions*)
$cachedTokenizers[ "chat-bison" ] = ToCharacterCode[ #, "UTF8" ] &;
$cachedTokenizers[ "gpt-4-vision" ] = If[ graphicsQ[ # ], gpt4ImageTokenizer[ # ], cachedTokenizer[ "gpt-4" ][ # ] ] &;

(* ::**************************************************************************************************************:: *)
(* ::Subsubsection::Closed:: *)
(*tokenizerName*)
tokenizerName // beginDefinition;
tokenizerName[ name_String ] := SelectFirst[ $cachedTokenizerNames, StringContainsQ[ name, # ] &, name ];
tokenizerName // endDefinition;

$cachedTokenizerNames = { "gpt-4-vision", "gpt-4", "gpt-3.5", "gpt-2", "claude-2", "claude-instant-1", "chat-bison" };
tokenizerName[ name_String ] :=
SelectFirst[
$cachedTokenizerNames,
StringContainsQ[ name, #, IgnoreCase -> True ] &,
name
];

tokenizerName // endDefinition;

(* ::**************************************************************************************************************:: *)
(* ::Subsection::Closed:: *)
@@ -1182,13 +1249,19 @@ gpt4ImageTokenCount0 // endDefinition;
(* ::**************************************************************************************************************:: *)
(* ::Subsection::Closed:: *)
(*Fallback Tokenizer*)
gpt2Tokenizer := gpt2Tokenizer = ResourceFunction[ "GPTTokenizer" ][ ];
$gpt2Tokenizer := $gpt2Tokenizer = gpt2Tokenizer[ ];

(* https://resources.wolframcloud.com/FunctionRepository/resources/GPTTokenizer *)
importResourceFunction[ gpt2Tokenizer, "GPTTokenizer" ];

(* ::**************************************************************************************************************:: *)
(* ::Section::Closed:: *)
(*Package Footer*)
If[ Wolfram`ChatbookInternal`$BuildingMX,
cachedTokenizer[ All ];
$gpt2Tokenizer;
(* This is only needed to generate $gpt2Tokenizer once, so it can be removed to reduce MX file size: *)
Remove[ "Wolfram`Chatbook`ResourceFunctions`GPTTokenizer`GPTTokenizer" ];
];

(* :!CodeAnalysis::EndBlock:: *)
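A brief usage sketch of the name-based lookup above (the model names are hypothetical; behavior follows the definitions in this file): tokenizerName maps an arbitrary model name to one of $cachedTokenizerNames by case-insensitive substring matching, cachedTokenizer resolves and caches the corresponding function on first use, and models with no known tokenizer fall back to GPT-2, with the fallback intentionally left uncached while building MX definitions:

    tokenizerName[ "GPT-4-0613" ]          (* "gpt-4": case-insensitive substring match        *)
    tokenizerName[ "claude-2.1" ]          (* "claude"                                         *)
    tokenizerName[ "my-local-model" ]      (* "my-local-model": no match, name passes through  *)

    (* First call resolves and caches the tokenizer; later calls read $cachedTokenizers directly: *)
    tok = cachedTokenizer[ "gpt-3.5-turbo" ];
    tok[ "hello world" ]                   (* a list of tokens/token ids used for counting     *)

    (* Models without a known tokenizer resolve to the fallback name: *)
    getTokenizerName @ <| "Model" -> "my-local-model" |>   (* "gpt-2" *)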
1 change: 1 addition & 0 deletions Source/Chatbook/Main.wl
@@ -6,6 +6,7 @@ BeginPackage[ "Wolfram`Chatbook`" ];
(* ::**************************************************************************************************************:: *)
(* ::Subsection::Closed:: *)
(*Declare Symbols*)
`$AvailableTools;
`$ChatHandlerData;
`$ChatPost;
`$ChatPre;
32 changes: 24 additions & 8 deletions Source/Chatbook/SendChat.wl
@@ -1212,7 +1212,7 @@ resolveAutoSettings[ settings_Association ] := resolveAutoSettings0 @ <|
settings,
"HandlerFunctions" -> getHandlerFunctions @ settings,
"LLMEvaluator" -> getLLMEvaluator @ settings,
"ProcessingFunctions" -> getProcessingFunctions @ settings
"ProcessingFunctions" -> getProcessingFunctions @ settings,
If[ StringQ @ settings[ "Tokenizer" ],
<|
"TokenizerName" -> getTokenizerName @ settings,
"Tokenizer" -> Automatic
|>,
"TokenizerName" -> Automatic
]
|>;

resolveAutoSettings // endDefinition;
@@ -1253,6 +1260,7 @@ resolveAutoSetting0[ as_, "NotebookWriteMethod" ] := "PreemptiveLink";
resolveAutoSetting0[ as_, "ShowMinimized" ] := Automatic;
resolveAutoSetting0[ as_, "StreamingOutputMethod" ] := "PartialDynamic";
resolveAutoSetting0[ as_, "Tokenizer" ] := getTokenizer @ as;
resolveAutoSetting0[ as_, "TokenizerName" ] := getTokenizerName @ as;
resolveAutoSetting0[ as_, "ToolCallFrequency" ] := Automatic;
resolveAutoSetting0[ as_, "ToolsEnabled" ] := toolsEnabledQ @ as;
resolveAutoSetting0[ as_, "TrackScrollingWhenPlaced" ] := scrollOutputQ @ as;
@@ -1267,7 +1275,8 @@ $autoSettingKeyDependencies = <|
"MaxOutputCellStringLength" -> "MaxCellStringLength",
"MaxTokens" -> "Model",
"Multimodal" -> { "EnableLLMServices", "Model" },
"Tokenizer" -> "Model",
"Tokenizer" -> "TokenizerName",
"TokenizerName" -> "Model",
"Tools" -> { "LLMEvaluator", "ToolsEnabled" },
"ToolsEnabled" -> { "Model", "ToolCallFrequency" }
|>;
@@ -2201,7 +2210,7 @@ makeCompactChatData[
BaseEncode @ BinarySerialize[
DeleteCases[
Association[
smallSettings @ KeyDrop[ as, "OpenAIKey" ],
smallSettings @ as,
"MessageTag" -> tag,
"Data" -> Association[
data,
@@ -2219,20 +2228,27 @@ makeCompactChatData // endDefinition;
(* ::Subsubsection::Closed:: *)
(*smallSettings*)
smallSettings // beginDefinition;
smallSettings[ as_Association ] := smallSettings0 @ KeyDrop[ as, { "OpenAIKey", "Tokenizer" } ] /. $exprToNameRules;
smallSettings // endDefinition;

smallSettings[ as_Association ] :=
smallSettings[ as, as[ "LLMEvaluator" ] ];
smallSettings0 // beginDefinition;

smallSettings[ as_, KeyValuePattern[ "LLMEvaluatorName" -> name_String ] ] :=
smallSettings0[ as_Association ] :=
smallSettings0[ as, as[ "LLMEvaluator" ] ];

smallSettings0[ as_, KeyValuePattern[ "LLMEvaluatorName" -> name_String ] ] :=
If[ AssociationQ @ GetCachedPersonaData @ name,
Append[ as, "LLMEvaluator" -> name ],
as
];

smallSettings[ as_, _ ] :=
smallSettings0[ as_, _ ] :=
as;

smallSettings // endDefinition;
smallSettings0 // endDefinition;


$exprToNameRules := AssociationMap[ Reverse, $AvailableTools ];

(* ::**************************************************************************************************************:: *)
(* ::Subsubsection::Closed:: *)
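A hedged sketch of the smallSettings change above (the persona name and key value are placeholders): the API key and the tokenizer function are both dropped before the settings are serialized into the compact chat data, a cached persona is referenced by name rather than by its full definition, and tool expressions are mapped back to their names via $exprToNameRules, so the metadata written into the output cell stays small:

    as = <|
        "Model"         -> "gpt-4",
        "OpenAIKey"     -> "<placeholder>",                (* dropped before serialization         *)
        "Tokenizer"     -> cachedTokenizer[ "gpt-4" ],     (* dropped: only the name is kept       *)
        "TokenizerName" -> "gpt-4",
        "LLMEvaluator"  -> <| "LLMEvaluatorName" -> "CodeAssistant" |>   (* hypothetical persona  *)
    |>;

    smallSettings @ as
    (* <| "Model" -> "gpt-4", "TokenizerName" -> "gpt-4", "LLMEvaluator" -> "CodeAssistant" |>     *)
    (* (the persona collapses to its name only when GetCachedPersonaData recognizes it)            *)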
