Skip to content

Commit

Permalink
Merge pull request #986 from WolframResearch/feature/misc-rag-updates
Browse files Browse the repository at this point in the history
Updated RAG framework to allow easier addition of new sources
  • Loading branch information
rhennigan authored Dec 18, 2024
2 parents 67eef22 + 6a7766d commit 8242697
Show file tree
Hide file tree
Showing 18 changed files with 429 additions and 143 deletions.
1 change: 0 additions & 1 deletion .gitattributes

This file was deleted.

1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@
Assets/VectorDatabases/**/*.usearch
Assets/VectorDatabases/**/*.wxf
build
Developer/VectorDatabases/SourceData/*.jsonl
Source/Chatbook/64Bit/Chatbook.mx
1 change: 1 addition & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
"CICD",
"Componentwise",
"Connor",
"datarepository",
"deepseek",
"Deflatten",
"Deinitialization",
Expand Down
4 changes: 4 additions & 0 deletions Developer/VectorDatabases/SourceData/DataRepositoryURIs.wl
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
(* Source descriptor for the "DataRepositoryURIs" vector database data. *)
(* "Location" is the hosted DataRepositoryURIs.jsonl (the URL path carries a 1.0.0 version segment). *)
(* getVectorDBSourceFile in VectorDatabaseBuilder.wl reads "Location" and downloads it alongside this *)
(* descriptor when no local DataRepositoryURIs.jsonl is present. *)
<|
"Name" -> "DataRepositoryURIs",
"Location" -> CloudObject[ "https://www.wolframcloud.com/obj/wolframai-content/VectorDatabases/DataRepositoryURIs/1.0.0/DataRepositoryURIs.jsonl" ]
|>
3 changes: 0 additions & 3 deletions Developer/VectorDatabases/SourceData/DocumentationURIs.jsonl

This file was deleted.

4 changes: 4 additions & 0 deletions Developer/VectorDatabases/SourceData/DocumentationURIs.wl
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
(* Source descriptor for the "DocumentationURIs" vector database data. *)
(* "Location" is the hosted DocumentationURIs.jsonl (the URL path carries a 1.3.0 version segment). *)
(* getVectorDBSourceFile in VectorDatabaseBuilder.wl reads "Location" and downloads it alongside this *)
(* descriptor when no local DocumentationURIs.jsonl is present. *)
<|
"Name" -> "DocumentationURIs",
"Location" -> CloudObject[ "https://www.wolframcloud.com/obj/wolframai-content/VectorDatabases/DocumentationURIs/1.3.0/DocumentationURIs.jsonl" ]
|>
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
(* Source descriptor for the "FunctionRepositoryURIs" vector database data. *)
(* "Location" is the hosted FunctionRepositoryURIs.jsonl (the URL path carries a 1.0.0 version segment). *)
(* NOTE(review): presumably stored as SourceData/FunctionRepositoryURIs.wl like its siblings - confirm. *)
<|
"Name" -> "FunctionRepositoryURIs",
"Location" -> CloudObject[ "https://www.wolframcloud.com/obj/wolframai-content/VectorDatabases/FunctionRepositoryURIs/1.0.0/FunctionRepositoryURIs.jsonl" ]
|>

This file was deleted.

4 changes: 4 additions & 0 deletions Developer/VectorDatabases/SourceData/WolframAlphaQueries.wl
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
(* Source descriptor for the "WolframAlphaQueries" vector database data. *)
(* "Location" is the hosted WolframAlphaQueries.jsonl (the URL path carries a 1.3.0 version segment). *)
(* getVectorDBSourceFile in VectorDatabaseBuilder.wl reads "Location" and downloads it alongside this *)
(* descriptor when no local WolframAlphaQueries.jsonl is present. *)
<|
"Name" -> "WolframAlphaQueries",
"Location" -> CloudObject[ "https://www.wolframcloud.com/obj/wolframai-content/VectorDatabases/WolframAlphaQueries/1.3.0/WolframAlphaQueries.jsonl" ]
|>
184 changes: 118 additions & 66 deletions Developer/VectorDatabases/VectorDatabaseBuilder.wl
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,9 @@ $$vectorDatabase = _VectorDatabaseObject? System`Private`ValidQ;
(* ::**************************************************************************************************************:: *)
(* ::Subsection::Closed:: *)
(*Vector Databases*)
$vectorDBSourceDirectory = FileNameJoin @ { DirectoryName @ $InputFileName, "SourceData" };
$vectorDBTargetDirectory = FileNameJoin @ { DirectoryName[ $InputFileName, 3 ], "Assets", "VectorDatabases" };
$defaultVectorDBSourceDirectory = FileNameJoin @ { DirectoryName @ $InputFileName, "SourceData" };
$vectorDBSourceDirectory := getVectorDBSourceDirectory[ ];
$vectorDBTargetDirectory = FileNameJoin @ { DirectoryName[ $InputFileName, 3 ], "Assets", "VectorDatabases" };

$incrementalBuildBatchSize = 512;
$dbConnectivity = 16;
Expand Down Expand Up @@ -86,29 +87,36 @@ $embeddingCache = <| |>;
ImportVectorDatabaseData // ClearAll;

ImportVectorDatabaseData[ name_String ] :=
Enclose @ Module[ { file, data },
file = ConfirmBy[ FileNameJoin @ { $vectorDBSourceDirectory, name<>".jsonl" }, FileExistsQ, "File" ];
data = ConfirmMatch[ jsonlImport @ file, { ___Association? AssociationQ }, "Data" ];
data
Enclose @ Module[ { file },
file = ConfirmBy[ getVectorDBSourceFile @ name, FileExistsQ, "File" ];
ImportVectorDatabaseData @ File @ file
];

ImportVectorDatabaseData[ file_File ] :=
Enclose @ ConfirmMatch[ jsonlImport @ file, { ___Association? AssociationQ }, "Data" ];

(* ::**************************************************************************************************************:: *)
(* ::Subsection::Closed:: *)
(*ExportVectorDatabaseData*)
ExportVectorDatabaseData // ClearAll;

ExportVectorDatabaseData[ name_String, data0_List ] :=
Enclose @ Module[ { data, dir, file },
data = ConfirmBy[ toDBData @ data0, dbDataQ, "Data" ];
ExportVectorDatabaseData[ name_String, data_List ] :=
Enclose @ Module[ { dir, file },
dir = ConfirmBy[ ensureDirectory @ $vectorDBSourceDirectory, DirectoryQ, "Directory" ];
file = ConfirmBy[ FileNameJoin @ { dir, name<>".jsonl" }, StringQ, "File" ];
ExportVectorDatabaseData[ File @ file, data ]
];

ExportVectorDatabaseData[ file_File, data0_List ] :=
Enclose @ Module[ { data },
data = ConfirmBy[ toDBData @ data0, dbDataQ, "Data" ];
ConfirmBy[ jsonlExport[ file, data ], FileExistsQ, "Export" ]
];

(* ::**************************************************************************************************************:: *)
(* ::Subsection::Closed:: *)
(*AddToVectorDatabaseData*)
AddToVectorDatabaseData // beginDefinition;
AddToVectorDatabaseData // ClearAll;
AddToVectorDatabaseData // Options = { "Tag" -> "TextLiteral", "Rebuild" -> False };

AddToVectorDatabaseData[ name_String, data_List, opts: OptionsPattern[ ] ] :=
Expand All @@ -128,8 +136,6 @@ AddToVectorDatabaseData[ name_String, data_List, opts: OptionsPattern[ ] ] :=
<| "Exported" -> exported, "Rebuilt" -> rebuilt |>
];

AddToVectorDatabaseData // endDefinition;

(* ::**************************************************************************************************************:: *)
(* ::Subsection::Closed:: *)
(*BuildVectorDatabase*)
Expand All @@ -147,7 +153,7 @@ BuildVectorDatabase[ All, opts: OptionsPattern[ ] ] :=
$dbExpansionAdd = OptionValue[ "ExpansionAdd" ],
$dbExpansionSearch = OptionValue[ "ExpansionSearch" ]
},
AssociationMap[ BuildVectorDatabase, FileBaseName /@ FileNames[ "*.jsonl", $vectorDBSourceDirectory ] ]
AssociationMap[ BuildVectorDatabase, FileBaseName /@ getVectorDBSourceFile @ All ]
];

BuildVectorDatabase[ name_String, opts: OptionsPattern[ ] ] := Enclose[
Expand All @@ -169,13 +175,13 @@ BuildVectorDatabase[ name_String, opts: OptionsPattern[ ] ] := Enclose[
buildVectorDatabase // ClearAll;

buildVectorDatabase[ name_String ] :=
Enclose @ Catch @ Module[ { dir, rel, src, db, valueBag, count, n, stream, values },
Enclose @ Catch @ Module[ { dir, rel, src, db, valueBag, count, n, stream, values, built },

loadEmbeddingCache[ ];

dir = ConfirmBy[ ensureDirectory @ { $vectorDBTargetDirectory, name }, DirectoryQ, "Directory" ];
rel = ConfirmBy[ ResourceFunction[ "RelativePath" ][ dir ], DirectoryQ, "Relative" ];
src = ConfirmBy[ FileNameJoin @ { $vectorDBSourceDirectory, name<>".jsonl" }, FileExistsQ, "File" ];
src = ConfirmBy[ getVectorDBSourceFile @ name, FileExistsQ, "File" ];

DeleteFile /@ FileNames[ { "*.wxf", "*.usearch" }, dir ];
ConfirmAssert[ FileNames[ { "*.wxf", "*.usearch" }, dir ] === { }, "ClearedFilesCheck" ];
Expand All @@ -198,59 +204,65 @@ buildVectorDatabase[ name_String ] :=
valueBag = Internal`Bag[ ];
count = ConfirmMatch[ lineCount @ src, _Integer? Positive, "LineCount" ];
n = 0;
stream = ConfirmMatch[ OpenRead @ src, _InputStream, "Stream" ];

withProgress[
While[
NumericArrayQ @ ConfirmMatch[ addBatch[ db, stream, valueBag ], _NumericArray|EndOfFile, "Add" ],
n = Internal`BagLength @ valueBag
],
<|
"Text" -> "Building database \""<>name<>"\"",
"ElapsedTime" -> Automatic,
"RemainingTime" -> Automatic,
"ItemTotal" :> count,
"ItemCurrent" :> n,
"Progress" :> Automatic
|>,
"Delay" -> 0,
UpdateInterval -> 1
];

saveEmbeddingCache[ ];

values = Internal`BagPart[ valueBag, All ];

ConfirmAssert[ Length @ values === count, "ValueCount" ];
ConfirmAssert[ First @ db[ "Dimensions" ] === count, "VectorCount" ];

ConfirmBy[
writeWXFFile[ FileNameJoin @ { dir, "Values.wxf" }, values, PerformanceGoal -> "Size" ],
FileExistsQ,
"Values"
];
WithCleanup[
stream = ConfirmMatch[ OpenRead @ src, _InputStream, "Stream" ],

ConfirmBy[
writeWXFFile[
FileNameJoin @ { dir, "EmbeddingInformation.wxf" },
withProgress[
While[
NumericArrayQ @ ConfirmMatch[ addBatch[ db, stream, valueBag ], _NumericArray|EndOfFile, "Add" ],
n = Internal`BagLength @ valueBag
],
<|
"Dimension" -> $embeddingDimension,
"Type" -> $embeddingType,
"Model" -> $embeddingModel,
"Service" -> $embeddingService
|>
"Text" -> "Building database \""<>name<>"\"",
"ElapsedTime" -> Automatic,
"RemainingTime" -> Automatic,
"ItemTotal" :> count,
"ItemCurrent" :> n,
"Progress" :> Automatic
|>,
"Delay" -> 0,
UpdateInterval -> 1
];

saveEmbeddingCache[ ];

values = Internal`BagPart[ valueBag, All ];

ConfirmBy[ rewriteDBData[ rel, name ], FileExistsQ, "Rewrite" ];

built = ConfirmMatch[
VectorDatabaseObject @ File @ FileNameJoin @ { rel, name <> ".wxf" },
$$vectorDatabase,
"Result"
];

ConfirmAssert[ Length @ values === count, "ValueCount" ];
ConfirmAssert[ First @ built[ "Dimensions" ] === count, "VectorCount" ];

ConfirmBy[
writeWXFFile[ FileNameJoin @ { dir, "Values.wxf" }, values, PerformanceGoal -> "Size" ],
FileExistsQ,
"Values"
];

ConfirmBy[
writeWXFFile[
FileNameJoin @ { dir, "EmbeddingInformation.wxf" },
<|
"Dimension" -> $embeddingDimension,
"Type" -> $embeddingType,
"Model" -> $embeddingModel,
"Service" -> $embeddingService
|>
],
FileExistsQ,
"EmbeddingInformation"
],
FileExistsQ,
"EmbeddingInformation"
];

ConfirmBy[ rewriteDBData[ rel, name ], FileExistsQ, "Rewrite" ];
Close @ stream
];

ConfirmMatch[
VectorDatabaseObject @ File @ FileNameJoin @ { rel, name <> ".wxf" },
$$vectorDatabase,
"Result"
]
ConfirmMatch[ built, $$vectorDatabase, "Result" ]
];

(* ::**************************************************************************************************************:: *)
Expand All @@ -274,7 +286,7 @@ setDBDefaults[ dir_, name_String ] :=
addBatch // ClearAll;

addBatch[ db_VectorDatabaseObject, stream_InputStream, valueBag_Internal`Bag ] :=
Enclose @ Catch @ Module[ { batch, text, values, embeddings },
Enclose @ Catch @ Module[ { batch, text, values, embeddings, added },

batch = ConfirmMatch[
readJSONLines[ stream, $incrementalBuildBatchSize ],
Expand All @@ -289,9 +301,9 @@ addBatch[ db_VectorDatabaseObject, stream_InputStream, valueBag_Internal`Bag ] :
values = ConfirmMatch[ batch[[ All, "Value" ]], { __ }, "Values" ];
embeddings = ConfirmBy[ $lastEmbedding = GetEmbedding @ text, NumericArrayQ, "Embeddings" ];
ConfirmAssert[ Length @ values === Length @ embeddings, "LengthCheck" ];
Confirm[ $lastAdded = AddToVectorDatabase[ db, embeddings ], "AddToVectorDatabase" ];
added = Confirm[ $lastAdded = AddToVectorDatabase[ db, embeddings ], "AddToVectorDatabase" ];
Internal`StuffBag[ valueBag, values, 1 ];
ConfirmMatch[ db[ "Dimensions" ], { Internal`BagLength @ valueBag, $embeddingDimension }, "DimensionCheck" ];
ConfirmMatch[ added[ "Dimensions" ], { Internal`BagLength @ valueBag, $embeddingDimension }, "DimensionCheck" ];
embeddings
];

Expand Down Expand Up @@ -729,6 +741,46 @@ embeddingHash[ string_String ] :=
(* ::Section::Closed:: *)
(*Misc Utilities*)

(* ::**************************************************************************************************************:: *)
(* ::Subsection::Closed:: *)
(*getVectorDBSourceDirectory*)
getVectorDBSourceDirectory // ClearAll;

(* Resolves the directory that holds vector database source data.                                  *)
(* Candidates are tried in order: the value stored in the persistent symbol                        *)
(* "ChatbookDeveloper/VectorDatabaseSourceDirectory", then $defaultVectorDBSourceDirectory.        *)
(* Both list entries are evaluated before selection, so the default directory is passed through    *)
(* EnsureDirectory even when the persistent value ends up being chosen.                            *)
(* On success the result is memoized as a down value; if neither candidate satisfies DirectoryQ,   *)
(* Confirm fails, Enclose returns the failure, and nothing is cached.                              *)
getVectorDBSourceDirectory[ ] :=
    Enclose @ Module[ { resolved },
        resolved = Confirm @ SelectFirst[
            {
                ReleaseHold @ PersistentSymbol[ "ChatbookDeveloper/VectorDatabaseSourceDirectory" ],
                GeneralUtilities`EnsureDirectory @ $defaultVectorDBSourceDirectory
            },
            DirectoryQ,
            $Failed
        ];
        (* Cache so later calls skip the lookup entirely. *)
        getVectorDBSourceDirectory[ ] = resolved
    ];

(* ::**************************************************************************************************************:: *)
(* ::Subsection::Closed:: *)
(*getVectorDBSourceFile*)
getVectorDBSourceFile // ClearAll;

(* Returns the path of the .jsonl source file for the named vector database.                       *)
(*   name - base name of the source (no extension).                                                *)
(* If <name>.jsonl already exists in the source directory it is returned as-is. Otherwise the      *)
(* sibling descriptor <name>.wl is loaded, its "Location" is downloaded to <name>.jsonl, and that  *)
(* path is returned. Any failed confirmation is returned as a failure via Enclose.                 *)
getVectorDBSourceFile[ name_String ] :=
    Enclose @ Catch @ Module[ { sourceDir, target, descriptor, info, location },
        sourceDir = ConfirmBy[ getVectorDBSourceDirectory[ ], DirectoryQ, "Directory" ];
        target    = FileNameJoin @ { sourceDir, name<>".jsonl" };

        (* An existing local copy always wins; no descriptor lookup or network access happens. *)
        If[ FileExistsQ @ target, Throw @ target ];

        descriptor = ConfirmBy[ FileNameJoin @ { sourceDir, name<>".wl" }, FileExistsQ, "File" ];
        info       = ConfirmBy[ Get @ descriptor, AssociationQ, "Data" ];
        location   = ConfirmMatch[ info[ "Location" ], _String|_CloudObject|_URL, "URL" ];

        (* The download result is only checked, never reused: the caller gets the target path. *)
        ConfirmBy[ URLDownload[ location, target ], FileExistsQ, "Download" ];
        ConfirmBy[ target, FileExistsQ, "Result" ]
    ];

(* Resolves the source file for every source found in the source directory.                        *)
(* A source is any distinct base name among the *.jsonl and *.wl files there, so a name with both  *)
(* a local .jsonl and a .wl descriptor is processed once.                                          *)
(* Each name is resolved independently and the per-name results are returned unconfirmed, so a     *)
(* failure for one source appears as an element of the returned list rather than aborting.         *)
getVectorDBSourceFile[ All ] :=
    Enclose @ Module[ { sourceDir, candidates, baseNames },
        sourceDir  = ConfirmBy[ getVectorDBSourceDirectory[ ], DirectoryQ, "Directory" ];
        candidates = FileNames[ { "*.jsonl", "*.wl" }, sourceDir ];
        baseNames  = Union[ FileBaseName /@ candidates ];
        Map[ getVectorDBSourceFile, baseNames ]
    ];

(* ::**************************************************************************************************************:: *)
(* ::Subsection::Closed:: *)
(*withProgress*)
Expand Down
3 changes: 0 additions & 3 deletions Scripts/.githooks/post-checkout

This file was deleted.

3 changes: 0 additions & 3 deletions Scripts/.githooks/post-commit

This file was deleted.

3 changes: 0 additions & 3 deletions Scripts/.githooks/post-merge

This file was deleted.

3 changes: 0 additions & 3 deletions Scripts/.githooks/pre-push

This file was deleted.

32 changes: 32 additions & 0 deletions Scripts/BuildVectorDatabases.wls
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#!/usr/bin/env wolframscript

(* Rebuilds all vector databases from scratch.                                                      *)
(* Steps: remove any cached *.jsonl source data, load VectorDatabaseBuilder.wl, run                 *)
(* BuildVectorDatabase[ All ], and verify that every result is a VectorDatabaseObject.              *)
(* The script evaluates to the association returned by BuildVectorDatabase[ All ].                  *)

BeginPackage[ "Wolfram`ChatbookScripts`" ];
(* NOTE(review): $loadedDefinitions, $pacletDir, cFile and cDir are presumably provided by Common.wl - confirm. *)
If[ ! TrueQ @ $loadedDefinitions, Get @ FileNameJoin @ { DirectoryName @ $InputFileName, "Common.wl" } ];

(* :!CodeAnalysis::BeginBlock:: *)
(* :!CodeAnalysis::Disable::SuspiciousSessionSymbol:: *)

(* ::**************************************************************************************************************:: *)
(* ::Section::Closed:: *)
(*Initialization*)
Needs[ "Wolfram`PacletCICD`" -> "cicd`" ];

(* ::**************************************************************************************************************:: *)
(* ::Section::Closed:: *)
(*Paths*)
$sourceFile = cFile @ FileNameJoin @ { $pacletDir, "Developer", "VectorDatabases", "VectorDatabaseBuilder.wl" };
$sourceDir  = cDir  @ FileNameJoin @ { $pacletDir, "Developer", "VectorDatabases", "SourceData" };

(* ::**************************************************************************************************************:: *)
(* ::Section::Closed:: *)
(*Run*)

(* Drop cached source data so the builder has to resolve each source from its .wl descriptor again. *)
(* NOTE(review): this only clears the default SourceData directory; if the builder is pointed at    *)
(* another directory through its persistent-symbol override, that directory is left untouched.      *)
cicd`ScriptConfirmMatch[ DeleteFile /@ FileNames[ "*.jsonl", $sourceDir ], { Null... } ];

cicd`ScriptConfirmMatch[ Get @ $sourceFile, Null ];
result = cicd`ScriptConfirmBy[ BuildVectorDatabase @ All, AssociationQ ];

(* Use the PacletCICD script-level assertion, matching the other ScriptConfirm* checks above.       *)
(* The previous cicd`ConfirmAssert is not a PacletCICD function, so the check never actually ran.   *)
cicd`ScriptConfirmAssert @ AllTrue[ result, MatchQ[ _VectorDatabaseObject ] ];

(* :!CodeAnalysis::EndBlock:: *)

EndPackage[ ];

Wolfram`ChatbookScripts`result
1 change: 1 addition & 0 deletions Source/Chatbook/Main.wl
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ BeginPackage[ "Wolfram`Chatbook`" ];
`$LastChatbookFailure;
`$LastChatbookFailureText;
`$NotebookAssistanceInputs;
`$RelatedDocumentationSources;
`$SandboxKernel;
`$ToolFunctions;
`$WorkspaceChat;
Expand Down
Loading

0 comments on commit 8242697

Please sign in to comment.