diff --git a/.gitattributes b/.gitattributes deleted file mode 100644 index 8c9c4f1c..00000000 --- a/.gitattributes +++ /dev/null @@ -1 +0,0 @@ -*.jsonl filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore index c7ba574b..3ac27e31 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,5 @@ Assets/VectorDatabases/**/*.usearch Assets/VectorDatabases/**/*.wxf build +Developer/VectorDatabases/SourceData/*.jsonl Source/Chatbook/64Bit/Chatbook.mx \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json index f68937c0..7e25cec6 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -12,6 +12,7 @@ "CICD", "Componentwise", "Connor", + "datarepository", "deepseek", "Deflatten", "Deinitialization", diff --git a/Developer/VectorDatabases/SourceData/DataRepositoryURIs.wl b/Developer/VectorDatabases/SourceData/DataRepositoryURIs.wl new file mode 100644 index 00000000..32e7b3eb --- /dev/null +++ b/Developer/VectorDatabases/SourceData/DataRepositoryURIs.wl @@ -0,0 +1,4 @@ +<| + "Name" -> "DataRepositoryURIs", + "Location" -> CloudObject[ "https://www.wolframcloud.com/obj/wolframai-content/VectorDatabases/DataRepositoryURIs/1.0.0/DataRepositoryURIs.jsonl" ] +|> \ No newline at end of file diff --git a/Developer/VectorDatabases/SourceData/DocumentationURIs.jsonl b/Developer/VectorDatabases/SourceData/DocumentationURIs.jsonl deleted file mode 100644 index 86563c8b..00000000 --- a/Developer/VectorDatabases/SourceData/DocumentationURIs.jsonl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0a16442991caeaa42e704bd5aa39b43163c649caa243f85e8c3de8562e56358f -size 369614780 diff --git a/Developer/VectorDatabases/SourceData/DocumentationURIs.wl b/Developer/VectorDatabases/SourceData/DocumentationURIs.wl new file mode 100644 index 00000000..9a583cce --- /dev/null +++ b/Developer/VectorDatabases/SourceData/DocumentationURIs.wl @@ -0,0 +1,4 @@ +<| + "Name" -> "DocumentationURIs", + "Location" -> CloudObject[ "https://www.wolframcloud.com/obj/wolframai-content/VectorDatabases/DocumentationURIs/1.3.0/DocumentationURIs.jsonl" ] +|> \ No newline at end of file diff --git a/Developer/VectorDatabases/SourceData/FunctionRepositoryURIs.wl b/Developer/VectorDatabases/SourceData/FunctionRepositoryURIs.wl new file mode 100644 index 00000000..1ae70f40 --- /dev/null +++ b/Developer/VectorDatabases/SourceData/FunctionRepositoryURIs.wl @@ -0,0 +1,4 @@ +<| + "Name" -> "FunctionRepositoryURIs", + "Location" -> CloudObject[ "https://www.wolframcloud.com/obj/wolframai-content/VectorDatabases/FunctionRepositoryURIs/1.0.0/FunctionRepositoryURIs.jsonl" ] +|> \ No newline at end of file diff --git a/Developer/VectorDatabases/SourceData/WolframAlphaQueries.jsonl b/Developer/VectorDatabases/SourceData/WolframAlphaQueries.jsonl deleted file mode 100644 index 94fba276..00000000 --- a/Developer/VectorDatabases/SourceData/WolframAlphaQueries.jsonl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b7475c4747948df7f0cb8d53ab9e1dd1f6bb9c50b387e48ed7cf3590cfd30808 -size 8028019 diff --git a/Developer/VectorDatabases/SourceData/WolframAlphaQueries.wl b/Developer/VectorDatabases/SourceData/WolframAlphaQueries.wl new file mode 100644 index 00000000..3444413c --- /dev/null +++ b/Developer/VectorDatabases/SourceData/WolframAlphaQueries.wl @@ -0,0 +1,4 @@ +<| + "Name" -> "WolframAlphaQueries", + "Location" -> CloudObject[ "https://www.wolframcloud.com/obj/wolframai-content/VectorDatabases/WolframAlphaQueries/1.3.0/WolframAlphaQueries.jsonl" ] +|> \ No newline at end of file diff --git a/Developer/VectorDatabases/VectorDatabaseBuilder.wl b/Developer/VectorDatabases/VectorDatabaseBuilder.wl index 8a134b83..a180bb46 100644 --- a/Developer/VectorDatabases/VectorDatabaseBuilder.wl +++ b/Developer/VectorDatabases/VectorDatabaseBuilder.wl @@ -48,8 +48,9 @@ $$vectorDatabase = _VectorDatabaseObject? System`Private`ValidQ; (* ::**************************************************************************************************************:: *) (* ::Subsection::Closed:: *) (*Vector Databases*) -$vectorDBSourceDirectory = FileNameJoin @ { DirectoryName @ $InputFileName, "SourceData" }; -$vectorDBTargetDirectory = FileNameJoin @ { DirectoryName[ $InputFileName, 3 ], "Assets", "VectorDatabases" }; +$defaultVectorDBSourceDirectory = FileNameJoin @ { DirectoryName @ $InputFileName, "SourceData" }; +$vectorDBSourceDirectory := getVectorDBSourceDirectory[ ]; +$vectorDBTargetDirectory = FileNameJoin @ { DirectoryName[ $InputFileName, 3 ], "Assets", "VectorDatabases" }; $incrementalBuildBatchSize = 512; $dbConnectivity = 16; @@ -86,29 +87,36 @@ $embeddingCache = <| |>; ImportVectorDatabaseData // ClearAll; ImportVectorDatabaseData[ name_String ] := - Enclose @ Module[ { file, data }, - file = ConfirmBy[ FileNameJoin @ { $vectorDBSourceDirectory, name<>".jsonl" }, FileExistsQ, "File" ]; - data = ConfirmMatch[ jsonlImport @ file, { ___Association? AssociationQ }, "Data" ]; - data + Enclose @ Module[ { file }, + file = ConfirmBy[ getVectorDBSourceFile @ name, FileExistsQ, "File" ]; + ImportVectorDatabaseData @ File @ file ]; +ImportVectorDatabaseData[ file_File ] := + Enclose @ ConfirmMatch[ jsonlImport @ file, { ___Association? AssociationQ }, "Data" ]; + (* ::**************************************************************************************************************:: *) (* ::Subsection::Closed:: *) (*ExportVectorDatabaseData*) ExportVectorDatabaseData // ClearAll; -ExportVectorDatabaseData[ name_String, data0_List ] := - Enclose @ Module[ { data, dir, file }, - data = ConfirmBy[ toDBData @ data0, dbDataQ, "Data" ]; +ExportVectorDatabaseData[ name_String, data_List ] := + Enclose @ Module[ { dir, file }, dir = ConfirmBy[ ensureDirectory @ $vectorDBSourceDirectory, DirectoryQ, "Directory" ]; file = ConfirmBy[ FileNameJoin @ { dir, name<>".jsonl" }, StringQ, "File" ]; + ExportVectorDatabaseData[ File @ file, data ] + ]; + +ExportVectorDatabaseData[ file_File, data0_List ] := + Enclose @ Module[ { data }, + data = ConfirmBy[ toDBData @ data0, dbDataQ, "Data" ]; ConfirmBy[ jsonlExport[ file, data ], FileExistsQ, "Export" ] ]; (* ::**************************************************************************************************************:: *) (* ::Subsection::Closed:: *) (*AddToVectorDatabaseData*) -AddToVectorDatabaseData // beginDefinition; +AddToVectorDatabaseData // ClearAll; AddToVectorDatabaseData // Options = { "Tag" -> "TextLiteral", "Rebuild" -> False }; AddToVectorDatabaseData[ name_String, data_List, opts: OptionsPattern[ ] ] := @@ -128,8 +136,6 @@ AddToVectorDatabaseData[ name_String, data_List, opts: OptionsPattern[ ] ] := <| "Exported" -> exported, "Rebuilt" -> rebuilt |> ]; -AddToVectorDatabaseData // endDefinition; - (* ::**************************************************************************************************************:: *) (* ::Subsection::Closed:: *) (*BuildVectorDatabase*) @@ -147,7 +153,7 @@ BuildVectorDatabase[ All, opts: OptionsPattern[ ] ] := $dbExpansionAdd = OptionValue[ "ExpansionAdd" ], $dbExpansionSearch = OptionValue[ "ExpansionSearch" ] }, - AssociationMap[ BuildVectorDatabase, FileBaseName /@ FileNames[ "*.jsonl", $vectorDBSourceDirectory ] ] + AssociationMap[ BuildVectorDatabase, FileBaseName /@ getVectorDBSourceFile @ All ] ]; BuildVectorDatabase[ name_String, opts: OptionsPattern[ ] ] := Enclose[ @@ -169,13 +175,13 @@ BuildVectorDatabase[ name_String, opts: OptionsPattern[ ] ] := Enclose[ buildVectorDatabase // ClearAll; buildVectorDatabase[ name_String ] := - Enclose @ Catch @ Module[ { dir, rel, src, db, valueBag, count, n, stream, values }, + Enclose @ Catch @ Module[ { dir, rel, src, db, valueBag, count, n, stream, values, built }, loadEmbeddingCache[ ]; dir = ConfirmBy[ ensureDirectory @ { $vectorDBTargetDirectory, name }, DirectoryQ, "Directory" ]; rel = ConfirmBy[ ResourceFunction[ "RelativePath" ][ dir ], DirectoryQ, "Relative" ]; - src = ConfirmBy[ FileNameJoin @ { $vectorDBSourceDirectory, name<>".jsonl" }, FileExistsQ, "File" ]; + src = ConfirmBy[ getVectorDBSourceFile @ name, FileExistsQ, "File" ]; DeleteFile /@ FileNames[ { "*.wxf", "*.usearch" }, dir ]; ConfirmAssert[ FileNames[ { "*.wxf", "*.usearch" }, dir ] === { }, "ClearedFilesCheck" ]; @@ -198,59 +204,65 @@ buildVectorDatabase[ name_String ] := valueBag = Internal`Bag[ ]; count = ConfirmMatch[ lineCount @ src, _Integer? Positive, "LineCount" ]; n = 0; - stream = ConfirmMatch[ OpenRead @ src, _InputStream, "Stream" ]; - - withProgress[ - While[ - NumericArrayQ @ ConfirmMatch[ addBatch[ db, stream, valueBag ], _NumericArray|EndOfFile, "Add" ], - n = Internal`BagLength @ valueBag - ], - <| - "Text" -> "Building database \""<>name<>"\"", - "ElapsedTime" -> Automatic, - "RemainingTime" -> Automatic, - "ItemTotal" :> count, - "ItemCurrent" :> n, - "Progress" :> Automatic - |>, - "Delay" -> 0, - UpdateInterval -> 1 - ]; - - saveEmbeddingCache[ ]; - - values = Internal`BagPart[ valueBag, All ]; - - ConfirmAssert[ Length @ values === count, "ValueCount" ]; - ConfirmAssert[ First @ db[ "Dimensions" ] === count, "VectorCount" ]; - - ConfirmBy[ - writeWXFFile[ FileNameJoin @ { dir, "Values.wxf" }, values, PerformanceGoal -> "Size" ], - FileExistsQ, - "Values" - ]; + WithCleanup[ + stream = ConfirmMatch[ OpenRead @ src, _InputStream, "Stream" ], - ConfirmBy[ - writeWXFFile[ - FileNameJoin @ { dir, "EmbeddingInformation.wxf" }, + withProgress[ + While[ + NumericArrayQ @ ConfirmMatch[ addBatch[ db, stream, valueBag ], _NumericArray|EndOfFile, "Add" ], + n = Internal`BagLength @ valueBag + ], <| - "Dimension" -> $embeddingDimension, - "Type" -> $embeddingType, - "Model" -> $embeddingModel, - "Service" -> $embeddingService - |> + "Text" -> "Building database \""<>name<>"\"", + "ElapsedTime" -> Automatic, + "RemainingTime" -> Automatic, + "ItemTotal" :> count, + "ItemCurrent" :> n, + "Progress" :> Automatic + |>, + "Delay" -> 0, + UpdateInterval -> 1 + ]; + + saveEmbeddingCache[ ]; + + values = Internal`BagPart[ valueBag, All ]; + + ConfirmBy[ rewriteDBData[ rel, name ], FileExistsQ, "Rewrite" ]; + + built = ConfirmMatch[ + VectorDatabaseObject @ File @ FileNameJoin @ { rel, name <> ".wxf" }, + $$vectorDatabase, + "Result" + ]; + + ConfirmAssert[ Length @ values === count, "ValueCount" ]; + ConfirmAssert[ First @ built[ "Dimensions" ] === count, "VectorCount" ]; + + ConfirmBy[ + writeWXFFile[ FileNameJoin @ { dir, "Values.wxf" }, values, PerformanceGoal -> "Size" ], + FileExistsQ, + "Values" + ]; + + ConfirmBy[ + writeWXFFile[ + FileNameJoin @ { dir, "EmbeddingInformation.wxf" }, + <| + "Dimension" -> $embeddingDimension, + "Type" -> $embeddingType, + "Model" -> $embeddingModel, + "Service" -> $embeddingService + |> + ], + FileExistsQ, + "EmbeddingInformation" ], - FileExistsQ, - "EmbeddingInformation" - ]; - ConfirmBy[ rewriteDBData[ rel, name ], FileExistsQ, "Rewrite" ]; + Close @ stream + ]; - ConfirmMatch[ - VectorDatabaseObject @ File @ FileNameJoin @ { rel, name <> ".wxf" }, - $$vectorDatabase, - "Result" - ] + ConfirmMatch[ built, $$vectorDatabase, "Result" ] ]; (* ::**************************************************************************************************************:: *) @@ -274,7 +286,7 @@ setDBDefaults[ dir_, name_String ] := addBatch // ClearAll; addBatch[ db_VectorDatabaseObject, stream_InputStream, valueBag_Internal`Bag ] := - Enclose @ Catch @ Module[ { batch, text, values, embeddings }, + Enclose @ Catch @ Module[ { batch, text, values, embeddings, added }, batch = ConfirmMatch[ readJSONLines[ stream, $incrementalBuildBatchSize ], @@ -289,9 +301,9 @@ addBatch[ db_VectorDatabaseObject, stream_InputStream, valueBag_Internal`Bag ] : values = ConfirmMatch[ batch[[ All, "Value" ]], { __ }, "Values" ]; embeddings = ConfirmBy[ $lastEmbedding = GetEmbedding @ text, NumericArrayQ, "Embeddings" ]; ConfirmAssert[ Length @ values === Length @ embeddings, "LengthCheck" ]; - Confirm[ $lastAdded = AddToVectorDatabase[ db, embeddings ], "AddToVectorDatabase" ]; + added = Confirm[ $lastAdded = AddToVectorDatabase[ db, embeddings ], "AddToVectorDatabase" ]; Internal`StuffBag[ valueBag, values, 1 ]; - ConfirmMatch[ db[ "Dimensions" ], { Internal`BagLength @ valueBag, $embeddingDimension }, "DimensionCheck" ]; + ConfirmMatch[ added[ "Dimensions" ], { Internal`BagLength @ valueBag, $embeddingDimension }, "DimensionCheck" ]; embeddings ]; @@ -729,6 +741,46 @@ embeddingHash[ string_String ] := (* ::Section::Closed:: *) (*Misc Utilities*) +(* ::**************************************************************************************************************:: *) +(* ::Subsection::Closed:: *) +(*getVectorDBSourceDirectory*) +getVectorDBSourceDirectory // ClearAll; + +getVectorDBSourceDirectory[ ] := Enclose[ + getVectorDBSourceDirectory[ ] = Confirm @ SelectFirst[ + { + ReleaseHold @ PersistentSymbol[ "ChatbookDeveloper/VectorDatabaseSourceDirectory" ], + GeneralUtilities`EnsureDirectory @ $defaultVectorDBSourceDirectory + }, + DirectoryQ, + $Failed + ] +]; + +(* ::**************************************************************************************************************:: *) +(* ::Subsection::Closed:: *) +(*getVectorDBSourceFile*) +getVectorDBSourceFile // ClearAll; + +getVectorDBSourceFile[ name_String ] := + Enclose @ Catch @ Module[ { dir, jsonl, wl, as, url, downloaded }, + dir = ConfirmBy[ getVectorDBSourceDirectory[ ], DirectoryQ, "Directory" ]; + jsonl = FileNameJoin @ { dir, name<>".jsonl" }; + If[ FileExistsQ @ jsonl, Throw @ jsonl ]; + wl = ConfirmBy[ FileNameJoin @ { dir, name<>".wl" }, FileExistsQ, "File" ]; + as = ConfirmBy[ Get @ wl, AssociationQ, "Data" ]; + url = ConfirmMatch[ as[ "Location" ], _String|_CloudObject|_URL, "URL" ]; + downloaded = ConfirmBy[ URLDownload[ url, jsonl ], FileExistsQ, "Download" ]; + ConfirmBy[ jsonl, FileExistsQ, "Result" ] + ]; + +getVectorDBSourceFile[ All ] := + Enclose @ Module[ { dir, names }, + dir = ConfirmBy[ getVectorDBSourceDirectory[ ], DirectoryQ, "Directory" ]; + names = Union[ FileBaseName /@ FileNames[ { "*.jsonl", "*.wl" }, dir ] ]; + getVectorDBSourceFile /@ names + ]; + (* ::**************************************************************************************************************:: *) (* ::Subsection::Closed:: *) (*withProgress*) diff --git a/Scripts/.githooks/post-checkout b/Scripts/.githooks/post-checkout deleted file mode 100644 index ca7fcb40..00000000 --- a/Scripts/.githooks/post-checkout +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/sh -command -v git-lfs >/dev/null 2>&1 || { echo >&2 "\nThis repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'post-checkout' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks').\n"; exit 2; } -git lfs post-checkout "$@" diff --git a/Scripts/.githooks/post-commit b/Scripts/.githooks/post-commit deleted file mode 100644 index 52b339cb..00000000 --- a/Scripts/.githooks/post-commit +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/sh -command -v git-lfs >/dev/null 2>&1 || { echo >&2 "\nThis repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'post-commit' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks').\n"; exit 2; } -git lfs post-commit "$@" diff --git a/Scripts/.githooks/post-merge b/Scripts/.githooks/post-merge deleted file mode 100644 index a912e667..00000000 --- a/Scripts/.githooks/post-merge +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/sh -command -v git-lfs >/dev/null 2>&1 || { echo >&2 "\nThis repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'post-merge' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks').\n"; exit 2; } -git lfs post-merge "$@" diff --git a/Scripts/.githooks/pre-push b/Scripts/.githooks/pre-push deleted file mode 100644 index 0f0089bc..00000000 --- a/Scripts/.githooks/pre-push +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/sh -command -v git-lfs >/dev/null 2>&1 || { echo >&2 "\nThis repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'pre-push' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks').\n"; exit 2; } -git lfs pre-push "$@" diff --git a/Scripts/BuildVectorDatabases.wls b/Scripts/BuildVectorDatabases.wls new file mode 100644 index 00000000..404c4e53 --- /dev/null +++ b/Scripts/BuildVectorDatabases.wls @@ -0,0 +1,32 @@ +#!/usr/bin/env wolframscript + +BeginPackage[ "Wolfram`ChatbookScripts`" ]; +If[ ! TrueQ @ $loadedDefinitions, Get @ FileNameJoin @ { DirectoryName @ $InputFileName, "Common.wl" } ]; + +(* :!CodeAnalysis::BeginBlock:: *) +(* :!CodeAnalysis::Disable::SuspiciousSessionSymbol:: *) + +(* ::**************************************************************************************************************:: *) +(* ::Section::Closed:: *) +(*Initialization*) +Needs[ "Wolfram`PacletCICD`" -> "cicd`" ]; + +(* ::**************************************************************************************************************:: *) +(* ::Section::Closed:: *) +(*Paths*) +$sourceFile = cFile @ FileNameJoin @ { $pacletDir, "Developer", "VectorDatabases", "VectorDatabaseBuilder.wl" }; +$sourceDir = cDir @ FileNameJoin @ { $pacletDir, "Developer", "VectorDatabases", "SourceData" }; + +(* ::**************************************************************************************************************:: *) +(* ::Section::Closed:: *) +(*Run*) +cicd`ScriptConfirmMatch[ DeleteFile /@ FileNames[ "*.jsonl", $sourceDir ], { Null... } ]; +cicd`ScriptConfirmMatch[ Get @ $sourceFile, Null ]; +result = cicd`ScriptConfirmBy[ BuildVectorDatabase @ All, AssociationQ ]; +cicd`ConfirmAssert @ AllTrue[ result, MatchQ[ _VectorDatabaseObject ] ]; + +(* :!CodeAnalysis::EndBlock:: *) + +EndPackage[ ]; + +Wolfram`ChatbookScripts`result \ No newline at end of file diff --git a/Source/Chatbook/Main.wl b/Source/Chatbook/Main.wl index fb2c61dd..3ad445c3 100644 --- a/Source/Chatbook/Main.wl +++ b/Source/Chatbook/Main.wl @@ -32,6 +32,7 @@ BeginPackage[ "Wolfram`Chatbook`" ]; `$LastChatbookFailure; `$LastChatbookFailureText; `$NotebookAssistanceInputs; +`$RelatedDocumentationSources; `$SandboxKernel; `$ToolFunctions; `$WorkspaceChat; diff --git a/Source/Chatbook/PromptGenerators/RelatedDocumentation.wl b/Source/Chatbook/PromptGenerators/RelatedDocumentation.wl index c8538c49..ccb4acb9 100644 --- a/Source/Chatbook/PromptGenerators/RelatedDocumentation.wl +++ b/Source/Chatbook/PromptGenerators/RelatedDocumentation.wl @@ -24,10 +24,28 @@ $resourceSnippetsCacheDirectory := $resourceSnippetsCacheDirectory = $rerankMethod := CurrentChatSettings[ "DocumentationRerankMethod" ]; +$defaultSources = { "DataRepository", "Documentation", "FunctionRepository" }; + +$sourceAliases = <| + "DataRepository" -> "DataRepositoryURIs", + "Documentation" -> "DocumentationURIs", + "FunctionRepository" -> "FunctionRepositoryURIs" +|>; + + +$minUnfilteredItems = 20; +$unfilteredItemsPerSource = 10; + (* ::**************************************************************************************************************:: *) (* ::Section::Closed:: *) (*Messages*) Chatbook::CloudDownloadError = "Unable to download required data from the cloud. Please try again later."; +Chatbook::InvalidSources = "Invalid value for the \"Sources\" option: `1`."; + +(* ::**************************************************************************************************************:: *) +(* ::Section::Closed:: *) +(*$RelatedDocumentationSources*) +$RelatedDocumentationSources = $defaultSources; (* ::**************************************************************************************************************:: *) (* ::Section::Closed:: *) @@ -36,7 +54,9 @@ RelatedDocumentation // beginDefinition; RelatedDocumentation // Options = { "FilteredCount" -> Automatic, "FilterResults" -> Automatic, - "MaxItems" -> 20 + "MaxItems" -> Automatic, + "RerankMethod" -> Automatic, + "Sources" :> $RelatedDocumentationSources }; GeneralUtilities`SetUsage[ RelatedDocumentation, "\ @@ -62,27 +82,32 @@ RelatedDocumentation[ prompt_, count: _Integer | UpTo[ _Integer ], opts: Options RelatedDocumentation[ prompt, Automatic, count, opts ]; RelatedDocumentation[ prompt_, property_, opts: OptionsPattern[ ] ] := - catchMine @ RelatedDocumentation[ prompt, property, OptionValue @ MaxItems, opts ]; + catchMine @ RelatedDocumentation[ + prompt, + property, + getMaxItems[ OptionValue @ MaxItems, getSources @ OptionValue[ "Sources" ] ], + opts + ]; RelatedDocumentation[ prompt_, Automatic, count_, opts: OptionsPattern[ ] ] := RelatedDocumentation[ prompt, "URIs", count, opts ]; RelatedDocumentation[ prompt: $$prompt, "URIs", Automatic, opts: OptionsPattern[ ] ] := catchMine @ Enclose[ (* TODO: filter results *) - ConfirmMatch[ vectorDBSearch[ "DocumentationURIs", prompt, "Values" ], { ___String }, "Queries" ], + ConfirmMatch[ vectorDBSearch[ getSources @ OptionValue[ "Sources" ], prompt, "Values" ], { ___String }, "Queries" ], throwInternalFailure ]; RelatedDocumentation[ All, "URIs", Automatic, opts: OptionsPattern[ ] ] := catchMine @ Enclose[ (* TODO: filter results *) - Union @ ConfirmMatch[ vectorDBSearch[ "DocumentationURIs", All ], { __String }, "QueryList" ], + Union @ ConfirmMatch[ vectorDBSearch[ getSources @ OptionValue[ "Sources" ], All ], { __String }, "QueryList" ], throwInternalFailure ]; RelatedDocumentation[ prompt: $$prompt, "Snippets", Automatic, opts: OptionsPattern[ ] ] := catchMine @ Enclose[ ConfirmMatch[ (* TODO: filter results *) - DeleteMissing[ makeDocSnippets @ vectorDBSearch[ "DocumentationURIs", prompt, "Values" ] ], + DeleteMissing[ makeDocSnippets @ vectorDBSearch[ getSources @ OptionValue[ "Sources" ], prompt, "Values" ] ], { ___String }, "Snippets" ], @@ -105,7 +130,14 @@ RelatedDocumentation[ ] := catchMine @ Enclose[ (* TODO: filter results *) - Take[ ConfirmBy[ vectorDBSearch[ "DocumentationURIs", prompt, property ], ListQ, "Results" ], UpTo @ n ], + Take[ + ConfirmBy[ + vectorDBSearch[ getSources @ OptionValue[ "Sources" ], prompt, property ], + ListQ, + "Results" + ], + UpTo @ n + ], throwInternalFailure ]; @@ -126,11 +158,19 @@ RelatedDocumentation[ prompt_, property: "Index"|"Distance", n_Integer, opts: Op ]; RelatedDocumentation[ prompt_, "Prompt", n_Integer, opts: OptionsPattern[ ] ] := - catchMine @ relatedDocumentationPrompt[ - ensureChatMessages @ prompt, - n, - MatchQ[ OptionValue[ "FilterResults" ], Automatic|True ], - Replace[ OptionValue[ "FilteredCount" ], Automatic -> Ceiling[ n / 4 ] ] + catchMine @ Block[ + { + $rerankMethod = Replace[ + OptionValue[ "RerankMethod" ], + $$unspecified :> CurrentChatSettings[ "DocumentationRerankMethod" ] + ] + }, + relatedDocumentationPrompt[ + ensureChatMessages @ prompt, + n, + MatchQ[ OptionValue[ "FilterResults" ], Automatic|True ], + Replace[ OptionValue[ "FilteredCount" ], Automatic -> Ceiling[ n / 4 ] ] + ] ]; RelatedDocumentation[ args___ ] := catchMine @ throwFailure[ @@ -141,6 +181,33 @@ RelatedDocumentation[ args___ ] := catchMine @ throwFailure[ RelatedDocumentation // endDefinition; +(* ::**************************************************************************************************************:: *) +(* ::Subsection::Closed:: *) +(*getMaxItems*) +getMaxItems // beginDefinition; +getMaxItems[ $$unspecified, sources_List ] := Max[ $minUnfilteredItems, $unfilteredItemsPerSource * Length @ sources ]; +getMaxItems[ Infinity, _ ] := 50; +getMaxItems[ n: $$size, _ ] := Ceiling @ n; +getMaxItems[ UpTo[ n_ ], sources_ ] := getMaxItems[ n, sources ]; +getMaxItems // endDefinition; + +(* ::**************************************************************************************************************:: *) +(* ::Subsection::Closed:: *) +(*getSources*) +getSources // beginDefinition; +getSources[ names_List ] := toSource /@ Flatten @ names; +getSources[ name_String ] := { toSource @ name }; +getSources[ Automatic|All ] := toSource /@ $defaultSources; +getSources[ source_ ] := throwFailure[ "InvalidSources", source ]; +getSources // endDefinition; + +(* ::**************************************************************************************************************:: *) +(* ::Subsubsection::Closed:: *) +(*toSource*) +toSource // beginDefinition; +toSource[ name_String ] := Lookup[ $sourceAliases, name, name ]; +toSource // endDefinition; + (* ::**************************************************************************************************************:: *) (* ::Subsection::Closed:: *) (*ensureChatMessages*) @@ -211,6 +278,20 @@ filterSnippets[ messages_, uris: { __String }, Except[ True ], filterCount_ ] := ]; +filterSnippets[ + messages_, + uris: { __String }, + True, + filterCount_Integer? Positive +] /; $rerankMethod === None := Enclose[ + Catch @ Module[ { snippets }, + snippets = ConfirmMatch[ makeDocSnippets @ uris, { ___String }, "Snippets" ]; + Take[ snippets, UpTo[ filterCount ] ] + ], + throwInternalFailure +]; + + filterSnippets[ messages_, uris: { __String }, @@ -305,7 +386,7 @@ $bestDocumentationPrompt = StringTemplate[ "\ Your task is to read a chat transcript between a user and assistant, and then select any relevant Wolfram Language \ documentation snippets that could help the assistant answer the user's latest message. -Each snippet is uniquely identified by a URI (always starts with 'paclet:' or 'https://resources.wolframcloud.com'). +Each snippet is uniquely identified by a URI (always starts with 'paclet:' or 'https://*.wolframcloud.com'). Choose up to %%FilteredCount%% documentation snippets that would help answer the user's MOST RECENT message. @@ -512,6 +593,13 @@ snippetCacheFile[ uri_String ] /; StringStartsQ[ uri, "paclet:" ] := snippetCacheFile[ uri_String ] /; StringStartsQ[ uri, "https://resources.wolframcloud.com/" ] := snippetCacheFile[ uri, StringDelete[ uri, "https://resources.wolframcloud.com/" ], "ResourceSystem" ]; +snippetCacheFile[ uri_String ] /; StringStartsQ[ uri, "https://datarepository.wolframcloud.com/" ] := + snippetCacheFile[ + uri, + "DataRepository" <> StringDelete[ uri, "https://datarepository.wolframcloud.com/" ], + "ResourceSystem" + ]; + snippetCacheFile[ uri_String, path0_String, name_String ] := Enclose[ Module[ { path, file }, path = ConfirmBy[ StringTrim[ path0, "/" ] <> ".wxf", StringQ, "Path" ]; @@ -613,6 +701,9 @@ toDocSnippetURL0 // beginDefinition; toDocSnippetURL0[ { "resources.wolframcloud.com", { "", repo_String, "resources", name_String } } ] := URLBuild @ { $resourceSnippetBaseURL, repo, name <> ".wxf" }; +toDocSnippetURL0[ { "datarepository.wolframcloud.com", { "", "resources", name_String } } ] := + URLBuild @ { $resourceSnippetBaseURL, "DataRepository", name <> ".wxf" }; + toDocSnippetURL0 // endDefinition; (* ::**************************************************************************************************************:: *) diff --git a/Source/Chatbook/PromptGenerators/VectorDatabases.wl b/Source/Chatbook/PromptGenerators/VectorDatabases.wl index 0c01d9cf..c091721e 100644 --- a/Source/Chatbook/PromptGenerators/VectorDatabases.wl +++ b/Source/Chatbook/PromptGenerators/VectorDatabases.wl @@ -15,8 +15,14 @@ HoldComplete[ (* ::**************************************************************************************************************:: *) (* ::Section::Closed:: *) (*Configuration*) -$vectorDBNames = { "DocumentationURIs", "WolframAlphaQueries" }; -$dbVersion = "1.2.0"; +$vectorDatabases = <| + "DataRepositoryURIs" -> <| "Version" -> "1.0.0", "Bias" -> 1.0 |>, + "DocumentationURIs" -> <| "Version" -> "1.3.0", "Bias" -> 0.0 |>, + "FunctionRepositoryURIs" -> <| "Version" -> "1.0.0", "Bias" -> 1.0 |>, + "WolframAlphaQueries" -> <| "Version" -> "1.3.0", "Bias" -> 0.0 |> +|>; + +$vectorDBNames = Keys @ $vectorDatabases; $allowDownload = True; $cacheEmbeddings = True; @@ -42,9 +48,8 @@ $maxExtraFiles = 20; (*Remote Content Locations*) $baseVectorDatabasesURL = "https://www.wolframcloud.com/obj/wolframai-content/VectorDatabases"; -(* TODO: these will be moved to the data repository: *) $vectorDBDownloadURLs = AssociationMap[ - URLBuild @ { $baseVectorDatabasesURL, $dbVersion, # <> ".zip" } &, + URLBuild @ { $baseVectorDatabasesURL, #, $vectorDatabases[ #, "Version" ], # <> ".zip" } &, $vectorDBNames ]; @@ -52,13 +57,18 @@ $vectorDBDownloadURLs = AssociationMap[ (* ::Subsection::Closed:: *) (*Paths*) $pacletVectorDBDirectory := FileNameJoin @ { $thisPaclet[ "Location" ], "Assets/VectorDatabases" }; -$localVectorDBDirectory := ChatbookFilesDirectory @ { "VectorDatabases", $dbVersion }; +$localVectorDBDirectory := ChatbookFilesDirectory[ "VectorDatabases" ]; +$cloudVectorDBDirectory := PacletObject[ "Wolfram/NotebookAssistantCloudResources" ][ "AssetLocation", "VectorDatabases" ]; (* ::**************************************************************************************************************:: *) (* ::Subsection::Closed:: *) (*Argument Patterns*) $$vectorDatabase = HoldPattern[ _VectorDatabaseObject? System`Private`ValidQ ]; +$$dbName = Alternatives @@ $vectorDBNames; +$$dbNames = { $$dbName... }; +$$dbNameOrNames = $$dbName | $$dbNames; + (* ::**************************************************************************************************************:: *) (* ::Subsection::Closed:: *) (*Cache*) @@ -96,6 +106,7 @@ getVectorDBDirectory[ ] := Enclose[ $vectorDBDirectory = SelectFirst[ { $pacletVectorDBDirectory, + If[ $CloudEvaluation, $cloudVectorDBDirectory, Nothing ], $localVectorDBDirectory }, vectorDBDirectoryQ, @@ -118,11 +129,15 @@ vectorDBDirectoryQ // endDefinition; vectorDBDirectoryQ0 // beginDefinition; vectorDBDirectoryQ0[ dir_? DirectoryQ ] := Enclose[ - Module[ { name, existsQ, expected }, - name = ConfirmBy[ FileBaseName @ dir, StringQ, "Name" ]; - existsQ = FileExistsQ @ FileNameJoin @ { dir, # } &; - expected = { name <> ".wxf", "Values.wxf", name <> "-vectors.usearch" }; - TrueQ @ AllTrue[ expected, existsQ ] + Module[ { name, existsQ, expected, versionFile, expectedVersion }, + + name = ConfirmBy[ FileBaseName @ dir, StringQ, "Name" ]; + existsQ = FileExistsQ @ FileNameJoin @ { dir, # } &; + expected = { name <> ".wxf", "Values.wxf", name <> "-vectors.usearch" }; + versionFile = FileNameJoin @ { dir, "Version.wl" }; + expectedVersion = ConfirmBy[ $vectorDatabases[ name, "Version" ], StringQ, "ExpectedVersion" ]; + + TrueQ[ AllTrue[ expected, existsQ ] && FileExistsQ @ versionFile && Get @ versionFile === expectedVersion ] ], throwInternalFailure ]; @@ -142,12 +157,15 @@ downloadVectorDatabases[ ] /; ! $allowDownload := downloadVectorDatabases[ ] := downloadVectorDatabases[ $localVectorDBDirectory, $vectorDBDownloadURLs ]; -downloadVectorDatabases[ dir0_, urls_Association ] := Enclose[ - Module[ { dir, lock, names, sizes, tasks }, +downloadVectorDatabases[ dir0_, urls0_Association ] := Enclose[ + Module[ { dir, lock, names, urls, sizes, tasks }, + + dir = ConfirmBy[ GeneralUtilities`EnsureDirectory @ dir0, DirectoryQ, "Directory" ]; + cleanupLegacyVectorDBFiles @ dir; + names = Select[ $vectorDBNames, ! DirectoryQ @ FileNameJoin @ { dir, # } & ]; + urls = KeyTake[ urls0, names ]; - dir = ConfirmBy[ GeneralUtilities`EnsureDirectory @ dir0, DirectoryQ, "Directory" ]; lock = FileNameJoin @ { dir, "download.lock" }; - names = ConfirmMatch[ Keys @ urls, { __String }, "Names" ]; sizes = ConfirmMatch[ getDownloadSize /@ Values @ urls, { __? Positive }, "Sizes" ]; $downloadProgress = AssociationMap[ 0 &, names ]; @@ -180,6 +198,21 @@ downloadVectorDatabases[ dir0_, urls_Association ] := Enclose[ downloadVectorDatabases // endDefinition; +(* ::**************************************************************************************************************:: *) +(* ::Subsubsection::Closed:: *) +(*cleanupLegacyVectorDBFiles*) +cleanupLegacyVectorDBFiles // beginDefinition; + +cleanupLegacyVectorDBFiles[ dir_String ] := Quiet @ Map[ + DeleteDirectory[ #1, DeleteContents -> True ] &, + Join[ + Select[ FileNames[ DigitCharacter.. ~~ "." ~~ DigitCharacter.. ~~ "." ~~ DigitCharacter.., dir ], DirectoryQ ], + Select[ FileNames[ $vectorDBNames, dir ], DirectoryQ[ # ] && ! vectorDBDirectoryQ0[ # ] & ] + ] +]; + +cleanupLegacyVectorDBFiles // endDefinition; + (* ::**************************************************************************************************************:: *) (* ::Subsubsection::Closed:: *) (*getDownloadSize*) @@ -210,10 +243,15 @@ unpackVectorDatabases // endDefinition; unpackVectorDatabase // beginDefinition; unpackVectorDatabase[ zip_String? FileExistsQ ] := Enclose[ - Module[ { root, dir, res }, + Module[ { name, version, root, dir, res, versionFile }, + name = ConfirmBy[ FileBaseName @ zip, StringQ, "Name" ]; + version = ConfirmBy[ $vectorDatabases[ name, "Version" ], StringQ, "Version" ]; root = ConfirmBy[ DirectoryName @ zip, DirectoryQ, "RootDirectory" ]; - dir = ConfirmBy[ GeneralUtilities`EnsureDirectory @ { root, FileBaseName @ zip }, DirectoryQ, "Directory" ]; + dir = ConfirmBy[ GeneralUtilities`EnsureDirectory @ { root, name }, DirectoryQ, "Directory" ]; res = ConfirmMatch[ ExtractArchive[ zip, dir, OverwriteTarget -> True ], { __? FileExistsQ }, "Extracted" ]; + versionFile = FileNameJoin @ { dir, "Version.wl" }; + Put[ version, versionFile ]; + ConfirmAssert[ Get @ versionFile === version, "VersionCheck" ]; DeleteFile @ zip; res ], @@ -366,24 +404,38 @@ loadVectorDBValues // endDefinition; (*vectorDBSearch*) vectorDBSearch // beginDefinition; -vectorDBSearch[ dbName_String, prompt_String ] := - vectorDBSearch[ dbName, prompt, All ]; -vectorDBSearch[ dbName_String, All ] := - vectorDBSearch[ dbName, All, "Values" ]; +(* Default arguments: *) +vectorDBSearch[ db: $$dbNameOrNames, prompt_String ] := + vectorDBSearch[ db, prompt, All ]; + +vectorDBSearch[ db: $$dbNameOrNames, All ] := + vectorDBSearch[ db, All, "Values" ]; + -vectorDBSearch[ dbName_String, "", All ] := <| +(* Shortcuts: *) +vectorDBSearch[ $$dbNameOrNames, "", All ] := <| "EmbeddingVector" -> None, "SearchData" -> Missing[ "NoInput" ], "Values" -> { } |>; -vectorDBSearch[ dbName_String, prompt_String, All ] := +vectorDBSearch[ $$dbNameOrNames, "", "Results"|"Values" ] := + { }; + +vectorDBSearch[ { }, prompt_, prop_ ] := + { }; + + +(* Cached results: *) +vectorDBSearch[ dbName: $$dbName, prompt_String, All ] := With[ { result = $vectorDBSearchCache[ dbName, prompt ] }, result /; AssociationQ @ result ]; -vectorDBSearch[ dbName_String, prompt_String, All ] := Enclose[ + +(* Main definition for string prompt: *) +vectorDBSearch[ dbName: $$dbName, prompt_String, All ] := Enclose[ Module[ { vectorDBInfo, vectorDB, allValues, embeddingVector, close, indices, distances, values, data, result }, vectorDBInfo = ConfirmBy[ getVectorDB @ dbName, AssociationQ, "VectorDBInfo" ]; @@ -410,7 +462,7 @@ vectorDBSearch[ dbName_String, prompt_String, All ] := Enclose[ ConfirmAssert[ Length @ indices === Length @ distances === Length @ values, "LengthCheck" ]; data = MapApply[ - <| "Value" -> #1, "Index" -> #2, "Distance" -> #3 |> &, + <| "Value" -> #1, "Index" -> #2, "Distance" -> #3, "Source" -> dbName |> &, Transpose @ { values, indices, distances } ]; @@ -425,28 +477,9 @@ vectorDBSearch[ dbName_String, prompt_String, All ] := Enclose[ throwInternalFailure ]; -vectorDBSearch[ dbName_String, prompt_String, key_String ] := Enclose[ - Lookup[ ConfirmBy[ vectorDBSearch[ dbName, prompt, All ], AssociationQ, "Result" ], key ], - throwInternalFailure -]; - -vectorDBSearch[ dbName_String, prompt_String, keys: { ___String } ] := Enclose[ - KeyTake[ ConfirmBy[ vectorDBSearch[ dbName, prompt, All ], AssociationQ, "Result" ], keys ], - throwInternalFailure -]; - -vectorDBSearch[ dbName_String, prompts: { ___String }, prop_ ] := - AssociationMap[ vectorDBSearch[ dbName, #, prop ] &, prompts ]; -vectorDBSearch[ dbName_String, All, "Values" ] := Enclose[ - Module[ { vectorDBInfo }, - vectorDBInfo = ConfirmBy[ getVectorDB @ dbName, AssociationQ, "VectorDB" ]; - ConfirmBy[ vectorDBInfo[ "Values" ], ListQ, "Values" ] - ], - throwInternalFailure -]; - -vectorDBSearch[ dbName_String, messages0: { __Association }, prop: "Values"|"Results" ] := Enclose[ +(* Main definition for a list of messages: *) +vectorDBSearch[ dbName: $$dbName, messages0: { __Association }, prop: "Values"|"Results" ] := Enclose[ Catch @ Module[ { messages, @@ -529,8 +562,86 @@ vectorDBSearch[ dbName_String, messages0: { __Association }, prop: "Values"|"Res throwInternalFailure ]; + +(* Properties: *) +vectorDBSearch[ db_, prompt_String, "EmbeddingVector" ] := Enclose[ + ConfirmMatch[ getEmbedding @ prompt, _NumericArray, "EmbeddingVector" ], + throwInternalFailure +]; + +vectorDBSearch[ dbName: $$dbName, prompt_String, key_String ] := Enclose[ + Lookup[ ConfirmBy[ vectorDBSearch[ dbName, prompt, All ], AssociationQ, "Result" ], key ], + throwInternalFailure +]; + +vectorDBSearch[ dbName: $$dbName, prompt_String, keys: { ___String } ] := Enclose[ + KeyTake[ ConfirmBy[ vectorDBSearch[ dbName, prompt, All ], AssociationQ, "Result" ], keys ], + throwInternalFailure +]; + +vectorDBSearch[ dbName: $$dbName, prompts: { ___String }, prop_ ] := + AssociationMap[ vectorDBSearch[ dbName, #, prop ] &, prompts ]; + + +(* Full list of possible values: *) +vectorDBSearch[ dbName: $$dbName, All, "Values" ] := Enclose[ + Module[ { vectorDBInfo }, + vectorDBInfo = ConfirmBy[ getVectorDB @ dbName, AssociationQ, "VectorDB" ]; + ConfirmBy[ vectorDBInfo[ "Values" ], ListQ, "Values" ] + ], + throwInternalFailure +]; + +vectorDBSearch[ names: $$dbNames, All, "Values" ] := + Flatten[ vectorDBSearch[ #, All, "Values" ] & /@ names ]; + + +(* Combine results from multiple vector databases: *) +vectorDBSearch[ names: $$dbNames, prompt_, prop: "Values"|"Results" ] := Enclose[ + Catch @ Module[ { results, sorted }, + + results = ConfirmMatch[ + applyBias[ #, vectorDBSearch[ #, prompt, "Results" ] ] & /@ names, + { { KeyValuePattern[ "Distance" -> $$size ].. }... }, + "Results" + ]; + + sorted = SortBy[ Flatten @ results, #Distance & ]; + + If[ prop === "Results", + sorted, + ConfirmMatch[ + DeleteDuplicates @ Lookup[ sorted, "Value" ], + { __String }, + "Values" + ] + ] + ], + throwInternalFailure +]; + +vectorDBSearch[ names: $$dbNames, prompt_, All ] := + Merge[ vectorDBSearch[ #, prompt, All ] & /@ names, Flatten ]; + +vectorDBSearch[ All, prompt_ ] := + vectorDBSearch[ $vectorDBNames, prompt ]; + +vectorDBSearch[ All, prompt_, prop_ ] := + vectorDBSearch[ $vectorDBNames, prompt, prop ]; + + vectorDBSearch // endDefinition; +(* ::**************************************************************************************************************:: *) +(* ::Subsubsection::Closed:: *) +(*applyBias*) +applyBias // beginDefinition; +applyBias[ name_String, results_ ] := applyBias[ $vectorDatabases[ name, "Bias" ], results ]; +applyBias[ None | _Missing | 0 | 0.0, results_ ] := results; +applyBias[ bias_, results_List ] := (applyBias[ bias, #1 ] &) /@ results; +applyBias[ bias: $$size, as: KeyValuePattern[ "Distance" -> d: $$size ] ] := <| as, "Distance" -> d + bias |>; +applyBias // endDefinition; + (* ::**************************************************************************************************************:: *) (* ::Subsubsection::Closed:: *) (*insertContextPrompt*) @@ -562,7 +673,7 @@ insertContextPrompt // endDefinition; (*cacheVectorDBResult*) cacheVectorDBResult // beginDefinition; -cacheVectorDBResult[ dbName_String, prompt_String, data_Association ] := ( +cacheVectorDBResult[ dbName: $$dbName, prompt_String, data_Association ] := ( If[ ! AssociationQ @ $vectorDBSearchCache, $vectorDBSearchCache = <| |> ]; If[ ! AssociationQ @ $vectorDBSearchCache[ dbName ], $vectorDBSearchCache[ dbName ] = <| |> ]; $vectorDBSearchCache[ dbName, prompt ] = data