dsg-polymtl · dorbanianas · Dec 24, 2024 · Dec 25, 2024 · Dec 25, 2024 · Jan 12, 2025
diff --git a/src/functions/aggregate/llm_first_or_last/implementation.cpp b/src/functions/aggregate/llm_first_or_last/implementation.cpp
@@ -32,11 +32,13 @@ nlohmann::json LlmFirstOrLast::Evaluate(nlohmann::json& tuples) {
     do {
         accumulated_tuples_tokens = Tiktoken::GetNumTokens(batch_tuples.dump());
         accumulated_tuples_tokens +=
-            Tiktoken::GetNumTokens(PromptManager::ConstructMarkdownHeader(tuples[start_index]));
+            Tiktoken::GetNumTokens(PromptManager::ConstructNumTuples(static_cast<int>(tuples.size())));
+        accumulated_tuples_tokens +=
+            Tiktoken::GetNumTokens(PromptManager::ConstructInputTuplesHeader(tuples[start_index]));
         while (accumulated_tuples_tokens < static_cast<unsigned int>(available_tokens) &&
                start_index < static_cast<int>(tuples.size())) {
             const auto num_tokens =
-                Tiktoken::GetNumTokens(PromptManager::ConstructMarkdownSingleTuple(tuples[start_index]));
+                Tiktoken::GetNumTokens(PromptManager::ConstructSingleInputTuple(tuples[start_index]));
             if (accumulated_tuples_tokens + num_tokens > static_cast<unsigned int>(available_tokens)) {
                 break;
             }

diff --git a/src/functions/aggregate/llm_reduce/implementation.cpp b/src/functions/aggregate/llm_reduce/implementation.cpp
@@ -33,11 +33,13 @@ nlohmann::json LlmReduce::ReduceLoop(const std::vector<nlohmann::json>& tuples,
     do {
         accumulated_tuples_tokens = Tiktoken::GetNumTokens(batch_tuples.dump());
         accumulated_tuples_tokens +=
-            Tiktoken::GetNumTokens(PromptManager::ConstructMarkdownHeader(tuples[start_index]));
+            Tiktoken::GetNumTokens(PromptManager::ConstructNumTuples(static_cast<int>(tuples.size())));
+        accumulated_tuples_tokens +=
+            Tiktoken::GetNumTokens(PromptManager::ConstructInputTuplesHeader(tuples[start_index]));
         while (accumulated_tuples_tokens < static_cast<unsigned int>(available_tokens) &&
                start_index < static_cast<int>(tuples.size())) {
             const auto num_tokens =
-                Tiktoken::GetNumTokens(PromptManager::ConstructMarkdownSingleTuple(tuples[start_index]));
+                Tiktoken::GetNumTokens(PromptManager::ConstructSingleInputTuple(tuples[start_index]));
             if (accumulated_tuples_tokens + num_tokens > static_cast<unsigned int>(available_tokens)) {
                 break;
             }

diff --git a/src/functions/aggregate/llm_rerank/implementation.cpp b/src/functions/aggregate/llm_rerank/implementation.cpp
@@ -40,9 +40,12 @@ nlohmann::json LlmRerank::SlidingWindow(nlohmann::json& tuples) {
         next_tuples.clear();
         batch_size = half_batch;
         accumulated_rows_tokens = Tiktoken::GetNumTokens(window_tuples.dump());
-        accumulated_rows_tokens += Tiktoken::GetNumTokens(PromptManager::ConstructMarkdownHeader(tuples[start_index]));
+        accumulated_rows_tokens +=
+            Tiktoken::GetNumTokens(PromptManager::ConstructNumTuples(static_cast<int>(tuples.size())));
+        accumulated_rows_tokens +=
+            Tiktoken::GetNumTokens(PromptManager::ConstructInputTuplesHeader(tuples[start_index]));
         while (available_tokens - accumulated_rows_tokens > 0 && start_index >= 0) {
-            auto num_tokens = Tiktoken::GetNumTokens(PromptManager::ConstructMarkdownSingleTuple(tuples[start_index]));
+            auto num_tokens = Tiktoken::GetNumTokens(PromptManager::ConstructSingleInputTuple(tuples[start_index]));
             if (accumulated_rows_tokens + num_tokens > static_cast<unsigned int>(available_tokens)) {
                 break;
             }

diff --git a/src/functions/scalar/llm_embedding/implementation.cpp b/src/functions/scalar/llm_embedding/implementation.cpp
@@ -30,14 +30,26 @@ std::vector<duckdb::vector<duckdb::Value>> LlmEmbedding::Operation(duckdb::DataC
         prepared_inputs.push_back(concat_input);
     }
 
-    auto embeddings = model.CallEmbedding(prepared_inputs);
+    auto batch_size = model.GetModelDetails().batch_size;
+
+    if (batch_size == 0 || batch_size > prepared_inputs.size()) {
+        batch_size = static_cast<int>(prepared_inputs.size());
+    }
+
     std::vector<duckdb::vector<duckdb::Value>> results;
-    for (size_t index = 0; index < embeddings.size(); index++) {
-        duckdb::vector<duckdb::Value> embedding;
-        for (auto& value : embeddings[index]) {
-            embedding.push_back(duckdb::Value(static_cast<double>(value)));
+    for (size_t i = 0; i < prepared_inputs.size(); i += batch_size) {
+        std::vector<std::string> batch_inputs;
+        for (size_t j = i; j < i + batch_size && j < prepared_inputs.size(); j++) {
+            batch_inputs.push_back(prepared_inputs[j]);
+        }
+        auto embeddings = model.CallEmbedding(batch_inputs);
+        for (size_t index = 0; index < embeddings.size(); index++) {
+            duckdb::vector<duckdb::Value> embedding;
+            for (auto& value : embeddings[index]) {
+                embedding.push_back(duckdb::Value(static_cast<double>(value)));
+            }
+            results.push_back(embedding);
         }
-        results.push_back(embedding);
     }
     return results;
 }

diff --git a/src/functions/scalar/llm_filter/implementation.cpp b/src/functions/scalar/llm_filter/implementation.cpp
@@ -36,6 +36,10 @@ std::vector<std::string> LlmFilter::Operation(duckdb::DataChunk& args) {
     std::vector<std::string> results;
     results.reserve(responses.size());
     for (const auto& response : responses) {
+        if (response.is_null()) {
+            results.emplace_back("True");
+            continue;
+        }
         results.push_back(response.dump());
     }
 

diff --git a/src/functions/scalar/scalar.cpp b/src/functions/scalar/scalar.cpp
@@ -18,56 +18,132 @@ nlohmann::json ScalarFunctionBase::BatchAndComplete(const std::vector<nlohmann::
     int num_tokens_meta_and_user_prompt = 0;
     num_tokens_meta_and_user_prompt += Tiktoken::GetNumTokens(user_prompt);
     num_tokens_meta_and_user_prompt += Tiktoken::GetNumTokens(llm_template);
-    const int available_tokens = model.GetModelDetails().context_window - num_tokens_meta_and_user_prompt;
+    const auto model_details = model.GetModelDetails();
+    const auto available_tokens = model_details.context_window - num_tokens_meta_and_user_prompt;
+    auto batch_size = model_details.batch_size;
 
     auto responses = nlohmann::json::array();
 
     if (available_tokens < 0) {
         throw std::runtime_error("The total number of tokens in the prompt exceeds the model's maximum token limit");
     } else {
-
-        auto accumulated_tuples_tokens = 0u;
         auto batch_tuples = nlohmann::json::array();
-        auto batch_size = tuples.size();
         int start_index = 0;
 
-        do {
-            accumulated_tuples_tokens +=
-                Tiktoken::GetNumTokens(PromptManager::ConstructMarkdownHeader(tuples[start_index]));
-            while (accumulated_tuples_tokens < static_cast<unsigned int>(available_tokens) &&
-                   start_index < static_cast<int>(tuples.size()) && batch_tuples.size() < batch_size) {
-                const auto num_tokens =
-                    Tiktoken::GetNumTokens(PromptManager::ConstructMarkdownSingleTuple(tuples[start_index]));
-                if (accumulated_tuples_tokens + num_tokens > static_cast<unsigned int>(available_tokens)) {
-                    break;
+        // No tuples means there is nothing to send to the model.
+        if (tuples.empty()) {
+            return responses;
+        }
+
+        if (batch_size <= 0) {
+            // No usable batch size is configured: fill each batch up to the context window, then resize it
+            // from the number of output tokens the model actually produced per tuple.
+            auto accumulated_tuples_tokens = 0u;
+            batch_size = static_cast<int>(tuples.size());
+            do {
+                accumulated_tuples_tokens +=
+                    Tiktoken::GetNumTokens(PromptManager::ConstructNumTuples(static_cast<int>(tuples.size())));
+                accumulated_tuples_tokens +=
+                    Tiktoken::GetNumTokens(PromptManager::ConstructInputTuplesHeader(tuples[start_index]));
+                while (accumulated_tuples_tokens < static_cast<unsigned int>(available_tokens) &&
+                       start_index < static_cast<int>(tuples.size()) &&
+                       static_cast<int>(batch_tuples.size()) < batch_size) {
+                    const auto num_tokens =
+                        Tiktoken::GetNumTokens(PromptManager::ConstructSingleInputTuple(tuples[start_index]));
+                    if (accumulated_tuples_tokens + num_tokens > static_cast<unsigned int>(available_tokens)) {
+                        break;
+                    }
+                    batch_tuples.push_back(tuples[start_index]);
+                    accumulated_tuples_tokens += num_tokens;
+                    start_index++;
+                }
+
+                // Not even one tuple fits beside the prompt, so looping again could never make progress.
+                if (batch_tuples.empty()) {
+                    throw std::runtime_error(
+                        "A single tuple does not fit in the tokens left in the model's context window");
+                }
+
+                nlohmann::json response;
+                try {
+                    response = Complete(batch_tuples, user_prompt, function_type, model);
+                } catch (const ExceededMaxOutputTokensError&) {
+                    // A one-tuple batch cannot shrink further; rethrow instead of retrying forever.
+                    if (batch_tuples.size() <= 1) {
+                        throw;
+                    }
+                    // Rewind only to the start of the failed batch: earlier batches are already in responses.
+                    start_index -= static_cast<int>(batch_tuples.size());
+                    const auto reduced_batch_size = static_cast<int>(batch_tuples.size()) / 10;
+                    batch_size = reduced_batch_size > 0 ? reduced_batch_size : 1;
+                    batch_tuples.clear();
+                    accumulated_tuples_tokens = 0;
+                    continue;
                 }
-                batch_tuples.push_back(tuples[start_index]);
-                accumulated_tuples_tokens += num_tokens;
-                start_index++;
-            }
-
-            nlohmann::json response;
-            try {
-                response = Complete(batch_tuples, user_prompt, function_type, model);
-            } catch (const ExceededMaxOutputTokensError&) {
+
+                // Align the response one-to-one with the tuples that were actually sent: pad missing answers
+                // with null and drop any extras.
+                if (response.size() < batch_tuples.size()) {
+                    for (auto i = response.size(); i < batch_tuples.size(); i++) {
+                        response.push_back(nullptr);
+                    }
+                } else if (response.size() > batch_tuples.size()) {
+                    auto new_response = nlohmann::json::array();
+                    for (std::size_t i = 0; i < batch_tuples.size(); i++) {
+                        new_response.push_back(response[i]);
+                    }
+                    response = new_response;
+                }
+
+                const auto output_tokens_per_tuple =
+                    static_cast<int>(Tiktoken::GetNumTokens(response.dump()) / batch_tuples.size());
+
+                // Size the next batch from the observed output cost; keep it at least 1 so the loop advances.
+                batch_size = static_cast<int>(tuples.size());
+                if (output_tokens_per_tuple > 0) {
+                    batch_size = model_details.max_output_tokens / output_tokens_per_tuple;
+                }
+                if (batch_size < 1) {
+                    batch_size = 1;
+                }
                 batch_tuples.clear();
-                const auto new_batch_size = static_cast<int>(batch_size * 0.1);
-                batch_size = batch_size == 1 ? new_batch_size == 0 : new_batch_size;
                 accumulated_tuples_tokens = 0;
-                start_index = 0;
-                continue;
-            }
-            auto output_tokens_per_tuple = Tiktoken::GetNumTokens(response.dump()) / batch_tuples.size();
 
-            batch_size = model.GetModelDetails().max_output_tokens / output_tokens_per_tuple;
-            batch_tuples.clear();
-            accumulated_tuples_tokens = 0;
+                for (const auto& tuple : response) {
+                    responses.push_back(tuple);
+                }
+
+            } while (start_index < static_cast<int>(tuples.size()));
+        } else {
+            // A batch size was configured explicitly: send fixed-size slices of the tuples.
+            do {
+                for (auto i = 0; i < batch_size; i++) {
+                    if (start_index + i < static_cast<int>(tuples.size())) {
+                        batch_tuples.push_back(tuples[start_index + i]);
+                    }
+                }
+                start_index += batch_size;
 
-            for (const auto& tuple : response) {
-                responses.push_back(tuple);
-            }
+                auto response = Complete(batch_tuples, user_prompt, function_type, model);
 
-        } while (start_index < static_cast<int>(tuples.size()));
+                // Align the response one-to-one with the tuples that were actually sent.
+                if (response.size() < batch_tuples.size()) {
+                    for (auto i = response.size(); i < batch_tuples.size(); i++) {
+                        response.push_back(nullptr);
+                    }
+                } else if (response.size() > batch_tuples.size()) {
+                    auto new_response = nlohmann::json::array();
+                    for (std::size_t i = 0; i < batch_tuples.size(); i++) {
+                        new_response.push_back(response[i]);
+                    }
+                    response = new_response;
+                }
+                batch_tuples.clear();
+                for (const auto& tuple : response) {
+                    responses.push_back(tuple);
+                }
+            } while (start_index < static_cast<int>(tuples.size()));
+        }
     }
 
     return responses;

diff --git a/src/include/flockmtl/model_manager/repository.hpp b/src/include/flockmtl/model_manager/repository.hpp
@@ -13,6 +13,7 @@ struct ModelDetails {
     int32_t max_output_tokens;
     float temperature;
     std::unordered_map<std::string, std::string> secret;
+    int batch_size;
 };
 
 const std::string OLLAMA = "ollama";

diff --git a/src/include/flockmtl/prompt_manager/prompt_manager.hpp b/src/include/flockmtl/prompt_manager/prompt_manager.hpp
@@ -33,16 +33,18 @@ class PromptManager {
 
     static PromptDetails CreatePromptDetails(const nlohmann::json& prompt_details_json);
 
-    static std::string ConstructMarkdownHeader(const nlohmann::json& tuple);
+    static std::string ConstructNumTuples(int num_tuples);
 
-    static std::string ConstructMarkdownSingleTuple(const nlohmann::json& tuple);
+    static std::string ConstructInputTuplesHeader(const nlohmann::json& tuple);
 
-    static std::string ConstructMarkdownArrayTuples(const nlohmann::json& tuples);
+    static std::string ConstructSingleInputTuple(const nlohmann::json& tuple);
+
+    static std::string ConstructInputTuples(const nlohmann::json& tuples);
 
     template <typename FunctionType>
     static std::string Render(const std::string& user_prompt, const nlohmann::json& tuples, FunctionType option) {
         auto prompt = PromptManager::GetTemplate(option);
-        auto markdown_tuples = PromptManager::ConstructMarkdownArrayTuples(tuples);
+        auto markdown_tuples = PromptManager::ConstructInputTuples(tuples);
         prompt = PromptManager::ReplaceSection(prompt, PromptSection::USER_PROMPT, user_prompt);
         prompt = PromptManager::ReplaceSection(prompt, PromptSection::TUPLES, markdown_tuples);
         return prompt;

diff --git a/src/include/flockmtl/prompt_manager/repository.hpp b/src/include/flockmtl/prompt_manager/repository.hpp
@@ -11,15 +11,16 @@ enum class AggregateFunctionType { REDUCE, REDUCE_JSON, FIRST, LAST, RERANK };
 enum class ScalarFunctionType { COMPLETE_JSON, COMPLETE, FILTER };
 
 constexpr auto META_PROMPT =
-    "You are a semantic analysis tool for DBMS. The tool will analyze each tuple in the provided data and respond to "
-    "user requests based on this context.\n\nUser Prompt:\n\n- {{USER_PROMPT}}\n\nTuples "
+    "You are FlockMTL a semantic analysis tool for DBMS. You will analyze each tuple in the provided data and respond "
+    "to "
+    "the user prompt.\n\nUser Prompt:\n\n- {{USER_PROMPT}}\n\nTuples "
     "Table:\n\n{{TUPLES}}\n\nInstructions:\n\n{{INSTRUCTIONS}}\n\nExpected Response Format:\n\n{{RESPONSE_FORMAT}}";
 
 class INSTRUCTIONS {
 public:
     static constexpr auto SCALAR_FUNCTION =
         "- The response should be directly relevant to each tuple without additional formatting, purely answering the "
-        "prompt as if each tuple were a standalone entity.\n- Use clear, context-relevant language to generate a "
+        "user prompt as if each tuple were a standalone entity.\n- Use clear, context-relevant language to generate a "
         "meaningful and concise answer for each tuple.";
     static constexpr auto AGGREGATE_FUNCTION =
         "- For each tuple in the provided data, evaluate the relevant attribute(s) based on the user prompt.\n- After "
@@ -36,18 +37,18 @@ class RESPONSE_FORMAT {
 public:
     // Scalar Functions
     static constexpr auto COMPLETE_JSON =
-        "The system should interpret database tuples and provide a response to the user's prompt for each tuple in a "
-        "JSON format that contains the necessary columns for the answer.\n\nThe tool should respond in JSON format as "
-        "follows:\n\n```json\n{\t\"tuples\": [\n\t\t{<response 1>},\n\t\t{<response 2>},\n\t\t...\n\t\t{<response "
-        "n>}\n\t]\n}\n```";
+        "You should return the responses to the user's prompt for each tuple in a "
+        "JSON format that contains the necessary columns for the answer.\n\nThe tool should respond in JSON format:\n\n"
+        "```json\n{\"tuples\": [{<response>},{<response>}, ..., {<response>}]}\n```";
     static constexpr auto COMPLETE =
-        "The system should interpret database tuples and provide a response to the user's prompt for each tuple in "
-        "plain text.\n\tThe tool should respond in JSON format as follows:\n\n```json\n{\"tuples\": [\"<response 1>\", "
-        "\"<response 2>\", ... , \"<response n>\"]}";
+        "You should return the responses to the user's prompt for each tuple in plain text. Ensure no tuple is "
+        "missed.\n"
+        "Respond in the following JSON format:\n\n"
+        "```json\n{\"tuples\": [\"<response>\", \"<response>\", ..., \"<response>\"]}\n```";
     static constexpr auto FILTER =
-        "The system should interpret database tuples and provide a response to the user's prompt for each tuple in a "
+        "You should return the responses to the user's prompt for each tuple in a "
         "BOOL format that would be true/false.\n\tThe tool should respond in JSON format as "
-        "follows:\n\n```json\n{\"tuples\": [<bool response 1>, <bool response 2>, ... , <bool response n>]}";
+        "follows:\n\n```json\n{\"tuples\": [<bool_response>, <bool_response>, ... , <bool_response>]}\n```";
 
     // Aggregate Functions
     static constexpr auto REDUCE =

diff --git a/src/model_manager/model.cpp b/src/model_manager/model.cpp
@@ -26,11 +26,13 @@ void Model::LoadModelDetails(const nlohmann::json& model_json) {
     }
     model_details_.secret = SecretManager::GetSecret(secret_name);
     model_details_.context_window =
-        model_json.contains("context_window") ? model_json.at("context_window").get<int>() : std::get<2>(query_result);
+        model_json.contains("context_window") ? std::stoi(model_json.at("context_window").get<std::string>()) : std::get<2>(query_result);
     model_details_.max_output_tokens = model_json.contains("max_output_tokens")
-                                           ? model_json.at("max_output_tokens").get<int>()
+                                           ? std::stoi(model_json.at("max_output_tokens").get<std::string>())
                                            : std::get<3>(query_result);
-    model_details_.temperature = model_json.contains("temperature") ? model_json.at("temperature").get<float>() : 0.5;
+    // Temperature is read as a JSON string like the other overrides; a numeric get<float>() would throw on it.
+    model_details_.temperature = model_json.contains("temperature") ? std::stof(model_json.at("temperature").get<std::string>()) : 0;
+    model_details_.batch_size = model_json.contains("batch_size") ? std::stoi(model_json.at("batch_size").get<std::string>()) : 0;
 }
 
 std::tuple<std::string, std::string, int32_t, int32_t> Model::GetQueriedModel(const std::string& model_name) {