From 20e906ac8227da785692731b3d8ef46d757ffe0c Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Tue, 22 Oct 2024 21:30:29 -0400 Subject: [PATCH 01/35] arangodb prep | initial commit --- .gitignore | 1 + ARANGODB_README.md | 33 +++++++++++++++ langchain_test.py | 101 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 135 insertions(+) create mode 100644 ARANGODB_README.md create mode 100644 langchain_test.py diff --git a/.gitignore b/.gitignore index 1d1e0a3899..9778bf8f78 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ __pycache__ *.egg-info/ .DS_Store +.venv \ No newline at end of file diff --git a/ARANGODB_README.md b/ARANGODB_README.md new file mode 100644 index 0000000000..76f4b7db30 --- /dev/null +++ b/ARANGODB_README.md @@ -0,0 +1,33 @@ +Instructions + +0. Create a virtual environment: + +```bash +python -m venv .venv + +source .venv/bin/activate +``` + +1. Install the required packages: + +```bash +pip install python-arango +pip install langchain_openai +pip install git+https://github.com/arangoml/langchain.git@arangodb#subdirectory=libs/community +``` + +2. Provision the ArangoDB with Vector Index image: + +```bash +docker create --name arango-vector -p 8529:8529 -e ARANGO_ROOT_PASSWORD=test jbajic/arangodb-arm:vector-index-preview + +docker start arango-vector +``` + +3. Set your `OPENAI_API_KEY` environment variable (contact Anthony for access) + +4. 
Run the test script to confirm LangChain is working: + +```bash +python langchain_test.py +``` \ No newline at end of file diff --git a/langchain_test.py b/langchain_test.py new file mode 100644 index 0000000000..e33ea16873 --- /dev/null +++ b/langchain_test.py @@ -0,0 +1,101 @@ +from arango import ArangoClient +from langchain_community.chains.graph_qa.arangodb import ArangoGraphQAChain +from langchain_community.embeddings import OpenAIEmbeddings +from langchain_community.graphs.arangodb_graph import ArangoGraph +from langchain_community.graphs.graph_document import GraphDocument, Node, Relationship +from langchain_community.vectorstores.arangodb_vector import ArangoVector +from langchain_core.documents import Document +from langchain_openai import OpenAI + +system_db = ArangoClient().db("_system", password="test", verify=True) +system_db.delete_database("langchain_test", ignore_missing=True) +system_db.create_database("langchain_test") +db = ArangoClient().db("langchain_test", password="test", verify=True) + +#################### +# Test ArangoGraph # +#################### + +# Create nodes +node1 = Node(id="1", type="Person", properties={"name": "John", "age": 30}) +node2 = Node(id="2", type="Person", properties={"name": "Jane", "age": 28}) +node3 = Node(id="3", type="Club", properties={"name": "Karate Club"}) + +# Create relationships +relationship1 = Relationship(source=node1, target=node3, type="MEMBER_OF", properties={"joined_date": "2020-01-01"}) +relationship2 = Relationship(source=node2, target=node3, type="MEMBER_OF", properties={"joined_date": "2019-05-15"}) +relationship3 = Relationship(source=node1, target=node2, type="KNOWS", properties={"since": "2018-03-10"}) + +# Create source document +source_doc = Document( + page_content="John and Jane are members of the Karate Club. 
They know each other.", + metadata={"source": "club_records"}, +) + +# Create GraphDocument +graph_doc = GraphDocument( + nodes=[node1, node2, node3], relationships=[relationship1, relationship2, relationship3], source=source_doc +) + +arango_graph = ArangoGraph(db=db, include_examples=False) +arango_graph.add_graph_documents([graph_doc], graph_name="NewGraph", include_source=True) + +##################### +# Test ArangoVector # +##################### + +# Add some sample texts +texts = [ + "The quick brown fox jumps over the lazy dog", + "A journey of a thousand miles begins with a single step", + "To be or not to be, that is the question", + "All that glitters is not gold", + "hello what's up", +] + +vector_store = ArangoVector.from_texts( + texts, + OpenAIEmbeddings(), + database=db, + collection_name="vector_test", + index_name="vector_index", + distance_strategy="COSINE", +) + +texts_2 = ["the dog, cat, and mouse are all mammals"] +vector_store.add_texts(texts_2) + +# Perform a similarity search +query = "What animal is mentioned?" +results = vector_store.similarity_search_with_score(query, k=2) + +print("Search results for query:", query) +for doc, score in results: + print(f"Content: {doc.page_content}") + print(f"Metadata: {doc.metadata}") + print(f"Score: {score}") + print("---") + +# Try another query +query2 = "What's a famous Shakespeare quote?" 
+results2 = vector_store.similarity_search_with_score(query2, k=1) + +print("\nSearch results for query:", query2) +for doc, score in results2: + print(f"Content: {doc.page_content}") + print(f"Metadata: {doc.metadata}") + print(f"Score: {score}") + print("---") + +########################### +# Test ArangoGraphQAChain # +########################### + +llm = OpenAI(temperature=0) +graph = ArangoGraph(db=db, include_examples=False, graph_name="NewGraph") +chain = ArangoGraphQAChain.from_llm(llm, graph=graph, allow_dangerous_requests=True) +chain.verbose = True +chain.execute_aql_query = False +chain.run("What is the name of the club?") +chain.execute_aql_query = True +chain.run("What is the name of the club?") From 388d802b42cec611b1fe51efb697e62d8d005af4 Mon Sep 17 00:00:00 2001 From: Ajay Kallepalli <72517322+ajaykallepalli@users.noreply.github.com> Date: Mon, 25 Nov 2024 14:28:02 -0800 Subject: [PATCH 02/35] ArangoDB: Feedback management (#11) * initial commit * updating feedback management readme to match arango * Removing comments above import * Working API test and updated readme * Working docker compose file * Docker compose creating network and docker image * code review * update readme & dev yaml * delete dev files * Delete arango_store.py --------- Co-authored-by: Anthony Mahanna --- comps/feedback_management/arango/Dockerfile | 30 +++ comps/feedback_management/arango/README.md | 172 ++++++++++++++++ .../feedback_management/arango/arango_conn.py | 32 +++ .../arango/arango_store.py | 186 ++++++++++++++++++ comps/feedback_management/arango/config.py | 13 ++ .../docker-compose-user-feedback-arango.yaml | 38 ++++ comps/feedback_management/arango/feedback.py | 172 ++++++++++++++++ .../arango/requirements.txt | 1 + 8 files changed, 644 insertions(+) create mode 100644 comps/feedback_management/arango/Dockerfile create mode 100644 comps/feedback_management/arango/README.md create mode 100644 comps/feedback_management/arango/arango_conn.py create mode 100644 
comps/feedback_management/arango/arango_store.py create mode 100644 comps/feedback_management/arango/config.py create mode 100644 comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml create mode 100644 comps/feedback_management/arango/feedback.py create mode 100644 comps/feedback_management/arango/requirements.txt diff --git a/comps/feedback_management/arango/Dockerfile b/comps/feedback_management/arango/Dockerfile new file mode 100644 index 0000000000..95ac359e63 --- /dev/null +++ b/comps/feedback_management/arango/Dockerfile @@ -0,0 +1,30 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +FROM python:3.11-slim + +ENV LANG=C.UTF-8 + +RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ + build-essential \ + libjemalloc-dev \ + libgl1-mesa-glx + +RUN useradd -m -s /bin/bash user && \ + mkdir -p /home/user && \ + chown -R user /home/user/ + +USER user + +COPY comps /home/user/comps +COPY requirements.txt /home/user/ + +RUN pip install --no-cache-dir --upgrade pip setuptools && \ + pip install --no-cache-dir -r /home/user/comps/feedback_management/arango/requirements.txt && \ + pip install --no-cache-dir -r /home/user/requirements.txt + +ENV PYTHONPATH=$PYTHONPATH:/home/user + +WORKDIR /home/user/comps/feedback_management/arango + +ENTRYPOINT ["python", "feedback.py"] diff --git a/comps/feedback_management/arango/README.md b/comps/feedback_management/arango/README.md new file mode 100644 index 0000000000..8eb223ce95 --- /dev/null +++ b/comps/feedback_management/arango/README.md @@ -0,0 +1,172 @@ +# 🗨 Feedback Management Microservice with ArangoDB + +This README provides setup guides and all the necessary information about the Feedback Management microservice with ArangoDB database. 
+ +--- + +## Setup Environment Variables + +```bash +export ARANGO_HOST=${ARANGO_HOST} +export ARANGO_PORT=${ARANGO_PORT} +export ARANGO_USERNAME=${ARANGO_USERNAME} +export ARANGO_PASSWORD=${ARANGO_PASSWORD} +export DB_NAME=${DB_NAME} +export COLLECTION_NAME=${COLLECTION_NAME} +export PROTOCOL=${PROTOCOL} +export PYTHONPATH={Path to base of directory} +``` + +--- + +## 🚀Start Microservice with Docker + +### Build Docker Image + +```bash +cd ~/GenAIComps +docker build -t opea/feedbackmanagement-arango-server:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/feedback_management/arango/Dockerfile . +``` + +### Run Docker with CLI + +- Run ArangoDB image container + + ```bash + docker run -d -p 8529:8529 --name=arango arangodb/arangodb:latest + ``` + +- Run Feedback Management microservice + + ```bash + docker run -d -p 6016:6016 \ + --name="feedbackmanagement-arango-server" \ + -e http_proxy=$http_proxy \ + -e https_proxy=$https_proxy \ + -e no_proxy=$no_proxy \ + -e ARANGO_HOST=${ARANGO_HOST} \ + -e ARANGO_PORT=${ARANGO_PORT} \ + -e ARANGO_USERNAME=${ARANGO_USERNAME} \ + -e ARANGO_PASSWORD=${ARANGO_PASSWORD} \ + -e DB_NAME=${DB_NAME} \ + -e PROTOCOL=${PROTOCOL} \ + -e COLLECTION_NAME=${COLLECTION_NAME} \ + opea/feedbackmanagement-arango-server:latest + + ``` + +--- + +### ✅ Invoke Microservice + +The Feedback Management microservice exposes the following API endpoints: + +- Save feedback data + + ```bash + curl -X 'POST' \ + http://${host_ip}:6016/v1/feedback/create \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "chat_id": "66445d4f71c7eff23d44f78d", + "chat_data": { + "user": "test", + "messages": [ + { + "role": "system", + "content": "You are helpful assistant" + }, + { + "role": "user", + "content": "hi", + "time": "1724915247" + }, + { + "role": "assistant", + "content": "Hi, may I help you?", + "time": "1724915249" + } + ] + }, + "feedback_data": { + "comment": "Moderate", + "rating": 
3, + "is_thumbs_up": true + }}' + + + # Take note that chat_id here would be the id get from feedback_arango service + # If you do not wish to maintain chat history via feedback_arango service, you may generate some random uuid for it or just leave it empty. + ``` + +- Update feedback data by feedback_id + + ```bash + curl -X 'POST' \ + http://${host_ip}:6016/v1/feedback/create \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "chat_id": "66445d4f71c7eff23d44f78d", + "chat_data": { + "user": "test", + "messages": [ + { + "role": "system", + "content": "You are helpful assistant" + }, + { + "role": "user", + "content": "hi", + "time": "1724915247" + }, + { + "role": "assistant", + "content": "Hi, may I help you?", + "time": "1724915249" + } + ] + }, + "feedback_data": { + "comment": "Fair and Moderate answer", + "rating": 2, + "is_thumbs_up": true + }, + "feedback_id": "{feedback_id of the data that wanted to update}"}' + + # Just include any feedback_data field value that you wanted to update. 
+ ``` + +- Retrieve feedback data by user + + ```bash + curl -X 'POST' \ + http://${host_ip}:6016/v1/feedback/get \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "user": "test"}' + ``` + +- Retrieve feedback data by feedback_id + + ```bash + curl -X 'POST' \ + http://${host_ip}:6016/v1/feedback/get \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "user": "test", "feedback_id":"{feedback_id returned from save feedback route above}"}' + ``` + +- Delete feedback data by feedback_id + + ```bash + curl -X 'POST' \ + http://${host_ip}:6016/v1/feedback/delete \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "user": "test", "feedback_id":"{feedback_id to be deleted}"}' + ``` diff --git a/comps/feedback_management/arango/arango_conn.py b/comps/feedback_management/arango/arango_conn.py new file mode 100644 index 0000000000..f9ac9e411d --- /dev/null +++ b/comps/feedback_management/arango/arango_conn.py @@ -0,0 +1,32 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from arango import ArangoClient as PythonArangoClient +from arango.database import StandardDatabase +from config import ARANGO_HOST, ARANGO_PASSWORD, ARANGO_PORT, ARANGO_USERNAME, DB_NAME, PROTOCOL + + +class ArangoClient: + conn_url = f"{PROTOCOL}://{ARANGO_HOST}:{ARANGO_PORT}/" + + @staticmethod + def get_db_client() -> StandardDatabase: + try: + # Create client + client = PythonArangoClient(hosts=ArangoClient.conn_url) + + # First connect to _system database + sys_db = client.db("_system", username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) + + # Create target database if it doesn't exist + if not sys_db.has_database(DB_NAME): + sys_db.create_database(DB_NAME) + + # Now connect to the target database + db = client.db(DB_NAME, username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) + + return db + + except Exception as e: + print(e) + raise e diff 
--git a/comps/feedback_management/arango/arango_store.py b/comps/feedback_management/arango/arango_store.py new file mode 100644 index 0000000000..cd22b80784 --- /dev/null +++ b/comps/feedback_management/arango/arango_store.py @@ -0,0 +1,186 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from arango_conn import ArangoClient +from config import COLLECTION_NAME +from pydantic import BaseModel + + +class FeedbackStore: + + def __init__( + self, + user: str, + ): + self.user = user + + def initialize_storage(self) -> None: + self.db_client = ArangoClient.get_db_client() + + if not self.db_client.has_collection(COLLECTION_NAME): + self.db_client.create_collection(COLLECTION_NAME) + + self.collection = self.db_client.collection(COLLECTION_NAME) + + def save_feedback(self, feedback_data: BaseModel) -> str: + """Stores a new feedback data into the storage. + + Args: + feedback_data (object): The document to be stored. + + Returns: + str: The ID of the inserted feedback data. + + Raises: + Exception: If an error occurs while storing the feedback_data. + """ + try: + model_dump = feedback_data.model_dump(by_alias=True, mode="json", exclude={"feedback_id"}) + + inserted_feedback_data = self.collection.insert(model_dump) + + feedback_id = str(inserted_feedback_data["_key"]) + + return feedback_id + + except Exception as e: + print(e) + raise Exception(e) + + def update_feedback(self, feedback_data: BaseModel) -> bool: + """Update a feedback data in the collection with given id. + + Args: + feedback_id (str): The ID of the data to be updated. + updated_data (object): The data to be updated in the entry. + + Returns: + bool: True if the data updated successfully, False otherwise. + + Raises: + KeyError: If the document with ID is not found. + Exception: If the user does not match with the document user. + Exception: If an error occurs while updating the feedback data. 
+ """ + _key = feedback_data.feedback_id + document = self.collection.get(_key) + + if document is None: + raise KeyError(f"Document with ID: {_key} not found.") + + if document["chat_data"]["user"] != self.user: + raise Exception(f"User mismatch. Document with ID: {_key} does not belong to user: {self.user}") + + try: + model_dump = feedback_data.feedback_data.model_dump(by_alias=True, mode="json") + + self.collection.update( + {"_key": _key, "feedback_data": model_dump}, + merge=True, + keep_none=False, + ) + + print(f"Updated document: {_key} !") + + return True + + except Exception as e: + print("Not able to update the data.") + print(e) + raise Exception(e) + + def get_all_feedback_of_user(self) -> list[dict]: + """Retrieves all feedback data of a user from the collection. + + Returns: + list[dict] | None: List of dict of feedback data of the user, None otherwise. + + Raises: + Exception: If there is an error while retrieving data. + """ + try: + feedback_data_list: list = [] + + # TODO: Clarify if we actually want to omit the `feedback_data` field. + # Implemented using MongoDB Feedback Management as a reference. + cursor = """ + FOR doc IN @@collection + FILTER doc.chat_data.user == @user + RETURN UNSET(doc, "feedback_data") + """ + + cursor = self.db_client.aql.execute( + cursor, bind_vars={"@collection": self.collection.name, "user": self.user} + ) + + for document in cursor: + document["feedback_id"] = str(document["_key"]) + del document["_id"] + del document["_key"] + del document["_rev"] + + feedback_data_list.append(document) + + return feedback_data_list + + except Exception as e: + print(e) + raise Exception(e) + + def get_feedback_by_id(self, feedback_id: str) -> dict | None: + """Retrieves a user feedback data from the collection based on the given feedback ID. + + Args: + feedback_id (str): The ID of the feedback data to retrieve. + + Returns: + dict | None: The user's feedback data if found, None otherwise. 
+ + Raises: + KeyError: If document with ID is not found. + Exception: If the user does not match with the document user. + """ + response = self.collection.get(feedback_id) + + if response is None: + raise KeyError(f"Feedback with ID: {feedback_id} not found.") + + if response["chat_data"]["user"] != self.user: + raise Exception(f"User mismatch. Feedback with ID: {feedback_id} does not belong to user: {self.user}") + + del response["_id"] + del response["_key"] + del response["_rev"] + + return response + + def delete_feedback(self, feedback_id: str) -> bool: + """Delete a feedback data from collection by given feedback_id. + + Args: + feedback_id(str): The ID of the feedback data to be deleted. + + Returns: + bool: True if feedback is successfully deleted, False otherwise. + + Raises: + KeyError: If the provided feedback_id is invalid: + Exception: If the user does not match with the document user. + Exception: If any errors occurs during delete process. + """ + response = self.collection.get(feedback_id) + + if response is None: + raise KeyError(f"Feedback with ID: {feedback_id} not found.") + + if response["chat_data"]["user"] != self.user: + raise Exception(f"User mismatch. 
Feedback with ID: {feedback_id} does not belong to user: {self.user}") + + try: + response = self.collection.delete(feedback_id) + print(f"Deleted document: {feedback_id} !") + + return True + except Exception as e: + print(e) + raise Exception("Not able to delete the data.") diff --git a/comps/feedback_management/arango/config.py b/comps/feedback_management/arango/config.py new file mode 100644 index 0000000000..e3272febf8 --- /dev/null +++ b/comps/feedback_management/arango/config.py @@ -0,0 +1,13 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + +# ARANGO configuration +ARANGO_HOST = os.getenv("ARANGO_HOST", "localhost") +ARANGO_PORT = os.getenv("ARANGO_PORT", 8529) +ARANGO_USERNAME = os.getenv("ARANGO_USERNAME", "root") +ARANGO_PASSWORD = os.getenv("ARANGO_PASSWORD", "test") +DB_NAME = os.getenv("DB_NAME", "OPEA") +COLLECTION_NAME = os.getenv("COLLECTION_NAME", "Feedback") +PROTOCOL = os.getenv("PROTOCOL", "http") diff --git a/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml b/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml new file mode 100644 index 0000000000..f4be0c845e --- /dev/null +++ b/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml @@ -0,0 +1,38 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +version: "3" +services: + arangodb: + image: arangodb/arangodb:latest + container_name: arangodb + ports: + - 8529:8529 + environment: + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + no_proxy: ${no_proxy} + ARANGO_ROOT_PASSWORD: ${ARANGO_ROOT_PASSWORD} + + feedbackmanagement-arango: + image: opea/feedbackmanagement-arango:latest + container_name: feedbackmanagement-arango-server + ports: + - "6016:6016" + ipc: host + environment: + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + no_proxy: ${no_proxy} + ARANGO_HOST: ${ARANGO_HOST} + ARANGO_PORT: ${ARANGO_PORT} + ARANGO_USERNAME: 
${ARANGO_USERNAME} + ARANGO_PASSWORD: ${ARANGO_PASSWORD} + PROTOCOL: ${PROTOCOL} + DB_NAME: ${DB_NAME} + COLLECTION_NAME: ${COLLECTION_NAME} + restart: unless-stopped + +networks: + feedback_network: + driver: bridge diff --git a/comps/feedback_management/arango/feedback.py b/comps/feedback_management/arango/feedback.py new file mode 100644 index 0000000000..f1efa6f435 --- /dev/null +++ b/comps/feedback_management/arango/feedback.py @@ -0,0 +1,172 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os +from typing import Annotated, Optional + +from arango_store import FeedbackStore +from fastapi import HTTPException +from pydantic import BaseModel, Field + +from comps import CustomLogger +from comps.cores.mega.micro_service import opea_microservices, register_microservice +from comps.cores.proto.api_protocol import ChatCompletionRequest + +logger = CustomLogger("feedback_arango") +logflag = os.getenv("LOGFLAG", False) + + +class FeedbackData(BaseModel): + """This class represents the data model of FeedbackData collected to store in database.". + + Attributes: + is_thumbs_up (bool): True if the response is satisfy, False otherwise. + rating: (int)[Optional]: Score rating. Range from 0 (bad rating) to 5(good rating). + comment (str)[Optional]: Comment given for response. + """ + + is_thumbs_up: bool + rating: Annotated[Optional[int], Field(ge=0, le=5)] = None + comment: Optional[str] = None + + +class ChatFeedback(BaseModel): + """This class represents the model for chat to collect FeedbackData together with ChatCompletionRequest data to store in database. + + Attributes: + chat_data (ChatCompletionRequest): ChatCompletionRequest object containing chat data to be stored. + feedback_data (FeedbackData): FeedbackData object containing feedback data for chat to be stored. + chat_id (str)[Optional]: The chat_id associated to the chat to be store together with feedback data. 
+ feedback_id (str)[Optional]: The feedback_id of feedback data to be retrieved from database. + """ + + chat_data: ChatCompletionRequest + feedback_data: FeedbackData + chat_id: Optional[str] = None + feedback_id: Optional[str] = None + + +class FeedbackId(BaseModel): + """This class represent the data model for retrieve feedback data stored in database. + + Attributes: + user (str): The user of the requested feedback data. + feedback_id (str): The feedback_id of feedback data to be retrieved from database. + """ + + user: str + feedback_id: Optional[str] = None + + +@register_microservice( + name="opea_service@feedback_arango", + endpoint="/v1/feedback/create", + host="0.0.0.0", + input_datatype=FeedbackData, + port=6016, +) +async def create_feedback_data(feedback: ChatFeedback): + """Creates and stores a feedback data in database. + + Args: + feedback (ChatFeedback): The ChatFeedback class object containing feedback data to be stored. + + Returns: + response (str/bool): FeedbackId of the object created in database. True if data update successfully. + """ + if logflag: + logger.info(feedback) + + try: + feedback_store = FeedbackStore(feedback.chat_data.user) + feedback_store.initialize_storage() + if feedback.feedback_id is None: + response = feedback_store.save_feedback(feedback) + else: + response = feedback_store.update_feedback(feedback) + + if logflag: + logger.info(response) + return response + + except Exception as e: + logger.info(f"An error occurred: {str(e)}") + raise HTTPException(status_code=500, detail=str(e)) + + +@register_microservice( + name="opea_service@feedback_arango", + endpoint="/v1/feedback/get", + host="0.0.0.0", + input_datatype=FeedbackId, + port=6016, +) +async def get_feedback(feedback: FeedbackId): + """Retrieves feedback_data from feedback store based on provided FeedbackId or user. + + Args: + feedback (FeedbackId): The FeedbackId object containing user and feedback_id or chat_id. 
+ + Returns: + JSON: Retrieved feedback data if successful, error otherwise. + """ + if logflag: + logger.info(feedback) + + try: + feedback_store = FeedbackStore(feedback.user) + feedback_store.initialize_storage() + if feedback.feedback_id: + response = feedback_store.get_feedback_by_id(feedback.feedback_id) + else: + response = feedback_store.get_all_feedback_of_user() + + if logflag: + logger.info(response) + + return response + + except Exception as e: + logger.info(f"An error occurred: {str(e)}") + raise HTTPException(status_code=500, detail=str(e)) + + +@register_microservice( + name="opea_service@feedback_arango", + endpoint="/v1/feedback/delete", + host="0.0.0.0", + input_datatype=FeedbackId, + port=6016, +) +async def delete_feedback(feedback: FeedbackId): + """Delete a feedback data from feedback store by given feedback Id. + + Args: + feedback (FeedbackId): The FeedbackId object containing user and feedback_id or chat_id + + Returns: + Result of deletion if successful, None otherwise. 
+ """ + if logflag: + logger.info(feedback) + + try: + feedback_store = FeedbackStore(feedback.user) + feedback_store.initialize_storage() + if feedback.feedback_id is None: + raise Exception("feedback_id is required.") + else: + response = feedback_store.delete_feedback(feedback.feedback_id) + + if logflag: + logger.info(response) + + return response + + except Exception as e: + logger.info(f"An error occurred: {str(e)}") + raise HTTPException(status_code=500, detail=str(e)) + + +if __name__ == "__main__": + opea_microservices["opea_service@feedback_arango"].start() diff --git a/comps/feedback_management/arango/requirements.txt b/comps/feedback_management/arango/requirements.txt new file mode 100644 index 0000000000..9e5d0de8e0 --- /dev/null +++ b/comps/feedback_management/arango/requirements.txt @@ -0,0 +1 @@ +python-arango \ No newline at end of file From 6973d914c8932dd633914ccff24beacfa3c075c6 Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Mon, 25 Nov 2024 19:05:21 -0500 Subject: [PATCH 03/35] remove: `PROTOCOL` env --- comps/feedback_management/arango/README.md | 2 -- comps/feedback_management/arango/arango_conn.py | 4 ++-- comps/feedback_management/arango/config.py | 3 +-- .../arango/docker-compose-user-feedback-arango.yaml | 3 +-- 4 files changed, 4 insertions(+), 8 deletions(-) diff --git a/comps/feedback_management/arango/README.md b/comps/feedback_management/arango/README.md index 8eb223ce95..e0f070b684 100644 --- a/comps/feedback_management/arango/README.md +++ b/comps/feedback_management/arango/README.md @@ -13,7 +13,6 @@ export ARANGO_USERNAME=${ARANGO_USERNAME} export ARANGO_PASSWORD=${ARANGO_PASSWORD} export DB_NAME=${DB_NAME} export COLLECTION_NAME=${COLLECTION_NAME} -export PROTOCOL=${PROTOCOL} export PYTHONPATH={Path to base of directory} ``` @@ -49,7 +48,6 @@ docker build -t opea/feedbackmanagement-arango-server:latest --build-arg https_p -e ARANGO_USERNAME=${ARANGO_USERNAME} \ -e ARANGO_PASSWORD=${ARANGO_PASSWORD} \ -e DB_NAME=${DB_NAME} 
\ - -e PROTOCOL=${PROTOCOL} \ -e COLLECTION_NAME=${COLLECTION_NAME} \ opea/feedbackmanagement-arango-server:latest diff --git a/comps/feedback_management/arango/arango_conn.py b/comps/feedback_management/arango/arango_conn.py index f9ac9e411d..84ded04283 100644 --- a/comps/feedback_management/arango/arango_conn.py +++ b/comps/feedback_management/arango/arango_conn.py @@ -3,11 +3,11 @@ from arango import ArangoClient as PythonArangoClient from arango.database import StandardDatabase -from config import ARANGO_HOST, ARANGO_PASSWORD, ARANGO_PORT, ARANGO_USERNAME, DB_NAME, PROTOCOL +from config import ARANGO_HOST, ARANGO_PASSWORD, ARANGO_PORT, ARANGO_USERNAME, DB_NAME class ArangoClient: - conn_url = f"{PROTOCOL}://{ARANGO_HOST}:{ARANGO_PORT}/" + conn_url = f"arangodb://{ARANGO_HOST}:{ARANGO_PORT}/" @staticmethod def get_db_client() -> StandardDatabase: diff --git a/comps/feedback_management/arango/config.py b/comps/feedback_management/arango/config.py index e3272febf8..c332de7e5c 100644 --- a/comps/feedback_management/arango/config.py +++ b/comps/feedback_management/arango/config.py @@ -9,5 +9,4 @@ ARANGO_USERNAME = os.getenv("ARANGO_USERNAME", "root") ARANGO_PASSWORD = os.getenv("ARANGO_PASSWORD", "test") DB_NAME = os.getenv("DB_NAME", "OPEA") -COLLECTION_NAME = os.getenv("COLLECTION_NAME", "Feedback") -PROTOCOL = os.getenv("PROTOCOL", "http") +COLLECTION_NAME = os.getenv("COLLECTION_NAME", "Feedback") \ No newline at end of file diff --git a/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml b/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml index f4be0c845e..62ab0df544 100644 --- a/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml +++ b/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml @@ -28,11 +28,10 @@ services: ARANGO_PORT: ${ARANGO_PORT} ARANGO_USERNAME: ${ARANGO_USERNAME} ARANGO_PASSWORD: ${ARANGO_PASSWORD} - PROTOCOL: ${PROTOCOL} DB_NAME: ${DB_NAME} 
COLLECTION_NAME: ${COLLECTION_NAME} restart: unless-stopped networks: - feedback_network: + default: driver: bridge From 5e9742c52b6ab2b5f6f40b3ce2b171835b824fc1 Mon Sep 17 00:00:00 2001 From: SLasyaN Date: Tue, 26 Nov 2024 13:47:41 -0800 Subject: [PATCH 04/35] ArangoDB: PromptRegistry (#8) * Initial commit * remove unnecessary files * code review * update: `prompt_search` * new: `ARANGO_PROTOCOL` * README * cleanup --------- Co-authored-by: lasyasn Co-authored-by: Anthony Mahanna --- comps/feedback_management/README.md | 4 + comps/feedback_management/arango/README.md | 4 + .../feedback_management/arango/arango_conn.py | 4 +- comps/feedback_management/arango/config.py | 3 +- .../docker-compose-user-feedback-arango.yaml | 3 +- comps/prompt_registry/README.md | 4 + comps/prompt_registry/arango/DockerFile | 30 +++ comps/prompt_registry/arango/README.md | 120 ++++++++++ comps/prompt_registry/arango/arango_conn.py | 32 +++ comps/prompt_registry/arango/arango_store.py | 213 ++++++++++++++++++ comps/prompt_registry/arango/config.py | 13 ++ ...docker-compose-prompt-registry-arango.yaml | 38 ++++ comps/prompt_registry/arango/prompt.py | 148 ++++++++++++ comps/prompt_registry/arango/requirements.txt | 1 + 14 files changed, 613 insertions(+), 4 deletions(-) create mode 100644 comps/prompt_registry/arango/DockerFile create mode 100644 comps/prompt_registry/arango/README.md create mode 100644 comps/prompt_registry/arango/arango_conn.py create mode 100644 comps/prompt_registry/arango/arango_store.py create mode 100644 comps/prompt_registry/arango/config.py create mode 100644 comps/prompt_registry/arango/docker-compose-prompt-registry-arango.yaml create mode 100644 comps/prompt_registry/arango/prompt.py create mode 100644 comps/prompt_registry/arango/requirements.txt diff --git a/comps/feedback_management/README.md b/comps/feedback_management/README.md index 2e68aa413c..9cd4b42a51 100644 --- a/comps/feedback_management/README.md +++ b/comps/feedback_management/README.md @@ -20,3 
+20,7 @@ The Feedback Management microservice able to support various database backends f ### Feedback Management with MongoDB For more detail, please refer to this [README](./mongo/README.md) + +### Feedback Management with ArangoDB + +For more detail, please refer to this [README](./arango/README.md) diff --git a/comps/feedback_management/arango/README.md b/comps/feedback_management/arango/README.md index e0f070b684..7e9a5f8400 100644 --- a/comps/feedback_management/arango/README.md +++ b/comps/feedback_management/arango/README.md @@ -6,9 +6,12 @@ This README provides setup guides and all the necessary information about the Fe ## Setup Environment Variables +See `config.py` for default values. + ```bash export ARANGO_HOST=${ARANGO_HOST} export ARANGO_PORT=${ARANGO_PORT} +export ARANGO_PROTOCOL=${ARANGO_PROTOCOL} export ARANGO_USERNAME=${ARANGO_USERNAME} export ARANGO_PASSWORD=${ARANGO_PASSWORD} export DB_NAME=${DB_NAME} @@ -45,6 +48,7 @@ docker build -t opea/feedbackmanagement-arango-server:latest --build-arg https_p -e no_proxy=$no_proxy \ -e ARANGO_HOST=${ARANGO_HOST} \ -e ARANGO_PORT=${ARANGO_PORT} \ + -e ARANGO_PROTOCOL=${ARANGO_PROTOCOL} \ -e ARANGO_USERNAME=${ARANGO_USERNAME} \ -e ARANGO_PASSWORD=${ARANGO_PASSWORD} \ -e DB_NAME=${DB_NAME} \ diff --git a/comps/feedback_management/arango/arango_conn.py b/comps/feedback_management/arango/arango_conn.py index 84ded04283..d6c4b59777 100644 --- a/comps/feedback_management/arango/arango_conn.py +++ b/comps/feedback_management/arango/arango_conn.py @@ -3,11 +3,11 @@ from arango import ArangoClient as PythonArangoClient from arango.database import StandardDatabase -from config import ARANGO_HOST, ARANGO_PASSWORD, ARANGO_PORT, ARANGO_USERNAME, DB_NAME +from config import ARANGO_HOST, ARANGO_PASSWORD, ARANGO_PORT, ARANGO_PROTOCOL, ARANGO_USERNAME, DB_NAME class ArangoClient: - conn_url = f"arangodb://{ARANGO_HOST}:{ARANGO_PORT}/" + conn_url = f"{ARANGO_PROTOCOL}://{ARANGO_HOST}:{ARANGO_PORT}/" @staticmethod def 
get_db_client() -> StandardDatabase: diff --git a/comps/feedback_management/arango/config.py b/comps/feedback_management/arango/config.py index c332de7e5c..bb790eb38a 100644 --- a/comps/feedback_management/arango/config.py +++ b/comps/feedback_management/arango/config.py @@ -6,7 +6,8 @@ # ARANGO configuration ARANGO_HOST = os.getenv("ARANGO_HOST", "localhost") ARANGO_PORT = os.getenv("ARANGO_PORT", 8529) +ARANGO_PROTOCOL = os.getenv("ARANGO_PROTOCOL", "http") ARANGO_USERNAME = os.getenv("ARANGO_USERNAME", "root") ARANGO_PASSWORD = os.getenv("ARANGO_PASSWORD", "test") DB_NAME = os.getenv("DB_NAME", "OPEA") -COLLECTION_NAME = os.getenv("COLLECTION_NAME", "Feedback") \ No newline at end of file +COLLECTION_NAME = os.getenv("COLLECTION_NAME", "Feedback") diff --git a/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml b/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml index 62ab0df544..8f9b3a85a8 100644 --- a/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml +++ b/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml @@ -3,7 +3,7 @@ version: "3" services: - arangodb: + arango: image: arangodb/arangodb:latest container_name: arangodb ports: @@ -26,6 +26,7 @@ services: no_proxy: ${no_proxy} ARANGO_HOST: ${ARANGO_HOST} ARANGO_PORT: ${ARANGO_PORT} + ARANGO_PROTOCOL: ${ARANGO_PROTOCOL} ARANGO_USERNAME: ${ARANGO_USERNAME} ARANGO_PASSWORD: ${ARANGO_PASSWORD} DB_NAME: ${DB_NAME} diff --git a/comps/prompt_registry/README.md b/comps/prompt_registry/README.md index 6332a1a13a..a99b1b27b7 100644 --- a/comps/prompt_registry/README.md +++ b/comps/prompt_registry/README.md @@ -19,3 +19,7 @@ The Prompt Registry microservice able to support various database backends for s ### Prompt Registry with MongoDB For more detail, please refer to this [README](./mongo/README.md) + +### Prompt Registry with ArangoDB + +For more detail, please refer to this [README](./arango/README.md) diff --git 
a/comps/prompt_registry/arango/Dockerfile b/comps/prompt_registry/arango/Dockerfile
new file mode 100644
index 0000000000..0659202058
--- /dev/null
+++ b/comps/prompt_registry/arango/Dockerfile
@@ -0,0 +1,30 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+FROM python:3.11-slim
+
+ENV LANG=C.UTF-8
+
+RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \
+    build-essential \
+    libgl1-mesa-glx \
+    libjemalloc-dev
+
+RUN useradd -m -s /bin/bash user && \
+    mkdir -p /home/user && \
+    chown -R user /home/user/
+
+USER user
+
+COPY comps /home/user/comps
+COPY requirements.txt /home/user/
+
+RUN pip install --no-cache-dir --upgrade pip setuptools && \
+    pip install --no-cache-dir -r /home/user/comps/prompt_registry/arango/requirements.txt && \
+    pip install --no-cache-dir -r /home/user/requirements.txt
+
+ENV PYTHONPATH=$PYTHONPATH:/home/user
+
+WORKDIR /home/user/comps/prompt_registry/arango
+
+ENTRYPOINT ["python", "prompt.py"]
diff --git a/comps/prompt_registry/arango/README.md b/comps/prompt_registry/arango/README.md
new file mode 100644
index 0000000000..e4bdd6c101
--- /dev/null
+++ b/comps/prompt_registry/arango/README.md
@@ -0,0 +1,120 @@
+# 🧾 Prompt Registry Microservice with ArangoDB
+
+This README provides setup guides and all the necessary information about the Prompt Registry microservice with ArangoDB database.
+
+---
+
+## Setup Environment Variables
+
+See `config.py` for default values.
+ +```bash +export ARANGO_HOST=${ARANGO_HOST} +export ARANGO_PORT=${ARANGO_PORT} +export ARANGO_PROTOCOL=${ARANGO_PROTOCOL} +export ARANGO_USERNAME=${ARANGO_USERNAME} +export ARANGO_PASSWORD=${ARANGO_PASSWORD} +export DB_NAME=${DB_NAME} +export COLLECTION_NAME=${COLLECTION_NAME} +``` + +--- + +## 🚀Start Microservice with Docker + +### Build Docker Image + +```bash +cd ~/GenAIComps +docker build -t opea/promptregistry-arango-server:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/prompt_registry/arango/Dockerfile . +``` + +### Run Docker with CLI + + +- Run ArangoDB image container + + ```bash + docker run -d -p 8529:8529 --name=arango arangodb/arangodb:latest + ``` + +- Run Prompt Registry microservice + + ```bash + docker run -d -p 6018:6018 \ + --name="promptregistry-arango-server" \ + -e http_proxy=$http_proxy \ + -e https_proxy=$https_proxy \ + -e no_proxy=$no_proxy \ + -e ARANGO_HOST=${ARANGO_HOST} \ + -e ARANGO_PORT=${ARANGO_PORT} \ + -e ARANGO_PROTOCOL=${ARANGO_PROTOCOL} \ + -e ARANGO_USERNAME=${ARANGO_USERNAME} \ + -e ARANGO_PASSWORD=${ARANGO_PASSWORD} \ + -e DB_NAME=${DB_NAME} \ + -e COLLECTION_NAME=${COLLECTION_NAME} \ + opea/promptregistry-arango-server:latest + + ``` + +--- + +### ✅ Invoke Microservice + +The Prompt Registry microservice exposes the following API endpoints: + +- Save prompt + + ```bash + curl -X 'POST' \ + http://${host_ip}:6018/v1/prompt/create \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "prompt_text": "test prompt", "user": "test" + }' + ``` + +- Retrieve prompt from database by user + + ```bash + curl -X 'POST' \ + http://${host_ip}:6018/v1/prompt/get \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "user": "test"}' + ``` + +- Retrieve prompt from database by prompt_id + + ```bash + curl -X 'POST' \ + http://${host_ip}:6018/v1/prompt/get \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d 
'{ + "user": "test", "prompt_id":"{_id returned from save prompt route above}"}' + ``` + +- Retrieve relevant prompt by keyword + + ```bash + curl -X 'POST' \ + http://${host_ip}:6018/v1/prompt/get \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "user": "test", "prompt_text": "{keyword to search}"}' + ``` + +- Delete prompt by prompt_id + + ```bash + curl -X 'POST' \ + http://${host_ip}:6018/v1/prompt/delete \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "user": "test", "prompt_id":"{prompt_id to be deleted}"}' + ``` diff --git a/comps/prompt_registry/arango/arango_conn.py b/comps/prompt_registry/arango/arango_conn.py new file mode 100644 index 0000000000..d6c4b59777 --- /dev/null +++ b/comps/prompt_registry/arango/arango_conn.py @@ -0,0 +1,32 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from arango import ArangoClient as PythonArangoClient +from arango.database import StandardDatabase +from config import ARANGO_HOST, ARANGO_PASSWORD, ARANGO_PORT, ARANGO_PROTOCOL, ARANGO_USERNAME, DB_NAME + + +class ArangoClient: + conn_url = f"{ARANGO_PROTOCOL}://{ARANGO_HOST}:{ARANGO_PORT}/" + + @staticmethod + def get_db_client() -> StandardDatabase: + try: + # Create client + client = PythonArangoClient(hosts=ArangoClient.conn_url) + + # First connect to _system database + sys_db = client.db("_system", username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) + + # Create target database if it doesn't exist + if not sys_db.has_database(DB_NAME): + sys_db.create_database(DB_NAME) + + # Now connect to the target database + db = client.db(DB_NAME, username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) + + return db + + except Exception as e: + print(e) + raise e diff --git a/comps/prompt_registry/arango/arango_store.py b/comps/prompt_registry/arango/arango_store.py new file mode 100644 index 0000000000..fb80ccd20c --- /dev/null +++ 
b/comps/prompt_registry/arango/arango_store.py @@ -0,0 +1,213 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + +from arango.exceptions import IndexGetError +from arango_conn import ArangoClient +from config import COLLECTION_NAME +from prompt import PromptCreate +from pydantic import BaseModel + +from comps import CustomLogger + +logger = CustomLogger("arango_store") +logflag = os.getenv("LOGFLAG", False) + + +class PromptStore: + + def __init__( + self, + user: str, + ): + self.user = user + self.inverted_index_exists = False + + def initialize_storage(self) -> None: + self.db_client = ArangoClient.get_db_client() + + if not self.db_client.has_collection(COLLECTION_NAME): + self.db_client.create_collection(COLLECTION_NAME) + + self.collection = self.db_client.collection(COLLECTION_NAME) + + def save_prompt(self, prompt: PromptCreate): + """Stores a new prompt into the storage. + + Args: + prompt: The document to be stored. It should be a Pydantic model. + + Returns: + str: The ID of the inserted prompt. + + Raises: + Exception: If an error occurs while storing the prompt. + """ + try: + model_dump = prompt.model_dump(by_alias=True, mode="json", exclude={"id"}) + + inserted_prompt_data = self.collection.insert(model_dump) + + prompt_id = str(inserted_prompt_data["_key"]) + + return prompt_id + + except Exception as e: + print(e) + raise Exception(e) + + def get_all_prompt_of_user(self) -> list[dict]: + """Retrieves all prompts of a user from the collection. + + Returns: + list[dict] | None: List of dict of prompts of the user, None otherwise. + + Raises: + Exception: If there is an error while retrieving data. + """ + try: + prompt_data_list: list = [] + + # TODO: Clarify if we actually want to omit the `data` field. + # Implemented using MongoDB Prompt Registry as a reference. 
+            cursor = """
+                FOR doc IN @@collection
+                    FILTER doc.user == @user
+                    RETURN UNSET(doc, "data")
+            """
+
+            cursor = self.db_client.aql.execute(
+                cursor, bind_vars={"@collection": self.collection.name, "user": self.user}
+            )
+
+            for document in cursor:
+                document["id"] = str(document["_key"])
+                del document["_id"]
+                del document["_key"]
+                del document["_rev"]
+
+                prompt_data_list.append(document)
+
+            return prompt_data_list
+
+        except Exception as e:
+            print(e)
+            raise Exception(e)
+
+    def get_user_prompt_by_id(self, prompt_id: str) -> dict | None:
+        """Retrieves a user prompt from the collection based on the given prompt ID.
+
+        Args:
+            prompt_id (str): The ID of the prompt to retrieve.
+
+        Returns:
+            dict | None: The user prompt if found, None otherwise.
+
+        Raises:
+            KeyError: If document with ID is not found.
+            Exception: If the user does not match with the document user.
+        """
+        response = self.collection.get(prompt_id)
+
+        if response is None:
+            raise KeyError(f"Prompt with ID: {prompt_id} not found.")
+
+        if response["user"] != self.user:
+            raise Exception(f"User mismatch. Prompt with ID: {prompt_id} does not belong to user: {self.user}")
+
+        del response["_id"]
+        del response["_key"]
+        del response["_rev"]
+
+        return response
+
+    def prompt_search(self, keyword: str) -> list | None:
+        """Retrieves prompt from the collection based on keyword provided.
+
+        Args:
+            keyword (str): The keyword of prompt to search for.
+
+        Returns:
+            list | None: The list of relevant prompt if found, None otherwise.
+
+        Raises:
+            Exception: If there is an error while searching data.
+ """ + try: + index_name = "prompt_text_index" + + if not self.inverted_index_exists: + try: + self.collection.get_index(index_name) + + except IndexGetError: + self.collection.add_inverted_index( + fields=["prompt_text"], + name=index_name, + # TODO: add more kwargs if needed + ) + + self.inverted_index_exists = True + + query = """ + FOR doc IN @@collection + OPTIONS { indexHint: @index_name, forceIndexHint: true } + FILTER PHRASE(doc.prompt_text, @keyword, "text_en") + RETURN doc + """ + + cursor = self.db_client.aql.execute( + query, + bind_vars={ + "@collection": self.collection.name, + "index_name": index_name, + "keyword": keyword, + }, + ) + + serialized_data = [] + for doc in cursor: + doc["id"] = str(doc["_key"]) + del doc["_id"] + del doc["_key"] + del doc["_rev"] + + serialized_data.append(doc) + + return serialized_data + + except Exception as e: + print(e) + raise Exception(e) + + def delete_prompt(self, prompt_id: str) -> bool: + """Delete a prompt from collection by given prompt_id. + + Args: + prompt_id(str): The ID of the prompt to be deleted. + + Returns: + bool: True if prompt is successfully deleted, False otherwise. + + Raises: + KeyError: If the provided feedback_id is invalid: + Exception: If the user does not match with the document user. + Exception: If any errors occurs during delete process. + """ + response = self.collection.get(prompt_id) + + if response is None: + raise KeyError(f"Feedback with ID: {prompt_id} not found.") + + if response["user"] != self.user: + raise Exception(f"User mismatch. 
Prompt with ID: {prompt_id} does not belong to user: {self.user}")
+
+        try:
+            response = self.collection.delete(prompt_id)
+            print(f"Deleted document: {prompt_id} !")
+
+            return True
+        except Exception as e:
+            print(e)
+            raise Exception("Not able to delete the data.")
diff --git a/comps/prompt_registry/arango/config.py b/comps/prompt_registry/arango/config.py
new file mode 100644
index 0000000000..9719f1358e
--- /dev/null
+++ b/comps/prompt_registry/arango/config.py
@@ -0,0 +1,13 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+import os
+
+# ARANGO configuration
+ARANGO_HOST = os.getenv("ARANGO_HOST", "localhost")
+ARANGO_PORT = os.getenv("ARANGO_PORT", 8529)
+ARANGO_PROTOCOL = os.getenv("ARANGO_PROTOCOL", "http")
+ARANGO_USERNAME = os.getenv("ARANGO_USERNAME", "root")
+ARANGO_PASSWORD = os.getenv("ARANGO_PASSWORD", "test")
+DB_NAME = os.getenv("DB_NAME", "OPEA")
+COLLECTION_NAME = os.getenv("COLLECTION_NAME", "Prompt")
diff --git a/comps/prompt_registry/arango/docker-compose-prompt-registry-arango.yaml b/comps/prompt_registry/arango/docker-compose-prompt-registry-arango.yaml
new file mode 100644
index 0000000000..b1aee077d9
--- /dev/null
+++ b/comps/prompt_registry/arango/docker-compose-prompt-registry-arango.yaml
@@ -0,0 +1,38 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+version: "3"
+services:
+  arango:
+    image: arangodb/arangodb:latest
+    container_name: arangodb
+    ports:
+      - 8529:8529
+    environment:
+      http_proxy: ${http_proxy}
+      https_proxy: ${https_proxy}
+      no_proxy: ${no_proxy}
+      ARANGO_ROOT_PASSWORD: ${ARANGO_ROOT_PASSWORD}
+
+  promptregistry-arango:
+    image: opea/promptregistry-arango:latest
+    container_name: promptregistry-arango-server
+    ports:
+      - "6018:6018"
+    ipc: host
+    environment:
+      http_proxy: ${http_proxy}
+      https_proxy: ${https_proxy}
+      no_proxy: ${no_proxy}
+      ARANGO_HOST: ${ARANGO_HOST}
+      ARANGO_PORT: ${ARANGO_PORT}
+      ARANGO_PROTOCOL: ${ARANGO_PROTOCOL}
+      
ARANGO_USERNAME: ${ARANGO_USERNAME} + ARANGO_PASSWORD: ${ARANGO_PASSWORD} + DB_NAME: ${DB_NAME} + COLLECTION_NAME: ${COLLECTION_NAME} + restart: unless-stopped + +networks: + default: + driver: bridge diff --git a/comps/prompt_registry/arango/prompt.py b/comps/prompt_registry/arango/prompt.py new file mode 100644 index 0000000000..c46e0174c2 --- /dev/null +++ b/comps/prompt_registry/arango/prompt.py @@ -0,0 +1,148 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os +from typing import Optional + +from arango_store import PromptStore +from pydantic import BaseModel + +from comps import CustomLogger +from comps.cores.mega.micro_service import opea_microservices, register_microservice + +logger = CustomLogger("prompt_arango") +logflag = os.getenv("LOGFLAG", False) + + +class PromptCreate(BaseModel): + """This class represents the data model for creating and storing a new prompt in the database. + + Attributes: + prompt_text (str): The text content of the prompt. + user (str): The user or creator of the prompt. + """ + + prompt_text: str + user: str + + +class PromptId(BaseModel): + """This class represent the data model for retrieve prompt stored in database. + + Attributes: + user (str): The user of the requested prompt. + prompt_id (str): The prompt_id of prompt to be retrieved from database. + """ + + user: str + prompt_id: Optional[str] = None + prompt_text: Optional[str] = None + + +@register_microservice( + name="opea_service@prompt_arango", + endpoint="/v1/prompt/create", + host="0.0.0.0", + input_datatype=PromptCreate, + port=6018, +) +async def create_prompt(prompt: PromptCreate): + """Creates and stores a prompt in prompt store. + + Args: + prompt (PromptCreate): The PromptCreate class object containing the data to be stored. + + Returns: + JSON (PromptResponse): PromptResponse class object, None otherwise. 
+ """ + if logflag: + logger.info(prompt) + + try: + prompt_store = PromptStore(prompt.user) + prompt_store.initialize_storage() + response = prompt_store.save_prompt(prompt) + if logflag: + logger.info(response) + + return response + + except Exception as error: + logger.error(f"An error occurred: {str(error)}") + raise error + + +@register_microservice( + name="opea_service@prompt_arango", + endpoint="/v1/prompt/get", + host="0.0.0.0", + input_datatype=PromptId, + port=6018, +) +async def get_prompt(prompt: PromptId): + """Retrieves prompt from prompt store based on provided PromptId or user. + + Args: + prompt (PromptId): The PromptId object containing user and prompt_id. + + Returns: + JSON: Retrieved prompt data if successful, None otherwise. + """ + if logflag: + logger.info(prompt) + try: + + prompt_store = PromptStore(prompt.user) + prompt_store.initialize_storage() + + if prompt.prompt_id is not None: + response = prompt_store.get_user_prompt_by_id(prompt.prompt_id) + elif prompt.prompt_text: + response = prompt_store.prompt_search(prompt.prompt_text) + else: + response = prompt_store.get_all_prompt_of_user() + if logflag: + logger.info(response) + return response + + except Exception as error: + logger.error(f"An error occurred: {str(error)}") + raise error + + +@register_microservice( + name="opea_service@prompt_arango", + endpoint="/v1/prompt/delete", + host="0.0.0.0", + input_datatype=PromptId, + port=6018, +) +async def delete_prompt(prompt: PromptId): + """Delete a prompt from prompt store by given PromptId. + + Args: + prompt (PromptId): The PromptId object containing user and prompt_id. + + Returns: + Result of deletion if successful, None otherwise. 
+ """ + if logflag: + logger.info(prompt) + try: + prompt_store = PromptStore(prompt.user) + prompt_store.initialize_storage() + if prompt.prompt_id is None: + raise Exception("Prompt id is required.") + else: + response = prompt_store.delete_prompt(prompt.prompt_id) + if logflag: + logger.info(response) + return response + + except Exception as error: + logger.error(f"An error occurred: {str(error)}") + raise error + + +if __name__ == "__main__": + opea_microservices["opea_service@prompt_arango"].start() diff --git a/comps/prompt_registry/arango/requirements.txt b/comps/prompt_registry/arango/requirements.txt new file mode 100644 index 0000000000..9e5d0de8e0 --- /dev/null +++ b/comps/prompt_registry/arango/requirements.txt @@ -0,0 +1 @@ +python-arango \ No newline at end of file From 0e9ed3ba98be04d106ead44c692fc215944997f5 Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Tue, 22 Oct 2024 21:30:29 -0400 Subject: [PATCH 05/35] arangodb prep | initial commit --- .gitignore | 1 + ARANGODB_README.md | 33 +++++++++++++++ langchain_test.py | 101 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 135 insertions(+) create mode 100644 ARANGODB_README.md create mode 100644 langchain_test.py diff --git a/.gitignore b/.gitignore index 1d1e0a3899..9778bf8f78 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ __pycache__ *.egg-info/ .DS_Store +.venv \ No newline at end of file diff --git a/ARANGODB_README.md b/ARANGODB_README.md new file mode 100644 index 0000000000..76f4b7db30 --- /dev/null +++ b/ARANGODB_README.md @@ -0,0 +1,33 @@ +Instructions + +0. Create a virtual environment: + +```bash +python -m venv .venv + +source .venv/bin/activate +``` + +1. Install the required packages: + +```bash +pip install python-arango +pip install langchain_openai +pip install git+https://github.com/arangoml/langchain.git@arangodb#subdirectory=libs/community +``` + +2. 
Provision the ArangoDB with Vector Index image: + +```bash +docker create --name arango-vector -p 8529:8529 -e ARANGO_ROOT_PASSWORD=test jbajic/arangodb-arm:vector-index-preview + +docker start arango-vector +``` + +3. Set your `OPENAI_API_KEY` environment variable (contact Anthony for access) + +4. Run the test script to confirm LangChain is working: + +```bash +python langchain_test.py +``` \ No newline at end of file diff --git a/langchain_test.py b/langchain_test.py new file mode 100644 index 0000000000..e33ea16873 --- /dev/null +++ b/langchain_test.py @@ -0,0 +1,101 @@ +from arango import ArangoClient +from langchain_community.chains.graph_qa.arangodb import ArangoGraphQAChain +from langchain_community.embeddings import OpenAIEmbeddings +from langchain_community.graphs.arangodb_graph import ArangoGraph +from langchain_community.graphs.graph_document import GraphDocument, Node, Relationship +from langchain_community.vectorstores.arangodb_vector import ArangoVector +from langchain_core.documents import Document +from langchain_openai import OpenAI + +system_db = ArangoClient().db("_system", password="test", verify=True) +system_db.delete_database("langchain_test", ignore_missing=True) +system_db.create_database("langchain_test") +db = ArangoClient().db("langchain_test", password="test", verify=True) + +#################### +# Test ArangoGraph # +#################### + +# Create nodes +node1 = Node(id="1", type="Person", properties={"name": "John", "age": 30}) +node2 = Node(id="2", type="Person", properties={"name": "Jane", "age": 28}) +node3 = Node(id="3", type="Club", properties={"name": "Karate Club"}) + +# Create relationships +relationship1 = Relationship(source=node1, target=node3, type="MEMBER_OF", properties={"joined_date": "2020-01-01"}) +relationship2 = Relationship(source=node2, target=node3, type="MEMBER_OF", properties={"joined_date": "2019-05-15"}) +relationship3 = Relationship(source=node1, target=node2, type="KNOWS", properties={"since": 
"2018-03-10"}) + +# Create source document +source_doc = Document( + page_content="John and Jane are members of the Karate Club. They know each other.", + metadata={"source": "club_records"}, +) + +# Create GraphDocument +graph_doc = GraphDocument( + nodes=[node1, node2, node3], relationships=[relationship1, relationship2, relationship3], source=source_doc +) + +arango_graph = ArangoGraph(db=db, include_examples=False) +arango_graph.add_graph_documents([graph_doc], graph_name="NewGraph", include_source=True) + +##################### +# Test ArangoVector # +##################### + +# Add some sample texts +texts = [ + "The quick brown fox jumps over the lazy dog", + "A journey of a thousand miles begins with a single step", + "To be or not to be, that is the question", + "All that glitters is not gold", + "hello what's up", +] + +vector_store = ArangoVector.from_texts( + texts, + OpenAIEmbeddings(), + database=db, + collection_name="vector_test", + index_name="vector_index", + distance_strategy="COSINE", +) + +texts_2 = ["the dog, cat, and mouse are all mammals"] +vector_store.add_texts(texts_2) + +# Perform a similarity search +query = "What animal is mentioned?" +results = vector_store.similarity_search_with_score(query, k=2) + +print("Search results for query:", query) +for doc, score in results: + print(f"Content: {doc.page_content}") + print(f"Metadata: {doc.metadata}") + print(f"Score: {score}") + print("---") + +# Try another query +query2 = "What's a famous Shakespeare quote?" 
+results2 = vector_store.similarity_search_with_score(query2, k=1) + +print("\nSearch results for query:", query2) +for doc, score in results2: + print(f"Content: {doc.page_content}") + print(f"Metadata: {doc.metadata}") + print(f"Score: {score}") + print("---") + +########################### +# Test ArangoGraphQAChain # +########################### + +llm = OpenAI(temperature=0) +graph = ArangoGraph(db=db, include_examples=False, graph_name="NewGraph") +chain = ArangoGraphQAChain.from_llm(llm, graph=graph, allow_dangerous_requests=True) +chain.verbose = True +chain.execute_aql_query = False +chain.run("What is the name of the club?") +chain.execute_aql_query = True +chain.run("What is the name of the club?") From 7bd5aaacdb43a90f6b8e2cb8ea08b184a03044d2 Mon Sep 17 00:00:00 2001 From: Ajay Kallepalli <72517322+ajaykallepalli@users.noreply.github.com> Date: Mon, 25 Nov 2024 14:28:02 -0800 Subject: [PATCH 06/35] ArangoDB: Feedback management (#11) * initial commit * updating feedback management readme to match arango * Removing comments above import * Working API test and updated readme * Working docker compose file * Docker compose creating network and docker image * code review * update readme & dev yaml * delete dev files * Delete arango_store.py --------- Co-authored-by: Anthony Mahanna --- comps/feedback_management/arango/Dockerfile | 30 +++ comps/feedback_management/arango/README.md | 172 ++++++++++++++++ .../feedback_management/arango/arango_conn.py | 32 +++ .../arango/arango_store.py | 186 ++++++++++++++++++ comps/feedback_management/arango/config.py | 13 ++ .../docker-compose-user-feedback-arango.yaml | 38 ++++ comps/feedback_management/arango/feedback.py | 172 ++++++++++++++++ .../arango/requirements.txt | 1 + 8 files changed, 644 insertions(+) create mode 100644 comps/feedback_management/arango/Dockerfile create mode 100644 comps/feedback_management/arango/README.md create mode 100644 comps/feedback_management/arango/arango_conn.py create mode 100644 
comps/feedback_management/arango/arango_store.py create mode 100644 comps/feedback_management/arango/config.py create mode 100644 comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml create mode 100644 comps/feedback_management/arango/feedback.py create mode 100644 comps/feedback_management/arango/requirements.txt diff --git a/comps/feedback_management/arango/Dockerfile b/comps/feedback_management/arango/Dockerfile new file mode 100644 index 0000000000..95ac359e63 --- /dev/null +++ b/comps/feedback_management/arango/Dockerfile @@ -0,0 +1,30 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +FROM python:3.11-slim + +ENV LANG=C.UTF-8 + +RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ + build-essential \ + libjemalloc-dev \ + libgl1-mesa-glx + +RUN useradd -m -s /bin/bash user && \ + mkdir -p /home/user && \ + chown -R user /home/user/ + +USER user + +COPY comps /home/user/comps +COPY requirements.txt /home/user/ + +RUN pip install --no-cache-dir --upgrade pip setuptools && \ + pip install --no-cache-dir -r /home/user/comps/feedback_management/arango/requirements.txt && \ + pip install --no-cache-dir -r /home/user/requirements.txt + +ENV PYTHONPATH=$PYTHONPATH:/home/user + +WORKDIR /home/user/comps/feedback_management/arango + +ENTRYPOINT ["python", "feedback.py"] diff --git a/comps/feedback_management/arango/README.md b/comps/feedback_management/arango/README.md new file mode 100644 index 0000000000..8eb223ce95 --- /dev/null +++ b/comps/feedback_management/arango/README.md @@ -0,0 +1,172 @@ +# 🗨 Feedback Management Microservice with ArangoDB + +This README provides setup guides and all the necessary information about the Feedback Management microservice with ArangoDB database. 
+ +--- + +## Setup Environment Variables + +```bash +export ARANGO_HOST=${ARANGO_HOST} +export ARANGO_PORT=${ARANGO_PORT} +export ARANGO_USERNAME=${ARANGO_USERNAME} +export ARANGO_PASSWORD=${ARANGO_PASSWORD} +export DB_NAME=${DB_NAME} +export COLLECTION_NAME=${COLLECTION_NAME} +export PROTOCOL=${PROTOCOL} +export PYTHONPATH={Path to base of directory} +``` + +--- + +## 🚀Start Microservice with Docker + +### Build Docker Image + +```bash +cd ~/GenAIComps +docker build -t opea/feedbackmanagement-arango-server:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/feedback_management/arango/Dockerfile . +``` + +### Run Docker with CLI + +- Run ArangoDB image container + + ```bash + docker run -d -p 8529:8529 --name=arango arangodb/arangodb:latest + ``` + +- Run Feedback Management microservice + + ```bash + docker run -d -p 6016:6016 \ + --name="feedbackmanagement-arango-server" \ + -e http_proxy=$http_proxy \ + -e https_proxy=$https_proxy \ + -e no_proxy=$no_proxy \ + -e ARANGO_HOST=${ARANGO_HOST} \ + -e ARANGO_PORT=${ARANGO_PORT} \ + -e ARANGO_USERNAME=${ARANGO_USERNAME} \ + -e ARANGO_PASSWORD=${ARANGO_PASSWORD} \ + -e DB_NAME=${DB_NAME} \ + -e PROTOCOL=${PROTOCOL} \ + -e COLLECTION_NAME=${COLLECTION_NAME} \ + opea/feedbackmanagement-arango-server:latest + + ``` + +--- + +### ✅ Invoke Microservice + +The Feedback Management microservice exposes the following API endpoints: + +- Save feedback data + + ```bash + curl -X 'POST' \ + http://${host_ip}:6016/v1/feedback/create \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "chat_id": "66445d4f71c7eff23d44f78d", + "chat_data": { + "user": "test", + "messages": [ + { + "role": "system", + "content": "You are helpful assistant" + }, + { + "role": "user", + "content": "hi", + "time": "1724915247" + }, + { + "role": "assistant", + "content": "Hi, may I help you?", + "time": "1724915249" + } + ] + }, + "feedback_data": { + "comment": "Moderate", + "rating": 
3, + "is_thumbs_up": true + }}' + + + # Take note that chat_id here would be the id get from feedback_arango service + # If you do not wish to maintain chat history via feedback_arango service, you may generate some random uuid for it or just leave it empty. + ``` + +- Update feedback data by feedback_id + + ```bash + curl -X 'POST' \ + http://${host_ip}:6016/v1/feedback/create \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "chat_id": "66445d4f71c7eff23d44f78d", + "chat_data": { + "user": "test", + "messages": [ + { + "role": "system", + "content": "You are helpful assistant" + }, + { + "role": "user", + "content": "hi", + "time": "1724915247" + }, + { + "role": "assistant", + "content": "Hi, may I help you?", + "time": "1724915249" + } + ] + }, + "feedback_data": { + "comment": "Fair and Moderate answer", + "rating": 2, + "is_thumbs_up": true + }, + "feedback_id": "{feedback_id of the data that wanted to update}"}' + + # Just include any feedback_data field value that you wanted to update. 
+ ``` + +- Retrieve feedback data by user + + ```bash + curl -X 'POST' \ + http://${host_ip}:6016/v1/feedback/get \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "user": "test"}' + ``` + +- Retrieve feedback data by feedback_id + + ```bash + curl -X 'POST' \ + http://${host_ip}:6016/v1/feedback/get \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "user": "test", "feedback_id":"{feedback_id returned from save feedback route above}"}' + ``` + +- Delete feedback data by feedback_id + + ```bash + curl -X 'POST' \ + http://${host_ip}:6016/v1/feedback/delete \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "user": "test", "feedback_id":"{feedback_id to be deleted}"}' + ``` diff --git a/comps/feedback_management/arango/arango_conn.py b/comps/feedback_management/arango/arango_conn.py new file mode 100644 index 0000000000..f9ac9e411d --- /dev/null +++ b/comps/feedback_management/arango/arango_conn.py @@ -0,0 +1,32 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from arango import ArangoClient as PythonArangoClient +from arango.database import StandardDatabase +from config import ARANGO_HOST, ARANGO_PASSWORD, ARANGO_PORT, ARANGO_USERNAME, DB_NAME, PROTOCOL + + +class ArangoClient: + conn_url = f"{PROTOCOL}://{ARANGO_HOST}:{ARANGO_PORT}/" + + @staticmethod + def get_db_client() -> StandardDatabase: + try: + # Create client + client = PythonArangoClient(hosts=ArangoClient.conn_url) + + # First connect to _system database + sys_db = client.db("_system", username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) + + # Create target database if it doesn't exist + if not sys_db.has_database(DB_NAME): + sys_db.create_database(DB_NAME) + + # Now connect to the target database + db = client.db(DB_NAME, username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) + + return db + + except Exception as e: + print(e) + raise e diff 
--git a/comps/feedback_management/arango/arango_store.py b/comps/feedback_management/arango/arango_store.py new file mode 100644 index 0000000000..cd22b80784 --- /dev/null +++ b/comps/feedback_management/arango/arango_store.py @@ -0,0 +1,186 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from arango_conn import ArangoClient +from config import COLLECTION_NAME +from pydantic import BaseModel + + +class FeedbackStore: + + def __init__( + self, + user: str, + ): + self.user = user + + def initialize_storage(self) -> None: + self.db_client = ArangoClient.get_db_client() + + if not self.db_client.has_collection(COLLECTION_NAME): + self.db_client.create_collection(COLLECTION_NAME) + + self.collection = self.db_client.collection(COLLECTION_NAME) + + def save_feedback(self, feedback_data: BaseModel) -> str: + """Stores a new feedback data into the storage. + + Args: + feedback_data (object): The document to be stored. + + Returns: + str: The ID of the inserted feedback data. + + Raises: + Exception: If an error occurs while storing the feedback_data. + """ + try: + model_dump = feedback_data.model_dump(by_alias=True, mode="json", exclude={"feedback_id"}) + + inserted_feedback_data = self.collection.insert(model_dump) + + feedback_id = str(inserted_feedback_data["_key"]) + + return feedback_id + + except Exception as e: + print(e) + raise Exception(e) + + def update_feedback(self, feedback_data: BaseModel) -> bool: + """Update a feedback data in the collection with given id. + + Args: + feedback_id (str): The ID of the data to be updated. + updated_data (object): The data to be updated in the entry. + + Returns: + bool: True if the data updated successfully, False otherwise. + + Raises: + KeyError: If the document with ID is not found. + Exception: If the user does not match with the document user. + Exception: If an error occurs while updating the feedback data. 
+ """ + _key = feedback_data.feedback_id + document = self.collection.get(_key) + + if document is None: + raise KeyError(f"Document with ID: {_key} not found.") + + if document["chat_data"]["user"] != self.user: + raise Exception(f"User mismatch. Document with ID: {_key} does not belong to user: {self.user}") + + try: + model_dump = feedback_data.feedback_data.model_dump(by_alias=True, mode="json") + + self.collection.update( + {"_key": _key, "feedback_data": model_dump}, + merge=True, + keep_none=False, + ) + + print(f"Updated document: {_key} !") + + return True + + except Exception as e: + print("Not able to update the data.") + print(e) + raise Exception(e) + + def get_all_feedback_of_user(self) -> list[dict]: + """Retrieves all feedback data of a user from the collection. + + Returns: + list[dict] | None: List of dict of feedback data of the user, None otherwise. + + Raises: + Exception: If there is an error while retrieving data. + """ + try: + feedback_data_list: list = [] + + # TODO: Clarify if we actually want to omit the `feedback_data` field. + # Implemented using MongoDB Feedback Management as a reference. + cursor = """ + FOR doc IN @@collection + FILTER doc.chat_data.user == @user + RETURN UNSET(doc, "feedback_data") + """ + + cursor = self.db_client.aql.execute( + cursor, bind_vars={"@collection": self.collection.name, "user": self.user} + ) + + for document in cursor: + document["feedback_id"] = str(document["_key"]) + del document["_id"] + del document["_key"] + del document["_rev"] + + feedback_data_list.append(document) + + return feedback_data_list + + except Exception as e: + print(e) + raise Exception(e) + + def get_feedback_by_id(self, feedback_id: str) -> dict | None: + """Retrieves a user feedback data from the collection based on the given feedback ID. + + Args: + feedback_id (str): The ID of the feedback data to retrieve. + + Returns: + dict | None: The user's feedback data if found, None otherwise. 
+ + Raises: + KeyError: If document with ID is not found. + Exception: If the user does not match with the document user. + """ + response = self.collection.get(feedback_id) + + if response is None: + raise KeyError(f"Feedback with ID: {feedback_id} not found.") + + if response["chat_data"]["user"] != self.user: + raise Exception(f"User mismatch. Feedback with ID: {feedback_id} does not belong to user: {self.user}") + + del response["_id"] + del response["_key"] + del response["_rev"] + + return response + + def delete_feedback(self, feedback_id: str) -> bool: + """Delete a feedback data from collection by given feedback_id. + + Args: + feedback_id(str): The ID of the feedback data to be deleted. + + Returns: + bool: True if feedback is successfully deleted, False otherwise. + + Raises: + KeyError: If the provided feedback_id is invalid: + Exception: If the user does not match with the document user. + Exception: If any errors occurs during delete process. + """ + response = self.collection.get(feedback_id) + + if response is None: + raise KeyError(f"Feedback with ID: {feedback_id} not found.") + + if response["chat_data"]["user"] != self.user: + raise Exception(f"User mismatch. 
Feedback with ID: {feedback_id} does not belong to user: {self.user}") + + try: + response = self.collection.delete(feedback_id) + print(f"Deleted document: {feedback_id} !") + + return True + except Exception as e: + print(e) + raise Exception("Not able to delete the data.") diff --git a/comps/feedback_management/arango/config.py b/comps/feedback_management/arango/config.py new file mode 100644 index 0000000000..e3272febf8 --- /dev/null +++ b/comps/feedback_management/arango/config.py @@ -0,0 +1,13 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + +# ARANGO configuration +ARANGO_HOST = os.getenv("ARANGO_HOST", "localhost") +ARANGO_PORT = os.getenv("ARANGO_PORT", 8529) +ARANGO_USERNAME = os.getenv("ARANGO_USERNAME", "root") +ARANGO_PASSWORD = os.getenv("ARANGO_PASSWORD", "test") +DB_NAME = os.getenv("DB_NAME", "OPEA") +COLLECTION_NAME = os.getenv("COLLECTION_NAME", "Feedback") +PROTOCOL = os.getenv("PROTOCOL", "http") diff --git a/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml b/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml new file mode 100644 index 0000000000..f4be0c845e --- /dev/null +++ b/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml @@ -0,0 +1,38 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +version: "3" +services: + arangodb: + image: arangodb/arangodb:latest + container_name: arangodb + ports: + - 8529:8529 + environment: + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + no_proxy: ${no_proxy} + ARANGO_ROOT_PASSWORD: ${ARANGO_ROOT_PASSWORD} + + feedbackmanagement-arango: + image: opea/feedbackmanagement-arango:latest + container_name: feedbackmanagement-arango-server + ports: + - "6016:6016" + ipc: host + environment: + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + no_proxy: ${no_proxy} + ARANGO_HOST: ${ARANGO_HOST} + ARANGO_PORT: ${ARANGO_PORT} + ARANGO_USERNAME: 
${ARANGO_USERNAME} + ARANGO_PASSWORD: ${ARANGO_PASSWORD} + PROTOCOL: ${PROTOCOL} + DB_NAME: ${DB_NAME} + COLLECTION_NAME: ${COLLECTION_NAME} + restart: unless-stopped + +networks: + feedback_network: + driver: bridge diff --git a/comps/feedback_management/arango/feedback.py b/comps/feedback_management/arango/feedback.py new file mode 100644 index 0000000000..f1efa6f435 --- /dev/null +++ b/comps/feedback_management/arango/feedback.py @@ -0,0 +1,172 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os +from typing import Annotated, Optional + +from arango_store import FeedbackStore +from fastapi import HTTPException +from pydantic import BaseModel, Field + +from comps import CustomLogger +from comps.cores.mega.micro_service import opea_microservices, register_microservice +from comps.cores.proto.api_protocol import ChatCompletionRequest + +logger = CustomLogger("feedback_arango") +logflag = os.getenv("LOGFLAG", False) + + +class FeedbackData(BaseModel): + """This class represents the data model of FeedbackData collected to store in database.". + + Attributes: + is_thumbs_up (bool): True if the response is satisfy, False otherwise. + rating: (int)[Optional]: Score rating. Range from 0 (bad rating) to 5(good rating). + comment (str)[Optional]: Comment given for response. + """ + + is_thumbs_up: bool + rating: Annotated[Optional[int], Field(ge=0, le=5)] = None + comment: Optional[str] = None + + +class ChatFeedback(BaseModel): + """This class represents the model for chat to collect FeedbackData together with ChatCompletionRequest data to store in database. + + Attributes: + chat_data (ChatCompletionRequest): ChatCompletionRequest object containing chat data to be stored. + feedback_data (FeedbackData): FeedbackData object containing feedback data for chat to be stored. + chat_id (str)[Optional]: The chat_id associated to the chat to be store together with feedback data. 
+ feedback_id (str)[Optional]: The feedback_id of feedback data to be retrieved from database. + """ + + chat_data: ChatCompletionRequest + feedback_data: FeedbackData + chat_id: Optional[str] = None + feedback_id: Optional[str] = None + + +class FeedbackId(BaseModel): + """This class represent the data model for retrieve feedback data stored in database. + + Attributes: + user (str): The user of the requested feedback data. + feedback_id (str): The feedback_id of feedback data to be retrieved from database. + """ + + user: str + feedback_id: Optional[str] = None + + +@register_microservice( + name="opea_service@feedback_arango", + endpoint="/v1/feedback/create", + host="0.0.0.0", + input_datatype=FeedbackData, + port=6016, +) +async def create_feedback_data(feedback: ChatFeedback): + """Creates and stores a feedback data in database. + + Args: + feedback (ChatFeedback): The ChatFeedback class object containing feedback data to be stored. + + Returns: + response (str/bool): FeedbackId of the object created in database. True if data update successfully. + """ + if logflag: + logger.info(feedback) + + try: + feedback_store = FeedbackStore(feedback.chat_data.user) + feedback_store.initialize_storage() + if feedback.feedback_id is None: + response = feedback_store.save_feedback(feedback) + else: + response = feedback_store.update_feedback(feedback) + + if logflag: + logger.info(response) + return response + + except Exception as e: + logger.info(f"An error occurred: {str(e)}") + raise HTTPException(status_code=500, detail=str(e)) + + +@register_microservice( + name="opea_service@feedback_arango", + endpoint="/v1/feedback/get", + host="0.0.0.0", + input_datatype=FeedbackId, + port=6016, +) +async def get_feedback(feedback: FeedbackId): + """Retrieves feedback_data from feedback store based on provided FeedbackId or user. + + Args: + feedback (FeedbackId): The FeedbackId object containing user and feedback_id or chat_id. 
+ + Returns: + JSON: Retrieved feedback data if successful, error otherwise. + """ + if logflag: + logger.info(feedback) + + try: + feedback_store = FeedbackStore(feedback.user) + feedback_store.initialize_storage() + if feedback.feedback_id: + response = feedback_store.get_feedback_by_id(feedback.feedback_id) + else: + response = feedback_store.get_all_feedback_of_user() + + if logflag: + logger.info(response) + + return response + + except Exception as e: + logger.info(f"An error occurred: {str(e)}") + raise HTTPException(status_code=500, detail=str(e)) + + +@register_microservice( + name="opea_service@feedback_arango", + endpoint="/v1/feedback/delete", + host="0.0.0.0", + input_datatype=FeedbackId, + port=6016, +) +async def delete_feedback(feedback: FeedbackId): + """Delete a feedback data from feedback store by given feedback Id. + + Args: + feedback (FeedbackId): The FeedbackId object containing user and feedback_id or chat_id + + Returns: + Result of deletion if successful, None otherwise. 
+ """ + if logflag: + logger.info(feedback) + + try: + feedback_store = FeedbackStore(feedback.user) + feedback_store.initialize_storage() + if feedback.feedback_id is None: + raise Exception("feedback_id is required.") + else: + response = feedback_store.delete_feedback(feedback.feedback_id) + + if logflag: + logger.info(response) + + return response + + except Exception as e: + logger.info(f"An error occurred: {str(e)}") + raise HTTPException(status_code=500, detail=str(e)) + + +if __name__ == "__main__": + opea_microservices["opea_service@feedback_arango"].start() diff --git a/comps/feedback_management/arango/requirements.txt b/comps/feedback_management/arango/requirements.txt new file mode 100644 index 0000000000..9e5d0de8e0 --- /dev/null +++ b/comps/feedback_management/arango/requirements.txt @@ -0,0 +1 @@ +python-arango \ No newline at end of file From b6ded9fb924a5d3fbb1645114ddf874d409d7c57 Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Mon, 25 Nov 2024 19:05:21 -0500 Subject: [PATCH 07/35] remove: `PROTOCOL` env --- comps/feedback_management/arango/README.md | 2 -- comps/feedback_management/arango/arango_conn.py | 4 ++-- comps/feedback_management/arango/config.py | 3 +-- .../arango/docker-compose-user-feedback-arango.yaml | 3 +-- 4 files changed, 4 insertions(+), 8 deletions(-) diff --git a/comps/feedback_management/arango/README.md b/comps/feedback_management/arango/README.md index 8eb223ce95..e0f070b684 100644 --- a/comps/feedback_management/arango/README.md +++ b/comps/feedback_management/arango/README.md @@ -13,7 +13,6 @@ export ARANGO_USERNAME=${ARANGO_USERNAME} export ARANGO_PASSWORD=${ARANGO_PASSWORD} export DB_NAME=${DB_NAME} export COLLECTION_NAME=${COLLECTION_NAME} -export PROTOCOL=${PROTOCOL} export PYTHONPATH={Path to base of directory} ``` @@ -49,7 +48,6 @@ docker build -t opea/feedbackmanagement-arango-server:latest --build-arg https_p -e ARANGO_USERNAME=${ARANGO_USERNAME} \ -e ARANGO_PASSWORD=${ARANGO_PASSWORD} \ -e DB_NAME=${DB_NAME} 
\ - -e PROTOCOL=${PROTOCOL} \ -e COLLECTION_NAME=${COLLECTION_NAME} \ opea/feedbackmanagement-arango-server:latest diff --git a/comps/feedback_management/arango/arango_conn.py b/comps/feedback_management/arango/arango_conn.py index f9ac9e411d..84ded04283 100644 --- a/comps/feedback_management/arango/arango_conn.py +++ b/comps/feedback_management/arango/arango_conn.py @@ -3,11 +3,11 @@ from arango import ArangoClient as PythonArangoClient from arango.database import StandardDatabase -from config import ARANGO_HOST, ARANGO_PASSWORD, ARANGO_PORT, ARANGO_USERNAME, DB_NAME, PROTOCOL +from config import ARANGO_HOST, ARANGO_PASSWORD, ARANGO_PORT, ARANGO_USERNAME, DB_NAME class ArangoClient: - conn_url = f"{PROTOCOL}://{ARANGO_HOST}:{ARANGO_PORT}/" + conn_url = f"arangodb://{ARANGO_HOST}:{ARANGO_PORT}/" @staticmethod def get_db_client() -> StandardDatabase: diff --git a/comps/feedback_management/arango/config.py b/comps/feedback_management/arango/config.py index e3272febf8..c332de7e5c 100644 --- a/comps/feedback_management/arango/config.py +++ b/comps/feedback_management/arango/config.py @@ -9,5 +9,4 @@ ARANGO_USERNAME = os.getenv("ARANGO_USERNAME", "root") ARANGO_PASSWORD = os.getenv("ARANGO_PASSWORD", "test") DB_NAME = os.getenv("DB_NAME", "OPEA") -COLLECTION_NAME = os.getenv("COLLECTION_NAME", "Feedback") -PROTOCOL = os.getenv("PROTOCOL", "http") +COLLECTION_NAME = os.getenv("COLLECTION_NAME", "Feedback") \ No newline at end of file diff --git a/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml b/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml index f4be0c845e..62ab0df544 100644 --- a/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml +++ b/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml @@ -28,11 +28,10 @@ services: ARANGO_PORT: ${ARANGO_PORT} ARANGO_USERNAME: ${ARANGO_USERNAME} ARANGO_PASSWORD: ${ARANGO_PASSWORD} - PROTOCOL: ${PROTOCOL} DB_NAME: ${DB_NAME} 
COLLECTION_NAME: ${COLLECTION_NAME} restart: unless-stopped networks: - feedback_network: + default: driver: bridge From 17467499a6fd22afcdb4d058984af6e8735cdae2 Mon Sep 17 00:00:00 2001 From: SLasyaN Date: Tue, 26 Nov 2024 13:47:41 -0800 Subject: [PATCH 08/35] ArangoDB: PromptRegistry (#8) * Initial commit * remove unnecessary files * code review * update: `prompt_search` * new: `ARANGO_PROTOCOL` * README * cleanup --------- Co-authored-by: lasyasn Co-authored-by: Anthony Mahanna --- comps/feedback_management/README.md | 4 + comps/feedback_management/arango/README.md | 4 + .../feedback_management/arango/arango_conn.py | 4 +- comps/feedback_management/arango/config.py | 3 +- .../docker-compose-user-feedback-arango.yaml | 3 +- comps/prompt_registry/README.md | 4 + comps/prompt_registry/arango/DockerFile | 30 +++ comps/prompt_registry/arango/README.md | 120 ++++++++++ comps/prompt_registry/arango/arango_conn.py | 32 +++ comps/prompt_registry/arango/arango_store.py | 213 ++++++++++++++++++ comps/prompt_registry/arango/config.py | 13 ++ ...docker-compose-prompt-registry-arango.yaml | 38 ++++ comps/prompt_registry/arango/prompt.py | 148 ++++++++++++ comps/prompt_registry/arango/requirements.txt | 1 + 14 files changed, 613 insertions(+), 4 deletions(-) create mode 100644 comps/prompt_registry/arango/DockerFile create mode 100644 comps/prompt_registry/arango/README.md create mode 100644 comps/prompt_registry/arango/arango_conn.py create mode 100644 comps/prompt_registry/arango/arango_store.py create mode 100644 comps/prompt_registry/arango/config.py create mode 100644 comps/prompt_registry/arango/docker-compose-prompt-registry-arango.yaml create mode 100644 comps/prompt_registry/arango/prompt.py create mode 100644 comps/prompt_registry/arango/requirements.txt diff --git a/comps/feedback_management/README.md b/comps/feedback_management/README.md index 2e68aa413c..9cd4b42a51 100644 --- a/comps/feedback_management/README.md +++ b/comps/feedback_management/README.md @@ -20,3 
+20,7 @@ The Feedback Management microservice able to support various database backends f ### Feedback Management with MongoDB For more detail, please refer to this [README](./mongo/README.md) + +### Feedback Management with ArangoDB + +For more detail, please refer to this [README](./arango/README.md) diff --git a/comps/feedback_management/arango/README.md b/comps/feedback_management/arango/README.md index e0f070b684..7e9a5f8400 100644 --- a/comps/feedback_management/arango/README.md +++ b/comps/feedback_management/arango/README.md @@ -6,9 +6,12 @@ This README provides setup guides and all the necessary information about the Fe ## Setup Environment Variables +See `config.py` for default values. + ```bash export ARANGO_HOST=${ARANGO_HOST} export ARANGO_PORT=${ARANGO_PORT} +export ARANGO_PROTOCOL=${ARANGO_PROTOCOL} export ARANGO_USERNAME=${ARANGO_USERNAME} export ARANGO_PASSWORD=${ARANGO_PASSWORD} export DB_NAME=${DB_NAME} @@ -45,6 +48,7 @@ docker build -t opea/feedbackmanagement-arango-server:latest --build-arg https_p -e no_proxy=$no_proxy \ -e ARANGO_HOST=${ARANGO_HOST} \ -e ARANGO_PORT=${ARANGO_PORT} \ + -e ARANGO_PROTOCOL=${ARANGO_PROTOCOL} \ -e ARANGO_USERNAME=${ARANGO_USERNAME} \ -e ARANGO_PASSWORD=${ARANGO_PASSWORD} \ -e DB_NAME=${DB_NAME} \ diff --git a/comps/feedback_management/arango/arango_conn.py b/comps/feedback_management/arango/arango_conn.py index 84ded04283..d6c4b59777 100644 --- a/comps/feedback_management/arango/arango_conn.py +++ b/comps/feedback_management/arango/arango_conn.py @@ -3,11 +3,11 @@ from arango import ArangoClient as PythonArangoClient from arango.database import StandardDatabase -from config import ARANGO_HOST, ARANGO_PASSWORD, ARANGO_PORT, ARANGO_USERNAME, DB_NAME +from config import ARANGO_HOST, ARANGO_PASSWORD, ARANGO_PORT, ARANGO_PROTOCOL, ARANGO_USERNAME, DB_NAME class ArangoClient: - conn_url = f"arangodb://{ARANGO_HOST}:{ARANGO_PORT}/" + conn_url = f"{ARANGO_PROTOCOL}://{ARANGO_HOST}:{ARANGO_PORT}/" @staticmethod def 
get_db_client() -> StandardDatabase: diff --git a/comps/feedback_management/arango/config.py b/comps/feedback_management/arango/config.py index c332de7e5c..bb790eb38a 100644 --- a/comps/feedback_management/arango/config.py +++ b/comps/feedback_management/arango/config.py @@ -6,7 +6,8 @@ # ARANGO configuration ARANGO_HOST = os.getenv("ARANGO_HOST", "localhost") ARANGO_PORT = os.getenv("ARANGO_PORT", 8529) +ARANGO_PROTOCOL = os.getenv("ARANGO_PROTOCOL", "http") ARANGO_USERNAME = os.getenv("ARANGO_USERNAME", "root") ARANGO_PASSWORD = os.getenv("ARANGO_PASSWORD", "test") DB_NAME = os.getenv("DB_NAME", "OPEA") -COLLECTION_NAME = os.getenv("COLLECTION_NAME", "Feedback") \ No newline at end of file +COLLECTION_NAME = os.getenv("COLLECTION_NAME", "Feedback") diff --git a/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml b/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml index 62ab0df544..8f9b3a85a8 100644 --- a/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml +++ b/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml @@ -3,7 +3,7 @@ version: "3" services: - arangodb: + arango: image: arangodb/arangodb:latest container_name: arangodb ports: @@ -26,6 +26,7 @@ services: no_proxy: ${no_proxy} ARANGO_HOST: ${ARANGO_HOST} ARANGO_PORT: ${ARANGO_PORT} + ARANGO_PROTOCOL: ${ARANGO_PROTOCOL} ARANGO_USERNAME: ${ARANGO_USERNAME} ARANGO_PASSWORD: ${ARANGO_PASSWORD} DB_NAME: ${DB_NAME} diff --git a/comps/prompt_registry/README.md b/comps/prompt_registry/README.md index 6332a1a13a..a99b1b27b7 100644 --- a/comps/prompt_registry/README.md +++ b/comps/prompt_registry/README.md @@ -19,3 +19,7 @@ The Prompt Registry microservice able to support various database backends for s ### Prompt Registry with MongoDB For more detail, please refer to this [README](./mongo/README.md) + +### Prompt Registry with ArangoDB + +For more detail, please refer to this [README](./arango/README.md) diff --git 
a/comps/prompt_registry/arango/DockerFile b/comps/prompt_registry/arango/DockerFile new file mode 100644 index 0000000000..0659202058 --- /dev/null +++ b/comps/prompt_registry/arango/DockerFile @@ -0,0 +1,30 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +FROM python:3.11-slim + +ENV LANG=C.UTF-8 + +RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ + build-essential \ + libgl1-mesa-glx \ + libjemalloc-dev + +RUN useradd -m -s /bin/bash user && \ + mkdir -p /home/user && \ + chown -R user /home/user/ + +USER user + +COPY comps /home/user/comps +COPY requirements.txt /home/user/ + +RUN pip install --no-cache-dir --upgrade pip setuptools && \ + pip install --no-cache-dir -r /home/user/comps/prompt_registry/arango/requirements.txt && \ + pip install --no-cache-dir -r /home/user/requirements.txt + +ENV PYTHONPATH=$PYTHONPATH:/home/user + +WORKDIR /home/user/comps/prompt_registry/arango + +ENTRYPOINT ["python", "prompt.py"] diff --git a/comps/prompt_registry/arango/README.md b/comps/prompt_registry/arango/README.md new file mode 100644 index 0000000000..e4bdd6c101 --- /dev/null +++ b/comps/prompt_registry/arango/README.md @@ -0,0 +1,120 @@ +# 🧾 Prompt Registry Microservice with ArangoDB + +This README provides setup guides and all the necessary information about the Prompt Registry microservice with ArangoDB database. + +--- + +## Setup Environment Variables + +See `config.py` for default values. 
+ +```bash +export ARANGO_HOST=${ARANGO_HOST} +export ARANGO_PORT=${ARANGO_PORT} +export ARANGO_PROTOCOL=${ARANGO_PROTOCOL} +export ARANGO_USERNAME=${ARANGO_USERNAME} +export ARANGO_PASSWORD=${ARANGO_PASSWORD} +export DB_NAME=${DB_NAME} +export COLLECTION_NAME=${COLLECTION_NAME} +``` + +--- + +## 🚀Start Microservice with Docker + +### Build Docker Image + +```bash +cd ~/GenAIComps +docker build -t opea/promptregistry-arango-server:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/prompt_registry/arango/Dockerfile . +``` + +### Run Docker with CLI + + +- Run ArangoDB image container + + ```bash + docker run -d -p 8529:8529 --name=arango arangodb/arangodb:latest + ``` + +- Run Prompt Registry microservice + + ```bash + docker run -d -p 6018:6018 \ + --name="promptregistry-arango-server" \ + -e http_proxy=$http_proxy \ + -e https_proxy=$https_proxy \ + -e no_proxy=$no_proxy \ + -e ARANGO_HOST=${ARANGO_HOST} \ + -e ARANGO_PORT=${ARANGO_PORT} \ + -e ARANGO_PROTOCOL=${ARANGO_PROTOCOL} \ + -e ARANGO_USERNAME=${ARANGO_USERNAME} \ + -e ARANGO_PASSWORD=${ARANGO_PASSWORD} \ + -e DB_NAME=${DB_NAME} \ + -e COLLECTION_NAME=${COLLECTION_NAME} \ + opea/promptregistry-arango-server:latest + + ``` + +--- + +### ✅ Invoke Microservice + +The Prompt Registry microservice exposes the following API endpoints: + +- Save prompt + + ```bash + curl -X 'POST' \ + http://${host_ip}:6018/v1/prompt/create \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "prompt_text": "test prompt", "user": "test" + }' + ``` + +- Retrieve prompt from database by user + + ```bash + curl -X 'POST' \ + http://${host_ip}:6018/v1/prompt/get \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "user": "test"}' + ``` + +- Retrieve prompt from database by prompt_id + + ```bash + curl -X 'POST' \ + http://${host_ip}:6018/v1/prompt/get \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d 
'{ + "user": "test", "prompt_id":"{_id returned from save prompt route above}"}' + ``` + +- Retrieve relevant prompt by keyword + + ```bash + curl -X 'POST' \ + http://${host_ip}:6018/v1/prompt/get \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "user": "test", "prompt_text": "{keyword to search}"}' + ``` + +- Delete prompt by prompt_id + + ```bash + curl -X 'POST' \ + http://${host_ip}:6018/v1/prompt/delete \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "user": "test", "prompt_id":"{prompt_id to be deleted}"}' + ``` diff --git a/comps/prompt_registry/arango/arango_conn.py b/comps/prompt_registry/arango/arango_conn.py new file mode 100644 index 0000000000..d6c4b59777 --- /dev/null +++ b/comps/prompt_registry/arango/arango_conn.py @@ -0,0 +1,32 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from arango import ArangoClient as PythonArangoClient +from arango.database import StandardDatabase +from config import ARANGO_HOST, ARANGO_PASSWORD, ARANGO_PORT, ARANGO_PROTOCOL, ARANGO_USERNAME, DB_NAME + + +class ArangoClient: + conn_url = f"{ARANGO_PROTOCOL}://{ARANGO_HOST}:{ARANGO_PORT}/" + + @staticmethod + def get_db_client() -> StandardDatabase: + try: + # Create client + client = PythonArangoClient(hosts=ArangoClient.conn_url) + + # First connect to _system database + sys_db = client.db("_system", username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) + + # Create target database if it doesn't exist + if not sys_db.has_database(DB_NAME): + sys_db.create_database(DB_NAME) + + # Now connect to the target database + db = client.db(DB_NAME, username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) + + return db + + except Exception as e: + print(e) + raise e diff --git a/comps/prompt_registry/arango/arango_store.py b/comps/prompt_registry/arango/arango_store.py new file mode 100644 index 0000000000..fb80ccd20c --- /dev/null +++ 
b/comps/prompt_registry/arango/arango_store.py @@ -0,0 +1,213 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + +from arango.exceptions import IndexGetError +from arango_conn import ArangoClient +from config import COLLECTION_NAME +from prompt import PromptCreate +from pydantic import BaseModel + +from comps import CustomLogger + +logger = CustomLogger("arango_store") +logflag = os.getenv("LOGFLAG", False) + + +class PromptStore: + + def __init__( + self, + user: str, + ): + self.user = user + self.inverted_index_exists = False + + def initialize_storage(self) -> None: + self.db_client = ArangoClient.get_db_client() + + if not self.db_client.has_collection(COLLECTION_NAME): + self.db_client.create_collection(COLLECTION_NAME) + + self.collection = self.db_client.collection(COLLECTION_NAME) + + def save_prompt(self, prompt: PromptCreate): + """Stores a new prompt into the storage. + + Args: + prompt: The document to be stored. It should be a Pydantic model. + + Returns: + str: The ID of the inserted prompt. + + Raises: + Exception: If an error occurs while storing the prompt. + """ + try: + model_dump = prompt.model_dump(by_alias=True, mode="json", exclude={"id"}) + + inserted_prompt_data = self.collection.insert(model_dump) + + prompt_id = str(inserted_prompt_data["_key"]) + + return prompt_id + + except Exception as e: + print(e) + raise Exception(e) + + def get_all_prompt_of_user(self) -> list[dict]: + """Retrieves all prompts of a user from the collection. + + Returns: + list[dict] | None: List of dict of prompts of the user, None otherwise. + + Raises: + Exception: If there is an error while retrieving data. + """ + try: + prompt_data_list: list = [] + + # TODO: Clarify if we actually want to omit the `data` field. + # Implemented using MongoDB Prompt Registry as a reference. 
+ cursor = """ + FOR doc IN @@collection + FILTER doc.chat_data.user == @user + RETURN UNSET(doc, "data") + """ + + cursor = self.db_client.aql.execute( + cursor, bind_vars={"@collection": self.collection.name, "user": self.user} + ) + + for document in cursor: + document["id"] = str(document["_key"]) + del document["_id"] + del document["_key"] + del document["_rev"] + + prompt_data_list.append(document) + + return prompt_data_list + + except Exception as e: + print(e) + raise Exception(e) + + def get_user_prompt_by_id(self, prompt_id: str) -> dict | None: + """Retrieves a user prompt from the collection based on the given prompt ID. + + Args: + prompt_id (str): The ID of the prompt to retrieve. + + Returns: + dict | None: The user prompt if found, None otherwise. + + Raises: + KeyError: If document with ID is not found. + Exception: If the user does not match with the document user. + """ + response = self.collection.get(prompt_id) + + if response is None: + raise KeyError(f"Prompt with ID: {prompt_id} not found.") + + if response["user"] != self.user: + raise Exception(f"User mismatch. Prompt with ID: {prompt_id} does not belong to user: {self.user}") + + del response["_id"] + del response["_key"] + del response["_rev"] + + return response + + def prompt_search(self, keyword: str) -> list | None: + """Retrieves prompt from the collection based on keyword provided. + + Args: + keyword (str): The keyword of prompt to search for. + + Returns: + list | None: The list of relevant prompt if found, None otherwise. + + Raises: + Exception: If there is an error while searching data. 
+ """ + try: + index_name = "prompt_text_index" + + if not self.inverted_index_exists: + try: + self.collection.get_index(index_name) + + except IndexGetError: + self.collection.add_inverted_index( + fields=["prompt_text"], + name=index_name, + # TODO: add more kwargs if needed + ) + + self.inverted_index_exists = True + + query = """ + FOR doc IN @@collection + OPTIONS { indexHint: @index_name, forceIndexHint: true } + FILTER PHRASE(doc.prompt_text, @keyword, "text_en") + RETURN doc + """ + + cursor = self.db_client.aql.execute( + query, + bind_vars={ + "@collection": self.collection.name, + "index_name": index_name, + "keyword": keyword, + }, + ) + + serialized_data = [] + for doc in cursor: + doc["id"] = str(doc["_key"]) + del doc["_id"] + del doc["_key"] + del doc["_rev"] + + serialized_data.append(doc) + + return serialized_data + + except Exception as e: + print(e) + raise Exception(e) + + def delete_prompt(self, prompt_id: str) -> bool: + """Delete a prompt from collection by given prompt_id. + + Args: + prompt_id(str): The ID of the prompt to be deleted. + + Returns: + bool: True if prompt is successfully deleted, False otherwise. + + Raises: + KeyError: If the provided feedback_id is invalid: + Exception: If the user does not match with the document user. + Exception: If any errors occurs during delete process. + """ + response = self.collection.get(prompt_id) + + if response is None: + raise KeyError(f"Feedback with ID: {prompt_id} not found.") + + if response["user"] != self.user: + raise Exception(f"User mismatch. 
Feedback with ID: {prompt_id} does not belong to user: {self.user}") + + try: + response = self.collection.delete(prompt_id) + print(f"Deleted document: {prompt_id} !") + + return True + except Exception as e: + print(e) + raise Exception("Not able to delete the data.") diff --git a/comps/prompt_registry/arango/config.py b/comps/prompt_registry/arango/config.py new file mode 100644 index 0000000000..9719f1358e --- /dev/null +++ b/comps/prompt_registry/arango/config.py @@ -0,0 +1,13 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + +# ARANGO configuration +ARANGO_HOST = os.getenv("ARANGODB_HOST", "localhost") +ARANGO_PORT = os.getenv("ARANGO_PORT", 8529) +ARANGO_PROTOCOL = os.getenv("ARANGO_PROTOCOL", "http") +ARANGO_USERNAME = os.getenv("ARANGO_USERNAME", "root") +ARANGO_PASSWORD = os.getenv("ARANGO_PASSWORD", "test") +DB_NAME = os.getenv("DB_NAME", "OPEA") +COLLECTION_NAME = os.getenv("COLLECTION_NAME", "Prompt") diff --git a/comps/prompt_registry/arango/docker-compose-prompt-registry-arango.yaml b/comps/prompt_registry/arango/docker-compose-prompt-registry-arango.yaml new file mode 100644 index 0000000000..b1aee077d9 --- /dev/null +++ b/comps/prompt_registry/arango/docker-compose-prompt-registry-arango.yaml @@ -0,0 +1,38 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +version: "3" +services: + arango: + image: arangodb/arangodb:latest + container_name: arangodb + ports: + - 8529:8529 + environment: + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + no_proxy: ${no_proxy} + ARANGO_ROOT_PASSWORD: ${ARANGO_ROOT_PASSWORD} + + promptregistry-arango: + image: opea/promptregistry-arango:latest + container_name: promptregistry-arango-server + ports: + - "6018:6018" + ipc: host + environment: + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + no_proxy: ${no_proxy} + ARANGO_HOST: ${ARANGO_HOST} + ARANGO_PORT: ${ARANGO_PORT} + ARANGO_PROTOCOL: ${ARANGO_PROTOCOL} + 
ARANGO_USERNAME: ${ARANGO_USERNAME} + ARANGO_PASSWORD: ${ARANGO_PASSWORD} + DB_NAME: ${DB_NAME} + COLLECTION_NAME: ${COLLECTION_NAME} + restart: unless-stopped + +networks: + default: + driver: bridge diff --git a/comps/prompt_registry/arango/prompt.py b/comps/prompt_registry/arango/prompt.py new file mode 100644 index 0000000000..c46e0174c2 --- /dev/null +++ b/comps/prompt_registry/arango/prompt.py @@ -0,0 +1,148 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os +from typing import Optional + +from arango_store import PromptStore +from pydantic import BaseModel + +from comps import CustomLogger +from comps.cores.mega.micro_service import opea_microservices, register_microservice + +logger = CustomLogger("prompt_arango") +logflag = os.getenv("LOGFLAG", False) + + +class PromptCreate(BaseModel): + """This class represents the data model for creating and storing a new prompt in the database. + + Attributes: + prompt_text (str): The text content of the prompt. + user (str): The user or creator of the prompt. + """ + + prompt_text: str + user: str + + +class PromptId(BaseModel): + """This class represent the data model for retrieve prompt stored in database. + + Attributes: + user (str): The user of the requested prompt. + prompt_id (str): The prompt_id of prompt to be retrieved from database. + """ + + user: str + prompt_id: Optional[str] = None + prompt_text: Optional[str] = None + + +@register_microservice( + name="opea_service@prompt_arango", + endpoint="/v1/prompt/create", + host="0.0.0.0", + input_datatype=PromptCreate, + port=6018, +) +async def create_prompt(prompt: PromptCreate): + """Creates and stores a prompt in prompt store. + + Args: + prompt (PromptCreate): The PromptCreate class object containing the data to be stored. + + Returns: + JSON (PromptResponse): PromptResponse class object, None otherwise. 
+ """ + if logflag: + logger.info(prompt) + + try: + prompt_store = PromptStore(prompt.user) + prompt_store.initialize_storage() + response = prompt_store.save_prompt(prompt) + if logflag: + logger.info(response) + + return response + + except Exception as error: + logger.error(f"An error occurred: {str(error)}") + raise error + + +@register_microservice( + name="opea_service@prompt_arango", + endpoint="/v1/prompt/get", + host="0.0.0.0", + input_datatype=PromptId, + port=6018, +) +async def get_prompt(prompt: PromptId): + """Retrieves prompt from prompt store based on provided PromptId or user. + + Args: + prompt (PromptId): The PromptId object containing user and prompt_id. + + Returns: + JSON: Retrieved prompt data if successful, None otherwise. + """ + if logflag: + logger.info(prompt) + try: + + prompt_store = PromptStore(prompt.user) + prompt_store.initialize_storage() + + if prompt.prompt_id is not None: + response = prompt_store.get_user_prompt_by_id(prompt.prompt_id) + elif prompt.prompt_text: + response = prompt_store.prompt_search(prompt.prompt_text) + else: + response = prompt_store.get_all_prompt_of_user() + if logflag: + logger.info(response) + return response + + except Exception as error: + logger.error(f"An error occurred: {str(error)}") + raise error + + +@register_microservice( + name="opea_service@prompt_arango", + endpoint="/v1/prompt/delete", + host="0.0.0.0", + input_datatype=PromptId, + port=6018, +) +async def delete_prompt(prompt: PromptId): + """Delete a prompt from prompt store by given PromptId. + + Args: + prompt (PromptId): The PromptId object containing user and prompt_id. + + Returns: + Result of deletion if successful, None otherwise. 
+ """ + if logflag: + logger.info(prompt) + try: + prompt_store = PromptStore(prompt.user) + prompt_store.initialize_storage() + if prompt.prompt_id is None: + raise Exception("Prompt id is required.") + else: + response = prompt_store.delete_prompt(prompt.prompt_id) + if logflag: + logger.info(response) + return response + + except Exception as error: + logger.error(f"An error occurred: {str(error)}") + raise error + + +if __name__ == "__main__": + opea_microservices["opea_service@prompt_arango"].start() diff --git a/comps/prompt_registry/arango/requirements.txt b/comps/prompt_registry/arango/requirements.txt new file mode 100644 index 0000000000..9e5d0de8e0 --- /dev/null +++ b/comps/prompt_registry/arango/requirements.txt @@ -0,0 +1 @@ +python-arango \ No newline at end of file From bf413276134fd8ca942049239caa071b75660876 Mon Sep 17 00:00:00 2001 From: Ajay Kallepalli <72517322+ajaykallepalli@users.noreply.github.com> Date: Tue, 26 Nov 2024 14:59:41 -0800 Subject: [PATCH 09/35] ArangoDB: Chathistory (#10) * Initial chat history implementation without API and docker implementation * make copy and remove async * API functionality matching MongoDB implementation Working API functionality, update to dockerfile required, and additional checks when updating document required. 
* Delete temp.py * Push changes and reset repo * Async definitions working in curl calls, updated read me to ArangoDB setup * Working docker container with network * Removing need for network to be created before docker compose * Cleanup async files and backup files * code review * fix: typo * revert mongo changes --------- Co-authored-by: Anthony Mahanna --- .gitignore | 3 +- comps/chathistory/arango/Dockerfile | 30 +++ comps/chathistory/arango/README.md | 123 ++++++++++++ comps/chathistory/arango/arango_conn.py | 32 +++ comps/chathistory/arango/arango_store.py | 186 ++++++++++++++++++ comps/chathistory/arango/chat.py | 146 ++++++++++++++ comps/chathistory/arango/config.py | 13 ++ .../docker-compose-chathistory-arango.yaml | 38 ++++ comps/chathistory/arango/requirements.txt | 1 + comps/prompt_registry/arango/config.py | 2 +- 10 files changed, 572 insertions(+), 2 deletions(-) create mode 100644 comps/chathistory/arango/Dockerfile create mode 100644 comps/chathistory/arango/README.md create mode 100644 comps/chathistory/arango/arango_conn.py create mode 100644 comps/chathistory/arango/arango_store.py create mode 100644 comps/chathistory/arango/chat.py create mode 100644 comps/chathistory/arango/config.py create mode 100644 comps/chathistory/arango/docker-compose-chathistory-arango.yaml create mode 100644 comps/chathistory/arango/requirements.txt diff --git a/.gitignore b/.gitignore index 9778bf8f78..3a428754d9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ __pycache__ *.egg-info/ .DS_Store -.venv \ No newline at end of file +.venv +venv/ diff --git a/comps/chathistory/arango/Dockerfile b/comps/chathistory/arango/Dockerfile new file mode 100644 index 0000000000..f402e55267 --- /dev/null +++ b/comps/chathistory/arango/Dockerfile @@ -0,0 +1,30 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +FROM python:3.11-slim + +ENV LANG=C.UTF-8 + +RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ + 
build-essential \ + libjemalloc-dev \ + libgl1-mesa-glx + +RUN useradd -m -s /bin/bash user && \ + mkdir -p /home/user && \ + chown -R user /home/user/ + +USER user + +COPY comps /home/user/comps +COPY requirements.txt /home/user/ + +RUN pip install --no-cache-dir --upgrade pip setuptools && \ + pip install --no-cache-dir -r /home/user/comps/chathistory/arango/requirements.txt && \ + pip install --no-cache-dir -r /home/user/requirements.txt + +ENV PYTHONPATH=/home/user + +WORKDIR /home/user/comps/chathistory/mongo + +ENTRYPOINT ["python", "chat.py"] diff --git a/comps/chathistory/arango/README.md b/comps/chathistory/arango/README.md new file mode 100644 index 0000000000..428a65255e --- /dev/null +++ b/comps/chathistory/arango/README.md @@ -0,0 +1,123 @@ +# 📝 Chat History Microservice with ArangoDB + +This README provides setup guides and all the necessary information about the Chat History microservice with ArangoDB database. + +--- + +## Setup Environment Variables + +See `config.py` for default values. + +```bash +export ARANGO_HOST=${ARANGO_HOST} +export ARANGO_PORT=${ARANGO_PORT} +export ARANGO_PROTOCOL=${ARANGO_PROTOCOL} +export ARANGO_USERNAME=${ARANGO_USERNAME} +export ARANGO_PASSWORD=${ARANGO_PASSWORD} +export DB_NAME=${DB_NAME} +export COLLECTION_NAME=${COLLECTION_NAME} +``` + +--- + +## 🚀Start Microservice with Docker + +### Build Docker Image + +```bash +cd ../../../../ +docker build -t opea/chathistory-arango-server:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/chathistory/arango/Dockerfile . 
+``` + +### Run Docker with CLI + +- Run ArangoDB image container + + ```bash + docker run -d -p 8529:8529 --name=arango arangodb/arangodb:latest + ``` + +- Run the Chat History microservice + + ```bash + docker run -p 6012:6012 \ + -e http_proxy=$http_proxy \ + -e https_proxy=$https_proxy \ + -e no_proxy=$no_proxy \ + -e ARANGO_HOST=${ARANGO_HOST} \ + -e ARANGO_PORT=${ARANGO_PORT} \ + -e ARANGO_PROTOCOL=${ARANGO_PROTOCOL} \ + -e ARANGO_USERNAME=${ARANGO_USERNAME} \ + -e ARANGO_PASSWORD=${ARANGO_PASSWORD} \ + -e DB_NAME=${DB_NAME} \ + -e COLLECTION_NAME=${COLLECTION_NAME} \ + opea/chathistory-arango-server:latest + ``` + +--- + +## ✅ Invoke Microservice + +The Chat History microservice exposes the following API endpoints: + +- Create new chat conversation + + ```bash + curl -X 'POST' \ + http://${host_ip}:6012/v1/chathistory/create \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "data": { + "messages": "test Messages", "user": "test" + } + }' + ``` + +- Get all the Conversations for a user + + ```bash + curl -X 'POST' \ + http://${host_ip}:6012/v1/chathistory/get \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "user": "test"}' + ``` + +- Get a specific conversation by id. + + ```bash + curl -X 'POST' \ + http://${host_ip}:6012/v1/chathistory/get \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "user": "test", "id":"48918"}' + ``` + +- Update the conversation by id. + + ```bash + curl -X 'POST' \ + http://${host_ip}:6012/v1/chathistory/create \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "data": { + "messages": "test Messages Update", "user": "test" + }, + "id":"48918" + }' + ``` + +- Delete a stored conversation. 
+ + ```bash + curl -X 'POST' \ + http://${host_ip}:6012/v1/chathistory/delete \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "user": "test", "id":"48918"}' + ``` diff --git a/comps/chathistory/arango/arango_conn.py b/comps/chathistory/arango/arango_conn.py new file mode 100644 index 0000000000..d6c4b59777 --- /dev/null +++ b/comps/chathistory/arango/arango_conn.py @@ -0,0 +1,32 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from arango import ArangoClient as PythonArangoClient +from arango.database import StandardDatabase +from config import ARANGO_HOST, ARANGO_PASSWORD, ARANGO_PORT, ARANGO_PROTOCOL, ARANGO_USERNAME, DB_NAME + + +class ArangoClient: + conn_url = f"{ARANGO_PROTOCOL}://{ARANGO_HOST}:{ARANGO_PORT}/" + + @staticmethod + def get_db_client() -> StandardDatabase: + try: + # Create client + client = PythonArangoClient(hosts=ArangoClient.conn_url) + + # First connect to _system database + sys_db = client.db("_system", username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) + + # Create target database if it doesn't exist + if not sys_db.has_database(DB_NAME): + sys_db.create_database(DB_NAME) + + # Now connect to the target database + db = client.db(DB_NAME, username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) + + return db + + except Exception as e: + print(e) + raise e diff --git a/comps/chathistory/arango/arango_store.py b/comps/chathistory/arango/arango_store.py new file mode 100644 index 0000000000..8ab6928eb7 --- /dev/null +++ b/comps/chathistory/arango/arango_store.py @@ -0,0 +1,186 @@ +from typing import Any + +from arango_conn import ArangoClient +from config import COLLECTION_NAME +from pydantic import BaseModel + + +class DocumentStore: + + def __init__( + self, + user: str, + ): + self.user = user + + def initialize_storage(self) -> None: + self.db_client = ArangoClient.get_db_client() + + if not self.db_client.has_collection(COLLECTION_NAME): + 
self.db_client.create_collection(COLLECTION_NAME) + + self.collection = self.db_client.collection(COLLECTION_NAME) + + def save_document(self, document: BaseModel) -> str: + """Stores a new document into the storage. + + Args: + document: The document to be stored. It should be a Pydantic model. + + Returns: + str: The ID of the inserted document. + + Raises: + Exception: If an error occurs while storing the document. + """ + try: + model_dump = document.model_dump(by_alias=True, mode="json", exclude={"id"}) + + inserted_document = self.collection.insert(model_dump) + + document_id = str(inserted_document["_key"]) + + return document_id + + except Exception as e: + print(e) + raise Exception(e) + + def update_document(self, document_id: str, updated_data: BaseModel, first_query: Any) -> str: + """Updates a document in the collection with the given document_id. + + Args: + document_id (str): The ID of the document to update. + updated_data (object): The updated data to be set in the document. + first_query (object): The first query to be set in the document. + + Returns: + bool: True if the document was successfully updated, False otherwise. + + Raises: + KeyError: If the document with ID is not found. + Exception: If the user does not match with the document user. + Exception: If an error occurs while updating the document data. 
+ """ + document = self.collection.get(document_id) + + if document is None: + raise Exception(f"Unable to find Document {document_id}") + + if document["data"]["user"] != self.user: + raise Exception(f"User {self.user} is not allowed to update Document {document_id}.") + + try: + self.collection.update( + { + "_key": document_id, + "data": updated_data.model_dump(by_alias=True, mode="json"), + "first_query": first_query, + }, + merge=True, + keep_none=True, + ) + + print(f"Updated document: {document_id} !") + + return True + + except Exception as e: + print("Not able to update the data.") + print(e) + raise Exception(e) + + def get_all_documents_of_user(self) -> list[dict]: + """Retrieves all documents of a specific user from the collection. + + Returns: + A list of dictionaries representing the conversation documents. + Raises: + Exception: If there is an error while retrieving the documents. + """ + try: + document_list: list = [] + + # TODO: Clarify if we actually want to omit the `data` field. + # Implemented using MongoDB Feedback Management as a reference. + cursor = """ + FOR doc IN @@collection + FILTER doc.data.user == @user + RETURN UNSET(doc, "data") + """ + + cursor = self.db_client.aql.execute( + cursor, bind_vars={"@collection": self.collection.name, "user": self.user} + ) + + for document in cursor: + document["id"] = str(document["_key"]) + del document["_id"] + del document["_key"] + del document["_rev"] + + document_list.append(document) + + return document_list + + except Exception as e: + print(e) + raise Exception(e) + + def get_user_documents_by_id(self, document_id: str) -> dict | None: + """Retrieves a user document from the collection based on the given document ID. + + Args: + document_id (str): The ID of the document to retrieve. + + Returns: + dict | None: The user document if found, None otherwise. + + Raises: + KeyError: If document with ID is not found. + Exception: If the user does not match with the document user. 
+ """ + response = self.collection.get(document_id) + + if response is None: + raise KeyError(f"Document with ID: {document_id} not found.") + + if response["data"]["user"] != self.user: + raise Exception(f"User mismatch. Document with ID: {document_id} does not belong to user: {self.user}") + + del response["_id"] + del response["_key"] + del response["_rev"] + + return response + + def delete_document(self, document_id: str) -> str: + """Deletes a document from the collection based on the provided document ID. + + Args: + document_id (str): The ID of the document to be deleted. + + Returns: + bool: True if the document is successfully deleted, False otherwise. + + Raises: + KeyError: If the provided document_id is invalid: + Exception: If the user does not match with the document user. + Exception: If any errors occurs during delete process. + """ + response = self.collection.get(document_id) + + if response is None: + raise KeyError(f"Document with ID: {document_id} not found.") + + if response["data"]["user"] != self.user: + raise Exception(f"User mismatch. 
Feedback with ID: {document_id} does not belong to user: {self.user}") + + try: + response = self.collection.delete(document_id) + print(f"Deleted document: {document_id} !") + + return True + except Exception as e: + print(e) + raise Exception("Not able to delete the data.") diff --git a/comps/chathistory/arango/chat.py b/comps/chathistory/arango/chat.py new file mode 100644 index 0000000000..ce9c0a16eb --- /dev/null +++ b/comps/chathistory/arango/chat.py @@ -0,0 +1,146 @@ +import os +from typing import Optional + +from arango_store import DocumentStore +from fastapi import HTTPException +from pydantic import BaseModel + +from comps import CustomLogger +from comps.cores.mega.micro_service import opea_microservices, register_microservice +from comps.cores.proto.api_protocol import ChatCompletionRequest + +logger = CustomLogger("chathistory_arango") +logflag = os.getenv("LOGFLAG", False) + + +class ChatMessage(BaseModel): + data: ChatCompletionRequest + first_query: Optional[str] = None + id: Optional[str] = None + + +class ChatId(BaseModel): + user: str + id: Optional[str] = None + + +def get_first_string(value): + if isinstance(value, str): + return value + elif isinstance(value, list): + # Assuming we want the first string from the first dictionary + if value and isinstance(value[0], dict): + first_dict = value[0] + if first_dict: + # Get the first value from the dictionary + first_key = next(iter(first_dict)) + return first_dict[first_key] + + +@register_microservice( + name="opea_service@chathistory_arango", + endpoint="/v1/chathistory/create", + host="0.0.0.0", + input_datatype=ChatMessage, + port=6012, +) +async def create_documents(document: ChatMessage): + """Creates or updates a document in the document store. + + Args: + document (ChatMessage): The ChatMessage object containing the data to be stored. + + Returns: + The result of the operation if successful, None otherwise. 
+ """ + if logflag: + logger.info(document) + try: + if document.data.user is None: + raise HTTPException(status_code=500, detail="Please provide the user information") + store = DocumentStore(document.data.user) + store.initialize_storage() + if document.first_query is None: + document.first_query = get_first_string(document.data.messages) + if document.id: + res = store.update_document(document.id, document.data, document.first_query) + else: + res = store.save_document(document) + if logflag: + logger.info(res) + return res + except Exception as e: + logger.info(f"An error occurred: {str(e)}") + raise HTTPException(status_code=500, detail=str(e)) + + +@register_microservice( + name="opea_service@chathistory_arango", + endpoint="/v1/chathistory/get", + host="0.0.0.0", + input_datatype=ChatId, + port=6012, +) +async def get_documents(document: ChatId): + """Retrieves documents from the document store based on the provided ChatId. + + Args: + document (ChatId): The ChatId object containing the user and optional document id. + + Returns: + The retrieved documents if successful, None otherwise. + """ + if logflag: + logger.info(document) + try: + store = DocumentStore(document.user) + store.initialize_storage() + if document.id is None: + res = store.get_all_documents_of_user() + else: + res = store.get_user_documents_by_id(document.id) + if logflag: + logger.info(res) + return res + except Exception as e: + # Handle the exception here + logger.info(f"An error occurred: {str(e)}") + raise HTTPException(status_code=500, detail=str(e)) + + +@register_microservice( + name="opea_service@chathistory_arango", + endpoint="/v1/chathistory/delete", + host="0.0.0.0", + input_datatype=ChatId, + port=6012, +) +async def delete_documents(document: ChatId): + """Deletes a document from the document store based on the provided ChatId. + + Args: + document (ChatId): The ChatId object containing the user and document id. 
+ + Returns: + The result of the deletion if successful, None otherwise. + """ + if logflag: + logger.info(document) + try: + store = DocumentStore(document.user) + store.initialize_storage() + if document.id is None: + raise Exception("Document id is required.") + else: + res = store.delete_document(document.id) + if logflag: + logger.info(res) + return res + except Exception as e: + # Handle the exception here + logger.info(f"An error occurred: {str(e)}") + raise HTTPException(status_code=500, detail=str(e)) + + +if __name__ == "__main__": + opea_microservices["opea_service@chathistory_arango"].start() diff --git a/comps/chathistory/arango/config.py b/comps/chathistory/arango/config.py new file mode 100644 index 0000000000..9e66e8f1d9 --- /dev/null +++ b/comps/chathistory/arango/config.py @@ -0,0 +1,13 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + +# ARANGO configuration +ARANGO_HOST = os.getenv("ARANGO_HOST", "localhost") +ARANGO_PORT = os.getenv("ARANGO_PORT", 8529) +ARANGO_PROTOCOL = os.getenv("ARANGO_PROTOCOL", "http") +ARANGO_USERNAME = os.getenv("ARANGO_USERNAME", "root") +ARANGO_PASSWORD = os.getenv("ARANGO_PASSWORD", "test") +DB_NAME = os.getenv("DB_NAME", "OPEA") +COLLECTION_NAME = os.getenv("COLLECTION_NAME", "ChatHistory") diff --git a/comps/chathistory/arango/docker-compose-chathistory-arango.yaml b/comps/chathistory/arango/docker-compose-chathistory-arango.yaml new file mode 100644 index 0000000000..36819c99b5 --- /dev/null +++ b/comps/chathistory/arango/docker-compose-chathistory-arango.yaml @@ -0,0 +1,38 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +version: "3" +services: + arango: + image: arangodb/arangodb:latest + container_name: arangodb + ports: + - 8529:8529 + environment: + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + no_proxy: ${no_proxy} + ARANGO_ROOT_PASSWORD: ${ARANGO_ROOT_PASSWORD} + + chathistory-arango: + image: 
opea/chathistory-arango-server:latest
to this [README](./arango/README.md) From 029c1fdfe44b85acdcb9d06a6a13020744504d69 Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Tue, 26 Nov 2024 18:09:20 -0500 Subject: [PATCH 11/35] new: tests --- tests/chathistory/test_chathistory_arango.sh | 91 ++++++++++++++ .../test_feedback_management_arango.sh | 113 ++++++++++++++++++ .../test_prompt_registry_arango.sh | 89 ++++++++++++++ 3 files changed, 293 insertions(+) create mode 100644 tests/chathistory/test_chathistory_arango.sh create mode 100644 tests/feedback_management/test_feedback_management_arango.sh create mode 100644 tests/prompt_registry/test_prompt_registry_arango.sh diff --git a/tests/chathistory/test_chathistory_arango.sh b/tests/chathistory/test_chathistory_arango.sh new file mode 100644 index 0000000000..50481262f8 --- /dev/null +++ b/tests/chathistory/test_chathistory_arango.sh @@ -0,0 +1,91 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -x + +WORKPATH=$(dirname "$PWD") +ip_address=$(hostname -I | awk '{print $1}') + +export ARANGO_HOST=${ip_address} +export ARANGO_PORT=8529 +export ARANGO_PROTOCOL=${ARANGO_PROTOCOL:-"http"} +export ARANGO_USERNAME=${ARANGO_USERNAME:-"root"} +export ARANGO_PASSWORD=${ARANGO_PASSWORD:-"test"} +export DB_NAME=${DB_NAME:-"Conversations"} +export COLLECTION_NAME=${COLLECTION_NAME:-"test"} + +function build_docker_images() { + cd $WORKPATH + echo $(pwd) + docker run -d -p 8529:8529 --name=test-comps-arango arangodb/arangodb:latest + + docker build --no-cache -t opea/chathistory-arango-server:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/chathistory/arango/Dockerfile . + if [ $? 
-ne 0 ]; then + echo "opea/chathistory-arango-server built fail" + exit 1 + else + echo "opea/chathistory-arango-server built successful" + fi +} + +function start_service() { + + docker run -d --name="test-comps-chathistory-arango-server" \ + -p 6012:6012 \ + -e http_proxy=$http_proxy \ + -e https_proxy=$https_proxy \ + -e no_proxy=$no_proxy \ + -e ARANGO_HOST=${ARANGO_HOST} \ + -e ARANGO_PORT=${ARANGO_PORT} \ + -e ARANGO_PROTOCOL=${ARANGO_PROTOCOL} \ + -e ARANGO_USERNAME=${ARANGO_USERNAME} \ + -e ARANGO_PASSWORD=${ARANGO_PASSWORD} \ + -e DB_NAME=${DB_NAME} \ + -e COLLECTION_NAME=${COLLECTION_NAME} \ + opea/chathistory-arango-server:comps + + sleep 10s +} + +function validate_microservice() { + result=$(curl -X 'POST' \ + http://${ip_address}:6012/v1/chathistory/create \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "data": { + "messages": "test Messages", "user": "test" + } +}') + echo $result + if [[ ${#result} -eq 26 ]]; then + echo "Result correct." + else + echo "Result wrong." + docker logs test-comps-chathistory-arango-server + exit 1 + fi + +} + +function stop_docker() { + cid=$(docker ps -aq --filter "name=test-comps*") + if [[ ! 
-z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi +} + +function main() { + + stop_docker + + build_docker_images + start_service + + validate_microservice + + stop_docker + echo y | docker system prune + +} + +main diff --git a/tests/feedback_management/test_feedback_management_arango.sh b/tests/feedback_management/test_feedback_management_arango.sh new file mode 100644 index 0000000000..9255550307 --- /dev/null +++ b/tests/feedback_management/test_feedback_management_arango.sh @@ -0,0 +1,113 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -xe + +WORKPATH=$(dirname "$PWD") +ip_address=$(hostname -I | awk '{print $1}') + +export ARANGO_HOST=${ip_address} +export ARANGO_PORT=8529 +export ARANGO_PROTOCOL=${ARANGO_PROTOCOL:-"http"} +export ARANGO_USERNAME=${ARANGO_USERNAME:-"root"} +export ARANGO_PASSWORD=${ARANGO_PASSWORD:-"test"} +export DB_NAME=${DB_NAME:-"Feedback"} +export COLLECTION_NAME=${COLLECTION_NAME:-"test"} + +function build_docker_images() { + cd $WORKPATH + echo $(pwd) + docker run -d -p 8529:8529 --name=test-comps-arango arangodb/arangodb:latest + + docker build --no-cache -t opea/feedbackmanagement-arango-server:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/feedback_management/arango/Dockerfile . + if [ $? 
-ne 0 ]; then + echo "opea/feedbackmanagement-arango-server built fail" + exit 1 + else + echo "opea/feedbackmanagement-arango-server built successful" + fi +} + +function start_service() { + + docker run -d --name="test-comps-feedbackmanagement-arango-server" \ + -p 6016:6016 \ + -e http_proxy=$http_proxy \ + -e https_proxy=$https_proxy \ + -e no_proxy=$no_proxy \ + -e ARANGO_HOST=${ARANGO_HOST} \ + -e ARANGO_PORT=${ARANGO_PORT} \ + -e ARANGO_PROTOCOL=${ARANGO_PROTOCOL} \ + -e ARANGO_USERNAME=${ARANGO_USERNAME} \ + -e ARANGO_PASSWORD=${ARANGO_PASSWORD} \ + -e DB_NAME=${DB_NAME} \ + -e COLLECTION_NAME=${COLLECTION_NAME} \ + opea/feedbackmanagement-arango-server:comps + + sleep 10s +} + +function validate_microservice() { + result=$(curl -X 'POST' \ + http://$ip_address:6016/v1/feedback/create \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "chat_id": "66445d4f71c7eff23d44f78d", + "chat_data": { + "user": "test", + "messages": [ + { + "role": "system", + "content": "You are helpful assistant" + }, + { + "role": "user", + "content": "hi", + "time": "1724915247" + }, + { + "role": "assistant", + "content": "Hi, may I help you?", + "time": "1724915249" + } + ] + }, + "feedback_data": { + "comment": "Moderate", + "rating": 3, + "is_thumbs_up": true + } +}') + echo $result + if [[ ${#result} -eq 26 ]]; then + echo "Correct result." + else + echo "Incorrect result." + docker logs test-comps-feedbackmanagement-arango-server + exit 1 + fi + +} + +function stop_docker() { + cid=$(docker ps -aq --filter "name=test-comps*") + if [[ ! 
-z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi +} + +function main() { + + stop_docker + + build_docker_images + start_service + + validate_microservice + + stop_docker + echo y | docker system prune + +} + +main diff --git a/tests/prompt_registry/test_prompt_registry_arango.sh b/tests/prompt_registry/test_prompt_registry_arango.sh new file mode 100644 index 0000000000..abc15ee7f2 --- /dev/null +++ b/tests/prompt_registry/test_prompt_registry_arango.sh @@ -0,0 +1,89 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -x + +WORKPATH=$(dirname "$PWD") +ip_address=$(hostname -I | awk '{print $1}') + +export ARANGO_HOST=${ip_address} +export ARANGO_PORT=8529 +export ARANGO_PROTOCOL=${ARANGO_PROTOCOL:-"http"} +export ARANGO_USERNAME=${ARANGO_USERNAME:-"root"} +export ARANGO_PASSWORD=${ARANGO_PASSWORD:-"test"} +export DB_NAME=${DB_NAME:-"Prompts"} +export COLLECTION_NAME=${COLLECTION_NAME:-"test"} + +function build_docker_images() { + cd $WORKPATH + echo $(pwd) + docker run -d -p 8529:8529 --name=test-comps-arango arangodb/arangodb:latest + + docker build --no-cache -t opea/promptregistry-arango-server:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/prompt_registry/arango/Dockerfile . + if [ $? 
-ne 0 ]; then + echo "opea/promptregistry-arango-server built fail" + exit 1 + else + echo "opea/promptregistry-arango-server built successful" + fi +} + +function start_service() { + + docker run -d --name="test-comps-promptregistry-arango-server" \ + -p 6018:6018 \ + -e http_proxy=$http_proxy \ + -e https_proxy=$https_proxy \ + -e no_proxy=$no_proxy \ + -e ARANGO_HOST=${ARANGO_HOST} \ + -e ARANGO_PORT=${ARANGO_PORT} \ + -e ARANGO_PROTOCOL=${ARANGO_PROTOCOL} \ + -e ARANGO_USERNAME=${ARANGO_USERNAME} \ + -e ARANGO_PASSWORD=${ARANGO_PASSWORD} \ + -e DB_NAME=${DB_NAME} \ + -e COLLECTION_NAME=${COLLECTION_NAME} \ + opea/promptregistry-arango-server:comps + + sleep 10s +} + +function validate_microservice() { + result=$(curl -X 'POST' \ + http://$ip_address:6018/v1/prompt/create \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "prompt_text": "test prompt", "user": "test" +}') + echo $result + if [[ ${#result} -eq 26 ]]; then + echo "Correct result." + else + echo "Incorrect result." + docker logs test-comps-promptregistry-arango-server + exit 1 + fi + +} + +function stop_docker() { + cid=$(docker ps -aq --filter "name=test-comps*") + if [[ ! 
-z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi +} + +function main() { + + stop_docker + + build_docker_images + start_service + + validate_microservice + + stop_docker + echo y | docker system prune + +} + +main From 3a8060710d3bce0dd4a55bd1582a342bbc332822 Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Tue, 26 Nov 2024 18:16:14 -0500 Subject: [PATCH 12/35] update: docker compose workflows --- .github/workflows/docker/compose/chathistory-compose.yaml | 4 ++++ .../docker/compose/feedback_management-compose.yaml | 8 ++++++-- .../workflows/docker/compose/prompt_registry-compose.yaml | 4 ++++ 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/.github/workflows/docker/compose/chathistory-compose.yaml b/.github/workflows/docker/compose/chathistory-compose.yaml index 987447feea..64dc579fca 100644 --- a/.github/workflows/docker/compose/chathistory-compose.yaml +++ b/.github/workflows/docker/compose/chathistory-compose.yaml @@ -7,3 +7,7 @@ services: build: dockerfile: comps/chathistory/mongo/Dockerfile image: ${REGISTRY:-opea}/chathistory-mongo-server:${TAG:-latest} + chathistory-arango-server: + build: + dockerfile: comps/chathistory/arango/Dockerfile + image: ${REGISTRY:-opea}/chathistory-arango-server:${TAG:-latest} diff --git a/.github/workflows/docker/compose/feedback_management-compose.yaml b/.github/workflows/docker/compose/feedback_management-compose.yaml index 0a3cfce66a..51f5ae343a 100644 --- a/.github/workflows/docker/compose/feedback_management-compose.yaml +++ b/.github/workflows/docker/compose/feedback_management-compose.yaml @@ -3,7 +3,11 @@ # this file should be run in the root of the repo services: - feedbackmanagement: + feedbackmanagement-mongo-server: build: dockerfile: comps/feedback_management/mongo/Dockerfile - image: ${REGISTRY:-opea}/feedbackmanagement:${TAG:-latest} + image: ${REGISTRY:-opea}/feedbackmanagement-mongo-server:${TAG:-latest} + feedbackmanagement-arango-server: + build: + dockerfile: 
comps/feedback_management/arango/Dockerfile + image: ${REGISTRY:-opea}/feedbackmanagement-arango-server:${TAG:-latest} \ No newline at end of file diff --git a/.github/workflows/docker/compose/prompt_registry-compose.yaml b/.github/workflows/docker/compose/prompt_registry-compose.yaml index 34d8973df5..4415a18a9d 100644 --- a/.github/workflows/docker/compose/prompt_registry-compose.yaml +++ b/.github/workflows/docker/compose/prompt_registry-compose.yaml @@ -7,3 +7,7 @@ services: build: dockerfile: comps/prompt_registry/mongo/Dockerfile image: ${REGISTRY:-opea}/promptregistry-mongo-server:${TAG:-latest} + promptregistry-arango-server: + build: + dockerfile: comps/prompt_registry/arango/Dockerfile + image: ${REGISTRY:-opea}/promptregistry-arango-server:${TAG:-latest} From fabd85c091dd02d83a662cc1aee147b75c6a78f4 Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Wed, 27 Nov 2024 13:14:37 -0500 Subject: [PATCH 13/35] fix: `arango` --- comps/chathistory/arango/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comps/chathistory/arango/Dockerfile b/comps/chathistory/arango/Dockerfile index f402e55267..0977bdc45f 100644 --- a/comps/chathistory/arango/Dockerfile +++ b/comps/chathistory/arango/Dockerfile @@ -25,6 +25,6 @@ RUN pip install --no-cache-dir --upgrade pip setuptools && \ ENV PYTHONPATH=/home/user -WORKDIR /home/user/comps/chathistory/mongo +WORKDIR /home/user/comps/chathistory/arango ENTRYPOINT ["python", "chat.py"] From 0c21ff5e88f344108f7d1fb9d7146f3e58f3c05a Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Wed, 27 Nov 2024 16:42:11 -0500 Subject: [PATCH 14/35] fix: python path --- comps/chathistory/arango/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comps/chathistory/arango/Dockerfile b/comps/chathistory/arango/Dockerfile index 0977bdc45f..fbd7d2e9c2 100644 --- a/comps/chathistory/arango/Dockerfile +++ b/comps/chathistory/arango/Dockerfile @@ -23,7 +23,7 @@ RUN pip install --no-cache-dir 
--upgrade pip setuptools && \ pip install --no-cache-dir -r /home/user/comps/chathistory/arango/requirements.txt && \ pip install --no-cache-dir -r /home/user/requirements.txt -ENV PYTHONPATH=/home/user +ENV PYTHONPATH=$PYTHONPATH:/home/user WORKDIR /home/user/comps/chathistory/arango From c5b936ba38a360975b4cc590575642a75467da6d Mon Sep 17 00:00:00 2001 From: Anthony Mahanna <43019056+aMahanna@users.noreply.github.com> Date: Mon, 9 Dec 2024 10:15:56 -0500 Subject: [PATCH 15/35] rename arango envs (#14) * initial commit: rename arango envs * fix comment --- comps/chathistory/arango/README.md | 16 ++++++---------- comps/chathistory/arango/arango_conn.py | 10 +++++----- comps/chathistory/arango/arango_store.py | 8 ++++---- comps/chathistory/arango/config.py | 10 ++++------ .../docker-compose-chathistory-arango.yaml | 8 +++----- comps/feedback_management/arango/README.md | 16 ++++++---------- comps/feedback_management/arango/arango_conn.py | 10 +++++----- comps/feedback_management/arango/arango_store.py | 8 ++++---- comps/feedback_management/arango/config.py | 10 ++++------ .../docker-compose-user-feedback-arango.yaml | 8 +++----- comps/prompt_registry/arango/README.md | 16 ++++++---------- comps/prompt_registry/arango/arango_conn.py | 10 +++++----- comps/prompt_registry/arango/arango_store.py | 9 ++++----- comps/prompt_registry/arango/config.py | 10 ++++------ .../docker-compose-prompt-registry-arango.yaml | 8 +++----- tests/chathistory/test_chathistory_arango.sh | 16 ++++++---------- .../test_feedback_management_arango.sh | 16 ++++++---------- .../test_prompt_registry_arango.sh | 16 ++++++---------- 18 files changed, 84 insertions(+), 121 deletions(-) diff --git a/comps/chathistory/arango/README.md b/comps/chathistory/arango/README.md index 428a65255e..b0379cb40f 100644 --- a/comps/chathistory/arango/README.md +++ b/comps/chathistory/arango/README.md @@ -9,13 +9,11 @@ This README provides setup guides and all the necessary information about the Ch See `config.py` 
for default values. ```bash -export ARANGO_HOST=${ARANGO_HOST} -export ARANGO_PORT=${ARANGO_PORT} -export ARANGO_PROTOCOL=${ARANGO_PROTOCOL} +export ARANGO_URL=${ARANGO_URL} export ARANGO_USERNAME=${ARANGO_USERNAME} export ARANGO_PASSWORD=${ARANGO_PASSWORD} -export DB_NAME=${DB_NAME} -export COLLECTION_NAME=${COLLECTION_NAME} +export ARANGO_DB_NAME=${ARANGO_DB_NAME} +export ARANGO_COLLECTION_NAME=${ARANGO_COLLECTION_NAME} ``` --- @@ -44,13 +42,11 @@ docker build -t opea/chathistory-arango-server:latest --build-arg https_proxy=$h -e http_proxy=$http_proxy \ -e https_proxy=$https_proxy \ -e no_proxy=$no_proxy \ - -e ARANGO_HOST=${ARANGO_HOST} \ - -e ARANGO_PORT=${ARANGO_PORT} \ - -e ARANGO_PROTOCOL=${ARANGO_PROTOCOL} \ + -e ARANGO_URL=${ARANGO_URL} \ -e ARANGO_USERNAME=${ARANGO_USERNAME} \ -e ARANGO_PASSWORD=${ARANGO_PASSWORD} \ - -e DB_NAME=${DB_NAME} \ - -e COLLECTION_NAME=${COLLECTION_NAME} \ + -e ARANGO_DB_NAME=${ARANGO_DB_NAME} \ + -e ARANGO_COLLECTION_NAME=${ARANGO_COLLECTION_NAME} \ opea/chathistory-arango-server:latest ``` diff --git a/comps/chathistory/arango/arango_conn.py b/comps/chathistory/arango/arango_conn.py index d6c4b59777..c5c271c1ee 100644 --- a/comps/chathistory/arango/arango_conn.py +++ b/comps/chathistory/arango/arango_conn.py @@ -3,11 +3,11 @@ from arango import ArangoClient as PythonArangoClient from arango.database import StandardDatabase -from config import ARANGO_HOST, ARANGO_PASSWORD, ARANGO_PORT, ARANGO_PROTOCOL, ARANGO_USERNAME, DB_NAME +from config import ARANGO_URL, ARANGO_PASSWORD, ARANGO_USERNAME, ARANGO_DB_NAME class ArangoClient: - conn_url = f"{ARANGO_PROTOCOL}://{ARANGO_HOST}:{ARANGO_PORT}/" + conn_url = ARANGO_URL @staticmethod def get_db_client() -> StandardDatabase: @@ -19,11 +19,11 @@ def get_db_client() -> StandardDatabase: sys_db = client.db("_system", username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) # Create target database if it doesn't exist - if not sys_db.has_database(DB_NAME): - 
sys_db.create_database(DB_NAME) + if not sys_db.has_database(ARANGO_DB_NAME): + sys_db.create_database(ARANGO_DB_NAME) # Now connect to the target database - db = client.db(DB_NAME, username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) + db = client.db(ARANGO_DB_NAME, username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) return db diff --git a/comps/chathistory/arango/arango_store.py b/comps/chathistory/arango/arango_store.py index 8ab6928eb7..de25bd9cae 100644 --- a/comps/chathistory/arango/arango_store.py +++ b/comps/chathistory/arango/arango_store.py @@ -1,7 +1,7 @@ from typing import Any from arango_conn import ArangoClient -from config import COLLECTION_NAME +from config import ARANGO_COLLECTION_NAME from pydantic import BaseModel @@ -16,10 +16,10 @@ def __init__( def initialize_storage(self) -> None: self.db_client = ArangoClient.get_db_client() - if not self.db_client.has_collection(COLLECTION_NAME): - self.db_client.create_collection(COLLECTION_NAME) + if not self.db_client.has_collection(ARANGO_COLLECTION_NAME): + self.db_client.create_collection(ARANGO_COLLECTION_NAME) - self.collection = self.db_client.collection(COLLECTION_NAME) + self.collection = self.db_client.collection(ARANGO_COLLECTION_NAME) def save_document(self, document: BaseModel) -> str: """Stores a new document into the storage. 
diff --git a/comps/chathistory/arango/config.py b/comps/chathistory/arango/config.py index 9e66e8f1d9..f7351fcb48 100644 --- a/comps/chathistory/arango/config.py +++ b/comps/chathistory/arango/config.py @@ -3,11 +3,9 @@ import os -# ARANGO configuration -ARANGO_HOST = os.getenv("ARANGO_HOST", "localhost") -ARANGO_PORT = os.getenv("ARANGO_PORT", 8529) -ARANGO_PROTOCOL = os.getenv("ARANGO_PROTOCOL", "http") +# ArangoDB configuration +ARANGO_URL = os.getenv("ARANGO_URL", "http://localhost:8529") ARANGO_USERNAME = os.getenv("ARANGO_USERNAME", "root") ARANGO_PASSWORD = os.getenv("ARANGO_PASSWORD", "test") -DB_NAME = os.getenv("DB_NAME", "OPEA") -COLLECTION_NAME = os.getenv("COLLECTION_NAME", "ChatHistory") +ARANGO_DB_NAME = os.getenv("ARANGO_DB_NAME", "OPEA") +ARANGO_COLLECTION_NAME = os.getenv("ARANGO_COLLECTION_NAME", "ChatHistory") diff --git a/comps/chathistory/arango/docker-compose-chathistory-arango.yaml b/comps/chathistory/arango/docker-compose-chathistory-arango.yaml index 36819c99b5..218ec1b632 100644 --- a/comps/chathistory/arango/docker-compose-chathistory-arango.yaml +++ b/comps/chathistory/arango/docker-compose-chathistory-arango.yaml @@ -24,13 +24,11 @@ services: http_proxy: ${http_proxy} https_proxy: ${https_proxy} no_proxy: ${no_proxy} - ARANGO_HOST: ${ARANGO_HOST} - ARANGO_PORT: ${ARANGO_PORT} - ARANGO_PROTOCOL: ${ARANGO_PROTOCOL} + ARANGO_URL: ${ARANGO_URL} ARANGO_USERNAME: ${ARANGO_USERNAME} ARANGO_PASSWORD: ${ARANGO_PASSWORD} - DB_NAME: ${DB_NAME} - COLLECTION_NAME: ${COLLECTION_NAME} + ARANGO_DB_NAME: ${ARANGO_DB_NAME} + ARANGO_COLLECTION_NAME: ${ARANGO_COLLECTION_NAME} restart: unless-stopped networks: diff --git a/comps/feedback_management/arango/README.md b/comps/feedback_management/arango/README.md index 7e9a5f8400..758a74430b 100644 --- a/comps/feedback_management/arango/README.md +++ b/comps/feedback_management/arango/README.md @@ -9,13 +9,11 @@ This README provides setup guides and all the necessary information about the Fe See `config.py` 
for default values. ```bash -export ARANGO_HOST=${ARANGO_HOST} -export ARANGO_PORT=${ARANGO_PORT} -export ARANGO_PROTOCOL=${ARANGO_PROTOCOL} +export ARANGO_URL=${ARANGO_URL} export ARANGO_USERNAME=${ARANGO_USERNAME} export ARANGO_PASSWORD=${ARANGO_PASSWORD} -export DB_NAME=${DB_NAME} -export COLLECTION_NAME=${COLLECTION_NAME} +export ARANGO_DB_NAME=${ARANGO_DB_NAME} +export ARANGO_COLLECTION_NAME=${ARANGO_COLLECTION_NAME} export PYTHONPATH={Path to base of directory} ``` @@ -46,13 +44,11 @@ docker build -t opea/feedbackmanagement-arango-server:latest --build-arg https_p -e http_proxy=$http_proxy \ -e https_proxy=$https_proxy \ -e no_proxy=$no_proxy \ - -e ARANGO_HOST=${ARANGO_HOST} \ - -e ARANGO_PORT=${ARANGO_PORT} \ - -e ARANGO_PROTOCOL=${ARANGO_PROTOCOL} \ + -e ARANGO_URL=${ARANGO_URL} \ -e ARANGO_USERNAME=${ARANGO_USERNAME} \ -e ARANGO_PASSWORD=${ARANGO_PASSWORD} \ - -e DB_NAME=${DB_NAME} \ - -e COLLECTION_NAME=${COLLECTION_NAME} \ + -e ARANGO_DB_NAME=${ARANGO_DB_NAME} \ + -e ARANGO_COLLECTION_NAME=${ARANGO_COLLECTION_NAME} \ opea/feedbackmanagement-arango-server:latest ``` diff --git a/comps/feedback_management/arango/arango_conn.py b/comps/feedback_management/arango/arango_conn.py index d6c4b59777..c5c271c1ee 100644 --- a/comps/feedback_management/arango/arango_conn.py +++ b/comps/feedback_management/arango/arango_conn.py @@ -3,11 +3,11 @@ from arango import ArangoClient as PythonArangoClient from arango.database import StandardDatabase -from config import ARANGO_HOST, ARANGO_PASSWORD, ARANGO_PORT, ARANGO_PROTOCOL, ARANGO_USERNAME, DB_NAME +from config import ARANGO_URL, ARANGO_PASSWORD, ARANGO_USERNAME, ARANGO_DB_NAME class ArangoClient: - conn_url = f"{ARANGO_PROTOCOL}://{ARANGO_HOST}:{ARANGO_PORT}/" + conn_url = ARANGO_URL @staticmethod def get_db_client() -> StandardDatabase: @@ -19,11 +19,11 @@ def get_db_client() -> StandardDatabase: sys_db = client.db("_system", username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) # Create target database if 
it doesn't exist - if not sys_db.has_database(DB_NAME): - sys_db.create_database(DB_NAME) + if not sys_db.has_database(ARANGO_DB_NAME): + sys_db.create_database(ARANGO_DB_NAME) # Now connect to the target database - db = client.db(DB_NAME, username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) + db = client.db(ARANGO_DB_NAME, username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) return db diff --git a/comps/feedback_management/arango/arango_store.py b/comps/feedback_management/arango/arango_store.py index cd22b80784..a20a6147d0 100644 --- a/comps/feedback_management/arango/arango_store.py +++ b/comps/feedback_management/arango/arango_store.py @@ -2,7 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 from arango_conn import ArangoClient -from config import COLLECTION_NAME +from config import ARANGO_COLLECTION_NAME from pydantic import BaseModel @@ -17,10 +17,10 @@ def __init__( def initialize_storage(self) -> None: self.db_client = ArangoClient.get_db_client() - if not self.db_client.has_collection(COLLECTION_NAME): - self.db_client.create_collection(COLLECTION_NAME) + if not self.db_client.has_collection(ARANGO_COLLECTION_NAME): + self.db_client.create_collection(ARANGO_COLLECTION_NAME) - self.collection = self.db_client.collection(COLLECTION_NAME) + self.collection = self.db_client.collection(ARANGO_COLLECTION_NAME) def save_feedback(self, feedback_data: BaseModel) -> str: """Stores a new feedback data into the storage. 
diff --git a/comps/feedback_management/arango/config.py b/comps/feedback_management/arango/config.py index bb790eb38a..36826b1e99 100644 --- a/comps/feedback_management/arango/config.py +++ b/comps/feedback_management/arango/config.py @@ -3,11 +3,9 @@ import os -# ARANGO configuration -ARANGO_HOST = os.getenv("ARANGO_HOST", "localhost") -ARANGO_PORT = os.getenv("ARANGO_PORT", 8529) -ARANGO_PROTOCOL = os.getenv("ARANGO_PROTOCOL", "http") +# ArangoDB configuration +ARANGO_URL = os.getenv("ARANGO_URL", "http://localhost:8529") ARANGO_USERNAME = os.getenv("ARANGO_USERNAME", "root") ARANGO_PASSWORD = os.getenv("ARANGO_PASSWORD", "test") -DB_NAME = os.getenv("DB_NAME", "OPEA") -COLLECTION_NAME = os.getenv("COLLECTION_NAME", "Feedback") +ARANGO_DB_NAME = os.getenv("ARANGO_DB_NAME", "OPEA") +ARANGO_COLLECTION_NAME = os.getenv("ARANGO_COLLECTION_NAME", "Feedback") diff --git a/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml b/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml index 8f9b3a85a8..f01c5d03f4 100644 --- a/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml +++ b/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml @@ -24,13 +24,11 @@ services: http_proxy: ${http_proxy} https_proxy: ${https_proxy} no_proxy: ${no_proxy} - ARANGO_HOST: ${ARANGO_HOST} - ARANGO_PORT: ${ARANGO_PORT} - ARANGO_PROTOCOL: ${ARANGO_PROTOCOL} + ARANGO_URL: ${ARANGO_URL} ARANGO_USERNAME: ${ARANGO_USERNAME} ARANGO_PASSWORD: ${ARANGO_PASSWORD} - DB_NAME: ${DB_NAME} - COLLECTION_NAME: ${COLLECTION_NAME} + ARANGO_DB_NAME: ${ARANGO_DB_NAME} + ARANGO_COLLECTION_NAME: ${ARANGO_COLLECTION_NAME} restart: unless-stopped networks: diff --git a/comps/prompt_registry/arango/README.md b/comps/prompt_registry/arango/README.md index e4bdd6c101..d746e9ea78 100644 --- a/comps/prompt_registry/arango/README.md +++ b/comps/prompt_registry/arango/README.md @@ -9,13 +9,11 @@ This README provides setup guides and all the 
necessary information about the Pr See `config.py` for default values. ```bash -export ARANGO_HOST=${ARANGO_HOST} -export ARANGO_PORT=${ARANGO_PORT} -export ARANGO_PROTOCOL=${ARANGO_PROTOCOL} +export ARANGO_URL=${ARANGO_URL} export ARANGO_USERNAME=${ARANGO_USERNAME} export ARANGO_PASSWORD=${ARANGO_PASSWORD} -export DB_NAME=${DB_NAME} -export COLLECTION_NAME=${COLLECTION_NAME} +export ARANGO_DB_NAME=${ARANGO_DB_NAME} +export ARANGO_COLLECTION_NAME=${ARANGO_COLLECTION_NAME} ``` --- @@ -46,13 +44,11 @@ docker build -t opea/promptregistry-arango-server:latest --build-arg https_proxy -e http_proxy=$http_proxy \ -e https_proxy=$https_proxy \ -e no_proxy=$no_proxy \ - -e ARANGO_HOST=${ARANGO_HOST} \ - -e ARANGO_PORT=${ARANGO_PORT} \ - -e ARANGO_PROTOCOL=${ARANGO_PROTOCOL} \ + -e ARANGO_URL=${ARANGO_URL} \ -e ARANGO_USERNAME=${ARANGO_USERNAME} \ -e ARANGO_PASSWORD=${ARANGO_PASSWORD} \ - -e DB_NAME=${DB_NAME} \ - -e COLLECTION_NAME=${COLLECTION_NAME} \ + -e ARANGO_DB_NAME=${ARANGO_DB_NAME} \ + -e ARANGO_COLLECTION_NAME=${ARANGO_COLLECTION_NAME} \ opea/promptregistry-arango-server:latest ``` diff --git a/comps/prompt_registry/arango/arango_conn.py b/comps/prompt_registry/arango/arango_conn.py index d6c4b59777..c5c271c1ee 100644 --- a/comps/prompt_registry/arango/arango_conn.py +++ b/comps/prompt_registry/arango/arango_conn.py @@ -3,11 +3,11 @@ from arango import ArangoClient as PythonArangoClient from arango.database import StandardDatabase -from config import ARANGO_HOST, ARANGO_PASSWORD, ARANGO_PORT, ARANGO_PROTOCOL, ARANGO_USERNAME, DB_NAME +from config import ARANGO_URL, ARANGO_PASSWORD, ARANGO_USERNAME, ARANGO_DB_NAME class ArangoClient: - conn_url = f"{ARANGO_PROTOCOL}://{ARANGO_HOST}:{ARANGO_PORT}/" + conn_url = ARANGO_URL @staticmethod def get_db_client() -> StandardDatabase: @@ -19,11 +19,11 @@ def get_db_client() -> StandardDatabase: sys_db = client.db("_system", username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) # Create target database if it doesn't 
exist - if not sys_db.has_database(DB_NAME): - sys_db.create_database(DB_NAME) + if not sys_db.has_database(ARANGO_DB_NAME): + sys_db.create_database(ARANGO_DB_NAME) # Now connect to the target database - db = client.db(DB_NAME, username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) + db = client.db(ARANGO_DB_NAME, username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) return db diff --git a/comps/prompt_registry/arango/arango_store.py b/comps/prompt_registry/arango/arango_store.py index fb80ccd20c..17f73532aa 100644 --- a/comps/prompt_registry/arango/arango_store.py +++ b/comps/prompt_registry/arango/arango_store.py @@ -5,9 +5,8 @@ from arango.exceptions import IndexGetError from arango_conn import ArangoClient -from config import COLLECTION_NAME +from config import ARANGO_COLLECTION_NAME from prompt import PromptCreate -from pydantic import BaseModel from comps import CustomLogger @@ -27,10 +26,10 @@ def __init__( def initialize_storage(self) -> None: self.db_client = ArangoClient.get_db_client() - if not self.db_client.has_collection(COLLECTION_NAME): - self.db_client.create_collection(COLLECTION_NAME) + if not self.db_client.has_collection(ARANGO_COLLECTION_NAME): + self.db_client.create_collection(ARANGO_COLLECTION_NAME) - self.collection = self.db_client.collection(COLLECTION_NAME) + self.collection = self.db_client.collection(ARANGO_COLLECTION_NAME) def save_prompt(self, prompt: PromptCreate): """Stores a new prompt into the storage. 
diff --git a/comps/prompt_registry/arango/config.py b/comps/prompt_registry/arango/config.py index e597df0fbe..cf048dee95 100644 --- a/comps/prompt_registry/arango/config.py +++ b/comps/prompt_registry/arango/config.py @@ -3,11 +3,9 @@ import os -# ARANGO configuration -ARANGO_HOST = os.getenv("ARANGO_HOST", "localhost") -ARANGO_PORT = os.getenv("ARANGO_PORT", 8529) -ARANGO_PROTOCOL = os.getenv("ARANGO_PROTOCOL", "http") +# ArangoDB configuration +ARANGO_URL = os.getenv("ARANGO_URL", "http://localhost:8529") ARANGO_USERNAME = os.getenv("ARANGO_USERNAME", "root") ARANGO_PASSWORD = os.getenv("ARANGO_PASSWORD", "test") -DB_NAME = os.getenv("DB_NAME", "OPEA") -COLLECTION_NAME = os.getenv("COLLECTION_NAME", "Prompt") +ARANGO_DB_NAME = os.getenv("ARANGO_DB_NAME", "OPEA") +ARANGO_COLLECTION_NAME = os.getenv("ARANGO_COLLECTION_NAME", "Prompt") diff --git a/comps/prompt_registry/arango/docker-compose-prompt-registry-arango.yaml b/comps/prompt_registry/arango/docker-compose-prompt-registry-arango.yaml index b1aee077d9..335be9411e 100644 --- a/comps/prompt_registry/arango/docker-compose-prompt-registry-arango.yaml +++ b/comps/prompt_registry/arango/docker-compose-prompt-registry-arango.yaml @@ -24,13 +24,11 @@ services: http_proxy: ${http_proxy} https_proxy: ${https_proxy} no_proxy: ${no_proxy} - ARANGO_HOST: ${ARANGO_HOST} - ARANGO_PORT: ${ARANGO_PORT} - ARANGO_PROTOCOL: ${ARANGO_PROTOCOL} + ARANGO_URL: ${ARANGO_URL} ARANGO_USERNAME: ${ARANGO_USERNAME} ARANGO_PASSWORD: ${ARANGO_PASSWORD} - DB_NAME: ${DB_NAME} - COLLECTION_NAME: ${COLLECTION_NAME} + ARANGO_DB_NAME: ${DB_NAME} + ARANGO_COLLECTION_NAME: ${COLLECTION_NAME} restart: unless-stopped networks: diff --git a/tests/chathistory/test_chathistory_arango.sh b/tests/chathistory/test_chathistory_arango.sh index 50481262f8..f9d731802b 100644 --- a/tests/chathistory/test_chathistory_arango.sh +++ b/tests/chathistory/test_chathistory_arango.sh @@ -7,13 +7,11 @@ set -x WORKPATH=$(dirname "$PWD") ip_address=$(hostname -I | awk 
'{print $1}') -export ARANGO_HOST=${ip_address} -export ARANGO_PORT=8529 -export ARANGO_PROTOCOL=${ARANGO_PROTOCOL:-"http"} +export ARANGO_URL=${ARANGO_URL:-"http://${ip_address}:8529"} export ARANGO_USERNAME=${ARANGO_USERNAME:-"root"} export ARANGO_PASSWORD=${ARANGO_PASSWORD:-"test"} -export DB_NAME=${DB_NAME:-"Conversations"} -export COLLECTION_NAME=${COLLECTION_NAME:-"test"} +export ARANGO_DB_NAME=${ARANGO_DB_NAME:-"Conversations"} +export ARANGO_COLLECTION_NAME=${ARANGO_COLLECTION_NAME:-"test"} function build_docker_images() { cd $WORKPATH @@ -36,13 +34,11 @@ function start_service() { -e http_proxy=$http_proxy \ -e https_proxy=$https_proxy \ -e no_proxy=$no_proxy \ - -e ARANGO_HOST=${ARANGO_HOST} \ - -e ARANGO_PORT=${ARANGO_PORT} \ - -e ARANGO_PROTOCOL=${ARANGO_PROTOCOL} \ + -e ARANGO_URL=${ARANGO_URL} \ -e ARANGO_USERNAME=${ARANGO_USERNAME} \ -e ARANGO_PASSWORD=${ARANGO_PASSWORD} \ - -e DB_NAME=${DB_NAME} \ - -e COLLECTION_NAME=${COLLECTION_NAME} \ + -e ARANGO_DB_NAME=${ARANGO_DB_NAME} \ + -e ARANGO_COLLECTION_NAME=${ARANGO_COLLECTION_NAME} \ opea/chathistory-arango-server:comps sleep 10s diff --git a/tests/feedback_management/test_feedback_management_arango.sh b/tests/feedback_management/test_feedback_management_arango.sh index 9255550307..6bbd32598a 100644 --- a/tests/feedback_management/test_feedback_management_arango.sh +++ b/tests/feedback_management/test_feedback_management_arango.sh @@ -7,13 +7,11 @@ set -xe WORKPATH=$(dirname "$PWD") ip_address=$(hostname -I | awk '{print $1}') -export ARANGO_HOST=${ip_address} -export ARANGO_PORT=8529 -export ARANGO_PROTOCOL=${ARANGO_PROTOCOL:-"http"} +export ARANGO_URL=${ARANGO_URL:-"http://${ip_address}:8529"} export ARANGO_USERNAME=${ARANGO_USERNAME:-"root"} export ARANGO_PASSWORD=${ARANGO_PASSWORD:-"test"} -export DB_NAME=${DB_NAME:-"Feedback"} -export COLLECTION_NAME=${COLLECTION_NAME:-"test"} +export ARANGO_DB_NAME=${ARANGO_DB_NAME:-"Feedback"} +export ARANGO_COLLECTION_NAME=${ARANGO_COLLECTION_NAME:-"test"} 
function build_docker_images() { cd $WORKPATH @@ -36,13 +34,11 @@ function start_service() { -e http_proxy=$http_proxy \ -e https_proxy=$https_proxy \ -e no_proxy=$no_proxy \ - -e ARANGO_HOST=${ARANGO_HOST} \ - -e ARANGO_PORT=${ARANGO_PORT} \ - -e ARANGO_PROTOCOL=${ARANGO_PROTOCOL} \ + -e ARANGO_URL=${ARANGO_URL} \ -e ARANGO_USERNAME=${ARANGO_USERNAME} \ -e ARANGO_PASSWORD=${ARANGO_PASSWORD} \ - -e DB_NAME=${DB_NAME} \ - -e COLLECTION_NAME=${COLLECTION_NAME} \ + -e ARANGO_DB_NAME=${ARANGO_DB_NAME} \ + -e ARANGO_COLLECTION_NAME=${ARANGO_COLLECTION_NAME} \ opea/feedbackmanagement-arango-server:comps sleep 10s diff --git a/tests/prompt_registry/test_prompt_registry_arango.sh b/tests/prompt_registry/test_prompt_registry_arango.sh index abc15ee7f2..16d81b17d5 100644 --- a/tests/prompt_registry/test_prompt_registry_arango.sh +++ b/tests/prompt_registry/test_prompt_registry_arango.sh @@ -7,13 +7,11 @@ set -x WORKPATH=$(dirname "$PWD") ip_address=$(hostname -I | awk '{print $1}') -export ARANGO_HOST=${ip_address} -export ARANGO_PORT=8529 -export ARANGO_PROTOCOL=${ARANGO_PROTOCOL:-"http"} +export ARANGO_URL=${ARANGO_URL:-"http://${ip_address}:8529"} export ARANGO_USERNAME=${ARANGO_USERNAME:-"root"} export ARANGO_PASSWORD=${ARANGO_PASSWORD:-"test"} -export DB_NAME=${DB_NAME:-"Prompts"} -export COLLECTION_NAME=${COLLECTION_NAME:-"test"} +export ARANGO_DB_NAME=${ARANGO_DB_NAME:-"Prompts"} +export ARANGO_COLLECTION_NAME=${ARANGO_COLLECTION_NAME:-"test"} function build_docker_images() { cd $WORKPATH @@ -36,13 +34,11 @@ function start_service() { -e http_proxy=$http_proxy \ -e https_proxy=$https_proxy \ -e no_proxy=$no_proxy \ - -e ARANGO_HOST=${ARANGO_HOST} \ - -e ARANGO_PORT=${ARANGO_PORT} \ - -e ARANGO_PROTOCOL=${ARANGO_PROTOCOL} \ + -e ARANGO_URL=${ARANGO_URL} \ -e ARANGO_USERNAME=${ARANGO_USERNAME} \ -e ARANGO_PASSWORD=${ARANGO_PASSWORD} \ - -e DB_NAME=${DB_NAME} \ - -e COLLECTION_NAME=${COLLECTION_NAME} \ + -e ARANGO_DB_NAME=${ARANGO_DB_NAME} \ + -e 
ARANGO_COLLECTION_NAME=${ARANGO_COLLECTION_NAME} \ opea/promptregistry-arango-server:comps sleep 10s From 23ac66a3f97aad494d7b80164ae9aff133629a99 Mon Sep 17 00:00:00 2001 From: Anthony Mahanna <43019056+aMahanna@users.noreply.github.com> Date: Mon, 30 Dec 2024 13:16:58 -0500 Subject: [PATCH 16/35] ArangoDB: Dataprep (#12) * initial commit * fix: env * Update README.md * Revert "Update README.md" This reverts commit 8f750e4472e33d9c11bdc39606ea6b6e33fef892. * fix: create database * cleanup * new: chunk embedding generation * new: `cithash` dep * cleanup: `ingest_data_to_arango` * new: envs in `config` * fix: more envs * more env cleanup * fix: deprecated line * fix: graph doc * update dataprep-compose * Dockerfile update and parametrized prepare_doc_arango.py (#15) * Initial readme and prepare doc arango, with embeddings by Anthony * Adding git to Dockerfile, tested dockerfile and dockercompose. Also parametrized variables in prepare_doc_arango.py * Updating readme with adjustable parameters listed * Only printing debug statements if log flag is on * add review * review pt 2 --------- Co-authored-by: Anthony Mahanna * update dataprep readme --------- Co-authored-by: Ajay Kallepalli <72517322+ajaykallepalli@users.noreply.github.com> --- .../docker/compose/dataprep-compose.yaml | 4 + comps/dataprep/README.md | 4 + comps/dataprep/arango/__init__.py | 2 + comps/dataprep/arango/langchain/Dockerfile | 39 ++ comps/dataprep/arango/langchain/README.md | 149 ++++++++ comps/dataprep/arango/langchain/__init__.py | 2 + comps/dataprep/arango/langchain/config.py | 42 +++ .../docker-compose-dataprep-arango.yaml | 53 +++ .../arango/langchain/prepare_doc_arango.py | 342 ++++++++++++++++++ .../arango/langchain/requirements.txt | 32 ++ 10 files changed, 669 insertions(+) create mode 100644 comps/dataprep/arango/__init__.py create mode 100644 comps/dataprep/arango/langchain/Dockerfile create mode 100644 comps/dataprep/arango/langchain/README.md create mode 100644 
comps/dataprep/arango/langchain/__init__.py create mode 100644 comps/dataprep/arango/langchain/config.py create mode 100644 comps/dataprep/arango/langchain/docker-compose-dataprep-arango.yaml create mode 100644 comps/dataprep/arango/langchain/prepare_doc_arango.py create mode 100644 comps/dataprep/arango/langchain/requirements.txt diff --git a/.github/workflows/docker/compose/dataprep-compose.yaml b/.github/workflows/docker/compose/dataprep-compose.yaml index 7908e8c260..6053fd3d0e 100644 --- a/.github/workflows/docker/compose/dataprep-compose.yaml +++ b/.github/workflows/docker/compose/dataprep-compose.yaml @@ -63,3 +63,7 @@ services: build: dockerfile: comps/dataprep/multimedia2text/audio2text/Dockerfile image: ${REGISTRY:-opea}/dataprep-audio2text:${TAG:-latest} + dataprep-arango: + build: + dockerfile: comps/dataprep/arango/langchain/Dockerfile + image: ${REGISTRY:-opea}/dataprep-arango:${TAG:-latest} \ No newline at end of file diff --git a/comps/dataprep/README.md b/comps/dataprep/README.md index 46a57d37da..02b78ef99b 100644 --- a/comps/dataprep/README.md +++ b/comps/dataprep/README.md @@ -44,3 +44,7 @@ For details, please refer to this [readme](vdms/README.md) ## Dataprep Microservice with Multimodal For details, please refer to this [readme](multimodal/redis/langchain/README.md) + +## Dataprep Microservice with ArangoDB + +For details, please refer to this [readme](arango/langchain/README.md) diff --git a/comps/dataprep/arango/__init__.py b/comps/dataprep/arango/__init__.py new file mode 100644 index 0000000000..916f3a44b2 --- /dev/null +++ b/comps/dataprep/arango/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 diff --git a/comps/dataprep/arango/langchain/Dockerfile b/comps/dataprep/arango/langchain/Dockerfile new file mode 100644 index 0000000000..5d8aa7a488 --- /dev/null +++ b/comps/dataprep/arango/langchain/Dockerfile @@ -0,0 +1,39 @@ +# Copyright (C) 2024 Intel Corporation +# 
SPDX-License-Identifier: Apache-2.0 + +FROM python:3.11-slim + +ENV LANG=C.UTF-8 + +ARG ARCH="cpu" + +RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ + build-essential \ + default-jre \ + libgl1-mesa-glx \ + libjemalloc-dev \ + git + +RUN useradd -m -s /bin/bash user && \ + mkdir -p /home/user && \ + chown -R user /home/user/ + +USER user + +COPY comps /home/user/comps + +RUN pip install --no-cache-dir --upgrade pip setuptools && \ + if [ ${ARCH} = "cpu" ]; then pip install --no-cache-dir torch torchvision --index-url https://download.pytorch.org/whl/cpu; fi && \ + pip install --no-cache-dir -r /home/user/comps/dataprep/arango/langchain/requirements.txt + +ENV PYTHONPATH=$PYTHONPATH:/home/user + +USER root + +RUN mkdir -p /home/user/comps/dataprep/arango/langchain/uploaded_files && chown -R user /home/user/comps/dataprep/arango/langchain/uploaded_files + +USER user + +WORKDIR /home/user/comps/dataprep/arango/langchain + +ENTRYPOINT ["python", "prepare_doc_arango.py"] \ No newline at end of file diff --git a/comps/dataprep/arango/langchain/README.md b/comps/dataprep/arango/langchain/README.md new file mode 100644 index 0000000000..37f0a078b0 --- /dev/null +++ b/comps/dataprep/arango/langchain/README.md @@ -0,0 +1,149 @@ +# Dataprep Microservice with ArangoDB + +## 🚀Start Microservice with Python + +### Install Requirements + +```bash +pip install -r requirements.txt +apt-get install libtesseract-dev -y +apt-get install poppler-utils -y +``` + +### Start ArangoDB Server + +To launch ArangoDB locally, first ensure you have docker installed. Then, you can launch the database with the following docker command. 
+ +```bash +docker run -d --name arangodb -p 8529:8529 -e ARANGO_ROOT_PASSWORD=password arangodb/arangodb:latest +``` + +### Setup Environment Variables + +```bash +export no_proxy=${your_no_proxy} +export http_proxy=${your_http_proxy} +export https_proxy=${your_http_proxy} +export ARANGO_URL=${your_arango_url} +export ARANGO_USERNAME=${your_arango_username} +export ARANGO_PASSWORD=${your_arango_password} +export ARANGO_DB_NAME=${your_db_name} +export PYTHONPATH=${path_to_comps} +``` + +### Start Document Preparation Microservice for ArangoDB with Python Script + +Start document preparation microservice for ArangoDB with below command. + +```bash +python prepare_doc_arango.py +``` + +## 🚀Start Microservice with Docker + +### Build Docker Image + +```bash +cd ../../../../ +docker build -t opea/dataprep-arango:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/arango/langchain/Dockerfile . +``` + +### Run Docker with CLI + +```bash +docker run -d --name="dataprep-arango-server" -p 6007:6007 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/dataprep-arango:latest +``` + +### Run Docker with Docker Compose + +```bash +cd comps/dataprep/arango/langchain +docker compose -f docker-compose-dataprep-arango.yaml up -d +``` + +## Invoke Microservice + +Once document preparation microservice for ArangoDB is started, user can use below command to invoke the microservice to convert the document to embedding and save to the database. + +After the service is complete a Graph is created in ArangoDB. The default graph name is `Graph`, you can specify the graph name by `-F "graph_name=${your_graph_name}"` in the curl command. + +By default, the microservice will create embeddings for the documents if embedding environment variables are specified. You can specify `-F "create_embeddings=false"` to skip the embedding creation. 
+ +```bash +curl -X POST \ + -H "Content-Type: multipart/form-data" \ + -F "files=@./file1.txt" \ + -F "graph_name=${your_graph_name}" \ + http://localhost:6007/v1/dataprep +``` + +You can specify chunk_size and chunk_size by the following commands. + +```bash +curl -X POST \ + -H "Content-Type: multipart/form-data" \ + -F "files=@./file1.txt" \ + -F "chunk_size=1500" \ + -F "chunk_overlap=100" \ + -F "graph_name=${your_graph_name}" \ + http://localhost:6007/v1/dataprep +``` + +We support table extraction from pdf documents. You can specify process_table and table_strategy by the following commands. "table_strategy" refers to the strategies to understand tables for table retrieval. As the setting progresses from "fast" to "hq" to "llm," the focus shifts towards deeper table understanding at the expense of processing speed. The default strategy is "fast". + +Note: If you specify "table_strategy=llm", You should first start TGI Service, please refer to 1.2.1, 1.3.1 in https://github.com/opea-project/GenAIComps/tree/main/comps/llms/README.md, and then `export TGI_LLM_ENDPOINT="http://${your_ip}:8008"`. + +For ensure the quality and comprehensiveness of the extracted entities, we recommend to use `gpt-4o` as the default model for parsing the document. To enable the openai service, please `export OPENAI_API_KEY=xxxx` before using this services. + +```bash +curl -X POST \ + -H "Content-Type: multipart/form-data" \ + -F "files=@./your_file.pdf" \ + -F "process_table=true" \ + -F "table_strategy=hq" \ + -F "graph_name=${your_graph_name}" \ + http://localhost:6007/v1/dataprep +``` + +--- + +Additional options that can be specified from the environment variables are as follows (default values are in the config.py file): + +ArangoDB Configuration: +- `ARANGO_URL`: The URL for the ArangoDB service. +- `ARANGO_USERNAME`: The username for the ArangoDB service. +- `ARANGO_PASSWORD`: The password for the ArangoDB service. 
+- `ARANGO_DB_NAME`: The name of the database to use for the ArangoDB service. +- `USE_ONE_ENTITY_COLLECTION`: If set to True, the microservice will use a single entity collection for all nodes. If set to False, the microservice will use a separate collection by node type. Defaults to `True`. +- `INSERT_ASYNC`: If set to True, the microservice will insert the data into ArangoDB asynchronously. Defaults to `False`. +- `ARANGO_BATCH_SIZE`: The batch size for the microservice to insert the data. Defaults to `500`. + +Text Generation Inference Configuration +- `TGI_LLM_ENDPOINT`: The endpoint for the TGI service. +- `TGI_LLM_MAX_NEW_TOKENS`: The maximum number of new tokens to generate. Defaults to `512`. +- `TGI_LLM_TOP_K`: The number of highest probability vocabulary tokens to keep for top-k-filtering. Defaults to `40`. +- `TGI_LLM_TOP_P`: If set to < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation. Defaults to `0.9`. +- `TGI_LLM_TEMPERATURE`: The temperature for the sampling. Defaults to `0.8`. +- `TGI_LLM_TIMEOUT`: The timeout for the TGI service. Defaults to `600`. + +Text Embeddings Inferencing Configuration +**Note**: This is optional functionality to generate embeddings for text chunks. +- `TEI_EMBEDDING_ENDPOINT`: The endpoint for the TEI service. +- `HUGGINGFACEHUB_API_TOKEN`: The API token for the Hugging Face Hub. +- `TEI_EMBED_MODEL`: The model to use for the TEI service. Defaults to `BAAI/bge-base-en-v1.5`. + +OpenAI Configuration: +**Note**: This configuration can replace the TGI and TEI services for text generation and embeddings. +- `OPENAI_API_KEY`: The API key for the OpenAI service. +- `OPENAI_EMBED_MODEL`: The embedding model to use for the OpenAI service. Defaults to `text-embedding-3-small`. +- `OPENAI_EMBED_DIMENSIONS`: The embedding dimension for the OpenAI service. Defaults to `512`. +- `OPENAI_CHAT_MODEL`: The chat model to use for the OpenAI service. 
Defaults to `gpt-4o`. +- `OPENAI_CHAT_TEMPERATURE`: The temperature for the OpenAI service. Defaults to `0`. + + +[LangChain LLMGraphTransformer](https://api.python.langchain.com/en/latest/graph_transformers/langchain_experimental.graph_transformers.llm.LLMGraphTransformer.html) Configuration: +- `SYSTEM_PROMPT_PATH`: The path to the system prompt text file. This can be used to specify the specific system prompt for the entity extraction and graph generation steps. +- `ALLOWED_NODES`: Specifies which node types are allowed in the graph. Defaults to an empty list, allowing all node types. +- `ALLOWED_RELATIONSHIPS`: Specifies which relationship types are allowed in the graph. Defaults to an empty list, allowing all relationship types. +- `NODE_PROPERTIES`: If True, the LLM can extract any node properties from text. Alternatively, a list of valid properties can be provided for the LLM to extract, restricting extraction to those specified. Defaults to `["description"]`. +- `RELATIONSHIP_PROPERTIES`: If True, the LLM can extract any relationship properties from text. Alternatively, a list of valid properties can be provided for the LLM to extract, restricting extraction to those specified. Defaults to `["description"]`. 
diff --git a/comps/dataprep/arango/langchain/__init__.py b/comps/dataprep/arango/langchain/__init__.py new file mode 100644 index 0000000000..916f3a44b2 --- /dev/null +++ b/comps/dataprep/arango/langchain/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 diff --git a/comps/dataprep/arango/langchain/config.py b/comps/dataprep/arango/langchain/config.py new file mode 100644 index 0000000000..1f2312e590 --- /dev/null +++ b/comps/dataprep/arango/langchain/config.py @@ -0,0 +1,42 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + +# ArangoDB configuration +ARANGO_URL = os.getenv("ARANGO_URL", "http://localhost:8529") +ARANGO_USERNAME = os.getenv("ARANGO_USERNAME", "root") +ARANGO_PASSWORD = os.getenv("ARANGO_PASSWORD", "test") +ARANGO_DB_NAME = os.getenv("ARANGO_DB_NAME", "_system") + +# ArangoDB graph configuration +USE_ONE_ENTITY_COLLECTION = os.getenv("USE_ONE_ENTITY_COLLECTION", True) +INSERT_ASYNC = os.getenv("INSERT_ASYNC", False) +ARANGO_BATCH_SIZE = os.getenv("ARANGO_BATCH_SIZE", 500) + +# Text Generation Inference configuration +TGI_LLM_ENDPOINT = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080") +TGI_LLM_MAX_NEW_TOKENS = os.getenv("TGI_LLM_MAX_NEW_TOKENS", 512) +TGI_LLM_TOP_K = os.getenv("TGI_LLM_TOP_K", 40) +TGI_LLM_TOP_P = os.getenv("TGI_LLM_TOP_P", 0.9) +TGI_LLM_TEMPERATURE = os.getenv("TGI_LLM_TEMPERATURE", 0.8) +TGI_LLM_TIMEOUT = os.getenv("TGI_LLM_TIMEOUT", 600) + +# Text Embeddings Inference configuration +TEI_EMBEDDING_ENDPOINT = os.getenv("TEI_EMBEDDING_ENDPOINT") +HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN") +TEI_EMBED_MODEL = os.getenv("TEI_EMBED_MODEL", "BAAI/bge-base-en-v1.5") + +# OpenAI configuration (alternative to TGI & TEI) +OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") +OPENAI_EMBED_MODEL = os.getenv("OPENAI_EMBED_MODEL", "text-embedding-3-small") +OPENAI_EMBED_DIMENSIONS = os.getenv("OPENAI_EMBED_DIMENSIONS", 512) 
+OPENAI_CHAT_MODEL = os.getenv("OPENAI_CHAT_MODEL", "gpt-4o") +OPENAI_CHAT_TEMPERATURE = os.getenv("OPENAI_CHAT_TEMPERATURE", 0) + +# LLMGraphTransformer configuration +SYSTEM_PROMPT_PATH = os.getenv("SYSTEM_PROMPT_PATH") +ALLOWED_NODES = os.getenv("ALLOWED_NODES", []) +ALLOWED_RELATIONSHIPS = os.getenv("ALLOWED_RELATIONSHIPS", []) +NODE_PROPERTIES = os.getenv("NODE_PROPERTIES", ["description"]) +RELATIONSHIP_PROPERTIES = os.getenv("RELATIONSHIP_PROPERTIES", ["description"]) diff --git a/comps/dataprep/arango/langchain/docker-compose-dataprep-arango.yaml b/comps/dataprep/arango/langchain/docker-compose-dataprep-arango.yaml new file mode 100644 index 0000000000..d3a9882c63 --- /dev/null +++ b/comps/dataprep/arango/langchain/docker-compose-dataprep-arango.yaml @@ -0,0 +1,53 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +version: "3" +services: + arango-vector-db: + image: arangodb/arangodb:latest + container_name: arango-graph-db + ports: + - "8529:8529" + environment: + ARANGO_ROOT_PASSWORD: ${ARANGO_PASSWORD} + tgi_gaudi_service: + image: ghcr.io/huggingface/tgi-gaudi:2.0.5 + container_name: tgi-service + ports: + - "8088:80" + volumes: + - "./data:/data" + shm_size: 1g + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + HF_TOKEN: ${HF_TOKEN} + command: --model-id ${LLM_MODEL_ID} --auto-truncate --max-input-tokens 1024 --max-total-tokens 2048 + dataprep-arango: + image: opea/dataprep-arango:latest + container_name: dataprep-arango-server + depends_on: + - arango-vector-db + - tgi_gaudi_service + ports: + - "6007:6007" + ipc: host + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + ARANGO_URL: ${ARANGO_URL} + ARANGO_USERNAME: ${ARANGO_USERNAME} + ARANGO_PASSWORD: ${ARANGO_PASSWORD} + ARANGO_DB_NAME: ${ARANGO_DB_NAME} + TGI_LLM_ENDPOINT: ${TGI_LLM_ENDPOINT} + TEI_EMBEDDING_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT} + HUGGINGFACEHUB_API_TOKEN: 
${HUGGINGFACEHUB_API_TOKEN} + TEI_EMBED_MODEL: ${TEI_EMBED_MODEL} + OPENAI_API_KEY: ${OPENAI_API_KEY} + restart: unless-stopped + +networks: + default: + driver: bridge diff --git a/comps/dataprep/arango/langchain/prepare_doc_arango.py b/comps/dataprep/arango/langchain/prepare_doc_arango.py new file mode 100644 index 0000000000..d2d467cff8 --- /dev/null +++ b/comps/dataprep/arango/langchain/prepare_doc_arango.py @@ -0,0 +1,342 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import json +import os +from typing import List, Optional, Union + +import openai +from arango import ArangoClient +from config import ( + ALLOWED_NODES, + ALLOWED_RELATIONSHIPS, + ARANGO_BATCH_SIZE, + ARANGO_DB_NAME, + ARANGO_PASSWORD, + ARANGO_URL, + ARANGO_USERNAME, + HUGGINGFACEHUB_API_TOKEN, + INSERT_ASYNC, + NODE_PROPERTIES, + OPENAI_API_KEY, + OPENAI_EMBED_DIMENSIONS, + OPENAI_EMBED_MODEL, + RELATIONSHIP_PROPERTIES, + SYSTEM_PROMPT_PATH, + TEI_EMBED_MODEL, + TEI_EMBEDDING_ENDPOINT, + TGI_LLM_ENDPOINT, + TGI_LLM_MAX_NEW_TOKENS, + TGI_LLM_TEMPERATURE, + TGI_LLM_TIMEOUT, + TGI_LLM_TOP_K, + TGI_LLM_TOP_P, + USE_ONE_ENTITY_COLLECTION, +) +from fastapi import File, Form, HTTPException, UploadFile +from langchain.text_splitter import RecursiveCharacterTextSplitter +from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceHubEmbeddings +from langchain_community.graphs.arangodb_graph import ArangoGraph +from langchain_community.llms import HuggingFaceEndpoint +from langchain_core.documents import Document +from langchain_core.prompts import ChatPromptTemplate +from langchain_experimental.graph_transformers import LLMGraphTransformer +from langchain_openai import ChatOpenAI, OpenAIEmbeddings +from langchain_text_splitters import HTMLHeaderTextSplitter + +from comps import CustomLogger, DocPath, opea_microservices, register_microservice +from comps.dataprep.utils import ( + document_loader, + encode_filename, + get_separators, + 
get_tables_result, + parse_html, + save_content_to_local_disk, +) + +logger = CustomLogger("prepare_doc_arango") +logflag = os.getenv("LOGFLAG", True) + +upload_folder = "./uploaded_files/" + +PROMPT_TEMPLATE = None +if SYSTEM_PROMPT_PATH is not None: + try: + with open(SYSTEM_PROMPT_PATH, "r") as f: + PROMPT_TEMPLATE = ChatPromptTemplate.from_messages( + [ + ( + "system", + f.read(), + ), + ( + "human", + ( + "Tip: Make sure to answer in the correct format and do " + "not include any explanations. " + "Use the given format to extract information from the " + "following input: {input}" + ), + ), + ] + ) + except Exception as e: + logger.error(f"Could not set custom Prompt: {e}") + + +def ingest_data_to_arango(doc_path: DocPath, graph_name: str, create_embeddings: bool) -> bool: + """Ingest document to ArangoDB.""" + path = doc_path.path + if logflag: + logger.info(f"Parsing document {path}.") + + ############################# + # Text Generation Inference # + ############################# + + if OPENAI_API_KEY: + if logflag: + logger.info("OpenAI API Key is set. 
Verifying its validity...") + openai.api_key = OPENAI_API_KEY + + try: + openai.models.list() + if logflag: + logger.info("OpenAI API Key is valid.") + llm = ChatOpenAI(temperature=0, model_name="gpt-4o") + except openai.error.AuthenticationError: + if logflag: + logger.info("OpenAI API Key is invalid.") + except Exception as e: + if logflag: + logger.info(f"An error occurred while verifying the API Key: {e}") + + elif TGI_LLM_ENDPOINT: + llm = HuggingFaceEndpoint( + endpoint_url=TGI_LLM_ENDPOINT, + max_new_tokens=TGI_LLM_MAX_NEW_TOKENS, + top_k=TGI_LLM_TOP_K, + top_p=TGI_LLM_TOP_P, + temperature=TGI_LLM_TEMPERATURE, + timeout=TGI_LLM_TIMEOUT, + ) + else: + raise ValueError("No text generation inference endpoint is set.") + + try: + llm_transformer = LLMGraphTransformer( + llm=llm, + allowed_nodes=ALLOWED_NODES, + allowed_relationships=ALLOWED_RELATIONSHIPS, + prompt=PROMPT_TEMPLATE, + node_properties=NODE_PROPERTIES if NODE_PROPERTIES else False, + relationship_properties=RELATIONSHIP_PROPERTIES if RELATIONSHIP_PROPERTIES else False, + ) + except (TypeError, ValueError) as e: + if logflag: + logger.warning(f"Advanced LLMGraphTransformer failed: {e}") + # Fall back to basic config + try: + llm_transformer = LLMGraphTransformer(llm=llm) + except (TypeError, ValueError) as e: + if logflag: + logger.error(f"Failed to initialize LLMGraphTransformer: {e}") + raise + + ######################################## + # Text Embeddings Inference (optional) # + ######################################## + + embeddings = None + if create_embeddings: + if OPENAI_API_KEY: + # Use OpenAI embeddings + embeddings = OpenAIEmbeddings( + model=OPENAI_EMBED_MODEL, + dimensions=OPENAI_EMBED_DIMENSIONS, + ) + + elif TEI_EMBEDDING_ENDPOINT and HUGGINGFACEHUB_API_TOKEN: + # Use TEI endpoint service + embeddings = HuggingFaceHubEmbeddings( + model=TEI_EMBEDDING_ENDPOINT, + huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN, + ) + elif TEI_EMBED_MODEL: + # Use local embedding model + embeddings = 
HuggingFaceBgeEmbeddings(model_name=TEI_EMBED_MODEL) + else: + if logflag: + logger.warning("No embeddings environment variables are set, cannot generate embeddings.") + embeddings = None + + ############ + # ArangoDB # + ############ + + client = ArangoClient(hosts=ARANGO_URL) + sys_db = client.db(name="_system", username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) + + if not sys_db.has_database(ARANGO_DB_NAME): + sys_db.create_database(ARANGO_DB_NAME) + + db = client.db(name=ARANGO_DB_NAME, username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) + + graph = ArangoGraph( + db=db, + include_examples=False, + generate_schema_on_init=False, + ) + + ############ + # Chunking # + ############ + + if path.endswith(".html"): + headers_to_split_on = [ + ("h1", "Header 1"), + ("h2", "Header 2"), + ("h3", "Header 3"), + ] + text_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on) + else: + text_splitter = RecursiveCharacterTextSplitter( + chunk_size=doc_path.chunk_size, + chunk_overlap=doc_path.chunk_overlap, + add_start_index=True, + separators=get_separators(), + ) + + content = document_loader(path) + + structured_types = [".xlsx", ".csv", ".json", "jsonl"] + _, ext = os.path.splitext(path) + + if ext in structured_types: + chunks = content + else: + chunks = text_splitter.split_text(content) + + if doc_path.process_table and path.endswith(".pdf"): + table_chunks = get_tables_result(path, doc_path.table_strategy) + if isinstance(table_chunks, list): + chunks = chunks + table_chunks + if logflag: + logger.info("Done preprocessing. 
Created ", len(chunks), " chunks of the original file.") + + ################################ + # Graph generation & insertion # + ################################ + + generate_chunk_embeddings = embeddings is not None + + for text in chunks: + document = Document(page_content=text) + graph_doc = llm_transformer.process_response(document) + + if generate_chunk_embeddings: + source = graph_doc.source + source.metadata["embeddings"] = embeddings.embed_documents([source.page_content])[0] + + graph.add_graph_documents( + graph_documents=[graph_doc], + include_source=True, + graph_name=graph_name, + update_graph_definition_if_exists=not USE_ONE_ENTITY_COLLECTION, + batch_size=ARANGO_BATCH_SIZE, + use_one_entity_collection=USE_ONE_ENTITY_COLLECTION, + insert_async=INSERT_ASYNC, + ) + + if logflag: + logger.info("The graph is built.") + + return True + + +@register_microservice( + name="opea_service@prepare_doc_arango", + endpoint="/v1/dataprep", + host="0.0.0.0", + port=6007, + input_datatype=DocPath, + output_datatype=None, +) +async def ingest_documents( + files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), + link_list: Optional[str] = Form(None), + chunk_size: int = Form(1500), + chunk_overlap: int = Form(100), + process_table: bool = Form(False), + table_strategy: str = Form("fast"), + graph_name: str = Form("Graph"), + create_embeddings: bool = Form(True), +): + if logflag: + logger.info(f"files:{files}") + logger.info(f"link_list:{link_list}") + + if files: + if not isinstance(files, list): + files = [files] + uploaded_files = [] + for file in files: + encode_file = encode_filename(file.filename) + save_path = upload_folder + encode_file + await save_content_to_local_disk(save_path, file) + ingest_data_to_arango( + DocPath( + path=save_path, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + process_table=process_table, + table_strategy=table_strategy, + ), + graph_name=graph_name, + create_embeddings=create_embeddings, + ) + 
uploaded_files.append(save_path) + if logflag: + logger.info(f"Successfully saved file {save_path}") + result = {"status": 200, "message": "Data preparation succeeded"} + if logflag: + logger.info(result) + return result + + if link_list: + link_list = json.loads(link_list) # Parse JSON string to list + if not isinstance(link_list, list): + raise HTTPException(status_code=400, detail="link_list should be a list.") + for link in link_list: + encoded_link = encode_filename(link) + save_path = upload_folder + encoded_link + ".txt" + content = parse_html([link])[0][0] + try: + await save_content_to_local_disk(save_path, content) + ingest_data_to_arango( + DocPath( + path=save_path, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + process_table=process_table, + table_strategy=table_strategy, + ), + graph_name=graph_name, + create_embeddings=create_embeddings, + ) + except json.JSONDecodeError: + raise HTTPException(status_code=500, detail="Fail to ingest data into qdrant.") + + if logflag: + logger.info(f"Successfully saved link {link}") + + result = {"status": 200, "message": "Data preparation succeeded"} + if logflag: + logger.info(result) + return result + + raise HTTPException(status_code=400, detail="Must provide either a file or a string list.") + + +if __name__ == "__main__": + opea_microservices["opea_service@prepare_doc_arango"].start() diff --git a/comps/dataprep/arango/langchain/requirements.txt b/comps/dataprep/arango/langchain/requirements.txt new file mode 100644 index 0000000000..74d4a9f0dd --- /dev/null +++ b/comps/dataprep/arango/langchain/requirements.txt @@ -0,0 +1,32 @@ +beautifulsoup4 +cairosvg +docarray[full] +docx2txt +easyocr +fastapi +huggingface_hub +langchain +git+https://github.com/arangoml/langchain.git@arangodb#subdirectory=libs/community +langchain-experimental +langchain-openai +langchain-text-splitters +langchain_huggingface +markdown +python-arango +cityhash +numpy +openai +opentelemetry-api +opentelemetry-exporter-otlp 
+opentelemetry-sdk +pandas +Pillow +prometheus-fastapi-instrumentator +pymupdf +pytesseract +python-docx +python-pptx +sentence_transformers +shortuuid +unstructured[all-docs]==0.15.7 +uvicorn From 32445331c07b063a61db1ff35aa5a8be923c5bc4 Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Mon, 30 Dec 2024 14:18:29 -0500 Subject: [PATCH 17/35] new: `source_metadata_fields_to_extract_to_top_level` --- comps/dataprep/arango/langchain/prepare_doc_arango.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/comps/dataprep/arango/langchain/prepare_doc_arango.py b/comps/dataprep/arango/langchain/prepare_doc_arango.py index d2d467cff8..77e73c00c4 100644 --- a/comps/dataprep/arango/langchain/prepare_doc_arango.py +++ b/comps/dataprep/arango/langchain/prepare_doc_arango.py @@ -236,7 +236,7 @@ def ingest_data_to_arango(doc_path: DocPath, graph_name: str, create_embeddings: if generate_chunk_embeddings: source = graph_doc.source - source.metadata["embeddings"] = embeddings.embed_documents([source.page_content])[0] + source.metadata["embedding"] = embeddings.embed_documents([source.page_content])[0] graph.add_graph_documents( graph_documents=[graph_doc], @@ -246,6 +246,7 @@ def ingest_data_to_arango(doc_path: DocPath, graph_name: str, create_embeddings: batch_size=ARANGO_BATCH_SIZE, use_one_entity_collection=USE_ONE_ENTITY_COLLECTION, insert_async=INSERT_ASYNC, + source_metadata_fields_to_extract_to_top_level={"embedding"}, ) if logflag: From 3604fb83c0f027cad2b883e5ab7250a6b3f69db9 Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Mon, 30 Dec 2024 17:59:31 -0500 Subject: [PATCH 18/35] fix: logger info --- comps/dataprep/arango/langchain/prepare_doc_arango.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comps/dataprep/arango/langchain/prepare_doc_arango.py b/comps/dataprep/arango/langchain/prepare_doc_arango.py index 77e73c00c4..88b3b59868 100644 --- a/comps/dataprep/arango/langchain/prepare_doc_arango.py +++ 
b/comps/dataprep/arango/langchain/prepare_doc_arango.py @@ -222,7 +222,7 @@ def ingest_data_to_arango(doc_path: DocPath, graph_name: str, create_embeddings: if isinstance(table_chunks, list): chunks = chunks + table_chunks if logflag: - logger.info("Done preprocessing. Created ", len(chunks), " chunks of the original file.") + logger.info(f"Done preprocessing. Created {len(chunks)} chunks of the original file.") ################################ # Graph generation & insertion # From 50b26397eafd836d0cce24418d3c60c6472fad3d Mon Sep 17 00:00:00 2001 From: Anthony Mahanna <43019056+aMahanna@users.noreply.github.com> Date: Sun, 5 Jan 2025 17:25:53 -0500 Subject: [PATCH 19/35] ArangoDB: Retriever (#2) * wip: retriever * rename: `arango` * checkpoint * cleanup * fix: env * update retriever compose * add test file * fix: config & dockerfile * fix: embedding field name * new: config variables * new: traverse graph after similarity * fix: string * add `uniqueVertices` * add filter * infra * fix: query * remove: `similarity_distance_threshold` * temp: replace `p` * cleanup * remove: `ARANGO_TRAVERSAL_MIN_DEPTH` * update max_depth * new: `fetch_neighborhoods` * fix: test * cleanup: `prepare_doc_arango.py` * move `graph` & `vector_db` instantiation * cleanup: dataprep readme * cleanup: retriever * fix: arango test scripts * Update test_retrievers_arango_langchain.sh * update `ARANGO_EMBEDDING_DIMENSION` * fix: env vars * cleanup: retriever port * new: `test_dataprep_arango_langchain` * new: retriever yaml * Changing naming convention from arangodb to arango to ensure consistency between microservices, updated dockerfile to match and removed space in port * fix: retriever name * remove: `retriever_arangodb` --------- Co-authored-by: Ajay Kallepalli --- .../docker/compose/retrievers-compose.yaml | 4 + comps/dataprep/arango/langchain/README.md | 42 +-- comps/dataprep/arango/langchain/config.py | 4 +- .../arango/langchain/prepare_doc_arango.py | 205 +++++++------- 
comps/retrievers/README.md | 4 + comps/retrievers/arango/__init__.py | 0 comps/retrievers/arango/langchain/Dockerfile | 34 +++ comps/retrievers/arango/langchain/README.md | 144 ++++++++++ comps/retrievers/arango/langchain/__init__.py | 2 + comps/retrievers/arango/langchain/config.py | 32 +++ .../docker-compose-retriever-arango.yaml | 54 ++++ .../arango/langchain/requirements.txt | 22 ++ .../arango/langchain/retriever_arango.py | 249 ++++++++++++++++++ tests/chathistory/test_chathistory_arango.sh | 2 +- .../test_dataprep_arango_langchain.sh | 105 ++++++++ .../test_feedback_management_arango.sh | 2 +- .../test_prompt_registry_arango.sh | 2 +- .../test_retrievers_arango_langchain.sh | 126 +++++++++ 18 files changed, 906 insertions(+), 127 deletions(-) create mode 100644 comps/retrievers/arango/__init__.py create mode 100644 comps/retrievers/arango/langchain/Dockerfile create mode 100644 comps/retrievers/arango/langchain/README.md create mode 100644 comps/retrievers/arango/langchain/__init__.py create mode 100644 comps/retrievers/arango/langchain/config.py create mode 100644 comps/retrievers/arango/langchain/docker-compose-retriever-arango.yaml create mode 100644 comps/retrievers/arango/langchain/requirements.txt create mode 100644 comps/retrievers/arango/langchain/retriever_arango.py create mode 100644 tests/dataprep/test_dataprep_arango_langchain.sh create mode 100644 tests/retrievers/test_retrievers_arango_langchain.sh diff --git a/.github/workflows/docker/compose/retrievers-compose.yaml b/.github/workflows/docker/compose/retrievers-compose.yaml index 7b89ce9bfe..bfc8a29a55 100644 --- a/.github/workflows/docker/compose/retrievers-compose.yaml +++ b/.github/workflows/docker/compose/retrievers-compose.yaml @@ -47,3 +47,7 @@ services: build: dockerfile: comps/retrievers/neo4j/llama_index/Dockerfile image: ${REGISTRY:-opea}/retriever-neo4j-llamaindex:${TAG:-latest} + retriever-arango: + build: + dockerfile: comps/retrievers/arango/langchain/Dockerfile + image: 
${REGISTRY:-opea}/retriever-arango:${TAG:-latest} \ No newline at end of file diff --git a/comps/dataprep/arango/langchain/README.md b/comps/dataprep/arango/langchain/README.md index 37f0a078b0..fb383b42e5 100644 --- a/comps/dataprep/arango/langchain/README.md +++ b/comps/dataprep/arango/langchain/README.md @@ -1,6 +1,6 @@ # Dataprep Microservice with ArangoDB -## 🚀Start Microservice with Python +## 🚀 1. Start Microservice with Python ### Install Requirements @@ -31,27 +31,27 @@ export ARANGO_DB_NAME=${your_db_name} export PYTHONPATH=${path_to_comps} ``` -### Start Document Preparation Microservice for ArangoDB with Python Script +See below for additional environment variables that can be set. -Start document preparation microservice for ArangoDB with below command. +### Start Dataprep Service ```bash python prepare_doc_arango.py ``` -## 🚀Start Microservice with Docker +## 🚀 2. Start Microservice with Docker ### Build Docker Image ```bash -cd ../../../../ +cd /your/path/to/GenAIComps docker build -t opea/dataprep-arango:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/arango/langchain/Dockerfile . ``` ### Run Docker with CLI ```bash -docker run -d --name="dataprep-arango-server" -p 6007:6007 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/dataprep-arango:latest +docker run -d --name="dataprep-arango-server" -p 6007:6007 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e ... opea/dataprep-arango:latest ``` ### Run Docker with Docker Compose @@ -61,13 +61,9 @@ cd comps/dataprep/arango/langchain docker compose -f docker-compose-dataprep-arango.yaml up -d ``` -## Invoke Microservice +## 🚀 3. Consume Dataprep Service -Once document preparation microservice for ArangoDB is started, user can use below command to invoke the microservice to convert the document to embedding and save to the database. - -After the service is complete a Graph is created in ArangoDB. 
The default graph name is `Graph`, you can specify the graph name by `-F "graph_name=${your_graph_name}"` in the curl command. - -By default, the microservice will create embeddings for the documents if embedding environment variables are specified. You can specify `-F "create_embeddings=false"` to skip the embedding creation. +An ArangoDB Graph is created from the documents provided to the microservice. The microservice will extract entities from the documents and create nodes and relationships in the graph based on the entities extracted. The microservice will also create embeddings for the documents if embedding environment variables are specified. ```bash curl -X POST \ @@ -77,7 +73,11 @@ curl -X POST \ http://localhost:6007/v1/dataprep ``` -You can specify chunk_size and chunk_size by the following commands. +You can specify the graph name with `-F "graph_name=${your_graph_name}"` in the curl command. + +By default, the microservice will create embeddings for the documents if embedding environment variables are specified. You can specify `-F "create_embeddings=false"` to skip document embedding creation. + +You can also specify the `chunk_size` and `chunk_overlap` with the following parameters: ```bash curl -X POST \ @@ -89,11 +89,11 @@ curl -X POST \ http://localhost:6007/v1/dataprep ``` -We support table extraction from pdf documents. You can specify process_table and table_strategy by the following commands. "table_strategy" refers to the strategies to understand tables for table retrieval. As the setting progresses from "fast" to "hq" to "llm," the focus shifts towards deeper table understanding at the expense of processing speed. The default strategy is "fast". - -Note: If you specify "table_strategy=llm", You should first start TGI Service, please refer to 1.2.1, 1.3.1 in https://github.com/opea-project/GenAIComps/tree/main/comps/llms/README.md, and then `export TGI_LLM_ENDPOINT="http://${your_ip}:8008"`. +We support table extraction from pdf documents. 
You can specify `process_table` and `table_strategy` with the following parameters: +- `table_strategy` refers to the strategies to understand tables for table retrieval. As the setting progresses from `"fast"` to `"hq"` to `"llm"`, the focus shifts towards deeper table understanding at the expense of processing speed. The default strategy is `"fast"`. +- `process_table` refers to whether to process tables in the document. The default value is `False`. -For ensure the quality and comprehensiveness of the extracted entities, we recommend to use `gpt-4o` as the default model for parsing the document. To enable the openai service, please `export OPENAI_API_KEY=xxxx` before using this services. +Note: If you specify `"table_strategy=llm"`, you should first start the TGI Service. Please refer to 1.2.1, 1.3.1 in https://github.com/opea-project/GenAIComps/tree/main/comps/llms/README.md, and then `export TGI_LLM_ENDPOINT="http://${your_ip}:8008"`. ```bash curl -X POST \ @@ -107,13 +107,15 @@ curl -X POST \ --- -Additional options that can be specified from the environment variables are as follows (default values are in the config.py file): +Additional options that can be specified from the environment variables are as follows (default values are also in the `config.py` file): -ArangoDB Configuration: +ArangoDB Connection configuration - `ARANGO_URL`: The URL for the ArangoDB service. - `ARANGO_USERNAME`: The username for the ArangoDB service. - `ARANGO_PASSWORD`: The password for the ArangoDB service. - `ARANGO_DB_NAME`: The name of the database to use for the ArangoDB service. + +ArangoDB Graph Insertion configuration - `USE_ONE_ENTITY_COLLECTION`: If set to True, the microservice will use a single entity collection for all nodes. If set to False, the microservice will use a separate collection by node type. Defaults to `True`. - `INSERT_ASYNC`: If set to True, the microservice will insert the data into ArangoDB asynchronously. Defaults to `False`. 
- `ARANGO_BATCH_SIZE`: The batch size for the microservice to insert the data. Defaults to `500`. @@ -127,7 +129,7 @@ Text Generation Inference Configuration - `TGI_LLM_TIMEOUT`: The timeout for the TGI service. Defaults to `600`. Text Embeddings Inferencing Configuration -**Note**: This is optional functionality to generate embeddings for text chunks. +**Note**: This is optional functionality to generate embeddings for documents (i.e text chunks). - `TEI_EMBEDDING_ENDPOINT`: The endpoint for the TEI service. - `HUGGINGFACEHUB_API_TOKEN`: The API token for the Hugging Face Hub. - `TEI_EMBED_MODEL`: The model to use for the TEI service. Defaults to `BAAI/bge-base-en-v1.5`. diff --git a/comps/dataprep/arango/langchain/config.py b/comps/dataprep/arango/langchain/config.py index 1f2312e590..c3caf2da25 100644 --- a/comps/dataprep/arango/langchain/config.py +++ b/comps/dataprep/arango/langchain/config.py @@ -3,13 +3,13 @@ import os -# ArangoDB configuration +# ArangoDB Connection configuration ARANGO_URL = os.getenv("ARANGO_URL", "http://localhost:8529") ARANGO_USERNAME = os.getenv("ARANGO_USERNAME", "root") ARANGO_PASSWORD = os.getenv("ARANGO_PASSWORD", "test") ARANGO_DB_NAME = os.getenv("ARANGO_DB_NAME", "_system") -# ArangoDB graph configuration +# ArangoDB Graph Insertion configuration USE_ONE_ENTITY_COLLECTION = os.getenv("USE_ONE_ENTITY_COLLECTION", True) INSERT_ASYNC = os.getenv("INSERT_ASYNC", False) ARANGO_BATCH_SIZE = os.getenv("ARANGO_BATCH_SIZE", 500) diff --git a/comps/dataprep/arango/langchain/prepare_doc_arango.py b/comps/dataprep/arango/langchain/prepare_doc_arango.py index 88b3b59868..a89984f082 100644 --- a/comps/dataprep/arango/langchain/prepare_doc_arango.py +++ b/comps/dataprep/arango/langchain/prepare_doc_arango.py @@ -19,6 +19,8 @@ INSERT_ASYNC, NODE_PROPERTIES, OPENAI_API_KEY, + OPENAI_CHAT_MODEL, + OPENAI_CHAT_TEMPERATURE, OPENAI_EMBED_DIMENSIONS, OPENAI_EMBED_MODEL, RELATIONSHIP_PROPERTIES, @@ -84,110 +86,13 @@ logger.error(f"Could not set 
custom Prompt: {e}") -def ingest_data_to_arango(doc_path: DocPath, graph_name: str, create_embeddings: bool) -> bool: +def ingest_data_to_arango(doc_path: DocPath, graph_name: str, generate_chunk_embeddings: bool) -> bool: """Ingest document to ArangoDB.""" path = doc_path.path + if logflag: logger.info(f"Parsing document {path}.") - ############################# - # Text Generation Inference # - ############################# - - if OPENAI_API_KEY: - if logflag: - logger.info("OpenAI API Key is set. Verifying its validity...") - openai.api_key = OPENAI_API_KEY - - try: - openai.models.list() - if logflag: - logger.info("OpenAI API Key is valid.") - llm = ChatOpenAI(temperature=0, model_name="gpt-4o") - except openai.error.AuthenticationError: - if logflag: - logger.info("OpenAI API Key is invalid.") - except Exception as e: - if logflag: - logger.info(f"An error occurred while verifying the API Key: {e}") - - elif TGI_LLM_ENDPOINT: - llm = HuggingFaceEndpoint( - endpoint_url=TGI_LLM_ENDPOINT, - max_new_tokens=TGI_LLM_MAX_NEW_TOKENS, - top_k=TGI_LLM_TOP_K, - top_p=TGI_LLM_TOP_P, - temperature=TGI_LLM_TEMPERATURE, - timeout=TGI_LLM_TIMEOUT, - ) - else: - raise ValueError("No text generation inference endpoint is set.") - - try: - llm_transformer = LLMGraphTransformer( - llm=llm, - allowed_nodes=ALLOWED_NODES, - allowed_relationships=ALLOWED_RELATIONSHIPS, - prompt=PROMPT_TEMPLATE, - node_properties=NODE_PROPERTIES if NODE_PROPERTIES else False, - relationship_properties=RELATIONSHIP_PROPERTIES if RELATIONSHIP_PROPERTIES else False, - ) - except (TypeError, ValueError) as e: - if logflag: - logger.warning(f"Advanced LLMGraphTransformer failed: {e}") - # Fall back to basic config - try: - llm_transformer = LLMGraphTransformer(llm=llm) - except (TypeError, ValueError) as e: - if logflag: - logger.error(f"Failed to initialize LLMGraphTransformer: {e}") - raise - - ######################################## - # Text Embeddings Inference (optional) # - 
######################################## - - embeddings = None - if create_embeddings: - if OPENAI_API_KEY: - # Use OpenAI embeddings - embeddings = OpenAIEmbeddings( - model=OPENAI_EMBED_MODEL, - dimensions=OPENAI_EMBED_DIMENSIONS, - ) - - elif TEI_EMBEDDING_ENDPOINT and HUGGINGFACEHUB_API_TOKEN: - # Use TEI endpoint service - embeddings = HuggingFaceHubEmbeddings( - model=TEI_EMBEDDING_ENDPOINT, - huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN, - ) - elif TEI_EMBED_MODEL: - # Use local embedding model - embeddings = HuggingFaceBgeEmbeddings(model_name=TEI_EMBED_MODEL) - else: - if logflag: - logger.warning("No embeddings environment variables are set, cannot generate embeddings.") - embeddings = None - - ############ - # ArangoDB # - ############ - - client = ArangoClient(hosts=ARANGO_URL) - sys_db = client.db(name="_system", username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) - - if not sys_db.has_database(ARANGO_DB_NAME): - sys_db.create_database(ARANGO_DB_NAME) - - db = client.db(name=ARANGO_DB_NAME, username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) - - graph = ArangoGraph( - db=db, - include_examples=False, - generate_schema_on_init=False, - ) - ############ # Chunking # ############ @@ -221,6 +126,7 @@ def ingest_data_to_arango(doc_path: DocPath, graph_name: str, create_embeddings: table_chunks = get_tables_result(path, doc_path.table_strategy) if isinstance(table_chunks, list): chunks = chunks + table_chunks + if logflag: logger.info(f"Done preprocessing. 
Created {len(chunks)} chunks of the original file.") @@ -228,7 +134,11 @@ def ingest_data_to_arango(doc_path: DocPath, graph_name: str, create_embeddings: # Graph generation & insertion # ################################ - generate_chunk_embeddings = embeddings is not None + graph = ArangoGraph( + db=db, + include_examples=False, + generate_schema_on_init=False, + ) for text in chunks: document = Document(page_content=text) @@ -294,7 +204,7 @@ async def ingest_documents( table_strategy=table_strategy, ), graph_name=graph_name, - create_embeddings=create_embeddings, + generate_chunk_embeddings=create_embeddings and embeddings is not None, ) uploaded_files.append(save_path) if logflag: @@ -323,7 +233,7 @@ async def ingest_documents( table_strategy=table_strategy, ), graph_name=graph_name, - create_embeddings=create_embeddings, + generate_chunk_embeddings=create_embeddings and embeddings is not None, ) except json.JSONDecodeError: raise HTTPException(status_code=500, detail="Fail to ingest data into qdrant.") @@ -340,4 +250,95 @@ async def ingest_documents( if __name__ == "__main__": + + ############################# + # Text Generation Inference # + ############################# + + if OPENAI_API_KEY: + if logflag: + logger.info("OpenAI API Key is set. 
Verifying its validity...") + openai.api_key = OPENAI_API_KEY + + try: + openai.models.list() + if logflag: + logger.info("OpenAI API Key is valid.") + llm = ChatOpenAI(temperature=OPENAI_CHAT_TEMPERATURE, model_name=OPENAI_CHAT_MODEL) + except openai.error.AuthenticationError: + if logflag: + logger.info("OpenAI API Key is invalid.") + except Exception as e: + if logflag: + logger.info(f"An error occurred while verifying the API Key: {e}") + + elif TGI_LLM_ENDPOINT: + llm = HuggingFaceEndpoint( + endpoint_url=TGI_LLM_ENDPOINT, + max_new_tokens=TGI_LLM_MAX_NEW_TOKENS, + top_k=TGI_LLM_TOP_K, + top_p=TGI_LLM_TOP_P, + temperature=TGI_LLM_TEMPERATURE, + timeout=TGI_LLM_TIMEOUT, + ) + else: + raise ValueError("No text generation inference endpoint is set.") + + try: + llm_transformer = LLMGraphTransformer( + llm=llm, + allowed_nodes=ALLOWED_NODES, + allowed_relationships=ALLOWED_RELATIONSHIPS, + prompt=PROMPT_TEMPLATE, + node_properties=NODE_PROPERTIES or False, + relationship_properties=RELATIONSHIP_PROPERTIES or False, + ) + except (TypeError, ValueError) as e: + if logflag: + logger.warning(f"Advanced LLMGraphTransformer failed: {e}") + # Fall back to basic config + try: + llm_transformer = LLMGraphTransformer(llm=llm) + except (TypeError, ValueError) as e: + if logflag: + logger.error(f"Failed to initialize LLMGraphTransformer: {e}") + raise + + ######################################## + # Text Embeddings Inference (optional) # + ######################################## + + if OPENAI_API_KEY: + # Use OpenAI embeddings + embeddings = OpenAIEmbeddings( + model=OPENAI_EMBED_MODEL, + dimensions=OPENAI_EMBED_DIMENSIONS, + ) + + elif TEI_EMBEDDING_ENDPOINT and HUGGINGFACEHUB_API_TOKEN: + # Use TEI endpoint service + embeddings = HuggingFaceHubEmbeddings( + model=TEI_EMBEDDING_ENDPOINT, + huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN, + ) + elif TEI_EMBED_MODEL: + # Use local embedding model + embeddings = HuggingFaceBgeEmbeddings(model_name=TEI_EMBED_MODEL) + else: + 
if logflag: + logger.warning("No embeddings environment variables are set, cannot generate embeddings.") + embeddings = None + + ############ + # ArangoDB # + ############ + + client = ArangoClient(hosts=ARANGO_URL) + sys_db = client.db(name="_system", username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) + + if not sys_db.has_database(ARANGO_DB_NAME): + sys_db.create_database(ARANGO_DB_NAME) + + db = client.db(name=ARANGO_DB_NAME, username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) + opea_microservices["opea_service@prepare_doc_arango"].start() diff --git a/comps/retrievers/README.md b/comps/retrievers/README.md index eeba8860e6..6ae15db0ef 100644 --- a/comps/retrievers/README.md +++ b/comps/retrievers/README.md @@ -33,3 +33,7 @@ For details, please refer to this [readme](vdms/langchain/README.md) ## Retriever Microservice with Multimodal For details, please refer to this [readme](multimodal/redis/langchain/README.md) + +## Retriever Microservice with ArangoDB + +For details, please refer to this [readme](arango/langchain/README.md) diff --git a/comps/retrievers/arango/__init__.py b/comps/retrievers/arango/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/comps/retrievers/arango/langchain/Dockerfile b/comps/retrievers/arango/langchain/Dockerfile new file mode 100644 index 0000000000..27a04ccb91 --- /dev/null +++ b/comps/retrievers/arango/langchain/Dockerfile @@ -0,0 +1,34 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +FROM python:3.11-slim + +ENV LANG=C.UTF-8 + +ENV HUGGINGFACEHUB_API_TOKEN=dummy + +ARG ARCH="cpu" + +RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ + build-essential \ + libgl1-mesa-glx \ + libjemalloc-dev \ + git + +RUN useradd -m -s /bin/bash user && \ + mkdir -p /home/user && \ + chown -R user /home/user/ + +USER user + +COPY comps /home/user/comps + +RUN pip install --no-cache-dir --upgrade pip setuptools && \ + if [ ${ARCH} 
= "cpu" ]; then pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu; fi && \ + pip install --no-cache-dir -r /home/user/comps/retrievers/arango/langchain/requirements.txt + +ENV PYTHONPATH=$PYTHONPATH:/home/user + +WORKDIR /home/user/comps/retrievers/arango/langchain + +ENTRYPOINT ["python", "retriever_arango.py"] diff --git a/comps/retrievers/arango/langchain/README.md b/comps/retrievers/arango/langchain/README.md new file mode 100644 index 0000000000..66b6e74bc5 --- /dev/null +++ b/comps/retrievers/arango/langchain/README.md @@ -0,0 +1,144 @@ +# Retriever Microservice with ArangoDB + +## 🚀 1. Start Microservice with Python + +### Install Requirements + +```bash +pip install -r requirements.txt +apt-get install libtesseract-dev -y +apt-get install poppler-utils -y +``` + +### Start ArangoDB Server + +To launch ArangoDB locally, first ensure you have docker installed. Then, you can launch the database with the following docker command. + +```bash +docker run -d --name arangodb -p 8529:8529 -e ARANGO_ROOT_PASSWORD=password arangodb/arangodb:latest +``` + +### Setup Environment Variables + +```bash +export no_proxy=${your_no_proxy} +export http_proxy=${your_http_proxy} +export https_proxy=${your_http_proxy} +export ARANGO_URL=${your_arango_url} +export ARANGO_USERNAME=${your_arango_username} +export ARANGO_PASSWORD=${your_arango_password} +export ARANGO_DB_NAME=${your_db_name} +export ARANGO_COLLECTION_NAME=${your_collection_name} +export ARANGO_EMBEDDING_DIMENSION=${your_embedding_dimension} +export PYTHONPATH=${path_to_comps} +``` + +See below for additional environment variables that can be set. + +### Start Retriever Service + +```bash +python retriever_arango.py +``` + +## 🚀 2. 
Start Microservice with Docker + +### Build Docker Image + +```bash +cd /your/path/to/GenAIComps +docker build -t opea/retriever-arango:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/retrievers/arango/langchain/Dockerfile . +``` + +### Run Docker with CLI + +```bash +docker run -d --name="retriever-arango-server" -p 7000:7000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e ... opea/retriever-arango:latest +``` + +### Run Docker with Docker Compose + +```bash +cd /your/path/to/GenAIComps/comps/retrievers/arango/langchain +docker compose -f docker-compose-retriever-arango.yaml up -d +``` + +## 🚀 3. Consume Retriever Service + +### 3.1 Check Service Status + +```bash +curl http://${your_ip}:7000/v1/health_check \ + -X GET \ + -H 'Content-Type: application/json' +``` + +### 3.2 Consume Retriever Service + +Assuming you have an ArangoDB Collection with the documents you want to retrieve from, you can consume the retriever service with the following curl command. + +```bash +curl http://${your_ip}:7000/v1/retrieval \ + -X POST \ + -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":[]}" \ + -H 'Content-Type: application/json' +``` + +If `embedding` is not specified or is an empty list, the retriever will use the text to generate an embedding using the Embedding Environment variables provided. 
+ +Additional parameters can be set for the retriever: + +```bash +curl http://localhost:7000/v1/retrieval \ + -X POST \ + -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":[],\"search_type\":\"similarity\", \"k\":4}" \ + -H 'Content-Type: application/json' +``` + +```bash +curl http://localhost:7000/v1/retrieval \ + -X POST \ + -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":[],\"search_type\":\"similarity_score_threshold\", \"k\":4, \"score_threshold\":0.2}" \ + -H 'Content-Type: application/json' +``` + +```bash +export your_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") +curl http://localhost:7000/v1/retrieval \ + -X POST \ + -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${your_embedding},\"search_type\":\"mmr\", \"k\":4, \"fetch_k\":20, \"lambda_mult\":0.5}" \ + -H 'Content-Type: application/json' +``` + +--- + +Additional options that can be specified from the environment variables are as follows (default values are also in the `config.py` file): + +ArangoDB Connection configuration +- `ARANGO_URL`: The URL for the ArangoDB service. +- `ARANGO_USERNAME`: The username for the ArangoDB service. +- `ARANGO_PASSWORD`: The password for the ArangoDB service. +- `ARANGO_DB_NAME`: The name of the database to use for the ArangoDB service. + +ArangoDB Collection configuration +- `ARANGO_COLLECTION_NAME`: The name of the collection containing the documents. +- `ARANGO_DISTANCE_STRATEGY`: The distance strategy to use for the embeddings. Options are `COSINE` and `L2` (euclidean distance). +- `ARANGO_USE_APPROX_SEARCH`: Whether to use approximate neighbor search. If False, exact search will be used (slower, but more accurate). If True, approximate search will be used (faster, but less accurate). Defaults to `True`. +- `ARANGO_TEXT_FIELD`: The document field name storing the text. +- `ARANGO_EMBEDDING_FIELD`: The document field name storing the embeddings. 
+- `ARANGO_EMBEDDING_DIMENSION`: The dimension of the document embeddings. +- `ARANGO_NUM_CENTROIDS`: The number of centroids to use for the approximate search. Defaults to `1`, which is essentially exhaustive search. + +ArangoDB Traversal configuration +- `ARANGO_TRAVERSAL_GRAPH_NAME`: If specified, the retriever will traverse the graph to retrieve the neighborhood of the retrieved documents. +- `ARANGO_TRAVERSAL_MAX_DEPTH`: The maximum depth to traverse the graph. Defaults to `1`. + +Embedding Configuration +- `TEI_EMBED_MODEL`: The model to use for the TEI service. Defaults to `BAAI/bge-base-en-v1.5`. +- `TEI_EMBEDDING_ENDPOINT`: The endpoint for the TEI service. +- `HUGGINGFACEHUB_API_TOKEN`: The API token for the Hugging Face Hub. + +OpenAI Configuration: +**Note**: This configuration can replace the TGI and TEI services for text generation and embeddings. +- `OPENAI_API_KEY`: The API key for the OpenAI service. +- `OPENAI_EMBED_MODEL`: The embedding model to use for the OpenAI service. Defaults to `text-embedding-3-small`. 
\ No newline at end of file diff --git a/comps/retrievers/arango/langchain/__init__.py b/comps/retrievers/arango/langchain/__init__.py new file mode 100644 index 0000000000..916f3a44b2 --- /dev/null +++ b/comps/retrievers/arango/langchain/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 diff --git a/comps/retrievers/arango/langchain/config.py b/comps/retrievers/arango/langchain/config.py new file mode 100644 index 0000000000..d1df90cfd3 --- /dev/null +++ b/comps/retrievers/arango/langchain/config.py @@ -0,0 +1,32 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + +# ArangoDB Connection configuration +ARANGO_URL = os.getenv("ARANGO_URL", "http://localhost:8529") +ARANGO_USERNAME = os.getenv("ARANGO_USERNAME", "root") +ARANGO_PASSWORD = os.getenv("ARANGO_PASSWORD", "test") +ARANGO_DB_NAME = os.getenv("ARANGO_DB_NAME", "_system") + +# ArangoDB Vector configuration +ARANGO_COLLECTION_NAME = os.getenv("ARANGO_COLLECTION_NAME", "Graph_SOURCE") +ARANGO_DISTANCE_STRATEGY = os.getenv("ARANGO_DISTANCE_STRATEGY", "COSINE") +ARANGO_USE_APPROX_SEARCH = os.getenv("ARANGO_USE_APPROX_SEARCH", True) +ARANGO_TEXT_FIELD = os.getenv("ARANGO_TEXT_FIELD", "text") +ARANGO_EMBEDDING_FIELD = os.getenv("ARANGO_EMBEDDING_FIELD", "embedding") +ARANGO_EMBEDDING_DIMENSION = os.getenv("ARANGO_EMBEDDING_DIMENSION") +ARANGO_NUM_CENTROIDS = os.getenv("ARANGO_NUM_CENTROIDS", 1) + +# ArangoDB Traversal configuration +ARANGO_TRAVERSAL_GRAPH_NAME = os.getenv("ARANGO_TRAVERSAL_GRAPH_NAME") +ARANGO_TRAVERSAL_MAX_DEPTH = os.getenv("ARANGO_TRAVERSAL_MAX_DEPTH", 1) + +# Embedding configuration +TEI_EMBED_MODEL = os.getenv("TEI_EMBED_MODEL", "BAAI/bge-base-en-v1.5") +TEI_EMBEDDING_ENDPOINT = os.getenv("TEI_EMBEDDING_ENDPOINT", "") +HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN") + +# OpenAI configuration (alternative to TEI & local model) +OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") 
+OPENAI_EMBED_MODEL = os.getenv("OPENAI_EMBED_MODEL", "text-embedding-3-small") diff --git a/comps/retrievers/arango/langchain/docker-compose-retriever-arango.yaml b/comps/retrievers/arango/langchain/docker-compose-retriever-arango.yaml new file mode 100644 index 0000000000..ffa86d28fb --- /dev/null +++ b/comps/retrievers/arango/langchain/docker-compose-retriever-arango.yaml @@ -0,0 +1,54 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +version: "3" +services: + arango-vector-db: + image: jbajic/arango-preview:vector-index-preview-5 + container_name: arango-vector + ports: + - "8529:8529" + environment: + ARANGO_ROOT_PASSWORD: ${ARANGO_PASSWORD} + tei-embedding-service: + image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 + container_name: tei-embedding-server + ports: + - "6006:80" + volumes: + - "./data:/data" + shm_size: 1g + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate + retriever-arango: + image: opea/retriever-arango:latest + container_name: retriever-arango-server + depends_on: + - arango-vector-db + - tei-embedding-service + ports: + - "7000:7000" + ipc: host + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + ARANGO_URL: ${ARANGO_URL} + ARANGO_USERNAME: ${ARANGO_USERNAME} + ARANGO_PASSWORD: ${ARANGO_PASSWORD} + ARANGO_DB_NAME: ${ARANGO_DB_NAME} + ARANGO_COLLECTION_NAME: ${ARANGO_COLLECTION_NAME} + ARANGO_EMBEDDING_DIMENSION: ${ARANGO_EMBEDDING_DIMENSION} + TEI_EMBED_MODEL: ${TEI_EMBED_MODEL} + TEI_EMBEDDING_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT} + HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + OPENAI_API_KEY: ${OPENAI_API_KEY} + restart: unless-stopped + +networks: + default: + driver: bridge diff --git a/comps/retrievers/arango/langchain/requirements.txt 
b/comps/retrievers/arango/langchain/requirements.txt new file mode 100644 index 0000000000..f1d40dbbe0 --- /dev/null +++ b/comps/retrievers/arango/langchain/requirements.txt @@ -0,0 +1,22 @@ +docarray[full] +fastapi +frontend +huggingface_hub +langchain +git+https://github.com/arangoml/langchain.git@arangodb#subdirectory=libs/community +langchain_openai +python-arango +numpy +opentelemetry-api +opentelemetry-exporter-otlp +opentelemetry-sdk +pandas +Pillow +prometheus-fastapi-instrumentator +pydantic +pymupdf +python-docx +sentence_transformers +shortuuid +tiktoken +uvicorn diff --git a/comps/retrievers/arango/langchain/retriever_arango.py b/comps/retrievers/arango/langchain/retriever_arango.py new file mode 100644 index 0000000000..2a7b37412a --- /dev/null +++ b/comps/retrievers/arango/langchain/retriever_arango.py @@ -0,0 +1,249 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os +import time +from typing import Any, Optional, Union + +from arango import ArangoClient +from config import ( + ARANGO_COLLECTION_NAME, + ARANGO_DB_NAME, + ARANGO_DISTANCE_STRATEGY, + ARANGO_EMBEDDING_DIMENSION, + ARANGO_EMBEDDING_FIELD, + ARANGO_NUM_CENTROIDS, + ARANGO_PASSWORD, + ARANGO_TEXT_FIELD, + ARANGO_TRAVERSAL_GRAPH_NAME, + ARANGO_TRAVERSAL_MAX_DEPTH, + ARANGO_URL, + ARANGO_USE_APPROX_SEARCH, + ARANGO_USERNAME, + TEI_EMBEDDING_ENDPOINT, + TEI_EMBED_MODEL, + HUGGINGFACEHUB_API_TOKEN, + OPENAI_API_KEY, + OPENAI_EMBED_MODEL, +) +from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceHubEmbeddings +from langchain_community.vectorstores.arangodb_vector import ArangoVector +from langchain_openai import OpenAIEmbeddings + +from comps import ( + CustomLogger, + EmbedDoc, + SearchedDoc, + ServiceType, + TextDoc, + opea_microservices, + register_microservice, + register_statistics, + statistics_dict, +) +from comps.cores.proto.api_protocol import ( + ChatCompletionRequest, + RetrievalRequest, + RetrievalResponse, + 
RetrievalResponseData, +) + + +class ArangoTextDoc(TextDoc): + neighborhood: Optional[list[dict[str, Any]]] = None + + +class ArangoRetrievalResponseData(RetrievalResponseData): + neighborhood: Optional[list[dict[str, Any]]] = None + + +logger = CustomLogger("retriever_arango") +logflag = os.getenv("LOGFLAG", False) + + +def fetch_neighborhoods( + vector_db: ArangoVector, + keys: list[str], + neighborhoods: dict[str, Any], + graph_name: str, + source_collection_name: str, + max_depth: int, +) -> None: + """Fetch neighborhoods of source documents. Updates the neighborhoods dictionary in-place.""" + if not vector_db.db.has_graph(graph_name): + logger.error("Graph not found in database.") + return + + graph = vector_db.db.graph(graph_name) + + if not graph.has_edge_collection(f"{graph_name}_HAS_SOURCE"): + logger.error(f"Edge collection '{graph_name}_HAS_SOURCE' not found in graph.") + return + + if not graph.has_edge_collection(f"{graph_name}_LINKS_TO"): + logger.error(f"Edge collection '{graph_name}_LINKS_TO' not found in graph.") + return + + if max_depth < 1: + max_depth = 1 + + # TODO: Consider using general `GRAPH` syntax instead of specific edge collections... 
+ aql = f""" + FOR doc IN @@collection + FILTER doc._key IN @keys + + LET entity_neighborhood = ( + FOR v1, e1, p1 IN 1..1 INBOUND doc {graph_name}_HAS_SOURCE + FOR v2, e2, p2 IN 1..{max_depth} ANY v1 {graph_name}_LINKS_TO + RETURN p2 + ) + + RETURN {{[doc._key]: entity_neighborhood}} + """ + + bind_vars = { + "@collection": source_collection_name, + "keys": keys, + } + + cursor = vector_db.db.aql.execute(aql, bind_vars=bind_vars) + + for doc in cursor: + neighborhoods.update(doc) + + +@register_microservice( + name="opea_service@retriever_arango", + service_type=ServiceType.RETRIEVER, + endpoint="/v1/retrieval", + host="0.0.0.0", + port=7000, +) +@register_statistics(names=["opea_service@retriever_arango"]) +async def retrieve( + input: Union[EmbedDoc, RetrievalRequest, ChatCompletionRequest] +) -> Union[SearchedDoc, RetrievalResponse, ChatCompletionRequest]: + if logflag: + logger.info(input) + + start = time.time() + + query = input.text if isinstance(input, EmbedDoc) else input.input + embedding = input.embedding if isinstance(input.embedding, list) else None + + vector_db = ArangoVector( + embedding=embeddings, + embedding_dimension=ARANGO_EMBEDDING_DIMENSION, + database=db, + collection_name=ARANGO_COLLECTION_NAME, + embedding_field=ARANGO_EMBEDDING_FIELD, + text_field=ARANGO_TEXT_FIELD, + distance_strategy=ARANGO_DISTANCE_STRATEGY, + num_centroids=ARANGO_NUM_CENTROIDS, + ) + + if input.search_type == "similarity_score_threshold": + docs_and_similarities = await vector_db.asimilarity_search_with_relevance_scores( + query=query, + embedding=embedding, + k=input.k, + score_threshold=input.score_threshold, + use_approx=ARANGO_USE_APPROX_SEARCH, + ) + search_res = [doc for doc, _ in docs_and_similarities] + elif input.search_type == "mmr": + search_res = await vector_db.amax_marginal_relevance_search( + query=query, + embedding=embedding, + k=input.k, + fetch_k=input.fetch_k, + lambda_mult=input.lambda_mult, + use_approx=ARANGO_USE_APPROX_SEARCH, + ) + else: + # 
Default to basic similarity search + search_res = await vector_db.asimilarity_search( + query=query, + embedding=embedding, + k=input.k, + use_approx=ARANGO_USE_APPROX_SEARCH, + ) + + neighborhoods = {} + if ARANGO_TRAVERSAL_GRAPH_NAME: + fetch_neighborhoods( + vector_db, + neighborhoods, + [r.id for r in search_res], + ARANGO_TRAVERSAL_GRAPH_NAME, + ARANGO_COLLECTION_NAME, + ARANGO_TRAVERSAL_MAX_DEPTH, + ) + + # return different response format + retrieved_docs: Union[list[ArangoTextDoc], list[ArangoRetrievalResponseData]] = [] + if isinstance(input, EmbedDoc): + for r in search_res: + retrieved_docs.append( + ArangoTextDoc( + text=r.page_content, + id=r.id, + neighborhood=neighborhoods.get(r.id), + ) + ) + + result = SearchedDoc(retrieved_docs=retrieved_docs, initial_query=input.text) + + else: + for r in search_res: + retrieved_docs.append( + ArangoRetrievalResponseData( + text=r.page_content, + id=r.id, + metadata=r.metadata, + neighborhood=neighborhoods.get(r.id), + ) + ) + + if isinstance(input, RetrievalRequest): + result = RetrievalResponse(retrieved_docs=retrieved_docs) + + elif isinstance(input, ChatCompletionRequest): + input.retrieved_docs = retrieved_docs + input.documents = [doc.text for doc in retrieved_docs] + result = input + else: + raise ValueError("Invalid input type: ", type(input)) + + statistics_dict["opea_service@retriever_arango"].append_latency(time.time() - start, None) + + if logflag: + logger.info(result) + + return result + + +if __name__ == "__main__": + + if not ARANGO_EMBEDDING_DIMENSION: + raise ValueError("EMBED_DIMENSION must specified in advance.") + + if OPENAI_API_KEY and OPENAI_EMBED_MODEL: + # Use OpenAI embeddings + embeddings = OpenAIEmbeddings(model=OPENAI_EMBED_MODEL, dimensions=ARANGO_EMBEDDING_DIMENSION) + elif TEI_EMBEDDING_ENDPOINT and HUGGINGFACEHUB_API_TOKEN: + # create embeddings using TEI endpoint service + embeddings = HuggingFaceHubEmbeddings(model=TEI_EMBEDDING_ENDPOINT, 
huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN) + else: + # create embeddings using local embedding model + embeddings = HuggingFaceBgeEmbeddings(model_name=TEI_EMBED_MODEL) + + client = ArangoClient(hosts=ARANGO_URL) + sys_db = client.db(name="_system", username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) + + if not sys_db.has_database(ARANGO_DB_NAME): + sys_db.create_database(ARANGO_DB_NAME) + + db = client.db(name=ARANGO_DB_NAME, username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) + + opea_microservices["opea_service@retriever_arango"].start() diff --git a/tests/chathistory/test_chathistory_arango.sh b/tests/chathistory/test_chathistory_arango.sh index f9d731802b..9743a9ac42 100644 --- a/tests/chathistory/test_chathistory_arango.sh +++ b/tests/chathistory/test_chathistory_arango.sh @@ -16,7 +16,7 @@ export ARANGO_COLLECTION_NAME=${ARANGO_COLLECTION_NAME:-"test"} function build_docker_images() { cd $WORKPATH echo $(pwd) - docker run -d -p 8529:8529 --name=test-comps-arango arangodb/arangodb:latest + docker run -d -p 8529:8529 --name=test-comps-arango -e ARANGO_ROOT_PASSWORD=$ARANGO_PASSWORD arangodb/arangodb:latest docker build --no-cache -t opea/chathistory-arango-server:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/chathistory/arango/Dockerfile . if [ $? 
-ne 0 ]; then diff --git a/tests/dataprep/test_dataprep_arango_langchain.sh b/tests/dataprep/test_dataprep_arango_langchain.sh new file mode 100644 index 0000000000..45d82b974e --- /dev/null +++ b/tests/dataprep/test_dataprep_arango_langchain.sh @@ -0,0 +1,105 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -x + +WORKPATH=$(dirname "$PWD") +LOG_PATH="$WORKPATH/tests" +ip_address=$(hostname -I | awk '{print $1}') + +export ARANGO_URL=${ARANGO_URL:-"http://${ip_address}:8529"} +export ARANGO_USERNAME=${ARANGO_USERNAME:-"root"} +export ARANGO_PASSWORD=${ARANGO_PASSWORD:-"test"} +export ARANGO_DB_NAME=${ARANGO_DB_NAME:-"_system"} + +function build_docker_images() { + cd $WORKPATH + echo $(pwd) + docker run -d -p 8529:8529 --name=test-comps-arango -e ARANGO_ROOT_PASSWORD=$ARANGO_PASSWORD arangodb/arangodb:latest + sleep 1m + + docker build --no-cache -t opea/dataprep-arango:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f $WORKPATH/comps/dataprep/arango/langchain/Dockerfile . + if [ $? -ne 0 ]; then + echo "opea/dataprep-arango built fail" + exit 1 + else + echo "opea/dataprep-arango built successful" + fi +} + +function start_service() { + tgi_endpoint=5044 + # Remember to set HF_TOKEN before invoking this test! 
+ export HUGGINGFACEHUB_API_TOKEN=${HF_TOKEN} + model=Intel/neural-chat-7b-v3-3 + docker run -d --name="test-comps-dataprep-tgi-endpoint" -p $tgi_endpoint:80 -v ./data:/data --shm-size 1g ghcr.io/huggingface/text-generation-inference:1.4 --model-id $model + export TGI_LLM_ENDPOINT="http://${ip_address}:${tgi_endpoint}" + + # unset http_proxy + export no_proxy="localhost,127.0.0.1,"${ip_address} + docker run -d --name="test-comps-dataprep-arango-server" \ + -p 6007:6007 \ + --ipc=host \ + -e http_proxy=$http_proxy \ + -e https_proxy=$https_proxy \ + -e ARANGO_URL=$ARANGO_URL \ + -e ARANGO_USERNAME=$ARANGO_USERNAME \ + -e ARANGO_PASSWORD=$ARANGO_PASSWORD \ + -e ARANGO_DB_NAME=$ARANGO_DB_NAME \ + -e TGI_LLM_ENDPOINT=$TGI_LLM_ENDPOINT \ + opea/dataprep-arango:comps + + sleep 1m +} + + +function validate_microservice() { + cd $LOG_PATH + + # test /v1/dataprep + URL="http://${ip_address}:6007/v1/dataprep" + echo "Deep learning is a subset of machine learning that utilizes neural networks with multiple layers to analyze various levels of abstract data representations. It enables computers to identify patterns and make decisions with minimal human intervention by learning from large amounts of data." > $LOG_PATH/dataprep_file.txt + HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -F 'files=@./dataprep_file.txt' -H 'Content-Type: multipart/form-data' "$URL") + if [ "$HTTP_STATUS" -eq 200 ]; then + echo "[ dataprep ] HTTP status is 200. Checking content..." + cp ./dataprep_file.txt ./dataprep_file2.txt + local CONTENT=$(curl -s -X POST -F 'files=@./dataprep_file2.txt' -H 'Content-Type: multipart/form-data' "$URL" | tee ${LOG_PATH}/dataprep.log) + + if echo "$CONTENT" | grep -q "Data preparation succeeded"; then + echo "[ dataprep ] Content is as expected." 
+        else
+            echo "[ dataprep ] Content does not match the expected result: $CONTENT"
+            docker logs test-comps-dataprep-arango-server >> ${LOG_PATH}/dataprep.log
+            exit 1
+        fi
+    else
+        echo "[ dataprep ] HTTP status is not 200. Received status was $HTTP_STATUS"
+        docker logs test-comps-dataprep-arango-server >> ${LOG_PATH}/dataprep.log
+        exit 1
+    fi
+}
+
+function stop_docker() {
+    cid=$(docker ps -aq --filter "name=test-comps-arango*")
+    if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi
+
+    cid=$(docker ps -aq --filter "name=test-comps-dataprep-arango*")
+    if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi
+}
+
+function main() {
+
+    stop_docker
+
+    build_docker_images
+    start_service
+
+    validate_microservice
+
+    stop_docker
+    echo y | docker system prune
+
+}
+
+main
diff --git a/tests/feedback_management/test_feedback_management_arango.sh b/tests/feedback_management/test_feedback_management_arango.sh
index 6bbd32598a..2eec9360eb 100644
--- a/tests/feedback_management/test_feedback_management_arango.sh
+++ b/tests/feedback_management/test_feedback_management_arango.sh
@@ -16,7 +16,7 @@ export ARANGO_COLLECTION_NAME=${ARANGO_COLLECTION_NAME:-"test"}
 function build_docker_images() {
     cd $WORKPATH
     echo $(pwd)
-    docker run -d -p 8529:8529 --name=test-comps-arango arangodb/arangodb:latest
+    docker run -d -p 8529:8529 --name=test-comps-arango -e ARANGO_ROOT_PASSWORD=$ARANGO_PASSWORD arangodb/arangodb:latest
     docker build --no-cache -t opea/feedbackmanagement-arango-server:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/feedback_management/arango/Dockerfile .
    if [ $? 
-ne 0 ]; then diff --git a/tests/prompt_registry/test_prompt_registry_arango.sh b/tests/prompt_registry/test_prompt_registry_arango.sh index 16d81b17d5..2338322a0b 100644 --- a/tests/prompt_registry/test_prompt_registry_arango.sh +++ b/tests/prompt_registry/test_prompt_registry_arango.sh @@ -16,7 +16,7 @@ export ARANGO_COLLECTION_NAME=${ARANGO_COLLECTION_NAME:-"test"} function build_docker_images() { cd $WORKPATH echo $(pwd) - docker run -d -p 8529:8529 --name=test-comps-arango arangodb/arangodb:latest + docker run -d -p 8529:8529 --name=test-comps-arango -e ARANGO_ROOT_PASSWORD=$ARANGO_PASSWORD arangodb/arangodb:latest docker build --no-cache -t opea/promptregistry-arango-server:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/prompt_registry/arango/Dockerfile . if [ $? -ne 0 ]; then diff --git a/tests/retrievers/test_retrievers_arango_langchain.sh b/tests/retrievers/test_retrievers_arango_langchain.sh new file mode 100644 index 0000000000..1b0ec6251d --- /dev/null +++ b/tests/retrievers/test_retrievers_arango_langchain.sh @@ -0,0 +1,126 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -x + +WORKPATH=$(dirname "$PWD") +LOG_PATH="$WORKPATH/tests" +ip_address=$(hostname -I | awk '{print $1}') + +export ARANGO_URL=${ARANGO_URL:-"http://${ip_address}:8529"} +export ARANGO_USERNAME=${ARANGO_USERNAME:-"root"} +export ARANGO_PASSWORD=${ARANGO_PASSWORD:-"test"} +export ARANGO_DB_NAME=${ARANGO_DB_NAME:-"_system"} +export ARANGO_COLLECTION_NAME=${ARANGO_COLLECTION_NAME:-"test"} +export ARANGO_EMBEDDING_DIMENSION=${ARANGO_EMBEDDING_DIMENSION:-5} + +function build_docker_images() { + cd $WORKPATH + echo $(pwd) + docker run -d -p 8529:8529 --name=test-comps-arango -e ARANGO_ROOT_PASSWORD=$ARANGO_PASSWORD arangodb/arangodb:latest + sleep 1m + + # Create ARANGO_COLLECTION_NAME + curl -X POST --header 'accept: application/json' \ + --header 'Content-Type: application/json' \ + --data 
'{"name": "'${ARANGO_COLLECTION_NAME}'", "type": 2, "waitForSync": true}' \ + "${ARANGO_URL}/_db/${ARANGO_DB_NAME}/_api/collection" \ + -u ${ARANGO_USERNAME}:${ARANGO_PASSWORD} + + # Insert data into arango: {text: "test", embedding: [0.1, 0.2, 0.3, 0.4, 0.5]} + curl -X POST --header 'accept: application/json' \ + --header 'Content-Type: application/json' \ + --data '{"text": "test", "embedding": [0.1, 0.2, 0.3, 0.4, 0.5]}' \ + "${ARANGO_URL}/_db/${ARANGO_DB_NAME}/_api/document/${ARANGO_COLLECTION_NAME}" \ + -u ${ARANGO_USERNAME}:${ARANGO_PASSWORD} + + docker build --no-cache -t opea/retriever-arango:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/retrievers/arango/langchain/Dockerfile . + if [ $? -ne 0 ]; then + echo "opea/retriever-arango built fail" + exit 1 + else + echo "opea/retriever-arango built successful" + fi +} + +function start_service() { + # tei endpoint + tei_endpoint=5434 + model="BAAI/bge-base-en-v1.5" + docker run -d --name="test-comps-retriever-arango-tei-endpoint" -p $tei_endpoint:80 -v ./data:/data --pull always ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 --model-id $model + sleep 30s + export TEI_EMBEDDING_ENDPOINT="http://${ip_address}:${tei_endpoint}" + + # unset http_proxy + export no_proxy="localhost,127.0.0.1,"${ip_address} + docker run -d --name="test-comps-retriever-arango-server" \ + -p 7000:7000 \ + --ipc=host \ + -e http_proxy=$http_proxy \ + -e https_proxy=$https_proxy \ + -e ARANGO_URL=$ARANGO_URL \ + -e ARANGO_USERNAME=$ARANGO_USERNAME \ + -e ARANGO_PASSWORD=$ARANGO_PASSWORD \ + -e ARANGO_DB_NAME=$ARANGO_DB_NAME \ + -e ARANGO_COLLECTION_NAME=$ARANGO_COLLECTION_NAME \ + -e ARANGO_EMBEDDING_DIMENSION=$ARANGO_EMBEDDING_DIMENSION \ + -e TEI_EMBEDDING_ENDPOINT=$TEI_EMBEDDING_ENDPOINT \ + opea/retriever-arango:comps + + sleep 1m +} + +function validate_microservice() { + export PATH="${HOME}/miniforge3/bin:$PATH" + source activate + URL="http://${ip_address}:7000/v1/retrieval" + + 
test_embedding="[0.1, 0.2, 0.3, 0.4, 0.5]" + HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "{\"text\":\"test\",\"embedding\":${test_embedding}}" -H 'Content-Type: application/json' "$URL") + if [ "$HTTP_STATUS" -eq 200 ]; then + echo "[ retriever ] HTTP status is 200. Checking content..." + local CONTENT=$(curl -s -X POST -d "{\"text\":\"test\",\"embedding\":${test_embedding}}" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/retriever.log) + + if echo "$CONTENT" | grep -q "retrieved_docs"; then + echo "[ retriever ] Content is as expected." + else + echo "[ retriever ] Content does not match the expected result: $CONTENT" + docker logs test-comps-retriever-arango-server >> ${LOG_PATH}/retriever.log + docker logs test-comps-retriever-arango-tei-endpoint >> ${LOG_PATH}/tei.log + exit 1 + fi + else + echo "[ retriever ] HTTP status is not 200. Received status was $HTTP_STATUS" + docker logs test-comps-retriever-arango-server >> ${LOG_PATH}/retriever.log + docker logs test-comps-retriever-arango-tei-endpoint >> ${LOG_PATH}/tei.log + exit 1 + fi +} + +function stop_docker() { + cid_retrievers=$(docker ps -aq --filter "name=test-comps-retriever-arango*") + if [[ ! -z "$cid_retrievers" ]]; then + docker stop $cid_retrievers && docker rm $cid_retrievers && sleep 1s + fi + cid_db=$(docker ps -aq --filter "name=test-comps-arango-apoc1") + if [[ ! 
-z "$cid_db" ]]; then
+        docker stop $cid_db && docker rm $cid_db && sleep 1s
+    fi
+}
+
+function main() {
+
+    stop_docker
+
+    build_docker_images
+    start_service
+
+    validate_microservice
+
+    stop_docker
+    echo y | docker system prune
+
+}
+
+main

From 9a7d8104c9104fa1cf40e7e7ec90417da8570d83 Mon Sep 17 00:00:00 2001
From: Anthony Mahanna
Date: Sun, 5 Jan 2025 17:27:17 -0500
Subject: [PATCH 20/35] revert: feedback management mongo change

---
 .../workflows/docker/compose/feedback_management-compose.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/docker/compose/feedback_management-compose.yaml b/.github/workflows/docker/compose/feedback_management-compose.yaml
index 51f5ae343a..c22e6e4960 100644
--- a/.github/workflows/docker/compose/feedback_management-compose.yaml
+++ b/.github/workflows/docker/compose/feedback_management-compose.yaml
@@ -3,10 +3,10 @@
 # this file should be run in the root of the repo
 
 services:
-  feedbackmanagement-mongo-server:
+  feedbackmanagement:
     build:
       dockerfile: comps/feedback_management/mongo/Dockerfile
-    image: ${REGISTRY:-opea}/feedbackmanagement-mongo-server:${TAG:-latest}
+    image: ${REGISTRY:-opea}/feedbackmanagement:${TAG:-latest}
   feedbackmanagement-arango-server:
     build:
       dockerfile: comps/feedback_management/arango/Dockerfile

From f0659ade62921ad2a0e184fc65e43034e1346daa Mon Sep 17 00:00:00 2001
From: Anthony Mahanna
Date: Mon, 6 Jan 2025 17:28:45 -0500
Subject: [PATCH 21/35] add logs

---
 comps/dataprep/arango/langchain/prepare_doc_arango.py | 4 +++-
 comps/retrievers/arango/langchain/retriever_arango.py | 2 ++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/comps/dataprep/arango/langchain/prepare_doc_arango.py b/comps/dataprep/arango/langchain/prepare_doc_arango.py
index a89984f082..babc29131b 100644
--- a/comps/dataprep/arango/langchain/prepare_doc_arango.py
+++ b/comps/dataprep/arango/langchain/prepare_doc_arango.py
@@ -302,7 +302,7 @@ async def 
ingest_documents( except (TypeError, ValueError) as e: if logflag: logger.error(f"Failed to initialize LLMGraphTransformer: {e}") - raise + raise e ######################################## # Text Embeddings Inference (optional) # @@ -340,5 +340,7 @@ async def ingest_documents( sys_db.create_database(ARANGO_DB_NAME) db = client.db(name=ARANGO_DB_NAME, username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) + if logflag: + logger.info(f"Connected to ArangoDB {db.version()}.") opea_microservices["opea_service@prepare_doc_arango"].start() diff --git a/comps/retrievers/arango/langchain/retriever_arango.py b/comps/retrievers/arango/langchain/retriever_arango.py index 2a7b37412a..fb9a0e095a 100644 --- a/comps/retrievers/arango/langchain/retriever_arango.py +++ b/comps/retrievers/arango/langchain/retriever_arango.py @@ -245,5 +245,7 @@ async def retrieve( sys_db.create_database(ARANGO_DB_NAME) db = client.db(name=ARANGO_DB_NAME, username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) + if logflag: + logger.info(f"Connected to ArangoDB {db.version()}.") opea_microservices["opea_service@retriever_arango"].start() From aa2601322e2b16162bde961b66ac0ad36beccd39 Mon Sep 17 00:00:00 2001 From: Anthony Mahanna <43019056+aMahanna@users.noreply.github.com> Date: Wed, 8 Jan 2025 18:06:24 -0500 Subject: [PATCH 22/35] dataprep improvements (#16) * dataprep improvements * fix: readme * new: make embedding generation mandatory * fix: exception handling * add logs * new: `ARANGO_USE_GRAPH_NAME` --- comps/dataprep/arango/langchain/README.md | 3 +- comps/dataprep/arango/langchain/config.py | 9 +- .../docker-compose-dataprep-arango.yaml | 17 +++ .../arango/langchain/prepare_doc_arango.py | 110 +++++++++++------- 4 files changed, 90 insertions(+), 49 deletions(-) diff --git a/comps/dataprep/arango/langchain/README.md b/comps/dataprep/arango/langchain/README.md index fb383b42e5..5c6985e67d 100644 --- a/comps/dataprep/arango/langchain/README.md +++ 
b/comps/dataprep/arango/langchain/README.md @@ -116,9 +116,10 @@ ArangoDB Connection configuration - `ARANGO_DB_NAME`: The name of the database to use for the ArangoDB service. ArangoDB Graph Insertion configuration -- `USE_ONE_ENTITY_COLLECTION`: If set to True, the microservice will use a single entity collection for all nodes. If set to False, the microservice will use a separate collection by node type. Defaults to `True`. - `INSERT_ASYNC`: If set to True, the microservice will insert the data into ArangoDB asynchronously. Defaults to `False`. - `ARANGO_BATCH_SIZE`: The batch size for the microservice to insert the data. Defaults to `500`. +- `ARANGO_GRAPH_NAME`: The name of the graph to use/create in ArangoDB Defaults to `GRAPH`. +- `ARANGO_USE_GRAPH_NAME`: If set to True, the microservice will use the graph name specified in the environment variable `ARANGO_GRAPH_NAME`. If set to False, the file name will be used as the graph name. Defaults to `True`. Text Generation Inference Configuration - `TGI_LLM_ENDPOINT`: The endpoint for the TGI service. 
diff --git a/comps/dataprep/arango/langchain/config.py b/comps/dataprep/arango/langchain/config.py
index c3caf2da25..98bfd498bb 100644
--- a/comps/dataprep/arango/langchain/config.py
+++ b/comps/dataprep/arango/langchain/config.py
@@ -10,9 +10,10 @@
 ARANGO_DB_NAME = os.getenv("ARANGO_DB_NAME", "_system")
 
 # ArangoDB Graph Insertion configuration
-USE_ONE_ENTITY_COLLECTION = os.getenv("USE_ONE_ENTITY_COLLECTION", True)
 INSERT_ASYNC = os.getenv("INSERT_ASYNC", False)
 ARANGO_BATCH_SIZE = os.getenv("ARANGO_BATCH_SIZE", 500)
+ARANGO_GRAPH_NAME = os.getenv("ARANGO_GRAPH_NAME", "GRAPH")
+ARANGO_USE_GRAPH_NAME = os.getenv("ARANGO_USE_GRAPH_NAME", True)
 
 # Text Generation Inference configuration
 TGI_LLM_ENDPOINT = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080")
@@ -29,10 +30,12 @@
 
 # OpenAI configuration (alternative to TGI & TEI)
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
-OPENAI_EMBED_MODEL = os.getenv("OPENAI_EMBED_MODEL", "text-embedding-3-small")
-OPENAI_EMBED_DIMENSIONS = os.getenv("OPENAI_EMBED_DIMENSIONS", 512)
 OPENAI_CHAT_MODEL = os.getenv("OPENAI_CHAT_MODEL", "gpt-4o")
 OPENAI_CHAT_TEMPERATURE = os.getenv("OPENAI_CHAT_TEMPERATURE", 0)
+OPENAI_EMBED_MODEL = os.getenv("OPENAI_EMBED_MODEL", "text-embedding-3-small")
+OPENAI_EMBED_DIMENSIONS = os.getenv("OPENAI_EMBED_DIMENSIONS", 512)
+OPENAI_CHAT_ENABLED = os.getenv("OPENAI_CHAT_ENABLED", True)
+OPENAI_EMBED_ENABLED = os.getenv("OPENAI_EMBED_ENABLED", True)
 
 # LLMGraphTransformer configuration
 SYSTEM_PROMPT_PATH = os.getenv("SYSTEM_PROMPT_PATH")
diff --git a/comps/dataprep/arango/langchain/docker-compose-dataprep-arango.yaml b/comps/dataprep/arango/langchain/docker-compose-dataprep-arango.yaml
index d3a9882c63..207e3d730a 100644
--- a/comps/dataprep/arango/langchain/docker-compose-dataprep-arango.yaml
+++ b/comps/dataprep/arango/langchain/docker-compose-dataprep-arango.yaml
@@ -10,6 +10,22 @@ services:
       - "8529:8529"
     environment:
       ARANGO_ROOT_PASSWORD: ${ARANGO_PASSWORD}
+  tei-embedding-service:
+    image: 
ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 + container_name: tei-embedding-server + ports: + - "6006:80" + volumes: + - "./data:/data" + shm_size: 1g + environment: + no_proxy: ${no_proxy} + NO_PROXY: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + ipc: host + command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate tgi_gaudi_service: image: ghcr.io/huggingface/tgi-gaudi:2.0.5 container_name: tgi-service @@ -30,6 +46,7 @@ services: depends_on: - arango-vector-db - tgi_gaudi_service + - tei-embedding-service ports: - "6007:6007" ipc: host diff --git a/comps/dataprep/arango/langchain/prepare_doc_arango.py b/comps/dataprep/arango/langchain/prepare_doc_arango.py index babc29131b..6a90c2d043 100644 --- a/comps/dataprep/arango/langchain/prepare_doc_arango.py +++ b/comps/dataprep/arango/langchain/prepare_doc_arango.py @@ -12,6 +12,7 @@ ALLOWED_RELATIONSHIPS, ARANGO_BATCH_SIZE, ARANGO_DB_NAME, + ARANGO_GRAPH_NAME, ARANGO_PASSWORD, ARANGO_URL, ARANGO_USERNAME, @@ -33,7 +34,9 @@ TGI_LLM_TIMEOUT, TGI_LLM_TOP_K, TGI_LLM_TOP_P, - USE_ONE_ENTITY_COLLECTION, + OPENAI_CHAT_ENABLED, + OPENAI_EMBED_ENABLED, + ARANGO_USE_GRAPH_NAME, ) from fastapi import File, Form, HTTPException, UploadFile from langchain.text_splitter import RecursiveCharacterTextSplitter @@ -86,7 +89,7 @@ logger.error(f"Could not set custom Prompt: {e}") -def ingest_data_to_arango(doc_path: DocPath, graph_name: str, generate_chunk_embeddings: bool) -> bool: +def ingest_data_to_arango(doc_path: DocPath) -> str: """Ingest document to ArangoDB.""" path = doc_path.path @@ -128,7 +131,7 @@ def ingest_data_to_arango(doc_path: DocPath, graph_name: str, generate_chunk_emb chunks = chunks + table_chunks if logflag: - logger.info(f"Done preprocessing. 
Created {len(chunks)} chunks of the original file.") + logger.info(f"Created {len(chunks)} chunks of the original file.") ################################ # Graph generation & insertion # @@ -140,29 +143,40 @@ def ingest_data_to_arango(doc_path: DocPath, graph_name: str, generate_chunk_emb generate_schema_on_init=False, ) - for text in chunks: + if ARANGO_USE_GRAPH_NAME: + graph_name = ARANGO_GRAPH_NAME + else: + file_name = os.path.basename(path).split(".")[0] + graph_name = "".join(c for c in file_name if c.isalnum() or c in "_-:.@()+,=;$!*'%") + + if logflag: + logger.info(f"Creating graph {graph_name}.") + + for i, text in enumerate(chunks): document = Document(page_content=text) graph_doc = llm_transformer.process_response(document) - if generate_chunk_embeddings: - source = graph_doc.source - source.metadata["embedding"] = embeddings.embed_documents([source.page_content])[0] + source = graph_doc.source + source.metadata["embedding"] = embeddings.embed_documents([source.page_content])[0] graph.add_graph_documents( graph_documents=[graph_doc], include_source=True, graph_name=graph_name, - update_graph_definition_if_exists=not USE_ONE_ENTITY_COLLECTION, + update_graph_definition_if_exists=False, batch_size=ARANGO_BATCH_SIZE, - use_one_entity_collection=USE_ONE_ENTITY_COLLECTION, + use_one_entity_collection=True, insert_async=INSERT_ASYNC, source_metadata_fields_to_extract_to_top_level={"embedding"}, ) + if logflag: + logger.info(f"Chunk {i} processed into graph.") + if logflag: logger.info("The graph is built.") - return True + return graph_name @register_microservice( @@ -180,13 +194,16 @@ async def ingest_documents( chunk_overlap: int = Form(100), process_table: bool = Form(False), table_strategy: str = Form("fast"), - graph_name: str = Form("Graph"), - create_embeddings: bool = Form(True), ): if logflag: logger.info(f"files:{files}") logger.info(f"link_list:{link_list}") + if not files and not link_list: + raise HTTPException(status_code=400, detail="Must 
provide either a file or a string list.") + + graph_names_created = set() + if files: if not isinstance(files, list): files = [files] @@ -195,24 +212,24 @@ async def ingest_documents( encode_file = encode_filename(file.filename) save_path = upload_folder + encode_file await save_content_to_local_disk(save_path, file) - ingest_data_to_arango( - DocPath( - path=save_path, - chunk_size=chunk_size, - chunk_overlap=chunk_overlap, - process_table=process_table, - table_strategy=table_strategy, - ), - graph_name=graph_name, - generate_chunk_embeddings=create_embeddings and embeddings is not None, - ) - uploaded_files.append(save_path) + try: + graph_name = ingest_data_to_arango( + DocPath( + path=save_path, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + process_table=process_table, + table_strategy=table_strategy, + ), + ) + + uploaded_files.append(save_path) + graph_names_created.add(graph_name) + except Exception as e: + raise HTTPException(status_code=500, detail=f"Failed to ingest {save_path} into ArangoDB: {e}") + if logflag: logger.info(f"Successfully saved file {save_path}") - result = {"status": 200, "message": "Data preparation succeeded"} - if logflag: - logger.info(result) - return result if link_list: link_list = json.loads(link_list) # Parse JSON string to list @@ -222,9 +239,9 @@ async def ingest_documents( encoded_link = encode_filename(link) save_path = upload_folder + encoded_link + ".txt" content = parse_html([link])[0][0] + await save_content_to_local_disk(save_path, content) try: - await save_content_to_local_disk(save_path, content) - ingest_data_to_arango( + graph_name = ingest_data_to_arango( DocPath( path=save_path, chunk_size=chunk_size, @@ -232,21 +249,26 @@ async def ingest_documents( process_table=process_table, table_strategy=table_strategy, ), - graph_name=graph_name, - generate_chunk_embeddings=create_embeddings and embeddings is not None, ) - except json.JSONDecodeError: - raise HTTPException(status_code=500, detail="Fail to 
ingest data into qdrant.") + graph_names_created.add(graph_name) + except Exception as e: + raise HTTPException(status_code=500, detail=f"Failed to ingest {save_path} into ArangoDB: {e}") if logflag: logger.info(f"Successfully saved link {link}") - result = {"status": 200, "message": "Data preparation succeeded"} - if logflag: - logger.info(result) - return result + graph_names_created = list(graph_names_created) + + result = { + "status": 200, + "message": f"Data preparation succeeded: {graph_names_created}", + "graph_names": graph_names_created, + } - raise HTTPException(status_code=400, detail="Must provide either a file or a string list.") + if logflag: + logger.info(result) + + return result if __name__ == "__main__": @@ -255,7 +277,7 @@ async def ingest_documents( # Text Generation Inference # ############################# - if OPENAI_API_KEY: + if OPENAI_API_KEY and OPENAI_CHAT_ENABLED: if logflag: logger.info("OpenAI API Key is set. Verifying its validity...") openai.api_key = OPENAI_API_KEY @@ -282,7 +304,7 @@ async def ingest_documents( timeout=TGI_LLM_TIMEOUT, ) else: - raise ValueError("No text generation inference endpoint is set.") + raise ValueError("No text generation environment variables are set, cannot generate graphs.") try: llm_transformer = LLMGraphTransformer( @@ -308,7 +330,7 @@ async def ingest_documents( # Text Embeddings Inference (optional) # ######################################## - if OPENAI_API_KEY: + if OPENAI_API_KEY and OPENAI_EMBED_ENABLED: # Use OpenAI embeddings embeddings = OpenAIEmbeddings( model=OPENAI_EMBED_MODEL, @@ -325,9 +347,7 @@ async def ingest_documents( # Use local embedding model embeddings = HuggingFaceBgeEmbeddings(model_name=TEI_EMBED_MODEL) else: - if logflag: - logger.warning("No embeddings environment variables are set, cannot generate embeddings.") - embeddings = None + raise ValueError("No embeddings environment variables are set, cannot generate embeddings.") ############ # ArangoDB # From 
29026501c128d04dca88b78834409af98f07aa1c Mon Sep 17 00:00:00 2001 From: Anthony Mahanna <43019056+aMahanna@users.noreply.github.com> Date: Fri, 10 Jan 2025 07:59:05 -0500 Subject: [PATCH 23/35] retriever improvements (#17) * retriever improvements * new: `collection_count` * new: `empty_result` object * remove: `raise` no longer required * set `LOGFLAG` to `True` * Removing config variable ARANGO_EMBED_DIMENSION, getting embed dimension automatically from the db * minor cleanup * whitespace * log cleanup --------- Co-authored-by: Ajay Kallepalli --- comps/retrievers/arango/langchain/README.md | 5 +- comps/retrievers/arango/langchain/config.py | 5 +- .../arango/langchain/retriever_arango.py | 280 ++++++++++++------ 3 files changed, 191 insertions(+), 99 deletions(-) diff --git a/comps/retrievers/arango/langchain/README.md b/comps/retrievers/arango/langchain/README.md index 66b6e74bc5..e766b2847d 100644 --- a/comps/retrievers/arango/langchain/README.md +++ b/comps/retrievers/arango/langchain/README.md @@ -28,7 +28,6 @@ export ARANGO_URL=${your_arango_url} export ARANGO_USERNAME=${your_arango_username} export ARANGO_PASSWORD=${your_arango_password} export ARANGO_DB_NAME=${your_db_name} -export ARANGO_COLLECTION_NAME=${your_collection_name} export ARANGO_EMBEDDING_DIMENSION=${your_embedding_dimension} export PYTHONPATH=${path_to_comps} ``` @@ -121,7 +120,7 @@ ArangoDB Connection configuration - `ARANGO_DB_NAME`: The name of the database to use for the ArangoDB service. ArangoDB Collection configuration -- `ARANGO_COLLECTION_NAME`: The name of the collection containing the documents. +- `ARANGO_GRAPH_NAME`: The name of the graph that contains the document collection. It is always assumed that the graph contains a document collection, which is named as `"{ARANGO_GRAPH_NAME}_SOURCE"`. This collection contains the documents to be retrieved, which have been inserted by the Arango DataPrep service. Defaults to `GRAPH`. 
Additionally, it is possible to specify the Graph Name via the `text` parameter of the `/v1/retrieval` endpoint, using the `text: "query | graph_name"` syntax. For example, `text: "What is the revenue of Nike in 2023? | MySportsGraph"`. In this case, the graph name used will be `MySportsGraph`, which means the document collection will be `MySportsGraph_SOURCE`. This is useful if you have multiple graphs in the database. - `ARANGO_DISTANCE_STRATEGY`: The distance strategy to use for the embeddings. Options are `COSINE` and `L2` (euclidean distance). - `ARANGO_USE_APPROX_SEARCH`: Whether to use approximate neighbor search. If False, exact search will be used (slower, but more accurate). If True, approximate search will be used (faster, but less accurate). Defaults to `True`. - `ARANGO_TEXT_FIELD`: The document field name storing the text. @@ -130,7 +129,7 @@ ArangoDB Collection configuration - `ARANGO_NUM_CENTROIDS`: The number of centroids to use for the approximate search. Defaults to `1`, which is essentially exhaustive search. ArangoDB Traversal configuration -- `ARANGO_TRAVERSAL_GRAPH_NAME`: If specified, the retriever will traverse the graph to retrieve the neighborhood of the retrieved documents. +- `ARANGO_TRAVERSAL_ENABLED`: If set to True, the retriever will traverse the graph to retrieve the neighborhood of the retrieved documents, using the specified `ARANGO_GRAPH_NAME` as a reference. Defaults to `False`. - `ARANGO_TRAVERSAL_MAX_DEPTH`: The maximum depth to traverse the graph. Defaults to `1`. 
Embedding Configuration diff --git a/comps/retrievers/arango/langchain/config.py b/comps/retrievers/arango/langchain/config.py index d1df90cfd3..bfabe94194 100644 --- a/comps/retrievers/arango/langchain/config.py +++ b/comps/retrievers/arango/langchain/config.py @@ -10,16 +10,15 @@ ARANGO_DB_NAME = os.getenv("ARANGO_DB_NAME", "_system") # ArangoDB Vector configuration -ARANGO_COLLECTION_NAME = os.getenv("ARANGO_COLLECTION_NAME", "Graph_SOURCE") +ARANGO_GRAPH_NAME = os.getenv("ARANGO_GRAPH_NAME", "GRAPH") ARANGO_DISTANCE_STRATEGY = os.getenv("ARANGO_DISTANCE_STRATEGY", "COSINE") ARANGO_USE_APPROX_SEARCH = os.getenv("ARANGO_USE_APPROX_SEARCH", True) ARANGO_TEXT_FIELD = os.getenv("ARANGO_TEXT_FIELD", "text") ARANGO_EMBEDDING_FIELD = os.getenv("ARANGO_EMBEDDING_FIELD", "embedding") -ARANGO_EMBEDDING_DIMENSION = os.getenv("ARANGO_EMBEDDING_DIMENSION") ARANGO_NUM_CENTROIDS = os.getenv("ARANGO_NUM_CENTROIDS", 1) # ArangoDB Traversal configuration -ARANGO_TRAVERSAL_GRAPH_NAME = os.getenv("ARANGO_TRAVERSAL_GRAPH_NAME") +ARANGO_TRAVERSAL_ENABLED = os.getenv("ARANGO_TRAVERSAL_ENABLED", False) ARANGO_TRAVERSAL_MAX_DEPTH = os.getenv("ARANGO_TRAVERSAL_MAX_DEPTH", 1) # Embedding configuration diff --git a/comps/retrievers/arango/langchain/retriever_arango.py b/comps/retrievers/arango/langchain/retriever_arango.py index fb9a0e095a..b7a6f49fcb 100644 --- a/comps/retrievers/arango/langchain/retriever_arango.py +++ b/comps/retrievers/arango/langchain/retriever_arango.py @@ -7,24 +7,23 @@ from arango import ArangoClient from config import ( - ARANGO_COLLECTION_NAME, ARANGO_DB_NAME, ARANGO_DISTANCE_STRATEGY, - ARANGO_EMBEDDING_DIMENSION, ARANGO_EMBEDDING_FIELD, + ARANGO_GRAPH_NAME, ARANGO_NUM_CENTROIDS, ARANGO_PASSWORD, ARANGO_TEXT_FIELD, - ARANGO_TRAVERSAL_GRAPH_NAME, + ARANGO_TRAVERSAL_ENABLED, ARANGO_TRAVERSAL_MAX_DEPTH, ARANGO_URL, ARANGO_USE_APPROX_SEARCH, ARANGO_USERNAME, - TEI_EMBEDDING_ENDPOINT, - TEI_EMBED_MODEL, HUGGINGFACEHUB_API_TOKEN, OPENAI_API_KEY, OPENAI_EMBED_MODEL, + 
TEI_EMBED_MODEL, + TEI_EMBEDDING_ENDPOINT, ) from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceHubEmbeddings from langchain_community.vectorstores.arangodb_vector import ArangoVector @@ -48,17 +47,16 @@ RetrievalResponseData, ) +# TODO: Revisit these classes. How would they be presented in ChatQnA? +# class ArangoTextDoc(TextDoc): +# neighborhood: Optional[list[dict[str, Any]]] = None -class ArangoTextDoc(TextDoc): - neighborhood: Optional[list[dict[str, Any]]] = None - - -class ArangoRetrievalResponseData(RetrievalResponseData): - neighborhood: Optional[list[dict[str, Any]]] = None +# class ArangoRetrievalResponseData(RetrievalResponseData): +# neighborhood: Optional[list[dict[str, Any]]] = None logger = CustomLogger("retriever_arango") -logflag = os.getenv("LOGFLAG", False) +logflag = os.getenv("LOGFLAG", True) def fetch_neighborhoods( @@ -70,20 +68,6 @@ def fetch_neighborhoods( max_depth: int, ) -> None: """Fetch neighborhoods of source documents. Updates the neighborhoods dictionary in-place.""" - if not vector_db.db.has_graph(graph_name): - logger.error("Graph not found in database.") - return - - graph = vector_db.db.graph(graph_name) - - if not graph.has_edge_collection(f"{graph_name}_HAS_SOURCE"): - logger.error(f"Edge collection '{graph_name}_HAS_SOURCE' not found in graph.") - return - - if not graph.has_edge_collection(f"{graph_name}_LINKS_TO"): - logger.error(f"Edge collection '{graph_name}_LINKS_TO' not found in graph.") - return - if max_depth < 1: max_depth = 1 @@ -111,6 +95,9 @@ def fetch_neighborhoods( for doc in cursor: neighborhoods.update(doc) + if logflag: + logger.info(f"Fetched neighborhoods for {len(neighborhoods)} documents.") + @register_microservice( name="opea_service@retriever_arango", @@ -126,94 +113,214 @@ async def retrieve( if logflag: logger.info(input) + if isinstance(input, EmbedDoc): + empty_result = SearchedDoc(retrieved_docs=[], initial_query=input.text) + elif isinstance(input, RetrievalRequest): 
+ empty_result = RetrievalResponse(retrieved_docs=[]) + elif isinstance(input, ChatCompletionRequest): + input.retrieved_docs = [] + input.documents = [] + empty_result = input + else: + raise ValueError("Invalid input type: ", type(input)) + start = time.time() query = input.text if isinstance(input, EmbedDoc) else input.input embedding = input.embedding if isinstance(input.embedding, list) else None + ######################## + # Fetch the Graph Name # + ######################## + + # This is a workaround as the ChatQnA UI is limited to + # a single input field, so we need to parse the graph name from the query (for now). + + graph_name = None + query_split = query.split("|") + + if len(query_split) == 2: + # e.g "Who is connected to John Smith? | PersonGraph" + query = query_split[0].strip() + graph_name = query_split[1].strip() + + if not graph_name: + graph_name = ARANGO_GRAPH_NAME + + source_collection_name = f"{graph_name}_SOURCE" + + if not db.has_graph(graph_name): + if logflag: + graph_names = [g["name"] for g in db.graphs()] + logger.error(f"Graph '{graph_name}' does not exist in ArangoDB. Graphs: {graph_names}") + + return empty_result + + if not db.graph(graph_name).has_vertex_collection(source_collection_name): + if logflag: + collection_names = db.graph(graph_name).vertex_collections() + m = f"Collection '{source_collection_name}' does not exist in graph '{graph_name}'. Collections: {collection_names}" + logger.error(m) + + return empty_result + + collection = db.collection(source_collection_name) + + collection_count = collection.count() + if collection_count == 0: + if logflag: + logger.error(f"Collection '{source_collection_name}' is empty.") + + return empty_result + + if collection_count < ARANGO_NUM_CENTROIDS: + if logflag: + m = f"Collection '{source_collection_name}' has fewer documents ({collection_count}) than the number of centroids ({ARANGO_NUM_CENTROIDS})." 
+ logger.error(m) + + return empty_result + + ################################ + # Retrieve Embedding Dimension # + ################################ + + random_doc = collection.random() + random_doc_id = random_doc["_id"] + + embedding = random_doc.get(ARANGO_EMBEDDING_FIELD) + + if not embedding: + if logflag: + logger.error(f"Document '{random_doc_id}' is missing field '{ARANGO_EMBEDDING_FIELD}'.") + + return empty_result + + if not isinstance(embedding, list): + if logflag: + logger.error(f"Document '{random_doc_id}' has a non-list embedding field, found {type(embedding)}.") + + return empty_result + + dimension = len(embedding) + + if dimension == 0: + if logflag: + logger.error(f"Document '{random_doc_id}' has an empty embedding field.") + + return empty_result + + if OPENAI_API_KEY and OPENAI_EMBED_MODEL: + # Use OpenAI embeddings + embeddings = OpenAIEmbeddings(model=OPENAI_EMBED_MODEL, dimensions=dimension) + elif TEI_EMBEDDING_ENDPOINT and HUGGINGFACEHUB_API_TOKEN: + # create embeddings using TEI endpoint service + embeddings = HuggingFaceHubEmbeddings( + model=TEI_EMBEDDING_ENDPOINT, huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN + ) + else: + # create embeddings using local embedding model + embeddings = HuggingFaceBgeEmbeddings(model_name=TEI_EMBED_MODEL) + + ###################### + # Compute Similarity # + ###################### + vector_db = ArangoVector( embedding=embeddings, - embedding_dimension=ARANGO_EMBEDDING_DIMENSION, + embedding_dimension=dimension, database=db, - collection_name=ARANGO_COLLECTION_NAME, + collection_name=source_collection_name, embedding_field=ARANGO_EMBEDDING_FIELD, text_field=ARANGO_TEXT_FIELD, distance_strategy=ARANGO_DISTANCE_STRATEGY, num_centroids=ARANGO_NUM_CENTROIDS, ) - if input.search_type == "similarity_score_threshold": - docs_and_similarities = await vector_db.asimilarity_search_with_relevance_scores( - query=query, - embedding=embedding, - k=input.k, - score_threshold=input.score_threshold, - 
use_approx=ARANGO_USE_APPROX_SEARCH, - ) - search_res = [doc for doc, _ in docs_and_similarities] - elif input.search_type == "mmr": - search_res = await vector_db.amax_marginal_relevance_search( - query=query, - embedding=embedding, - k=input.k, - fetch_k=input.fetch_k, - lambda_mult=input.lambda_mult, - use_approx=ARANGO_USE_APPROX_SEARCH, - ) - else: - # Default to basic similarity search - search_res = await vector_db.asimilarity_search( - query=query, - embedding=embedding, - k=input.k, - use_approx=ARANGO_USE_APPROX_SEARCH, - ) + try: + if input.search_type == "similarity_score_threshold": + docs_and_similarities = await vector_db.asimilarity_search_with_relevance_scores( + query=query, + embedding=embedding, + k=input.k, + score_threshold=input.score_threshold, + use_approx=ARANGO_USE_APPROX_SEARCH, + ) + search_res = [doc for doc, _ in docs_and_similarities] + elif input.search_type == "mmr": + search_res = await vector_db.amax_marginal_relevance_search( + query=query, + embedding=embedding, + k=input.k, + fetch_k=input.fetch_k, + lambda_mult=input.lambda_mult, + use_approx=ARANGO_USE_APPROX_SEARCH, + ) + else: + # Default to basic similarity search + search_res = await vector_db.asimilarity_search( + query=query, + embedding=embedding, + k=input.k, + use_approx=ARANGO_USE_APPROX_SEARCH, + ) + except Exception as e: + if logflag: + logger.error(f"Error during similarity search: {e}") + + return empty_result + + if not search_res: + if logflag: + logger.info("No documents found.") + + return empty_result + + ######################################## + # Traverse Source Documents (optional) # + ######################################## neighborhoods = {} - if ARANGO_TRAVERSAL_GRAPH_NAME: + if ARANGO_TRAVERSAL_ENABLED: fetch_neighborhoods( vector_db, neighborhoods, [r.id for r in search_res], - ARANGO_TRAVERSAL_GRAPH_NAME, - ARANGO_COLLECTION_NAME, + graph_name, ARANGO_TRAVERSAL_MAX_DEPTH, ) - # return different response format - retrieved_docs: 
Union[list[ArangoTextDoc], list[ArangoRetrievalResponseData]] = [] - if isinstance(input, EmbedDoc): - for r in search_res: - retrieved_docs.append( - ArangoTextDoc( - text=r.page_content, - id=r.id, - neighborhood=neighborhoods.get(r.id), - ) - ) + #################### + # Process Response # + #################### + + search_res_tuples = [] + for r in search_res: + page_content = r.page_content + neighborhood = neighborhoods.get(r.id) + text = page_content + if neighborhood: + text += f"\n--------\nDocument Neighborhood:\n{neighborhood}" + + search_res_tuples.append((r.id, text, r.metadata)) + + retrieved_docs: Union[list[TextDoc], list[RetrievalResponseData]] = [] + if isinstance(input, EmbedDoc): + retrieved_docs = [TextDoc(id=id, text=text) for id, text, _ in search_res_tuples] result = SearchedDoc(retrieved_docs=retrieved_docs, initial_query=input.text) else: - for r in search_res: - retrieved_docs.append( - ArangoRetrievalResponseData( - text=r.page_content, - id=r.id, - metadata=r.metadata, - neighborhood=neighborhoods.get(r.id), - ) - ) + retrieved_docs = [ + RetrievalResponseData(id=id, text=text, metadata=metadata) for id, text, metadata in search_res_tuples + ] if isinstance(input, RetrievalRequest): result = RetrievalResponse(retrieved_docs=retrieved_docs) - elif isinstance(input, ChatCompletionRequest): + else: input.retrieved_docs = retrieved_docs input.documents = [doc.text for doc in retrieved_docs] result = input - else: - raise ValueError("Invalid input type: ", type(input)) statistics_dict["opea_service@retriever_arango"].append_latency(time.time() - start, None) @@ -225,19 +332,6 @@ async def retrieve( if __name__ == "__main__": - if not ARANGO_EMBEDDING_DIMENSION: - raise ValueError("EMBED_DIMENSION must specified in advance.") - - if OPENAI_API_KEY and OPENAI_EMBED_MODEL: - # Use OpenAI embeddings - embeddings = OpenAIEmbeddings(model=OPENAI_EMBED_MODEL, dimensions=ARANGO_EMBEDDING_DIMENSION) - elif TEI_EMBEDDING_ENDPOINT and 
HUGGINGFACEHUB_API_TOKEN: - # create embeddings using TEI endpoint service - embeddings = HuggingFaceHubEmbeddings(model=TEI_EMBEDDING_ENDPOINT, huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN) - else: - # create embeddings using local embedding model - embeddings = HuggingFaceBgeEmbeddings(model_name=TEI_EMBED_MODEL) - client = ArangoClient(hosts=ARANGO_URL) sys_db = client.db(name="_system", username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) From c72a8b20d0518c4fc59f61c9d3cf843a19f465cc Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Fri, 10 Jan 2025 08:28:41 -0500 Subject: [PATCH 24/35] fix: dataprep envs --- comps/dataprep/arango/langchain/config.py | 4 ++-- comps/dataprep/arango/langchain/prepare_doc_arango.py | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/comps/dataprep/arango/langchain/config.py b/comps/dataprep/arango/langchain/config.py index 98bfd498bb..21878a0b58 100644 --- a/comps/dataprep/arango/langchain/config.py +++ b/comps/dataprep/arango/langchain/config.py @@ -10,7 +10,7 @@ ARANGO_DB_NAME = os.getenv("ARANGO_DB_NAME", "_system") # ArangoDB Graph Insertion configuration -INSERT_ASYNC = os.getenv("INSERT_ASYNC", False) +ARANGO_INSERT_ASYNC = os.getenv("ARANGO_INSERT_ASYNC", False) ARANGO_BATCH_SIZE = os.getenv("ARANGO_BATCH_SIZE", 500) ARANGO_GRAPH_NAME = os.getenv("ARANGO_GRAPH_NAME", "GRAPH") ARANGO_USE_GRAPH_NAME = os.getenv("ARANGO_USE_GRAPH_NAME", True) @@ -33,7 +33,7 @@ OPENAI_CHAT_MODEL = os.getenv("OPENAI_CHAT_MODEL", "gpt-4o") OPENAI_CHAT_TEMPERATURE = os.getenv("OPENAI_CHAT_TEMPERATURE", 0) OPENAI_EMBED_MODEL = os.getenv("OPENAI_EMBED_MODEL", "text-embedding-3-small") -OPENAI_EMBED_DIMENSIONS = os.getenv("OPENAI_EMBED_DIMENSIONS", 512) +OPENAI_EMBED_DIMENSION = os.getenv("OPENAI_EMBED_DIMENSION", 512) OPENAI_CHAT_ENABLED = os.getenv("OPENAI_TEI_ENABLED", True) OPENAI_EMBED_ENABLED = os.getenv("OPENAI_TGI_ENABLED", True) diff --git a/comps/dataprep/arango/langchain/prepare_doc_arango.py 
b/comps/dataprep/arango/langchain/prepare_doc_arango.py index 6a90c2d043..7ad051e2f4 100644 --- a/comps/dataprep/arango/langchain/prepare_doc_arango.py +++ b/comps/dataprep/arango/langchain/prepare_doc_arango.py @@ -17,12 +17,12 @@ ARANGO_URL, ARANGO_USERNAME, HUGGINGFACEHUB_API_TOKEN, - INSERT_ASYNC, + ARANGO_INSERT_ASYNC, NODE_PROPERTIES, OPENAI_API_KEY, OPENAI_CHAT_MODEL, OPENAI_CHAT_TEMPERATURE, - OPENAI_EMBED_DIMENSIONS, + OPENAI_EMBED_DIMENSION, OPENAI_EMBED_MODEL, RELATIONSHIP_PROPERTIES, SYSTEM_PROMPT_PATH, @@ -166,7 +166,7 @@ def ingest_data_to_arango(doc_path: DocPath) -> str: update_graph_definition_if_exists=False, batch_size=ARANGO_BATCH_SIZE, use_one_entity_collection=True, - insert_async=INSERT_ASYNC, + insert_async=ARANGO_INSERT_ASYNC, source_metadata_fields_to_extract_to_top_level={"embedding"}, ) @@ -334,7 +334,7 @@ async def ingest_documents( # Use OpenAI embeddings embeddings = OpenAIEmbeddings( model=OPENAI_EMBED_MODEL, - dimensions=OPENAI_EMBED_DIMENSIONS, + dimensions=OPENAI_EMBED_DIMENSION, ) elif TEI_EMBEDDING_ENDPOINT and HUGGINGFACEHUB_API_TOKEN: From 31261a844ff1a47c4a10a607a0f1c0b29ec51de3 Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Fri, 10 Jan 2025 08:42:32 -0500 Subject: [PATCH 25/35] fix: bool envs --- comps/dataprep/arango/langchain/config.py | 10 +++++----- comps/retrievers/arango/langchain/config.py | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/comps/dataprep/arango/langchain/config.py b/comps/dataprep/arango/langchain/config.py index 21878a0b58..84b8ce52f4 100644 --- a/comps/dataprep/arango/langchain/config.py +++ b/comps/dataprep/arango/langchain/config.py @@ -10,10 +10,10 @@ ARANGO_DB_NAME = os.getenv("ARANGO_DB_NAME", "_system") # ArangoDB Graph Insertion configuration -ARANGO_INSERT_ASYNC = os.getenv("ARANGO_INSERT_ASYNC", False) +ARANGO_INSERT_ASYNC = os.getenv("ARANGO_INSERT_ASYNC", "false").lower() == "true" ARANGO_BATCH_SIZE = os.getenv("ARANGO_BATCH_SIZE", 500) ARANGO_GRAPH_NAME = 
os.getenv("ARANGO_GRAPH_NAME", "GRAPH") -ARANGO_USE_GRAPH_NAME = os.getenv("ARANGO_USE_GRAPH_NAME", True) +ARANGO_USE_GRAPH_NAME = os.getenv("ARANGO_USE_GRAPH_NAME", "false").lower() == "true" # Text Generation Inference configuration TGI_LLM_ENDPOINT = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080") @@ -33,9 +33,9 @@ OPENAI_CHAT_MODEL = os.getenv("OPENAI_CHAT_MODEL", "gpt-4o") OPENAI_CHAT_TEMPERATURE = os.getenv("OPENAI_CHAT_TEMPERATURE", 0) OPENAI_EMBED_MODEL = os.getenv("OPENAI_EMBED_MODEL", "text-embedding-3-small") -OPENAI_EMBED_DIMENSION = os.getenv("OPENAI_EMBED_DIMENSION", 512) -OPENAI_CHAT_ENABLED = os.getenv("OPENAI_TEI_ENABLED", True) -OPENAI_EMBED_ENABLED = os.getenv("OPENAI_TGI_ENABLED", True) +OPENAI_EMBED_DIMENSION = os.getenv("OPENAI_EMBED_DIMENSION", 768) +OPENAI_CHAT_ENABLED = os.getenv("OPENAI_CHAT_ENABLED", "true").lower() == "true" +OPENAI_EMBED_ENABLED = os.getenv("OPENAI_EMBED_ENABLED", "true").lower() == "true" # LLMGraphTransformer configuration SYSTEM_PROMPT_PATH = os.getenv("SYSTEM_PROMPT_PATH") diff --git a/comps/retrievers/arango/langchain/config.py b/comps/retrievers/arango/langchain/config.py index bfabe94194..26f30dbf34 100644 --- a/comps/retrievers/arango/langchain/config.py +++ b/comps/retrievers/arango/langchain/config.py @@ -12,13 +12,13 @@ # ArangoDB Vector configuration ARANGO_GRAPH_NAME = os.getenv("ARANGO_GRAPH_NAME", "GRAPH") ARANGO_DISTANCE_STRATEGY = os.getenv("ARANGO_DISTANCE_STRATEGY", "COSINE") -ARANGO_USE_APPROX_SEARCH = os.getenv("ARANGO_USE_APPROX_SEARCH", True) +ARANGO_USE_APPROX_SEARCH = os.getenv("ARANGO_USE_APPROX_SEARCH", "true").lower() == "true" ARANGO_TEXT_FIELD = os.getenv("ARANGO_TEXT_FIELD", "text") ARANGO_EMBEDDING_FIELD = os.getenv("ARANGO_EMBEDDING_FIELD", "embedding") ARANGO_NUM_CENTROIDS = os.getenv("ARANGO_NUM_CENTROIDS", 1) # ArangoDB Traversal configuration -ARANGO_TRAVERSAL_ENABLED = os.getenv("ARANGO_TRAVERSAL_ENABLED", False) +ARANGO_TRAVERSAL_ENABLED = 
os.getenv("ARANGO_TRAVERSAL_ENABLED", "false").lower() == "true" ARANGO_TRAVERSAL_MAX_DEPTH = os.getenv("ARANGO_TRAVERSAL_MAX_DEPTH", 1) # Embedding configuration From 27b14e6724964bd118700f56a5630a7976b974fa Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Fri, 10 Jan 2025 09:07:20 -0500 Subject: [PATCH 26/35] fix: retriever fetch_neighborhoods --- comps/retrievers/arango/langchain/retriever_arango.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/comps/retrievers/arango/langchain/retriever_arango.py b/comps/retrievers/arango/langchain/retriever_arango.py index b7a6f49fcb..ea781b09cb 100644 --- a/comps/retrievers/arango/langchain/retriever_arango.py +++ b/comps/retrievers/arango/langchain/retriever_arango.py @@ -282,11 +282,12 @@ async def retrieve( neighborhoods = {} if ARANGO_TRAVERSAL_ENABLED: fetch_neighborhoods( - vector_db, - neighborhoods, - [r.id for r in search_res], - graph_name, - ARANGO_TRAVERSAL_MAX_DEPTH, + vector_db=vector_db, + keys=[r.id for r in search_res], + neighborhoods=neighborhoods, + graph_name=graph_name, + source_collection_name=source_collection_name, + max_depth=ARANGO_TRAVERSAL_MAX_DEPTH, ) #################### From a0d53719981912a23570d4bd78ecc4045721073f Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Fri, 10 Jan 2025 12:10:25 -0500 Subject: [PATCH 27/35] fix: env var int typing --- comps/dataprep/arango/langchain/config.py | 16 ++++++++-------- comps/retrievers/arango/langchain/config.py | 4 ++-- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/comps/dataprep/arango/langchain/config.py b/comps/dataprep/arango/langchain/config.py index 84b8ce52f4..fc9494e8cd 100644 --- a/comps/dataprep/arango/langchain/config.py +++ b/comps/dataprep/arango/langchain/config.py @@ -11,17 +11,17 @@ # ArangoDB Graph Insertion configuration ARANGO_INSERT_ASYNC = os.getenv("ARANGO_INSERT_ASYNC", "false").lower() == "true" -ARANGO_BATCH_SIZE = os.getenv("ARANGO_BATCH_SIZE", 500) +ARANGO_BATCH_SIZE = 
int(os.getenv("ARANGO_BATCH_SIZE", 500)) ARANGO_GRAPH_NAME = os.getenv("ARANGO_GRAPH_NAME", "GRAPH") ARANGO_USE_GRAPH_NAME = os.getenv("ARANGO_USE_GRAPH_NAME", "false").lower() == "true" # Text Generation Inference configuration TGI_LLM_ENDPOINT = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080") -TGI_LLM_MAX_NEW_TOKENS = os.getenv("TGI_LLM_MAX_NEW_TOKENS", 512) -TGI_LLM_TOP_K = os.getenv("TGI_LLM_TOP_K", 40) -TGI_LLM_TOP_P = os.getenv("TGI_LLM_TOP_P", 0.9) -TGI_LLM_TEMPERATURE = os.getenv("TGI_LLM_TEMPERATURE", 0.8) -TGI_LLM_TIMEOUT = os.getenv("TGI_LLM_TIMEOUT", 600) +TGI_LLM_MAX_NEW_TOKENS = int(os.getenv("TGI_LLM_MAX_NEW_TOKENS", 512)) +TGI_LLM_TOP_K = int(os.getenv("TGI_LLM_TOP_K", 40)) +TGI_LLM_TOP_P = int(os.getenv("TGI_LLM_TOP_P", 0.9)) +TGI_LLM_TEMPERATURE = int(os.getenv("TGI_LLM_TEMPERATURE", 0.8)) +TGI_LLM_TIMEOUT = int(os.getenv("TGI_LLM_TIMEOUT", 600)) # Text Embeddings Inference configuration TEI_EMBEDDING_ENDPOINT = os.getenv("TEI_EMBEDDING_ENDPOINT") @@ -31,9 +31,9 @@ # OpenAI configuration (alternative to TGI & TEI) OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") OPENAI_CHAT_MODEL = os.getenv("OPENAI_CHAT_MODEL", "gpt-4o") -OPENAI_CHAT_TEMPERATURE = os.getenv("OPENAI_CHAT_TEMPERATURE", 0) +OPENAI_CHAT_TEMPERATURE = int(os.getenv("OPENAI_CHAT_TEMPERATURE", 0)) OPENAI_EMBED_MODEL = os.getenv("OPENAI_EMBED_MODEL", "text-embedding-3-small") -OPENAI_EMBED_DIMENSION = os.getenv("OPENAI_EMBED_DIMENSION", 768) +OPENAI_EMBED_DIMENSION = int(os.getenv("OPENAI_EMBED_DIMENSION", 768)) OPENAI_CHAT_ENABLED = os.getenv("OPENAI_CHAT_ENABLED", "true").lower() == "true" OPENAI_EMBED_ENABLED = os.getenv("OPENAI_EMBED_ENABLED", "true").lower() == "true" diff --git a/comps/retrievers/arango/langchain/config.py b/comps/retrievers/arango/langchain/config.py index 26f30dbf34..097131163c 100644 --- a/comps/retrievers/arango/langchain/config.py +++ b/comps/retrievers/arango/langchain/config.py @@ -15,11 +15,11 @@ ARANGO_USE_APPROX_SEARCH = 
os.getenv("ARANGO_USE_APPROX_SEARCH", "true").lower() == "true" ARANGO_TEXT_FIELD = os.getenv("ARANGO_TEXT_FIELD", "text") ARANGO_EMBEDDING_FIELD = os.getenv("ARANGO_EMBEDDING_FIELD", "embedding") -ARANGO_NUM_CENTROIDS = os.getenv("ARANGO_NUM_CENTROIDS", 1) +ARANGO_NUM_CENTROIDS = int(os.getenv("ARANGO_NUM_CENTROIDS", 1)) # ArangoDB Traversal configuration ARANGO_TRAVERSAL_ENABLED = os.getenv("ARANGO_TRAVERSAL_ENABLED", "false").lower() == "true" -ARANGO_TRAVERSAL_MAX_DEPTH = os.getenv("ARANGO_TRAVERSAL_MAX_DEPTH", 1) +ARANGO_TRAVERSAL_MAX_DEPTH = int(os.getenv("ARANGO_TRAVERSAL_MAX_DEPTH", 1)) # Embedding configuration TEI_EMBED_MODEL = os.getenv("TEI_EMBED_MODEL", "BAAI/bge-base-en-v1.5") From f2fe0f3376a045f65bb20a4d008029711b832170 Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Fri, 10 Jan 2025 12:10:36 -0500 Subject: [PATCH 28/35] new: `fetch_neighborhoods` AQL query --- .../arango/langchain/retriever_arango.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/comps/retrievers/arango/langchain/retriever_arango.py b/comps/retrievers/arango/langchain/retriever_arango.py index ea781b09cb..c45fbd2013 100644 --- a/comps/retrievers/arango/langchain/retriever_arango.py +++ b/comps/retrievers/arango/langchain/retriever_arango.py @@ -79,11 +79,24 @@ def fetch_neighborhoods( LET entity_neighborhood = ( FOR v1, e1, p1 IN 1..1 INBOUND doc {graph_name}_HAS_SOURCE FOR v2, e2, p2 IN 1..{max_depth} ANY v1 {graph_name}_LINKS_TO - RETURN p2 + + LET isForward = (e2._to == v2._id) + LET A = CONCAT(p2.vertices[-2].name, " (", p2.vertices[-2].type, ")") + LET B = CONCAT(v2.name, " (", v2.type, ")") + + LET source = isForward ? A : B + LET destination = !isForward ? 
A : B + + COLLECT s = source, d = destination + AGGREGATE relations = UNIQUE(e2.type) + + FOR r IN relations + RETURN CONCAT(s , ' ', r, ' ', d) ) RETURN {{[doc._key]: entity_neighborhood}} """ + # FOR s2 IN 1..1 OUTBOUND v2 {graph_name}_HAS_SOURCE (after aggregate) bind_vars = { "@collection": source_collection_name, From dc9b4e011f3ff82fbc530bdc8e5b51273365e8f0 Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Mon, 13 Jan 2025 11:28:58 -0500 Subject: [PATCH 29/35] update: AQL traversal query --- .../arango/langchain/retriever_arango.py | 55 ++++++++++++------- 1 file changed, 36 insertions(+), 19 deletions(-) diff --git a/comps/retrievers/arango/langchain/retriever_arango.py b/comps/retrievers/arango/langchain/retriever_arango.py index c45fbd2013..678780c433 100644 --- a/comps/retrievers/arango/langchain/retriever_arango.py +++ b/comps/retrievers/arango/langchain/retriever_arango.py @@ -71,32 +71,45 @@ def fetch_neighborhoods( if max_depth < 1: max_depth = 1 - # TODO: Consider using general `GRAPH` syntax instead of specific edge collections... - aql = f""" - FOR doc IN @@collection - FILTER doc._key IN @keys + # aql = f""" + # FOR doc IN @@collection + # FILTER doc._key IN @keys - LET entity_neighborhood = ( - FOR v1, e1, p1 IN 1..1 INBOUND doc {graph_name}_HAS_SOURCE - FOR v2, e2, p2 IN 1..{max_depth} ANY v1 {graph_name}_LINKS_TO + # LET entity_neighborhood = ( + # FOR v1, e1, p1 IN 1..1 INBOUND doc {graph_name}_HAS_SOURCE + # FOR v2, e2, p2 IN 1..{max_depth} ANY v1 {graph_name}_LINKS_TO + # LET isForward = (e2._to == v2._id) + # LET A = CONCAT(p2.vertices[-2].name, " (", p2.vertices[-2].type, ")") + # LET B = CONCAT(v2.name, " (", v2.type, ")") + + # LET source = isForward ? A : B + # LET destination = !isForward ? 
A : B + + # COLLECT s = source, d = destination + # AGGREGATE relations = UNIQUE(e2.type) - LET isForward = (e2._to == v2._id) - LET A = CONCAT(p2.vertices[-2].name, " (", p2.vertices[-2].type, ")") - LET B = CONCAT(v2.name, " (", v2.type, ")") + # FOR r IN relations + # RETURN CONCAT(s , ' ', r, ' ', d) + # ) - LET source = isForward ? A : B - LET destination = !isForward ? A : B + # RETURN {{[doc._key]: entity_neighborhood}} + # """ - COLLECT s = source, d = destination - AGGREGATE relations = UNIQUE(e2.type) + aql = f""" + FOR doc IN @@collection + FILTER doc._key IN @keys - FOR r IN relations - RETURN CONCAT(s , ' ', r, ' ', d) + LET source_neighborhood = ( + FOR v1, e1, p1 IN 1..1 INBOUND doc {graph_name}_HAS_SOURCE + FOR v2, e2, p2 IN 1..{max_depth} ANY v1 {graph_name}_LINKS_TO OPTIONS {{uniqueEdges: "path"}} + FOR v3, e3, p3 IN 1..1 OUTBOUND v2 {graph_name}_HAS_SOURCE + FILTER v3._key != doc._key + COLLECT text = v3.text + RETURN text ) - RETURN {{[doc._key]: entity_neighborhood}} + RETURN {{[doc._key]: source_neighborhood}} """ - # FOR s2 IN 1..1 OUTBOUND v2 {graph_name}_HAS_SOURCE (after aggregate) bind_vars = { "@collection": source_collection_name, @@ -314,7 +327,11 @@ async def retrieve( text = page_content if neighborhood: - text += f"\n--------\nDocument Neighborhood:\n{neighborhood}" + text += "\n------\nRELATED INFORMATION:\n------\n" + text += neighborhood + + if logflag: + logger.info(f"Document: {r.id}, Text: {text}") search_res_tuples.append((r.id, text, r.metadata)) From d34263a2dfa6c055bbbdb5fa66d263b143806a9c Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Mon, 13 Jan 2025 15:55:33 -0500 Subject: [PATCH 30/35] cleanup queries --- .../arango/langchain/retriever_arango.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/comps/retrievers/arango/langchain/retriever_arango.py b/comps/retrievers/arango/langchain/retriever_arango.py index 678780c433..a1df095552 100644 --- 
a/comps/retrievers/arango/langchain/retriever_arango.py +++ b/comps/retrievers/arango/langchain/retriever_arango.py @@ -100,12 +100,12 @@ def fetch_neighborhoods( FILTER doc._key IN @keys LET source_neighborhood = ( - FOR v1, e1, p1 IN 1..1 INBOUND doc {graph_name}_HAS_SOURCE - FOR v2, e2, p2 IN 1..{max_depth} ANY v1 {graph_name}_LINKS_TO OPTIONS {{uniqueEdges: "path"}} - FOR v3, e3, p3 IN 1..1 OUTBOUND v2 {graph_name}_HAS_SOURCE + FOR v1 IN 1..1 INBOUND doc {graph_name}_HAS_SOURCE + FOR v2 IN 1..{max_depth} ANY v1 {graph_name}_LINKS_TO OPTIONS {{uniqueEdges: "path"}} + FOR v3 IN 1..1 OUTBOUND v2 {graph_name}_HAS_SOURCE FILTER v3._key != doc._key - COLLECT text = v3.text - RETURN text + COLLECT id = v3._key, text = v3.text + RETURN {{[id]: text}} ) RETURN {{[doc._key]: source_neighborhood}} @@ -327,8 +327,8 @@ async def retrieve( text = page_content if neighborhood: - text += "\n------\nRELATED INFORMATION:\n------\n" - text += neighborhood + text += "\n------\nRELATED CHUNKS:\n------\n" + text += f"{neighborhood}\n" if logflag: logger.info(f"Document: {r.id}, Text: {text}") From 01c8f9f5dde7c9060587e3c668cb303c4fe5be12 Mon Sep 17 00:00:00 2001 From: Ajay Kallepalli <72517322+ajaykallepalli@users.noreply.github.com> Date: Mon, 13 Jan 2025 15:36:30 -0800 Subject: [PATCH 31/35] Added environment variables to specify table strategy, chunk size, chunk overlap, and process table. 
CURL command will supercede environment variables (#18) --- comps/dataprep/arango/langchain/config.py | 6 ++++++ .../dataprep/arango/langchain/prepare_doc_arango.py | 12 ++++++++---- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/comps/dataprep/arango/langchain/config.py b/comps/dataprep/arango/langchain/config.py index fc9494e8cd..c8d42a5817 100644 --- a/comps/dataprep/arango/langchain/config.py +++ b/comps/dataprep/arango/langchain/config.py @@ -43,3 +43,9 @@ ALLOWED_RELATIONSHIPS = os.getenv("ALLOWED_RELATIONSHIPS", []) NODE_PROPERTIES = os.getenv("NODE_PROPERTIES", ["description"]) RELATIONSHIP_PROPERTIES = os.getenv("RELATIONSHIP_PROPERTIES", ["description"]) + +# Parsing configuration +PROCESS_TABLE = os.getenv("PROCESS_TABLE", "false").lower() == "true" +TABLE_STRATEGY = os.getenv("TABLE_STRATEGY", "fast") +CHUNK_SIZE = int(os.getenv("CHUNK_SIZE", 1500)) +CHUNK_OVERLAP = int(os.getenv("CHUNK_OVERLAP", 100)) diff --git a/comps/dataprep/arango/langchain/prepare_doc_arango.py b/comps/dataprep/arango/langchain/prepare_doc_arango.py index 7ad051e2f4..10d2ab9c55 100644 --- a/comps/dataprep/arango/langchain/prepare_doc_arango.py +++ b/comps/dataprep/arango/langchain/prepare_doc_arango.py @@ -37,6 +37,10 @@ OPENAI_CHAT_ENABLED, OPENAI_EMBED_ENABLED, ARANGO_USE_GRAPH_NAME, + PROCESS_TABLE, + TABLE_STRATEGY, + CHUNK_SIZE, + CHUNK_OVERLAP, ) from fastapi import File, Form, HTTPException, UploadFile from langchain.text_splitter import RecursiveCharacterTextSplitter @@ -190,10 +194,10 @@ def ingest_data_to_arango(doc_path: DocPath) -> str: async def ingest_documents( files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), link_list: Optional[str] = Form(None), - chunk_size: int = Form(1500), - chunk_overlap: int = Form(100), - process_table: bool = Form(False), - table_strategy: str = Form("fast"), + chunk_size: int = Form(CHUNK_SIZE), + chunk_overlap: int = Form(CHUNK_OVERLAP), + process_table: bool = Form(PROCESS_TABLE), + table_strategy: str 
= Form(TABLE_STRATEGY), ): if logflag: logger.info(f"files:{files}") From 1d8693d7a9dcbe164dd1d1b1e2702071ab624a13 Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Tue, 14 Jan 2025 14:15:51 -0500 Subject: [PATCH 32/35] cleanup retriever --- .../arango/langchain/retriever_arango.py | 34 +------------------ 1 file changed, 1 insertion(+), 33 deletions(-) diff --git a/comps/retrievers/arango/langchain/retriever_arango.py b/comps/retrievers/arango/langchain/retriever_arango.py index a1df095552..a619d8dc34 100644 --- a/comps/retrievers/arango/langchain/retriever_arango.py +++ b/comps/retrievers/arango/langchain/retriever_arango.py @@ -3,7 +3,7 @@ import os import time -from typing import Any, Optional, Union +from typing import Any, Union from arango import ArangoClient from config import ( @@ -47,14 +47,6 @@ RetrievalResponseData, ) -# TODO: Revisit these classes. How would they be presented in ChatQnA? -# class ArangoTextDoc(TextDoc): -# neighborhood: Optional[list[dict[str, Any]]] = None - -# class ArangoRetrievalResponseData(RetrievalResponseData): -# neighborhood: Optional[list[dict[str, Any]]] = None - - logger = CustomLogger("retriever_arango") logflag = os.getenv("LOGFLAG", True) @@ -71,30 +63,6 @@ def fetch_neighborhoods( if max_depth < 1: max_depth = 1 - # aql = f""" - # FOR doc IN @@collection - # FILTER doc._key IN @keys - - # LET entity_neighborhood = ( - # FOR v1, e1, p1 IN 1..1 INBOUND doc {graph_name}_HAS_SOURCE - # FOR v2, e2, p2 IN 1..{max_depth} ANY v1 {graph_name}_LINKS_TO - # LET isForward = (e2._to == v2._id) - # LET A = CONCAT(p2.vertices[-2].name, " (", p2.vertices[-2].type, ")") - # LET B = CONCAT(v2.name, " (", v2.type, ")") - - # LET source = isForward ? A : B - # LET destination = !isForward ? 
A : B - - # COLLECT s = source, d = destination - - # AGGREGATE relations = UNIQUE(e2.type) - - # FOR r IN relations - # RETURN CONCAT(s , ' ', r, ' ', d) - # ) - - # RETURN {{[doc._key]: entity_neighborhood}} - # """ - aql = f""" FOR doc IN @@collection FILTER doc._key IN @keys From a4b83d75ec23061e92e815fb2b5348390fdac734 Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Tue, 14 Jan 2025 19:12:15 -0500 Subject: [PATCH 33/35] update: `ARANGO_TRAVERSAL_MAX_DEPTH` --- comps/retrievers/arango/langchain/README.md | 2 +- comps/retrievers/arango/langchain/config.py | 2 +- .../arango/langchain/retriever_arango.py | 17 +++++++++++------ 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/comps/retrievers/arango/langchain/README.md b/comps/retrievers/arango/langchain/README.md index e766b2847d..6f3c6c8c6b 100644 --- a/comps/retrievers/arango/langchain/README.md +++ b/comps/retrievers/arango/langchain/README.md @@ -130,7 +130,7 @@ ArangoDB Collection configuration ArangoDB Traversal configuration - `ARANGO_TRAVERSAL_ENABLED`: If set to True, the retriever will traverse the graph to retrieve the neighborhood of the retrieved documents, using the specified `ARANGO_GRAPH_NAME` as a reference. Defaults to `False`. -- `ARANGO_TRAVERSAL_MAX_DEPTH`: The maximum depth to traverse the graph. Defaults to `1`. +- `ARANGO_TRAVERSAL_MAX_DEPTH`: The maximum depth to traverse the graph with regards to entities linked to entities directly associated with source documents. If `0`, traversal is applied as `SOURCE --> ENTITY <--- SOURCE` to find sources that are linked to the same entity. If `1`, traversal is applied as `SOURCE --> ENTITY <--> ENTITY <--- SOURCE` to find sources that are linked to the same entity's linked entity. If `2`, ... and so on. Defaults to `0`. Embedding Configuration - `TEI_EMBED_MODEL`: The model to use for the TEI service. Defaults to `BAAI/bge-base-en-v1.5`.
diff --git a/comps/retrievers/arango/langchain/config.py b/comps/retrievers/arango/langchain/config.py index 097131163c..f3c63df6f4 100644 --- a/comps/retrievers/arango/langchain/config.py +++ b/comps/retrievers/arango/langchain/config.py @@ -19,7 +19,7 @@ # ArangoDB Traversal configuration ARANGO_TRAVERSAL_ENABLED = os.getenv("ARANGO_TRAVERSAL_ENABLED", "false").lower() == "true" -ARANGO_TRAVERSAL_MAX_DEPTH = int(os.getenv("ARANGO_TRAVERSAL_MAX_DEPTH", 1)) +ARANGO_TRAVERSAL_MAX_DEPTH = int(os.getenv("ARANGO_TRAVERSAL_MAX_DEPTH", 0)) # Embedding configuration TEI_EMBED_MODEL = os.getenv("TEI_EMBED_MODEL", "BAAI/bge-base-en-v1.5") diff --git a/comps/retrievers/arango/langchain/retriever_arango.py b/comps/retrievers/arango/langchain/retriever_arango.py index a619d8dc34..7b8743978f 100644 --- a/comps/retrievers/arango/langchain/retriever_arango.py +++ b/comps/retrievers/arango/langchain/retriever_arango.py @@ -60,8 +60,13 @@ def fetch_neighborhoods( max_depth: int, ) -> None: """Fetch neighborhoods of source documents. 
Updates the neighborhoods dictionary in-place.""" - if max_depth < 1: - max_depth = 1 + + if max_depth <= 0: + start_vertex = "v1" + links_to_query = "" + else: + start_vertex = "v2" + links_to_query = f"FOR v2 IN 1..{max_depth} ANY v1 {graph_name}_LINKS_TO OPTIONS {{uniqueEdges: 'path'}}" aql = f""" FOR doc IN @@collection @@ -69,10 +74,10 @@ def fetch_neighborhoods( LET source_neighborhood = ( FOR v1 IN 1..1 INBOUND doc {graph_name}_HAS_SOURCE - FOR v2 IN 1..{max_depth} ANY v1 {graph_name}_LINKS_TO OPTIONS {{uniqueEdges: "path"}} - FOR v3 IN 1..1 OUTBOUND v2 {graph_name}_HAS_SOURCE - FILTER v3._key != doc._key - COLLECT id = v3._key, text = v3.text + {links_to_query} + FOR s IN 1..1 OUTBOUND {start_vertex} {graph_name}_HAS_SOURCE + FILTER s._key != doc._key + COLLECT id = s._key, text = s.{ARANGO_TEXT_FIELD} RETURN {{[id]: text}} ) From 900f3a27f5351d33ea149aa6f32f4db13b44cac0 Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Wed, 15 Jan 2025 13:36:30 -0500 Subject: [PATCH 34/35] update `ARANGO_USE_GRAPH_NAME` --- comps/dataprep/arango/langchain/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comps/dataprep/arango/langchain/config.py b/comps/dataprep/arango/langchain/config.py index c8d42a5817..4a692ded1a 100644 --- a/comps/dataprep/arango/langchain/config.py +++ b/comps/dataprep/arango/langchain/config.py @@ -13,7 +13,7 @@ ARANGO_INSERT_ASYNC = os.getenv("ARANGO_INSERT_ASYNC", "false").lower() == "true" ARANGO_BATCH_SIZE = int(os.getenv("ARANGO_BATCH_SIZE", 500)) ARANGO_GRAPH_NAME = os.getenv("ARANGO_GRAPH_NAME", "GRAPH") -ARANGO_USE_GRAPH_NAME = os.getenv("ARANGO_USE_GRAPH_NAME", "false").lower() == "true" +ARANGO_USE_GRAPH_NAME = os.getenv("ARANGO_USE_GRAPH_NAME", "true").lower() == "true" # Text Generation Inference configuration TGI_LLM_ENDPOINT = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080") From 6702d7be296be13a6835c906193ea3b0cec63043 Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Fri, 17 Jan 2025 14:08:14 
-0500 Subject: [PATCH 35/35] fix: related chunks --- comps/retrievers/arango/langchain/retriever_arango.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/comps/retrievers/arango/langchain/retriever_arango.py b/comps/retrievers/arango/langchain/retriever_arango.py index 7b8743978f..da46c56f51 100644 --- a/comps/retrievers/arango/langchain/retriever_arango.py +++ b/comps/retrievers/arango/langchain/retriever_arango.py @@ -300,8 +300,8 @@ async def retrieve( text = page_content if neighborhood: - text += "\n------\nRELATED CHUNKS:\n------\n" - text += f"{neighborhood}\n" + text += "\n------\nRELATED CHUNKS FOUND [{ID: TEXT}]:\n------\n" + text += str(neighborhood) if logflag: logger.info(f"Document: {r.id}, Text: {text}")