From 086a13e0104c10fa1e10d1217c8b2bddc1b191e3 Mon Sep 17 00:00:00 2001 From: Amna Mubashar Date: Sun, 8 Dec 2024 23:19:42 +0100 Subject: [PATCH] Updated tutorial 32 --- ...le_Type_Preprocessing_Index_Pipeline.ipynb | 148 ++++++++-- ...ng_Documents_and_Queries_by_Language.ipynb | 264 ++++++------------ 2 files changed, 211 insertions(+), 201 deletions(-) diff --git a/tutorials/30_File_Type_Preprocessing_Index_Pipeline.ipynb b/tutorials/30_File_Type_Preprocessing_Index_Pipeline.ipynb index 45fb879..8fc936b 100644 --- a/tutorials/30_File_Type_Preprocessing_Index_Pipeline.ipynb +++ b/tutorials/30_File_Type_Preprocessing_Index_Pipeline.ipynb @@ -91,11 +91,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": { "id": "CkvJIU7FmDf9" }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/amna.mubashar/Library/Python/3.9/lib/python/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], "source": [ "from haystack.telemetry import tutorial_running\n", "\n", @@ -117,9 +126,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "['recipe_files/vegan_flan_recipe.md',\n", + " 'recipe_files/vegan_keto_eggplant_recipe_fixed.pdf',\n", + " 'recipe_files/vegan_sunflower_hemp_cheese_recipe.txt']" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "import gdown\n", "\n", @@ -180,7 +202,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": { "id": "hCWlpiQCBYOg" }, @@ -201,7 +223,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": { "id": "TVXSX0GHBtdj" }, @@ -222,7 +244,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": { "id": "4yGXKHEXIZxi" }, @@ -251,7 +273,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -260,7 +282,39 @@ "id": "gafXWtNYfNbr", "outputId": "10f351de-ac09-4273-85a2-ac7b59fb2f77" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "\n", + "πŸš… Components\n", + " - file_type_router: FileTypeRouter\n", + " - text_file_converter: TextFileToDocument\n", + " - markdown_converter: MarkdownToDocument\n", + " - pypdf_converter: PyPDFToDocument\n", + " - document_joiner: DocumentJoiner\n", + " - document_cleaner: DocumentCleaner\n", + " - document_splitter: DocumentSplitter\n", + " - document_embedder: SentenceTransformersDocumentEmbedder\n", + " - document_writer: DocumentWriter\n", + "πŸ›€οΈ Connections\n", + " - file_type_router.text/plain -> text_file_converter.sources (List[Union[str, Path, 
ByteStream]])\n", + " - file_type_router.application/pdf -> pypdf_converter.sources (List[Union[str, Path, ByteStream]])\n", + " - file_type_router.text/markdown -> markdown_converter.sources (List[Union[str, Path, ByteStream]])\n", + " - text_file_converter.documents -> document_joiner.documents (List[Document])\n", + " - markdown_converter.documents -> document_joiner.documents (List[Document])\n", + " - pypdf_converter.documents -> document_joiner.documents (List[Document])\n", + " - document_joiner.documents -> document_cleaner.documents (List[Document])\n", + " - document_cleaner.documents -> document_splitter.documents (List[Document])\n", + " - document_splitter.documents -> document_embedder.documents (List[Document])\n", + " - document_embedder.documents -> document_writer.documents (List[Document])" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "preprocessing_pipeline.connect(\"file_type_router.text/plain\", \"text_file_converter.sources\")\n", "preprocessing_pipeline.connect(\"file_type_router.application/pdf\", \"pypdf_converter.sources\")\n", @@ -325,7 +379,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -357,7 +411,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -366,14 +420,35 @@ "id": "_s--8xEWq8Y9", "outputId": "1c050d5f-f2ae-4cd3-e0d4-533397a6af63" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "\n", + "πŸš… Components\n", + " - embedder: SentenceTransformersTextEmbedder\n", + " - retriever: InMemoryEmbeddingRetriever\n", + " - chat_prompt_builder: ChatPromptBuilder\n", + " - llm: HuggingFaceAPIChatGenerator\n", + "πŸ›€οΈ Connections\n", + " - embedder.embedding -> retriever.query_embedding (List[float])\n", + " - retriever.documents -> 
chat_prompt_builder.documents (List[Document])\n", + " - chat_prompt_builder.prompt -> llm.messages (List[ChatMessage])" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from haystack.components.embedders import SentenceTransformersTextEmbedder\n", "from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever\n", - "from haystack.components.builders import PromptBuilder\n", - "from haystack.components.generators import HuggingFaceAPIGenerator\n", + "from haystack.components.builders import ChatPromptBuilder\n", + "from haystack.dataclasses import ChatMessage\n", + "from haystack.components.generators.chat import HuggingFaceAPIChatGenerator\n", "\n", - "template = \"\"\"\n", + "template = [ChatMessage.from_user(\"\"\"\n", "Answer the questions based on the given context.\n", "\n", "Context:\n", @@ -383,19 +458,19 @@ "\n", "Question: {{ question }}\n", "Answer:\n", - "\"\"\"\n", + "\"\"\")]\n", "pipe = Pipeline()\n", "pipe.add_component(\"embedder\", SentenceTransformersTextEmbedder(model=\"sentence-transformers/all-MiniLM-L6-v2\"))\n", "pipe.add_component(\"retriever\", InMemoryEmbeddingRetriever(document_store=document_store))\n", - "pipe.add_component(\"prompt_builder\", PromptBuilder(template=template))\n", + "pipe.add_component(\"chat_prompt_builder\", ChatPromptBuilder(template=template))\n", "pipe.add_component(\n", " \"llm\",\n", - " HuggingFaceAPIGenerator(api_type=\"serverless_inference_api\", api_params={\"model\": \"HuggingFaceH4/zephyr-7b-beta\"}),\n", + " HuggingFaceAPIChatGenerator(api_type=\"serverless_inference_api\", api_params={\"model\": \"HuggingFaceH4/zephyr-7b-beta\"}),\n", ")\n", "\n", "pipe.connect(\"embedder.embedding\", \"retriever.query_embedding\")\n", - "pipe.connect(\"retriever\", \"prompt_builder.documents\")\n", - "pipe.connect(\"prompt_builder\", \"llm\")" + "pipe.connect(\"retriever\", \"chat_prompt_builder.documents\")\n", + 
"pipe.connect(\"chat_prompt_builder.prompt\", \"llm.messages\")" ] }, { @@ -409,11 +484,29 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": { "id": "qDqrU5emtBWQ" }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Batches: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 3.20it/s]\n" + ] + }, + { + "data": { + "text/plain": [ + "{'llm': {'replies': [ChatMessage(content=\"For vegan keto eggplant lasagna:\\n- 2 large eggplants\\n- Hella salt (optional)\\n- 1/2 cup store-bought vegan mozzarella cheese (for topping)\\n- Pesto:\\n - 4 oz basil (generally one large clamshell or 2 small ones)\\n - 1/4 cup almonds\\n - 1/4 cup nutritional yeast\\n - 1/4 cup olive oil\\n- Spinach tofu ricotta:\\n - 1 recipe spinach tofu ricotta\\n- 1 tsp garlic powder\\n- Juice of half a lemon\\n- Salt to taste\\n\\nFor macadamia nut cheese:\\n- 1 cup macadamia nuts (unsalted and unroasted)\\n- Salt (optional)\\n\\nInstructions:\\n1. Preheat oven to 400Β°F.\\n2. Slice eggplants into 1/4 inch thick slices and rub both sides with salt. Let sit for 20-30 minutes to extract moisture. Rinse with water and pat dry.\\n3. Roast the eggplant in the oven for about 20 minutes or until they're soft and brown in spots, rotating the pans halfway through.\\n4. Reduce oven temperature to 350Β°F.\\n5. In a separate bowl, mix together the store-bought vegan mozzarella cheese (for topping) with spinach tofu ricotta.\\n6. Assemble the lasagna: spread a layer of roasted eggplant at the bottom of the casserole dish, followed by a layer of pesto and a layer of the cheese mixture. Repeat until all ingredients are used, finishing with a layer of roasted eggplant. Sprinkle the remaining store-bought vegan mozzarella cheese (for topping) on top.\\n7. Bake for 25 minutes. 
Optionally, broil for 1-2 minutes at the end to melt the cheese.\\n\\nFor vegan persimmon flan:\\n- 1/2 cup persimmon pulp, strained (about 2 medium persimmons)\\n- 1 tbsp cornstarch\\n- 1/2 tsp agar agar\\n-\", role=, name=None, meta={'model': 'HuggingFaceH4/zephyr-7b-beta', 'finish_reason': 'length', 'index': 0, 'usage': ChatCompletionOutputUsage(completion_tokens=512, prompt_tokens=2276, total_tokens=2788)})]}}" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "question = (\n", " \"What ingredients would I need to make vegan keto eggplant lasagna, vegan persimmon flan, and vegan hemp cheese?\"\n", @@ -422,8 +515,8 @@ "pipe.run(\n", " {\n", " \"embedder\": {\"text\": question},\n", - " \"prompt_builder\": {\"question\": question},\n", - " \"llm\": {\"generation_kwargs\": {\"max_new_tokens\": 350}},\n", + " \"chat_prompt_builder\": {\"question\": question},\n", + " \n", " }\n", ")" ] @@ -434,12 +527,7 @@ "id": "ZJueu_V4KP6w" }, "source": [ - "```python\n", - "{'llm': {'replies': [\"\\n\\nVegan Keto Eggplant Lasagna:\\n\\nIngredients:\\n- 2 large eggplants\\n- A lot of salt (you should have this in your house already)\\n- 1/2 cup store-bought vegan mozzarella (for topping)\\n\\nPesto:\\n- 4 oz basil (generally one large clamshell or 2 small ones)\\n- 1/4 cup almonds\\n- 1/4 cup nutritional yeast\\n- 1/4 cup olive oil\\n- 1 recipe vegan pesto (you can find this in the recipe)\\n- 1 recipe spinach tofu ricotta (you can find this in the recipe)\\n- 1 tsp garlic powder\\n- Juice of half a lemon\\n- Salt to taste\\n\\nSpinach Tofu Ricotta:\\n- 10 oz firm or extra firm tofu\\n- Juice of 1 lemon\\n- Garlic powder to taste\\n- Salt to taste\\n\\nInstructions:\\n1. Slice the eggplants into 1/4 inch thick slices. Some slices will need to be scrapped because it's difficult to get them all uniformly thin. Use them in soup or something, IDK, man.\\n2. Take the eggplant slices and rub both sides with salt. 
Don't be shy about how much, you're gonna rinse it off anyway.\\n3. Put them in a colander with something underneath it and let them sit for half an hour. This draws the water out so that the egg\"],\n", - " 'meta': [{'model': 'HuggingFaceH4/zephyr-7b-beta',\n", - " ...\n", - " }]}}\n", - "```" + "{'llm': {'replies': [ChatMessage(content=\"For vegan keto eggplant lasagna:\\n- 2 large eggplants\\n- Hella salt (optional)\\n- 1/2 cup store-bought vegan mozzarella cheese (for topping)\\n- Pesto:\\n - 4 oz basil (generally one large clamshell or 2 small ones)\\n - 1/4 cup almonds\\n - 1/4 cup nutritional yeast\\n - 1/4 cup olive oil\\n- Spinach tofu ricotta:\\n - 1 recipe spinach tofu ricotta\\n- 1 tsp garlic powder\\n- Juice of half a lemon\\n- Salt to taste\\n\\nFor macadamia nut cheese:\\n- 1 cup macadamia nuts (unsalted and unroasted)\\n- Salt (optional)\\n\\nInstructions:\\n1. Preheat oven to 400Β°F.\\n2. Slice eggplants into 1/4 inch thick slices and rub both sides with salt. Let sit for 20-30 minutes to extract moisture. Rinse with water and pat dry.\\n3. Roast the eggplant in the oven for about 20 minutes or until they're soft and brown in spots, rotating the pans halfway through.\\n4. Reduce oven temperature to 350Β°F.\\n5. In a separate bowl, mix together the store-bought vegan mozzarella cheese (for topping) with spinach tofu ricotta.\\n6. Assemble the lasagna: spread a layer of roasted eggplant at the bottom of the casserole dish, followed by a layer of pesto and a layer of the cheese mixture. Repeat until all ingredients are used, finishing with a layer of roasted eggplant. Sprinkle the remaining store-bought vegan mozzarella cheese (for topping) on top.\\n7. Bake for 25 minutes. 
Optionally, broil for 1-2 minutes at the end to melt the cheese.\\n\\nFor vegan persimmon flan:\\n- 1/2 cup persimmon pulp, strained (about 2 medium persimmons)\\n- 1 tbsp cornstarch\\n- 1/2 tsp agar agar\\n-\", role=, name=None, meta={'model': 'HuggingFaceH4/zephyr-7b-beta', 'finish_reason': 'length', 'index': 0, 'usage': ChatCompletionOutputUsage(completion_tokens=512, prompt_tokens=2276, total_tokens=2788)})]}}" ] }, { diff --git a/tutorials/32_Classifying_Documents_and_Queries_by_Language.ipynb b/tutorials/32_Classifying_Documents_and_Queries_by_Language.ipynb index c4bdb5b..6d745d7 100644 --- a/tutorials/32_Classifying_Documents_and_Queries_by_Language.ipynb +++ b/tutorials/32_Classifying_Documents_and_Queries_by_Language.ipynb @@ -10,7 +10,7 @@ "\n", "- **Level**: Beginner\n", "- **Time to complete**: 15 minutes\n", - "- **Components Used**: [`InMemoryDocumentStore`](https://docs.haystack.deepset.ai/docs/inmemorydocumentstore), [`DocumentLanguageClassifier`](https://docs.haystack.deepset.ai/docs/documentlanguageclassifier), [`MetadataRouter`](https://docs.haystack.deepset.ai/docs/metadatarouter), [`DocumentWriter`](https://docs.haystack.deepset.ai/docs/documentwriter), [`TextLanguageRouter`](https://docs.haystack.deepset.ai/docs/textlanguagerouter), [`DocumentJoiner`](https://docs.haystack.deepset.ai/docs/documentjoiner), [`InMemoryBM25Retriever`](https://docs.haystack.deepset.ai/docs/inmemorybm25retriever), [`PromptBuilder`](https://docs.haystack.deepset.ai/docs/promptbuilder), [`OpenAIGenerator`](https://docs.haystack.deepset.ai/docs/openaigenerator)\n", + "- **Components Used**: [`InMemoryDocumentStore`](https://docs.haystack.deepset.ai/docs/inmemorydocumentstore), [`DocumentLanguageClassifier`](https://docs.haystack.deepset.ai/docs/documentlanguageclassifier), [`MetadataRouter`](https://docs.haystack.deepset.ai/docs/metadatarouter), [`DocumentWriter`](https://docs.haystack.deepset.ai/docs/documentwriter), 
[`TextLanguageRouter`](https://docs.haystack.deepset.ai/docs/textlanguagerouter), [`DocumentJoiner`](https://docs.haystack.deepset.ai/docs/documentjoiner), [`InMemoryBM25Retriever`](https://docs.haystack.deepset.ai/docs/inmemorybm25retriever), [`ChatPromptBuilder`](https://docs.haystack.deepset.ai/docs/chatpromptbuilder), [`OpenAIChatGenerator`](https://docs.haystack.deepset.ai/docs/openaichatgenerator)\n", "- **Goal**: After completing this tutorial, you'll have learned how to build a Haystack pipeline to classify documents based on the (human) language they were written in.\n", "- Optionally, at the end you'll also incorporate language clasification and query routing into a RAG pipeline, so you can query documents based on the language a question was written in.\n", "\n", @@ -57,7 +57,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -65,115 +65,7 @@ "id": "lxgAfuxcdftS", "outputId": "36339d6b-f7a8-4686-911a-60642a8adbe6" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Collecting haystack-ai\n", - " Using cached haystack_ai-2.5.1-py3-none-any.whl.metadata (13 kB)\n", - "Collecting haystack-experimental (from haystack-ai)\n", - " Using cached haystack_experimental-0.1.1-py3-none-any.whl.metadata (6.9 kB)\n", - "Requirement already satisfied: jinja2 in /Users/tuanacelik/opt/anaconda3/envs/tutorials/lib/python3.12/site-packages (from haystack-ai) (3.1.4)\n", - "Collecting lazy-imports (from haystack-ai)\n", - " Using cached lazy_imports-0.3.1-py3-none-any.whl.metadata (10 kB)\n", - "Collecting more-itertools (from haystack-ai)\n", - " Downloading more_itertools-10.5.0-py3-none-any.whl.metadata (36 kB)\n", - "Collecting networkx (from haystack-ai)\n", - " Using cached networkx-3.3-py3-none-any.whl.metadata (5.1 kB)\n", - "Collecting numpy<2 (from haystack-ai)\n", - " Using cached numpy-1.26.4-cp312-cp312-macosx_10_9_x86_64.whl.metadata 
(61 kB)\n", - "Collecting openai>=1.1.0 (from haystack-ai)\n", - " Downloading openai-1.45.0-py3-none-any.whl.metadata (22 kB)\n", - "Collecting pandas (from haystack-ai)\n", - " Using cached pandas-2.2.2-cp312-cp312-macosx_10_9_x86_64.whl.metadata (19 kB)\n", - "Collecting posthog (from haystack-ai)\n", - " Downloading posthog-3.6.5-py2.py3-none-any.whl.metadata (2.0 kB)\n", - "Requirement already satisfied: python-dateutil in /Users/tuanacelik/opt/anaconda3/envs/tutorials/lib/python3.12/site-packages (from haystack-ai) (2.9.0)\n", - "Requirement already satisfied: pyyaml in /Users/tuanacelik/opt/anaconda3/envs/tutorials/lib/python3.12/site-packages (from haystack-ai) (6.0.1)\n", - "Requirement already satisfied: requests in /Users/tuanacelik/opt/anaconda3/envs/tutorials/lib/python3.12/site-packages (from haystack-ai) (2.32.3)\n", - "Collecting tenacity!=8.4.0 (from haystack-ai)\n", - " Using cached tenacity-9.0.0-py3-none-any.whl.metadata (1.2 kB)\n", - "Collecting tqdm (from haystack-ai)\n", - " Using cached tqdm-4.66.5-py3-none-any.whl.metadata (57 kB)\n", - "Requirement already satisfied: typing-extensions>=4.7 in /Users/tuanacelik/opt/anaconda3/envs/tutorials/lib/python3.12/site-packages (from haystack-ai) (4.12.2)\n", - "Collecting anyio<5,>=3.5.0 (from openai>=1.1.0->haystack-ai)\n", - " Using cached anyio-4.4.0-py3-none-any.whl.metadata (4.6 kB)\n", - "Collecting distro<2,>=1.7.0 (from openai>=1.1.0->haystack-ai)\n", - " Using cached distro-1.9.0-py3-none-any.whl.metadata (6.8 kB)\n", - "Collecting httpx<1,>=0.23.0 (from openai>=1.1.0->haystack-ai)\n", - " Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)\n", - "Collecting jiter<1,>=0.4.0 (from openai>=1.1.0->haystack-ai)\n", - " Using cached jiter-0.5.0-cp312-cp312-macosx_10_12_x86_64.whl.metadata (3.6 kB)\n", - "Collecting pydantic<3,>=1.9.0 (from openai>=1.1.0->haystack-ai)\n", - " Downloading pydantic-2.9.1-py3-none-any.whl.metadata (146 kB)\n", - "\u001b[2K 
\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m147.0/147.0 kB\u001b[0m \u001b[31m5.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting sniffio (from openai>=1.1.0->haystack-ai)\n", - " Using cached sniffio-1.3.1-py3-none-any.whl.metadata (3.9 kB)\n", - "Requirement already satisfied: MarkupSafe>=2.0 in /Users/tuanacelik/opt/anaconda3/envs/tutorials/lib/python3.12/site-packages (from jinja2->haystack-ai) (2.1.5)\n", - "Collecting pytz>=2020.1 (from pandas->haystack-ai)\n", - " Downloading pytz-2024.2-py2.py3-none-any.whl.metadata (22 kB)\n", - "Collecting tzdata>=2022.7 (from pandas->haystack-ai)\n", - " Using cached tzdata-2024.1-py2.py3-none-any.whl.metadata (1.4 kB)\n", - "Requirement already satisfied: six>=1.5 in /Users/tuanacelik/opt/anaconda3/envs/tutorials/lib/python3.12/site-packages (from python-dateutil->haystack-ai) (1.16.0)\n", - "Collecting monotonic>=1.5 (from posthog->haystack-ai)\n", - " Using cached monotonic-1.6-py2.py3-none-any.whl.metadata (1.5 kB)\n", - "Collecting backoff>=1.10.0 (from posthog->haystack-ai)\n", - " Using cached backoff-2.2.1-py3-none-any.whl.metadata (14 kB)\n", - "Requirement already satisfied: charset-normalizer<4,>=2 in /Users/tuanacelik/opt/anaconda3/envs/tutorials/lib/python3.12/site-packages (from requests->haystack-ai) (3.3.2)\n", - "Requirement already satisfied: idna<4,>=2.5 in /Users/tuanacelik/opt/anaconda3/envs/tutorials/lib/python3.12/site-packages (from requests->haystack-ai) (3.7)\n", - "Requirement already satisfied: urllib3<3,>=1.21.1 in /Users/tuanacelik/opt/anaconda3/envs/tutorials/lib/python3.12/site-packages (from requests->haystack-ai) (2.2.2)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /Users/tuanacelik/opt/anaconda3/envs/tutorials/lib/python3.12/site-packages (from requests->haystack-ai) (2024.6.2)\n", - "Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai>=1.1.0->haystack-ai)\n", - " Using cached httpcore-1.0.5-py3-none-any.whl.metadata 
(20 kB)\n", - "Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai>=1.1.0->haystack-ai)\n", - " Using cached h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)\n", - "Collecting annotated-types>=0.6.0 (from pydantic<3,>=1.9.0->openai>=1.1.0->haystack-ai)\n", - " Using cached annotated_types-0.7.0-py3-none-any.whl.metadata (15 kB)\n", - "Collecting pydantic-core==2.23.3 (from pydantic<3,>=1.9.0->openai>=1.1.0->haystack-ai)\n", - " Downloading pydantic_core-2.23.3-cp312-cp312-macosx_10_12_x86_64.whl.metadata (6.6 kB)\n", - "Using cached haystack_ai-2.5.1-py3-none-any.whl (351 kB)\n", - "Using cached numpy-1.26.4-cp312-cp312-macosx_10_9_x86_64.whl (20.3 MB)\n", - "Downloading openai-1.45.0-py3-none-any.whl (374 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m374.1/374.1 kB\u001b[0m \u001b[31m12.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n", - "\u001b[?25hUsing cached tenacity-9.0.0-py3-none-any.whl (28 kB)\n", - "Using cached tqdm-4.66.5-py3-none-any.whl (78 kB)\n", - "Using cached haystack_experimental-0.1.1-py3-none-any.whl (41 kB)\n", - "Using cached lazy_imports-0.3.1-py3-none-any.whl (12 kB)\n", - "Downloading more_itertools-10.5.0-py3-none-any.whl (60 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m61.0/61.0 kB\u001b[0m \u001b[31m4.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hUsing cached networkx-3.3-py3-none-any.whl (1.7 MB)\n", - "Using cached pandas-2.2.2-cp312-cp312-macosx_10_9_x86_64.whl (12.5 MB)\n", - "Downloading posthog-3.6.5-py2.py3-none-any.whl (54 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m54.2/54.2 kB\u001b[0m \u001b[31m3.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hUsing cached anyio-4.4.0-py3-none-any.whl (86 kB)\n", - "Using cached backoff-2.2.1-py3-none-any.whl (15 kB)\n", - "Using cached distro-1.9.0-py3-none-any.whl (20 
kB)\n", - "Downloading httpx-0.27.2-py3-none-any.whl (76 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m76.4/76.4 kB\u001b[0m \u001b[31m6.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hUsing cached httpcore-1.0.5-py3-none-any.whl (77 kB)\n", - "Using cached jiter-0.5.0-cp312-cp312-macosx_10_12_x86_64.whl (283 kB)\n", - "Using cached monotonic-1.6-py2.py3-none-any.whl (8.2 kB)\n", - "Downloading pydantic-2.9.1-py3-none-any.whl (434 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m434.4/434.4 kB\u001b[0m \u001b[31m7.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n", - "\u001b[?25hDownloading pydantic_core-2.23.3-cp312-cp312-macosx_10_12_x86_64.whl (1.8 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.8/1.8 MB\u001b[0m \u001b[31m8.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0mm\n", - "\u001b[?25hDownloading pytz-2024.2-py2.py3-none-any.whl (508 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m508.0/508.0 kB\u001b[0m \u001b[31m10.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n", - "\u001b[?25hUsing cached sniffio-1.3.1-py3-none-any.whl (10 kB)\n", - "Using cached tzdata-2024.1-py2.py3-none-any.whl (345 kB)\n", - "Using cached annotated_types-0.7.0-py3-none-any.whl (13 kB)\n", - "Using cached h11-0.14.0-py3-none-any.whl (58 kB)\n", - "Installing collected packages: pytz, monotonic, tzdata, tqdm, tenacity, sniffio, pydantic-core, numpy, networkx, more-itertools, lazy-imports, jiter, h11, distro, backoff, annotated-types, pydantic, posthog, pandas, httpcore, anyio, httpx, openai, haystack-experimental, haystack-ai\n", - "Successfully installed annotated-types-0.7.0 anyio-4.4.0 backoff-2.2.1 distro-1.9.0 h11-0.14.0 haystack-ai-2.5.1 haystack-experimental-0.1.1 httpcore-1.0.5 httpx-0.27.2 jiter-0.5.0 
lazy-imports-0.3.1 monotonic-1.6 more-itertools-10.5.0 networkx-3.3 numpy-1.26.4 openai-1.45.0 pandas-2.2.2 posthog-3.6.5 pydantic-2.9.1 pydantic-core-2.23.3 pytz-2024.2 sniffio-1.3.1 tenacity-9.0.0 tqdm-4.66.5 tzdata-2024.1\n", - "Collecting langdetect\n", - " Using cached langdetect-1.0.9-py3-none-any.whl\n", - "Requirement already satisfied: six in /Users/tuanacelik/opt/anaconda3/envs/tutorials/lib/python3.12/site-packages (from langdetect) (1.16.0)\n", - "Installing collected packages: langdetect\n", - "Successfully installed langdetect-1.0.9\n" - ] - } - ], + "outputs": [], "source": [ "%%bash\n", "\n", @@ -194,7 +86,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": { "id": "Ubr7yVt6Gbnj" }, @@ -220,7 +112,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": { "id": "mN2fFuWWP_8D" }, @@ -267,7 +159,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": { "id": "rfC1ZCigQJgI" }, @@ -291,7 +183,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 5, "metadata": { "id": "FlqGdbuxQNKk" }, @@ -306,7 +198,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 6, "metadata": { "id": "FEw5pfmBQRBB" }, @@ -328,11 +220,33 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 7, "metadata": { "id": "BdvO_fEfcVAY" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "\n", + "πŸš… Components\n", + " - language_classifier: DocumentLanguageClassifier\n", + " - router: MetadataRouter\n", + " - en_writer: DocumentWriter\n", + " - fr_writer: DocumentWriter\n", + " - es_writer: DocumentWriter\n", + "πŸ›€οΈ Connections\n", + " - language_classifier.documents -> router.documents (List[Document])\n", + " - router.en -> en_writer.documents (List[Document])\n", + " - router.fr -> fr_writer.documents (List[Document])\n", + " - router.es -> es_writer.documents (List[Document])" + ] 
+ }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "indexing_pipeline = Pipeline()\n", "indexing_pipeline.add_component(instance=language_classifier, name=\"language_classifier\")\n", @@ -379,7 +293,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 9, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -391,13 +305,15 @@ { "data": { "text/plain": [ - "{'router': {'unmatched': []},\n", + "{'router': {'unmatched': [Document(id=ea7ea338874232de2d8105a258813f50345db82772e21ad2c4549dbb7adce8a3, content: 'Super appartement. Juste au dessus de plusieurs bars qui ferment trΓ¨s tard. A savoir Γ  l'avance. (Bo...', meta: {'language': 'fr'}),\n", + " Document(id=6b64c8a60543ee32b81cd39bc8d6e09fae4bff1b22c6ccdcf414db26fa354e7a, content: 'Un peu salΓ© surtout le sol. Manque de service et de souplesse', meta: {'language': 'fr'}),\n", + " Document(id=b1be23526f19a8af80a190e775bfd05e65878e585529037cb45b47267a4eaa98, content: 'Nous avons passΓ© un sΓ©jour formidable. Merci aux personnes , le bonjours Γ  Ricardo notre taxi man, t...', meta: {'language': 'fr'})]},\n", " 'en_writer': {'documents_written': 2},\n", - " 'fr_writer': {'documents_written': 3},\n", + " 'fr_writer': {'documents_written': 0},\n", " 'es_writer': {'documents_written': 2}}" ] }, - "execution_count": 13, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -419,7 +335,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 10, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -433,7 +349,7 @@ "output_type": "stream", "text": [ "English documents: [Document(id=8f64ab234c6a5d5652d02bed144d069ec6e988903b071d16fffbf400abfc1047, content: 'The keypad with a code is convenient and the location is convenient. 
Basically everything else, very...', meta: {'language': 'en'}), Document(id=d4d878288efba5e28a43ae0195e43dadd0298fe36d3d9b3075c5c5120d27763e, content: 'It is very central and appartement has a nice appearance (even though a lot IKEA stuff), *W A R N I ...', meta: {'language': 'en'})]\n", - "French documents: [Document(id=ea7ea338874232de2d8105a258813f50345db82772e21ad2c4549dbb7adce8a3, content: 'Super appartement. Juste au dessus de plusieurs bars qui ferment trΓ¨s tard. A savoir Γ  l'avance. (Bo...', meta: {'language': 'fr'}), Document(id=6b64c8a60543ee32b81cd39bc8d6e09fae4bff1b22c6ccdcf414db26fa354e7a, content: 'Un peu salΓ© surtout le sol. Manque de service et de souplesse', meta: {'language': 'fr'}), Document(id=b1be23526f19a8af80a190e775bfd05e65878e585529037cb45b47267a4eaa98, content: 'Nous avons passΓ© un sΓ©jour formidable. Merci aux personnes , le bonjours Γ  Ricardo notre taxi man, t...', meta: {'language': 'fr'})]\n", + "French documents: []\n", "Spanish documents: [Document(id=72b094c163b22a660528bc5adbdf0fecf96b4b4d753c1b117f15dba482d2f948, content: 'El apartamento estaba genial y muy cΓ©ntrico, todo a mano. Al lado de la librerΓ­a Lello y De la Torre...', meta: {'language': 'es'}), Document(id=4b37b8bdfffccfb3211ea167b4fdc5121ca51fc5f869b4f834e8da473f0d3353, content: 'CΓ©ntrico. Muy cΓ³modo para moverse y ver Oporto. Edificio con terraza propia en la ΓΊltima planta. 
Tod...', meta: {'language': 'es'})]\n" ] } @@ -459,7 +375,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 11, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -489,7 +405,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 12, "metadata": { "id": "CN1N2sn1yUVx" }, @@ -497,11 +413,12 @@ "source": [ "from haystack.components.retrievers.in_memory import InMemoryBM25Retriever\n", "from haystack.components.joiners import DocumentJoiner\n", - "from haystack.components.builders import PromptBuilder\n", - "from haystack.components.generators import OpenAIGenerator\n", + "from haystack.components.builders import ChatPromptBuilder\n", + "from haystack.components.generators.chat import OpenAIChatGenerator\n", + "from haystack.dataclasses import ChatMessage\n", "from haystack.components.routers import TextLanguageRouter\n", "\n", - "prompt_template = \"\"\"\n", + "prompt_template = [ChatMessage.from_user(\"\"\"\n", "You will be provided with reviews for an accommodation.\n", "Answer the question concisely based solely on the given reviews.\n", "Reviews:\n", @@ -510,7 +427,7 @@ " {% endfor %}\n", "Question: {{ query}}\n", "Answer:\n", - "\"\"\"" + "\"\"\")]" ] }, { @@ -525,19 +442,47 @@ "- `TextLanguageRouter`\n", "- `InMemoryBM25Retriever`. You'll need a retriever per language, since each language has its own `DocumentStore`.\n", "- `DocumentJoiner`\n", - "- `PromptBuilder`\n", - "- `OpenAIGenerator`\n", + "- `ChatPromptBuilder`\n", + "- `OpenAIChatGenerator`\n", "\n", "> Note: The `BM25Retriever` essentially does keyword matching, which isn't as accurate as other search methods. In order to make the LLM responses more precise, you could refacctor your piplines to use an [`EmbeddingRetriever`](https://docs.haystack.deepset.ai/docs/inmemoryembeddingretriever) which performs vector search over the documents." 
] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 13, "metadata": { "id": "BN1Hr_BjWKcl" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "\n", + "πŸš… Components\n", + " - router: TextLanguageRouter\n", + " - en_retriever: InMemoryBM25Retriever\n", + " - fr_retriever: InMemoryBM25Retriever\n", + " - es_retriever: InMemoryBM25Retriever\n", + " - joiner: DocumentJoiner\n", + " - prompt_builder: ChatPromptBuilder\n", + " - llm: OpenAIChatGenerator\n", + "πŸ›€οΈ Connections\n", + " - router.en -> en_retriever.query (str)\n", + " - router.fr -> fr_retriever.query (str)\n", + " - router.es -> es_retriever.query (str)\n", + " - en_retriever.documents -> joiner.documents (List[Document])\n", + " - fr_retriever.documents -> joiner.documents (List[Document])\n", + " - es_retriever.documents -> joiner.documents (List[Document])\n", + " - joiner.documents -> prompt_builder.documents (List[Document])\n", + " - prompt_builder.prompt -> llm.messages (List[ChatMessage])" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "rag_pipeline = Pipeline()\n", "rag_pipeline.add_component(instance=TextLanguageRouter([\"en\", \"fr\", \"es\"]), name=\"router\")\n", @@ -545,8 +490,8 @@ "rag_pipeline.add_component(instance=InMemoryBM25Retriever(document_store=fr_document_store), name=\"fr_retriever\")\n", "rag_pipeline.add_component(instance=InMemoryBM25Retriever(document_store=es_document_store), name=\"es_retriever\")\n", "rag_pipeline.add_component(instance=DocumentJoiner(), name=\"joiner\")\n", - "rag_pipeline.add_component(instance=PromptBuilder(template=prompt_template), name=\"prompt_builder\")\n", - "rag_pipeline.add_component(instance=OpenAIGenerator(), name=\"llm\")\n", + "rag_pipeline.add_component(instance=ChatPromptBuilder(template=prompt_template), name=\"prompt_builder\")\n", + "rag_pipeline.add_component(instance=OpenAIChatGenerator(), name=\"llm\")\n", "\n", "\n", 
"rag_pipeline.connect(\"router.en\", \"en_retriever.query\")\n", @@ -556,7 +501,7 @@ "rag_pipeline.connect(\"fr_retriever\", \"joiner\")\n", "rag_pipeline.connect(\"es_retriever\", \"joiner\")\n", "rag_pipeline.connect(\"joiner.documents\", \"prompt_builder.documents\")\n", - "rag_pipeline.connect(\"prompt_builder\", \"llm\")" + "rag_pipeline.connect(\"prompt_builder.prompt\", \"llm.messages\")" ] }, { @@ -570,7 +515,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 14, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -595,7 +540,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 15, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -617,22 +562,7 @@ "id": "wj24fjXN0l6v", "outputId": "3c1eed33-c31c-4b72-bcda-fdd64744560b" }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Ranking by BM25...: 100%|██████████| 2/2 [00:00<00:00, 3134.76 docs/s]" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], + "outputs": [], "source": [ "en_question = \"Is this apartment conveniently located?\"\n", "\n", @@ -641,7 +571,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 16, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -654,7 +584,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Yes, the apartment is conveniently located.\n" + "ChatMessage(content='Yes, the apartment is conveniently located.', role=<ChatRole.ASSISTANT: 'assistant'>, name=None, meta={'model': 'gpt-4o-mini-2024-07-18', 'index': 0, 'finish_reason': 'stop', 'usage': {'completion_tokens': 8, 'prompt_tokens': 365, 'total_tokens': 373, 'prompt_tokens_details': {'cached_tokens': 0, 'audio_tokens': 0}, 'completion_tokens_details': {'reasoning_tokens': 0, 'audio_tokens': 0, 'accepted_prediction_tokens': 0, 'rejected_prediction_tokens': 0}}})\n" ] } ], @@ -673,7 +603,7 @@ }, { "cell_type": "code", - "execution_count": 25,
+ "execution_count": 17, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -695,15 +625,7 @@ "id": "B4_Be1bs1jxJ", "outputId": "0b96cf29-d633-4c9b-f54c-a785e1c2cbe4" }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Ranking by BM25...: 100%|██████████| 2/2 [00:00<00:00, 15887.52 docs/s]\n" - ] - } - ], + "outputs": [], "source": [ "es_question = \"¿El desayuno es genial?\"\n", "\n", @@ -712,7 +634,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 18, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -725,12 +647,12 @@ "name": "stdout", "output_type": "stream", "text": [ - "No, el desayuno no es genial.\n" + "Sí, el desayuno es descrito como estupendo.\n" ] } ], "source": [ - "print(result[\"llm\"][\"replies\"][0])" + "print(result[\"llm\"][\"replies\"][0].content)" ] }, { @@ -773,7 +695,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.4" + "version": "3.9.6" }, "widgets": { "application/vnd.jupyter.widget-state+json": {