Added Jupyter Notebook for FlockMTL setup (#57)

dsg-polymtl · Nov 7, 2024 · 2b4cae0 · 2b4cae0
1 parent 0a9f966
commit 2b4cae0
Showing 1 changed file with 379 additions and 0 deletions.
diff --git a/tutorial/jupyter_notebooks/FlockMTL_setup.ipynb b/tutorial/jupyter_notebooks/FlockMTL_setup.ipynb
@@ -0,0 +1,379 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "b1cf8bd9-26dc-4c45-8e7d-49858c999aa4",
+   "metadata": {},
+   "source": [
+    "The following steps need to be followed for setting up FlocktMTL with Jupyter\n",
+    "\n",
+    "#### Step 1: Store API secrets in a .env file, and load it in jupyter\n",
+    "We store the OpenAI API key in our .env file for the tutorial. and use the python-dotenv package\n",
+    "to read this secret. Any other suitable method can also be used\n",
+    "\n",
+    "#### Step 2: Install DuckDB for python. \n",
+    "For FlockMTL v0.1.0 \"Schwartz Deli\", DuckDB version 1.1.1 is required\n",
+    "\n",
+    "#### Step 3: Create DuckDB database.\n",
+    "We use a temporary in-memory database. Persistent database can also be used. Please\n",
+    "see https://duckdb.org/docs/connect/overview.html for detailed methods to create databases\n",
+    "\n",
+    "#### Step 4: Install FlockMTL\n",
+    "FlockMTL needs to be installed once per new DuckDB installation. DuckDB installs the latest\n",
+    "release from DuckDB Community extensions (https://community-extensions.duckdb.org/)\n",
+    "\n",
+    "#### Step 5: Load FlockMTL\n",
+    "FlockMTL needs to be loaded every time when the Jupyter kernel restarts\n",
+    "\n",
+    "#### Step 6: Use FlockMTL\n",
+    "After step 5, FlockMTL is installed and ready to use. We create a new table called product_reviews,\n",
+    "and use LLM function calls to extract semantic details from it"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7bd34cd9-0acc-499d-8978-f40980fdefaf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#Step 1 Store API secrets in a .env file, and load it in jupyter\n",
+    "!pip install python-dotenv\n",
+    "from dotenv import load_dotenv\n",
+    "import os\n",
+    "\n",
+    "# Load the .env file\n",
+    "load_dotenv()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4c975a76-58bc-40d4-a9b1-27af6f4dd7e3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#Step 2: Install DuckDB for python\n",
+    "!pip install duckdb==1.1.1\n",
+    "import duckdb\n",
+    "print(duckdb.__version__)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3c93b406-0235-49d1-8efb-52a171a6d427",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#Step 3: Create DuckDB database.\n",
+    "con = duckdb.connect(':memory:', config={'allow_unsigned_extensions' : 'true'})\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "759841f8-8889-4c45-a344-c55715d71b82",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#Step 4: Install FlockMTL\n",
+    "con.execute(f\"INSTALL flockmtl\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f1b94ac0-82e0-4fc7-92a5-5649ab19570e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#Step 5: Load FlockMTL\n",
+    "con.execute(f\"LOAD flockmtl\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8beeef6d-ab19-417c-a1d5-1c739c221795",
+   "metadata": {},
+   "source": [
+    "#### A simple sentiment analysis on product reviews is defined below. The following features are highlighted\n",
+    "* FlockMTL default supported models\n",
+    "* Use of LLMs to perform analysis tasks on SQL tables\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e6e1abae-5842-4eab-babb-e4b16c408ca7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#Step 6: Use FlockMTL\n",
+    "csv_path = 'products.csv' #add your own path\n",
+    "con.execute(f\"CREATE TABLE product_reviews AS SELECT * FROM read_csv_auto('{csv_path}')\");\n",
+    "con.execute ('SELECT * from product_reviews;').fetchall()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7e9d26e5-af14-4594-873c-562f9f09b822",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#View the default models supported out of the box by FlockMTL\n",
+    "con.execute(\"GET MODELS;\").fetchall()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ffb94d15-b5c1-4a4a-96e9-4b49d11e2344",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "# First we create a new sentiment analysis prompt\n",
+    "sentiment_analysis_prompt = \"\"\"\n",
+    "Analyze the sentiment of the following product review. Consider both the review text and the star rating. Provide a brief sentiment label (positive, negative, or neutral) and a short explanation for your decision.\n",
+    "\n",
+    "Review: {review}\n",
+    "Star Rating: {rating}\n",
+    "\n",
+    "Output your response in the following JSON format:\n",
+    "{\n",
+    "    \"sentiment\": \"positive/negative/neutral\",\n",
+    "    \"explanation\": \"Brief explanation of the sentiment analysis\"\n",
+    "}\n",
+    "\"\"\"\n",
+    "\n",
+    "# Use an f-string to insert the prompt directly into the query\n",
+    "sentiment_analysis_prompt_query = f\"\"\"\n",
+    "    CREATE PROMPT ('sentiment-analysis', '{sentiment_analysis_prompt}');\n",
+    "\"\"\"\n",
+    "\n",
+    "con.execute (sentiment_analysis_prompt_query)\n",
+    "\n",
+    "\n",
+    "\n",
+    "# Now we use the new prompt for analytical analysis\n",
+    "query = \"\"\"\n",
+    "CREATE TABLE sentiment_analysis AS\n",
+    "WITH sentiment_analysis AS (\n",
+    "    SELECT \n",
+    "        ProductID as product_id, \n",
+    "        ID AS review_id,\n",
+    "        Review AS review_text, \n",
+    "        Rating AS star_rating, \n",
+    "        llm_complete_json('sentiment-analysis', 'default', {'review': review_text, 'rating': star_rating}) AS sentiment_json\n",
+    "    FROM \n",
+    "        product_reviews\n",
+    ")\n",
+    "SELECT * \n",
+    "FROM sentiment_analysis;\n",
+    "\"\"\"\n",
+    "\n",
+    "# Run the query and fetch results\n",
+    "results = con.execute(query).fetchall()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ce7c0b10-36d7-4af9-92c3-c93e1fb7ab3e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#Print newly created table\n",
+    "query = \"select * from sentiment_analysis;\"\n",
+    "results = con.execute(query).fetchall()\n",
+    "\n",
+    "for row in results:\n",
+    "    print(row)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bc6fd987-7d86-41a0-9d7f-923b171acbf5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Define the is-high-impact-review prompt\n",
+    "is_high_impact_review_prompt = \"\"\"\n",
+    "Determine if the given review is a high-impact review that provides valuable insights. Consider the following factors:\n",
+    "\n",
+    "1. Sentiment: {sentiment}\n",
+    "2. Star Rating: {rating}\n",
+    "3. Review Length: {review_length}\n",
+    "\n",
+    "A high-impact review typically has:\n",
+    "\n",
+    "- A strong sentiment (very positive or very negative)\n",
+    "- An extreme rating (1-2 or 4-5 stars)\n",
+    "- Sufficient length to provide detailed feedback (usually more than 50 words)\n",
+    "\n",
+    "Output your decision as a boolean true or false.\n",
+    "\"\"\"\n",
+    "\n",
+    "# Create the SQL query to register the is-high-impact-review prompt\n",
+    "is_high_impact_review_prompt_query = f\"\"\"\n",
+    "    CREATE PROMPT ('is-high-impact-review', '{is_high_impact_review_prompt}');\n",
+    "\"\"\"\n",
+    "\n",
+    "# Execute the query to create the prompt\n",
+    "con.execute(is_high_impact_review_prompt_query)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "63312f96-7360-4bbb-9a3c-edc90c533dcf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "query = \"\"\"\n",
+    "CREATE TABLE filtered_reviews AS\n",
+    "WITH filtered_reviews AS (\n",
+    "    SELECT \n",
+    "        * \n",
+    "    FROM \n",
+    "        sentiment_analysis \n",
+    "    WHERE \n",
+    "        llm_filter('is-high-impact-review', 'gpt-4o', {\n",
+    "            'sentiment': sentiment_json, \n",
+    "            'rating': star_rating, \n",
+    "            'review_length': LENGTH(review_text)\n",
+    "        })\n",
+    ")\n",
+    "SELECT * \n",
+    "FROM filtered_reviews;\n",
+    "\"\"\"\n",
+    "\n",
+    "# Run the query and fetch results\n",
+    "results = con.execute(query).fetchall()\n",
+    "\n",
+    "for row in results:\n",
+    "    print(row)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "43d3a10f-0459-4dce-9904-e88e4397ead5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#Print newly created table\n",
+    "query = \"select * from filtered_reviews;\"\n",
+    "results = con.execute(query).fetchall()\n",
+    "\n",
+    "for row in results:\n",
+    "    print(row)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ec377577-a29b-45be-80c2-40454e60d603",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Define the extract themes prompt\n",
+    "extract_themes_prompt = \"\"\"\n",
+    "    Analyze the following sentiment analysis JSON. Identify and extract key themes or topics discussed in the product review. Output the themes in a JSON array format.\n",
+    "    \n",
+    "    Sentiment Analysis JSON: {{sentiment_json}}\n",
+    "    \n",
+    "    Output your response in the following JSON format:\n",
+    "    {{\n",
+    "        \"themes\": [\"theme1\", \"theme2\", \"theme3\"]\n",
+    "    }}\n",
+    "\"\"\"\n",
+    "       \n",
+    "# Create the SQL query to register the prompt\n",
+    "extract_themes_prompt_query = f\"\"\"\n",
+    "    CREATE PROMPT ('extract-themes' , '{extract_themes_prompt}');\n",
+    "\"\"\"\n",
+    "\n",
+    "# Execute the query to create the prompt\n",
+    "results = con.execute(extract_themes_prompt_query)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e2f7e074-2a94-4fd4-83ea-dc33fce43cec",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "query = \"\"\"\n",
+    "CREATE TABLE themes_extracted AS\n",
+    "WITH themes_extracted AS (\n",
+    "    SELECT \n",
+    "        product_id, \n",
+    "        review_id, \n",
+    "        review_text,\n",
+    "        star_rating,\n",
+    "        llm_complete_json('extract-themes', 'gpt-4o', {'sentiment_json': sentiment_json}) AS themes\n",
+    "    FROM \n",
+    "        filtered_reviews\n",
+    ")\n",
+    "SELECT * \n",
+    "FROM themes_extracted;\n",
+    "\"\"\"\n",
+    "# Run the query and fetch results\n",
+    "results = con.execute(query).fetchall()\n",
+    "\n",
+    "for row in results:\n",
+    "    print(row)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c4382489-808c-4680-b0f1-ec954d840323",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#Print newly created table\n",
+    "query = \"select * from themes_extracted;\"\n",
+    "results = con.execute(query).fetchall()\n",
+    "\n",
+    "for row in results:\n",
+    "    print(row)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cdec26c4-0d11-41e4-b563-1c45c8ba41aa",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}