From 764d9ac9be1becd12795316310bb16841c2f038f Mon Sep 17 00:00:00 2001
From: vga91 <giuseppe.villani@larus-ba.it>
Date: Thu, 11 Jul 2024 14:48:44 +0200
Subject: [PATCH] Fixes #4000: Add self-explanation to the model, include the
 verbal schema description to the flow

---
 .../asciidoc/modules/ROOT/pages/ml/genai.adoc | 145 +++++++++++++++++
 extended/src/main/java/apoc/ml/Prompt.java    |  35 +++-
 .../src/main/java/apoc/util/ExtendedUtil.java |   6 +
 extended/src/test/java/apoc/ml/PromptIT.java  | 154 +++++++++++++++---
 .../test/resources/northwind_dataset.cypher   |  61 +++++++
 .../test/resources/northwind_schema.cypher    |   6 +
 6 files changed, 378 insertions(+), 29 deletions(-)
 create mode 100644 extended/src/test/resources/northwind_dataset.cypher
 create mode 100644 extended/src/test/resources/northwind_schema.cypher

diff --git a/docs/asciidoc/modules/ROOT/pages/ml/genai.adoc b/docs/asciidoc/modules/ROOT/pages/ml/genai.adoc
index 9d35b29752..c5199092ca 100644
--- a/docs/asciidoc/modules/ROOT/pages/ml/genai.adoc
+++ b/docs/asciidoc/modules/ROOT/pages/ml/genai.adoc
@@ -91,6 +91,7 @@ RETURN m.title
 | apiKey | OpenAI API key | in case `apoc.openai.key` is not defined
 | model | The Open AI model | no, default `gpt-3.5-turbo`
 | sample | The number of nodes to skip, e.g. a sample of 1000 will read every 1000th node. It's used as a parameter to `apoc.meta.data` procedure that computes the schema | no, default is a random number
+| additionalPrompts | Other prompts (a list of maps with `role` and `content` keys) to be passed along with the request to improve it | no, by default no additional prompt is sent
 |===
 
 .Results
@@ -102,6 +103,107 @@ RETURN m.title
 |===
 
 
+We can use the `additionalPrompts` config to improve the request, e.g. by adding a natural language description of the schema (for instance, the output of the `apoc.ml.schema` procedure).
+Since OpenAI models are mainly trained to process natural language rather than Cypher queries, using this configuration can yield better results.
+For example, given the https://neo4j.com/docs/getting-started/appendix/tutorials/guide-import-relational-and-etl/[Northwind dataset] we can execute:
+
+.Query call
+[source,cypher]
+----
+CALL apoc.ml.schema({apiKey: $apiKey}) YIELD value
+WITH value
+CALL apoc.ml.query("Which 5 employees had sold the product 'Chocolade' and has the highest selling count of another product?
+  Please returns the employee identificator, the other product name and the count orders of another product",
+{
+    retries: 8,
+    retryWithError: true,
+    apiKey: $apiKey,
+    additionalPrompts: [
+        {role: "system", content: "The human description of the schema is the following:\n" + value}
+    ]
+})
+YIELD query, value RETURN query, value
+----
+
+with a result similar to the following.
+
+NOTE: the results are not deterministic and will potentially change each time the query is re-executed
+
+.Results
+[%autowidth, opts=header]
+|===
+| query | value
+| "cypher
+MATCH (p:Product {productName: 'Chocolade'})<-[:CONTAINS]-(:Order)<-[:SOLD]-(e:Employee)
+MATCH (e)-[:SOLD]->(o:Order)-[:CONTAINS]->(p2:Product)
+WITH e, p2, COUNT(DISTINCT o) AS orderCount
+ORDER BY orderCount DESC
+RETURN e.employeeID AS employeeID, p2.productName AS otherProduct, orderCount
+LIMIT 5
+" 
+| {
+"otherProduct": "Gnocchi di nonna Alice",
+"employeeID": "4",
+"orderCount": 14
+}
+| "cypher
+MATCH (p:Product {productName: 'Chocolade'})<-[:CONTAINS]-(:Order)<-[:SOLD]-(e:Employee)
+MATCH (e)-[:SOLD]->(o:Order)-[:CONTAINS]->(p2:Product)
+WITH e, p2, COUNT(DISTINCT o) AS orderCount
+ORDER BY orderCount DESC
+RETURN e.employeeID AS employeeID, p2.productName AS otherProduct, orderCount
+LIMIT 5
+"
+| {
+"otherProduct": "Pâté chinois",
+"employeeID": "4",
+"orderCount": 12
+}
+| "cypher
+MATCH (p:Product {productName: 'Chocolade'})<-[:CONTAINS]-(:Order)<-[:SOLD]-(e:Employee)
+MATCH (e)-[:SOLD]->(o:Order)-[:CONTAINS]->(p2:Product)
+WITH e, p2, COUNT(DISTINCT o) AS orderCount
+ORDER BY orderCount DESC
+RETURN e.employeeID AS employeeID, p2.productName AS otherProduct, orderCount
+LIMIT 5
+"
+| {
+"otherProduct": "Gumbär Gummibärchen",
+"employeeID": "3",
+"orderCount": 12
+}
+| "cypher
+MATCH (p:Product {productName: 'Chocolade'})<-[:CONTAINS]-(:Order)<-[:SOLD]-(e:Employee)
+MATCH (e)-[:SOLD]->(o:Order)-[:CONTAINS]->(p2:Product)
+WITH e, p2, COUNT(DISTINCT o) AS orderCount
+ORDER BY orderCount DESC
+RETURN e.employeeID AS employeeID, p2.productName AS otherProduct, orderCount
+LIMIT 5
+"
+| {
+"otherProduct": "Flotemysost",
+"employeeID": "1",
+"orderCount": 12
+}
+| "cypher
+MATCH (p:Product {productName: 'Chocolade'})<-[:CONTAINS]-(:Order)<-[:SOLD]-(e:Employee)
+MATCH (e)-[:SOLD]->(o:Order)-[:CONTAINS]->(p2:Product)
+WITH e, p2, COUNT(DISTINCT o) AS orderCount
+ORDER BY orderCount DESC
+RETURN e.employeeID AS employeeID, p2.productName AS otherProduct, orderCount
+LIMIT 5
+"
+| {
+"otherProduct": "Pavlova",
+"employeeID": "1",
+"orderCount": 11
+}
+|===
+
+Compared to using the procedure without the natural language schema description, the output has fewer hallucinations,
+such as properties attributed to the wrong labels or relationships linked to the wrong entities.
+
+
 == Describe the graph model with natural language
 
 This procedure `apoc.ml.schema` returns a description, in natural language, of the underlying dataset.
@@ -126,6 +228,7 @@ RETURN *
 1 row
 ----
 
+
 .Input Parameters
 [%autowidth, opts=header]
 |===
@@ -205,6 +308,7 @@ RETURN DISTINCT a.name
 | apiKey | OpenAI API key | in case `apoc.openai.key` is not defined
 | model | The Open AI model | no, default `gpt-3.5-turbo`
 | sample | The number of nodes to skip, e.g. a sample of 1000 will read every 1000th node. It's used as a parameter to `apoc.meta.data` procedure that computes the schema | no, default is a random number
+| additionalPrompts | Other prompts (a list of maps with `role` and `content` keys) to be passed along with the request to improve it | no, by default no additional prompt is sent
 |===
 
 .Results
@@ -214,6 +318,47 @@ RETURN DISTINCT a.name
 | value | the description of the dataset
 |===
 
+
+We can use the `additionalPrompts` config to improve the request, e.g. by adding a natural language description of the schema (for instance, the output of the `apoc.ml.schema` procedure).
+Since OpenAI models are mainly trained to process natural language rather than Cypher queries, using this configuration can yield better results.
+For example, given the https://neo4j.com/docs/getting-started/appendix/tutorials/guide-import-relational-and-etl/[Northwind dataset] we can execute:
+
+.Query call
+[source,cypher]
+----
+CALL apoc.ml.schema({apiKey: $apiKey}) YIELD value
+WITH value
+CALL apoc.ml.cypher("Which 5 employees had sold the product 'Chocolade' and has the highest selling count of another product? 
+  Please returns the employee identificator, the other product name and the count orders of another product",
+{
+  count: 1,
+  apiKey: $apiKey,
+  additionalPrompts: [
+    {role: "system", content: "The human description of the schema is the following:\n" + value}
+  ]
+})
+YIELD value RETURN value
+----
+
+with a result similar to the following.
+
+NOTE: the results are not deterministic and will potentially change each time the query is re-executed
+
+.Results
+[%autowidth, opts=header]
+|===
+| value
+| MATCH (p:Product {productName: 'Chocolade'})<-[:CONTAINS]-(o:Order)<-[:SOLD]-(e:Employee)
+MATCH (e)-[:SOLD]->(o2:Order)-[:CONTAINS]->(p2:Product)
+WITH e, p2, COUNT(DISTINCT o2) AS ordersCnt
+ORDER BY ordersCnt DESC
+RETURN e.employeeID AS employeeID, p2.productName AS otherProduct, ordersCnt
+LIMIT 5
+|===
+
+Compared to using the procedure without the natural language schema description, the output has fewer hallucinations,
+such as properties attributed to the wrong labels or relationships linked to the wrong entities.
+
 == Create a natural language query explanation from a cypher query
 
 This procedure `apoc.ml.fromCypher` takes a natural language question and transforms it into natural language query explanation.
diff --git a/extended/src/main/java/apoc/ml/Prompt.java b/extended/src/main/java/apoc/ml/Prompt.java
index e35a82c792..dc7fd96c39 100644
--- a/extended/src/main/java/apoc/ml/Prompt.java
+++ b/extended/src/main/java/apoc/ml/Prompt.java
@@ -3,9 +3,11 @@
 import apoc.ApocConfig;
 import apoc.Extended;
 import apoc.result.StringResult;
+import apoc.util.CollectionUtils;
 import apoc.util.Util;
 import apoc.util.collection.Iterators;
 import com.fasterxml.jackson.core.JsonProcessingException;
+import org.apache.commons.collections.ListUtils;
 import org.apache.commons.text.WordUtils;
 import org.jetbrains.annotations.NotNull;
 import org.neo4j.graphdb.Entity;
@@ -148,6 +150,7 @@ private void augmentEntity(RagConfig config, String[] objects, StringBuilder con
     public static final String EXPLAIN_SCHEMA_PROMPT = """
             You are an expert in the Neo4j graph database and graph data modeling and have experience in a wide variety of business domains.
             Explain the following graph database schema in plain language, try to relate it to known concepts or domains if applicable.
+            Try to explain as much as possible the nodes, relationships and properties.
             Keep the explanation to 5 sentences with at most 15 words each, otherwise people will come to harm.
             """;
 
@@ -274,8 +277,9 @@ public Stream<PromptMapResult> query(@Name("question") String question,
 
     @Procedure
     public Stream<StringResult> schema(@Name(value = "conf", defaultValue = "{}") Map<String, Object> conf) throws MalformedURLException, JsonProcessingException {
+        String schema = loadSchema(tx, conf);
         String schemaExplanation = prompt("Please explain the graph database schema to me and relate it to well known concepts and domains.",
-                EXPLAIN_SCHEMA_PROMPT, "This database schema ", loadSchema(tx, conf), conf, List.of());
+                EXPLAIN_SCHEMA_PROMPT, "This database schema ", schema, conf, List.of());
         return Stream.of(new StringResult(schemaExplanation));
     }
 
@@ -302,14 +306,19 @@ private QueryResult tryQuery(String question, Map<String, Object> conf, String s
         }
     }
 
-    private String prompt(String userQuestion, String systemPrompt, String assistantPrompt, String schema, Map<String, Object> conf, List<Map<String,String>> otherPrompts) throws JsonProcessingException, MalformedURLException {
+    private String prompt(String userQuestion, String systemPrompt, String assistantPrompt, String schema, Map<String, Object> conf, List<Map<String,String>> otherPromptsFromRetries) throws JsonProcessingException, MalformedURLException {
         List<Map<String, String>> prompt = new ArrayList<>();
         if (systemPrompt != null && !systemPrompt.isBlank()) prompt.add(Map.of("role", "system", "content", systemPrompt));
         if (schema != null && !schema.isBlank()) prompt.add(Map.of("role", "system", "content", "The graph database schema consists of these elements\n" + schema));
+        
+        List<Map<String, String>> additionalPrompts = (List<Map<String, String>>) conf.get("additionalPrompts");
+        if (CollectionUtils.isNotEmpty(additionalPrompts)) {
+            prompt.addAll(additionalPrompts);
+        }
         if (userQuestion != null && !userQuestion.isBlank()) prompt.add(Map.of("role", "user", "content", userQuestion));
         if (assistantPrompt != null && !assistantPrompt.isBlank()) prompt.add(Map.of("role", "assistant", "content", assistantPrompt));
 
-        prompt.addAll(otherPrompts);
+        prompt.addAll(otherPromptsFromRetries);
         
         String apiKey = (String) conf.get(API_KEY_CONF);
         String model = (String) conf.getOrDefault("model", "gpt-3.5-turbo");
@@ -357,12 +366,20 @@ private String prompt(String userQuestion, String systemPrompt, String assistant
             """ + SCHEMA_FROM_META_DATA;
     
     private final static String SCHEMA_PROMPT = """
-                nodes:
-                %s
-                relationships:
-                %s
-                patterns:
-                %s
+            nodes:
+            ```
+            %s
+            ```
+
+            relationships:
+            ```
+            %s
+            ```
+
+            patterns:
+            ```
+            %s
+            ```
             """;
 
 
diff --git a/extended/src/main/java/apoc/util/ExtendedUtil.java b/extended/src/main/java/apoc/util/ExtendedUtil.java
index 1190b0e371..319430caa1 100644
--- a/extended/src/main/java/apoc/util/ExtendedUtil.java
+++ b/extended/src/main/java/apoc/util/ExtendedUtil.java
@@ -337,5 +337,11 @@ public static float[] listOfNumbersToFloatArray(List<? extends Number> embedding
         }
         return floats;
     }
+
+    public static List<String> splitSemicolonAndRemoveBlanks(String value) { // splits a script into its statements
+        return Arrays.stream(value.split(";\\R")) // `\R` matches any line break, so CRLF-terminated scripts are split too
+                .filter(i -> !i.isBlank()) // drop fragments made only of whitespace
+                .toList();
+    }
             
 }
diff --git a/extended/src/test/java/apoc/ml/PromptIT.java b/extended/src/test/java/apoc/ml/PromptIT.java
index 3930502173..fd33c8f721 100644
--- a/extended/src/test/java/apoc/ml/PromptIT.java
+++ b/extended/src/test/java/apoc/ml/PromptIT.java
@@ -13,6 +13,7 @@
 import org.junit.BeforeClass;
 import org.junit.Rule;
 import org.junit.Test;
+import org.neo4j.graphdb.Result;
 import org.neo4j.graphdb.Transaction;
 import org.neo4j.test.rule.DbmsRule;
 import org.neo4j.test.rule.ImpermanentDbmsRule;
@@ -25,6 +26,7 @@
 import static apoc.ml.Prompt.API_KEY_CONF;
 import static apoc.ml.Prompt.UNKNOWN_ANSWER;
 import static apoc.ml.RagConfig.*;
+import static apoc.util.ExtendedUtil.splitSemicolonAndRemoveBlanks;
 import static apoc.util.MapUtil.map;
 import static apoc.util.TestUtil.testCall;
 import static apoc.util.TestUtil.testResult;
@@ -33,7 +35,6 @@
 import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.fail;
 import static org.junit.jupiter.api.Assertions.assertEquals;
-import static org.junit.Assert.fail;
 
 public class PromptIT {
 
@@ -63,7 +64,6 @@ public static void check() {
 
     @Before
     public void setUp() {
-        TestUtil.registerProcedure(db, Prompt.class, OpenAI.class, Meta.class, Strings.class, Coll.class);
         String movies = Util.readResourceFile("movies.cypher");
         try (Transaction tx = db.beginTx()) {
             tx.execute(movies);
@@ -75,7 +75,23 @@ public void setUp() {
             tx.execute(rag);
             tx.commit();
         }
+        TestUtil.registerProcedure(db, Prompt.class, OpenAI.class, Meta.class, Strings.class, Coll.class);
+        
+        String northwindEntities = Util.readResourceFile("northwind_dataset.cypher");
+        try (Transaction tx = db.beginTx()) {
+            for (String query: splitSemicolonAndRemoveBlanks(northwindEntities)) {
+                tx.execute(query);
+            }
+            tx.commit();
+        }
 
+        String northwindSchema = Util.readResourceFile("northwind_schema.cypher");
+        try (Transaction tx = db.beginTx()) {
+            for (String query: splitSemicolonAndRemoveBlanks(northwindSchema)) {
+                tx.execute(query);
+            }
+            tx.commit();
+        }
     }
 
     @Test
@@ -88,16 +104,24 @@ public void testQuery() {
                         "retries", 2L,
                         "apiKey", OPENAI_KEY
                 ),
-                (r) -> {
-                    List<Map<String, Object>> list = r.stream().toList();
-                    Assertions.assertThat(list).hasSize(12);
-                    Assertions.assertThat(list.stream()
-                                    .map(m -> m.get("query"))
-                                    .filter(Objects::nonNull)
-                                    .map(Object::toString)
-                                    .map(String::trim))
-                            .isNotEmpty();
-                });
+                r -> testQueryAssertions(r, 12)
+        );
+    }
+
+    private void testQueryAssertions(Result r, Integer size) {
+        List<Map<String, Object>> list = r.stream().toList();
+        // `size` is the exact expected row count; null means that only a non-empty result is required
+        if (size == null) {
+            Assertions.assertThat(list).isNotEmpty();
+        } else {
+            Assertions.assertThat(list).hasSize(size);
+        }
+        Assertions.assertThat(list.stream() // at least one row must carry a non-null `query` value
+                        .map(m -> m.get("query"))
+                        .filter(Objects::nonNull)
+                        .map(Object::toString)
+                        .map(String::trim))
+                .isNotEmpty();
     }
 
     @Test
@@ -143,17 +167,107 @@ public void testCypher() {
                         "apiKey", OPENAI_KEY
                 ),
                 (r) -> {
-                    List<Map<String, Object>> list = r.stream().toList();
-                    Assertions.assertThat(list).hasSize((int) numOfQueries);
-                    Assertions.assertThat(list.stream()
-                                    .map(m -> m.get("query"))
-                                    .filter(Objects::nonNull)
-                                    .map(Object::toString)
-                                    .filter(StringUtils::isNotEmpty))
-                            .hasSize((int) numOfQueries);
+                    testCypherAssertions((int) numOfQueries, r);
                 });
     }
 
+    private void testCypherAssertions(int numOfQueries, Result r) {
+        List<Map<String, Object>> list = r.stream().toList();
+        // expects exactly `numOfQueries` rows, each of them with a non-null, non-empty `query` value
+        Assertions.assertThat(list).hasSize(numOfQueries);
+        Assertions.assertThat(list.stream()
+                        .map(m -> m.get("query"))
+                        .filter(Objects::nonNull)
+                        .map(Object::toString)
+                        .filter(StringUtils::isNotEmpty))
+                .hasSize(numOfQueries);
+    }
+
+    @Test
+    public void testCypherWithSchemaExplanationAndQuestionAboutCrossSellingCount() {
+        // 5 is the exact row count expected from `apoc.ml.query` (presumably because the question asks for 5 employees)
+        String question = "Which 5 employees had sold the product 'Chocolade' and has the highest selling count of another product? " +
+                          "Please returns the employee identificator, the other product name and the count orders of another product";
+        testCypherWithSchemaCommon(question, 5);
+    }
+
+    @Test
+    public void testCypherWithSchemaExplanationAndQuestionAboutEmployeeOrganization() {
+        // null expected size: the `apoc.ml.query` result is only required to be non-empty
+        String question = "How are Employees organized? Who reports to whom?";
+        testCypherWithSchemaCommon(question, null);
+    }
+
+    @Test
+    public void testCypherWithSchemaExplanationAndQuestionAboutEmployeeReport() {
+        // null expected size: the `apoc.ml.query` result is only required to be non-empty
+        String question = "Which Employees report to each other indirectly?";
+        testCypherWithSchemaCommon(question,  null);
+    }
+
+    @Test
+    public void testCypherWithSchemaExplanationAndQuestionAboutHierarchy() {
+        // null expected size: the `apoc.ml.query` result is only required to be non-empty
+        String question = "How many orders were made by each part of the hierarchy?\n";
+        testCypherWithSchemaCommon(question, null);
+    }
+
+    private void testCypherWithSchemaCommon(String question, Integer size) { // shared scenario: same question with and without the schema description
+        long numOfQueries = 4L;
+        String schema = TestUtil.singleResultFirstColumn(db, "CALL apoc.ml.schema({apiKey: $apiKey})",
+                Map.of("apiKey", OPENAI_KEY));
+        // wrap the natural language description returned by apoc.ml.schema in a fenced block
+        String humanDescriptionSchema = "The human description of the schema is the following:" +
+                                        "```\n%s\n```"
+                                                .formatted(schema);
+        // the description is passed as an extra "system" message through the `additionalPrompts` config
+        List<Map<String, String>> additionalPrompts = List.of(
+                Map.of("role", "system", "content", humanDescriptionSchema)
+        );
+        // 1) apoc.ml.cypher without the schema description (baseline)
+        testResult(db, """
+                CALL apoc.ml.cypher($query, {count: $numOfQueries, apiKey: $apiKey})
+                """,
+                Map.of(
+                        "query", question,
+                        "numOfQueries", numOfQueries,
+                        "apiKey", OPENAI_KEY
+                ),
+                (r) -> testCypherAssertions((int) numOfQueries, r)
+        );
+        // 2) apoc.ml.cypher with the schema description
+        testResult(db, "CALL apoc.ml.cypher($query, {count: $numOfQueries, apiKey: $apiKey, additionalPrompts: $additionalPrompts})",
+                Map.of(
+                        "query", question,
+                        "numOfQueries", numOfQueries,
+                        "apiKey", OPENAI_KEY,
+                        "additionalPrompts", additionalPrompts
+                ),
+                (r) -> testCypherAssertions((int) numOfQueries, r)
+        );
+        // 3) apoc.ml.query without the schema description; `size` (nullable) is the expected row count
+        testResult(db, """
+                CALL apoc.ml.query($query, {apiKey: $apiKey, retries: $retries, retryWithError: true}) YIELD query
+                """,
+                Map.of(
+                        "query", question,
+                        "retries", 10L,
+                        "apiKey", OPENAI_KEY
+                ),
+                r -> testQueryAssertions(r, size)
+        );
+        // 4) apoc.ml.query with the schema description
+        testResult(db, "CALL apoc.ml.query($query, {apiKey: $apiKey, additionalPrompts: $additionalPrompts, retries: $retries, retryWithError: true}) YIELD query ",
+                Map.of(
+                        "query", question,
+                        "retries", 10L,
+                        "apiKey", OPENAI_KEY,
+                        "additionalPrompts", additionalPrompts
+                ),
+                r -> testQueryAssertions(r, size)
+        );
+    }
+
     @Test
     public void testFromCypher() {
         testCall(db, """
diff --git a/extended/src/test/resources/northwind_dataset.cypher b/extended/src/test/resources/northwind_dataset.cypher
new file mode 100644
index 0000000000..c8866256bc
--- /dev/null
+++ b/extended/src/test/resources/northwind_dataset.cypher
@@ -0,0 +1,61 @@
+// Create Order nodes, one per distinct OrderID (MERGE keeps the load repeatable)
+LOAD CSV WITH HEADERS FROM 'https://gist.githubusercontent.com/jexp/054bc6baf36604061bf407aa8cd08608/raw/8bdd36dfc88381995e6823ff3f419b5a0cb8ac4f/orders.csv' AS row
+MERGE (order:Order {orderID: row.OrderID})
+  ON CREATE SET order.shipName = row.ShipName;
+
+// Create Product nodes with their name and unit price (converted to float)
+LOAD CSV WITH HEADERS FROM 'https://gist.githubusercontent.com/jexp/054bc6baf36604061bf407aa8cd08608/raw/8bdd36dfc88381995e6823ff3f419b5a0cb8ac4f/products.csv' AS row
+MERGE (product:Product {productID: row.ProductID})
+  ON CREATE SET product.productName = row.ProductName, product.unitPrice = toFloat(row.UnitPrice);
+
+// Create Supplier nodes with their company name
+LOAD CSV WITH HEADERS FROM 'https://gist.githubusercontent.com/jexp/054bc6baf36604061bf407aa8cd08608/raw/8bdd36dfc88381995e6823ff3f419b5a0cb8ac4f/suppliers.csv' AS row
+MERGE (supplier:Supplier {supplierID: row.SupplierID})
+  ON CREATE SET supplier.companyName = row.CompanyName;
+
+
+// Create Employee nodes with first name, last name and title
+LOAD CSV WITH HEADERS FROM 'https://gist.githubusercontent.com/jexp/054bc6baf36604061bf407aa8cd08608/raw/8bdd36dfc88381995e6823ff3f419b5a0cb8ac4f/employees.csv' AS row
+MERGE (e:Employee {employeeID:row.EmployeeID})
+  ON CREATE SET e.firstName = row.FirstName, e.lastName = row.LastName, e.title = row.Title;
+
+
+// Create Category nodes with name and description
+LOAD CSV WITH HEADERS FROM 'https://gist.githubusercontent.com/jexp/054bc6baf36604061bf407aa8cd08608/raw/8bdd36dfc88381995e6823ff3f419b5a0cb8ac4f/categories.csv' AS row
+MERGE (c:Category {categoryID: row.CategoryID})
+  ON CREATE SET c.categoryName = row.CategoryName, c.description = row.Description;
+
+
+// Link each order to its products: (:Order)-[:CONTAINS {unitPrice, quantity}]->(:Product)
+LOAD CSV WITH HEADERS FROM 'https://gist.githubusercontent.com/jexp/054bc6baf36604061bf407aa8cd08608/raw/8bdd36dfc88381995e6823ff3f419b5a0cb8ac4f/orders.csv' AS row
+MATCH (order:Order {orderID: row.OrderID})
+MATCH (product:Product {productID: row.ProductID})
+MERGE (order)-[op:CONTAINS]->(product)
+  ON CREATE SET op.unitPrice = toFloat(row.UnitPrice), op.quantity = toFloat(row.Quantity);
+
+
+// Link each employee to the orders they sold: (:Employee)-[:SOLD]->(:Order)
+LOAD CSV WITH HEADERS FROM 'https://gist.githubusercontent.com/jexp/054bc6baf36604061bf407aa8cd08608/raw/8bdd36dfc88381995e6823ff3f419b5a0cb8ac4f/orders.csv' AS row
+MATCH (order:Order {orderID: row.OrderID})
+MATCH (employee:Employee {employeeID: row.EmployeeID})
+MERGE (employee)-[:SOLD]->(order);
+
+
+// Link each supplier to its products: (:Supplier)-[:SUPPLIES]->(:Product)
+LOAD CSV WITH HEADERS FROM 'https://gist.githubusercontent.com/jexp/054bc6baf36604061bf407aa8cd08608/raw/8bdd36dfc88381995e6823ff3f419b5a0cb8ac4f/products.csv' AS row
+MATCH (product:Product {productID: row.ProductID})
+MATCH (supplier:Supplier {supplierID: row.SupplierID})
+MERGE (supplier)-[:SUPPLIES]->(product);
+
+// Link each product to its category: (:Product)-[:PART_OF]->(:Category)
+LOAD CSV WITH HEADERS FROM 'https://gist.githubusercontent.com/jexp/054bc6baf36604061bf407aa8cd08608/raw/8bdd36dfc88381995e6823ff3f419b5a0cb8ac4f/products.csv' AS row
+MATCH (product:Product {productID: row.ProductID})
+MATCH (category:Category {categoryID: row.CategoryID})
+MERGE (product)-[:PART_OF]->(category);
+
+
+// Reporting hierarchy: (:Employee)-[:REPORTS_TO]->(:Employee), rows whose ReportsTo matches no employee produce no relationship
+LOAD CSV WITH HEADERS FROM 'https://gist.githubusercontent.com/jexp/054bc6baf36604061bf407aa8cd08608/raw/8bdd36dfc88381995e6823ff3f419b5a0cb8ac4f/employees.csv' AS row
+MATCH (employee:Employee {employeeID: row.EmployeeID})
+MATCH (manager:Employee {employeeID: row.ReportsTo})
+MERGE (employee)-[:REPORTS_TO]->(manager);
\ No newline at end of file
diff --git a/extended/src/test/resources/northwind_schema.cypher b/extended/src/test/resources/northwind_schema.cypher
new file mode 100644
index 0000000000..75f8efd448
--- /dev/null
+++ b/extended/src/test/resources/northwind_schema.cypher
@@ -0,0 +1,6 @@
+CREATE INDEX product_id IF NOT EXISTS FOR (p:Product) ON (p.productID);
+CREATE INDEX product_name IF NOT EXISTS FOR (p:Product) ON (p.productName);
+CREATE INDEX supplier_id IF NOT EXISTS FOR (s:Supplier) ON (s.supplierID);
+CREATE INDEX employee_id IF NOT EXISTS FOR (e:Employee) ON (e.employeeID);
+CREATE INDEX category_id IF NOT EXISTS FOR (c:Category) ON (c.categoryID);
+CREATE CONSTRAINT order_id IF NOT EXISTS FOR (o:Order) REQUIRE o.orderID IS UNIQUE;
\ No newline at end of file