Fixes #4000: Add self-explanation to the model, include the verbal sc…

…hema description to the flow
neo4j-contrib · Jul 16, 2024 · 764d9ac · 764d9ac
1 parent c49fe8a
commit 764d9ac
Show file tree

Hide file tree

Showing 6 changed files with 378 additions and 29 deletions.
diff --git a/docs/asciidoc/modules/ROOT/pages/ml/genai.adoc b/docs/asciidoc/modules/ROOT/pages/ml/genai.adoc
@@ -91,6 +91,7 @@ RETURN m.title
 | apiKey | OpenAI API key | in case `apoc.openai.key` is not defined
 | model | The Open AI model | no, default `gpt-3.5-turbo`
 | sample | The number of nodes to skip, e.g. a sample of 1000 will read every 1000th node. It's used as a parameter to `apoc.meta.data` procedure that computes the schema | no, default is a random number
+| additionalPrompts | A list of additional prompts (maps with `role` and `content` keys) to be passed to the model to improve the request | no
 |===
 
 .Results
@@ -102,6 +103,107 @@ RETURN m.title
 |===
 
 
+We can use the `additionalPrompts` config to improve the request, e.g. by adding a natural language description of the schema (such as the output of `apoc.ml.schema`).
+Since OpenAI models are mainly trained to process natural language questions rather than Cypher queries, using this configuration makes it possible to achieve better results.
+For example, given the https://neo4j.com/docs/getting-started/appendix/tutorials/guide-import-relational-and-etl/[Northwind dataset] we can execute:
+
+.Query call
+[source,cypher]
+----
+CALL apoc.ml.schema({apiKey: $apiKey}) YIELD value
+WITH value
+CALL apoc.ml.query("Which 5 employees had sold the product 'Chocolade' and has the highest selling count of another product?
+  Please returns the employee identificator, the other product name and the count orders of another product",
+{
+    retries: 8,
+    retryWithError: true,
+    apiKey: $apiKey,
+    additionalPrompts: [
+        {role: "system", content: "The human description of the schema is the following:\n" + value}
+    ]
+})
+YIELD query, value RETURN query, value
+----
+
+with a result similar to the following.
+
+NOTE: the results are not deterministic and will potentially change each time the query is re-executed
+
+.Results
+[%autowidth, opts=header]
+|===
+| query | value
+| "cypher
+MATCH (p:Product {productName: 'Chocolade'})<-[:CONTAINS]-(:Order)<-[:SOLD]-(e:Employee)
+MATCH (e)-[:SOLD]->(o:Order)-[:CONTAINS]->(p2:Product)
+WITH e, p2, COUNT(DISTINCT o) AS orderCount
+ORDER BY orderCount DESC
+RETURN e.employeeID AS employeeID, p2.productName AS otherProduct, orderCount
+LIMIT 5
+" 
+| {
+"otherProduct": "Gnocchi di nonna Alice",
+"employeeID": "4",
+"orderCount": 14
+}
+| "cypher
+MATCH (p:Product {productName: 'Chocolade'})<-[:CONTAINS]-(:Order)<-[:SOLD]-(e:Employee)
+MATCH (e)-[:SOLD]->(o:Order)-[:CONTAINS]->(p2:Product)
+WITH e, p2, COUNT(DISTINCT o) AS orderCount
+ORDER BY orderCount DESC
+RETURN e.employeeID AS employeeID, p2.productName AS otherProduct, orderCount
+LIMIT 5
+"
+| {
+"otherProduct": "Pâté chinois",
+"employeeID": "4",
+"orderCount": 12
+}
+| "cypher
+MATCH (p:Product {productName: 'Chocolade'})<-[:CONTAINS]-(:Order)<-[:SOLD]-(e:Employee)
+MATCH (e)-[:SOLD]->(o:Order)-[:CONTAINS]->(p2:Product)
+WITH e, p2, COUNT(DISTINCT o) AS orderCount
+ORDER BY orderCount DESC
+RETURN e.employeeID AS employeeID, p2.productName AS otherProduct, orderCount
+LIMIT 5
+"
+| {
+"otherProduct": "Gumbär Gummibärchen",
+"employeeID": "3",
+"orderCount": 12
+}
+| "cypher
+MATCH (p:Product {productName: 'Chocolade'})<-[:CONTAINS]-(:Order)<-[:SOLD]-(e:Employee)
+MATCH (e)-[:SOLD]->(o:Order)-[:CONTAINS]->(p2:Product)
+WITH e, p2, COUNT(DISTINCT o) AS orderCount
+ORDER BY orderCount DESC
+RETURN e.employeeID AS employeeID, p2.productName AS otherProduct, orderCount
+LIMIT 5
+"
+| {
+"otherProduct": "Flotemysost",
+"employeeID": "1",
+"orderCount": 12
+}
+| "cypher
+MATCH (p:Product {productName: 'Chocolade'})<-[:CONTAINS]-(:Order)<-[:SOLD]-(e:Employee)
+MATCH (e)-[:SOLD]->(o:Order)-[:CONTAINS]->(p2:Product)
+WITH e, p2, COUNT(DISTINCT o) AS orderCount
+ORDER BY orderCount DESC
+RETURN e.employeeID AS employeeID, p2.productName AS otherProduct, orderCount
+LIMIT 5
+"
+| {
+"otherProduct": "Pavlova",
+"employeeID": "1",
+"orderCount": 11
+}
+|===
+
+Compared to using the procedure without the natural language schema description, the output has fewer hallucinations,
+such as properties held by different labels and relationships linked to other entities.
+
+
 == Describe the graph model with natural language
 
 This procedure `apoc.ml.schema` returns a description, in natural language, of the underlying dataset.
@@ -126,6 +228,7 @@ RETURN *
 1 row
 ----
 
+
 .Input Parameters
 [%autowidth, opts=header]
 |===
@@ -205,6 +308,7 @@ RETURN DISTINCT a.name
 | apiKey | OpenAI API key | in case `apoc.openai.key` is not defined
 | model | The Open AI model | no, default `gpt-3.5-turbo`
 | sample | The number of nodes to skip, e.g. a sample of 1000 will read every 1000th node. It's used as a parameter to `apoc.meta.data` procedure that computes the schema | no, default is a random number
+| additionalPrompts | A list of additional prompts (maps with `role` and `content` keys) to be passed to the model to improve the request | no
 |===
 
 .Results
@@ -214,6 +318,47 @@ RETURN DISTINCT a.name
 | value | the description of the dataset
 |===
 
+
+We can use the `additionalPrompts` config to improve the request, e.g. by adding a natural language description of the schema (such as the output of `apoc.ml.schema`).
+Since OpenAI models are mainly trained to process natural language questions rather than Cypher queries, using this configuration makes it possible to achieve better results.
+For example, given the https://neo4j.com/docs/getting-started/appendix/tutorials/guide-import-relational-and-etl/[Northwind dataset] we can execute:
+
+.Query call
+[source,cypher]
+----
+CALL apoc.ml.schema({apiKey: $apiKey}) YIELD value
+WITH value
+CALL apoc.ml.cypher("Which 5 employees had sold the product 'Chocolade' and has the highest selling count of another product? 
+  Please returns the employee identificator, the other product name and the count orders of another product",
+{
+  count: 1,
+  apiKey: $apiKey,
+  additionalPrompts: [
+    {role: "system", content: "The human description of the schema is the following:\n" + value}
+  ]
+})
+YIELD value RETURN value
+----
+
+with a result similar to the following.
+
+NOTE: the results are not deterministic and will potentially change each time the query is re-executed
+
+.Results
+[%autowidth, opts=header]
+|===
+| value
+| MATCH (p:Product {productName: 'Chocolade'})<-[:CONTAINS]-(o:Order)<-[:SOLD]-(e:Employee)
+MATCH (e)-[:SOLD]->(o2:Order)-[:CONTAINS]->(p2:Product)
+WITH e, p2, COUNT(DISTINCT o2) AS ordersCnt
+ORDER BY ordersCnt DESC
+RETURN e.employeeID AS employeeID, p2.productName AS otherProduct, ordersCnt
+LIMIT 5
+|===
+
+Compared to using the procedure without the natural language schema description, the output has fewer hallucinations,
+such as properties held by different labels and relationships linked to other entities.
+
 == Create a natural language query explanation from a cypher query
 
 This procedure `apoc.ml.fromCypher` takes a Cypher query and transforms it into a natural language query explanation.

diff --git a/extended/src/main/java/apoc/ml/Prompt.java b/extended/src/main/java/apoc/ml/Prompt.java
@@ -3,9 +3,11 @@
 import apoc.ApocConfig;
 import apoc.Extended;
 import apoc.result.StringResult;
+import apoc.util.CollectionUtils;
 import apoc.util.Util;
 import apoc.util.collection.Iterators;
 import com.fasterxml.jackson.core.JsonProcessingException;
+import org.apache.commons.collections.ListUtils;
 import org.apache.commons.text.WordUtils;
 import org.jetbrains.annotations.NotNull;
 import org.neo4j.graphdb.Entity;
@@ -148,6 +150,7 @@ private void augmentEntity(RagConfig config, String[] objects, StringBuilder con
     public static final String EXPLAIN_SCHEMA_PROMPT = """
             You are an expert in the Neo4j graph database and graph data modeling and have experience in a wide variety of business domains.
             Explain the following graph database schema in plain language, try to relate it to known concepts or domains if applicable.
+            Try to explain as much as possible the nodes, relationships and properties.
             Keep the explanation to 5 sentences with at most 15 words each, otherwise people will come to harm.
             """;
 
@@ -274,8 +277,9 @@ public Stream<PromptMapResult> query(@Name("question") String question,
 
     @Procedure
     public Stream<StringResult> schema(@Name(value = "conf", defaultValue = "{}") Map<String, Object> conf) throws MalformedURLException, JsonProcessingException {
+        String schema = loadSchema(tx, conf);
         String schemaExplanation = prompt("Please explain the graph database schema to me and relate it to well known concepts and domains.",
-                EXPLAIN_SCHEMA_PROMPT, "This database schema ", loadSchema(tx, conf), conf, List.of());
+                EXPLAIN_SCHEMA_PROMPT, "This database schema ", schema, conf, List.of());
         return Stream.of(new StringResult(schemaExplanation));
     }
 
@@ -302,14 +306,19 @@ private QueryResult tryQuery(String question, Map<String, Object> conf, String s
         }
     }
 
-    private String prompt(String userQuestion, String systemPrompt, String assistantPrompt, String schema, Map<String, Object> conf, List<Map<String,String>> otherPrompts) throws JsonProcessingException, MalformedURLException {
+    private String prompt(String userQuestion, String systemPrompt, String assistantPrompt, String schema, Map<String, Object> conf, List<Map<String,String>> otherPromptsFromRetries) throws JsonProcessingException, MalformedURLException {
         List<Map<String, String>> prompt = new ArrayList<>();
         if (systemPrompt != null && !systemPrompt.isBlank()) prompt.add(Map.of("role", "system", "content", systemPrompt));
         if (schema != null && !schema.isBlank()) prompt.add(Map.of("role", "system", "content", "The graph database schema consists of these elements\n" + schema));
+
+        List<Map<String, String>> additionalPrompts = (List<Map<String, String>>) conf.get("additionalPrompts");
+        if (CollectionUtils.isNotEmpty(additionalPrompts)) {
+            prompt.addAll(additionalPrompts);
+        }
         if (userQuestion != null && !userQuestion.isBlank()) prompt.add(Map.of("role", "user", "content", userQuestion));
         if (assistantPrompt != null && !assistantPrompt.isBlank()) prompt.add(Map.of("role", "assistant", "content", assistantPrompt));
 
-        prompt.addAll(otherPrompts);
+        prompt.addAll(otherPromptsFromRetries);
 
         String apiKey = (String) conf.get(API_KEY_CONF);
         String model = (String) conf.getOrDefault("model", "gpt-3.5-turbo");
@@ -357,12 +366,20 @@ private String prompt(String userQuestion, String systemPrompt, String assistant
             """ + SCHEMA_FROM_META_DATA;
 
     private final static String SCHEMA_PROMPT = """
-                nodes:
-                %s
-                relationships:
-                %s
-                patterns:
-                %s
+            nodes:
+            ```
+            %s
+            ```
+
+            relationships:
+            ```
+            %s
+            ```
+
+            patterns:
+            ```
+            %s
+            ```
             """;
 
 

diff --git a/extended/src/main/java/apoc/util/ExtendedUtil.java b/extended/src/main/java/apoc/util/ExtendedUtil.java
@@ -337,5 +337,11 @@ public static float[] listOfNumbersToFloatArray(List<? extends Number> embedding
         }
         return floats;
     }
+
+    public static List<String> splitSemicolonAndRemoveBlanks(String value) {
+        return Arrays.stream(value.split(";\n"))
+                .filter(i -> !i.isBlank())
+                .toList();
+    }
 
 }