Merge pull request #53 from weaviate/fix-queries

Fix queries
weaviate · Jan 23, 2025 · 4c11e7b · 4c11e7b
2 parents 704dcee + 72ec7a1
commit 4c11e7b
Show file tree

Hide file tree

Showing 45 changed files with 385,095 additions and 20,328 deletions.
diff --git a/app/backend/synthetic-weaviate-queries-with-results.json b/app/backend/synthetic-weaviate-queries-with-results.json
diff --git a/app/frontend/src/components/QueryVisualizer.js b/app/frontend/src/components/QueryVisualizer.js
@@ -788,7 +788,7 @@ const QueryVisualizer = () => {
                       <span className="text-xl mt-2 block">{currentItem.query.corresponding_natural_language_query}</span>
                     </p>
                   )}
-                  <h2 className="font-semibold text-2xl text-[#1c1468]">Query APIs utilized</h2>
+                  <h2 className="font-semibold text-2xl text-[#1c1468]">Query APIs Utilized</h2>
                   {currentItem.query.target_collection && (
                     <p>
                       <span className="font-semibold">Collection:</span>{' '}
@@ -854,6 +854,22 @@ const QueryVisualizer = () => {
                     </p>
                   )}
                 </div>
+
+                <div className="mt-6">
+                  <h2 className="font-semibold text-2xl text-[#1c1468]">Query Validation</h2>
+                  <div className="mt-2">
+                    <p className="font-semibold">
+                      LLM-as-Judge Query Assessment:{' '}
+                      <span className={currentItem.is_valid ? 'text-green-600' : 'text-red-600'}>
+                        {currentItem.is_valid ? 'Valid' : 'Invalid'}
+                      </span>
+                    </p>
+                    <p className="mt-2">
+                      {currentItem.verification_rationale}
+                    </p>
+                  </div>
+                </div>
+
                 {renderQueryResult(currentItem.ground_truth_query_result)}
               </>
             )}

diff --git a/data/OLD-synthetic-weaviate-queries-with-results.json b/data/OLD-synthetic-weaviate-queries-with-results.json
diff --git a/data/analyze-queries.py b/data/analyze-queries.py
@@ -0,0 +1,95 @@
+import json
+from collections import Counter, defaultdict
+from typing import Dict, List, Set
+import itertools
+
+def _percentage(count: int, total: int) -> float:
+    """Return ``count`` as a percentage of ``total``, or 0.0 when ``total`` is zero."""
+    return (count / total) * 100 if total else 0.0
+
+
+def analyze_operator_distribution(data: List[Dict]) -> None:
+    """
+    Print a report on how operators are distributed across the generated queries.
+
+    Each item in ``data`` is read through three keys: ``is_valid`` (truthy when
+    the query counts as valid), ``database_schema`` (any JSON-serializable
+    value) and ``ground_truth_operators`` (an iterable of operator names).
+
+    The report, written to stdout, has four sections: overall totals,
+    per-operator counts, per-combination counts, and a completeness check that
+    lists every expected operator combination that never occurs in ``data``.
+
+    An empty ``data`` list is reported with 0.0% figures instead of raising
+    ``ZeroDivisionError``.
+    """
+    total_queries = len(data)
+    valid_queries = sum(1 for item in data if item['is_valid'])
+    operator_counts = Counter()
+    operator_combinations = Counter()
+    schemas_covered = set()
+
+    for item in data:
+        # Serialize with sorted keys so equal schemas compare equal regardless
+        # of key order, and so they become hashable for the set.
+        schemas_covered.add(json.dumps(item['database_schema'], sort_keys=True))
+
+        # De-duplicate so an operator listed twice in one query counts once.
+        operators = set(item['ground_truth_operators'])
+        operator_counts.update(operators)
+
+        # A sorted tuple is an order-independent, hashable key for the combination.
+        operator_combinations[tuple(sorted(operators))] += 1
+
+    print("\n=== Query Generation Analysis ===")
+    print(f"\nTotal Queries: {total_queries}")
+    print(f"Valid Queries: {valid_queries} ({_percentage(valid_queries, total_queries):.1f}%)")
+    print(f"Unique Schemas Used: {len(schemas_covered)}")
+
+    print("\n=== Individual Operator Distribution ===")
+    for operator, count in sorted(operator_counts.items()):
+        print(f"{operator}: {count} ({_percentage(count, total_queries):.1f}%)")
+
+    print("\n=== Operator Combination Distribution ===")
+    # Largest combinations first; ties are broken alphabetically.
+    for combo, count in sorted(operator_combinations.items(), key=lambda x: (-len(x[0]), x[0])):
+        print(f"{' + '.join(combo)}: {count} ({_percentage(count, total_queries):.1f}%)")
+
+    print("\n=== Completeness Analysis ===")
+    operator_types = {
+        'search': ['search_query'],
+        'filter': ['integer_property_filter', 'text_property_filter', 'boolean_property_filter'],
+        'aggregation': ['integer_property_aggregation', 'text_property_aggregation', 'boolean_property_aggregation'],
+        'group': ['groupby_property'],
+    }
+
+    # An expected combination picks one operator from each type in some
+    # non-empty subset of the operator types.
+    all_possible_combinations = set()
+    for r in range(1, len(operator_types) + 1):
+        for type_combo in itertools.combinations(operator_types, r):
+            type_operators = [operator_types[t] for t in type_combo]
+            for op_combo in itertools.product(*type_operators):
+                all_possible_combinations.add(tuple(sorted(op_combo)))
+
+    actual_combinations = set(operator_combinations)
+    missing_combinations = all_possible_combinations - actual_combinations
+
+    print(f"\nFound {len(actual_combinations)} unique operator combinations")
+    print(f"Expected {len(all_possible_combinations)} possible combinations")
+
+    if missing_combinations:
+        print("\nMissing combinations:")
+        # Smallest combinations first; ties are broken alphabetically.
+        for combo in sorted(missing_combinations, key=lambda x: (len(x), x)):
+            print(f"- {' + '.join(combo)}")
+    else:
+        print("\nAll possible operator combinations are present!")
+
+def main():
+    """
+    Load the generated queries from the current directory and print the analysis.
+
+    Reads ``synthetic-weaviate-queries-with-results.json`` relative to the
+    working directory and passes the parsed content to
+    ``analyze_operator_distribution``. Failures are reported on stdout rather
+    than raised, so the script ends without a traceback.
+    """
+    try:
+        # JSON text is UTF-8; be explicit instead of relying on the platform's
+        # default encoding, which can mis-decode non-ASCII content.
+        with open('synthetic-weaviate-queries-with-results.json', 'r', encoding='utf-8') as f:
+            data = json.load(f)
+        analyze_operator_distribution(data)
+    except FileNotFoundError:
+        print("Error: Could not find the queries file. Make sure it's in the current directory.")
+    except json.JSONDecodeError:
+        print("Error: Could not parse the JSON file. Make sure it's properly formatted.")
+    except Exception as e:
+        # Last-resort boundary: this also covers errors raised by the analysis itself.
+        print(f"An unexpected error occurred: {str(e)}")
+
+# Run the analysis only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+    main()
diff --git a/data/synthetic-weaviate-queries-with-results.json b/data/synthetic-weaviate-queries-with-results.json
diff --git a/data/synthetic-weaviate-queries-with-schemas.json b/data/synthetic-weaviate-queries-with-schemas.json
diff --git a/...esult-postprocessing/create-radar-plot.py → experimental-results/create-radar-plot.py b/...esult-postprocessing/create-radar-plot.py → experimental-results/create-radar-plot.py
diff --git a/...-results/Llama-3.1-8B-Instruct-Turbo.json → ...s/legacy/Llama-3.1-8B-Instruct-Turbo.json b/...-results/Llama-3.1-8B-Instruct-Turbo.json → ...s/legacy/Llama-3.1-8B-Instruct-Turbo.json
diff --git a/experimental-results/claude-3-5-sonnet.json → ...tal-results/legacy/claude-3-5-sonnet.json b/experimental-results/claude-3-5-sonnet.json → ...tal-results/legacy/claude-3-5-sonnet.json
diff --git a/experimental-results/command-r-plus.json → ...mental-results/legacy/command-r-plus.json b/experimental-results/command-r-plus.json → ...mental-results/legacy/command-r-plus.json
diff --git a/experimental-results/command-r7b.json → experimental-results/legacy/command-r7b.json b/experimental-results/command-r7b.json → experimental-results/legacy/command-r7b.json
diff --git a/experimental-results/gemini-1.5-pro.json → ...mental-results/legacy/gemini-1.5-pro.json b/experimental-results/gemini-1.5-pro.json → ...mental-results/legacy/gemini-1.5-pro.json
diff --git a/...imental-results/gemini-2.0-flash-exp.json → ...-results/legacy/gemini-2.0-flash-exp.json b/...imental-results/gemini-2.0-flash-exp.json → ...-results/legacy/gemini-2.0-flash-exp.json
diff --git a/experimental-results/gpt-4o-mini.json → experimental-results/legacy/gpt-4o-mini.json b/experimental-results/gpt-4o-mini.json → experimental-results/legacy/gpt-4o-mini.json
diff --git a/experimental-results/gpt-4o.json → experimental-results/legacy/gpt-4o.json b/experimental-results/gpt-4o.json → experimental-results/legacy/gpt-4o.json
diff --git a/...ental-results/visualization-py/output.png → experimental-results/output.png b/...ental-results/visualization-py/output.png → experimental-results/output.png
diff --git a/...ts/result-postprocessing/print-queries.py → experimental-results/print-queries.py b/...ts/result-postprocessing/print-queries.py → experimental-results/print-queries.py
diff --git a/...sults/result-postprocessing/radar-plot.js → experimental-results/radar-plot.js b/...sults/result-postprocessing/radar-plot.js → experimental-results/radar-plot.js
diff --git a/...al-results/visualization-py/radar-plot.py → experimental-results/radar-plot.py b/...al-results/visualization-py/radar-plot.py → experimental-results/radar-plot.py
diff --git a/experimental-results/readme.md b/experimental-results/readme.md
@@ -1,25 +1,64 @@
-# Experimental Results
-
-```python
-class QueryPredictionResult(BaseModel):
-    query_index: int
-    database_schema_index: int
-    natural_language_query: str
-    ground_truth_query: WeaviateQueryWithSchema
-    predicted_query: Optional[WeaviateQuery]
-    ast_score: float
-    error: Optional[str]
-
-class ExperimentSummary(BaseModel):
-    timestamp: str
-    model_name: str
-    generate_with_models: bool
-    total_queries: int
-    successful_predictions: int
-    failed_predictions: int
-    average_ast_score: float
-    per_schema_scores: Dict[int, float]
-    detailed_results: List[QueryPredictionResult]
-```
-
-![Weaviate Gorilla](../visuals/weaviate-gorillas/gorilla-96.png)
+# Experiment Results
+
+## Overall Performance Comparison
+
+\begin{table}[h]
+\centering
+\begin{tabular}{|l|r|r|}
+\hline
+\textbf{Metric} & \textbf{GPT-4o} & \textbf{GPT-4o-mini} \\
+\hline
+Total Queries & 315 & 315 \\
+Successful Predictions & 304 & 308 \\
+Failed Predictions & 11 & 7 \\
+Average AST Score & 85.66\% & 83.43\% \\
+\hline
+\end{tabular}
+\caption{Overall Performance Metrics}
+\label{tab:overall-performance}
+\end{table}
+
+## Per Schema Performance
+
+\begin{table}[h]
+\centering
+\begin{tabular}{|l|r|r|}
+\hline
+\textbf{Schema} & \textbf{GPT-4o} & \textbf{GPT-4o-mini} \\
+\hline
+Schema 0 & 87.97\% & 84.14\% \\
+Schema 1 & 85.59\% & 84.10\% \\
+Schema 2 & 85.08\% & 82.30\% \\
+Schema 3 & 81.45\% & 82.23\% \\
+Schema 4 & 82.62\% & 79.18\% \\
+\hline
+\end{tabular}
+\caption{Performance Across Different Schemas}
+\label{tab:schema-performance}
+\end{table}
+
+## Component Analysis
+
+\begin{table}[h]
+\centering
+\begin{tabular}{|l|r|r|r|}
+\hline
+\textbf{Component Type} & \textbf{Sample Size} & \textbf{GPT-4o} & \textbf{GPT-4o-mini} \\
+\hline
+Search Queries & 160 & 76.77\% & 72.48\% \\
+Integer Filters & 80 & 79.28\% & 76.31\% \\
+Text Filters & 80 & 84.53\% & 85.16\% \\
+Boolean Filters & 80 & 91.44\% & 88.13\% \\
+Integer Aggregations & 80 & 82.38\% & 82.69\% \\
+Text Aggregations & 80 & 83.16\% & 78.78\% \\
+Boolean Aggregations & 80 & 87.03\% & 84.59\% \\
+GroupBy Operations & 160 & 83.53\% & 80.03\% \\
+\hline
+\end{tabular}
+\caption{Performance Analysis by Component Type}
+\label{tab:component-analysis}
+\end{table}
+
+# LaTeX
+
+![Weaviate Gorilla](../../visuals/weaviate-gorillas/gorilla-118.png)
diff --git a/experimental-results/result-postprocessing/ast-by-number-of-arguments.py b/experimental-results/result-postprocessing/ast-by-number-of-arguments.py
diff --git a/experimental-results/result-postprocessing/ast-test-postprocessor.py b/experimental-results/result-postprocessing/ast-test-postprocessor.py