From f2562fe2ca1da7e1704aa2a5b0473be08e5f296c Mon Sep 17 00:00:00 2001
From: Johannes Kalmbach <joka921@users.noreply.github.com>
Date: Fri, 31 Jan 2025 02:27:50 +0100
Subject: [PATCH] Don't use the exact size from the cache for query planning
 (#1736)

So far, when a part of a query was in the cache, the query planner used the exact size as cost estimate. This sounds natural because the exact size is by definition the best estimate. However, under special circumstances, this might lead to poor query plans when the other estimates are off (it's not important that the individual estimates are good, it's important that together they lead to a reasonable query plan).

NOTE: This is a trivial change. A bigger issue that we might want to address in the future is that it might not be optimal to compute the average multiplicity of a column as the average multiplicity of it's distinct elements. Instead the average of the squares might be more appropriate (note that the sum of the squares of the individual multiplicities is exactly the size of the result one would get when joining that column with itself).
---
 src/engine/QueryExecutionTree.cpp | 14 +++++---------
 test/OperationTest.cpp            |  6 +++---
 2 files changed, 8 insertions(+), 12 deletions(-)

diff --git a/src/engine/QueryExecutionTree.cpp b/src/engine/QueryExecutionTree.cpp
index 9b58d0bb78..49ceb340aa 100644
--- a/src/engine/QueryExecutionTree.cpp
+++ b/src/engine/QueryExecutionTree.cpp
@@ -91,15 +91,11 @@ size_t QueryExecutionTree::getCostEstimate() {
 // _____________________________________________________________________________
 size_t QueryExecutionTree::getSizeEstimate() {
   if (!sizeEstimate_.has_value()) {
-    if (cachedResult_) {
-      AD_CORRECTNESS_CHECK(cachedResult_->isFullyMaterialized());
-      sizeEstimate_ = cachedResult_->idTable().size();
-    } else {
-      // if we are in a unit test setting and there is no QueryExecutionContest
-      // specified it is the rootOperation_'s obligation to handle this case
-      // correctly
-      sizeEstimate_ = rootOperation_->getSizeEstimate();
-    }
+    // Note: Previously we used the exact size instead of the estimate for
+    // results that were already in the cache. This however often lead to poor
+    // planning, because the query planner compared exact sizes with estimates,
+    // which lead to worse plans than just conistently choosing the estimate.
+    sizeEstimate_ = rootOperation_->getSizeEstimate();
   }
   return sizeEstimate_.value();
 }
diff --git a/test/OperationTest.cpp b/test/OperationTest.cpp
index 2afbe38a83..e39b9312e0 100644
--- a/test/OperationTest.cpp
+++ b/test/OperationTest.cpp
@@ -242,12 +242,12 @@ TEST(OperationTest, estimatesForCachedResults) {
     [[maybe_unused]] auto res = qet->getResult();
   }
   // The result is now cached inside the static execution context, if we create
-  // the same operation again, the cost estimate is 0 and the size estimate is
-  // exact (3 rows).
+  // the same operation again, the cost estimate is 0. The size estimate doesn't
+  // change (see the `getCostEstimate` function for details on why).
   {
     auto qet = makeQet();
     EXPECT_EQ(qet->getCacheKey(), qet->getRootOperation()->getCacheKey());
-    EXPECT_EQ(qet->getSizeEstimate(), 3u);
+    EXPECT_EQ(qet->getSizeEstimate(), 24u);
     EXPECT_EQ(qet->getCostEstimate(), 0u);
   }
 }