Skip to content

Commit

Permalink
remove OOM caused by very large execution plans
Browse files Browse the repository at this point in the history
  • Loading branch information
eiennohito committed Jan 24, 2025
1 parent acd11b5 commit 2e7bc86
Showing 1 changed file with 3 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ object MergeDedupStats {
explode($"tgtAll").as("srcHash"),
$"tgtMin".as("tgtHash")
)
).where($"srcHash" =!= $"tgtHash").distinct().persist()
).where($"srcHash" =!= $"tgtHash").distinct().localCheckpoint(eager = true)

// Step 2b: collect all repr hash candidates to consider for updating
// find all repr hashes which have distinct hashes
Expand All @@ -108,10 +108,10 @@ object MergeDedupStats {
df.join(nonUnique, "hash").select($"reprHash".as("initReprHash"))
}

val seedGroups = seedGroupsA.union(seedGroupsB).distinct().select(
val seedGroups = seedGroupsA.union(seedGroupsB).distinct().localCheckpoint(eager = true).select(
$"initReprHash",
$"initReprHash".as("newReprHash")
).persist()
)

// compute the correct remaps themselves iteratively
// this will have false positives, but hopefully not many
Expand Down

0 comments on commit 2e7bc86

Please sign in to comment.