diff --git a/posts/ptrhash-paper/plot.py b/posts/ptrhash-paper/bucket-fn.py similarity index 84% rename from posts/ptrhash-paper/plot.py rename to posts/ptrhash-paper/bucket-fn.py index 58ddd08..d64b12b 100644 --- a/posts/ptrhash-paper/plot.py +++ b/posts/ptrhash-paper/bucket-fn.py @@ -41,12 +41,15 @@ def invert(f, y): return l +lmbda = 4 + + def bucket_sz(f, y): y1 = y y2 = y + 0.001 x1 = invert(f, y1) x2 = invert(f, y2) - return (x2 - x1) * 1000 + return lmbda * (x2 - x1) * 1000 xs = [x / 1000 for x in range(1000)] @@ -73,7 +76,7 @@ def bucket_sz(f, y): # Add horizontal grid lines plt.grid(axis="y", lw=0.5) -plt.savefig("bucket-fn.svg", bbox_inches="tight") +plt.savefig("plots/bucket-fn.svg", bbox_inches="tight") plt.close() ## PLOT 2 @@ -93,11 +96,14 @@ def bucket_sz(f, y): plt.gca().spines["right"].set_visible(False) # x and y from 0 to 1 plt.xlim(0, 1) -plt.ylim(0, 3) +plt.ylim(0, 11.5) plt.xlabel("Normalized bucket index") -plt.ylabel("Relative expected bucket size") +plt.ylabel(r"Expected bucket size for $\lambda = 4$") # Add horizontal grid lines -plt.grid(axis="y", lw=0.5) +plt.grid(axis="y", lw=0.5, which="major") +plt.grid(axis="y", lw=0.5, which="minor", alpha=0.4) +# Add minor tick lines every 1 +plt.yticks(range(0, 12, 1), minor=True) -plt.savefig("bucket-size.svg", bbox_inches="tight") +plt.savefig("plots/bucket-size.svg", bbox_inches="tight") plt.close() diff --git a/posts/ptrhash-paper/evals.py b/posts/ptrhash-paper/evals.py new file mode 100644 index 0000000..04b5132 --- /dev/null +++ b/posts/ptrhash-paper/evals.py @@ -0,0 +1,118 @@ +#!/usr/bin/env python3 + +import json +from math import log, floor, ceil +import matplotlib.pyplot as plt +from matplotlib.ticker import MultipleLocator + + +def build_stats(f, out): + alpha = 0.98 + + with open(f) as fh: + all_data = json.load(fh) + + cols = len(all_data) + + pcts = range(100) + xs = [x / 100 for x in range(0, 101)] + + # Figure with one subplot per entry of the input JSON + fig, axs = plt.subplots( + 1, cols, figsize=(cols * 3 + 
0.75, 2.5), layout="constrained" + ) + if cols == 1: + axs = [axs] + keys = ["linear", "skewed", "optimal", "square", "cubic"] + keys = [k for k in keys if k in all_data] + + for ax1, name in zip(axs, keys): + data = all_data[name]["by_pct"] + elem = 0 + elems = [0] + for pct in pcts: + elem += data[pct]["elements"] + elems.append(elem) + slots = elems[-1] / alpha + load = [x / slots for x in elems] + + # Extra y-axes sharing ax1's x-axis: ax3 (load factor), ax2 (bucket size) + ax3 = ax1.twinx() + ax2 = ax1.twinx() + + ax1.set_ylim(0, 1.0) + ax3.set_ylim(0, 1) + ax2.set_ylim(0, 10) + + ax1.set_title(name.capitalize()) + + ax1.set_xlabel("Normalized bucket index") + ax1.set_ylabel("Evictions per bucket") + ax2.set_ylabel("Bucket size") + ax2.yaxis.label.set_color("red") + ax1.xaxis.set_minor_locator(MultipleLocator(0.1)) + + p1 = ax1.plot( + xs[1:], + [data[pct]["evictions"] / data[pct]["buckets"] for pct in pcts], + label="Evictions per bucket", + color="black", + ) + # Plot load factor + p3 = ax3.fill(xs + [1], load + [0], color="red", alpha=0.1, label="Load factor") + ax3.yaxis.set_visible(False) + + # Plot bucket size on secondary axis + p2 = ax2.plot( + xs[1:], + [data[pct]["elements"] / data[pct]["buckets"] for pct in pcts], + color="red", + label="Bucket size", + ) + + ax1.set_zorder(ax3.get_zorder() + 1) + ax1.set_frame_on(False) + + # Hide all spines on all three axes + ax1.spines["top"].set_visible(False) + ax1.spines["bottom"].set_visible(False) + ax1.spines["left"].set_visible(False) + ax1.spines["right"].set_visible(False) + ax2.spines["top"].set_visible(False) + ax2.spines["bottom"].set_visible(False) + ax2.spines["left"].set_visible(False) + ax2.spines["right"].set_visible(False) + ax3.spines["top"].set_visible(False) + ax3.spines["bottom"].set_visible(False) + ax3.spines["left"].set_visible(False) + ax3.spines["right"].set_visible(False) + # x from 0 to 1 + ax1.set_xlim(0, 1) + # plt.ylim(0, 1) + # plt.xlabel("Normalized hash") + # plt.ylabel("Normalized bucket index") + # Add horizontal grid lines + 
ax1.grid(axis="y", lw=0.5) + + # Keep only leftmost and rightmost y-axis + # First + if name == keys[0]: + ax1.yaxis.set_visible(True) + else: + ax1.set_yticklabels([]) + ax1.yaxis.set_ticks_position("none") + ax1.set_ylabel(None) + # Last + if name == keys[-1]: + ax2.yaxis.set_visible(True) + if len(keys) > 1: + ax1.legend(handles=p1 + p2 + p3, loc="best") + else: + ax2.yaxis.set_visible(False) + + plt.savefig(out, bbox_inches="tight") + plt.close() + + +build_stats("data/build_stats_l35.json", "plots/build_stats_l35.svg") +build_stats("data/build_stats_l40.json", "plots/build_stats_l40.svg") diff --git a/posts/ptrhash-paper/bucket-fn.svg b/posts/ptrhash-paper/plots/bucket-fn.svg similarity index 93% rename from posts/ptrhash-paper/bucket-fn.svg rename to posts/ptrhash-paper/plots/bucket-fn.svg index 45db833..5165d4d 100644 --- a/posts/ptrhash-paper/bucket-fn.svg +++ b/posts/ptrhash-paper/plots/bucket-fn.svg @@ -6,7 +6,7 @@ - 2024-12-04T11:31:37.871010 + 2024-12-04T15:52:53.928454 image/svg+xml @@ -41,12 +41,12 @@ z - - + @@ -91,7 +91,7 @@ z - + @@ -132,7 +132,7 @@ z - + @@ -168,7 +168,7 @@ z - + @@ -215,7 +215,7 @@ z - + @@ -271,7 +271,7 @@ z - + @@ -576,16 +576,16 @@ z +" clip-path="url(#p560d1a2939)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.5; stroke-linecap: square"/> - - + @@ -601,11 +601,11 @@ L -3.5 0 +" clip-path="url(#p560d1a2939)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.5; stroke-linecap: square"/> - + @@ -621,11 +621,11 @@ L 294.88125 144.055219 +" clip-path="url(#p560d1a2939)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.5; stroke-linecap: square"/> - + @@ -641,11 +641,11 @@ L 294.88125 110.791219 +" clip-path="url(#p560d1a2939)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.5; stroke-linecap: square"/> - + @@ -661,11 +661,11 @@ L 294.88125 77.527219 +" clip-path="url(#p560d1a2939)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.5; stroke-linecap: square"/> - + @@ -681,11 +681,11 @@ L 294.88125 44.263219 +" 
clip-path="url(#p560d1a2939)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.5; stroke-linecap: square"/> - + @@ -870,14 +870,14 @@ z +" clip-path="url(#p560d1a2939)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square"/> +" clip-path="url(#p560d1a2939)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square"/> +" clip-path="url(#p560d1a2939)" style="fill: none; stroke: #2ca02c; stroke-width: 1.5; stroke-linecap: square"/> +" clip-path="url(#p560d1a2939)" style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square"/> +" clip-path="url(#p560d1a2939)" style="fill: none; stroke: #9467bd; stroke-width: 1.5; stroke-linecap: square"/> + diff --git a/posts/ptrhash-paper/bucket-size.svg b/posts/ptrhash-paper/plots/bucket-size.svg similarity index 56% rename from posts/ptrhash-paper/bucket-size.svg rename to posts/ptrhash-paper/plots/bucket-size.svg index 2071050..ffe5353 100644 --- a/posts/ptrhash-paper/bucket-size.svg +++ b/posts/ptrhash-paper/plots/bucket-size.svg @@ -1,12 +1,12 @@ - + - 2024-12-04T11:31:38.042555 + 2024-12-04T15:52:54.115853 image/svg+xml @@ -21,19 +21,19 @@ - - @@ -41,17 +41,17 @@ z - - + - + - + - + - + - + - + - + - + - + - + - + - + - + - - + - - + + - - - + - + - - - - - - - - + + + - + - + - - - - - + + + - + - + - - - - - + + + - + - + - - - - - + + + - + - + - - - - - + + + + - + + + + - + - - - - - - - - - + + + + + + + + - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - + + - - + + - - + + - - + + - - + + - - - - - + - + - + - - + - + - + + @@ -1698,15 +1742,15 @@ z - - + - + - + - - + - + - + - - + - + - + - + + diff --git a/posts/ptrhash-paper/plots/build_stats_l35.svg b/posts/ptrhash-paper/plots/build_stats_l35.svg new file mode 100644 index 0000000..0a48b6c --- /dev/null +++ 
b/posts/ptrhash-paper/plots/build_stats_l35.svg @@ -0,0 +1,3775 @@ + + + + + + + + 2024-12-05T13:48:09.676374 + image/svg+xml + + + Matplotlib v3.9.2, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/posts/ptrhash-paper/plots/build_stats_l40.svg b/posts/ptrhash-paper/plots/build_stats_l40.svg new file mode 100644 index 0000000..5e908df --- /dev/null +++ b/posts/ptrhash-paper/plots/build_stats_l40.svg @@ -0,0 +1,1429 @@ + + + + + + + + 2024-12-05T13:48:10.124032 + image/svg+xml + + + Matplotlib v3.9.2, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/posts/ptrhash-paper/ptrhash-paper.org b/posts/ptrhash-paper/ptrhash-paper.org index d2ab4a7..c14367d 100644 --- a/posts/ptrhash-paper/ptrhash-paper.org +++ b/posts/ptrhash-paper/ptrhash-paper.org @@ -501,7 +501,7 @@ impl CacheLineEF { #+name: bucket-fn #+caption: The left shows various bucket assignment functions $\gamma$, such as the piecewise linear function used by FCH and PTHash, and the optimal function introduced by PHOBIC. Flatter slopes at $x=0$ create larger buckets, while steeper slopes at $x=1$ create more small buckets, as shown on the right, as the distribution of expected bucket sizes given by $(\gamma^{-1})'$ when the expected bucket size is $\lambda=4$. -| [[file:bucket-fn.svg]] | [[file:bucket-size.svg]] | +| [[file:plots/bucket-fn.svg]] | [[file:plots/bucket-size.svg]] | During construction, slots slowly fill up as more buckets are placed. Because of this, the first buckets are much easier to place than the @@ -633,22 +633,28 @@ can also be stored to disk and read on-demand while querying. This is supported * Results -** Setup -- =lscpu | grep CPU= CPU: Intel(R) Core(TM) i7-10750H CPU @ 2.60GHz - - Running at 3.6GHz - - Cache sizes 32KiB/256KiB/12MiB - - 12 line fill buffers - - hyperthreading disabled -- Ram: =sudo lshw -short -C memory= 2x 32GiB SODIMM DDR4 synchronous 3200 MHz. 
-- Measure at 25M keys (which at 10MB fits in 12MB L3 cache) and 1G keys (which is 20x larger than L3) -*** Parameters -- Parameters for small cases, -** PtrHash construction -- Construction time vs part size -- Single bucket size vs 2-way split vs PHOBIC - - Construction speed - - Sequential lookup - - Prefetching +In this section we investigate PtrHash construction and query throughput for +different parameters, and compare PtrHash to competitors. +All experiments are run on an Intel Core i7-10750H CPU with 6 cores and +hyper-threading disabled. +The frequency is pinned to 2.6GHz. +Cache sizes are 32KiB L1 and 256KiB L2 per core, and 12MiB shared L3 cache. Main +memory is 64GiB DDR4 at 3200MHz, split over two 32GiB banks. + +All experiments use either 20 million keys, for which the pilots take around +6MB and easily fit in L3 cache, or $10^9$ keys, for which the pilots take around +300MB and are much larger than L3. + +** PtrHash parameters +*** Bucket function + +#+name: bucket-fn-plot +#+caption: Bucket size distribution (red) and average number of evictions (black) per additionally placed bucket during construction of the pilot table. Parameters are $n=10^9$ keys, $S=2^{18}$ slots per part, and $\alpha=0.98$, and the red shaded load factor ranges from $0$ to $\alpha$. On the left (first five plots), $\lambda=3.5$ so that the pilots take $2.29$ bits/key. For all methods, placing buckets of size $1$ is fast due to the load factor $\alpha<1$, and the bottleneck is placing the last buckets of size $2$ and $3$. Cubic has the fewest evictions, and hence is fastest to construct. For $\lambda=4.0$ (rightmost plot), the linear, skewed, and optimal bucket assignment functions cause endless evictions, and construction fails. The cubic function does work, resulting in $2.0$ bits/key for the pilots. 
+#+attr_html: :class full-width +| [[file:plots/build_stats_l35.svg]] | [[file:plots/build_stats_l40.svg]] | + +*** $\alpha$, $\lambda$, and bucket function +- Table of parameters, and time and space used - Comparison of bucketing functions; plot over 'construction time' with pilot value and number of evictions. *** Remap