From de80e9ed0e2ee6d3e21d03ded2de9eadd270bad5 Mon Sep 17 00:00:00 2001
From: Doug Branton <brantd@uw.edu>
Date: Mon, 1 Jul 2024 10:20:32 -0700
Subject: [PATCH 1/4] add testing of seed stability

---
 tests/nested_dask/test_datasets.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tests/nested_dask/test_datasets.py b/tests/nested_dask/test_datasets.py
index 26f9ad7..7a2e66a 100644
--- a/tests/nested_dask/test_datasets.py
+++ b/tests/nested_dask/test_datasets.py
@@ -1,4 +1,5 @@
 import nested_dask as nd
+import pytest
 
 
 def test_generate_data():
@@ -18,3 +19,8 @@ def test_generate_data():
     # test the length
     assert len(generate_1) == 10
     assert len(generate_1.nested.nest.to_flat()) == 1000
+
+    # test seed stability
+    assert pytest.approx(generate_1.compute().loc[0]["a"], 0.1) == 0.417
+    assert pytest.approx(generate_1.compute().loc[0]["b"], 0.1) == 0.838
+    assert pytest.approx(generate_1.nested.nest.to_flat().compute().iloc[0]["t"], 0.1) == 16.015

From 9b8cf7c9fa374022e3865fa585faa58ebb6980ac Mon Sep 17 00:00:00 2001
From: Doug Branton <brantd@uw.edu>
Date: Mon, 1 Jul 2024 10:30:43 -0700
Subject: [PATCH 2/4] try just iloc

---
 tests/nested_dask/test_accessor.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/nested_dask/test_accessor.py b/tests/nested_dask/test_accessor.py
index 8cc7603..48c97b3 100644
--- a/tests/nested_dask/test_accessor.py
+++ b/tests/nested_dask/test_accessor.py
@@ -31,9 +31,9 @@ def test_to_flat(test_dataset):
     # Make sure we retain all rows
     assert len(flat_ztf.loc[1]) == 500
 
-    one_row = flat_ztf.loc[1].compute().iloc[1]
-    assert pytest.approx(one_row["t"], 0.01) == 5.4584
-    assert pytest.approx(one_row["flux"], 0.01) == 84.1573
+    one_row = flat_ztf.compute().iloc[0]
+    assert pytest.approx(one_row["t"], 0.01) == 6.5329
+    assert pytest.approx(one_row["flux"], 0.01) == 19.0794
     assert one_row["band"] == "r"
 
 

From 9603e5175014cc4a4c75fbc1f4331f9d72392b02 Mon Sep 17 00:00:00 2001
From: Doug Branton <brantd@uw.edu>
Date: Mon, 1 Jul 2024 10:42:08 -0700
Subject: [PATCH 3/4] try from generate_data

---
 tests/nested_dask/test_accessor.py | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/tests/nested_dask/test_accessor.py b/tests/nested_dask/test_accessor.py
index 48c97b3..23ff703 100644
--- a/tests/nested_dask/test_accessor.py
+++ b/tests/nested_dask/test_accessor.py
@@ -1,6 +1,7 @@
 import pandas as pd
 import pyarrow as pa
 import pytest
+import nested_dask as nd
 
 
 def test_nest_accessor(test_dataset):
@@ -19,6 +20,28 @@ def test_fields(test_dataset):
     assert test_dataset.nested.nest.fields == ["t", "flux", "band"]
 
 
+def test_to_flat_gen():
+    nf = nd.datasets.generate_data(10, 100, npartitions=2, seed=1)
+
+    flat_nf = nf.nested.nest.to_flat()
+
+    # check dtypes
+    assert flat_nf.dtypes["t"] == pd.ArrowDtype(pa.float64())
+    assert flat_nf.dtypes["flux"] == pd.ArrowDtype(pa.float64())
+    assert flat_nf.dtypes["band"] == pd.ArrowDtype(pa.string())
+
+    # Make sure we retain all rows
+    assert len(flat_nf.loc[1]) == 100
+
+    one_row = flat_nf.compute().iloc[0]
+
+    assert pytest.approx(one_row["t"], 0.01) == 16.0149
+    assert pytest.approx(one_row["flux"], 0.01) == 51.2061
+    assert one_row["band"] == "r"
+
+
+
+
 def test_to_flat(test_dataset):
     """test the to_flat function"""
     flat_ztf = test_dataset.nested.nest.to_flat()

From d0fa21c1c146d34f26e4114311e7c854a8a3bd9a Mon Sep 17 00:00:00 2001
From: Doug Branton <brantd@uw.edu>
Date: Mon, 1 Jul 2024 10:57:22 -0700
Subject: [PATCH 4/4] use generate_data for accessor tests

---
 tests/nested_dask/test_accessor.py | 85 ++++++++++++++----------------
 1 file changed, 39 insertions(+), 46 deletions(-)

diff --git a/tests/nested_dask/test_accessor.py b/tests/nested_dask/test_accessor.py
index 23ff703..2d89adf 100644
--- a/tests/nested_dask/test_accessor.py
+++ b/tests/nested_dask/test_accessor.py
@@ -1,7 +1,7 @@
+import nested_dask as nd
 import pandas as pd
 import pyarrow as pa
 import pytest
-import nested_dask as nd
 
 
 def test_nest_accessor(test_dataset):
@@ -20,7 +20,8 @@ def test_fields(test_dataset):
     assert test_dataset.nested.nest.fields == ["t", "flux", "band"]
 
 
-def test_to_flat_gen():
+def test_to_flat():
+    """test the to_flat function"""
     nf = nd.datasets.generate_data(10, 100, npartitions=2, seed=1)
 
     flat_nf = nf.nested.nest.to_flat()
@@ -40,77 +41,69 @@ def test_to_flat_gen():
     assert one_row["band"] == "r"
 
 
-
-
-def test_to_flat(test_dataset):
+def test_to_flat_with_fields():
     """test the to_flat function"""
-    flat_ztf = test_dataset.nested.nest.to_flat()
-
-    # check dtypes
-    assert flat_ztf.dtypes["t"] == pd.ArrowDtype(pa.float64())
-    assert flat_ztf.dtypes["flux"] == pd.ArrowDtype(pa.float64())
-    assert flat_ztf.dtypes["band"] == pd.ArrowDtype(pa.large_string())
-
-    # Make sure we retain all rows
-    assert len(flat_ztf.loc[1]) == 500
-
-    one_row = flat_ztf.compute().iloc[0]
-    assert pytest.approx(one_row["t"], 0.01) == 6.5329
-    assert pytest.approx(one_row["flux"], 0.01) == 19.0794
-    assert one_row["band"] == "r"
+    nf = nd.datasets.generate_data(10, 100, npartitions=2, seed=1)
 
+    flat_nf = nf.nested.nest.to_flat(fields=["t", "flux"])
 
-def test_to_flat_with_fields(test_dataset):
-    """test the to_flat function"""
-    flat_ztf = test_dataset.nested.nest.to_flat(fields=["t", "flux"])
+    assert "band" not in flat_nf.columns
 
     # check dtypes
-    assert flat_ztf.dtypes["t"] == pd.ArrowDtype(pa.float64())
-    assert flat_ztf.dtypes["flux"] == pd.ArrowDtype(pa.float64())
+    assert flat_nf.dtypes["t"] == pd.ArrowDtype(pa.float64())
+    assert flat_nf.dtypes["flux"] == pd.ArrowDtype(pa.float64())
 
     # Make sure we retain all rows
-    assert len(flat_ztf.loc[1]) == 500
+    assert len(flat_nf.loc[1]) == 100
+
+    one_row = flat_nf.compute().iloc[0]
 
-    one_row = flat_ztf.loc[1].compute().iloc[1]
-    assert pytest.approx(one_row["t"], 0.01) == 5.4584
-    assert pytest.approx(one_row["flux"], 0.01) == 84.1573
+    assert pytest.approx(one_row["t"], 0.01) == 16.0149
+    assert pytest.approx(one_row["flux"], 0.01) == 51.2061
 
 
-def test_to_lists(test_dataset):
+def test_to_lists():
     """test the to_lists function"""
-    list_ztf = test_dataset.nested.nest.to_lists()
+
+    nf = nd.datasets.generate_data(10, 100, npartitions=2, seed=1)
+    list_nf = nf.nested.nest.to_lists()
 
     # check dtypes
-    assert list_ztf.dtypes["t"] == pd.ArrowDtype(pa.list_(pa.float64()))
-    assert list_ztf.dtypes["flux"] == pd.ArrowDtype(pa.list_(pa.float64()))
-    assert list_ztf.dtypes["band"] == pd.ArrowDtype(pa.list_(pa.large_string()))
+    assert list_nf.dtypes["t"] == pd.ArrowDtype(pa.list_(pa.float64()))
+    assert list_nf.dtypes["flux"] == pd.ArrowDtype(pa.list_(pa.float64()))
+    assert list_nf.dtypes["band"] == pd.ArrowDtype(pa.list_(pa.string()))
 
     # Make sure we have a single row for an id
-    assert len(list_ztf.loc[1]) == 1
+    assert len(list_nf.loc[1]) == 1
 
     # Make sure we retain all rows -- double loc for speed and pandas get_item
-    assert len(list_ztf.loc[1].compute().loc[1]["t"]) == 500
+    assert len(list_nf.loc[1].compute().loc[1]["t"]) == 100
 
+    one_row = list_nf.compute().iloc[1]
     # spot-check values
-    assert pytest.approx(list_ztf.loc[1].compute().loc[1]["t"][0], 0.01) == 7.5690279
-    assert pytest.approx(list_ztf.loc[1].compute().loc[1]["flux"][0], 0.01) == 79.6886
-    assert list_ztf.loc[1].compute().loc[1]["band"][0] == "g"
+    assert pytest.approx(one_row["t"][0], 0.01) == 19.3652
+    assert pytest.approx(one_row["flux"][0], 0.01) == 61.7461
+    assert one_row["band"][0] == "g"
 
 
-def test_to_lists_with_fields(test_dataset):
+def test_to_lists_with_fields():
     """test the to_lists function"""
-    list_ztf = test_dataset.nested.nest.to_lists(fields=["t", "flux"])
+    nf = nd.datasets.generate_data(10, 100, npartitions=2, seed=1)
+    list_nf = nf.nested.nest.to_lists(fields=["t", "flux"])
+
+    assert "band" not in list_nf.columns
 
     # check dtypes
-    assert list_ztf.dtypes["t"] == pd.ArrowDtype(pa.list_(pa.float64()))
-    assert list_ztf.dtypes["flux"] == pd.ArrowDtype(pa.list_(pa.float64()))
+    assert list_nf.dtypes["t"] == pd.ArrowDtype(pa.list_(pa.float64()))
+    assert list_nf.dtypes["flux"] == pd.ArrowDtype(pa.list_(pa.float64()))
 
     # Make sure we have a single row for an id
-    assert len(list_ztf.loc[1]) == 1
+    assert len(list_nf.loc[1]) == 1
 
     # Make sure we retain all rows -- double loc for speed and pandas get_item
-    assert len(list_ztf.loc[1].compute().loc[1]["t"]) == 500
+    assert len(list_nf.loc[1].compute().loc[1]["t"]) == 100
 
+    one_row = list_nf.compute().iloc[1]
     # spot-check values
-    assert pytest.approx(list_ztf.loc[1].compute().loc[1]["t"][0], 0.01) == 7.5690279
-    assert pytest.approx(list_ztf.loc[1].compute().loc[1]["flux"][0], 0.01) == 79.6886
+    assert pytest.approx(one_row["t"][0], 0.01) == 19.3652
+    assert pytest.approx(one_row["flux"][0], 0.01) == 61.7461