From de80e9ed0e2ee6d3e21d03ded2de9eadd270bad5 Mon Sep 17 00:00:00 2001 From: Doug Branton Date: Mon, 1 Jul 2024 10:20:32 -0700 Subject: [PATCH 1/4] add testing of seed stability --- tests/nested_dask/test_datasets.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/nested_dask/test_datasets.py b/tests/nested_dask/test_datasets.py index 26f9ad7..7a2e66a 100644 --- a/tests/nested_dask/test_datasets.py +++ b/tests/nested_dask/test_datasets.py @@ -1,4 +1,5 @@ import nested_dask as nd +import pytest def test_generate_data(): @@ -18,3 +19,8 @@ def test_generate_data(): # test the length assert len(generate_1) == 10 assert len(generate_1.nested.nest.to_flat()) == 1000 + + # test seed stability + assert pytest.approx(generate_1.compute().loc[0]["a"], 0.1) == 0.417 + assert pytest.approx(generate_1.compute().loc[0]["b"], 0.1) == 0.838 + assert pytest.approx(generate_1.nested.nest.to_flat().compute().iloc[0]["t"], 0.1) == 16.015 From 9b8cf7c9fa374022e3865fa585faa58ebb6980ac Mon Sep 17 00:00:00 2001 From: Doug Branton Date: Mon, 1 Jul 2024 10:30:43 -0700 Subject: [PATCH 2/4] try just iloc --- tests/nested_dask/test_accessor.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/nested_dask/test_accessor.py b/tests/nested_dask/test_accessor.py index 8cc7603..48c97b3 100644 --- a/tests/nested_dask/test_accessor.py +++ b/tests/nested_dask/test_accessor.py @@ -31,9 +31,9 @@ def test_to_flat(test_dataset): # Make sure we retain all rows assert len(flat_ztf.loc[1]) == 500 - one_row = flat_ztf.loc[1].compute().iloc[1] - assert pytest.approx(one_row["t"], 0.01) == 5.4584 - assert pytest.approx(one_row["flux"], 0.01) == 84.1573 + one_row = flat_ztf.compute().iloc[0] + assert pytest.approx(one_row["t"], 0.01) == 6.5329 + assert pytest.approx(one_row["flux"], 0.01) == 19.0794 assert one_row["band"] == "r" From 9603e5175014cc4a4c75fbc1f4331f9d72392b02 Mon Sep 17 00:00:00 2001 From: Doug Branton Date: Mon, 1 Jul 2024 10:42:08 -0700 Subject: [PATCH 3/4] try from generate_data --- tests/nested_dask/test_accessor.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/tests/nested_dask/test_accessor.py b/tests/nested_dask/test_accessor.py index 48c97b3..23ff703 100644 --- a/tests/nested_dask/test_accessor.py +++ b/tests/nested_dask/test_accessor.py @@ -1,6 +1,7 @@ import pandas as pd import pyarrow as pa import pytest +import nested_dask as nd def test_nest_accessor(test_dataset): @@ -19,6 +20,28 @@ def test_fields(test_dataset): assert test_dataset.nested.nest.fields == ["t", "flux", "band"] +def test_to_flat_gen(): + nf = nd.datasets.generate_data(10, 100, npartitions=2, seed=1) + + flat_nf = nf.nested.nest.to_flat() + + # check dtypes + assert flat_nf.dtypes["t"] == pd.ArrowDtype(pa.float64()) + assert flat_nf.dtypes["flux"] == pd.ArrowDtype(pa.float64()) + assert flat_nf.dtypes["band"] == pd.ArrowDtype(pa.string()) + + # Make sure we retain all rows + assert len(flat_nf.loc[1]) == 100 + + one_row = flat_nf.compute().iloc[0] + + assert pytest.approx(one_row["t"], 0.01) == 16.0149 + assert pytest.approx(one_row["flux"], 0.01) == 51.2061 + assert one_row["band"] == "r" + + + + def test_to_flat(test_dataset): """test the to_flat function""" flat_ztf = test_dataset.nested.nest.to_flat() From d0fa21c1c146d34f26e4114311e7c854a8a3bd9a Mon Sep 17 00:00:00 2001 From: Doug Branton Date: Mon, 1 Jul 2024 10:57:22 -0700 Subject: [PATCH 4/4] use generate_data for accessor tests --- tests/nested_dask/test_accessor.py | 85 ++++++++++++++---------------- 1 file changed, 39 insertions(+), 46 deletions(-) diff --git a/tests/nested_dask/test_accessor.py b/tests/nested_dask/test_accessor.py index 23ff703..2d89adf 100644 --- a/tests/nested_dask/test_accessor.py +++ b/tests/nested_dask/test_accessor.py @@ -1,7 +1,7 @@ +import nested_dask as nd import pandas as pd import pyarrow as pa import pytest -import nested_dask as nd def test_nest_accessor(test_dataset): @@ -20,7 +20,8 @@ def test_fields(test_dataset): assert test_dataset.nested.nest.fields == ["t", "flux", "band"] -def test_to_flat_gen(): +def test_to_flat(): + """test the to_flat function""" nf = nd.datasets.generate_data(10, 100, npartitions=2, seed=1) flat_nf = nf.nested.nest.to_flat() @@ -40,77 +41,69 @@ def test_to_flat_gen(): assert one_row["band"] == "r" - - -def test_to_flat(test_dataset): +def test_to_flat_with_fields(): """test the to_flat function""" - flat_ztf = test_dataset.nested.nest.to_flat() - - # check dtypes - assert flat_ztf.dtypes["t"] == pd.ArrowDtype(pa.float64()) - assert flat_ztf.dtypes["flux"] == pd.ArrowDtype(pa.float64()) - assert flat_ztf.dtypes["band"] == pd.ArrowDtype(pa.large_string()) - - # Make sure we retain all rows - assert len(flat_ztf.loc[1]) == 500 - - one_row = flat_ztf.compute().iloc[0] - assert pytest.approx(one_row["t"], 0.01) == 6.5329 - assert pytest.approx(one_row["flux"], 0.01) == 19.0794 - assert one_row["band"] == "r" + nf = nd.datasets.generate_data(10, 100, npartitions=2, seed=1) + flat_nf = nf.nested.nest.to_flat(fields=["t", "flux"]) -def test_to_flat_with_fields(test_dataset): - """test the to_flat function""" - flat_ztf = test_dataset.nested.nest.to_flat(fields=["t", "flux"]) + assert "band" not in flat_nf.columns # check dtypes - assert flat_ztf.dtypes["t"] == pd.ArrowDtype(pa.float64()) - assert flat_ztf.dtypes["flux"] == pd.ArrowDtype(pa.float64()) + assert flat_nf.dtypes["t"] == pd.ArrowDtype(pa.float64()) + assert flat_nf.dtypes["flux"] == pd.ArrowDtype(pa.float64()) # Make sure we retain all rows - assert len(flat_ztf.loc[1]) == 500 + assert len(flat_nf.loc[1]) == 100 + + one_row = flat_nf.compute().iloc[0] - one_row = flat_ztf.loc[1].compute().iloc[1] - assert pytest.approx(one_row["t"], 0.01) == 5.4584 - assert pytest.approx(one_row["flux"], 0.01) == 84.1573 + assert pytest.approx(one_row["t"], 0.01) == 16.0149 + assert pytest.approx(one_row["flux"], 0.01) == 51.2061 -def test_to_lists(test_dataset): +def test_to_lists(): """test the to_lists function""" - list_ztf = test_dataset.nested.nest.to_lists() + + nf = nd.datasets.generate_data(10, 100, npartitions=2, seed=1) + list_nf = nf.nested.nest.to_lists() # check dtypes - assert list_ztf.dtypes["t"] == pd.ArrowDtype(pa.list_(pa.float64())) - assert list_ztf.dtypes["flux"] == pd.ArrowDtype(pa.list_(pa.float64())) - assert list_ztf.dtypes["band"] == pd.ArrowDtype(pa.list_(pa.large_string())) + assert list_nf.dtypes["t"] == pd.ArrowDtype(pa.list_(pa.float64())) + assert list_nf.dtypes["flux"] == pd.ArrowDtype(pa.list_(pa.float64())) + assert list_nf.dtypes["band"] == pd.ArrowDtype(pa.list_(pa.string())) # Make sure we have a single row for an id - assert len(list_ztf.loc[1]) == 1 + assert len(list_nf.loc[1]) == 1 # Make sure we retain all rows -- double loc for speed and pandas get_item - assert len(list_ztf.loc[1].compute().loc[1]["t"]) == 500 + assert len(list_nf.loc[1].compute().loc[1]["t"]) == 100 + one_row = list_nf.compute().iloc[1] # spot-check values - assert pytest.approx(list_ztf.loc[1].compute().loc[1]["t"][0], 0.01) == 7.5690279 - assert pytest.approx(list_ztf.loc[1].compute().loc[1]["flux"][0], 0.01) == 79.6886 - assert list_ztf.loc[1].compute().loc[1]["band"][0] == "g" + assert pytest.approx(one_row["t"][0], 0.01) == 19.3652 + assert pytest.approx(one_row["flux"][0], 0.01) == 61.7461 + assert one_row["band"][0] == "g" -def test_to_lists_with_fields(test_dataset): +def test_to_lists_with_fields(): """test the to_lists function""" - list_ztf = test_dataset.nested.nest.to_lists(fields=["t", "flux"]) + nf = nd.datasets.generate_data(10, 100, npartitions=2, seed=1) + list_nf = nf.nested.nest.to_lists(fields=["t", "flux"]) + + assert "band" not in list_nf.columns # check dtypes - assert list_ztf.dtypes["t"] == pd.ArrowDtype(pa.list_(pa.float64())) - assert list_ztf.dtypes["flux"] == pd.ArrowDtype(pa.list_(pa.float64())) + assert list_nf.dtypes["t"] == pd.ArrowDtype(pa.list_(pa.float64())) + assert list_nf.dtypes["flux"] == pd.ArrowDtype(pa.list_(pa.float64())) # Make sure we have a single row for an id - assert len(list_ztf.loc[1]) == 1 + assert len(list_nf.loc[1]) == 1 # Make sure we retain all rows -- double loc for speed and pandas get_item - assert len(list_ztf.loc[1].compute().loc[1]["t"]) == 500 + assert len(list_nf.loc[1].compute().loc[1]["t"]) == 100 + one_row = list_nf.compute().iloc[1] # spot-check values - assert pytest.approx(list_ztf.loc[1].compute().loc[1]["t"][0], 0.01) == 7.5690279 - assert pytest.approx(list_ztf.loc[1].compute().loc[1]["flux"][0], 0.01) == 79.6886 + assert pytest.approx(one_row["t"][0], 0.01) == 19.3652 + assert pytest.approx(one_row["flux"][0], 0.01) == 61.7461