From 8f34046f425fac361bedf01e372753fbc77ae0dc Mon Sep 17 00:00:00 2001 From: Max Humber Date: Tue, 18 Oct 2022 11:12:23 -0400 Subject: [PATCH 1/3] sample errors improved --- CHANGELOG | 3 +++ redframes/verbs/sample.py | 4 ++-- redframes/version.py | 2 +- tests/test_ladybugs.py | 10 ++++++++++ 4 files changed, 16 insertions(+), 3 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 17ea8b1..6165b63 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,6 @@ +- 1.3b1 + - NEW: `gather` + - IMPROVED: `sample` errors - 1.2 - NEW: `cross` join verb! - NEW: `join(..., postfix=("_lhs, "_rhs"))` argument diff --git a/redframes/verbs/sample.py b/redframes/verbs/sample.py index 986eb19..30e3192 100644 --- a/redframes/verbs/sample.py +++ b/redframes/verbs/sample.py @@ -10,11 +10,11 @@ def sample( _check_type(rows, {int, float}) if rows >= 1: if isinstance(rows, float): - raise ValueError("rows (int) must be >= 1") + raise ValueError("must be int if > 1") df = df.sample(rows, random_state=seed) elif 0 < rows < 1: df = df.sample(frac=rows, random_state=seed) else: - raise ValueError("rows (float) must be (0, 1)") + raise ValueError("must be > 0") df = df.reset_index(drop=True) return df diff --git a/redframes/version.py b/redframes/version.py index b2a95f9..9c0bb6d 100644 --- a/redframes/version.py +++ b/redframes/version.py @@ -1 +1 @@ -__version__ = "1.2" +__version__ = "1.3b1" diff --git a/tests/test_ladybugs.py b/tests/test_ladybugs.py index fc941c8..db6cc4a 100644 --- a/tests/test_ladybugs.py +++ b/tests/test_ladybugs.py @@ -25,3 +25,13 @@ def test_comine_overwrite_and_drop_other(self): result = df.combine(["foo", "bar"], into="foo", sep="-", drop=True) expected = rf.DataFrame({"foo": ["1-1", "2-2", "3-3"]}) self.assertEqual(result, expected) + + def test_sample_float_1_point_0(self): + df = rf.DataFrame({"foo": range(100)}) + with self.assertRaisesRegex(ValueError, "must be int if > 1"): + df.sample(1.0) + + def test_sample_negative_1(self): + df = rf.DataFrame({"foo": range(100)}) + with self.assertRaisesRegex(ValueError, "must be > 0"): + df.sample(-1) From 6d410e4cc9fd3f2dea293d508fbf845564679644 Mon Sep 17 00:00:00 2001 From: Max Humber Date: Tue, 18 Oct 2022 11:46:18 -0400 Subject: [PATCH 2/3] gather(beside=) argument + tests --- CHANGELOG | 2 +- TODO | 3 +- redframes/core.py | 82 ++++++++++++++++++++++++++++----------- redframes/verbs/gather.py | 30 +++++++++----- tests/test_docstrings.py | 64 +++++++++++++++++++++++++++--- tests/test_type_hints.py | 13 +++++++ 6 files changed, 154 insertions(+), 40 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 6165b63..6f002d9 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,5 +1,5 @@ - 1.3b1 - - NEW: `gather` + - NEW: `gather(beside=...)` argument - IMPROVED: `sample` errors - 1.2 - NEW: `cross` join verb! diff --git a/TODO b/TODO index 8bc14e8..3af6e48 100644 --- a/TODO +++ b/TODO @@ -1,3 +1,4 @@ +- remove summarize - explode, collapse verbs - more research into `.assign` mutate(..., vectorized=True)? - datasets @@ -8,4 +9,4 @@ - hide/protect/private - 10 minutes tutorial - cheatsheet (pandas/dplyr/tidyr) -- anaconda \ No newline at end of file +- anaconda? \ No newline at end of file diff --git a/redframes/core.py b/redframes/core.py index eb336c1..fcc9037 100644 --- a/redframes/core.py +++ b/redframes/core.py @@ -819,9 +819,10 @@ def filter(self, func: Func) -> DataFrame: def gather( self, columns: Columns | None = None, + beside: LazyColumns | None = None, into: tuple[Column, Column] = ("variable", "value"), ): - """Lengthen data, increase rows, decrease columns (opposite of `spread`) + """Lengthen data by increasing rows and decreasing columns (opposite of `spread`) pandas: `melt` tidyverse: `gather`, `pivot_longer` @@ -829,40 +830,77 @@ def gather( Examples: ```python - df = rf.DataFrame({"foo": [1, 2], "bar": [3, 4], "baz": [4, 5]}) + df = rf.DataFrame({ + "foo": [1, 2, 1, 2], + "bar": ["A", "B", "C", "D"], + "baz": ["!", "@", "#", "$"], + "jaz": range(4) + }) ``` - | foo | bar | baz | - |------:|------:|------:| - | 1 | 3 | 4 | - | 2 | 4 | 5 | + | foo | bar | baz | jaz | + |------:|:------|:------|------:| + | 1 | A | ! | 0 | + | 2 | B | @ | 1 | + | 1 | C | # | 2 | + | 2 | D | $ | 3 | All columns: ```python df.gather() ``` - | variable | value | - |:-----------|--------:| - | foo | 1 | - | foo | 2 | - | bar | 3 | - | bar | 4 | - | baz | 4 | - | baz | 5 | + | variable | value | + |:-----------|:--------| + | foo | 1 | + | foo | 2 | + | foo | 1 | + | foo | 2 | + | bar | A | + | bar | B | + | bar | C | + | bar | D | + | baz | ! | + | baz | @ | + | baz | # | + | baz | $ | + | jaz | 0 | + | jaz | 1 | + | jaz | 2 | + | jaz | 3 | Multiple columns: ```python df.gather(["foo", "bar"], into=("var", "val")) ``` - | baz | var | val | - |------:|:------|------:| - | 4 | foo | 1 | - | 5 | foo | 2 | - | 4 | bar | 3 | - | 5 | bar | 4 | + | baz | jaz | var | val | + |:------|------:|:------|:------| + | ! | 0 | foo | 1 | + | @ | 1 | foo | 2 | + | # | 2 | foo | 1 | + | $ | 3 | foo | 2 | + | ! | 0 | bar | A | + | @ | 1 | bar | B | + | # | 2 | bar | C | + | $ | 3 | bar | D | + + All columns except: + + ```python + df.gather(beside=["foo", "bar"]) + ``` + | foo | bar | variable | value | + |------:|:------|:-----------|:--------| + | 1 | A | baz | ! | + | 2 | B | baz | @ | + | 1 | C | baz | # | + | 2 | D | baz | $ | + | 1 | A | jaz | 0 | + | 2 | B | jaz | 1 | + | 1 | C | jaz | 2 | + | 2 | D | jaz | 3 | """ - return _wrap(gather(self._data, columns, into)) + return _wrap(gather(self._data, columns, beside, into)) def group(self, by: LazyColumns) -> GroupedFrame: """Create a GroupedFrame overwhich split-apply-combine operations can be applied @@ -1324,7 +1362,7 @@ def split( return _wrap(split(self._data, column, into, sep, drop)) def spread(self, column: Column, using: Column) -> DataFrame: - """Widen data, increase columns, decreas rows (opposite of `gather`) + """Widen data by increasing columns and decreasing rows (opposite of `gather`) pandas: `pivot_table` tidyverse: `spread`, `pivot_wider` diff --git a/redframes/verbs/gather.py b/redframes/verbs/gather.py index 94809ac..b0e84dd 100644 --- a/redframes/verbs/gather.py +++ b/redframes/verbs/gather.py @@ -3,31 +3,41 @@ import pandas as pd from ..checks import _check_type -from ..types import Column, Columns, PandasDataFrame +from ..types import Column, Columns, LazyColumns, PandasDataFrame def gather( df: PandasDataFrame, columns: Columns | None = None, + beside: LazyColumns | None = None, into: tuple[Column, Column] = ("variable", "value"), ) -> PandasDataFrame: _check_type(columns, {list, None}) + _check_type(beside, {str, list, None}) _check_type(into, tuple) - if not (isinstance(into, tuple) and len(into) == 2): + if not (isinstance(into, tuple) and (len(into) == 2)): raise TypeError("must be tuple[str, str]") if into[0] == into[1]: raise TypeError("must be unique") - if into[0] in df.columns: + if (into[0] in df.columns) or (into[1] in df.columns): raise TypeError("must not be an existing column key") - if into[1] in df.columns: - raise TypeError("must not be an existing column key") - if columns == None: - columns = list(df.columns) - index = [col for col in df.columns if col not in columns] # type: ignore + if (columns != None) and (beside != None): + raise ValueError("columns OR beside must be None") + if (columns == None) and (beside == None): + id_vars = [] + value_vars = list(df.columns) + if isinstance(beside, str): + beside = [beside] + if isinstance(beside, list): + id_vars = beside + value_vars = [col for col in df.columns if col not in id_vars] + if isinstance(columns, list): + id_vars = [col for col in df.columns if col not in columns] + value_vars = columns df = pd.melt( df, - id_vars=index, - value_vars=columns, + id_vars=id_vars, + value_vars=value_vars, var_name=into[0], value_name=into[1], ) diff --git a/tests/test_docstrings.py b/tests/test_docstrings.py index f2a1500..5316ef4 100644 --- a/tests/test_docstrings.py +++ b/tests/test_docstrings.py @@ -218,24 +218,76 @@ def test_filter(self): self.assertEqual(result3, expected3) def test_gather(self): - df = rf.DataFrame({"foo": [1, 2], "bar": [3, 4], "baz": [4, 5]}) + df = rf.DataFrame( + { + "foo": [1, 2, 1, 2], + "bar": ["A", "B", "C", "D"], + "baz": ["!", "@", "#", "$"], + "jaz": range(4), + } + ) result1 = df.gather() result2 = df.gather(["foo", "bar"], into=("var", "val")) + result3 = df.gather(beside=["foo", "bar"]) expected1 = rf.DataFrame( { - "variable": ["foo", "foo", "bar", "bar", "baz", "baz"], - "value": [1, 2, 3, 4, 4, 5], + "variable": [ + "foo", + "foo", + "foo", + "foo", + "bar", + "bar", + "bar", + "bar", + "baz", + "baz", + "baz", + "baz", + "jaz", + "jaz", + "jaz", + "jaz", + ], + "value": [ + 1, + 2, + 1, + 2, + "A", + "B", + "C", + "D", + "!", + "@", + "#", + "$", + 0, + 1, + 2, + 3, + ], } ) expected2 = rf.DataFrame( { - "baz": [4, 5, 4, 5], - "var": ["foo", "foo", "bar", "bar"], - "val": [1, 2, 3, 4], + "baz": ["!", "@", "#", "$", "!", "@", "#", "$"], + "jaz": [0, 1, 2, 3, 0, 1, 2, 3], + "var": ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"], + "val": [1, 2, 1, 2, "A", "B", "C", "D"], + } + ) + expected3 = rf.DataFrame( + { + "foo": [1, 2, 1, 2, 1, 2, 1, 2], + "bar": ["A", "B", "C", "D", "A", "B", "C", "D"], + "variable": ["baz", "baz", "baz", "baz", "jaz", "jaz", "jaz", "jaz"], + "value": ["!", "@", "#", "$", 0, 1, 2, 3], } ) self.assertEqual(result1, expected1) self.assertEqual(result2, expected2) + self.assertEqual(result3, expected3) def test_group(self): df = rf.DataFrame( diff --git a/tests/test_type_hints.py b/tests/test_type_hints.py index bcff5ee..8fc7a5c 100644 --- a/tests/test_type_hints.py +++ b/tests/test_type_hints.py @@ -156,10 +156,23 @@ def test_gather_bad_columns(self): with self.assertRaisesRegex(TypeError, "must be list | None"): self.df.gather(1) + def test_gather_bad_beside(self): + with self.assertRaisesRegex(TypeError, "must be str | list | None"): + self.df.gather(beside=1) + def test_gather_bad_into_column(self): with self.assertRaisesRegex(TypeError, "must be tuple"): self.df.gather(["foo", "bar"], into=1) + def test_gather_bad_into_tuple(self): + # with self.assertRaisesRegex(TypeError, f'must be tuple[str, str]'): + # self.df.gather(into=("one", "two", "three")) + pass + + def test_gather_bad_both_not_none(self): + with self.assertRaisesRegex(ValueError, "columns OR beside must be None"): + self.df.gather(columns=["foo", "bar"], beside=["baz"]) + def test_group_bad_by_columns(self): with self.assertRaisesRegex(TypeError, "must be list | str"): self.df.group(1) From d92c83fe254c037cff3cb27d330e9f3f3a90ba4c Mon Sep 17 00:00:00 2001 From: Max Humber Date: Tue, 18 Oct 2022 12:07:44 -0400 Subject: [PATCH 3/3] change --- CHANGELOG | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 6f002d9..5cb3b68 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,6 +1,6 @@ - 1.3b1 - - NEW: `gather(beside=...)` argument - - IMPROVED: `sample` errors + - NEW: `gather(beside=...)` argument! + - IMPROVED: `sample` errors are more explicit - 1.2 - NEW: `cross` join verb! - NEW: `join(..., postfix=("_lhs, "_rhs"))` argument