Merge pull request #21 from srivarra/lowess/groupby
Added Groupby to Lowess
Ofosu-Osei authored Oct 19, 2024
2 parents a657817 + 112a006 commit 0778b40
Showing 4 changed files with 28 additions and 32 deletions.
5 changes: 2 additions & 3 deletions README.md
@@ -201,18 +201,17 @@ import seaborn.objects as so
import seaborn as sns
import seaborn_objects_recipes as sor


def test_lowess_with_ci():

# Load the penguins dataset
penguins = sns.load_dataset("penguins")

# Prepare data
data = penguins.copy()
data = penguins[penguins['species'] == 'Adelie']

# Create the plot
plot = (
so.Plot(data, x="bill_length_mm", y="body_mass_g")
so.Plot(data, x="bill_length_mm", y="body_mass_g", color="species")
.add(so.Dot())
.add(so.Line(), lowess := sor.Lowess(frac=0.2, gridsize=100, num_bootstrap=200, alpha=0.95))
.add(so.Band(), lowess)
Binary file modified img/lowess_b.png
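For reference, a minimal, self-contained sketch of the usage this change enables, mirroring the updated README and test code in this commit: with a color mapping in place, the stat now fits one LOWESS curve and one bootstrap confidence band per species. The final show/save call is an assumption, since the README hunk above is truncated.

import seaborn as sns
import seaborn.objects as so
import seaborn_objects_recipes as sor

# Load the penguins dataset and let the color mapping define the groups
penguins = sns.load_dataset("penguins")

lowess = sor.Lowess(frac=0.2, gridsize=100, num_bootstrap=200, alpha=0.95)
plot = (
    so.Plot(penguins, x="bill_length_mm", y="body_mass_g", color="species")
    .add(so.Dot())
    .add(so.Line(), lowess)   # smoothed line, fit per group
    .add(so.Band(), lowess)   # bootstrap confidence band, computed per group
)
plot.show()  # assumed; the original example may save to img/lowess_b.png instead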
50 changes: 25 additions & 25 deletions seaborn_objects_recipes/recipes/lowess.py
@@ -1,12 +1,12 @@
from __future__ import annotations
import numpy as np
import pandas as pd
from pandas import DataFrame
from dataclasses import dataclass
from seaborn._stats.base import Stat
import statsmodels.api as sm
from typing import Optional


@dataclass
class Lowess(Stat):
"""
@@ -39,34 +39,29 @@ class Lowess(Stat):
delta: float = 0.0
num_bootstrap: Optional[int] = None
alpha: float = 0.95


def __post_init__(self):
# Type checking for the arguments
if not isinstance(self.frac, float) or not (0 < self.frac <= 1):
raise ValueError("frac must be a float between 0 and 1.")
if not isinstance(self.gridsize, int) or self.gridsize <= 0:
raise ValueError("gridsize must be a positive integer.")
if self.num_bootstrap is not None and (
not isinstance(self.num_bootstrap, int) or self.num_bootstrap <= 0
):
if self.num_bootstrap is not None and (not isinstance(self.num_bootstrap, int) or self.num_bootstrap <= 0):
raise ValueError("num_bootstrap must be a positive integer or None.")
if not isinstance(self.alpha, float) or not (0 < self.alpha < 1):
raise ValueError("alpha must be a float between 0 and 1.")

def _fit_predict(self, data):
x = data["x"]
xx = np.linspace(x.min(), x.max(), self.gridsize)
result = sm.nonparametric.lowess(
endog=data["y"], exog=x, frac=self.frac, delta=self.delta, xvals=xx
)
result = sm.nonparametric.lowess(endog=data["y"], exog=x, frac=self.frac, delta=self.delta, xvals=xx)
if result.ndim == 1: # Handle single-dimensional return values
yy = result
else:
yy = result[:, 1] # Select the predicted y-values
return pd.DataFrame(dict(x=xx, y=yy))

def _bootstrap_resampling(self, data):
def _bootstrap_resampling(self, data) -> pd.DataFrame:
xx = np.linspace(data["x"].min(), data["x"].max(), self.gridsize)
bootstrap_estimates = np.empty((self.num_bootstrap, len(xx)))

@@ -81,34 +76,39 @@ def _bootstrap_resampling(self, data):
)
# Ensure the result is two-dimensional
if result.ndim == 1:
result = np.column_stack(
(xx, result)
) # Reformat to two-dimensional if needed
result = np.column_stack((xx, result)) # Reformat to two-dimensional if needed
bootstrap_estimates[i, :] = result[:, 1]

return xx, bootstrap_estimates
lower_bound = np.percentile(bootstrap_estimates, (1 - self.alpha) / 2 * 100, axis=0)
upper_bound = np.percentile(bootstrap_estimates, (1 + self.alpha) / 2 * 100, axis=0)

return pd.DataFrame({"ymin": lower_bound, "ymax": upper_bound})

def __call__(self, data: DataFrame, groupby, orient, scales) -> DataFrame:
def __call__(self, data: pd.DataFrame, groupby, orient, scales) -> pd.DataFrame:
if orient == "x":
xvar = data.columns[0]
yvar = data.columns[1]
else:
xvar = data.columns[1]
yvar = data.columns[0]

renamed_data = data.rename(columns={xvar: "x", yvar: "y"})
renamed_data = data.rename(columns={xvar: "x", yvar: "y"})
renamed_data = renamed_data.dropna(subset=["x", "y"])
smoothed = self._fit_predict(renamed_data)

grouping_vars = [str(v) for v in data if v in groupby.order]

if not grouping_vars:
# If no grouping variables, directly fit and predict
smoothed = self._fit_predict(renamed_data)
else:
# Apply the fit_predict method for each group separately
smoothed = groupby.apply(renamed_data, self._fit_predict)

if self.num_bootstrap:
xx, bootstrap_estimates = self._bootstrap_resampling(data)
lower_bound = np.percentile(
bootstrap_estimates, (1 - self.alpha) / 2 * 100, axis=0
)
upper_bound = np.percentile(
bootstrap_estimates, (1 + self.alpha) / 2 * 100, axis=0
)
smoothed["ymin"] = lower_bound
smoothed["ymax"] = upper_bound
if not grouping_vars:
bootstrap_estimates = self._bootstrap_resampling(data)
else:
bootstrap_estimates = groupby.apply(data, self._bootstrap_resampling)

return smoothed
return smoothed.join(bootstrap_estimates[["ymin", "ymax"]]) if self.num_bootstrap else smoothed
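To make the confidence-band arithmetic in _bootstrap_resampling explicit, here is a minimal sketch with toy data (not the estimator itself), assuming only NumPy: for alpha = 0.95 the band spans the 2.5th to 97.5th percentile of the bootstrap estimates at each grid point.

import numpy as np

alpha = 0.95
rng = np.random.default_rng(0)
# Toy stand-in for the (num_bootstrap, gridsize) array filled in above
bootstrap_estimates = rng.normal(size=(200, 100))

lower_bound = np.percentile(bootstrap_estimates, (1 - alpha) / 2 * 100, axis=0)  # 2.5th percentile
upper_bound = np.percentile(bootstrap_estimates, (1 + alpha) / 2 * 100, axis=0)  # 97.5th percentile
assert lower_bound.shape == upper_bound.shape == (100,)  # one bound per grid point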
5 changes: 1 addition & 4 deletions test_main.py
@@ -113,12 +113,9 @@ def test_lowess_with_ci(cleanup_files):
# Load the penguins dataset
penguins = sns.load_dataset("penguins")

# Prepare data
data = penguins[penguins['species'] == 'Adelie']

# Create the plot
plot = (
so.Plot(data, x="bill_length_mm", y="body_mass_g")
so.Plot(penguins, x="bill_length_mm", y="body_mass_g", color="species")
.add(so.Dot())
.add(so.Line(), lowess := sor.Lowess(frac=0.2, gridsize=100, num_bootstrap=200, alpha=0.95))
.add(so.Band(), lowess)
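Assuming pytest is the project's test runner (the cleanup_files fixture above suggests so), the updated example can be exercised directly:

pytest test_main.py::test_lowess_with_ci -q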
