Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Polyfit update to reflect recent requirements #11

Merged
merged 2 commits into from
Jun 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 18 additions & 16 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -229,28 +229,30 @@ def test_polyfit_with_ci(cleanup_files):
penguins = sns.load_dataset("penguins")

# Prepare data
data = penguins[penguins['species'] == 'Adelie']
data = penguins.copy()
data = data[data["species"] == "Adelie"]

# Initialize PolyFit instance with bootstrapping
poly_fit_with_bootstrap = sor.PolyFit(order=2, gridsize=100, num_bootstrap=200, alpha=0.05)
# Initialize PolyFit instance with confidence intervals
poly_fit_with_ci = sor.PolyFit(order=2, gridsize=100, alpha=0.05)

# Call the PolyFit method on prepared data
results_with_bootstrap = poly_fit_with_bootstrap(data, 'bill_length_mm', 'body_mass_g')
results_with_ci = poly_fit_with_ci(data, "bill_length_mm", "body_mass_g")

# Plotting
fig, ax = plt.subplots(figsize=(9, 5))
sns.scatterplot(x='bill_length_mm', y='body_mass_g', data=data, ax=ax, color='blue', alpha=0.5)
ax.plot(results_with_bootstrap['bill_length_mm'], results_with_bootstrap['body_mass_g'], color='darkblue')
if 'ci_lower' in results_with_bootstrap.columns and 'ci_upper' in results_with_bootstrap.columns:
ax.fill_between(results_with_bootstrap['bill_length_mm'],
results_with_bootstrap['ci_lower'],
results_with_bootstrap['ci_upper'],
color='blue',
alpha=0.3)
ax.set_xlabel('Bill Length (mm)')
ax.set_ylabel('Body Mass (g)')
ax.set_title('Polynomial Fit with Confidence Intervals for Adelie Penguins')
ax.grid(True, which='both', color='gray', linewidth=0.5, linestyle='--')
sns.scatterplot(x="bill_length_mm", y="body_mass_g", data=data, ax=ax, color="blue", alpha=0.5)
ax.plot(results_with_ci["bill_length_mm"], results_with_ci["body_mass_g"], color="darkblue")
ax.fill_between(
results_with_ci["bill_length_mm"],
results_with_ci["ci_lower"],
results_with_ci["ci_upper"],
color="blue",
alpha=0.3,
)
ax.set_xlabel("Bill Length (mm)")
ax.set_ylabel("Body Mass (g)")
ax.set_title("Polynomial Fit with Confidence Intervals for Adelie Penguins")
ax.grid(True, which="both", color="gray", linewidth=0.5, linestyle="--")
plt.show()
```
### Output
Expand Down
Empty file added docs/recipes_overview.md
Empty file.
478 changes: 478 additions & 0 deletions docs/tutorial/recipes_tut.ipynb

Large diffs are not rendered by default.

Empty file added docs/usage_instructions.md
Empty file.
Binary file modified img/polyfit_with_ci.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
50 changes: 27 additions & 23 deletions seaborn_objects_recipes/recipes/plotting.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,6 @@ def plot(self, data, xvar, yvar):
return plot



@dataclass
class PolyFit(Stat):
"""
Expand All @@ -75,50 +74,55 @@ class PolyFit(Stat):
alpha: float = 0.05
order: int = 2
gridsize: int = 100
num_bootstrap: Optional[int] = None

def __post_init__(self):
# Type checking for the arguments
if not isinstance(self.order, int) or self.order <= 0:
raise ValueError("order must be a positive integer.")
if not isinstance(self.gridsize, int) or self.gridsize <= 0:
raise ValueError("gridsize must be a positive integer.")
if self.num_bootstrap is not None and (not isinstance(self.num_bootstrap, int) or self.num_bootstrap <= 0):
raise ValueError("num_bootstrap must be a positive integer or None.")
if not isinstance(self.alpha, float) or not (0 < self.alpha < 1):
raise ValueError("alpha must be a float between 0 and 1.")

def _fit_predict(self, data):
data = data.dropna(subset=["x", "y"])
x = data["x"]
y = data["y"]
if x.nunique() <= self.order:
x = data["x"].values
y = data["y"].values
if x.size <= self.order:
xx = yy = []
else:
p = np.polyfit(x, y, self.order)
xx = np.linspace(x.min(), x.max(), self.gridsize)
yy = np.polyval(p, xx)

# Calculate confidence intervals
# Design matrix
X_design = np.vander(xx, self.order + 1)

# Calculate standard errors
y_hat = np.polyval(p, x)
residuals = y - y_hat
dof = max(0, len(x) - (self.order + 1))
residual_std_error = np.sqrt(np.sum(residuals**2) / dof)

# Covariance matrix of coefficients
C_matrix = np.linalg.inv(X_design.T @ X_design) * residual_std_error**2

# Calculate the standard error for the predicted values
y_err = np.sqrt(np.sum((X_design @ C_matrix) * X_design, axis=1))

# Calculate the confidence intervals
ci = y_err * 1.96 # For approximately 95% CI
ci_lower = yy - ci
ci_upper = yy + ci

results = pd.DataFrame(dict(x=xx, y=yy))

if self.num_bootstrap:
bootstrap_estimates = np.empty((self.num_bootstrap, len(xx)))
for i in range(self.num_bootstrap):
sample = data.sample(frac=1, replace=True)
p = np.polyfit(sample["x"], sample["y"], self.order)
yy_sample = np.polyval(p, xx)
bootstrap_estimates[i, :] = yy_sample

lower_bound = np.percentile(bootstrap_estimates, (1 - self.alpha) / 2 * 100, axis=0)
upper_bound = np.percentile(bootstrap_estimates, (1 + self.alpha) / 2 * 100, axis=0)
results["ci_lower"] = lower_bound
results["ci_upper"] = upper_bound
results = pd.DataFrame(dict(x=xx, y=yy, ci_lower=ci_lower, ci_upper=ci_upper))

return results

def __call__(self, data, xvar, yvar):
# Rename columns to match expected input for _fit_predict
data_renamed = data.rename(columns={xvar: "x", yvar: "y"})
#return groupby.apply(data_renamed.dropna(subset=["x", "y"]), self._fit_predict)
results = self._fit_predict(data_renamed)
return results.rename(columns={"x": xvar, "y": yvar})
return results.rename(columns={"x": xvar, "y": yvar, "ci_lower": "ci_lower", "ci_upper": "ci_upper"})

37 changes: 19 additions & 18 deletions test_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,29 +221,30 @@ def test_polyfit_with_ci(cleanup_files):
penguins = sns.load_dataset("penguins")

# Prepare data
data = penguins[penguins['species'] == 'Adelie']

# Initialize PolyFit instance with bootstrapping
poly_fit_with_bootstrap = sor.PolyFit(order=2, gridsize=100, num_bootstrap=200, alpha=0.05)
data = penguins.copy()
data = data[data["species"] == "Adelie"]

# Initialize PolyFit instance with confidence intervals
poly_fit_with_ci = sor.PolyFit(order=2, gridsize=100, alpha=0.05)

# Call the PolyFit method on prepared data
results_with_bootstrap = poly_fit_with_bootstrap(data, 'bill_length_mm', 'body_mass_g')
results_with_ci = poly_fit_with_ci(data, "bill_length_mm", "body_mass_g")

# Plotting
fig, ax = plt.subplots(figsize=(9, 5))
sns.scatterplot(x='bill_length_mm', y='body_mass_g', data=data, ax=ax, color='blue', alpha=0.5)
ax.plot(results_with_bootstrap['bill_length_mm'], results_with_bootstrap['body_mass_g'], color='darkblue')
if 'ci_lower' in results_with_bootstrap.columns and 'ci_upper' in results_with_bootstrap.columns:
ax.fill_between(results_with_bootstrap['bill_length_mm'],
results_with_bootstrap['ci_lower'],
results_with_bootstrap['ci_upper'],
color='blue',
alpha=0.3)
ax.set_xlabel('Bill Length (mm)')
ax.set_ylabel('Body Mass (g)')
ax.set_title('Polynomial Fit with Confidence Intervals for Adelie Penguins')
ax.grid(True, which='both', color='gray', linewidth=0.5, linestyle='--')
sns.scatterplot(x="bill_length_mm", y="body_mass_g", data=data, ax=ax, color="blue", alpha=0.5)
ax.plot(results_with_ci["bill_length_mm"], results_with_ci["body_mass_g"], color="darkblue")
ax.fill_between(
results_with_ci["bill_length_mm"],
results_with_ci["ci_lower"],
results_with_ci["ci_upper"],
color="blue",
alpha=0.3,
)
ax.set_xlabel("Bill Length (mm)")
ax.set_ylabel("Body Mass (g)")
ax.set_title("Polynomial Fit with Confidence Intervals for Adelie Penguins")
ax.grid(True, which="both", color="gray", linewidth=0.5, linestyle="--")
#plt.show()
plt.savefig("polyfit_with_ci.png")
# Assert that the file was created
Expand Down
Loading