From 0a03c6f95bdf638cf29074d045be21f52194427f Mon Sep 17 00:00:00 2001
From: dani
Date: Tue, 27 Mar 2018 16:43:43 -0700
Subject: [PATCH 1/9] use sp.special instead of sp.misc

---
 pygam/distributions.py | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/pygam/distributions.py b/pygam/distributions.py
index ff5acd33..f8ae4916 100644
--- a/pygam/distributions.py
+++ b/pygam/distributions.py
@@ -123,9 +123,9 @@ def __init__(self, scale=None):
         """
         super(NormalDist, self).__init__(name='normal', scale=scale)

-    def pdf(self, y, mu, weights=None):
+    def log_pdf(self, y, mu, weights=None):
         """
-        computes the pdf or pmf of the values under the current distribution
+        computes the log of the pdf or pmf of the values under the current distribution

         Parameters
         ----------
@@ -144,7 +144,7 @@ def pdf(self, y, mu, weights=None):
         if weights is None:
             weights = np.ones_like(mu)
         scale = self.scale / weights
-        return np.exp(-(y - mu)**2 / (2 * scale)) / (scale * 2 * np.pi)**0.5
+        return -(y - mu)**2 / (2 * scale) - 0.5 * np.log(scale * 2 * np.pi)

     @divide_weights
     def V(self, mu):
@@ -245,9 +245,9 @@ def __init__(self, levels=1):
         super(BinomialDist, self).__init__(name='binomial', scale=1.)
         self._exclude.append('scale')

-    def pdf(self, y, mu, weights=None):
+    def log_pdf(self, y, mu, weights=None):
         """
-        computes the pdf or pmf of the values under the current distribution
+        computes the log of the pdf or pmf of the values under the current distribution

         Parameters
         ----------
@@ -266,8 +266,8 @@ def pdf(self, y, mu, weights=None):
         if weights is None:
             weights = np.ones_like(mu)
         n = self.levels
-        return weights * (sp.misc.comb(n, y) * (mu / n)**y *
-                          (1 - (mu / n))**(n - y))
+        return (np.log(weights * sp.special.comb(n, y)) + np.log(mu / n) * y +
+                np.log(1 - (mu / n)) * (n - y))

     @divide_weights
     def V(self, mu):
@@ -352,9 +352,9 @@ def __init__(self):
         super(PoissonDist, self).__init__(name='poisson', scale=1.)
         self._exclude.append('scale')

-    def pdf(self, y, mu, weights=None):
+    def log_pdf(self, y, mu, weights=None):
         """
-        computes the pdf or pmf of the values under the current distribution
+        computes the log of the pdf or pmf of the values under the current distribution

         Parameters
         ----------
@@ -376,7 +376,7 @@ def pdf(self, y, mu, weights=None):
         # so we want to pump up all our predictions
         # NOTE: we assume the targets are unchanged
         mu = mu * weights
-        return (mu**y) * np.exp(-mu) / sp.misc.factorial(y)
+        return np.log(mu) * y - mu - np.log(sp.special.factorial(y))

     @divide_weights
     def V(self, mu):
@@ -459,9 +459,9 @@ def __init__(self, scale=None):
         """
         super(GammaDist, self).__init__(name='gamma', scale=scale)

-    def pdf(self, y, mu, weights=None):
+    def log_pdf(self, y, mu, weights=None):
         """
-        computes the pdf or pmf of the values under the current distribution
+        computes the log of the pdf or pmf of the values under the current distribution

         Parameters
         ----------
@@ -480,8 +480,8 @@ def pdf(self, y, mu, weights=None):
         if weights is None:
             weights = np.ones_like(mu)
         nu = weights / self.scale
-        return (1. / sp.special.gamma(nu) *
-                (nu / mu)**nu * y**(nu - 1) * np.exp(-nu * y / mu))
+        return (-np.log(sp.special.gamma(nu)) +
+                np.log(nu / mu) * nu + np.log(y)*(nu - 1) + -nu * y / mu)

     @divide_weights
     def V(self, mu):
@@ -570,9 +570,9 @@ def __init__(self, scale=None):
         """
         super(InvGaussDist, self).__init__(name='inv_gauss', scale=scale)

-    def pdf(self, y, mu, weights=None):
+    def log_pdf(self, y, mu, weights=None):
         """
-        computes the pdf or pmf of the values under the current distribution
+        computes the log of the pdf or pmf of the values under the current distribution

         Parameters
         ----------
@@ -591,8 +591,8 @@ def pdf(self, y, mu, weights=None):
         if weights is None:
             weights = np.ones_like(mu)
         gamma = weights / self.scale
-        return ((gamma / (2 * np.pi * y**3))**.5 *
-                np.exp(-gamma * (y - mu)**2 / (2 * mu**2 * y)))
+        return (0.5 * np.log(gamma / (2 * np.pi * y**3)) +
+                -gamma * (y - mu)**2 / (2 * mu**2 * y))

     @divide_weights
     def V(self, mu):
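Note: besides moving off the deprecated `sp.misc` helpers (`comb` and `factorial` now live in `scipy.special`), the switch from `pdf` to `log_pdf` above matters numerically. Computing `np.log(pdf(y))` underflows long before the log-density itself does, which corrupts log-likelihood sums. A minimal sketch of the failure mode for the normal case, where `scale` plays the role of the variance as in the hunk above:

    import numpy as np

    y, mu, scale = 40., 0., 1.
    pdf = np.exp(-(y - mu)**2 / (2 * scale)) / (scale * 2 * np.pi)**0.5
    print(np.log(pdf))  # -inf: exp(-800) underflows to 0.0 before the log
    print(-(y - mu)**2 / (2 * scale) - 0.5 * np.log(scale * 2 * np.pi))  # ~ -800.92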
From a94be032d1f58c2f08608eddc51ef0428a5a9a12 Mon Sep 17 00:00:00 2001
From: dani
Date: Tue, 27 Mar 2018 16:44:00 -0700
Subject: [PATCH 2/9] add verbosity option for warnings

---
 pygam/utils.py | 47 +++++++++++++++++++++++++++++++++--------------
 1 file changed, 33 insertions(+), 14 deletions(-)

diff --git a/pygam/utils.py b/pygam/utils.py
index fa9fd144..0bd0be09 100644
--- a/pygam/utils.py
+++ b/pygam/utils.py
@@ -24,7 +24,7 @@ class NotPositiveDefiniteError(ValueError):
     """

-def cholesky(A, sparse=True):
+def cholesky(A, sparse=True, verbose=True):
     """
     Choose the best possible cholesky factorizor.

@@ -39,6 +39,8 @@ def cholesky(A, sparse=True):
         array to decompose
     sparse : boolean, default: True
         whether to return a sparse array
+    verbose : bool, default: True
+        whether to print warnings
     """
     if SKSPIMPORT:
         A = sp.sparse.csc_matrix(A)
@@ -66,7 +68,8 @@ def cholesky(A, sparse=True):
               'monotonicity/convexity penalties and many splines.\n'\
               'See installation instructions for installing '\
               'Scikit-Sparse and Suite-Sparse via Conda.'
-        warnings.warn(msg)
+        if verbose:
+            warnings.warn(msg)

         if sp.sparse.issparse(A):
             A = A.todense()
@@ -146,7 +149,7 @@ def check_dtype(X, ratio=.95):
     return dtypes

-def make_2d(array):
+def make_2d(array, verbose=True):
     """
     tiny tool to expand 1D arrays the way i want

@@ -154,6 +157,9 @@ def make_2d(array):
     ----------
     array : array-like

+    verbose : bool, default: True
+        whether to print warnings
+
     Returns
     -------
     np.array of with ndim = 2
@@ -162,13 +168,14 @@ def make_2d(array):
     if array.ndim < 2:
         msg = 'Expected 2D input data array, but found {}D. '\
               'Expanding to 2D.'.format(array.ndim)
-        warnings.warn(msg)
+        if verbose:
+            warnings.warn(msg)
         array = np.atleast_1d(array)[:,None]
     return array

 def check_array(array, force_2d=False, n_feats=None, n_dims=None,
-                min_samples=1, name='Input data'):
+                min_samples=1, name='Input data', verbose=True):
     """
     tool to perform basic data validation.
     called by check_X and check_y.
@@ -193,6 +200,8 @@ def check_array(array, force_2d=False, n_feats=None, n_dims=None,
     min_samples : int, default: 1
     name : str, default: 'Input data'
         name to use when referring to the array
+    verbose : bool, default: True
+        whether to print warnings

     Returns
     -------
@@ -200,7 +209,7 @@ def check_array(array, force_2d=False, n_feats=None, n_dims=None,
     """
     # make array
     if force_2d:
-        array = make_2d(array)
+        array = make_2d(array, verbose=verbose)
         n_dims = 2
     else:
         array = np.array(array)
@@ -242,7 +251,7 @@ def check_array(array, force_2d=False, n_feats=None, n_dims=None,
     return array

-def check_y(y, link, dist, min_samples=1):
+def check_y(y, link, dist, min_samples=1, verbose=True):
     """
     tool to ensure that the targets:
     - are in the domain of the link function
@@ -256,6 +265,8 @@ def check_y(y, link, dist, min_samples=1):
     link : Link object
     dist : Distribution object
     min_samples : int, default: 1
+    verbose : bool, default: True
+        whether to print warnings

     Returns
     -------
@@ -264,7 +275,7 @@ def check_y(y, link, dist, min_samples=1):
     y = np.ravel(y)

     y = check_array(y, force_2d=False, min_samples=min_samples, n_dims=1,
-                    name='y data')
+                    name='y data', verbose=verbose)

     warnings.filterwarnings('ignore', 'divide by zero encountered in log')
     if np.any(np.isnan(link.link(y, dist))):
@@ -277,7 +288,8 @@ def check_y(y, link, dist, min_samples=1):
     return y

-def check_X(X, n_feats=None, min_samples=1, edge_knots=None, dtypes=None):
+def check_X(X, n_feats=None, min_samples=1, edge_knots=None, dtypes=None,
+            verbose=True):
     """
     tool to ensure that X:
     - is 2 dimensional
@@ -296,13 +308,15 @@ def check_X(X, n_feats=None, min_samples=1, edge_knots=None, dtypes=None):
     min_samples : int, default: 1
     edge_knots : list of arrays, default: None
     dtypes : list of strings, default: None
+    verbose : bool, default: True
+        whether to print warnings

     Returns
     -------
     X : array with ndims == 2 containing validated X-data
     """
     X = check_array(X, force_2d=True, n_feats=n_feats, min_samples=min_samples,
-                    name='X data')
+                    name='X data', verbose=verbose)

     if (edge_knots is not None) and (dtypes is not None):
         for i, (dt, ek, feat) in enumerate(zip(dtypes, edge_knots, X.T)):
@@ -505,7 +519,7 @@ def print_data(data_dict, width=-5, keep_decimals=3, fill=' ', title=None):
         filler = fill*(width - nk - nv)
         print(k + filler + v)

-def gen_edge_knots(data, dtype):
+def gen_edge_knots(data, dtype, verbose=True):
     """
     generate uniform knots from data including the edges of the data

@@ -515,6 +529,8 @@ def gen_edge_knots(data, dtype):
     ----------
     data : array-like with one dimension
     dtype : str in {'categorical', 'numerical'}
+    verbose : bool, default: True
+        whether to print warnings

     Returns
     -------
@@ -526,7 +542,7 @@ def gen_edge_knots(data, dtype):
         return np.r_[np.min(data) - 0.5, np.unique(data) + 0.5]
     else:
         knots = np.r_[np.min(data), np.max(data)]
-        if knots[0] == knots[1]:
+        if knots[0] == knots[1] and verbose:
             warnings.warn('Data contains constant feature. '\
                           'Consider removing and setting fit_intercept=True',
                           stacklevel=2)
@@ -534,7 +550,7 @@ def gen_edge_knots(data, dtype):

 def b_spline_basis(x, edge_knots, n_splines=20, spline_order=3, sparse=True,
-                   clamped=False):
+                   clamped=False, verbose=True):
     """
     tool to generate b-spline basis using vectorized De Boor recursion
     the basis functions extrapolate linearly past the end-knots.

@@ -560,6 +576,9 @@ def b_spline_basis(x, edge_knots, n_splines=20,
         on this property, like monotonicity and convexity are no longer
         valid.

+    verbose : bool, default: True
+        whether to print warnings
+
     Returns
     -------
     basis : sparse csc matrix or array containing b-spline basis functions
@@ -580,7 +599,7 @@ def b_spline_basis(x, edge_knots, n_splines=20,
                          'found: n_splines = {} and spline_order = {}'\
                          .format(n_splines, spline_order))

-    if n_splines == 0:
+    if n_splines == 0 and verbose:
         warnings.warn('Requested 1 spline. This is equivalent to '\
                       'fitting an intercept', stacklevel=2)
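Note: with the `verbose` flags above, warnings become opt-out at each call site instead of unconditional. A quick sketch of the intended usage, assuming the patched `pygam.utils`:

    import numpy as np
    from pygam.utils import make_2d

    x = np.arange(5.)
    make_2d(x)                 # warns: 'Expected 2D input data array, but found 1D...'
    make_2d(x, verbose=False)  # same (5, 1) result, but silent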
From 5ed87b7e1860e63e71660ac7e6e34324da31bce8 Mon Sep 17 00:00:00 2001
From: dani
Date: Tue, 27 Mar 2018 16:44:35 -0700
Subject: [PATCH 3/9] add verbosity attribute, use log_pdf instead of pdf for
 the log_likelihood

---
 pygam/pygam.py | 83 +++++++++++++++++++++++++++++---------------------
 1 file changed, 49 insertions(+), 34 deletions(-)

diff --git a/pygam/pygam.py b/pygam/pygam.py
index c8f6bd52..fc0ac7bd 100644
--- a/pygam/pygam.py
+++ b/pygam/pygam.py
@@ -210,6 +210,9 @@ class GAM(Core):
     tol : float, default: 1e-4
         Tolerance for stopping criteria.

+    verbose : bool, default: False
+        whether to show pyGAM warnings
+
     Attributes
     ----------
     coef_ : array, shape (n_classes, m_features)
@@ -243,7 +246,7 @@ def __init__(self, lam=0.6, max_iter=100, n_splines=25, spline_order=3,
                  penalties='auto', tol=1e-4, distribution='normal',
                  link='identity', callbacks=['deviance', 'diffs'],
                  fit_intercept=True, fit_linear=False, fit_splines=True,
-                 dtype='auto', constraints=None):
+                 dtype='auto', constraints=None, verbose=False):

         self.max_iter = max_iter
         self.tol = tol
@@ -259,6 +262,7 @@ def __init__(self, lam=0.6, max_iter=100, n_splines=25, spline_order=3,
         self.fit_linear = fit_linear
         self.fit_splines = fit_splines
         self.dtype = dtype
+        self.verbose = verbose

         # created by other methods
         self._n_coeffs = []  # useful for indexing into model coefficients
@@ -496,7 +500,7 @@ def _validate_data_dep_params(self, X):
         for i, (dt, x) in enumerate(zip(self._dtype, X.T)):
             if dt == 'auto':
                 dt = check_dtype(x)[0]
-                if dt == 'categorical':
+                if dt == 'categorical' and self.verbose:
                     warnings.warn('detected catergorical data for feature {}'\
                                   .format(i), stacklevel=2)
             self._dtype[i] = dt
@@ -521,7 +525,7 @@ def _validate_data_dep_params(self, X):
         self._expand_attr('fit_splines', m_features)
         for i, (fl, c) in enumerate(zip(self._fit_linear, self._constraints)):
             if bool(c) and (c is not 'none'):
-                if fl:
+                if fl and self.verbose:
                     warnings.warn('cannot do fit_linear with constraints. '\
                                   'setting fit_linear=False for feature {}'\
                                   .format(i))
@@ -539,7 +543,7 @@ def _validate_data_dep_params(self, X):
         # expand spline_order, n_splines, and prepare edge_knots
         self._expand_attr('spline_order', X.shape[1], dt_alt=0)
         self._expand_attr('n_splines', X.shape[1], dt_alt=0)
-        self._edge_knots = [gen_edge_knots(feat, dtype) for feat, dtype in \
+        self._edge_knots = [gen_edge_knots(feat, dtype, verbose=self.verbose) for feat, dtype in \
                             zip(X.T, self._dtype)]

         # update our n_splines correcting for categorical features, no splines
@@ -580,11 +584,11 @@ def loglikelihood(self, X, y, weights=None):
         log-likelihood : np.array of shape (n,)
             containing log-likelihood scores
         """
-        y = check_y(y, self.link, self.distribution)
+        y = check_y(y, self.link, self.distribution, verbose=self.verbose)
         mu = self.predict_mu(X)

         if weights is not None:
-            weights = check_array(weights, name='sample weights')
+            weights = check_array(weights, name='sample weights', verbose=self.verbose)
             check_lengths(y, weights)
         else:
             weights = np.ones_like(y).astype('f')
@@ -609,7 +613,7 @@ def _loglikelihood(self, y, mu, weights=None):
         log-likelihood : np.array of shape (n,)
             containing log-likelihood scores
         """
-        return np.log(self.distribution.pdf(y=y, mu=mu, weights=weights)).sum()
+        return self.distribution.log_pdf(y=y, mu=mu, weights=weights).sum()

     def _linear_predictor(self, X=None, modelmat=None, b=None, feature=-1):
         """linear predictor
@@ -667,7 +671,8 @@ def predict_mu(self, X):
             raise AttributeError('GAM has not been fitted. Call fit first.')

         X = check_X(X, n_feats=len(self._n_coeffs) - self._fit_intercept,
-                    edge_knots=self._edge_knots, dtypes=self._dtype)
+                    edge_knots=self._edge_knots, dtypes=self._dtype,
+                    verbose=self.verbose)

         lp = self._linear_predictor(X)
         return self.link.mu(lp, self.distribution)
@@ -691,7 +696,8 @@ def predict(self, X):
             raise AttributeError('GAM has not been fitted. Call fit first.')

         X = check_X(X, n_feats=len(self._n_coeffs) - self._fit_intercept,
-                    edge_knots=self._edge_knots, dtypes=self._dtype)
+                    edge_knots=self._edge_knots, dtypes=self._dtype,
+                    verbose=self.verbose)

         return self.predict_mu(X)
@@ -715,7 +721,8 @@ def _modelmat(self, X, feature=-1):
             containing model matrix of the spline basis for selected features
         """
         X = check_X(X, n_feats=len(self._n_coeffs) - self._fit_intercept,
-                    edge_knots=self._edge_knots, dtypes=self._dtype)
+                    edge_knots=self._edge_knots, dtypes=self._dtype,
+                    verbose=self.verbose)

         if feature >= len(self._n_coeffs) or feature < -1:
             raise ValueError('feature {} out of range for X with shape {}'\
@@ -742,7 +749,8 @@ def _modelmat(self, X, feature=-1):
                                edge_knots=self._edge_knots[feature],
                                spline_order=self._spline_order[feature],
                                n_splines=self._n_splines[feature],
-                               sparse=True))
+                               sparse=True,
+                               verbose=self.verbose))

         return sp.sparse.hstack(featuremat, format='csc')
@@ -770,13 +778,14 @@ def _cholesky(self, A, **kwargs):
         constraint_l2 = self._constraint_l2
         while constraint_l2 <= self._constraint_l2_max:
             try:
-                L = cholesky(A, **kwargs)
+                L = cholesky(A, verbose=self.verbose, **kwargs)
                 self._constraint_l2 = constraint_l2
                 return L
             except NotPositiveDefiniteError:
-                warnings.warn('Matrix is not positive definite. \n'\
-                              'Increasing l2 reg by factor of 10.',
-                              stacklevel=2)
+                if self.verbose:
+                    warnings.warn('Matrix is not positive definite. \n'\
+                                  'Increasing l2 reg by factor of 10.',
+                                  stacklevel=2)
                 A -= constraint_l2 * diag
                 constraint_l2 *= 10
                 A += constraint_l2 * diag
@@ -1187,12 +1196,12 @@ def fit(self, X, y, weights=None):
         self._validate_params()

         # validate data
-        y = check_y(y, self.link, self.distribution)
-        X = check_X(X)
+        y = check_y(y, self.link, self.distribution, verbose=self.verbose)
+        X = check_X(X, verbose=self.verbose)
         check_X_y(X, y)

         if weights is not None:
-            weights = check_array(weights, name='sample weights')
+            weights = check_array(weights, name='sample weights', verbose=self.verbose)
             check_lengths(y, weights)
         else:
             weights = np.ones_like(y).astype('f')
@@ -1237,13 +1246,14 @@ def deviance_residuals(self, X, y, weights=None, scaled=False):
         if not self._is_fitted:
             raise AttributeError('GAM has not been fitted. Call fit first.')

-        y = check_y(y, self.link, self.distribution)
+        y = check_y(y, self.link, self.distribution, verbose=self.verbose)
         X = check_X(X, n_feats=len(self._n_coeffs) - self._fit_intercept,
-                    edge_knots=self._edge_knots, dtypes=self._dtype)
+                    edge_knots=self._edge_knots, dtypes=self._dtype,
+                    verbose=self.verbose)
         check_X_y(X, y)

         if weights is not None:
-            weights = check_array(weights, name='sample weights')
+            weights = check_array(weights, name='sample weights', verbose=self.verbose)
             check_lengths(y, weights)
         else:
             weights = np.ones_like(y).astype('f')
@@ -1521,7 +1531,8 @@ def confidence_intervals(self, X, width=.95, quantiles=None):
             raise AttributeError('GAM has not been fitted. Call fit first.')

         X = check_X(X, n_feats=len(self._n_coeffs) - self._fit_intercept,
-                    edge_knots=self._edge_knots, dtypes=self._dtype)
+                    edge_knots=self._edge_knots, dtypes=self._dtype,
+                    verbose=self.verbose)

         return self._get_quantiles(X, width, quantiles, prediction=False)
@@ -1666,7 +1677,7 @@ def partial_dependence(self, X, feature=-1, width=None, quantiles=None):
         m = len(self._n_coeffs) - self._fit_intercept
         X = check_X(X, n_feats=m,
                     edge_knots=self._edge_knots,
-                    dtypes=self._dtype)
+                    dtypes=self._dtype, verbose=self.verbose)

         p_deps = []

         compute_quantiles = (width is not None) or (quantiles is not None)
@@ -1797,12 +1808,12 @@ def gridsearch(self, X, y, weights=None, return_scores=False,
         if not self._is_fitted:
             self._validate_params()

-        y = check_y(y, self.link, self.distribution)
-        X = check_X(X)
+        y = check_y(y, self.link, self.distribution, verbose=self.verbose)
+        X = check_X(X, verbose=self.verbose)
         check_X_y(X, y)

         if weights is not None:
-            weights = check_array(weights, name='sample weights')
+            weights = check_array(weights, name='sample weights', verbose=self.verbose)
             check_lengths(y, weights)
         else:
             weights = np.ones_like(y).astype('f')
@@ -1899,7 +1910,8 @@ def gridsearch(self, X, y, weights=None, return_scores=False,
             except ValueError as error:
                 msg = str(error) + '\non model:\n' + str(gam)
                 msg += '\nskipping...\n'
-                warnings.warn(msg)
+                if self.verbose:
+                    warnings.warn(msg)
                 continue

             # record results
@@ -2376,7 +2388,8 @@ def prediction_intervals(self, X, width=.95, quantiles=None):
             raise AttributeError('GAM has not been fitted. Call fit first.')

         X = check_X(X, n_feats=len(self._n_coeffs) - self._fit_intercept,
-                    edge_knots=self._edge_knots, dtypes=self._dtype)
+                    edge_knots=self._edge_knots, dtypes=self._dtype,
+                    verbose=self.verbose)

         return self._get_quantiles(X, width, quantiles, prediction=True)
@@ -2557,10 +2570,11 @@ def accuracy(self, X=None, y=None, mu=None):
         if not self._is_fitted:
             raise AttributeError('GAM has not been fitted. Call fit first.')

-        y = check_y(y, self.link, self.distribution)
+        y = check_y(y, self.link, self.distribution, verbose=self.verbose)
         if X is not None:
             X = check_X(X, n_feats=len(self._n_coeffs) - self._fit_intercept,
-                        edge_knots=self._edge_knots, dtypes=self._dtype)
+                        edge_knots=self._edge_knots, dtypes=self._dtype,
+                        verbose=self.verbose)

         if mu is None:
             mu = self.predict_mu(X)
@@ -2778,7 +2792,7 @@ def _loglikelihood(self, y, mu, weights=None, rescale_y=True):
         if rescale_y:
             y = y * weights

-        return np.log(self.distribution.pdf(y=y, mu=mu, weights=weights)).sum()
+        return self.distribution.log_pdf(y=y, mu=mu, weights=weights).sum()

     def loglikelihood(self, X, y, exposure=None, weights=None):
         """
@@ -2801,11 +2815,11 @@ def loglikelihood(self, X, y, exposure=None, weights=None):
         log-likelihood : np.array of shape (n,)
             containing log-likelihood scores
         """
-        y = check_y(y, self.link, self.distribution)
+        y = check_y(y, self.link, self.distribution, verbose=self.verbose)
         mu = self.predict_mu(X)

         if weights is not None:
-            weights = check_array(weights, name='sample weights')
+            weights = check_array(weights, name='sample weights', verbose=self.verbose)
             check_lengths(y, weights)
         else:
             weights = np.ones_like(y).astype('f')
@@ -2908,7 +2922,8 @@ def predict(self, X, exposure=None):
             raise AttributeError('GAM has not been fitted. Call fit first.')

         X = check_X(X, n_feats=len(self._n_coeffs) - self._fit_intercept,
-                    edge_knots=self._edge_knots, dtypes=self._dtype)
+                    edge_knots=self._edge_knots, dtypes=self._dtype,
+                    verbose=self.verbose)

         if exposure is not None:
             exposure = np.array(exposure).astype('f')
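Note: patch 3 threads a single `self.verbose` attribute through every `check_*` and basis call, so one constructor argument now controls all pyGAM warnings, and the log-likelihood is computed from `log_pdf` directly rather than `np.log(pdf)`. A minimal sketch of the resulting API, assuming the patched `GAM`:

    import numpy as np
    from pygam import GAM

    X = np.random.rand(200, 2)
    y = X.sum(axis=1) + np.random.rand(200)

    gam = GAM(verbose=False).fit(X, y)  # no pyGAM warnings during fit
    print(gam.loglikelihood(X, y))      # summed via distribution.log_pdf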
\n'\ + 'Increasing l2 reg by factor of 10.', + stacklevel=2) A -= constraint_l2 * diag constraint_l2 *= 10 A += constraint_l2 * diag @@ -1187,12 +1196,12 @@ def fit(self, X, y, weights=None): self._validate_params() # validate data - y = check_y(y, self.link, self.distribution) - X = check_X(X) + y = check_y(y, self.link, self.distribution, verbose=self.verbose) + X = check_X(X, verbose=self.verbose) check_X_y(X, y) if weights is not None: - weights = check_array(weights, name='sample weights') + weights = check_array(weights, name='sample weights', verbose=self.verbose) check_lengths(y, weights) else: weights = np.ones_like(y).astype('f') @@ -1237,13 +1246,14 @@ def deviance_residuals(self, X, y, weights=None, scaled=False): if not self._is_fitted: raise AttributeError('GAM has not been fitted. Call fit first.') - y = check_y(y, self.link, self.distribution) + y = check_y(y, self.link, self.distribution, verbose=self.verbose) X = check_X(X, n_feats=len(self._n_coeffs) - self._fit_intercept, - edge_knots=self._edge_knots, dtypes=self._dtype) + edge_knots=self._edge_knots, dtypes=self._dtype, + verbose=self.verbose) check_X_y(X, y) if weights is not None: - weights = check_array(weights, name='sample weights') + weights = check_array(weights, name='sample weights', verbose=self.verbose) check_lengths(y, weights) else: weights = np.ones_like(y).astype('f') @@ -1521,7 +1531,8 @@ def confidence_intervals(self, X, width=.95, quantiles=None): raise AttributeError('GAM has not been fitted. Call fit first.') X = check_X(X, n_feats=len(self._n_coeffs) - self._fit_intercept, - edge_knots=self._edge_knots, dtypes=self._dtype) + edge_knots=self._edge_knots, dtypes=self._dtype, + verbose=self.verbose) return self._get_quantiles(X, width, quantiles, prediction=False) @@ -1666,7 +1677,7 @@ def partial_dependence(self, X, feature=-1, width=None, quantiles=None): m = len(self._n_coeffs) - self._fit_intercept X = check_X(X, n_feats=m, edge_knots=self._edge_knots, - dtypes=self._dtype) + dtypes=self._dtype, verbose=self.verbose) p_deps = [] compute_quantiles = (width is not None) or (quantiles is not None) @@ -1797,12 +1808,12 @@ def gridsearch(self, X, y, weights=None, return_scores=False, if not self._is_fitted: self._validate_params() - y = check_y(y, self.link, self.distribution) - X = check_X(X) + y = check_y(y, self.link, self.distribution, verbose=self.verbose) + X = check_X(X, verbose=self.verbose) check_X_y(X, y) if weights is not None: - weights = check_array(weights, name='sample weights') + weights = check_array(weights, name='sample weights', verbose=self.verbose) check_lengths(y, weights) else: weights = np.ones_like(y).astype('f') @@ -1899,7 +1910,8 @@ def gridsearch(self, X, y, weights=None, return_scores=False, except ValueError as error: msg = str(error) + '\non model:\n' + str(gam) msg += '\nskipping...\n' - warnings.warn(msg) + if self.verbose: + warnings.warn(msg) continue # record results @@ -2376,7 +2388,8 @@ def prediction_intervals(self, X, width=.95, quantiles=None): raise AttributeError('GAM has not been fitted. Call fit first.') X = check_X(X, n_feats=len(self._n_coeffs) - self._fit_intercept, - edge_knots=self._edge_knots, dtypes=self._dtype) + edge_knots=self._edge_knots, dtypes=self._dtype, + verbose=self.verbose) return self._get_quantiles(X, width, quantiles, prediction=True) @@ -2557,10 +2570,11 @@ def accuracy(self, X=None, y=None, mu=None): if not self._is_fitted: raise AttributeError('GAM has not been fitted. 
From fda20ba94d340ba0612fd8c8c97cd86d5c8d918c Mon Sep 17 00:00:00 2001
From: dani
Date: Wed, 28 Mar 2018 22:27:49 -0700
Subject: [PATCH 5/9] change weights to use float64, gridsearch can block
 progressbar

---
 pygam/pygam.py | 78 +++++++++++++++++++++++++++++---------------------
 1 file changed, 45 insertions(+), 33 deletions(-)

diff --git a/pygam/pygam.py b/pygam/pygam.py
index fc0ac7bd..aa2705ad 100644
--- a/pygam/pygam.py
+++ b/pygam/pygam.py
@@ -591,7 +591,7 @@ def loglikelihood(self, X, y, weights=None):
             weights = check_array(weights, name='sample weights', verbose=self.verbose)
             check_lengths(y, weights)
         else:
-            weights = np.ones_like(y).astype('f')
+            weights = np.ones_like(y).astype('float64')

         return self._loglikelihood(y, mu, weights=weights)
@@ -987,12 +987,12 @@ def _pirls(self, X, Y, weights):
             self.coef_ = np.ones(m) * np.sqrt(EPS)  # allow more training

         # make a reasonable initial parameter guess
-        if self._fit_intercept:
-            # set the intercept as if we had a constant model
-            const_model = (self.link.link(Y, self.distribution))
-            if np.isfinite(const_model).sum() > 0:
-                const_model = np.median(const_model[np.isfinite(const_model)])
-                self.coef_[0] += const_model
+        # if self._fit_intercept:
+        #     # set the intercept as if we had a constant model
+        #     const_model = (self.link.link(Y, self.distribution))
+        #     if np.isfinite(const_model).sum() > 0:
+        #         const_model = np.median(const_model[np.isfinite(const_model)])
+        #         self.coef_[0] += const_model

         # do our penalties require recomputing cholesky?
         chol_pen = np.ravel([np.ravel(p) for p in self._penalties])
@@ -1204,7 +1204,7 @@ def fit(self, X, y, weights=None):
             weights = check_array(weights, name='sample weights', verbose=self.verbose)
             check_lengths(y, weights)
         else:
-            weights = np.ones_like(y).astype('f')
+            weights = np.ones_like(y).astype('float64')

         # validate data-dependent parameters
         self._validate_data_dep_params(X)
@@ -1256,7 +1256,7 @@ def deviance_residuals(self, X, y, weights=None, scaled=False):
             weights = check_array(weights, name='sample weights', verbose=self.verbose)
             check_lengths(y, weights)
         else:
-            weights = np.ones_like(y).astype('f')
+            weights = np.ones_like(y).astype('float64')

         mu = self.predict_mu(X)
         sign = np.sign(y-mu)
@@ -1426,9 +1426,9 @@ def _estimate_r2(self, X=None, y=None, mu=None, weights=None):
             mu = self.predict_mu_(X=X)

         if weights is None:
-            weights = np.ones_like(y)
+            weights = np.ones_like(y).astype('float64')

-        null_mu = y.mean() * np.ones_like(y)
+        null_mu = y.mean() * np.ones_like(y).astype('float64')

         null_d = self.distribution.deviance(y=y, mu=null_mu, weights=weights)
         full_d = self.distribution.deviance(y=y, mu=mu, weights=weights)
@@ -1438,7 +1438,9 @@ def _estimate_r2(self, X=None, y=None, mu=None, weights=None):

         r2 = OrderedDict()
         r2['explained_deviance'] = 1. - full_d.sum()/null_d.sum()
-        r2['McFadden'] = 1. - full_ll/null_ll
+        print(full_ll)
+        print(null_ll)
+        r2['McFadden'] = full_ll/null_ll
         r2['McFadden_adj'] = 1. - (full_ll - self.statistics_['edof'])/null_ll

         return r2
@@ -1489,7 +1491,7 @@ def _estimate_GCV_UBRE(self, X=None, y=None, modelmat=None, gamma=1.4,
             modelmat = self._modelmat(X)

         if weights is None:
-            weights = np.ones_like(y)
+            weights = np.ones_like(y).astype('float64')

         lp = self._linear_predictor(modelmat=modelmat)
         mu = self.link.mu(lp, self.distribution)
@@ -1744,7 +1746,8 @@ def summary(self):
         print_data(self.statistics_['pseudo_r2'], title='Pseudo-R^2')

     def gridsearch(self, X, y, weights=None, return_scores=False,
-                   keep_best=True, objective='auto', **param_grids):
+                   keep_best=True, objective='auto', progress=True,
+                   **param_grids):
         """
         performs a grid search over a space of parameters for a given objective
@@ -1772,27 +1775,30 @@ def gridsearch(self, X, y, weights=None, return_scores=False,
             if None, defaults to array of ones

         return_scores : boolean, default False
-          whether to return the hyperpamaters
-          and score for each element in the grid
+            whether to return the hyperpamaters
+            and score for each element in the grid

         keep_best : boolean
-          whether to keep the best GAM as self.
-          default: True
+            whether to keep the best GAM as self.
+            default: True

         objective : string, default: 'auto'
-          metric to optimize. must be in ['AIC', 'AICc', 'GCV', 'UBRE', 'auto']
-          if 'auto', then grid search will optimize GCV for models with unknown
-          scale and UBRE for models with known scale.
+            metric to optimize. must be in ['AIC', 'AICc', 'GCV', 'UBRE', 'auto']
+            if 'auto', then grid search will optimize GCV for models with unknown
+            scale and UBRE for models with known scale.
+
+        progress : bool, default: True
+            whether to display a progress bar

         **kwargs : dict, default {'lam': np.logspace(-3, 3, 11)}
-          pairs of parameters and iterables of floats, or
-          parameters and iterables of iterables of floats.
+            pairs of parameters and iterables of floats, or
+            parameters and iterables of iterables of floats.

-          if iterable of iterables of floats, the outer iterable must have
-          length m_features.
+            if iterable of iterables of floats, the outer iterable must have
+            length m_features.

-          the method will make a grid of all the combinations of the parameters
-          and fit a GAM to each combination.
+            the method will make a grid of all the combinations of the parameters
+            and fit a GAM to each combination.

         Returns
@@ -1816,7 +1822,7 @@ def gridsearch(self, X, y, weights=None, return_scores=False,
             weights = check_array(weights, name='sample weights', verbose=self.verbose)
             check_lengths(y, weights)
         else:
-            weights = np.ones_like(y).astype('f')
+            weights = np.ones_like(y).astype('float64')

         # validate objective
         if objective not in ['auto', 'GCV', 'UBRE', 'AIC', 'AICc']:
@@ -1889,8 +1895,13 @@ def gridsearch(self, X, y, weights=None, return_scores=False,
             best_model = models[-1]
             best_score = scores[-1]

+        # make progressbar optional
+        if progress:
+            pbar = ProgressBar()
+        else:
+            pbar = lambda x: x
+
         # loop through candidate model params
-        pbar = ProgressBar()
         for param_grid in pbar(param_grid_list):

             # define new model
@@ -1926,7 +1937,8 @@ def gridsearch(self, X, y, weights=None, return_scores=False,
         # problems
         if len(models) == 0:
             msg = 'No models were fitted.'
-            warnings.warn(msg)
+            if self.verbose:
+                warnings.warn(msg)
             return self

         # copy over the best
@@ -2822,7 +2834,7 @@ def loglikelihood(self, X, y, exposure=None, weights=None):
             weights = check_array(weights, name='sample weights', verbose=self.verbose)
             check_lengths(y, weights)
         else:
-            weights = np.ones_like(y).astype('f')
+            weights = np.ones_like(y).astype('float64')

         y, weights = self._exposure_to_weights(y, exposure, weights)
         return self._loglikelihood(y, mu, weights=weights, rescale_y=True)
@@ -2852,7 +2864,7 @@ def _exposure_to_weights(self, y, exposure=None, weights=None):
         if exposure is not None:
             exposure = np.array(exposure).astype('f')
         else:
-            exposure = np.ones_like(y).astype('f')
+            exposure = np.ones_like(y).astype('float64')
         check_lengths(y, exposure)

         # normalize response
@@ -2861,7 +2873,7 @@ def _exposure_to_weights(self, y, exposure=None, weights=None):
         if weights is not None:
             weights = np.array(weights).astype('f')
         else:
-            weights = np.ones_like(y).astype('float64')
         check_lengths(weights, exposure)

         # set exposure as the weight
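Note: two independent fixes travel in patch 5. First, `.astype('f')` yields float32 default weights, and accumulating deviances and log-likelihoods in float32 loses precision, hence the move to 'float64'. Second, `gridsearch` gains a `progress` flag so the `ProgressBar` can be blocked in headless or logged runs. A sketch of both, assuming the patched API:

    import numpy as np
    from pygam import GAM

    y = np.zeros(10)                                 # float64
    print(np.ones_like(y).astype('f').dtype)         # float32 -- the old default weights
    print(np.ones_like(y).astype('float64').dtype)   # float64 -- the new ones

    X = np.random.rand(10, 1)
    gam = GAM(verbose=False).gridsearch(X, y, progress=False,
                                        lam=np.logspace(-3, 3, 5))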
From 48fd76fb14a217ce56505ca11f11ff8a7e994fba Mon Sep 17 00:00:00 2001
From: dani
Date: Tue, 3 Apr 2018 13:22:32 -0700
Subject: [PATCH 6/9] ignore log warnings in check link-domain, verbose false
 in tests_utils

---
 pygam/tests/test_utils.py | 18 +++++++++---------
 pygam/utils.py            |  1 +
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/pygam/tests/test_utils.py b/pygam/tests/test_utils.py
index 61260d0e..30db0a3f 100644
--- a/pygam/tests/test_utils.py
+++ b/pygam/tests/test_utils.py
@@ -66,32 +66,32 @@ def test_check_y_not_min_samples(wage, wage_gam):
     """check_y expects a minimum number of samples"""
     X, y = wage
     try:
-        check_y(y, wage_gam.link, wage_gam.distribution, min_samples=len(y)+1)
+        check_y(y, wage_gam.link, wage_gam.distribution, min_samples=len(y)+1, verbose=False)
         assert False
     except ValueError:
-        check_y(y, wage_gam.link, wage_gam.distribution, min_samples=len(y))
+        check_y(y, wage_gam.link, wage_gam.distribution, min_samples=len(y), verbose=False)
         assert True

-def test_check_y_not_in_doamin_link(default, default_gam):
+def test_check_y_not_in_domain_link(default, default_gam):
     """if you give labels outide of the links domain, check_y will raise an error"""
     X, y = default
     gam = default_gam

     try:
-        check_y(y + .1, default_gam.link, default_gam.distribution)
+        check_y(y + .1, default_gam.link, default_gam.distribution, verbose=False)
         assert False
     except ValueError:
-        check_y(y, default_gam.link, default_gam.distribution)
+        check_y(y, default_gam.link, default_gam.distribution, verbose=False)
         assert True

 def test_check_X_not_int_not_float():
     """X must be an in or a float"""
     try:
-        check_X(['hi'])
+        check_X(['hi'], verbose=False)
         assert False
     except ValueError:
-        check_X([4])
+        check_X([4], verbose=False)
         assert True

 def test_check_X_too_many_dims():
@@ -105,10 +105,10 @@ def test_check_X_too_many_dims():

 def test_check_X_not_min_samples():
     try:
-        check_X(np.ones((5)), min_samples=6)
+        check_X(np.ones((5)), min_samples=6, verbose=False)
         assert False
     except ValueError:
-        check_X(np.ones((5)), min_samples=5)
+        check_X(np.ones((5)), min_samples=5, verbose=False)
         assert True

 def test_check_X_y_different_lengths():
diff --git a/pygam/utils.py b/pygam/utils.py
index 0bd0be09..66e1537e 100644
--- a/pygam/utils.py
+++ b/pygam/utils.py
@@ -278,6 +278,7 @@ def check_y(y, link, dist, min_samples=1, verbose=True):
                     name='y data', verbose=verbose)

     warnings.filterwarnings('ignore', 'divide by zero encountered in log')
+    warnings.filterwarnings('ignore', 'invalid value encountered in log')
     if np.any(np.isnan(link.link(y, dist))):
         raise ValueError('y data is not in domain of {} link function. ' \
                          'Expected domain: {}, but found {}' \
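Note: `check_y` decides whether `y` is in the link's domain by probing `link.link(y, dist)` for NaNs, so numpy's RuntimeWarnings from `np.log` are pure noise here. The added filter mirrors the existing divide-by-zero one:

    import numpy as np
    import warnings

    warnings.filterwarnings('ignore', 'divide by zero encountered in log')
    warnings.filterwarnings('ignore', 'invalid value encountered in log')

    print(np.log(0.))   # -inf, warning suppressed
    print(np.log(-1.))  # nan,  warning suppressed -- caught later by np.isnan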
From 1f59906ab2ff0fcda7cc4f3bd8243941c3d4d601 Mon Sep 17 00:00:00 2001
From: dani
Date: Tue, 3 Apr 2018 15:34:57 -0700
Subject: [PATCH 7/9] replace all dist logpdfs with scipy implementation
 (except invgauss)

---
 pygam/distributions.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/pygam/distributions.py b/pygam/distributions.py
index f8ae4916..cee6780b 100644
--- a/pygam/distributions.py
+++ b/pygam/distributions.py
@@ -144,7 +144,7 @@ def log_pdf(self, y, mu, weights=None):
         if weights is None:
             weights = np.ones_like(mu)
         scale = self.scale / weights
-        return -(y - mu)**2 / (2 * scale) - 0.5 * np.log(scale * 2 * np.pi)
+        return sp.stats.norm.logpdf(y, loc=mu, scale=scale)

     @divide_weights
     def V(self, mu):
@@ -233,7 +233,7 @@ def __init__(self, levels=1):
         Parameters
         ----------
         levels : int of None, default: 1
-            number of levels in the binomial distribution
+            number of trials in the binomial distribution

         Returns
         -------
@@ -266,8 +266,8 @@ def log_pdf(self, y, mu, weights=None):
         if weights is None:
             weights = np.ones_like(mu)
         n = self.levels
-        return (np.log(weights * sp.special.comb(n, y)) + np.log(mu / n) * y +
-                np.log(1 - (mu / n)) * (n - y))
+        p = mu / self.levels
+        return sp.stats.binom.logpmf(y, n, p)

     @divide_weights
     def V(self, mu):
@@ -376,7 +376,7 @@ def log_pdf(self, y, mu, weights=None):
         # so we want to pump up all our predictions
         # NOTE: we assume the targets are unchanged
         mu = mu * weights
-        return np.log(mu) * y - mu - np.log(sp.special.factorial(y))
+        return sp.stats.poisson.logpmf(y, mu=mu)

     @divide_weights
     def V(self, mu):
@@ -480,8 +480,7 @@ def log_pdf(self, y, mu, weights=None):
         if weights is None:
             weights = np.ones_like(mu)
         nu = weights / self.scale
-        return (-np.log(sp.special.gamma(nu)) +
-                np.log(nu / mu) * nu + np.log(y)*(nu - 1) + -nu * y / mu)
+        return sp.stats.gamma.logpdf(x=y, a=nu, scale=mu / nu)

     @divide_weights
     def V(self, mu):
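Note: delegating to `sp.stats` removes the hand-rolled formulas, and equivalence is easy to spot-check. A sketch for the Poisson case, which agrees with the replaced expression (for the normal case, keep in mind that `sp.stats.norm.logpdf` expects a standard deviation for `scale`, while the variable `scale` in the hunk above is derived from a variance, so that mapping is worth double-checking):

    import numpy as np
    import scipy as sp
    import scipy.special
    import scipy.stats

    y = np.arange(5.)
    mu = np.full(5, 2.)
    by_hand = np.log(mu) * y - mu - np.log(sp.special.factorial(y))
    print(np.allclose(by_hand, sp.stats.poisson.logpmf(y, mu=mu)))  # True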
From c220798e9574d4190c079fc4dc36384df43d45a0 Mon Sep 17 00:00:00 2001
From: dani
Date: Tue, 3 Apr 2018 15:39:53 -0700
Subject: [PATCH 8/9] use scipy logpdf for invgauss dist

---
 pygam/distributions.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pygam/distributions.py b/pygam/distributions.py
index cee6780b..79ead1e9 100644
--- a/pygam/distributions.py
+++ b/pygam/distributions.py
@@ -590,8 +590,7 @@ def log_pdf(self, y, mu, weights=None):
         if weights is None:
             weights = np.ones_like(mu)
         gamma = weights / self.scale
-        return (0.5 * np.log(gamma / (2 * np.pi * y**3)) +
-                -gamma * (y - mu)**2 / (2 * mu**2 * y))
+        return sp.stats.invgauss.logpdf(y, mu, scale=1./gamma)

     @divide_weights
     def V(self, mu):

From 1195a908e55dba1522d54bed11774c71ff168e1c Mon Sep 17 00:00:00 2001
From: dani
Date: Tue, 3 Apr 2018 15:40:06 -0700
Subject: [PATCH 9/9] remove diagnostic print

---
 pygam/pygam.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/pygam/pygam.py b/pygam/pygam.py
index aa2705ad..048b46b2 100644
--- a/pygam/pygam.py
+++ b/pygam/pygam.py
@@ -1438,8 +1438,6 @@ def _estimate_r2(self, X=None, y=None, mu=None, weights=None):

         r2 = OrderedDict()
         r2['explained_deviance'] = 1. - full_d.sum()/null_d.sum()
-        print(full_ll)
-        print(null_ll)
         r2['McFadden'] = full_ll/null_ll
         r2['McFadden_adj'] = 1. - (full_ll - self.statistics_['edof'])/null_ll
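Note: the inverse-Gaussian call deserves the same spot-check as the others, since scipy's `invgauss` takes a dimensionless shape parameter plus a `scale`. Comparing the closed form that patch 8 replaced against scipy directly, the mapping that reproduces it for general `gamma` appears to be shape `mu/gamma` with `scale=gamma`; the `invgauss.logpdf(y, mu, scale=1./gamma)` form used above coincides with it only when `gamma == 1` (the unweighted, unit-scale case). A quick numerical check:

    import numpy as np
    import scipy as sp
    import scipy.stats

    y, mu, gamma = 1.3, 1.0, 2.0
    closed_form = (0.5 * np.log(gamma / (2 * np.pi * y**3))
                   - gamma * (y - mu)**2 / (2 * mu**2 * y))
    print(closed_form)                                           # ~ -1.0352
    print(sp.stats.invgauss.logpdf(y, mu / gamma, scale=gamma))  # matches
    print(sp.stats.invgauss.logpdf(y, mu, scale=1. / gamma))     # differs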