Skip to content

Commit

Permalink
Get rid of TableRegressionModel by storing formula directly in model (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
nalimilan authored Nov 20, 2022
1 parent 9dc4d6b commit b728917
Show file tree
Hide file tree
Showing 12 changed files with 620 additions and 175 deletions.
2 changes: 2 additions & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ StatsAPI = "82ae8749-77ed-4fe6-ae5f-f523153014b0"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
StatsFuns = "4c63d2b9-4356-54db-8cca-17b64c39e42c"
StatsModels = "3eaba693-59b7-5ba5-a881-562e759f1c8d"
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"

[compat]
CSV = "0.7, 0.8"
Expand All @@ -27,6 +28,7 @@ StatsAPI = "1.4"
StatsBase = "0.33.5"
StatsFuns = "0.6, 0.7, 0.8, 0.9, 1.0"
StatsModels = "0.6.23"
Tables = "1"
julia = "1"

[extras]
Expand Down
4 changes: 2 additions & 2 deletions docs/src/api.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ The most general approach to fitting a model is with the `fit` function, as in
julia> using Random
julia> fit(LinearModel, hcat(ones(10), 1:10), randn(MersenneTwister(12321), 10))
LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}:
LinearModel
Coefficients:
────────────────────────────────────────────────────────────────
Expand All @@ -41,7 +41,7 @@ This model can also be fit as
julia> using Random
julia> lm(hcat(ones(10), 1:10), randn(MersenneTwister(12321), 10))
LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}:
LinearModel
Coefficients:
────────────────────────────────────────────────────────────────
Expand Down
18 changes: 9 additions & 9 deletions docs/src/examples.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ julia> data = DataFrame(X=[1,2,3], Y=[2,4,7])
3 │ 3 7
julia> ols = lm(@formula(Y ~ X), data)
StatsModels.TableRegressionModel{LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}, Matrix{Float64}}
LinearModel
Y ~ 1 + X
Expand Down Expand Up @@ -99,7 +99,7 @@ julia> data = DataFrame(X=[1,2,2], Y=[1,0,1])
3 │ 2 1
julia> probit = glm(@formula(Y ~ X), data, Binomial(), ProbitLink())
StatsModels.TableRegressionModel{GeneralizedLinearModel{GLM.GlmResp{Vector{Float64}, Binomial{Float64}, ProbitLink}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}, Matrix{Float64}}
GeneralizedLinearModel
Y ~ 1 + X
Expand Down Expand Up @@ -140,7 +140,7 @@ julia> quine = dataset("MASS", "quine")
131 rows omitted
julia> nbrmodel = glm(@formula(Days ~ Eth+Sex+Age+Lrn), quine, NegativeBinomial(2.0), LogLink())
StatsModels.TableRegressionModel{GeneralizedLinearModel{GLM.GlmResp{Vector{Float64}, NegativeBinomial{Float64}, LogLink}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}, Matrix{Float64}}
GeneralizedLinearModel
Days ~ 1 + Eth + Sex + Age + Lrn
Expand All @@ -158,7 +158,7 @@ Lrn: SL 0.296768 0.185934 1.60 0.1105 -0.0676559 0.661191
────────────────────────────────────────────────────────────────────────────
julia> nbrmodel = negbin(@formula(Days ~ Eth+Sex+Age+Lrn), quine, LogLink())
StatsModels.TableRegressionModel{GeneralizedLinearModel{GLM.GlmResp{Vector{Float64}, NegativeBinomial{Float64}, LogLink}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}, Matrix{Float64}}
GeneralizedLinearModel
Days ~ 1 + Eth + Sex + Age + Lrn
Expand All @@ -175,7 +175,7 @@ Age: F3 0.356955 0.247228 1.44 0.1488 -0.127602 0.841513
Lrn: SL 0.292138 0.18565 1.57 0.1156 -0.0717297 0.656006
────────────────────────────────────────────────────────────────────────────
julia> println("Estimated theta = ", round(nbrmodel.model.rr.d.r, digits=5))
julia> println("Estimated theta = ", round(nbrmodel.rr.d.r, digits=5))
Estimated theta = 1.27489
```
Expand Down Expand Up @@ -207,7 +207,7 @@ julia> form = dataset("datasets", "Formaldehyde")
6 │ 0.9 0.782
julia> lm1 = fit(LinearModel, @formula(OptDen ~ Carb), form)
StatsModels.TableRegressionModel{LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}, Matrix{Float64}}
LinearModel
OptDen ~ 1 + Carb
Expand Down Expand Up @@ -256,7 +256,7 @@ julia> LifeCycleSavings = dataset("datasets", "LifeCycleSavings")
35 rows omitted
julia> fm2 = fit(LinearModel, @formula(SR ~ Pop15 + Pop75 + DPI + DDPI), LifeCycleSavings)
StatsModels.TableRegressionModel{LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}, Matrix{Float64}}
LinearModel
SR ~ 1 + Pop15 + Pop75 + DPI + DDPI
Expand Down Expand Up @@ -364,7 +364,7 @@ julia> dobson = DataFrame(Counts = [18.,17,15,20,10,21,25,13,13],
9 │ 13.0 3 3
julia> gm1 = fit(GeneralizedLinearModel, @formula(Counts ~ Outcome + Treatment), dobson, Poisson())
StatsModels.TableRegressionModel{GeneralizedLinearModel{GLM.GlmResp{Vector{Float64}, Poisson{Float64}, LogLink}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}, Matrix{Float64}}
GeneralizedLinearModel
Counts ~ 1 + Outcome + Treatment
Expand Down Expand Up @@ -421,7 +421,7 @@ julia> round(optimal_bic.minimizer, digits = 5) # Optimal λ
0.40935
julia> glm(@formula(Volume ~ Height + Girth), trees, Normal(), PowerLink(optimal_bic.minimizer)) # Best model
StatsModels.TableRegressionModel{GeneralizedLinearModel{GLM.GlmResp{Vector{Float64}, Normal{Float64}, PowerLink}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}, Matrix{Float64}}
GeneralizedLinearModel
Volume ~ 1 + Height + Girth
Expand Down
4 changes: 2 additions & 2 deletions docs/src/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ julia> data = DataFrame(y = rand(rng, 100), x = categorical(repeat([1, 2, 3, 4],
julia> lm(@formula(y ~ x), data)
StatsModels.TableRegressionModel{LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}, Matrix{Float64}}
LinearModel
y ~ 1 + x
Expand All @@ -108,7 +108,7 @@ julia> using StableRNGs
julia> data = DataFrame(y = rand(StableRNG(1), 100), x = repeat([1, 2, 3, 4], 25));
julia> lm(@formula(y ~ x), data, contrasts = Dict(:x => DummyCoding()))
StatsModels.TableRegressionModel{LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}, Matrix{Float64}}
LinearModel
y ~ 1 + x
Expand Down
29 changes: 27 additions & 2 deletions src/GLM.jl
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,14 @@ module GLM
import Base: (\), convert, show, size
import LinearAlgebra: cholesky, cholesky!
import Statistics: cor
import StatsBase: coef, coeftable, confint, deviance, nulldeviance, dof, dof_residual,
loglikelihood, nullloglikelihood, nobs, stderror, vcov, residuals, predict,
import StatsBase: coef, coeftable, coefnames, confint, deviance, nulldeviance, dof, dof_residual,
loglikelihood, nullloglikelihood, nobs, stderror, vcov,
residuals, predict, predict!,
fitted, fit, model_response, response, modelmatrix, r2, r², adjr2, adjr², PValue
import StatsFuns: xlogy
import SpecialFunctions: erfc, erfcinv, digamma, trigamma
import StatsModels: hasintercept
import Tables
export coef, coeftable, confint, deviance, nulldeviance, dof, dof_residual,
loglikelihood, nullloglikelihood, nobs, stderror, vcov, residuals, predict,
fitted, fit, fit!, model_response, response, modelmatrix, r2, r², adjr2, adjr²,
Expand Down Expand Up @@ -89,6 +91,29 @@ module GLM
pivoted_cholesky!(A; kwargs...) = cholesky!(A, RowMaximum(); kwargs...)
end

# Shared docstring fragment interpolated into the `fit`/`lm`/`glm` docstrings so the
# keyword-argument documentation stays consistent across all fitting entry points.
# NOTE: this string is user-visible documentation text — edits here change rendered docs.
const COMMON_FIT_KWARGS_DOCS = """
- `dropcollinear::Bool=true`: Controls whether or not a model matrix
less-than-full rank is accepted.
If `true` (the default) the coefficient for redundant linearly dependent columns is
`0.0` and all associated statistics are set to `NaN`.
Typically from a set of linearly-dependent columns the last ones are identified as redundant
(however, the exact selection of columns identified as redundant is not guaranteed).
- `wts::Vector=similar(y,0)`: Prior frequency (a.k.a. case) weights of observations.
Such weights are equivalent to repeating each observation a number of times equal
to its weight. Do note that this interpretation gives equal point estimates but
different standard errors from analytical (a.k.a. inverse variance) weights and
from probability (a.k.a. sampling) weights which are the default in some other
software.
Can be length 0 to indicate no weighting (default).
- `contrasts::AbstractDict{Symbol}=Dict{Symbol,Any}()`: a `Dict` mapping term names
(as `Symbol`s) to term types (e.g. `ContinuousTerm`) or contrasts
(e.g., `HelmertCoding()`, `SeqDiffCoding(; levels=["a", "b", "c"])`,
etc.). If contrasts are not provided for a variable, the appropriate
term type will be guessed based on the data type from the data column:
any numeric data is assumed to be continuous, and any non-numeric data
is assumed to be categorical (with `DummyCoding()` as the default contrast type).
"""

include("linpred.jl")
include("lm.jl")
include("glmtools.jl")
Expand Down
17 changes: 17 additions & 0 deletions src/deprecated.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,20 @@

# The positional `level` argument was replaced by a `level` keyword; these shims
# keep the old call form working while emitting a deprecation warning.
@deprecate confint(obj::LinearModel, level::Real) confint(obj, level=level)
@deprecate confint(obj::AbstractGLM, level::Real) confint(obj, level=level)

"""
    Base.getproperty(mm::LinPredModel, f::Symbol)

Deprecation shim for code written against the old `TableRegressionModel` wrapper.

- `mm.model` used to return the wrapped model; it now warns and returns `mm` itself.
- `mm.mf` used to return the `ModelFrame`; it now warns and returns a skeleton
  `ModelFrame` carrying only the formula (the other fields are `nothing`), since
  the model no longer stores a model frame. Use `formula(mm)` instead.
- Any other property falls through to the actual field via `getfield`.
"""
function Base.getproperty(mm::LinPredModel, f::Symbol)
    if f === :model
        Base.depwarn("accessing the `model` field of GLM.jl models is deprecated, " *
                     "as they are no longer wrapped in a `TableRegressionModel` " *
                     "and can be used directly now", :getproperty)
        return mm
    elseif f === :mf
        # Fix: the original concatenation lacked a space after the period,
        # producing "…`TableRegressionModel`.Use `formula(m)`…" in the warning.
        Base.depwarn("accessing the `mf` field of GLM.jl models is deprecated, " *
                     "as they are no longer wrapped in a `TableRegressionModel`. " *
                     "Use `formula(m)` to access the model formula.", :getproperty)
        form = formula(mm)
        # Skeleton ModelFrame: only the formula is meaningful; data/schema slots
        # are `nothing` because the model no longer retains them.
        return ModelFrame{Nothing, typeof(mm)}(form, nothing, nothing, typeof(mm))
    else
        return getfield(mm, f)
    end
end
5 changes: 2 additions & 3 deletions src/ftest.jl
Original file line number Diff line number Diff line change
Expand Up @@ -108,8 +108,7 @@ julia> model = lm(@formula(Result ~ 1 + Treatment), dat);
julia> bigmodel = lm(@formula(Result ~ 1 + Treatment + Other), dat);
julia> ftest(nullmodel.model, model.model)
julia> ftest(nullmodel, model)
F-test: 2 models fitted on 12 observations
─────────────────────────────────────────────────────────────────
DOF ΔDOF SSR ΔSSR R² ΔR² F* p(>F)
Expand All @@ -118,7 +117,7 @@ F-test: 2 models fitted on 12 observations
[2] 3 1 0.1283 -3.1008 0.9603 0.9603 241.6234 <1e-07
─────────────────────────────────────────────────────────────────
julia> ftest(nullmodel.model, model.model, bigmodel.model)
julia> ftest(nullmodel, model, bigmodel)
F-test: 3 models fitted on 12 observations
─────────────────────────────────────────────────────────────────
DOF ΔDOF SSR ΔSSR R² ΔR² F* p(>F)
Expand Down
Loading

0 comments on commit b728917

Please sign in to comment.