From 5df0fb52ed376953d70a5a9593283221174c876a Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Thu, 3 Oct 2019 18:41:26 +0200 Subject: [PATCH 01/14] Get rid of TableRegressionModel by storing formula directly in model --- Project.toml | 1 + docs/src/examples.md | 2 +- src/GLM.jl | 46 ++++++++++++++++++++ src/deprecated.jl | 10 +++++ src/ftest.jl | 5 +-- src/glmfit.jl | 76 +++++++++++++++++++++++++++----- src/linpred.jl | 2 +- src/lm.jl | 69 ++++++++++++++++++++++------- src/negbinfit.jl | 8 ++-- test/runtests.jl | 100 +++++++++++++++++++++---------------------- 10 files changed, 233 insertions(+), 86 deletions(-) diff --git a/Project.toml b/Project.toml index eb886257..27eace0c 100644 --- a/Project.toml +++ b/Project.toml @@ -14,6 +14,7 @@ StatsAPI = "82ae8749-77ed-4fe6-ae5f-f523153014b0" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" StatsFuns = "4c63d2b9-4356-54db-8cca-17b64c39e42c" StatsModels = "3eaba693-59b7-5ba5-a881-562e759f1c8d" +Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" [compat] CSV = "0.7, 0.8" diff --git a/docs/src/examples.md b/docs/src/examples.md index fc796a67..f7b9ecd4 100644 --- a/docs/src/examples.md +++ b/docs/src/examples.md @@ -132,7 +132,7 @@ Age: F3 0.356955 0.247228 1.44 0.1488 -0.127602 0.841513 Lrn: SL 0.292138 0.18565 1.57 0.1156 -0.0717297 0.656006 ──────────────────────────────────────────────────────────────────────────── -julia> println("Estimated theta = ", round(nbrmodel.model.rr.d.r, digits=5)) +julia> println("Estimated theta = ", round(nbrmodel.rr.d.r, digits=5)) Estimated theta = 1.27489 ``` diff --git a/src/GLM.jl b/src/GLM.jl index 019f80e3..83fcd570 100644 --- a/src/GLM.jl +++ b/src/GLM.jl @@ -89,6 +89,52 @@ module GLM pivoted_cholesky!(A; kwargs...) = cholesky!(A, RowMaximum(); kwargs...) end + ## TODO: define these in StatsModels + using Tables + + function StatsModels.modelmatrix(f::FormulaTerm, data, contrasts; + model::Type{M}=StatisticalModel) where M + Tables.istable(data) || + throw(ArgumentError("expected data in a Table, got $(typeof(data))")) + data, _ = StatsModels.missing_omit(Tables.columntable(data), f) + sch = schema(f, data, contrasts) + f = apply_schema(f, sch, M) + f, modelcols(f, data) + end + + formula(x::StatisticalModel) = + throw(ArgumentError("formula not implemented for $(nameof(typeof(x)))")) + + StatsBase.coefnames(x::StatisticalModel) = coefnames(formula(x).rhs) + + get_type(::ContinuousTerm{T}) where {T} = T + + function StatsBase.predict(mm::StatisticalModel, data; + interval::Union{Symbol,Nothing}=nothing, level::Real=0.95, + kwargs...) + Tables.istable(data) || + throw(ArgumentError("expected data in a Table, got $(typeof(data))")) + + f = formula(mm) + cols, nonmissings = StatsModels.missing_omit(columntable(data), f.rhs) + new_x = modelcols(f.rhs, cols) + nr = size(new_x, 1) + y_pred = Tables.allocatecolumn(Union{get_type(f.lhs), Missing}, nr) + fill!(y_pred, missing) + if interval === nothing + predict!(view(y_pred, nonmissings), mm, reshape(new_x, nr, :); kwargs...) + return y_pred + else + lower = Vector{Union{Float64, Missing}}(missing, nr) + upper = Vector{Union{Float64, Missing}}(missing, nr) + tup = (prediction=view(y_pred, nonmissings), + lower=view(lower, nonmissings), + upper=view(upper, nonmissings)) + predict!(tup, mm, reshape(new_x, nr, :); kwargs...) 
+ return (prediction=y_pred, lower=lower, upper=upper) + end + end + include("linpred.jl") include("lm.jl") include("glmtools.jl") diff --git a/src/deprecated.jl b/src/deprecated.jl index f3f60243..0d374063 100644 --- a/src/deprecated.jl +++ b/src/deprecated.jl @@ -2,3 +2,13 @@ @deprecate confint(obj::LinearModel, level::Real) confint(obj, level=level) @deprecate confint(obj::AbstractGLM, level::Real) confint(obj, level=level) + +function getproperty(m::AbstractGLM, field) + if field === :model + Base.depwarn("accessing the `model` field of `AbstractGLM` objects is deprecated, " * + "as it now returns its parent object", :getproperty) + return m + else + return getfield(m, field) + end +end \ No newline at end of file diff --git a/src/ftest.jl b/src/ftest.jl index 6c8bb8d9..d3a2a6e0 100644 --- a/src/ftest.jl +++ b/src/ftest.jl @@ -108,8 +108,7 @@ julia> model = lm(@formula(Result ~ 1 + Treatment), dat); julia> bigmodel = lm(@formula(Result ~ 1 + Treatment + Other), dat); - -julia> ftest(nullmodel.model, model.model) +julia> ftest(nullmodel, model) F-test: 2 models fitted on 12 observations ────────────────────────────────────────────────────────────────── DOF ΔDOF SSR ΔSSR R² ΔR² F* p(>F) @@ -118,7 +117,7 @@ F-test: 2 models fitted on 12 observations [2] 3 1 0.1283 -3.1008 0.9603 0.9603 241.6234 <1e-07 ────────────────────────────────────────────────────────────────── -julia> ftest(nullmodel.model, model.model, bigmodel.model) +julia> ftest(nullmodel, model, bigmodel) F-test: 3 models fitted on 12 observations ────────────────────────────────────────────────────────────────── DOF ΔDOF SSR ΔSSR R² ΔR² F* p(>F) diff --git a/src/glmfit.jl b/src/glmfit.jl index 0c108296..756f6510 100644 --- a/src/glmfit.jl +++ b/src/glmfit.jl @@ -226,12 +226,15 @@ end abstract type AbstractGLM <: LinPredModel end -mutable struct GeneralizedLinearModel{G<:GlmResp,L<:LinPred} <: AbstractGLM +mutable struct GeneralizedLinearModel{G<:GlmResp,L<:LinPred,F<:Union{FormulaTerm,Nothing}} <: AbstractGLM rr::G pp::L + f::F fit::Bool end +formula(obj::GeneralizedLinearModel) = obj.f + function coeftable(mm::AbstractGLM; level::Real=0.95) cc = coef(mm) se = stderror(mm) @@ -239,9 +242,10 @@ function coeftable(mm::AbstractGLM; level::Real=0.95) p = 2 * ccdf.(Ref(Normal()), abs.(zz)) ci = se*quantile(Normal(), (1-level)/2) levstr = isinteger(level*100) ? string(Integer(level*100)) : string(level*100) + cn = mm.f === nothing ? ["x$i" for i = 1:size(mm.pp.X, 2)] : coefnames(mm) CoefTable(hcat(cc,se,zz,p,cc+ci,cc-ci), ["Coef.","Std. Error","z","Pr(>|z|)","Lower $levstr%","Upper $levstr%"], - ["x$i" for i = 1:size(mm.pp.X, 2)], 4, 3) + cn, 4, 3) end function confint(obj::AbstractGLM; level::Real=0.95) @@ -486,7 +490,7 @@ function fit(::Type{M}, end rr = GlmResp(y, d, l, offset, wts) - res = M(rr, cholpred(X), false) + res = M(rr, cholpred(X), nothing, false) return dofit ? fit!(res; fitargs...) : res end @@ -497,6 +501,33 @@ fit(::Type{M}, l::Link=canonicallink(d); kwargs...) where {M<:AbstractGLM} = fit(M, float(X), float(y), d, l; kwargs...) +function fit(::Type{M}, + f::FormulaTerm, + data, + d::UnivariateDistribution, + l::Link=canonicallink(d); + # TODO: support passing wts and offset as symbols + offset::Union{AbstractVector, Nothing} = nothing, + wts::Union{AbstractVector, Nothing} = nothing, + dofit::Bool = true, + contrasts::AbstractDict{Symbol}=Dict{Symbol,Any}(), + fitargs...) 
where {M<:AbstractGLM} + f, (y, X) = StatsModels.modelmatrix(f, data, contrasts, model=M) + + # Check that X and y have the same number of observations + if size(X, 1) != size(y, 1) + throw(DimensionMismatch("number of rows in X and y must match")) + end + + # TODO: allocate right type upfront + yf = float(y) + off = offset === nothing ? similar(yf, 0) : offset + wts = wts === nothing ? similar(yf, 0) : wts + rr = GlmResp(yf, d, l, off, wts) + res = M(rr, cholpred(X), f, false) + return dofit ? fit!(res; fitargs...) : res +end + """ glm(formula, data, distr::UnivariateDistribution, link::Link = canonicallink(d); ) @@ -557,7 +588,27 @@ function predict(mm::AbstractGLM, newX::AbstractMatrix; offset::FPVector=eltype(newX)[], interval::Union{Symbol,Nothing}=nothing, level::Real=0.95, - interval_method=:transformation) + interval_method=:transformation, + skipmissing::Bool=false) + r = response(mm) + len = size(newX, 1) + res = interval === nothing ? + similar(r, len) : + (prediction=similar(r, len), lower=similar(r, len), upper=similar(r, len)) + predict!(res, mm, newX, + offset=offset, interval=interval, level=level, + interval_method=interval_method) +end + +# TODO: add docstring +function StatsModels.predict!(res::Union{AbstractVector, + NamedTuple{(:prediction, :lower, :upper), + <: NTuple{3, AbstractVector}}}, + mm::AbstractGLM, newX::AbstractMatrix; + offset::FPVector=eltype(newX)[], + interval::Union{Symbol,Nothing}=nothing, + level::Real=0.95, + interval_method=:transformation) eta = newX * coef(mm) if !isempty(mm.rr.offset) length(offset) == size(newX, 1) || @@ -566,11 +617,14 @@ function predict(mm::AbstractGLM, newX::AbstractMatrix; else length(offset) > 0 && throw(ArgumentError("fit without offset, so value of `offset` kw arg does not make sense")) end - mu = linkinv.(Link(mm), eta) if interval === nothing - return mu + res .= linkinv.(Link(mm), eta) elseif interval == :confidence + res isa NamedTuple || + throw(ArgumentError("`res` must be a `NamedTuple` when `interval == :confidence`")) + mu, lower, upper = res + mu .= linkinv.(Link(mm), eta) normalquantile = quantile(Normal(), (1 + level)/2) # Compute confidence intervals in two steps # (2nd step varies depending on `interval_method`) @@ -583,19 +637,19 @@ function predict(mm::AbstractGLM, newX::AbstractMatrix; # 2. Now compute the variance for mu based on variance of eta and # construct intervals based on that (Delta method) stdmu = stdeta .* abs.(mueta.(Link(mm), eta)) - lower = mu .- normalquantile .* stdmu - upper = mu .+ normalquantile .* stdmu + lower .= mu .- normalquantile .* stdmu + upper .= mu .+ normalquantile .* stdmu elseif interval_method == :transformation # 2. 
Construct intervals for eta, then apply inverse link - lower = linkinv.(Link(mm), eta .- normalquantile .* stdeta) - upper = linkinv.(Link(mm), eta .+ normalquantile .* stdeta) + lower .= linkinv.(Link(mm), eta .- normalquantile .* stdeta) + upper .= linkinv.(Link(mm), eta .+ normalquantile .* stdeta) else throw(ArgumentError("interval_method can be only :transformation or :delta")) end else throw(ArgumentError("only :confidence intervals are defined")) end - (prediction = mu, lower = lower, upper = upper) + return res end # A helper function to choose default values for eta diff --git a/src/linpred.jl b/src/linpred.jl index 4274f575..1d45c706 100644 --- a/src/linpred.jl +++ b/src/linpred.jl @@ -240,7 +240,7 @@ response(obj::LinPredModel) = obj.rr.y fitted(m::LinPredModel) = m.rr.mu predict(mm::LinPredModel) = fitted(mm) -StatsModels.formula(obj::LinPredModel) = modelframe(obj).formula +StatsModels.formula(obj::LinPredModel) = obj.formula residuals(obj::LinPredModel) = residuals(obj.rr) """ diff --git a/src/lm.jl b/src/lm.jl index 079bc893..edeb2200 100644 --- a/src/lm.jl +++ b/src/lm.jl @@ -82,13 +82,16 @@ A combination of a [`LmResp`](@ref) and a [`LinPred`](@ref) - `rr`: a `LmResp` object - `pp`: a `LinPred` object """ -struct LinearModel{L<:LmResp,T<:LinPred} <: LinPredModel +struct LinearModel{L<:LmResp,T<:LinPred,F<:Union{FormulaTerm,Nothing}} <: LinPredModel rr::L pp::T + f::F end LinearAlgebra.cholesky(x::LinearModel) = cholesky(x.pp) +formula(obj::LinearModel) = obj.f + function StatsBase.fit!(obj::LinearModel) if isempty(obj.rr.wts) delbeta!(obj.pp, obj.rr.y) @@ -121,9 +124,10 @@ const FIT_LM_DOC = """ `0.0` and all associated statistics are set to `NaN`. """ +# TODO: document contrasts keyword argument """ - fit(LinearModel, formula, data, allowrankdeficient=false; - [wts::AbstractVector], dropcollinear::Bool=true) + fit(LinearModel, formula::FormulaTerm, data, allowrankdeficient=false; + [wts::AbstractVector], dropcollinear::Bool=true) fit(LinearModel, X::AbstractMatrix, y::AbstractVector; wts::AbstractVector=similar(y, 0), dropcollinear::Bool=true) @@ -140,7 +144,17 @@ function fit(::Type{LinearModel}, X::AbstractMatrix{<:Real}, y::AbstractVector{< "argument `dropcollinear` instead. Proceeding with positional argument value: $allowrankdeficient_dep" dropcollinear = allowrankdeficient_dep end - fit!(LinearModel(LmResp(y, wts), cholpred(X, dropcollinear))) + fit!(LinearModel(LmResp(y, wts), cholpred(X, dropcollinear), nothing)) +end + +function fit(::Type{LinearModel}, f::FormulaTerm, data, + allowrankdeficient_dep::Union{Bool,Nothing}=nothing; + wts::Union{AbstractVector{<:Real}, Nothing}=nothing, + dropcollinear::Bool=true, + contrasts::AbstractDict{Symbol}=Dict{Symbol,Any}()) + f, (y, X) = StatsModels.modelmatrix(f, data, contrasts, model=LinearModel) + wts === nothing && (wts = similar(y, 0)) + fit!(LinearModel(LmResp(y, wts), cholpred(X, dropcollinear), f)) end """ @@ -233,9 +247,10 @@ function coeftable(mm::LinearModel; level::Real=0.95) ci = [isnan(t) ? NaN : -Inf for t in tt] end levstr = isinteger(level*100) ? string(Integer(level*100)) : string(level*100) + cn = mm.f === nothing ? ["x$i" for i = 1:size(mm.pp.X, 2)] : coefnames(mm) CoefTable(hcat(cc,se,tt,p,cc+ci,cc-ci), ["Coef.","Std. 
Error","t","Pr(>|t|)","Lower $levstr%","Upper $levstr%"], - ["x$i" for i = 1:size(mm.pp.X, 2)], 4, 3) + cn, 4, 3) end """ predict(mm::LinearModel, newx::AbstractMatrix; interval::Union{Symbol,Nothing}=nothing, level::Real = 0.95) Predict response values based on the fitted linear model `mm` using values of the model matrix specified in `newx`. If `interval` is set, also return the specified coverage `level`. Valid values of `interval` are `:confidence` delimiting the uncertainty of the predicted relationship, and `:prediction` delimiting estimated bounds for new data points. """ function predict(mm::LinearModel, newx::AbstractMatrix; - interval::Union{Symbol,Nothing}=nothing, level::Real = 0.95) - retmean = newx * coef(mm) + interval::Union{Symbol,Nothing}=nothing, level::Real=0.95) + retmean = similar(view(newx, :, 1)) + if interval === nothing + res = retmean + predict!(res, mm, newx) + else + res = (prediction=retmean, lower=similar(retmean), upper=similar(retmean)) + predict!(res, mm, newx, interval=interval, level=level) + end + return res +end + +StatsModels.predict!(y::AbstractVector, mm::LinearModel, newx::AbstractMatrix) = + y .= newx * coef(mm) + +function StatsModels.predict!(res::NamedTuple{(:prediction, :lower, :upper), + <:NTuple{3, AbstractVector}}, + mm::LinearModel, newx::AbstractMatrix; + interval::Symbol, level::Real=0.95) + res.prediction .= newx * coef(mm) if interval === :confint Base.depwarn("interval=:confint is deprecated in favor of interval=:confidence", :predict) interval = :confidence @@ -264,6 +297,9 @@ function predict(mm::LinearModel, newx::AbstractMatrix; "when some independent variables have been dropped " * "from the model due to collinearity")) end + res isa NamedTuple || + throw(ArgumentError("`res` must be a `NamedTuple` when `interval` is " * + "`:confidence` or `:prediction`")) length(mm.rr.wts) == 0 || error("prediction with confidence intervals not yet implemented for weighted regression") chol = cholesky!(mm.pp) # get the R matrix from the QR factorization @@ -273,16 +309,19 @@ function predict(mm::LinearModel, newx::AbstractMatrix; else R = chol.U end - residvar = ones(size(newx,2)) * deviance(mm)/dof_residual(mm) - if interval == :confidence - retvariance = (newx/R).^2 * residvar - elseif interval == :prediction - retvariance = (newx/R).^2 * residvar .+ deviance(mm)/dof_residual(mm) - else + dev = deviance(mm) + dofr = dof_residual(mm) + residvar = fill(dev/dofr, size(newx, 2), 1) + ret = dropdims((newx/R).^2 * residvar, dims=2) + if interval == :prediction + ret .+= dev/dofr + elseif interval != :confidence error("only :confidence and :prediction intervals are defined") end - retinterval = quantile(TDist(dof_residual(mm)), (1. - level)/2) * sqrt.(retvariance) - (prediction = retmean, lower = retmean .+ retinterval, upper = retmean .- retinterval) + ret .= quantile(TDist(dofr), (1 - level)/2) .* sqrt.(ret) + res.lower .= res.prediction .+ ret + res.upper .= res.prediction .- ret return res end function confint(obj::LinearModel; level::Real=0.95) diff --git a/src/negbinfit.jl b/src/negbinfit.jl index 7c36127c..e01f0336 100644 --- a/src/negbinfit.jl +++ b/src/negbinfit.jl @@ -109,9 +109,9 @@ function negbin(F, maxiter=maxiter, atol=atol, rtol=rtol, verbose=verbose, kwargs...) end - μ = regmodel.model.rr.mu - y = regmodel.model.rr.y - wts = regmodel.model.rr.wts + μ = regmodel.rr.mu + y = regmodel.rr.y + wts = regmodel.rr.wts lw, ly = length(wts), length(y) if lw != ly && lw != 0 throw(ArgumentError("length of wts must be either $ly or 0 but was $lw")) @@ -132,7 +132,7 @@ function negbin(F, verbose && println("[ Alternating iteration ", i, ", θ = ", θ, " ]") regmodel = glm(F, D, NegativeBinomial(θ), args...; maxiter=maxiter, atol=atol, rtol=rtol, verbose=verbose, kwargs...)
- μ = regmodel.model.rr.mu + μ = regmodel.rr.mu prevθ = θ θ = mle_for_θ(y, μ, wts; maxiter=maxiter, tol=rtol) δ = prevθ - θ diff --git a/test/runtests.jl b/test/runtests.jl index 447798b2..5f773d0c 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -28,7 +28,7 @@ linreg(x::AbstractVecOrMat, y::AbstractVector) = qr!(simplemm(x)) \ y Σ = [6.136653061224592e-05 -9.464489795918525e-05 -9.464489795918525e-05 1.831836734693908e-04] @test isapprox(vcov(lm1), Σ) - @test isapprox(cor(lm1.model), Diagonal(diag(Σ))^(-1/2)*Σ*Diagonal(diag(Σ))^(-1/2)) + @test isapprox(cor(lm1), Diagonal(diag(Σ))^(-1/2)*Σ*Diagonal(diag(Σ))^(-1/2)) @test dof(lm1) == 3 @test isapprox(deviance(lm1), 0.0002992000000000012) @test isapprox(loglikelihood(lm1), 21.204842144047973) @@ -148,7 +148,7 @@ end ct = coeftable(model) @test dof_residual(model) == 0 @test dof(model) == 4 - @test isinf(GLM.dispersion(model.model)) + @test isinf(GLM.dispersion(model)) @test coef(model) ≈ [1, 1, 2] @test isequal(hcat(ct.cols[2:end]...), [Inf 0.0 1.0 -Inf Inf @@ -159,7 +159,7 @@ end ct = coeftable(model) @test dof_residual(model) == 0 @test dof(model) == 4 - @test isinf(GLM.dispersion(model.model)) + @test isinf(GLM.dispersion(model)) @test coef(model) ≈ [1, 2, 3] @test isequal(hcat(ct.cols[2:end]...), [Inf 0.0 1.0 -Inf Inf @@ -170,7 +170,7 @@ end ct = coeftable(model) @test dof_residual(model) == 0 @test dof(model) == 4 - @test isinf(GLM.dispersion(model.model)) + @test isinf(GLM.dispersion(model)) @test coef(model) ≈ [1, 1, 2] @test isequal(hcat(ct.cols[2:end]...), [Inf 0.0 1.0 -Inf Inf @@ -181,7 +181,7 @@ end ct = coeftable(model) @test dof_residual(model) == 0 @test dof(model) == 4 - @test isinf(GLM.dispersion(model.model)) + @test isinf(GLM.dispersion(model)) @test coef(model) ≈ [1, 2, 3] @test isequal(hcat(ct.cols[2:end]...), [Inf 0.0 1.0 -Inf Inf @@ -194,7 +194,7 @@ end ct = coeftable(model) @test dof_residual(model) == 0 @test dof(model) == 4 - @test isinf(GLM.dispersion(model.model)) + @test isinf(GLM.dispersion(model)) @test coef(model) ≈ [1, 1, 2, 0, 0] @test isequal(hcat(ct.cols[2:end]...), [Inf 0.0 1.0 -Inf Inf @@ -216,7 +216,7 @@ end mdl = lm(@formula(y ~ 0 + x), data) @test coef(mdl) ≈ [2.07438016528926] @test stderror(mdl) ≈ [0.165289256198347E-01] - @test GLM.dispersion(mdl.model) ≈ 3.56753034006338 + @test GLM.dispersion(mdl) ≈ 3.56753034006338 @test dof(mdl) == 2 @test dof_residual(mdl) == 10 @test r2(mdl) ≈ 0.999365492298663 @@ -239,7 +239,7 @@ end mdl = lm(@formula(y ~ 0 + x), data) @test coef(mdl) ≈ [0.727272727272727] @test stderror(mdl) ≈ [0.420827318078432E-01] - @test GLM.dispersion(mdl.model) ≈ 0.369274472937998 + @test GLM.dispersion(mdl) ≈ 0.369274472937998 @test dof(mdl) == 2 @test dof_residual(mdl) == 2 @test r2(mdl) ≈ 0.993348115299335 @@ -261,7 +261,7 @@ end @test coef(mdl1) ≈ coef(mdl2) @test stderror(mdl1) ≈ stderror(mdl2) - @test GLM.dispersion(mdl1.model) ≈ GLM.dispersion(mdl2) + @test GLM.dispersion(mdl1) ≈ GLM.dispersion(mdl2) @test dof(mdl1) ≈ dof(mdl2) @test dof_residual(mdl1) ≈ dof_residual(mdl2) @test r2(mdl1) ≈ r2(mdl2) @@ -282,7 +282,7 @@ dobson = DataFrame(Counts = [18.,17,15,20,10,20,25,13,12], @testset "Poisson GLM" begin gm1 = fit(GeneralizedLinearModel, @formula(Counts ~ 1 + Outcome + Treatment), dobson, Poisson()) - @test GLM.cancancel(gm1.model.rr) + @test GLM.cancancel(gm1.rr) test_show(gm1) @test dof(gm1) == 5 @test isapprox(deviance(gm1), 5.12914107700115, rtol = 1e-7) @@ -300,7 +300,7 @@ admit.rank = categorical(admit.rank) @testset "$distr with LogitLink" for distr in (Binomial, 
Bernoulli) gm2 = fit(GeneralizedLinearModel, @formula(admit ~ 1 + gre + gpa + rank), admit, distr()) - @test GLM.cancancel(gm2.model.rr) + @test GLM.cancancel(gm2.rr) test_show(gm2) @test dof(gm2) == 6 @test deviance(gm2) ≈ 458.5174924758994 @@ -317,7 +317,7 @@ end gm3 = fit(GeneralizedLinearModel, @formula(admit ~ 1 + gre + gpa + rank), admit, Binomial(), ProbitLink()) test_show(gm3) - @test !GLM.cancancel(gm3.model.rr) + @test !GLM.cancancel(gm3.rr) @test dof(gm3) == 6 @test isapprox(deviance(gm3), 458.4131713833386) @test isapprox(loglikelihood(gm3), -229.20658569166932) @@ -332,7 +332,7 @@ end @testset "Bernoulli CauchitLink" begin gm4 = fit(GeneralizedLinearModel, @formula(admit ~ gre + gpa + rank), admit, Binomial(), CauchitLink()) - @test !GLM.cancancel(gm4.model.rr) + @test !GLM.cancancel(gm4.rr) test_show(gm4) @test dof(gm4) == 6 @test isapprox(deviance(gm4), 459.3401112751141) @@ -345,7 +345,7 @@ end @testset "Bernoulli CloglogLink" begin gm5 = fit(GeneralizedLinearModel, @formula(admit ~ gre + gpa + rank), admit, Binomial(), CloglogLink()) - @test !GLM.cancancel(gm5.model.rr) + @test !GLM.cancancel(gm5.rr) test_show(gm5) @test dof(gm5) == 6 @test isapprox(deviance(gm5), 458.89439629612616) @@ -373,7 +373,7 @@ anorexia = CSV.read(joinpath(glm_datadir, "anorexia.csv"), DataFrame) @testset "Offset" begin gm6 = fit(GeneralizedLinearModel, @formula(Postwt ~ 1 + Prewt + Treat), anorexia, Normal(), IdentityLink(), offset=Array{Float64}(anorexia.Prewt)) - @test GLM.cancancel(gm6.model.rr) + @test GLM.cancancel(gm6.rr) test_show(gm6) @test dof(gm6) == 5 @test isapprox(deviance(gm6), 3311.262619919613) @@ -383,7 +383,7 @@ anorexia = CSV.read(joinpath(glm_datadir, "anorexia.csv"), DataFrame) @test isapprox(bic(gm6), 501.35662813730465) @test isapprox(coef(gm6), [49.7711090, -0.5655388, -4.0970655, 4.5630627]) - @test isapprox(GLM.dispersion(gm6.model, true), 48.6950385282296) + @test isapprox(GLM.dispersion(gm6, true), 48.6950385282296) @test isapprox(stderror(gm6), [13.3909581, 0.1611824, 1.8934926, 2.1333359]) end @@ -391,12 +391,12 @@ end @testset "Normal LogLink offset" begin gm7 = fit(GeneralizedLinearModel, @formula(Postwt ~ 1 + Prewt + Treat), anorexia, Normal(), LogLink(), offset=Array{Float64}(anorexia.Prewt), rtol=1e-8) - @test !GLM.cancancel(gm7.model.rr) + @test !GLM.cancancel(gm7.rr) test_show(gm7) @test isapprox(deviance(gm7), 3265.207242977156) @test isapprox(coef(gm7), [3.99232679, -0.99445269, -0.05069826, 0.05149403]) - @test isapprox(GLM.dispersion(gm7.model, true), 48.017753573192266) + @test isapprox(GLM.dispersion(gm7, true), 48.017753573192266) @test isapprox(stderror(gm7), [0.157167944, 0.001886286, 0.022584069, 0.023882826], atol=1e-6) @@ -408,8 +408,8 @@ clotting = DataFrame(u = log.([5,10,15,20,30,40,60,80,100]), @testset "Gamma" begin gm8 = fit(GeneralizedLinearModel, @formula(lot1 ~ 1 + u), clotting, Gamma()) - @test !GLM.cancancel(gm8.model.rr) - @test isa(GLM.Link(gm8.model), InverseLink) + @test !GLM.cancancel(gm8.rr) + @test isa(GLM.Link(gm8), InverseLink) test_show(gm8) @test dof(gm8) == 3 @test isapprox(deviance(gm8), 0.016729715178484157) @@ -418,14 +418,14 @@ clotting = DataFrame(u = log.([5,10,15,20,30,40,60,80,100]), @test isapprox(aicc(gm8), 42.78992394955449) @test isapprox(bic(gm8), 38.58159768156315) @test isapprox(coef(gm8), [-0.01655438172784895,0.01534311491072141]) - @test isapprox(GLM.dispersion(gm8.model, true), 0.002446059333495581, atol=1e-6) + @test isapprox(GLM.dispersion(gm8, true), 0.002446059333495581, atol=1e-6) @test 
isapprox(stderror(gm8), [0.00092754223, 0.000414957683], atol=1e-6) end @testset "InverseGaussian" begin gm8a = fit(GeneralizedLinearModel, @formula(lot1 ~ 1 + u), clotting, InverseGaussian()) - @test !GLM.cancancel(gm8a.model.rr) - @test isa(GLM.Link(gm8a.model), InverseSquareLink) + @test !GLM.cancancel(gm8a.rr) + @test isa(GLM.Link(gm8a), InverseSquareLink) test_show(gm8a) @test dof(gm8a) == 3 @test isapprox(deviance(gm8a), 0.006931128347234519) @@ -434,14 +434,14 @@ end @test isapprox(aicc(gm8a), 66.37485201769974) @test isapprox(bic(gm8a), 62.16652574970839) @test isapprox(coef(gm8a), [-0.0011079770504295668,0.0007219138982289362]) - @test isapprox(GLM.dispersion(gm8a.model, true), 0.0011008719709455776, atol=1e-6) + @test isapprox(GLM.dispersion(gm8a, true), 0.0011008719709455776, atol=1e-6) @test isapprox(stderror(gm8a), [0.0001675339726910311,9.468485015919463e-5], atol=1e-6) end @testset "Gamma LogLink" begin gm9 = fit(GeneralizedLinearModel, @formula(lot1 ~ 1 + u), clotting, Gamma(), LogLink(), rtol=1e-8, atol=0.0) - @test !GLM.cancancel(gm9.model.rr) + @test !GLM.cancancel(gm9.rr) test_show(gm9) @test dof(gm9) == 3 @test deviance(gm9) ≈ 0.16260829451739 @@ -450,14 +450,14 @@ end @test aicc(gm9) ≈ 63.28165620769822 @test bic(gm9) ≈ 59.07332993970688 @test coef(gm9) ≈ [5.50322528458221, -0.60191617825971] - @test GLM.dispersion(gm9.model, true) ≈ 0.02435442293561081 + @test GLM.dispersion(gm9, true) ≈ 0.02435442293561081 @test stderror(gm9) ≈ [0.19030107482720, 0.05530784660144] end @testset "Gamma IdentityLink" begin gm10 = fit(GeneralizedLinearModel, @formula(lot1 ~ 1 + u), clotting, Gamma(), IdentityLink(), rtol=1e-8, atol=0.0) - @test !GLM.cancancel(gm10.model.rr) + @test !GLM.cancancel(gm10.rr) test_show(gm10) @test dof(gm10) == 3 @test isapprox(deviance(gm10), 0.60845414895344) @@ -466,7 +466,7 @@ end @test isapprox(aicc(gm10), 75.23214487456835) @test isapprox(bic(gm10), 71.02381860657701) @test isapprox(coef(gm10), [99.250446880986, -18.374324929002]) - @test isapprox(GLM.dispersion(gm10.model, true), 0.10417373, atol=1e-6) + @test isapprox(GLM.dispersion(gm10, true), 0.10417373, atol=1e-6) @test isapprox(stderror(gm10), [17.864084, 4.297895], atol=1e-4) end @@ -545,7 +545,7 @@ end quine = dataset("MASS", "quine") @testset "NegativeBinomial LogLink Fixed θ" begin gm18 = fit(GeneralizedLinearModel, @formula(Days ~ Eth+Sex+Age+Lrn), quine, NegativeBinomial(2.0), LogLink()) - @test !GLM.cancancel(gm18.model.rr) + @test !GLM.cancancel(gm18.rr) test_show(gm18) @test dof(gm18) == 8 @test isapprox(deviance(gm18), 239.11105911824325, rtol = 1e-7) @@ -561,7 +561,7 @@ end @testset "NegativeBinomial NegativeBinomialLink Fixed θ" begin # the default/canonical link is NegativeBinomialLink gm19 = fit(GeneralizedLinearModel, @formula(Days ~ Eth+Sex+Age+Lrn), quine, NegativeBinomial(2.0)) - @test GLM.cancancel(gm19.model.rr) + @test GLM.cancancel(gm19.rr) test_show(gm19) @test dof(gm19) == 8 @test isapprox(deviance(gm19), 239.68562048977307, rtol = 1e-7) @@ -848,10 +848,10 @@ end d = DataFrame(Treatment=[1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2.], Result=[1.1, 1.2, 1, 2.2, 1.9, 2, .9, 1, 1, 2.2, 2, 2], Other=categorical([1, 1, 2, 1, 2, 1, 3, 1, 1, 2, 2, 1])) - mod = lm(@formula(Result~Treatment), d).model - othermod = lm(@formula(Result~Other), d).model - nullmod = lm(@formula(Result~1), d).model - bothmod = lm(@formula(Result~Other+Treatment), d).model + mod = lm(@formula(Result~Treatment), d) + othermod = lm(@formula(Result~Other), d) + nullmod = lm(@formula(Result~1), d) + bothmod = 
lm(@formula(Result~Other+Treatment), d) nointerceptmod = lm(reshape(d.Treatment, :, 1), d.Result) ft1 = ftest(mod) @@ -903,10 +903,10 @@ end d = DataFrame(Treatment=[1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2.], Result=[1.1, 1.2, 1, 2.2, 1.9, 2, .9, 1, 1, 2.2, 2, 2], Other=categorical([1, 1, 2, 1, 2, 1, 3, 1, 1, 2, 2, 1])) - mod = lm(@formula(Result~Treatment), d).model - othermod = lm(@formula(Result~Other), d).model - nullmod = lm(@formula(Result~1), d).model - bothmod = lm(@formula(Result~Other+Treatment), d).model + mod = lm(@formula(Result~Treatment), d) + othermod = lm(@formula(Result~Other), d) + nullmod = lm(@formula(Result~1), d) + bothmod = lm(@formula(Result~Other+Treatment), d) @test StatsModels.isnested(nullmod, mod) @test !StatsModels.isnested(othermod, mod) @test StatsModels.isnested(mod, bothmod) @@ -914,7 +914,7 @@ end @test StatsModels.isnested(othermod, bothmod) d.Sum = d.Treatment + (d.Other .== 1) - summod = lm(@formula(Result~Sum), d).model + summod = lm(@formula(Result~Sum), d) @test StatsModels.isnested(summod, bothmod) ft1a = ftest(mod, nullmod) @@ -963,7 +963,7 @@ end ─────────────────────────────────────────────────────────────────""" end - bigmod = lm(@formula(Result~Treatment+Other), d).model + bigmod = lm(@formula(Result~Treatment+Other), d) ft2a = ftest(nullmod, mod, bigmod) @test isnan(ft2a.pval[1]) @test ft2a.pval[2] ≈ 2.481215056713184e-8 @@ -1019,7 +1019,7 @@ end @test_throws ArgumentError ftest(mod, bigmod, nullmod) @test_throws ArgumentError ftest(nullmod, bigmod, mod) @test_throws ArgumentError ftest(bigmod, nullmod, mod) - mod2 = lm(@formula(Result~Treatment), d[2:end, :]).model + mod2 = lm(@formula(Result~Treatment), d[2:end, :]) @test_throws ArgumentError ftest(mod, mod2) end @@ -1062,7 +1062,7 @@ end @test hcat(t.cols[5:6]...) == confint(lm1) # TODO: call coeftable(gm1, ...) directly once DataFrameRegressionModel # supports keyword arguments - t = coeftable(lm1.model, level=0.99) + t = coeftable(lm1, level=0.99) @test hcat(t.cols[5:6]...) == confint(lm1, level=0.99) gm1 = fit(GeneralizedLinearModel, @formula(Counts ~ 1 + Outcome + Treatment), @@ -1073,9 +1073,7 @@ end @test t.cols[4] ≈ [5.4267674619082684e-71, 0.024647114627808674, 0.12848651178787643, 0.9999999999999981, 0.9999999999999999] @test hcat(t.cols[5:6]...) == confint(gm1) - # TODO: call coeftable(gm1, ...) directly once DataFrameRegressionModel - # supports keyword arguments - t = coeftable(gm1.model, level=0.99) + t = coeftable(gm1, level=0.99) @test hcat(t.cols[5:6]...) 
== confint(gm1, level=0.99) end @@ -1155,10 +1153,10 @@ end Result=[1.1, 1.2, 1, 2.2, 1.9, 2, .9, 1, 1, 2.2, 2, 2], Other=categorical([1, 1, 2, 1, 2, 1, 3, 1, 1, 2, 2, 1])) - mod = lm(@formula(Result~Treatment), d).model + mod = lm(@formula(Result~Treatment), d) @test hasintercept(mod) - nullmod = lm(@formula(Result~1), d).model + nullmod = lm(@formula(Result~1), d) @test hasintercept(nullmod) nointerceptmod = lm(reshape(d.Treatment, :, 1), d.Result) @@ -1265,7 +1263,7 @@ end @test coef(mdl) ≈ [-0.05132238692134761, 0.01428684676273272, 0.15033126098228242] @test stderror(mdl) ≈ [0.224095414423756, 0.003342439119757, 0.005838227761632] atol=1.0e-8 @test dof(mdl) == 4 - @test GLM.dispersion(mdl.model, true) ≈ 6.577062388609384 + @test GLM.dispersion(mdl, true) ≈ 6.577062388609384 @test loglikelihood(mdl) ≈ -71.60507986987612 @test deviance(mdl) ≈ 184.15774688106 @test aic(mdl) ≈ 151.21015973975 @@ -1278,7 +1276,7 @@ end @test stderror(mdl1) ≈ stderror(mdl2) @test dof(mdl1) == dof(mdl2) @test dof_residual(mdl1) == dof_residual(mdl2) - @test GLM.dispersion(mdl1.model, true) ≈ GLM.dispersion(mdl2.model,true) + @test GLM.dispersion(mdl1, true) ≈ GLM.dispersion(mdl2,true) @test deviance(mdl1) ≈ deviance(mdl2) @test loglikelihood(mdl1) ≈ loglikelihood(mdl2) @test confint(mdl1) ≈ confint(mdl2) @@ -1292,7 +1290,7 @@ end @test stderror(mdl1) ≈ stderror(mdl2) @test dof(mdl1) == dof(mdl2) @test dof_residual(mdl1) == dof_residual(mdl2) - @test GLM.dispersion(mdl1.model, true) ≈ GLM.dispersion(mdl2.model,true) + @test GLM.dispersion(mdl1, true) ≈ GLM.dispersion(mdl2,true) @test deviance(mdl1) ≈ deviance(mdl2) @test loglikelihood(mdl1) ≈ loglikelihood(mdl2) @test confint(mdl1) ≈ confint(mdl2) @@ -1308,7 +1306,7 @@ end @test dof_residual(mdl1) == dof_residual(mdl2) @test deviance(mdl1) ≈ deviance(mdl2) @test loglikelihood(mdl1) ≈ loglikelihood(mdl2) - @test GLM.dispersion(mdl1.model, true) ≈ GLM.dispersion(mdl2.model,true) + @test GLM.dispersion(mdl1, true) ≈ GLM.dispersion(mdl2,true) @test confint(mdl1) ≈ confint(mdl2) @test aic(mdl1) ≈ aic(mdl2) @test predict(mdl1) ≈ predict(mdl2) From d3f94303f46908b22f514ec7acb719b51787e940 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Sun, 19 Jun 2022 18:14:39 +0200 Subject: [PATCH 02/14] Cleanup --- src/GLM.jl | 46 -------------------- src/glmfit.jl | 64 ++++++++++++++++++++-------- src/linpred.jl | 45 +++++++++++++++++++- src/lm.jl | 71 +++++++++++++++++-------------- test/runtests.jl | 108 +++++++++++++++++++++++++++++++++++++++++++++-- 5 files changed, 233 insertions(+), 101 deletions(-) diff --git a/src/GLM.jl b/src/GLM.jl index 83fcd570..019f80e3 100644 --- a/src/GLM.jl +++ b/src/GLM.jl @@ -89,52 +89,6 @@ module GLM pivoted_cholesky!(A; kwargs...) = cholesky!(A, RowMaximum(); kwargs...) 
end - - ## TODO: define these in StatsModels - using Tables - - function StatsModels.modelmatrix(f::FormulaTerm, data, contrasts; - model::Type{M}=StatisticalModel) where M - Tables.istable(data) || - throw(ArgumentError("expected data in a Table, got $(typeof(data))")) - data, _ = StatsModels.missing_omit(Tables.columntable(data), f) - sch = schema(f, data, contrasts) - f = apply_schema(f, sch, M) - f, modelcols(f, data) - end - - formula(x::StatisticalModel) = - throw(ArgumentError("formula not implemented for $(nameof(typeof(x)))")) - - StatsBase.coefnames(x::StatisticalModel) = coefnames(formula(x).rhs) - - get_type(::ContinuousTerm{T}) where {T} = T - - function StatsBase.predict(mm::StatisticalModel, data; - interval::Union{Symbol,Nothing}=nothing, level::Real=0.95, - kwargs...) - Tables.istable(data) || - throw(ArgumentError("expected data in a Table, got $(typeof(data))")) - - f = formula(mm) - cols, nonmissings = StatsModels.missing_omit(columntable(data), f.rhs) - new_x = modelcols(f.rhs, cols) - nr = size(new_x, 1) - y_pred = Tables.allocatecolumn(Union{get_type(f.lhs), Missing}, nr) - fill!(y_pred, missing) - if interval === nothing - predict!(view(y_pred, nonmissings), mm, reshape(new_x, nr, :); kwargs...) - return y_pred - else - lower = Vector{Union{Float64, Missing}}(missing, nr) - upper = Vector{Union{Float64, Missing}}(missing, nr) - tup = (prediction=view(y_pred, nonmissings), - lower=view(lower, nonmissings), - upper=view(upper, nonmissings)) - predict!(tup, mm, reshape(new_x, nr, :); kwargs...) - return (prediction=y_pred, lower=lower, upper=upper) - end - end - include("linpred.jl") include("lm.jl") include("glmtools.jl") diff --git a/src/glmfit.jl b/src/glmfit.jl index 756f6510..c363474a 100644 --- a/src/glmfit.jl +++ b/src/glmfit.jl @@ -512,7 +512,7 @@ function fit(::Type{M}, dofit::Bool = true, contrasts::AbstractDict{Symbol}=Dict{Symbol,Any}(), fitargs...) where {M<:AbstractGLM} - f, (y, X) = StatsModels.modelmatrix(f, data, contrasts, model=M) + f, (y, X) = modelframe(f, data, contrasts) # Check that X and y have the same number of observations if size(X, 1) != size(y, 1) @@ -568,13 +568,12 @@ function dispersion(m::AbstractGLM, sqr::Bool=false) end end +const PREDICT_COMMON = """ - predict(mm::AbstractGLM, newX::AbstractMatrix; offset::FPVector=eltype(newX)[], - interval::Union{Symbol,Nothing}=nothing, level::Real = 0.95, - interval_method::Symbol = :transformation) - -Return the predicted response of model `mm` from covariate values `newX` and, -optionally, an `offset`. +`newX` must be either a table (in the [Tables.jl](https://tables.juliadata.org/stable/) definition) containing all columns used in the model formula, or a matrix with one column for each predictor in the model. In both cases, each row represents an observation for which a prediction will be returned. If `interval=:confidence`, also return upper and lower bounds for a given coverage `level`. By default (`interval_method = :transformation`) the intervals are constructed by applying the inverse link to intervals for the linear predictor. If `interval_method = :delta`, the intervals are constructed by the delta method, i.e., by linearization of the predicted response around the linear predictor. The `:delta` method intervals are symmetric around the point estimates, but do not respect natural parameter constraints (e.g., the lower bound for a probability could be negative). """ + +""" + predict(mm::AbstractGLM, newX; + offset::FPVector=[], + interval::Union{Symbol,Nothing}=nothing, level::Real = 0.95, + interval_method::Symbol = :transformation) + +Return the predicted response of model `mm` from covariate values `newX` and, +optionally, an `offset`.
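+
+For example, a minimal illustrative call (the data here are made up and the
+output is omitted; this is a sketch rather than a doctest):
+
+```julia
+gm = glm(@formula(y ~ x), (y=[0, 1, 0, 1], x=[1.0, 2.0, 3.0, 4.0]), Binomial())
+predict(gm, (x=[1.5, 3.5],); interval=:confidence)
+```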
+ +$PREDICT_COMMON +""" function predict(mm::AbstractGLM, newX::AbstractMatrix; offset::FPVector=eltype(newX)[], interval::Union{Symbol,Nothing}=nothing, level::Real=0.95, - interval_method=:transformation, - skipmissing::Bool=false) + interval_method=:transformation) r = response(mm) len = size(newX, 1) res = interval === nothing ? @@ -600,15 +610,27 @@ function predict(mm::AbstractGLM, newX::AbstractMatrix; interval_method=interval_method) end -# TODO: add docstring -function StatsModels.predict!(res::Union{AbstractVector, - NamedTuple{(:prediction, :lower, :upper), - <: NTuple{3, AbstractVector}}}, - mm::AbstractGLM, newX::AbstractMatrix; - offset::FPVector=eltype(newX)[], - interval::Union{Symbol,Nothing}=nothing, - level::Real=0.95, - interval_method=:transformation) +""" + predict!(res, mm::AbstractGLM, newX::AbstractMatrix; + offset::FPVector=eltype(newX)[], + interval::Union{Symbol,Nothing}=nothing, level::Real = 0.95, + interval_method::Symbol = :transformation) + +Store in `res` the predicted response of model `mm` from covariate values `newX` +and, optionally, an `offset`. `res` must be a vector with a length equal to the number +of rows in `newX` if `interval=nothing` (the default), and otherwise a `NamedTuple` +of vectors with names `prediction`, `lower` and `upper`. + +$PREDICT_COMMON +""" +function predict!(res::Union{AbstractVector, + NamedTuple{(:prediction, :lower, :upper), + <: NTuple{3, AbstractVector}}}, + mm::AbstractGLM, newX::AbstractMatrix; + offset::FPVector=eltype(newX)[], + interval::Union{Symbol,Nothing}=nothing, + level::Real=0.95, + interval_method=:transformation) eta = newX * coef(mm) if !isempty(mm.rr.offset) length(offset) == size(newX, 1) || @@ -619,11 +641,17 @@ function StatsModels.predict!(res::Union{AbstractVector, end if interval === nothing + res isa AbstractVector || + throw(ArgumentError("`res` must be a vector when `interval == nothing` or is omitted")) + length(res) == size(newX, 1) || + throw(DimensionMismatch("length of `res` must equal the number of rows in `newX`")) res .= linkinv.(Link(mm), eta) elseif interval == :confidence res isa NamedTuple || throw(ArgumentError("`res` must be a `NamedTuple` when `interval == :confidence`")) mu, lower, upper = res + length(mu) == length(lower) == length(upper) == size(newX, 1) || + throw(DimensionMismatch("length of vectors in `res` must equal the number of rows in `newX`")) mu .= linkinv.(Link(mm), eta) normalquantile = quantile(Normal(), (1 + level)/2) # Compute confidence intervals in two steps diff --git a/src/linpred.jl b/src/linpred.jl index 1d45c706..ab161f93 100644 --- a/src/linpred.jl +++ b/src/linpred.jl @@ -234,13 +234,21 @@ function show(io::IO, obj::LinPredModel) println(io, "$(typeof(obj)):\n\nCoefficients:\n", coeftable(obj)) end -modelframe(obj::LinPredModel) = obj.fr +function modelframe(f::FormulaTerm, data, contrasts::AbstractDict) + Tables.istable(data) || + throw(ArgumentError("expected data in a Table, got $(typeof(data))")) + data, _ = StatsModels.missing_omit(Tables.columntable(data), f) + sch = schema(f, data, contrasts) + f = apply_schema(f, sch, LinPredModel) + f, modelcols(f, data) +end + modelmatrix(obj::LinPredModel) = obj.pp.X response(obj::LinPredModel) = obj.rr.y fitted(m::LinPredModel) = m.rr.mu predict(mm::LinPredModel) = fitted(mm) -StatsModels.formula(obj::LinPredModel) = obj.formula +formula(obj::LinPredModel) = obj.formula residuals(obj::LinPredModel) = residuals(obj.rr) """ @@ -260,7 +268,40 @@ end coef(x::LinPred) = x.beta0 coef(obj::LinPredModel) = 
coef(obj.pp) +coefnames(x::LinPredModel) = coefnames(formula(x).rhs) dof_residual(obj::LinPredModel) = nobs(obj) - dof(obj) + 1 hasintercept(m::LinPredModel) = any(i -> all(==(1), view(m.pp.X , :, i)), 1:size(m.pp.X, 2)) + +_coltype(::ContinuousTerm{T}) where {T} = T + +# Function common to all LinPred models, but documented separately +# for LinearModel and GeneralizedLinearModel +function StatsBase.predict(mm::LinPredModel, data; + interval::Union{Symbol,Nothing}=nothing, level::Real=0.95, + kwargs...) + Tables.istable(data) || + throw(ArgumentError("expected data in a Table, got $(typeof(data))")) + + f = formula(mm) + t = columntable(data) + cols, nonmissings = StatsModels.missing_omit(t, f.rhs) + newx = modelcols(f.rhs, cols) + prediction = Tables.allocatecolumn(Union{_coltype(f.lhs), Missing}, Tables.rowcount(t)) + fill!(prediction, missing) + if interval === nothing + predict!(view(prediction, nonmissings), mm, newx; kwargs...) + return prediction + else + # Finding integer indices once is faster + nonmissinginds = findall(nonmissings) + lower = Vector{Union{Float64, Missing}}(missing, length(nonmissings)) + upper = Vector{Union{Float64, Missing}}(missing, length(nonmissings)) + tup = (prediction=view(prediction, nonmissinginds), + lower=view(lower, nonmissinginds), + upper=view(upper, nonmissinginds)) + predict!(tup, mm, newx; kwargs...) + return (prediction=prediction, lower=lower, upper=upper) + end +end \ No newline at end of file diff --git a/src/lm.jl b/src/lm.jl index edeb2200..c45a37db 100644 --- a/src/lm.jl +++ b/src/lm.jl @@ -152,7 +152,7 @@ function fit(::Type{LinearModel}, f::FormulaTerm, data, wts::Union{AbstractVector{<:Real}, Nothing}=nothing, dropcollinear::Bool=true, contrasts::AbstractDict{Symbol}=Dict{Symbol,Any}()) - f, (y, X) = StatsModels.modelmatrix(f, data, contrasts, model=LinearModel) + f, (y, X) = modelframe(f, data, contrasts) wts === nothing && (wts = similar(y, 0)) fit!(LinearModel(LmResp(y, wts), cholpred(X, dropcollinear), f)) end """ @@ -277,50 +277,57 @@ function predict(mm::LinearModel, newx::AbstractMatrix; return res end -StatsModels.predict!(y::AbstractVector, mm::LinearModel, newx::AbstractMatrix) = - y .= newx * coef(mm) - -function StatsModels.predict!(res::NamedTuple{(:prediction, :lower, :upper), - <:NTuple{3, AbstractVector}}, +function StatsModels.predict!(res::Union{AbstractVector, + NamedTuple{(:prediction, :lower, :upper), + <:NTuple{3, AbstractVector}}}, mm::LinearModel, newx::AbstractMatrix; - interval::Symbol, level::Real=0.95) - res.prediction .= newx * coef(mm) + interval::Union{Symbol, Nothing}=nothing, + level::Real=0.95) if interval === :confint Base.depwarn("interval=:confint is deprecated in favor of interval=:confidence", :predict) interval = :confidence end if interval === nothing - return retmean + res isa AbstractVector || + throw(ArgumentError("`res` must be a vector when `interval == nothing` or is omitted")) + length(res) == size(newx, 1) || + throw(DimensionMismatch("length of `res` must equal the number of rows in `newx`")) + res .= newx * coef(mm) elseif mm.pp.chol isa CholeskyPivoted && mm.pp.chol.rank < size(mm.pp.chol, 2) throw(ArgumentError("prediction intervals are currently not implemented " * "when some independent variables have been dropped " * "from the model due to collinearity")) - end - res isa NamedTuple || - throw(ArgumentError("`res` must be a `NamedTuple` when `interval` is " * - "`:confidence` or `:prediction`")) - length(mm.rr.wts) == 0 || error("prediction with confidence intervals not yet implemented for weighted regression") - chol = cholesky!(mm.pp) - # get the R matrix from the QR factorization - if chol isa CholeskyPivoted - ip = invperm(chol.p) - R = chol.U[ip, ip] else - R = chol.U - end - dev = deviance(mm) - dofr = dof_residual(mm) - residvar = fill(dev/dofr, size(newx, 2), 1) - ret = dropdims((newx/R).^2 * residvar, dims=2) - if interval == :prediction - ret .+= dev/dofr - elseif interval != :confidence - error("only :confidence and :prediction intervals are defined") + else + res isa NamedTuple || + throw(ArgumentError("`res` must be a `NamedTuple` when `interval` is " * + "`:confidence` or `:prediction`")) + prediction, lower, upper = res + length(prediction) == length(lower) == length(upper) == size(newx, 1) || + throw(DimensionMismatch("length of vectors in `res` must equal the number of rows in `newx`")) + length(mm.rr.wts) == 0 || error("prediction with confidence intervals not yet implemented for weighted regression") + chol = cholesky!(mm.pp) + # get the R matrix from the QR factorization + if chol isa CholeskyPivoted + ip = invperm(chol.p) + R = chol.U[ip, ip] + else + R = chol.U + end + dev = deviance(mm) + dofr = dof_residual(mm) + residvar = fill(dev/dofr, size(newx, 2), 1) + ret = dropdims((newx/R).^2 * residvar, dims=2) + if interval == :prediction + ret .+= dev/dofr + elseif interval != :confidence + error("only :confidence and :prediction intervals are defined") + end + ret .= quantile(TDist(dofr), (1 - level)/2) .* sqrt.(ret) + prediction .= newx * coef(mm) + lower .= prediction .+ ret + upper .= prediction .- ret end return res end
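A quick sketch of how the reworked `predict`/`predict!` pair above is meant to be used (the numbers are illustrative only; `predict!` reaches user code through the StatsModels/StatsBase reexport chain, so qualify it as `StatsModels.predict!` if it is not in scope):

```julia
using GLM

X = [ones(6) collect(1.0:6.0)]
y = [1.1, 2.1, 2.9, 4.2, 4.8, 6.1]
m = lm(X, y)

newX = [ones(2) [7.0, 8.0]]
# Point predictions are written into a preallocated vector:
pred = predict!(similar(y, 2), m, newX)

# With an interval, `res` must be a NamedTuple of three vectors,
# which are filled in place:
res = (prediction=similar(y, 2), lower=similar(y, 2), upper=similar(y, 2))
predict!(res, m, newX, interval=:prediction)
```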
diff --git a/test/runtests.jl b/test/runtests.jl index 5f773d0c..fa758efe 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -687,6 +687,68 @@ end @testset "Predict" begin + # Linear model + rng = StableRNG(123) + X = rand(rng, 10, 2) + Y = rand(rng, 10) + + lmpred = fit(LinearModel, X, Y) + @test predict(lmpred) == fitted(lmpred) + + newX = rand(rng, 5, 2) + newY = newX * coef(lmpred) + lmpred_pred1 = predict(lmpred, newX) + lmpred_pred2 = predict(lmpred, newX; interval=:confidence) + lmpred_pred3 = predict(lmpred, newX; interval=:prediction) + @test lmpred_pred1 == lmpred_pred2.prediction == lmpred_pred3.prediction ≈ newY + @test lmpred_pred2.upper ≈ + [0.6170432517234414, 0.6857915349758823, 0.8644361267055548, + 0.2510551586658352, 0.6280144618607879] + @test lmpred_pred2.lower ≈ + [0.32609178249933063, 0.41055748807994336, 0.5523913320342061, + 0.14588615084888942, 0.2619696605732852] + @test lmpred_pred3.upper ≈ + [0.8217622514968357, 0.8951782691056336, 1.0631194540216677, + 0.5213302104184558, 0.8123751878951413] + @test lmpred_pred3.lower ≈ + [0.12137278272593627, 0.20117075395019213, 0.35370800471809305, + -0.12438890090373123, 0.07760893453893175] + + @test ndims(lmpred_pred1) == 1 + + @test ndims(lmpred_pred2.prediction) == 1 + @test ndims(lmpred_pred2.upper) == 1 + @test ndims(lmpred_pred2.lower) == 1 + + @test ndims(lmpred_pred3.prediction) == 1 + @test ndims(lmpred_pred3.upper) == 1 + @test ndims(lmpred_pred3.lower) == 1 + + @test predict!(similar(Y, size(newX, 1)), lmpred, newX) == predict(lmpred, newX) + @test predict!((prediction=similar(Y, size(newX, 1)), + lower=similar(Y, size(newX, 1)), + upper=similar(Y, size(newX, 1))), + lmpred, newX, interval=:confidence) == + predict(lmpred, newX, interval=:confidence) + @test predict!((prediction=similar(Y, size(newX, 1)), + lower=similar(Y, size(newX, 1)),
upper=similar(Y, size(newX, 1))), + lmpred, newX, interval=:prediction) == + predict(lmpred, newX, interval=:prediction) + @test_throws ArgumentError predict!((prediction=similar(Y, size(newX, 1)), + lower=similar(Y, size(newX, 1)), + upper=similar(Y, size(newX, 1))), lmpred, newX) + @test_throws ArgumentError predict!(similar(Y, size(newX, 1)), lmpred, newX, + interval=:confidence) + @test_throws ArgumentError predict!(similar(Y, size(newX, 1)), lmpred, newX, + interval=:prediction) + @test_throws DimensionMismatch predict!([1], lmpred, newX) + @test_throws DimensionMismatch predict!((prediction=similar(Y, size(newX, 1)), + lower=similar(Y, size(newX, 1)), + upper=[1]), + lmpred, newX, interval=:confidence) + + # Binomial GLM with perfect fit rng = StableRNG(123) X = rand(rng, 10, 2) Y = logistic.(X * [3; -3]) @@ -718,6 +780,28 @@ end @test ndims(gm11_pred3.upper) == 1 @test ndims(gm11_pred3.lower) == 1 + @test predict!(similar(Y, size(newX, 1)), gm11, newX) == predict(gm11, newX) + @test predict!((prediction=similar(Y, size(newX, 1)), + lower=similar(Y, size(newX, 1)), + upper=similar(Y, size(newX, 1))), + gm11, newX, interval=:confidence, interval_method=:delta) == + predict(gm11, newX, interval=:confidence, interval_method=:delta) + @test predict!((prediction=similar(Y, size(newX, 1)), + lower=similar(Y, size(newX, 1)), + upper=similar(Y, size(newX, 1))), + gm11, newX, interval=:confidence, interval_method=:transformation) == + predict(gm11, newX, interval=:confidence, interval_method=:transformation) + @test_throws ArgumentError predict!((prediction=similar(Y, size(newX, 1)), + lower=similar(Y, size(newX, 1)), + upper=similar(Y, size(newX, 1))), gm11, newX) + @test_throws ArgumentError predict!(similar(Y, size(newX, 1)), gm11, newX, + interval=:confidence) + @test_throws DimensionMismatch predict!([1], gm11, newX) + @test_throws DimensionMismatch predict!((prediction=similar(Y, size(newX, 1)), + lower=similar(Y, size(newX, 1)), + upper=[1]), + gm11, newX, interval=:confidence) + off = rand(rng, 10) newoff = rand(rng, 5) @@ -733,12 +817,30 @@ end d.y = Y gm13 = fit(GeneralizedLinearModel, @formula(y ~ 0 + x1 + x2), d, Binomial()) - @test predict(gm13) ≈ predict(gm13, d[:,[:x1, :x2]]) - @test predict(gm13) ≈ predict(gm13, d) + @test predict(gm13) ≈ predict(gm13, d[:,[:x1, :x2]]) == predict(gm13, X) + @test predict(gm13) ≈ predict(gm13, d) == predict(gm13, X) newd = DataFrame(newX, :auto) - predict(gm13, newd) + @test predict(gm13, newd) == predict(gm13, newX) + + + # Prediction from DataFrames with missing values + drep = d[[1, 2, 3, 3, 4, 5, 6, 7, 8, 8, 9, 10], :] + dm = allowmissing(drep) + dm.x1[3] = missing + dm.y[9] = missing + + gm13m = fit(GeneralizedLinearModel, @formula(y ~ 0 + x1 + x2), dm, Binomial()) + @test predict(gm13m) == predict(gm13) + @test predict(gm13m, d) == predict(gm13, d) + @test isequal(predict(gm13m, dm), predict(gm13, dm)) + expected = allowmissing(predict(gm13m, drep)) + expected[3] = missing + @test isequal(predict(gm13m, dm), expected) + @test isequal(predict(gm13, dm), expected) + + # Linear Model Ylm = X * [0.8, 1.6] + 0.8randn(rng, 10) mm = fit(LinearModel, X, Ylm) pred1 = predict(mm, newX) From b670a148d6a26d281a74044c33a00bbc3cfca39f Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Sun, 19 Jun 2022 19:33:20 +0200 Subject: [PATCH 03/14] Reuse existing test instead of adding another one --- test/runtests.jl | 91 +++++++++++++++--------------------------------- 1 file changed, 28 insertions(+), 63 deletions(-) diff --git a/test/runtests.jl 
b/test/runtests.jl index fa758efe..38c1ae76 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -687,68 +687,7 @@ end end @testset "Predict" begin - # Linear model - rng = StableRNG(123) - X = rand(rng, 10, 2) - Y = rand(rng, 10) - - lmpred = fit(LinearModel, X, Y) - @test predict(lmpred) == fitted(lmpred) - - newX = rand(rng, 5, 2) - newY = newX * coef(lmpred) - lmpred_pred1 = predict(lmpred, newX) - lmpred_pred2 = predict(lmpred, newX; interval=:confidence) - lmpred_pred3 = predict(lmpred, newX; interval=:prediction) - @test lmpred_pred1 == lmpred_pred2.prediction == lmpred_pred3.prediction ≈ newY - @test lmpred_pred2.upper ≈ - [0.6170432517234414, 0.6857915349758823, 0.8644361267055548, - 0.2510551586658352, 0.6280144618607879] - @test lmpred_pred2.lower ≈ - [0.32609178249933063, 0.41055748807994336, 0.5523913320342061, - 0.14588615084888942, 0.2619696605732852] - @test lmpred_pred3.upper ≈ - [0.8217622514968357, 0.8951782691056336, 1.0631194540216677, - 0.5213302104184558, 0.8123751878951413] - @test lmpred_pred3.lower ≈ - [0.12137278272593627, 0.20117075395019213, 0.35370800471809305, - -0.12438890090373123, 0.07760893453893175] - - @test ndims(lmpred_pred1) == 1 - - @test ndims(lmpred_pred2.prediction) == 1 - @test ndims(lmpred_pred2.upper) == 1 - @test ndims(lmpred_pred2.lower) == 1 - - @test ndims(lmpred_pred3.prediction) == 1 - @test ndims(lmpred_pred3.upper) == 1 - @test ndims(lmpred_pred3.lower) == 1 - - @test predict!(similar(Y, size(newX, 1)), lmpred, newX) == predict(lmpred, newX) - @test predict!((prediction=similar(Y, size(newX, 1)), - lower=similar(Y, size(newX, 1)), - upper=similar(Y, size(newX, 1))), - lmpred, newX, interval=:confidence) == - predict(lmpred, newX, interval=:confidence) - @test predict!((prediction=similar(Y, size(newX, 1)), - lower=similar(Y, size(newX, 1)), - upper=similar(Y, size(newX, 1))), - lmpred, newX, interval=:prediction) == - predict(lmpred, newX, interval=:prediction) - @test_throws ArgumentError predict!((prediction=similar(Y, size(newX, 1)), - lower=similar(Y, size(newX, 1)), - upper=similar(Y, size(newX, 1))), lmpred, newX) - @test_throws ArgumentError predict!(similar(Y, size(newX, 1)), lmpred, newX, - interval=:confidence) - @test_throws ArgumentError predict!(similar(Y, size(newX, 1)), lmpred, newX, - interval=:prediction) - @test_throws DimensionMismatch predict!([1], lmpred, newX) - @test_throws DimensionMismatch predict!((prediction=similar(Y, size(newX, 1)), - lower=similar(Y, size(newX, 1)), - upper=[1]), - lmpred, newX, interval=:confidence) - - # Binomial GLM with perfect fit + # Binomial GLM rng = StableRNG(123) X = rand(rng, 10, 2) Y = logistic.(X * [3; -3]) @@ -756,7 +695,7 @@ end gm11 = fit(GeneralizedLinearModel, X, Y, Binomial()) @test isapprox(predict(gm11), Y) @test predict(gm11) == fitted(gm11) - + newX = rand(rng, 5, 2) newY = logistic.(newX * coef(gm11)) gm11_pred1 = predict(gm11, newX) @@ -812,6 +751,7 @@ end @test isapprox(predict(gm12, newX, offset=newoff), logistic.(newX * coef(gm12) .+ newoff)) + # Prediction from DataFrames d = DataFrame(X, :auto) d.y = Y @@ -868,6 +808,31 @@ end @test pred3.upper ≈ pred3.prediction + quantile(TDist(dof_residual(mm)), 0.975)*sqrt.(diag(newX*vcov(mm)*newX') .+ deviance(mm)/dof_residual(mm)) ≈ [3.9288331595737196, 4.077092463922373, 4.762903743958081, 3.82184595169028, 4.034521019386702] + @test predict!(similar(Y, size(newX, 1)), mm, newX) == predict(mm, newX) + @test predict!((prediction=similar(Y, size(newX, 1)), + lower=similar(Y, size(newX, 1)), + upper=similar(Y, 
size(newX, 1))), + mm, newX, interval=:confidence) == + predict(mm, newX, interval=:confidence) + @test predict!((prediction=similar(Y, size(newX, 1)), + lower=similar(Y, size(newX, 1)), + upper=similar(Y, size(newX, 1))), + mm, newX, interval=:prediction) == + predict(mm, newX, interval=:prediction) + @test_throws ArgumentError predict!((prediction=similar(Y, size(newX, 1)), + lower=similar(Y, size(newX, 1)), + upper=similar(Y, size(newX, 1))), mm, newX) + @test_throws ArgumentError predict!(similar(Y, size(newX, 1)), mm, newX, + interval=:confidence) + @test_throws ArgumentError predict!(similar(Y, size(newX, 1)), mm, newX, + interval=:prediction) + @test_throws DimensionMismatch predict!([1], mm, newX) + @test_throws DimensionMismatch predict!((prediction=similar(Y, size(newX, 1)), + lower=similar(Y, size(newX, 1)), + upper=[1]), + mm, newX, interval=:confidence) + + # Prediction with dropcollinear (#409) x = [1.0 1.0 1.0 2.0 From a83cb1b6aeb52ba2938334e924068e4896844e2e Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Sun, 19 Jun 2022 19:36:24 +0200 Subject: [PATCH 04/14] Tables.jl fixes --- Project.toml | 1 + src/GLM.jl | 1 + src/linpred.jl | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 27eace0c..b529670a 100644 --- a/Project.toml +++ b/Project.toml @@ -28,6 +28,7 @@ StatsAPI = "1.4" StatsBase = "0.33.5" StatsFuns = "0.6, 0.7, 0.8, 0.9, 1.0" StatsModels = "0.6.23" +Tables = "1" julia = "1" [extras] diff --git a/src/GLM.jl b/src/GLM.jl index 019f80e3..83d33bea 100644 --- a/src/GLM.jl +++ b/src/GLM.jl @@ -16,6 +16,7 @@ module GLM import StatsFuns: xlogy import SpecialFunctions: erfc, erfcinv, digamma, trigamma import StatsModels: hasintercept + import Tables export coef, coeftable, confint, deviance, nulldeviance, dof, dof_residual, loglikelihood, nullloglikelihood, nobs, stderror, vcov, residuals, predict, fitted, fit, fit!, model_response, response, modelmatrix, r2, r², adjr2, adjr², diff --git a/src/linpred.jl b/src/linpred.jl index ab161f93..54abd3df 100644 --- a/src/linpred.jl +++ b/src/linpred.jl @@ -288,7 +288,7 @@ function StatsBase.predict(mm::LinPredModel, data; t = columntable(data) cols, nonmissings = StatsModels.missing_omit(t, f.rhs) newx = modelcols(f.rhs, cols) - prediction = Tables.allocatecolumn(Union{_coltype(f.lhs), Missing}, Tables.rowcount(t)) + prediction = Tables.allocatecolumn(Union{_coltype(f.lhs), Missing}, length(nonmissings)) fill!(prediction, missing) if interval === nothing predict!(view(prediction, nonmissings), mm, newx; kwargs...) From bd00e5106b649c337cbb5a395199595470e9c1e3 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Sun, 19 Jun 2022 19:53:33 +0200 Subject: [PATCH 05/14] Fixes --- src/GLM.jl | 5 +++-- src/deprecated.jl | 13 +++++++------ src/glmfit.jl | 2 -- src/linpred.jl | 4 ++-- src/lm.jl | 2 -- test/runtests.jl | 4 ++++ 6 files changed, 16 insertions(+), 14 deletions(-) diff --git a/src/GLM.jl b/src/GLM.jl index 83d33bea..d806fe93 100644 --- a/src/GLM.jl +++ b/src/GLM.jl @@ -10,8 +10,9 @@ module GLM import Base: (\), convert, show, size import LinearAlgebra: cholesky, cholesky! 
import Statistics: cor - import StatsBase: coef, coeftable, confint, deviance, nulldeviance, dof, dof_residual, - loglikelihood, nullloglikelihood, nobs, stderror, vcov, residuals, predict, + import StatsBase: coef, coeftable, coefnames, confint, deviance, nulldeviance, dof, dof_residual, + loglikelihood, nullloglikelihood, nobs, stderror, vcov, + residuals, predict, predict!, fitted, fit, model_response, response, modelmatrix, r2, r², adjr2, adjr², PValue import StatsFuns: xlogy import SpecialFunctions: erfc, erfcinv, digamma, trigamma diff --git a/src/deprecated.jl b/src/deprecated.jl index 0d374063..8809daea 100644 --- a/src/deprecated.jl +++ b/src/deprecated.jl @@ -3,12 +3,13 @@ @deprecate confint(obj::LinearModel, level::Real) confint(obj, level=level) @deprecate confint(obj::AbstractGLM, level::Real) confint(obj, level=level) -function getproperty(m::AbstractGLM, field) - if field === :model - Base.depwarn("accessing the `model` field of `AbstractGLM` objects is deprecated, " * - "as it now returns its parent object", :getproperty) - return m +function Base.getproperty(mm::LinPredModel, f::Symbol) + if f === :model + Base.depwarn("accessing the `model` field of GLM.jl models is deprecated, " * + "as they are no longer wrapped in a `TableRegressionModel` " * + "and can be used directly now", :getproperty) + return mm else - return getfield(m, field) + return getfield(mm, f) end end \ No newline at end of file diff --git a/src/glmfit.jl b/src/glmfit.jl index c363474a..1a519bab 100644 --- a/src/glmfit.jl +++ b/src/glmfit.jl @@ -233,8 +233,6 @@ mutable struct GeneralizedLinearModel{G<:GlmResp,L<:LinPred,F<:Union{FormulaTerm fit::Bool end -formula(obj::GeneralizedLinearModel) = obj.f - function coeftable(mm::AbstractGLM; level::Real=0.95) cc = coef(mm) se = stderror(mm) diff --git a/src/linpred.jl b/src/linpred.jl index 54abd3df..bc2353d4 100644 --- a/src/linpred.jl +++ b/src/linpred.jl @@ -248,7 +248,7 @@ response(obj::LinPredModel) = obj.rr.y fitted(m::LinPredModel) = m.rr.mu predict(mm::LinPredModel) = fitted(mm) -formula(obj::LinPredModel) = obj.formula +formula(obj::LinPredModel) = obj.f residuals(obj::LinPredModel) = residuals(obj.rr) """ @@ -285,7 +285,7 @@ function StatsBase.predict(mm::LinPredModel, data; throw(ArgumentError("expected data in a Table, got $(typeof(data))")) f = formula(mm) - t = columntable(data) + t = Tables.columntable(data) cols, nonmissings = StatsModels.missing_omit(t, f.rhs) newx = modelcols(f.rhs, cols) prediction = Tables.allocatecolumn(Union{_coltype(f.lhs), Missing}, length(nonmissings)) diff --git a/src/lm.jl b/src/lm.jl index c45a37db..507aeaab 100644 --- a/src/lm.jl +++ b/src/lm.jl @@ -90,8 +90,6 @@ end LinearAlgebra.cholesky(x::LinearModel) = cholesky(x.pp) -formula(obj::LinearModel) = obj.f - function StatsBase.fit!(obj::LinearModel) if isempty(obj.rr.wts) delbeta!(obj.pp, obj.rr.y) diff --git a/test/runtests.jl b/test/runtests.jl index 38c1ae76..392edaa3 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -48,6 +48,8 @@ linreg(x::AbstractVecOrMat, y::AbstractVector) = qr!(simplemm(x)) \ y lm3 = lm(@formula(y~x), (y=1:25, x=repeat(1:5, 5)), contrasts=Dict(:x=>DummyCoding())) lm4 = lm(@formula(y~x), (y=1:25, x=categorical(repeat(1:5, 5)))) @test coef(lm3) == coef(lm4) ≈ [11, 1, 2, 3, 4] + # Deprecated method + @test lm1.model === lm1 end @testset "Linear Model Cook's Distance" begin @@ -292,6 +294,8 @@ dobson = DataFrame(Counts = [18.,17,15,20,10,20,25,13,12], @test isapprox(bic(gm1), 57.74744128863877) @test isapprox(coef(gm1)[1:3], 
[3.044522437723423,-0.45425527227759555,-0.29298712468147375]) + # Deprecated method + @test gm1.model === gm1 end ## Example from http://www.ats.ucla.edu/stat/r/dae/logit.htm From c244147bc8321979d358273ff8516e623d0fd3ee Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Sun, 19 Jun 2022 21:46:13 +0200 Subject: [PATCH 06/14] Docs, etc. --- docs/src/api.md | 4 ++-- docs/src/examples.md | 32 ++++++++------------------------ docs/src/index.md | 8 ++------ src/glmfit.jl | 13 +++++-------- src/linpred.jl | 9 ++++++--- src/lm.jl | 8 +++++--- test/runtests.jl | 8 ++++---- 7 files changed, 32 insertions(+), 50 deletions(-) diff --git a/docs/src/api.md b/docs/src/api.md index 46990470..c04b2b19 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -25,7 +25,7 @@ The most general approach to fitting a model is with the `fit` function, as in julia> using Random julia> fit(LinearModel, hcat(ones(10), 1:10), randn(MersenneTwister(12321), 10)) -LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}}}}: +LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, CholeskyPivoted{Float64, Matrix{Float64}}}}: Coefficients: ──────────────────────────────────────────────────────────────── @@ -41,7 +41,7 @@ This model can also be fit as julia> using Random julia> lm(hcat(ones(10), 1:10), randn(MersenneTwister(12321), 10)) -LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}}}}: +LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, CholeskyPivoted{Float64, Matrix{Float64}}}}: Coefficients: ──────────────────────────────────────────────────────────────── diff --git a/docs/src/examples.md b/docs/src/examples.md index f7b9ecd4..6252b284 100644 --- a/docs/src/examples.md +++ b/docs/src/examples.md @@ -20,9 +20,7 @@ julia> data = DataFrame(X=[1,2,3], Y=[2,4,7]) 3 │ 3 7 julia> ols = lm(@formula(Y ~ X), data) -StatsModels.TableRegressionModel{LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}}}}, Matrix{Float64}} - -Y ~ 1 + X +LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, CholeskyPivoted{Float64, Matrix{Float64}}}}: Coefficients: ───────────────────────────────────────────────────────────────────────── @@ -56,9 +54,7 @@ julia> data = DataFrame(X=[1,2,2], Y=[1,0,1]) 3 │ 2 1 julia> probit = glm(@formula(Y ~ X), data, Binomial(), ProbitLink()) -StatsModels.TableRegressionModel{GeneralizedLinearModel{GLM.GlmResp{Vector{Float64}, Binomial{Float64}, ProbitLink}, GLM.DensePredChol{Float64, LinearAlgebra.Cholesky{Float64, Matrix{Float64}}}}, Matrix{Float64}} - -Y ~ 1 + X +GeneralizedLinearModel{GLM.GlmResp{Vector{Float64}, Binomial{Float64}, ProbitLink}, GLM.DensePredChol{Float64, Cholesky{Float64, Matrix{Float64}}}}: Coefficients: ──────────────────────────────────────────────────────────────────────── @@ -97,9 +93,7 @@ julia> quine = dataset("MASS", "quine") 131 rows omitted julia> nbrmodel = glm(@formula(Days ~ Eth+Sex+Age+Lrn), quine, NegativeBinomial(2.0), LogLink()) -StatsModels.TableRegressionModel{GeneralizedLinearModel{GLM.GlmResp{Vector{Float64}, NegativeBinomial{Float64}, LogLink}, GLM.DensePredChol{Float64, LinearAlgebra.Cholesky{Float64, Matrix{Float64}}}}, Matrix{Float64}} - -Days ~ 1 + Eth + Sex + Age + Lrn +GeneralizedLinearModel{GLM.GlmResp{Vector{Float64}, NegativeBinomial{Float64}, LogLink}, GLM.DensePredChol{Float64, Cholesky{Float64, 
Matrix{Float64}}}}: Coefficients: ──────────────────────────────────────────────────────────────────────────── @@ -115,9 +109,7 @@ Lrn: SL 0.296768 0.185934 1.60 0.1105 -0.0676559 0.661191 ──────────────────────────────────────────────────────────────────────────── julia> nbrmodel = negbin(@formula(Days ~ Eth+Sex+Age+Lrn), quine, LogLink()) -StatsModels.TableRegressionModel{GeneralizedLinearModel{GLM.GlmResp{Vector{Float64}, NegativeBinomial{Float64}, LogLink}, GLM.DensePredChol{Float64, LinearAlgebra.Cholesky{Float64, Matrix{Float64}}}}, Matrix{Float64}} - -Days ~ 1 + Eth + Sex + Age + Lrn +GeneralizedLinearModel{GLM.GlmResp{Vector{Float64}, NegativeBinomial{Float64}, LogLink}, GLM.DensePredChol{Float64, Cholesky{Float64, Matrix{Float64}}}}: Coefficients: ──────────────────────────────────────────────────────────────────────────── @@ -164,9 +156,7 @@ julia> form = dataset("datasets", "Formaldehyde") 6 │ 0.9 0.782 julia> lm1 = fit(LinearModel, @formula(OptDen ~ Carb), form) -StatsModels.TableRegressionModel{LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}}}}, Matrix{Float64}} - -OptDen ~ 1 + Carb +LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, CholeskyPivoted{Float64, Matrix{Float64}}}}: Coefficients: ─────────────────────────────────────────────────────────────────────────── @@ -213,9 +203,7 @@ julia> LifeCycleSavings = dataset("datasets", "LifeCycleSavings") 35 rows omitted julia> fm2 = fit(LinearModel, @formula(SR ~ Pop15 + Pop75 + DPI + DDPI), LifeCycleSavings) -StatsModels.TableRegressionModel{LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}}}}, Matrix{Float64}} - -SR ~ 1 + Pop15 + Pop75 + DPI + DDPI +LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, CholeskyPivoted{Float64, Matrix{Float64}}}}: Coefficients: ───────────────────────────────────────────────────────────────────────────────── @@ -321,9 +309,7 @@ julia> dobson = DataFrame(Counts = [18.,17,15,20,10,21,25,13,13], 9 │ 13.0 3 3 julia> gm1 = fit(GeneralizedLinearModel, @formula(Counts ~ Outcome + Treatment), dobson, Poisson()) -StatsModels.TableRegressionModel{GeneralizedLinearModel{GLM.GlmResp{Vector{Float64}, Poisson{Float64}, LogLink}, GLM.DensePredChol{Float64, LinearAlgebra.Cholesky{Float64, Matrix{Float64}}}}, Matrix{Float64}} - -Counts ~ 1 + Outcome + Treatment +GeneralizedLinearModel{GLM.GlmResp{Vector{Float64}, Poisson{Float64}, LogLink}, GLM.DensePredChol{Float64, Cholesky{Float64, Matrix{Float64}}}}: Coefficients: ──────────────────────────────────────────────────────────────────────────── @@ -378,9 +364,7 @@ julia> round(optimal_bic.minimizer, digits = 5) # Optimal λ 0.40935 julia> glm(@formula(Volume ~ Height + Girth), trees, Normal(), PowerLink(optimal_bic.minimizer)) # Best model -StatsModels.TableRegressionModel{GeneralizedLinearModel{GLM.GlmResp{Vector{Float64}, Normal{Float64}, PowerLink}, GLM.DensePredChol{Float64, LinearAlgebra.Cholesky{Float64, Matrix{Float64}}}}, Matrix{Float64}} - -Volume ~ 1 + Height + Girth +GeneralizedLinearModel{GLM.GlmResp{Vector{Float64}, Normal{Float64}, PowerLink}, GLM.DensePredChol{Float64, Cholesky{Float64, Matrix{Float64}}}}: Coefficients: ──────────────────────────────────────────────────────────────────────────── diff --git a/docs/src/index.md b/docs/src/index.md index cb2e96d4..a2d5426c 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -85,9 +85,7 @@ julia> data = DataFrame(y = 
rand(rng, 100), x = categorical(repeat([1, 2, 3, 4], julia> lm(@formula(y ~ x), data) -StatsModels.TableRegressionModel{LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}}}}, Matrix{Float64}} - -y ~ 1 + x +LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, CholeskyPivoted{Float64, Matrix{Float64}}}}: Coefficients: ─────────────────────────────────────────────────────────────────────────── @@ -108,9 +106,7 @@ julia> using StableRNGs julia> data = DataFrame(y = rand(StableRNG(1), 100), x = repeat([1, 2, 3, 4], 25)); julia> lm(@formula(y ~ x), data, contrasts = Dict(:x => DummyCoding())) -StatsModels.TableRegressionModel{LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}}}}, Matrix{Float64}} - -y ~ 1 + x +LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, CholeskyPivoted{Float64, Matrix{Float64}}}}: Coefficients: ─────────────────────────────────────────────────────────────────────────── diff --git a/src/glmfit.jl b/src/glmfit.jl index 1a519bab..8e2f20be 100644 --- a/src/glmfit.jl +++ b/src/glmfit.jl @@ -226,10 +226,10 @@ end abstract type AbstractGLM <: LinPredModel end -mutable struct GeneralizedLinearModel{G<:GlmResp,L<:LinPred,F<:Union{FormulaTerm,Nothing}} <: AbstractGLM +mutable struct GeneralizedLinearModel{G<:GlmResp,L<:LinPred} <: AbstractGLM rr::G pp::L - f::F + f::Union{FormulaTerm,Nothing} fit::Bool end @@ -504,7 +504,6 @@ function fit(::Type{M}, data, d::UnivariateDistribution, l::Link=canonicallink(d); - # TODO: support passing wts and offset as symbols offset::Union{AbstractVector, Nothing} = nothing, wts::Union{AbstractVector, Nothing} = nothing, dofit::Bool = true, @@ -517,11 +516,9 @@ function fit(::Type{M}, throw(DimensionMismatch("number of rows in X and y must match")) end - # TODO: allocate right type upfront - yf = float(y) - off = offset === nothing ? similar(yf, 0) : offset - wts = wts === nothing ? similar(yf, 0) : wts - rr = GlmResp(yf, d, l, off, wts) + off = offset === nothing ? similar(y, 0) : offset + wts = wts === nothing ? similar(y, 0) : wts + rr = GlmResp(y, d, l, off, wts) res = M(rr, cholpred(X), f, false) return dofit ? fit!(res; fitargs...) : res end diff --git a/src/linpred.jl b/src/linpred.jl index bc2353d4..de729350 100644 --- a/src/linpred.jl +++ b/src/linpred.jl @@ -237,7 +237,10 @@ end function modelframe(f::FormulaTerm, data, contrasts::AbstractDict) Tables.istable(data) || throw(ArgumentError("expected data in a Table, got $(typeof(data))")) - data, _ = StatsModels.missing_omit(Tables.columntable(data), f) + t = Tables.columntable(data) + msg = StatsModels.checknamesexist(f, t) + msg != "" && throw(ArgumentError(msg)) + data, _ = StatsModels.missing_omit(t, f) sch = schema(f, data, contrasts) f = apply_schema(f, sch, LinPredModel) f, modelcols(f, data) @@ -299,8 +302,8 @@ function StatsBase.predict(mm::LinPredModel, data; lower = Vector{Union{Float64, Missing}}(missing, nr) upper = Vector{Union{Float64, Missing}}(missing, nr) tup = (prediction=view(prediction, nonmissinginds), - lower=view(lower, nonmissinginds), - upper=view(upper, nonmissinginds)) + lower=view(lower, nonmissinginds), + upper=view(upper, nonmissinginds)) predict!(tup, mm, new_x; kwargs...) 
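+        # As with the point predictions, only the entries of `lower` and
+        # `upper` selected by `nonmissinginds` are written to; rows with a
+        # missing predictor keep their initial `missing` values.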
return (prediction=prediction, lower=lower, upper=upper) end diff --git a/src/lm.jl b/src/lm.jl index 507aeaab..fda423ff 100644 --- a/src/lm.jl +++ b/src/lm.jl @@ -75,17 +75,19 @@ residuals(r::LmResp) = r.y - r.mu """ LinearModel -A combination of a [`LmResp`](@ref) and a [`LinPred`](@ref) +A combination of a [`LmResp`](@ref), a [`LinPred`](@ref), +and possibly a `FormulaTerm` # Members - `rr`: a `LmResp` object - `pp`: a `LinPred` object +- `f`: either a `FormulaTerm` object or `nothing` """ -struct LinearModel{L<:LmResp,T<:LinPred,F<:Union{FormulaTerm,Nothing}} <: LinPredModel +struct LinearModel{L<:LmResp,T<:LinPred} <: LinPredModel rr::L pp::T - f::F + f::Union{FormulaTerm,Nothing} end LinearAlgebra.cholesky(x::LinearModel) = cholesky(x.pp) diff --git a/test/runtests.jl b/test/runtests.jl index 392edaa3..fd3b8547 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -730,8 +730,8 @@ end gm11, newX, interval=:confidence, interval_method=:delta) == predict(gm11, newX, interval=:confidence, interval_method=:delta) @test predict!((prediction=similar(Y, size(newX, 1)), - lower=similar(Y, size(newX, 1)), - upper=similar(Y, size(newX, 1))), + lower=similar(Y, size(newX, 1)), + upper=similar(Y, size(newX, 1))), gm11, newX, interval=:confidence, interval_method=:transformation) == predict(gm11, newX, interval=:confidence, interval_method=:transformation) @test_throws ArgumentError predict!((prediction=similar(Y, size(newX, 1)), @@ -819,8 +819,8 @@ end mm, newX, interval=:confidence) == predict(mm, newX, interval=:confidence) @test predict!((prediction=similar(Y, size(newX, 1)), - lower=similar(Y, size(newX, 1)), - upper=similar(Y, size(newX, 1))), + lower=similar(Y, size(newX, 1)), + upper=similar(Y, size(newX, 1))), mm, newX, interval=:prediction) == predict(mm, newX, interval=:prediction) @test_throws ArgumentError predict!((prediction=similar(Y, size(newX, 1)), From fc0704adeb1bf3754fe5671fd8e67dc8e3025880 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Sun, 19 Jun 2022 22:25:07 +0200 Subject: [PATCH 07/14] Docs, contrasts --- docs/src/api.md | 4 ++-- docs/src/examples.md | 16 ++++++++-------- docs/src/index.md | 4 ++-- src/glmfit.jl | 5 +++++ src/lm.jl | 13 ++++++++++--- test/runtests.jl | 35 ++++++++++++++++++++++++++++++----- 6 files changed, 57 insertions(+), 20 deletions(-) diff --git a/docs/src/api.md b/docs/src/api.md index c04b2b19..46990470 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -25,7 +25,7 @@ The most general approach to fitting a model is with the `fit` function, as in julia> using Random julia> fit(LinearModel, hcat(ones(10), 1:10), randn(MersenneTwister(12321), 10)) -LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, CholeskyPivoted{Float64, Matrix{Float64}}}}: +LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}}}}: Coefficients: ──────────────────────────────────────────────────────────────── @@ -41,7 +41,7 @@ This model can also be fit as julia> using Random julia> lm(hcat(ones(10), 1:10), randn(MersenneTwister(12321), 10)) -LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, CholeskyPivoted{Float64, Matrix{Float64}}}}: +LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}}}}: Coefficients: ──────────────────────────────────────────────────────────────── diff --git a/docs/src/examples.md b/docs/src/examples.md index 6252b284..e64b80f6 100644 --- a/docs/src/examples.md +++ 
b/docs/src/examples.md @@ -20,7 +20,7 @@ julia> data = DataFrame(X=[1,2,3], Y=[2,4,7]) 3 │ 3 7 julia> ols = lm(@formula(Y ~ X), data) -LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, CholeskyPivoted{Float64, Matrix{Float64}}}}: +LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}}}}: Coefficients: ───────────────────────────────────────────────────────────────────────── @@ -54,7 +54,7 @@ julia> data = DataFrame(X=[1,2,2], Y=[1,0,1]) 3 │ 2 1 julia> probit = glm(@formula(Y ~ X), data, Binomial(), ProbitLink()) -GeneralizedLinearModel{GLM.GlmResp{Vector{Float64}, Binomial{Float64}, ProbitLink}, GLM.DensePredChol{Float64, Cholesky{Float64, Matrix{Float64}}}}: +GeneralizedLinearModel{GLM.GlmResp{Vector{Float64}, Binomial{Float64}, ProbitLink}, GLM.DensePredChol{Float64, LinearAlgebra.Cholesky{Float64, Matrix{Float64}}}}: Coefficients: ──────────────────────────────────────────────────────────────────────── @@ -93,7 +93,7 @@ julia> quine = dataset("MASS", "quine") 131 rows omitted julia> nbrmodel = glm(@formula(Days ~ Eth+Sex+Age+Lrn), quine, NegativeBinomial(2.0), LogLink()) -GeneralizedLinearModel{GLM.GlmResp{Vector{Float64}, NegativeBinomial{Float64}, LogLink}, GLM.DensePredChol{Float64, Cholesky{Float64, Matrix{Float64}}}}: +GeneralizedLinearModel{GLM.GlmResp{Vector{Float64}, NegativeBinomial{Float64}, LogLink}, GLM.DensePredChol{Float64, LinearAlgebra.Cholesky{Float64, Matrix{Float64}}}}: Coefficients: ──────────────────────────────────────────────────────────────────────────── @@ -109,7 +109,7 @@ Lrn: SL 0.296768 0.185934 1.60 0.1105 -0.0676559 0.661191 ──────────────────────────────────────────────────────────────────────────── julia> nbrmodel = negbin(@formula(Days ~ Eth+Sex+Age+Lrn), quine, LogLink()) -GeneralizedLinearModel{GLM.GlmResp{Vector{Float64}, NegativeBinomial{Float64}, LogLink}, GLM.DensePredChol{Float64, Cholesky{Float64, Matrix{Float64}}}}: +GeneralizedLinearModel{GLM.GlmResp{Vector{Float64}, NegativeBinomial{Float64}, LogLink}, GLM.DensePredChol{Float64, LinearAlgebra.Cholesky{Float64, Matrix{Float64}}}}: Coefficients: ──────────────────────────────────────────────────────────────────────────── @@ -156,7 +156,7 @@ julia> form = dataset("datasets", "Formaldehyde") 6 │ 0.9 0.782 julia> lm1 = fit(LinearModel, @formula(OptDen ~ Carb), form) -LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, CholeskyPivoted{Float64, Matrix{Float64}}}}: +LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}}}}: Coefficients: ─────────────────────────────────────────────────────────────────────────── @@ -203,7 +203,7 @@ julia> LifeCycleSavings = dataset("datasets", "LifeCycleSavings") 35 rows omitted julia> fm2 = fit(LinearModel, @formula(SR ~ Pop15 + Pop75 + DPI + DDPI), LifeCycleSavings) -LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, CholeskyPivoted{Float64, Matrix{Float64}}}}: +LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}}}}: Coefficients: ───────────────────────────────────────────────────────────────────────────────── @@ -309,7 +309,7 @@ julia> dobson = DataFrame(Counts = [18.,17,15,20,10,21,25,13,13], 9 │ 13.0 3 3 julia> gm1 = fit(GeneralizedLinearModel, @formula(Counts ~ Outcome + Treatment), dobson, Poisson()) -GeneralizedLinearModel{GLM.GlmResp{Vector{Float64}, Poisson{Float64}, LogLink}, GLM.DensePredChol{Float64, 
Cholesky{Float64, Matrix{Float64}}}}: +GeneralizedLinearModel{GLM.GlmResp{Vector{Float64}, Poisson{Float64}, LogLink}, GLM.DensePredChol{Float64, LinearAlgebra.Cholesky{Float64, Matrix{Float64}}}}: Coefficients: ──────────────────────────────────────────────────────────────────────────── @@ -364,7 +364,7 @@ julia> round(optimal_bic.minimizer, digits = 5) # Optimal λ 0.40935 julia> glm(@formula(Volume ~ Height + Girth), trees, Normal(), PowerLink(optimal_bic.minimizer)) # Best model -GeneralizedLinearModel{GLM.GlmResp{Vector{Float64}, Normal{Float64}, PowerLink}, GLM.DensePredChol{Float64, Cholesky{Float64, Matrix{Float64}}}}: +GeneralizedLinearModel{GLM.GlmResp{Vector{Float64}, Normal{Float64}, PowerLink}, GLM.DensePredChol{Float64, LinearAlgebra.Cholesky{Float64, Matrix{Float64}}}}: Coefficients: ──────────────────────────────────────────────────────────────────────────── diff --git a/docs/src/index.md b/docs/src/index.md index a2d5426c..ddb55002 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -85,7 +85,7 @@ julia> data = DataFrame(y = rand(rng, 100), x = categorical(repeat([1, 2, 3, 4], julia> lm(@formula(y ~ x), data) -LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, CholeskyPivoted{Float64, Matrix{Float64}}}}: +LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}}}}: Coefficients: ─────────────────────────────────────────────────────────────────────────── @@ -106,7 +106,7 @@ julia> using StableRNGs julia> data = DataFrame(y = rand(StableRNG(1), 100), x = repeat([1, 2, 3, 4], 25)); julia> lm(@formula(y ~ x), data, contrasts = Dict(:x => DummyCoding())) -LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, CholeskyPivoted{Float64, Matrix{Float64}}}}: +LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}}}}: Coefficients: ─────────────────────────────────────────────────────────────────────────── diff --git a/src/glmfit.jl b/src/glmfit.jl index 8e2f20be..baca5075 100644 --- a/src/glmfit.jl +++ b/src/glmfit.jl @@ -460,6 +460,11 @@ const FIT_GLM_DOC = """ - `minstepfac::Real=0.001`: Minimum line step fraction. Must be between 0 and 1. - `start::AbstractVector=nothing`: Starting values for beta. Should have the same length as the number of columns in the model matrix. + - `contrasts::AbstractDict{Symbol}=Dict{Symbol,Any}()`: a `Dict` mapping term names + (as `Symbol`s) to term or contrast types. If a contrast is not provided + for a variable, the appropriate term type will be guessed based on the data type + from the data column: any numeric data is assumed to be continuous, and any + non-numeric data is assumed to be categorical. """ """ diff --git a/src/lm.jl b/src/lm.jl index fda423ff..0bf93f71 100644 --- a/src/lm.jl +++ b/src/lm.jl @@ -122,12 +122,18 @@ const FIT_LM_DOC = """ is less-than-full rank. If `true` (the default), only the first of each set of linearly-dependent columns is used. The coefficient for redundant linearly dependent columns is `0.0` and all associated statistics are set to `NaN`. + + `contrasts` may optionally be supplied in the form of a `Dict` mapping term names + (as `Symbol`s) to term or contrast types. If a contrast is not provided + for a variable, the appropriate term type will be guessed based on the data type + from the data column: any numeric data is assumed to be continuous, and any + non-numeric data is assumed to be categorical. 
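+
+    For example (an illustrative sketch, reusing the toy data from this
+    package's test suite), dummy coding can be requested for an integer
+    column with:
+
+    ```julia
+    lm(@formula(y ~ x), (y=1:25, x=repeat(1:5, 5)), contrasts=Dict(:x => DummyCoding()))
+    ```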
""" -# TODO: document contrasts keyword argument """ fit(LinearModel, formula::FormulaTerm, data, allowrankdeficient=false; - [wts::AbstractVector], dropcollinear::Bool=true) + [wts::AbstractVector], dropcollinear::Bool=true, + contrasts::AbstractDict{Symbol}=Dict{Symbol,Any}()) fit(LinearModel, X::AbstractMatrix, y::AbstractVector; wts::AbstractVector=similar(y, 0), dropcollinear::Bool=true) @@ -159,7 +165,8 @@ end """ lm(formula, data, allowrankdeficient=false; - [wts::AbstractVector], dropcollinear::Bool=true) + [wts::AbstractVector], dropcollinear::Bool=true, + contrasts::AbstractDict{Symbol}=Dict{Symbol,Any}()) lm(X::AbstractMatrix, y::AbstractVector; wts::AbstractVector=similar(y, 0), dropcollinear::Bool=true) diff --git a/test/runtests.jl b/test/runtests.jl index fd3b8547..90d407c9 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -45,9 +45,6 @@ linreg(x::AbstractVecOrMat, y::AbstractVector) = qr!(simplemm(x)) \ y @test isa(lm2.pp.chol, CholeskyPivoted) @test lm2.pp.chol.piv == [2, 1] @test isapprox(coef(lm1), coef(lm2) .* [1., 10.]) - lm3 = lm(@formula(y~x), (y=1:25, x=repeat(1:5, 5)), contrasts=Dict(:x=>DummyCoding())) - lm4 = lm(@formula(y~x), (y=1:25, x=categorical(repeat(1:5, 5)))) - @test coef(lm3) == coef(lm4) ≈ [11, 1, 2, 3, 4] # Deprecated method @test lm1.model === lm1 end @@ -780,8 +777,12 @@ end @test isequal(predict(gm13m, dm), predict(gm13, dm)) expected = allowmissing(predict(gm13m, drep)) expected[3] = missing - @test isequal(predict(gm13m, dm), expected) - @test isequal(predict(gm13, dm), expected) + @test collect(skipmissing(predict(gm13m, dm))) ≈ + collect(skipmissing(predict(gm13, dm))) ≈ + collect(skipmissing(expected)) + @test ismissing.(predict(gm13m, dm)) == + ismissing.(predict(gm13, dm)) == + ismissing.(expected) # Linear Model @@ -1383,3 +1384,27 @@ end @test predict(mdl1) ≈ predict(mdl2) end end + +@testset "contrasts argument" begin + m = lm(@formula(y~x), (y=1:25, x=repeat(1:5, 5)), + contrasts=Dict(:x=>DummyCoding())) + mcat = lm(@formula(y~x), (y=1:25, x=categorical(repeat(1:5, 5)))) + gm = glm(@formula(y~x), (y=1:25, x=repeat(1:5, 5)), Normal(), IdentityLink(), + contrasts=Dict(:x=>DummyCoding())) + gmcat = glm(@formula(y~x), (y=1:25, x=categorical(repeat(1:5, 5))), + Normal(), IdentityLink()) + @test coef(m) == coef(mcat) ≈ [11, 1, 2, 3, 4] + @test coef(gm) == coef(gmcat) ≈ [11, 1, 2, 3, 4] + + m = fit(LinearModel, @formula(y~x), (y=1:25, x=repeat(1:5, 5)), + contrasts=Dict(:x=>DummyCoding())) + mcat = fit(LinearModel, @formula(y~x), (y=1:25, x=categorical(repeat(1:5, 5)))) + gm = fit(GeneralizedLinearModel, @formula(y~x), + (y=1:25, x=repeat(1:5, 5)), Normal(), IdentityLink(), + contrasts=Dict(:x=>DummyCoding())) + gmcat = fit(GeneralizedLinearModel, @formula(y~x), + (y=1:25, x=categorical(repeat(1:5, 5))), + Normal(), IdentityLink()) + @test coef(m) == coef(mcat) ≈ [11, 1, 2, 3, 4] + @test coef(gm) == coef(gmcat) ≈ [11, 1, 2, 3, 4] +end From 399979f7b4ae94bf5e8e3808737dd66063df2c73 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Sun, 11 Sep 2022 12:26:32 +0200 Subject: [PATCH 08/14] Review fixes, more tests --- src/glmfit.jl | 14 +++--- src/linpred.jl | 21 ++++++--- src/lm.jl | 14 +++--- test/runtests.jl | 110 ++++++++++++++++++++++++++++++++++++++++++++--- 4 files changed, 135 insertions(+), 24 deletions(-) diff --git a/src/glmfit.jl b/src/glmfit.jl index baca5075..0f3c3276 100644 --- a/src/glmfit.jl +++ b/src/glmfit.jl @@ -229,7 +229,7 @@ abstract type AbstractGLM <: LinPredModel end mutable struct 
GeneralizedLinearModel{G<:GlmResp,L<:LinPred} <: AbstractGLM rr::G pp::L - f::Union{FormulaTerm,Nothing} + formula::Union{FormulaTerm,Nothing} fit::Bool end @@ -240,7 +240,7 @@ function coeftable(mm::AbstractGLM; level::Real=0.95) p = 2 * ccdf.(Ref(Normal()), abs.(zz)) ci = se*quantile(Normal(), (1-level)/2) levstr = isinteger(level*100) ? string(Integer(level*100)) : string(level*100) - cn = mm.f === nothing ? ["x$i" for i = 1:size(mm.pp.X, 2)] : coefnames(mm) + cn = coefnames(mm) CoefTable(hcat(cc,se,zz,p,cc+ci,cc-ci), ["Coef.","Std. Error","z","Pr(>|z|)","Lower $levstr%","Upper $levstr%"], cn, 4, 3) @@ -461,10 +461,12 @@ const FIT_GLM_DOC = """ - `start::AbstractVector=nothing`: Starting values for beta. Should have the same length as the number of columns in the model matrix. - `contrasts::AbstractDict{Symbol}=Dict{Symbol,Any}()`: a `Dict` mapping term names - (as `Symbol`s) to term or contrast types. If a contrast is not provided - for a variable, the appropriate term type will be guessed based on the data type - from the data column: any numeric data is assumed to be continuous, and any - non-numeric data is assumed to be categorical. + (as `Symbol`s) to term types (e.g. `ContinuousTerm`) or contrasts + (e.g., `HelmertCoding()`, `SeqDiffCoding(; levels=["a", "b", "c"])`, + etc.). If contrasts are not provided for a variable, the appropriate + term type will be guessed based on the data type from the data column: + any numeric data is assumed to be continuous, and any non-numeric data + is assumed to be categorical. """ """ diff --git a/src/linpred.jl b/src/linpred.jl index de729350..2b75cd06 100644 --- a/src/linpred.jl +++ b/src/linpred.jl @@ -251,9 +251,13 @@ response(obj::LinPredModel) = obj.rr.y fitted(m::LinPredModel) = m.rr.mu predict(mm::LinPredModel) = fitted(mm) -formula(obj::LinPredModel) = obj.f residuals(obj::LinPredModel) = residuals(obj.rr) +function formula(obj::LinPredModel) + obj.formula === nothing && throw(ArgumentError("model was fitted without a formula")) + return obj.formula +end + """ nobs(obj::LinearModel) nobs(obj::GLM) @@ -271,7 +275,8 @@ end coef(x::LinPred) = x.beta0 coef(obj::LinPredModel) = coef(obj.pp) -coefnames(x::LinPredModel) = coefnames(formula(x).rhs) +coefnames(x::LinPredModel) = + x.formula === nothing ? ["x$i" for i in 1:length(coef(x))] : coefnames(formula(x).rhs) dof_residual(obj::LinPredModel) = nobs(obj) - dof(obj) + 1 @@ -282,7 +287,7 @@ _coltype(::ContinuousTerm{T}) where {T} = T # Function common to all LinPred models, but documented separately # for LinearModel and GeneralizedLinearModel function StatsBase.predict(mm::LinPredModel, data; - interval::Union{Symbol,Nothing}=nothing, level::Real=0.95, + interval::Union{Symbol,Nothing}=nothing, kwargs...) Tables.istable(data) || throw(ArgumentError("expected data in a Table, got $(typeof(data))")) @@ -294,17 +299,19 @@ function StatsBase.predict(mm::LinPredModel, data; prediction = Tables.allocatecolumn(Union{_coltype(f.lhs), Missing}, length(nonmissings)) fill!(prediction, missing) if interval === nothing - predict!(view(prediction, nonmissings), mm, newx; kwargs...) + predict!(view(prediction, nonmissings), mm, newx; + interval=interval, kwargs...) 
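+        # `nonmissings` marks the rows of `data` with no missing predictors:
+        # only those entries of `prediction` are written, so the remaining
+        # rows keep the `missing` placeholder set by `fill!` above.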
return prediction else # Finding integer indices once is faster nonmissinginds = findall(nonmissings) - lower = Vector{Union{Float64, Missing}}(missing, nr) - upper = Vector{Union{Float64, Missing}}(missing, nr) + lower = Vector{Union{Float64, Missing}}(missing, length(nonmissings)) + upper = Vector{Union{Float64, Missing}}(missing, length(nonmissings)) tup = (prediction=view(prediction, nonmissinginds), lower=view(lower, nonmissinginds), upper=view(upper, nonmissinginds)) - predict!(tup, mm, new_x; kwargs...) + predict!(tup, mm, newx; + interval=interval, kwargs...) return (prediction=prediction, lower=lower, upper=upper) end end \ No newline at end of file diff --git a/src/lm.jl b/src/lm.jl index 0bf93f71..c65e2b68 100644 --- a/src/lm.jl +++ b/src/lm.jl @@ -87,7 +87,7 @@ and possibly a `FormulaTerm` struct LinearModel{L<:LmResp,T<:LinPred} <: LinPredModel rr::L pp::T - f::Union{FormulaTerm,Nothing} + formula::Union{FormulaTerm,Nothing} end LinearAlgebra.cholesky(x::LinearModel) = cholesky(x.pp) @@ -124,10 +124,12 @@ const FIT_LM_DOC = """ `0.0` and all associated statistics are set to `NaN`. `contrasts` may optionally be supplied in the form of a `Dict` mapping term names - (as `Symbol`s) to term or contrast types. If a contrast is not provided - for a variable, the appropriate term type will be guessed based on the data type - from the data column: any numeric data is assumed to be continuous, and any - non-numeric data is assumed to be categorical. + (as `Symbol`s) to term types (e.g. `ContinuousTerm`) or contrasts + (e.g., `HelmertCoding()`, `SeqDiffCoding(; levels=["a", "b", "c"])`, + etc.). If contrasts are not provided for a variable, the appropriate + term type will be guessed based on the data type from the data column: + any numeric data is assumed to be continuous, and any non-numeric data + is assumed to be categorical. """ """ @@ -254,7 +256,7 @@ function coeftable(mm::LinearModel; level::Real=0.95) ci = [isnan(t) ? NaN : -Inf for t in tt] end levstr = isinteger(level*100) ? string(Integer(level*100)) : string(level*100) - cn = mm.f === nothing ? ["x$i" for i = 1:size(mm.pp.X, 2)] : coefnames(mm) + cn = coefnames(mm) CoefTable(hcat(cc,se,tt,p,cc+ci,cc-ci), ["Coef.","Std. 
Error","t","Pr(>|t|)","Lower $levstr%","Upper $levstr%"],
              cn, 4, 3)
diff --git a/test/runtests.jl b/test/runtests.jl
index 90d407c9..ba5e3d1e 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -760,9 +760,18 @@ end
     gm13 = fit(GeneralizedLinearModel, @formula(y ~ 0 + x1 + x2), d, Binomial())
     @test predict(gm13) ≈ predict(gm13, d[:,[:x1, :x2]]) == predict(gm13, X)
     @test predict(gm13) ≈ predict(gm13, d) == predict(gm13, X)
+    @test predict(gm13) ≈ predict(gm11)
+    @test predict(gm13, newX; interval=:confidence, interval_method=:delta) ==
+          predict(gm11, newX; interval=:confidence, interval_method=:delta)
+    @test predict(gm13, newX; interval=:confidence, interval_method=:transformation) ==
+          predict(gm11, newX; interval=:confidence, interval_method=:transformation)
 
     newd = DataFrame(newX, :auto)
     @test predict(gm13, newd) == predict(gm13, newX)
+    @test predict(gm13, newd; interval=:confidence, interval_method=:delta) ==
+          predict(gm11, newX; interval=:confidence, interval_method=:delta)
+    @test predict(gm13, newd; interval=:confidence, interval_method=:transformation) ==
+          predict(gm11, newX; interval=:confidence, interval_method=:transformation)
 
 
     # Prediction from DataFrames with missing values
@@ -775,14 +784,32 @@ end
     @test predict(gm13m) == predict(gm13)
     @test predict(gm13m, d) == predict(gm13, d)
     @test isequal(predict(gm13m, dm), predict(gm13, dm))
-    expected = allowmissing(predict(gm13m, drep))
-    expected[3] = missing
+    gm13m_pred2 = predict(gm13m, dm; interval=:confidence, interval_method=:delta)
+    gm13m_pred3 = predict(gm13m, dm; interval=:confidence, interval_method=:transformation)
+    expected_pred = allowmissing(predict(gm13m, drep))
+    expected_pred[3] = missing
     @test collect(skipmissing(predict(gm13m, dm))) ≈
           collect(skipmissing(predict(gm13, dm))) ≈
-          collect(skipmissing(expected))
+          collect(skipmissing(gm13m_pred2.prediction)) ==
+          collect(skipmissing(gm13m_pred3.prediction)) ==
+          collect(skipmissing(expected_pred))
     @test ismissing.(predict(gm13m, dm)) ==
           ismissing.(predict(gm13, dm)) ==
-          ismissing.(expected)
+          ismissing.(gm13m_pred2.prediction) ==
+          ismissing.(gm13m_pred3.prediction) ==
+          ismissing.(expected_pred)
+    expected_lower =
+        allowmissing(predict(gm13m, drep;
+                             interval=:confidence, interval_method=:delta).lower)
+    expected_lower[3] = missing
+    @test collect(skipmissing(gm13m_pred2.lower)) == collect(skipmissing(expected_lower))
+    @test ismissing.(gm13m_pred2.lower) == ismissing.(expected_lower)
+    expected_upper =
+        allowmissing(predict(gm13m, drep;
+                             interval=:confidence, interval_method=:delta).upper)
+    expected_upper[3] = missing
+    @test collect(skipmissing(gm13m_pred2.upper)) == collect(skipmissing(expected_upper))
+    @test ismissing.(gm13m_pred2.upper) == ismissing.(expected_upper)
 
 
     # Linear Model
@@ -822,7 +849,7 @@ end
     @test predict!((prediction=similar(Y, size(newX, 1)),
                     lower=similar(Y, size(newX, 1)),
                     upper=similar(Y, size(newX, 1))),
-                    mm, newX, interval=:prediction) ==
+                   mm, newX, interval=:prediction) ==
          predict(mm, newX, interval=:prediction)
     @test_throws ArgumentError predict!((prediction=similar(Y, size(newX, 1)),
                                          lower=similar(Y, size(newX, 1)),
@@ -838,6 +865,63 @@ end
                                               mm, newX, interval=:confidence)
 
 
+    # Prediction from DataFrames
+    d = DataFrame(X, :auto)
+    d.y = Ylm
+
+    mmd = lm(@formula(y ~ 0 + x1 + x2), d)
+    @test predict(mmd) ≈ predict(mmd, d[:,[:x1, :x2]]) == predict(mm, X)
+    @test predict(mmd) ≈ predict(mmd, d) == predict(mm, X)
+    @test predict(mmd) ≈ predict(mm)
+    @test predict(mmd, newX; interval=:confidence) ==
+          predict(mm, newX; interval=:confidence)
+    @test predict(mmd, newX; interval=:prediction) ==
+          predict(mm, newX; interval=:prediction)
+
+    newd = DataFrame(newX, :auto)
+    @test predict(mmd, newd) == predict(mm, newX)
+    @test predict(mmd, newd; interval=:confidence) ==
+          predict(mm, newX; interval=:confidence)
+    @test predict(mmd, newd; interval=:prediction) ==
+          predict(mm, newX; interval=:prediction)
+
+
+    # Prediction from DataFrames with missing values
+    drep = d[[1, 2, 3, 3, 4, 5, 6, 7, 8, 8, 9, 10], :]
+    dm = allowmissing(drep)
+    dm.x1[3] = missing
+    dm.y[9] = missing
+
+    mmdm = lm(@formula(y ~ 0 + x1 + x2), dm)
+    @test predict(mmdm) == predict(mmd)
+    @test predict(mmdm, d) == predict(mmd, d)
+    @test isequal(predict(mmdm, dm), predict(mmd, dm))
+    mmdm_pred2 = predict(mmdm, dm; interval=:confidence)
+    mmdm_pred3 = predict(mmdm, dm; interval=:prediction)
+    expected_pred = allowmissing(predict(mmdm, drep))
+    expected_pred[3] = missing
+    @test collect(skipmissing(predict(mmdm, dm))) ≈
+          collect(skipmissing(predict(mmd, dm))) ≈
+          collect(skipmissing(mmdm_pred2.prediction)) ==
+          collect(skipmissing(mmdm_pred3.prediction)) ==
+          collect(skipmissing(expected_pred))
+    @test ismissing.(predict(mmdm, dm)) ==
+          ismissing.(predict(mmd, dm)) ==
+          ismissing.(mmdm_pred2.prediction) ==
+          ismissing.(mmdm_pred3.prediction) ==
+          ismissing.(expected_pred)
+    expected_lower =
+        allowmissing(predict(mmdm, drep; interval=:confidence).lower)
+    expected_lower[3] = missing
+    @test collect(skipmissing(mmdm_pred2.lower)) == collect(skipmissing(expected_lower))
+    @test ismissing.(mmdm_pred2.lower) == ismissing.(expected_lower)
+    expected_upper =
+        allowmissing(predict(mmdm, drep; interval=:confidence).upper)
+    expected_upper[3] = missing
+    @test collect(skipmissing(mmdm_pred2.upper)) == collect(skipmissing(expected_upper))
+    @test ismissing.(mmdm_pred2.upper) == ismissing.(expected_upper)
+
+
     # Prediction with dropcollinear (#409)
     x = [1.0 1.0
          1.0 2.0

From f9f30079be4cdbf3543f0fc030eb48775e86660c Mon Sep 17 00:00:00 2001
From: Milan Bouchet-Valat
Date: Sun, 11 Sep 2022 13:29:02 +0200
Subject: [PATCH 09/14] Fix doctests on Julia 1.8

---
 docs/src/api.md      | 4 ++--
 docs/src/examples.md | 6 +++---
 docs/src/index.md    | 4 ++--
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/docs/src/api.md b/docs/src/api.md
index 46990470..20723cf8 100644
--- a/docs/src/api.md
+++ b/docs/src/api.md
@@ -25,7 +25,7 @@ The most general approach to fitting a model is with the `fit` function, as in
 julia> using Random
 
 julia> fit(LinearModel, hcat(ones(10), 1:10), randn(MersenneTwister(12321), 10))
-LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}}}}:
+LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}:
 
 Coefficients:
──────────────────────────────────────────────────────────────── @@ -41,7 +41,7 @@ This model can also be fit as julia> using Random julia> lm(hcat(ones(10), 1:10), randn(MersenneTwister(12321), 10)) -LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}}}}: +LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}: Coefficients: ──────────────────────────────────────────────────────────────── diff --git a/docs/src/examples.md b/docs/src/examples.md index 94f37a20..6ace2262 100644 --- a/docs/src/examples.md +++ b/docs/src/examples.md @@ -20,7 +20,7 @@ julia> data = DataFrame(X=[1,2,3], Y=[2,4,7]) 3 │ 3 7 julia> ols = lm(@formula(Y ~ X), data) -LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}}}}: +LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}: Coefficients: ───────────────────────────────────────────────────────────────────────── @@ -199,7 +199,7 @@ julia> form = dataset("datasets", "Formaldehyde") 6 │ 0.9 0.782 julia> lm1 = fit(LinearModel, @formula(OptDen ~ Carb), form) -LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}}}}: +LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}: Coefficients: ─────────────────────────────────────────────────────────────────────────── @@ -246,7 +246,7 @@ julia> LifeCycleSavings = dataset("datasets", "LifeCycleSavings") 35 rows omitted julia> fm2 = fit(LinearModel, @formula(SR ~ Pop15 + Pop75 + DPI + DDPI), LifeCycleSavings) -LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}}}}: +LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}: Coefficients: ───────────────────────────────────────────────────────────────────────────────── diff --git a/docs/src/index.md b/docs/src/index.md index b1a2be85..4d2bb2a0 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -85,7 +85,7 @@ julia> data = DataFrame(y = rand(rng, 100), x = categorical(repeat([1, 2, 3, 4], julia> lm(@formula(y ~ x), data) -LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}}}}: +LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}: Coefficients: ─────────────────────────────────────────────────────────────────────────── @@ -106,7 +106,7 @@ julia> using StableRNGs julia> data = DataFrame(y = rand(StableRNG(1), 100), x = repeat([1, 2, 3, 4], 25)); julia> lm(@formula(y ~ x), data, contrasts = Dict(:x => DummyCoding())) -LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}}}}: +LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}: Coefficients: ─────────────────────────────────────────────────────────────────────────── From f9f30079be4cdbf3543f0fc030eb48775e86660c Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Sun, 11 Sep 2022 13:33:02 +0200 
Subject: [PATCH 10/14] =?UTF-8?q?Use=20=E2=89=88?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/runtests.jl | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index b5197c9c..6df62364 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -917,8 +917,8 @@ end expected_pred[3] = missing @test collect(skipmissing(predict(gm13m, dm))) ≈ collect(skipmissing(predict(gm13, dm))) ≈ - collect(skipmissing(gm13m_pred2.prediction)) == - collect(skipmissing(gm13m_pred3.prediction)) == + collect(skipmissing(gm13m_pred2.prediction)) ≈ + collect(skipmissing(gm13m_pred3.prediction)) ≈ collect(skipmissing(expected_pred)) @test ismissing.(predict(gm13m, dm)) == ismissing.(predict(gm13, dm)) == @@ -929,13 +929,13 @@ end allowmissing(predict(gm13m, drep; interval=:confidence, interval_method=:delta).lower) expected_lower[3] = missing - @test collect(skipmissing(gm13m_pred2.lower)) == collect(skipmissing(expected_lower)) + @test collect(skipmissing(gm13m_pred2.lower)) ≈ collect(skipmissing(expected_lower)) @test ismissing.(gm13m_pred2.lower) == ismissing.(expected_lower) expected_upper = allowmissing(predict(gm13m, drep; interval=:confidence, interval_method=:delta).upper) expected_upper[3] = missing - @test collect(skipmissing(gm13m_pred2.upper)) == collect(skipmissing(expected_upper)) + @test collect(skipmissing(gm13m_pred2.upper)) ≈ collect(skipmissing(expected_upper)) @test ismissing.(gm13m_pred2.upper) == ismissing.(expected_upper) @@ -1029,8 +1029,8 @@ end expected_pred[3] = missing @test collect(skipmissing(predict(mmdm, dm))) ≈ collect(skipmissing(predict(mmd, dm))) ≈ - collect(skipmissing(mmdm_pred2.prediction)) == - collect(skipmissing(mmdm_pred3.prediction)) == + collect(skipmissing(mmdm_pred2.prediction)) ≈ + collect(skipmissing(mmdm_pred3.prediction)) ≈ collect(skipmissing(expected_pred)) @test ismissing.(predict(mmdm, dm)) == ismissing.(predict(mmdm, dm)) == @@ -1040,12 +1040,12 @@ end expected_lower = allowmissing(predict(mmdm, drep; interval=:confidence).lower) expected_lower[3] = missing - @test collect(skipmissing(mmdm_pred2.lower)) == collect(skipmissing(expected_lower)) + @test collect(skipmissing(mmdm_pred2.lower)) ≈ collect(skipmissing(expected_lower)) @test ismissing.(mmdm_pred2.lower) == ismissing.(expected_lower) expected_upper = allowmissing(predict(mmdm, drep; interval=:confidence).upper) expected_upper[3] = missing - @test collect(skipmissing(mmdm_pred2.upper)) == collect(skipmissing(expected_upper)) + @test collect(skipmissing(mmdm_pred2.upper)) ≈ collect(skipmissing(expected_upper)) @test ismissing.(mmdm_pred2.upper) == ismissing.(expected_upper) From 93b3d9770b44e2d42d6ebc2648b49fbcb195969e Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Sat, 8 Oct 2022 15:25:56 +0200 Subject: [PATCH 11/14] Review fixes --- src/GLM.jl | 17 +++++++++++++++++ src/glmfit.jl | 41 +++++++++++++++++++---------------------- src/linpred.jl | 2 +- src/lm.jl | 29 ++++++++--------------------- test/runtests.jl | 41 +++++++++++++++++++++++++++++++++++++++++ 5 files changed, 86 insertions(+), 44 deletions(-) diff --git a/src/GLM.jl b/src/GLM.jl index 8a35945d..50ab2a2d 100644 --- a/src/GLM.jl +++ b/src/GLM.jl @@ -91,6 +91,23 @@ module GLM pivoted_cholesky!(A; kwargs...) = cholesky!(A, RowMaximum(); kwargs...) end + const COMMON_FIT_KWARGS_DOCS = """ + - `wts::Vector=similar(y,0)`: Prior frequency (a.k.a. case) weights of observations. 
+ Such weights are equivalent to repeating each observation a number of times equal + to its weight. Do note that this interpretation gives equal point estimates but + different standard errors from analytical (a.k.a. inverse variance) weights and + from probability (a.k.a. sampling) weights which are the default in some other + software. + Can be length 0 to indicate no weighting (default). + - `contrasts::AbstractDict{Symbol}=Dict{Symbol,Any}()`: a `Dict` mapping term names + (as `Symbol`s) to term types (e.g. `ContinuousTerm`) or contrasts + (e.g., `HelmertCoding()`, `SeqDiffCoding(; levels=["a", "b", "c"])`, + etc.). If contrasts are not provided for a variable, the appropriate + term type will be guessed based on the data type from the data column: + any numeric data is assumed to be continuous, and any non-numeric data + is assumed to be categorical (with `DummyCoding()` as the default contrast type). + """ + include("linpred.jl") include("lm.jl") include("glmtools.jl") diff --git a/src/glmfit.jl b/src/glmfit.jl index 1fc6bf99..6fa339df 100644 --- a/src/glmfit.jl +++ b/src/glmfit.jl @@ -464,7 +464,6 @@ function StatsBase.fit!(m::AbstractGLM, y; wts=nothing, offset=nothing, - dofit::Bool=true, verbose::Bool=false, maxiter::Integer=30, minstepfac::Real=0.001, @@ -524,14 +523,8 @@ const FIT_GLM_DOC = """ for a list of built-in links). # Keyword Arguments - - `dofit::Bool=true`: Determines whether model will be fit - - `wts::Vector=similar(y,0)`: Prior frequency (a.k.a. case) weights of observations. - Such weights are equivalent to repeating each observation a number of times equal - to its weight. Do note that this interpretation gives equal point estimates but - different standard errors from analytical (a.k.a. inverse variance) weights and - from probability (a.k.a. sampling) weights which are the default in some other - software. - Can be length 0 to indicate no weighting (default). + - `dofit::Bool=true`: Determines whether model will be fit. Only supported with `glm`. + $COMMON_FIT_KWARGS_DOCS - `offset::Vector=similar(y,0)`: offset added to `Xβ` to form `eta`. Can be of length 0 - `verbose::Bool=false`: Display convergence information for each iteration @@ -543,13 +536,6 @@ const FIT_GLM_DOC = """ - `minstepfac::Real=0.001`: Minimum line step fraction. Must be between 0 and 1. - `start::AbstractVector=nothing`: Starting values for beta. Should have the same length as the number of columns in the model matrix. - - `contrasts::AbstractDict{Symbol}=Dict{Symbol,Any}()`: a `Dict` mapping term names - (as `Symbol`s) to term types (e.g. `ContinuousTerm`) or contrasts - (e.g., `HelmertCoding()`, `SeqDiffCoding(; levels=["a", "b", "c"])`, - etc.). If contrasts are not provided for a variable, the appropriate - term type will be guessed based on the data type from the data column: - any numeric data is assumed to be continuous, and any non-numeric data - is assumed to be categorical. """ """ @@ -567,10 +553,15 @@ function fit(::Type{M}, y::AbstractVector{<:Real}, d::UnivariateDistribution, l::Link = canonicallink(d); - dofit::Bool = true, + dofit::Union{Bool, Nothing} = nothing, wts::AbstractVector{<:Real} = similar(y, 0), offset::AbstractVector{<:Real} = similar(y, 0), fitargs...) 
where {M<:AbstractGLM} + if dofit === nothing + dofit = true + else + Base.depwarn("`dofit` argument to `fit` is deprecated", :fit) + end # Check that X and y have the same number of observations if size(X, 1) != size(y, 1) @@ -596,9 +587,15 @@ function fit(::Type{M}, l::Link=canonicallink(d); offset::Union{AbstractVector, Nothing} = nothing, wts::Union{AbstractVector, Nothing} = nothing, - dofit::Bool = true, + dofit::Union{Bool, Nothing} = nothing, contrasts::AbstractDict{Symbol}=Dict{Symbol,Any}(), fitargs...) where {M<:AbstractGLM} + if dofit === nothing + dofit = true + else + Base.depwarn("`dofit` argument to `fit` is deprecated", :fit) + end + f, (y, X) = modelframe(f, data, contrasts) # Check that X and y have the same number of observations @@ -672,8 +669,8 @@ the point estimates, but do not respect natural parameter constraints """ predict(mm::AbstractGLM, newX; offset::FPVector=[], - interval::Union{Symbol,Nothing}=nothing, level::Real = 0.95, - interval_method::Symbol = :transformation) + interval::Union{Symbol,Nothing}=nothing, level::Real=0.95, + interval_method::Symbol=:transformation) Return the predicted response of model `mm` from covariate values `newX` and, optionally, an `offset`. @@ -698,8 +695,8 @@ end """ predict!(res, mm::AbstractGLM, newX::AbstractMatrix; offset::FPVector=eltype(newX)[], - interval::Union{Symbol,Nothing}=nothing, level::Real = 0.95, - interval_method::Symbol = :transformation) + interval::Union{Symbol,Nothing}=nothing, level::Real=0.95, + interval_method::Symbol=:transformation) Store in `res` the predicted response of model `mm` from covariate values `newX` and, optionally, an `offset`. `res` must be a vector with a length equal to the number diff --git a/src/linpred.jl b/src/linpred.jl index 2b75cd06..fd2cefa9 100644 --- a/src/linpred.jl +++ b/src/linpred.jl @@ -314,4 +314,4 @@ function StatsBase.predict(mm::LinPredModel, data; interval=interval, kwargs...) return (prediction=prediction, lower=lower, upper=upper) end -end \ No newline at end of file +end diff --git a/src/lm.jl b/src/lm.jl index c65e2b68..f8efede0 100644 --- a/src/lm.jl +++ b/src/lm.jl @@ -111,29 +111,16 @@ const FIT_LM_DOC = """ in columns (including if appropriate the intercept), and `y` must be a vector holding values of the dependent variable. - The keyword argument `wts` can be a `Vector` specifying frequency weights for observations. - Such weights are equivalent to repeating each observation a number of times equal - to its weight. Do note that this interpretation gives equal point estimates but - different standard errors from analytical (a.k.a. inverse variance) weights and - from probability (a.k.a. sampling) weights which are the default in some other - software. - - `dropcollinear` controls whether or not `lm` accepts a model matrix which - is less-than-full rank. If `true` (the default), only the first of each set of - linearly-dependent columns is used. The coefficient for redundant linearly dependent columns is - `0.0` and all associated statistics are set to `NaN`. - - `contrasts` may optionally be supplied in the form of a `Dict` mapping term names - (as `Symbol`s) to term types (e.g. `ContinuousTerm`) or contrasts - (e.g., `HelmertCoding()`, `SeqDiffCoding(; levels=["a", "b", "c"])`, - etc.). If contrasts are not provided for a variable, the appropriate - term type will be guessed based on the data type from the data column: - any numeric data is assumed to be continuous, and any non-numeric data - is assumed to be categorical. 
+    # Keyword Arguments
+    $COMMON_FIT_KWARGS_DOCS
+    - `dropcollinear::Bool=true` controls whether or not `lm` accepts a model matrix which
+      is less-than-full rank. If `true` (the default), only the first of each set of
+      linearly-dependent columns is used. The coefficient for redundant linearly dependent
+      columns is `0.0` and all associated statistics are set to `NaN`.
     """
 
 """
-    fit(LinearModel, formula::FormulaTerm, data, allowrankdeficient=false;
+    fit(LinearModel, formula::FormulaTerm, data;
         [wts::AbstractVector], dropcollinear::Bool=true,
         contrasts::AbstractDict{Symbol}=Dict{Symbol,Any}())
     fit(LinearModel, X::AbstractMatrix, y::AbstractVector;
         wts::AbstractVector=similar(y, 0), dropcollinear::Bool=true)
@@ -166,7 +153,7 @@ function fit(::Type{LinearModel}, f::FormulaTerm, data,
 end
 
 """
-    lm(formula, data, allowrankdeficient=false;
+    lm(formula, data;
         [wts::AbstractVector], dropcollinear::Bool=true,
         contrasts::AbstractDict{Symbol}=Dict{Symbol,Any}())
     lm(X::AbstractMatrix, y::AbstractVector;
diff --git a/test/runtests.jl b/test/runtests.jl
index 6df62364..2adbd93d 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1600,6 +1600,7 @@ end
 end
 
 @testset "contrasts argument" begin
+    # DummyCoding (default)
     m = lm(@formula(y~x), (y=1:25, x=repeat(1:5, 5)),
            contrasts=Dict(:x=>DummyCoding()))
     mcat = lm(@formula(y~x), (y=1:25, x=categorical(repeat(1:5, 5))))
@@ -1621,6 +1622,46 @@ end
            Normal(), IdentityLink())
     @test coef(m) == coef(mcat) ≈ [11, 1, 2, 3, 4]
     @test coef(gm) == coef(gmcat) ≈ [11, 1, 2, 3, 4]
+
+    # EffectsCoding
+    m = lm(@formula(y~x), (y=1:25, x=repeat(1:5, 5)),
+           contrasts=Dict(:x=>EffectsCoding()))
+    gm = glm(@formula(y~x), (y=1:25, x=repeat(1:5, 5)), Normal(), IdentityLink(),
+             contrasts=Dict(:x=>EffectsCoding()))
+    @test coef(m) ≈ coef(gm) ≈ [13, -1, 0, 1, 2]
+
+    m = fit(LinearModel, @formula(y~x), (y=1:25, x=repeat(1:5, 5)),
+            contrasts=Dict(:x=>EffectsCoding()))
+    gm = fit(GeneralizedLinearModel, @formula(y~x),
+             (y=1:25, x=repeat(1:5, 5)), Normal(), IdentityLink(),
+             contrasts=Dict(:x=>EffectsCoding()))
+    @test coef(m) ≈ coef(gm) ≈ [13, -1, 0, 1, 2]
+end
+
+
+@testset "dofit argument" begin
+    gm1 = glm(@formula(y~x), (y=1:25, x=repeat(1:5, 5)), Normal(), IdentityLink(),
+              dofit=true)
+    gm2 = glm(@formula(y~x), (y=1:25, x=repeat(1:5, 5)), Normal(), IdentityLink(),
+              dofit=false)
+    @test gm1.fit
+    @test !gm2.fit
+    fit!(gm2)
+    @test gm2.fit
+    @test coef(gm1) == coef(gm2) ≈ [10, 1]
+
+    # Deprecated
+    gm1 = fit(GeneralizedLinearModel, @formula(y~x),
+              (y=1:25, x=repeat(1:5, 5)), Normal(), IdentityLink(),
+              dofit=true)
+    gm2 = fit(GeneralizedLinearModel, @formula(y~x),
+              (y=1:25, x=repeat(1:5, 5)), Normal(), IdentityLink(),
+              dofit=false)
+    @test gm1.fit
+    @test !gm2.fit
+    fit!(gm2)
+    @test gm2.fit
+    @test coef(gm1) == coef(gm2) ≈ [10, 1]
 end
 
 @testset "formula accessor" begin

From 5fb340d2422d954a0fd6ec3181539a663be4aa33 Mon Sep 17 00:00:00 2001
From: Milan Bouchet-Valat
Date: Fri, 21 Oct 2022 09:40:57 +0200
Subject: [PATCH 12/14] More fixes

---
 docs/src/api.md      |  4 ++--
 docs/src/examples.md | 32 ++++++++++++++++++++++++--------
 docs/src/index.md    |  8 ++++++--
 src/glmfit.jl        |  2 +-
 src/linpred.jl       |  8 +++++---
 src/lm.jl            |  2 +-
 6 files changed, 39 insertions(+), 17 deletions(-)

diff --git a/docs/src/api.md b/docs/src/api.md
index 20723cf8..70d0356b 100644
--- a/docs/src/api.md
+++ b/docs/src/api.md
@@ -25,7 +25,7 @@ The most general approach to fitting a model is with the `fit` function, as in
 julia> using Random
 
 julia> fit(LinearModel, hcat(ones(10), 1:10), randn(MersenneTwister(12321), 10))
-LinearModel{GLM.LmResp{Vector{Float64}},
GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}:
+LinearModel

 Coefficients:
 ────────────────────────────────────────────────────────────────
@@ -41,7 +41,7 @@ This model can also be fit as
 julia> using Random

 julia> lm(hcat(ones(10), 1:10), randn(MersenneTwister(12321), 10))
-LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}:
+LinearModel

 Coefficients:
 ────────────────────────────────────────────────────────────────
diff --git a/docs/src/examples.md b/docs/src/examples.md
index 6ace2262..2eeb9f79 100644
--- a/docs/src/examples.md
+++ b/docs/src/examples.md
@@ -20,7 +20,9 @@ julia> data = DataFrame(X=[1,2,3], Y=[2,4,7])
    3 │     3      7

 julia> ols = lm(@formula(Y ~ X), data)
-LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}:
+LinearModel
+
+Y ~ 1 + X

 Coefficients:
 ─────────────────────────────────────────────────────────────────────────
@@ -97,7 +99,9 @@ julia> data = DataFrame(X=[1,2,2], Y=[1,0,1])
    3 │     2      1

 julia> probit = glm(@formula(Y ~ X), data, Binomial(), ProbitLink())
-GeneralizedLinearModel{GLM.GlmResp{Vector{Float64}, Binomial{Float64}, ProbitLink}, GLM.DensePredChol{Float64, LinearAlgebra.Cholesky{Float64, Matrix{Float64}}}}:
+GeneralizedLinearModel
+
+Y ~ 1 + X

 Coefficients:
 ────────────────────────────────────────────────────────────────────────
@@ -136,7 +140,9 @@ julia> quine = dataset("MASS", "quine")
 131 rows omitted

 julia> nbrmodel = glm(@formula(Days ~ Eth+Sex+Age+Lrn), quine, NegativeBinomial(2.0), LogLink())
-GeneralizedLinearModel{GLM.GlmResp{Vector{Float64}, NegativeBinomial{Float64}, LogLink}, GLM.DensePredChol{Float64, LinearAlgebra.Cholesky{Float64, Matrix{Float64}}}}:
+GeneralizedLinearModel
+
+Days ~ 1 + Eth + Sex + Age + Lrn

 Coefficients:
 ────────────────────────────────────────────────────────────────────────────
@@ -152,7 +158,9 @@ Lrn: SL       0.296768   0.185934   1.60    0.1105  -0.0676559   0.661191
 ────────────────────────────────────────────────────────────────────────────

 julia> nbrmodel = negbin(@formula(Days ~ Eth+Sex+Age+Lrn), quine, LogLink())
-GeneralizedLinearModel{GLM.GlmResp{Vector{Float64}, NegativeBinomial{Float64}, LogLink}, GLM.DensePredChol{Float64, LinearAlgebra.Cholesky{Float64, Matrix{Float64}}}}:
+GeneralizedLinearModel
+
+Days ~ 1 + Eth + Sex + Age + Lrn

 Coefficients:
 ────────────────────────────────────────────────────────────────────────────
@@ -199,7 +207,9 @@ julia> form = dataset("datasets", "Formaldehyde")
    6 │    0.9     0.782

 julia> lm1 = fit(LinearModel, @formula(OptDen ~ Carb), form)
-LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}:
+LinearModel
+
+OptDen ~ 1 + Carb

 Coefficients:
 ───────────────────────────────────────────────────────────────────────────
@@ -246,7 +256,9 @@ julia> LifeCycleSavings = dataset("datasets", "LifeCycleSavings")
 35 rows omitted

 julia> fm2 = fit(LinearModel, @formula(SR ~ Pop15 + Pop75 + DPI + DDPI), LifeCycleSavings)
-LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}:
+LinearModel
+
+SR ~ 1 + Pop15 + Pop75 + DPI + DDPI

 Coefficients:
 ─────────────────────────────────────────────────────────────────────────────────
@@ -352,7 +364,9 @@ julia> dobson = DataFrame(Counts = [18.,17,15,20,10,21,25,13,13],
    9 │    13.0        3          3

 julia> gm1 = fit(GeneralizedLinearModel, @formula(Counts ~ Outcome + Treatment), dobson, Poisson())
-GeneralizedLinearModel{GLM.GlmResp{Vector{Float64}, Poisson{Float64}, LogLink}, GLM.DensePredChol{Float64, LinearAlgebra.Cholesky{Float64, Matrix{Float64}}}}:
+GeneralizedLinearModel
+
+Counts ~ 1 + Outcome + Treatment

 Coefficients:
 ────────────────────────────────────────────────────────────────────────────
@@ -407,7 +421,9 @@ julia> round(optimal_bic.minimizer, digits = 5) # Optimal λ
 0.40935

 julia> glm(@formula(Volume ~ Height + Girth), trees, Normal(), PowerLink(optimal_bic.minimizer)) # Best model
-GeneralizedLinearModel{GLM.GlmResp{Vector{Float64}, Normal{Float64}, PowerLink}, GLM.DensePredChol{Float64, LinearAlgebra.Cholesky{Float64, Matrix{Float64}}}}:
+GeneralizedLinearModel
+
+Volume ~ 1 + Height + Girth

 Coefficients:
 ────────────────────────────────────────────────────────────────────────────
diff --git a/docs/src/index.md b/docs/src/index.md
index 4d2bb2a0..65fd104f 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -85,7 +85,9 @@ julia> data = DataFrame(y = rand(rng, 100), x = categorical(repeat([1, 2, 3, 4],

 julia> lm(@formula(y ~ x), data)
-LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}:
+LinearModel
+
+y ~ 1 + x

 Coefficients:
 ───────────────────────────────────────────────────────────────────────────
@@ -106,7 +108,9 @@ julia> using StableRNGs

 julia> data = DataFrame(y = rand(StableRNG(1), 100), x = repeat([1, 2, 3, 4], 25));

 julia> lm(@formula(y ~ x), data, contrasts = Dict(:x => DummyCoding()))
-LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}:
+LinearModel
+
+y ~ 1 + x

 Coefficients:
 ───────────────────────────────────────────────────────────────────────────
diff --git a/src/glmfit.jl b/src/glmfit.jl
index 6fa339df..1d963499 100644
--- a/src/glmfit.jl
+++ b/src/glmfit.jl
@@ -596,7 +596,7 @@ function fit(::Type{M},
         Base.depwarn("`dofit` argument to `fit` is deprecated", :fit)
     end

-    f, (y, X) = modelframe(f, data, contrasts)
+    f, (y, X) = modelframe(f, data, contrasts, M)

     # Check that X and y have the same number of observations
     if size(X, 1) != size(y, 1)
diff --git a/src/linpred.jl b/src/linpred.jl
index fd2cefa9..ed73c3ee 100644
--- a/src/linpred.jl
+++ b/src/linpred.jl
@@ -231,10 +231,12 @@ end
 stderror(x::LinPredModel) = sqrt.(diag(vcov(x)))

 function show(io::IO, obj::LinPredModel)
-    println(io, "$(typeof(obj)):\n\nCoefficients:\n", coeftable(obj))
+    println(io, nameof(typeof(obj)), '\n')
+    formula(obj) !== nothing && println(io, formula(obj), '\n')
+    println(io, "Coefficients:\n", coeftable(obj))
 end

-function modelframe(f::FormulaTerm, data, contrasts::AbstractDict)
+function modelframe(f::FormulaTerm, data, contrasts::AbstractDict, ::Type{M}) where M
     Tables.istable(data) ||
         throw(ArgumentError("expected data in a Table, got $(typeof(data))"))
     t = Tables.columntable(data)
@@ -242,7 +244,7 @@
     msg != "" && throw(ArgumentError(msg))
     data, _ = StatsModels.missing_omit(t, f)
     sch = schema(f, data, contrasts)
-    f = apply_schema(f, sch, LinPredModel)
+    f = apply_schema(f, sch, M)
     f, modelcols(f, data)
 end
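The net effect of the `show` change just above: a fitted model now prints the bare type name followed by its formula when one is stored, while models fitted from raw matrices (where the stored formula is `nothing`) simply omit the formula line. A rough sketch of the two display paths, with the coefficient tables elided and `data` standing in for any table with columns `Y` and `X`:

julia> lm(@formula(Y ~ X), data)                # formula stored, so it is printed
LinearModel

Y ~ 1 + X

Coefficients:
[...]

julia> lm(hcat(ones(3), 1:3), [2.0, 4.0, 7.0])  # matrix method stores no formula
LinearModel

Coefficients:
[...]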
diff --git a/src/lm.jl b/src/lm.jl
index f8efede0..cc7f8afd 100644
--- a/src/lm.jl
+++ b/src/lm.jl
@@ -147,7 +147,7 @@ function fit(::Type{LinearModel}, f::FormulaTerm, data,
             wts::Union{AbstractVector{<:Real}, Nothing}=nothing,
             dropcollinear::Bool=true,
             contrasts::AbstractDict{Symbol}=Dict{Symbol,Any}())
-    f, (y, X) = modelframe(f, data, contrasts)
+    f, (y, X) = modelframe(f, data, contrasts, LinearModel)
     wts === nothing && (wts = similar(y, 0))
     fit!(LinearModel(LmResp(y, wts), cholpred(X, dropcollinear), f))
 end

From 19250398c0698b74b29c730cbdea06697161b037 Mon Sep 17 00:00:00 2001
From: Milan Bouchet-Valat
Date: Fri, 21 Oct 2022 09:42:11 +0200
Subject: [PATCH 13/14] Update test/runtests.jl

Co-authored-by: Dave Kleinschmidt
---
 test/runtests.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/runtests.jl b/test/runtests.jl
index 865ae6f7..09390c16 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -853,7 +853,7 @@ end
                     upper=similar(Y, size(newX, 1))),
                    gm11, newX, interval=:confidence, interval_method=:delta) ==
           predict(gm11, newX, interval=:confidence, interval_method=:delta)
-    @test predict!((prediction=similar(Y, size(newX, 1)), 
+    @test predict!((prediction=similar(Y, size(newX, 1)),
                     lower=similar(Y, size(newX, 1)),
                     upper=similar(Y, size(newX, 1))),
                    gm11, newX, interval=:confidence, interval_method=:transformation) ==

From 5c6bf20a072aa30cf5fb648a2f64bbc42655dfc6 Mon Sep 17 00:00:00 2001
From: Milan Bouchet-Valat
Date: Sun, 20 Nov 2022 16:46:17 +0100
Subject: [PATCH 14/14] Add deprecation to access formula

---
 src/deprecated.jl |  8 +++++++-
 test/runtests.jl  | 34 ++++++++++++++++++----------------
 2 files changed, 25 insertions(+), 17 deletions(-)

diff --git a/src/deprecated.jl b/src/deprecated.jl
index 8809daea..e496a139 100644
--- a/src/deprecated.jl
+++ b/src/deprecated.jl
@@ -9,7 +9,13 @@ function Base.getproperty(mm::LinPredModel, f::Symbol)
                      "as they are no longer wrapped in a `TableRegressionModel` " *
                      "and can be used directly now", :getproperty)
         return mm
+    elseif f === :mf
+        Base.depwarn("accessing the `mf` field of GLM.jl models is deprecated, " *
+                     "as they are no longer wrapped in a `TableRegressionModel`. " *
+                     "Use `formula(m)` to access the model formula.", :getproperty)
+        form = formula(mm)
+        return ModelFrame{Nothing, typeof(mm)}(form, nothing, nothing, typeof(mm))
     else
         return getfield(mm, f)
     end
-end
\ No newline at end of file
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index afcedf90..cc860641 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -45,8 +45,9 @@ linreg(x::AbstractVecOrMat, y::AbstractVector) = qr!(simplemm(x)) \ y
     @test isa(lm2.pp.chol, CholeskyPivoted)
     @test lm2.pp.chol.piv == [2, 1]
     @test isapprox(coef(lm1), coef(lm2) .* [1., 10.])
-    # Deprecated method
+    # Deprecated methods
     @test lm1.model === lm1
+    @test lm1.mf.f == formula(lm1)
 end

 @testset "Linear Model Cook's Distance" begin
@@ -293,8 +294,9 @@ dobson = DataFrame(Counts = [18.,17,15,20,10,20,25,13,12],
     @test isapprox(bic(gm1), 57.74744128863877)
     @test isapprox(coef(gm1)[1:3],
         [3.044522437723423,-0.45425527227759555,-0.29298712468147375])
-    # Deprecated method
+    # Deprecated methods
     @test gm1.model === gm1
+    @test gm1.mf.f == formula(gm1)
 end

 ## Example from http://www.ats.ucla.edu/stat/r/dae/logit.htm
@@ -422,7 +424,7 @@ end
     gm7p = fit(GeneralizedLinearModel, @formula(round(Postwt) ~ 1 + Prewt + Treat), anorexia,
                Poisson(), LogLink(), offset=log.(anorexia.Prewt),
                rtol=1e-8)
-    @test GLM.cancancel(gm7p.model.rr)
+    @test GLM.cancancel(gm7p.rr)
     test_show(gm7p)
     @test deviance(gm7p) ≈ 39.686114742427705
     @test nulldeviance(gm7p) ≈ 54.749010639715294
@@ -439,7 +441,7 @@ end
                Poisson(), LogLink(), offset=log.(anorexia.Prewt),
                wts=repeat(1:4, outer=18),
                rtol=1e-8)
-    @test GLM.cancancel(gm7pw.model.rr)
+    @test GLM.cancancel(gm7pw.rr)
     test_show(gm7pw)
     @test deviance(gm7pw) ≈ 90.17048668870225
     @test nulldeviance(gm7pw) ≈ 139.63782826574652
@@ -732,9 +734,9 @@ end
 @testset "GLM with no intercept" begin
     # Gamma with single numeric predictor
     nointglm1 = fit(GeneralizedLinearModel, @formula(lot1 ~ 0 + u), clotting, Gamma())
-    @test !hasintercept(nointglm1.model)
-    @test !GLM.cancancel(nointglm1.model.rr)
-    @test isa(GLM.Link(nointglm1.model), InverseLink)
+    @test !hasintercept(nointglm1)
+    @test !GLM.cancancel(nointglm1.rr)
+    @test isa(GLM.Link(nointglm1), InverseLink)
     test_show(nointglm1)
     @test dof(nointglm1) == 2
     @test deviance(nointglm1) ≈ 0.6629903395245351
@@ -745,13 +747,13 @@
     @test aicc(nointglm1) ≈ 71.21377945777526
     @test bic(nointglm1) ≈ 69.6082286124477
     @test coef(nointglm1) ≈ [0.009200201253724151]
-    @test GLM.dispersion(nointglm1.model, true) ≈ 0.10198331431820506
+    @test GLM.dispersion(nointglm1, true) ≈ 0.10198331431820506
     @test stderror(nointglm1) ≈ [0.000979309363228589]

     # Bernoulli with numeric predictors
     nointglm2 = fit(GeneralizedLinearModel, @formula(admit ~ 0 + gre + gpa), admit, Bernoulli())
-    @test !hasintercept(nointglm2.model)
-    @test GLM.cancancel(nointglm2.model.rr)
+    @test !hasintercept(nointglm2)
+    @test GLM.cancancel(nointglm2.rr)
     test_show(nointglm2)
     @test dof(nointglm2) == 2
     @test deviance(nointglm2) ≈ 503.5584368354113
@@ -768,8 +770,8 @@
     nointglm3 = fit(GeneralizedLinearModel, @formula(round(Postwt) ~ 0 + Prewt + Treat), anorexia,
                     Poisson(), LogLink(); offset=log.(anorexia.Prewt),
                     wts=repeat(1:4, outer=18), rtol=1e-8, dropcollinear=false)
-    @test !hasintercept(nointglm3.model)
-    @test GLM.cancancel(nointglm3.model.rr)
+    @test !hasintercept(nointglm3)
+    @test GLM.cancancel(nointglm3.rr)
     test_show(nointglm3)
     @test deviance(nointglm3) ≈ 90.17048668870225
     @test nulldeviance(nointglm3) ≈ 159.32999067102548
@@ -1694,7 +1696,7 @@ end
     @test isnan(stderror(mdl1)[4])
     @test dof(mdl1) == dof(mdl2)
     @test dof_residual(mdl1) == dof_residual(mdl2)
-    @test GLM.dispersion(mdl1.model, true) ≈ GLM.dispersion(mdl2.model,true)
+    @test GLM.dispersion(mdl1, true) ≈ GLM.dispersion(mdl2, true)
     @test deviance(mdl1) ≈ deviance(mdl2)
     @test loglikelihood(mdl1) ≈ loglikelihood(mdl2)
     @test aic(mdl1) ≈ aic(mdl2)
@@ -1709,7 +1711,7 @@
     @test stderror(mdl1) ≈ stderror(mdl2)
     @test dof(mdl1) == dof(mdl2)
     @test dof_residual(mdl1) == dof_residual(mdl2)
-    @test GLM.dispersion(mdl1.model, true) ≈ GLM.dispersion(mdl2.model,true)
+    @test GLM.dispersion(mdl1, true) ≈ GLM.dispersion(mdl2, true)
     @test deviance(mdl1) ≈ deviance(mdl2)
     @test loglikelihood(mdl1) ≈ loglikelihood(mdl2)
     @test aic(mdl1) ≈ aic(mdl2)
@@ -1723,7 +1725,7 @@
     @test stderror(mdl)[1:3] ≈ [0.58371400875263, 0.10681694901238, 0.08531532203251]
     @test dof(mdl) == 4
     @test dof_residual(mdl) == 2
-    @test GLM.dispersion(mdl.model, true) ≈ 0.1341642228738996
+    @test GLM.dispersion(mdl, true) ≈ 0.1341642228738996
     @test deviance(mdl) ≈ 0.2683284457477991
     @test loglikelihood(mdl) ≈ 0.2177608775670037
     @test aic(mdl) ≈ 7.564478244866
@@ -1750,7 +1752,7 @@
     @test dof(mdl) == 3
     @test dof_residual(mdl) == 98
     @test aic(mdl) ≈ 141.68506068159
-    @test GLM.dispersion(mdl.model, true) ≈ 1
+    @test GLM.dispersion(mdl, true) ≈ 1
     @test predict(mdl)[1:3] ≈ [0.4241893070433117, 0.3754516361306202, 0.6327877688720133] atol = 1.0E-6
     @test confint(mdl)[1:2,1:2] ≈ [-0.5493329715011036 0.26350316142056085; -0.3582545657827583 0.64313795309765587] atol = 1.0E-1
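Taken together, patches 13 and 14 keep `TableRegressionModel`-era code running while steering users toward the new accessors. A minimal sketch of the migration path (assuming `m` is any model fitted from a formula and `data` is a placeholder table with columns `Y` and `X`; deprecation warnings elided):

julia> m = lm(@formula(Y ~ X), data);

julia> m.model === m           # deprecated: warns, then returns the model itself
true

julia> m.mf.f == formula(m)    # deprecated: `mf` now returns a stub ModelFrame
true

julia> formula(m)              # the supported accessor going forward
Y ~ 1 + X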