diff --git a/src/mca.jl b/src/mca.jl new file mode 100644 index 0000000..a997709 --- /dev/null +++ b/src/mca.jl @@ -0,0 +1,584 @@ +# Correspondence Analysis and Multiple Correspondence Analysis + +#== +References: +https://personal.utdallas.edu/~herve/Abdi-MCA2007-pretty.pdf +https://www.stata.com/manuals/mvmca.pdf +https://www.stata.com/manuals/mvca.pdf +https://en.wikipedia.org/wiki/Multiple_correspondence_analysis +https://pca4ds.github.io +https://maths.cnam.fr/IMG/pdf/ClassMCA_cle825cfc.pdf +==# + +""" +Correspondence Analysis +""" +mutable struct CA{T<:Real} <: LinearDimensionalityReduction + + # The data matrix + X::Matrix{T} + + # The residuals + R::Matrix{T} + + # Row masses + rm::Vector{T} + + # Column masses + cm::Vector{T} + + # The standardized residuals + SR::Matrix{T} + + # Object coordinates in standard coordinates + FS::Matrix{T} + + # Variable coordinates in standard coordinates + GS::Matrix{T} + + # Inertia (eigenvalues of the indicator matrix) + I::Vector{T} + + # Standard normalization or principal normalization + normalize::String + + # Use either the Burt method or the indicator method + method::String +end + +function show(io::IO, ca::CA) + nobs, nvar = size(ca.X) + print( + io, + "CA(nobs = $nobs, nvar = $nvar, method = $(ca.method), normalize = $(ca.normalize))", + ) +end + +function print_inertia(io, I) + xx = hcat(I, I, I) + xx[:, 2] = xx[:, 1] ./ sum(xx[:, 1]) + xx[:, 3] = cumsum(xx[:, 2]) + ii = findfirst(xx[:, 3] .>= 0.999) + xx = xx[1:ii, :] + vn = ["Inertia", "Prop inertia", "Cumulative inertia"] + cft = CoefTable(xx, vn, string.("", 1:ii)) + println(io, cft) +end + +function build_coeftable(ca, vc, stats, rn) + cn = ["Mass"] + xx = [ca.cm] + d = size(ca.GS, 2) + for j = 1:d + push!(cn, "Coord-$(j)") + push!(xx, vc[:, j]) + push!(cn, "SqCorr-$(j)") + push!(xx, stats.sqcorr_col[:, j]) + push!(cn, "RelContrib-$(j)") + push!(xx, stats.relcontrib_col[:, j]) + end + xx = hcat(xx...) + cft = CoefTable(xx, cn, rn) + return cft +end + +function show(io::IO, ::MIME"text/plain", ca::CA) + nobs, nvar = size(ca.X) + stats = ca_stats(ca) + + println( + io, + "CA(nobs = $nobs, nvar = $nvar, method = $(ca.method), normalize = $(ca.normalize))", + ) + print_inertia(io, ca.I) + vc = variable_coords(ca) + m, d = size(ca.GS) + println(io, "\nVariable coordinates:") + rn = string.("", 1:m) + cft = build_coeftable(ca, vc, stats, rn) + print(io, cft) + print(io, "\n\n") +end + +# Squared correlation of each variable with each component. +function sqcorr(ca::CA, axis) + if axis == :row + z = ca.rm ./ sum(abs2, ca.SR; dims = 2)[:] + p = length(ca.rm) + d = size(ca.GS, 2) + if ca.method == "burt" + # Not clear how to compute this so report zero for now + return zeros(p, d) + end + f = z * ones(d)' + return object_coords(ca; normalize = "principal") .^ 2 .* f + elseif axis == :col + z = ca.cm ./ sum(abs2, ca.SR; dims = 1)[:] + p = length(ca.cm) + d = size(ca.GS, 2) + f = z * ones(d)' + if ca.method == "burt" + # Not clear how to compute this so report zero for now + return zeros(p, d) + end + return variable_coords(ca; normalize = "principal") .^ 2 .* f + else + error("Unknown axis '$(axis)'") + end +end + +# Relative contributions of each variable to each component. +function relcontrib(ca::CA, axis) + e = ca.method == "indicator" ? 
0.5 : 1
+    if axis == :row
+        d = size(ca.GS, 2)
+        z = ca.rm * (ca.I[1:d] .^ -e)'
+        return object_coords(ca; normalize = "principal") .^ 2 .* z
+    elseif axis == :col
+        d = size(ca.GS, 2)
+        z = ca.cm * (ca.I[1:d] .^ -e)'
+        return variable_coords(ca; normalize = "principal") .^ 2 .* z
+    else
+        error("Unknown axis '$(axis)'")
+    end
+end
+
+# Return several fit statistics.
+function ca_stats(ca::CA)
+    cr = sqcorr(ca, :col)
+    rc = relcontrib(ca, :col)
+    return (sqcorr_col = cr, relcontrib_col = rc)
+end
+
+# Multiply corresponding columns of Q by -1 as needed
+# to make the first non-zero value in each column of Q
+# positive.  This is the method used by Stata to identify
+# the coefficients.
+function orient(P, Q, e = 1e-12)
+    for j = 1:size(Q, 2)
+        i = findfirst(abs.(Q[:, j]) .>= e)
+        if Q[i, j] < 0
+            Q[:, j] .*= -1
+            P[:, j] .*= -1
+        end
+    end
+    return P, Q
+end
+
+function mca_check_args(normalize, method)
+    if !(normalize in ["standard", "principal"])
+        error("normalize = '$normalize' should be 'standard' or 'principal'")
+    end
+    if !(method in ["indicator", "burt"])
+        error("method = '$method' should be 'indicator' or 'burt'")
+    end
+end
+
+"""
+    fit(CA, X; ...)
+
+Perform a Correspondence Analysis using the data in `X`.
+
+**Keyword Arguments**
+
+- `d`: The number of dimensions to retain.
+- `normalize`: The coordinate normalization method, either 'standard' or 'principal'.
+- `method`: The analysis method, either 'indicator' or 'burt'.
+
+**Notes:**
+
+- The matrix `X` should contain numerical data for which it makes
+  sense to use chi^2 distance to compare rows.  Most commonly this
+  is an indicator matrix in which each row contains a single 1 with
+  the remaining values all being 0.
+- See `MCA` for a more general analysis that takes a data frame
+  with multiple nominal columns as input and performs a CA on
+  its indicator matrix.
+"""
+function fit(
+    ::Type{CA},
+    X::AbstractMatrix{T};
+    d::Int = 5,
+    normalize = "standard",
+    method = "indicator",
+) where {T}
+
+    mca_check_args(normalize, method)
+
+    # Convert to proportions
+    X = X ./ sum(X)
+
+    # Calculate row and column margins
+    rm = sum(X, dims = 2)[:]
+    cm = sum(X, dims = 1)[:]
+
+    # Center the data matrix to create residuals
+    R = X - rm * cm'
+
+    # Standardize the data matrix to create standardized residuals
+    Wr = Diagonal(sqrt.(rm))
+    Wc = Diagonal(sqrt.(cm))
+    SR = Wr \ R / Wc
+
+    # Factor the standardized residual matrix
+    P, D, Q = svd(SR)
+
+    # Warn if there are repeated non-zero eigenvalues, since the
+    # corresponding axes are not uniquely identified.
+    di = diff(D[D.>1e-10])
+    if !isempty(di) && maximum(di) >= -1e-10
+        @warn("The indicator matrix has repeated non-zero eigenvalues")
+    end
+
+    # Get the eigenvalues
+    I = method == "burt" ? D .^ 4 : D .^ 2
+
+    # Reduce to the selected dimension
+    P = P[:, 1:d]
+    D = D[1:d]
+    Q = Q[:, 1:d]
+
+    # Flip the loading vectors to a standard orientation.
+    P, Q = orient(P, Q)
+
+    # Get the object scores (F) and variable scores (G).  These are the
+    # standard coordinates.
+    FS = Wr \ P
+    GS = Wc \ Q
+
+    ca = CA(X, R, rm, cm, SR, FS, GS, I, normalize, method)
+    return ca
+end
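+
+#==
+Example (illustrative sketch only; the counts below are made-up data, not part
+of the package): a correspondence analysis of a small contingency table, using
+the functions defined in this file.
+
+    X = [20.0 10.0  5.0;
+          5.0 15.0 10.0;
+         10.0  5.0 20.0]
+
+    ca = fit(CA, X; d = 2, normalize = "principal")
+    object_coords(ca)    # row (object) coordinates
+    variable_coords(ca)  # column (variable) coordinates
+    inertia(ca)          # principal inertias (eigenvalues)
+==#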
+
+# Calculate the standard coordinates of any qualitative passive variables.
+function quali_passive(ca::CA, passive; normalize = "principal")
+
+    if size(passive, 1) == 0
+        return
+    end
+
+    if length(size(passive)) != 2
+        error("passive variable array must be two-dimensional")
+    end
+
+    #(; X, GS) = ca
+    X = ca.X
+    GS = ca.GS
+
+    PX = Matrix(passive)
+
+    if size(PX, 1) != size(X, 1)
+        error("Passive data must have same leading axis length as active data.")
+    end
+
+    M = hcat(X, PX)
+    B = M' * M
+    B ./= sum(B)
+    p = size(X, 2)
+    B = B[p+1:end, 1:p]
+
+    PGS = B * GS
+    for k = 1:size(B, 1)
+        PGS[k, :] ./= sum(B[k, :])
+    end
+    d = size(PGS, 2)
+    if ca.method == "burt"
+        PGS = PGS * Diagonal(1 ./ sqrt.(ca.I[1:d]))
+    elseif ca.method == "indicator"
+        PGS = PGS * Diagonal(1 ./ ca.I[1:d])
+    else
+        error("Unknown method '$(ca.method)'")
+    end
+
+    if normalize == "standard"
+        return PGS
+    elseif normalize == "principal"
+        return PGS * Diagonal(sqrt.(ca.I[1:d]))
+    else
+        error("Unknown normalization '$(normalize)'")
+    end
+end
+
+function object_coords(ca::CA; normalize = ca.normalize)
+    if normalize == "standard"
+        return ca.FS
+    elseif normalize == "principal"
+        d = size(ca.FS, 2)
+        return ca.FS * Diagonal(sqrt.(ca.I[1:d]))
+    else
+        error("Unknown normalization '$(normalize)'")
+    end
+end
+
+inertia(ca::CA) = ca.I
+
+function variable_coords(ca::CA; normalize = ca.normalize)
+    #(; GS) = ca
+    GS = ca.GS
+
+    d = size(GS, 2)
+    if normalize == "standard"
+        return GS
+    elseif normalize == "principal"
+        return GS * Diagonal(sqrt.(ca.I[1:d]))
+    else
+        error("Unknown normalization '$(normalize)'")
+    end
+end
+
+"""
+Multiple Correspondence Analysis
+"""
+mutable struct MCA{T<:Real} <: LinearDimensionalityReduction
+
+    # The underlying correspondence analysis
+    C::CA{T}
+
+    # Indicator matrix
+    Inds::Matrix{Float64}
+
+    # Variable names
+    vnames::Vector{String}
+
+    # Variable+level names
+    names::NamedTuple
+
+    # Map values to integer positions
+    rd::Vector{Dict}
+
+    # Map integer positions to values
+    dr::Vector{Vector}
+
+    # Number of nominal variables
+    K::Int
+
+    # Total number of categories in all variables
+    J::Int
+
+    # Eigenvalues
+    unadjusted_eigs::Vector{Float64}
+    benzecri_eigs::Vector{Float64}
+    greenacre_eigs::Vector{Float64}
+end
+
+function expand_names(vnames, dr)
+    names, levels = [], []
+    for (j, v) in enumerate(vnames)
+        u = dr[j]
+        for k in eachindex(u)
+            push!(names, v)
+            push!(levels, u[k])
+        end
+    end
+    return (Variable = names, Level = levels)
+end
+
+object_coords(mca::MCA; normalize = "principal") =
+    object_coords(mca.C, normalize = normalize)
+
+function variable_coords(mca::MCA; normalize = "principal")
+    #(; C, vnames, dr) = mca
+    C = mca.C
+    vnames = mca.vnames
+    dr = mca.dr
+
+    na = expand_names(vnames, dr)
+    G = variable_coords(C, normalize = normalize)
+    return (Variable = na.Variable, Level = na.Level, Coord = G)
+end
+
+function show(io::IO, mca::MCA)
+    nobs, ninds = size(mca.C.X)
+    nvar = length(mca.vnames)
+    print(io, "MCA(nobs = $nobs, nvar = $nvar, ninds = $ninds)")
+end
+
+function show(io::IO, ::MIME"text/plain", mca::MCA)
+    nobs, ninds = size(mca.C.X)
+    stats = ca_stats(mca.C)
+    nvar = length(mca.vnames)
+    println(io, "MCA(nobs = $nobs, nvar = $nvar, ninds = $ninds)")
+    print_inertia(io, mca.C.I)
+    vc = variable_coords(mca.C; normalize = "principal")
+    en = expand_names(mca.vnames, mca.dr)
+    rn = ["$(a) $(b)" for (a, b) in zip(en.Variable, en.Level)]
+    cft = build_coeftable(mca.C, vc, stats, rn)
+    println(io, "\nVariable coordinates (principal normalization):")
+    print(io, cft)
+    print(io, "\n\n")
+end
+
+function ca_stats(mca::MCA)
+    return ca_stats(mca.C)
+end
+
+# Calculate the eigenvalues with different debiasings.  Here `K` is the
+# number of nominal variables and `J` is the total number of categories.
+function get_eigs(I, K, J)
+    ben = zeros(length(I))
+    Ki = 1 / K
+    f = K / (K - 1)
+
+    # Benzecri adjustment of the eigenvalues
+    for i in eachindex(I)
+        if I[i] > Ki
+            ben[i] = (f * (I[i] - Ki))^2
+        end
+    end
+
+    unadjusted = I ./ sum(I)
+
+    # Greenacre's estimate of the average off-diagonal inertia
+    gt = f * (sum(abs2, I) - (J - K) / K^2)
+
+    return unadjusted, ben ./ sum(ben), ben ./ gt
+end
+
+"""
+    fit(MCA, X; ...)
+
+Fit a multiple correspondence analysis using the columns of `X` as the variables.
+
+**Keyword Arguments**
+
+- `d`: The number of dimensions to retain.
+- `normalize`: The coordinate normalization method, either 'standard' or 'principal'.
+- `method`: The analysis method, either 'indicator' or 'burt'.
+- `vnames`: Variable names.  If `X` is a data frame, its column names are the
+  default variable names; any names provided here take precedence.
+
+**Notes:**
+
+- Missing values are recoded as fractional indicators, i.e. if a variable has k
+  observed levels, a missing value is coded as 1/k in each of its k indicator
+  columns.
+"""
+function fit(
+    ::Type{MCA},
+    X;
+    d::Int = 5,
+    normalize = "standard",
+    method = "indicator",
+    vnames = [],
+)
+
+    if length(vnames) == 0 && typeof(X) <: AbstractDataFrame
+        vnames = names(X)
+    elseif length(vnames) == 0
+        vnames = ["v$j" for j = 1:size(X, 2)]
+    end
+
+    # Get the indicator matrix
+    XI, rd, dr = make_indicators(X, "active")
+
+    # Create the underlying correspondence analysis value
+    C = fit(CA, XI; d = d, normalize = normalize, method = method)
+
+    # Number of nominal variables
+    K = size(X, 2)
+
+    # Total number of categories in all variables
+    J = size(XI, 2)
+
+    # Unadjusted, Benzecri, and Greenacre eigenvalue proportions
+    una, ben, gra = get_eigs(C.I, K, J)
+
+    na = expand_names(vnames, dr)
+
+    mca = MCA(C, XI, vnames, na, rd, dr, K, J, una, ben, gra)
+
+    return mca
+end
+
+function quali_passive(mca::MCA, passive; normalize = "principal")
+    #(; C) = mca
+    C = mca.C
+    if size(passive, 1) != size(C.X, 1)
+        error("Wrong number of rows in passive data array")
+    end
+
+    PI, _, drp = make_indicators(passive, "passive")
+    r = quali_passive(C, PI; normalize = normalize)
+
+    vnames = if typeof(passive) <: AbstractDataFrame
+        names(passive)
+    else
+        m = length(drp)
+        string.("p", 1:m)
+    end
+
+    v = expand_names(vnames, drp)
+    return (Variable = v.Variable, Level = v.Level, Coord = r)
+end
+
+# Create an indicator matrix corresponding to the distinct
+# values in the vector 'z'.  Also returns a dictionary mapping
+# the unique values to column offsets, and a vector mapping the
+# column offsets to the unique values.
+function make_single_indicator(z::AbstractVector, vtype::String, pos::Int)
+
+    n = length(z)
+
+    # Unique values of the variable
+    uq = sort(unique(z))
+
+    # This situation usually results from user error so warn.
+    if length(uq) > 20
+        @warn("$(titlecase(vtype)) variable in column $(pos) has more than 20 levels")
+    end
+
+    # Recoding dictionary, maps each distinct value in z to
+    # an offset
+    T = eltype(z)
+    rd = Dict{T,Int}()
+    rdi = []
+    for (j, v) in enumerate(uq)
+        if !ismissing(v)
+            rd[v] = j
+            push!(rdi, v)
+        end
+    end
+
+    # Number of unique values of the variable excluding missing
+    m = length(rd)
+
+    # The indicator matrix
+    X = zeros(n, m)
+    for (i, v) in enumerate(z)
+        if ismissing(v)
+            # Missing values are treated as uniform across the levels.
+            X[i, :] .= 1 / m
+        else
+            X[i, rd[v]] = 1
+        end
+    end
+
+    return X, rd, rdi
+end
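+
+#==
+Illustrative sketch of the indicator coding performed by `make_single_indicator`
+(the data vector below is hypothetical; it only documents the missing-value
+convention described in the `fit(MCA, ...)` docstring):
+
+    z = ["A", "B", missing, "A"]
+    X, rd, rdi = make_single_indicator(z, "active", 1)
+
+    # X == [1.0  0.0
+    #       0.0  1.0
+    #       0.5  0.5     <- missing is spread uniformly over the k = 2 levels
+    #       1.0  0.0]
+    # rd maps "A" => 1 and "B" => 2; rdi == ["A", "B"]
+==#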
+
+# Create an indicator matrix for the nominal data matrix Z.
+# In addition to the indicator matrix, return vectors of
+# dictionaries mapping levels to positions and positions
+# to levels for each variable.
+function make_indicators(Z, vtype::String)
+
+    if size(Z, 1) == 0
+        return zeros(0, 0), Dict[], Vector[]
+    end
+
+    rd, rdi = Dict[], Vector[]
+    XX = []
+    for j = 1:size(Z, 2)
+        X, dv, di = make_single_indicator(Z[:, j], vtype, j)
+        push!(rd, dv)
+        push!(rdi, di)
+        push!(XX, X)
+    end
+    I = hcat(XX...)
+
+    return I, rd, rdi
+end
+
+# Return a table summarizing the inertia.
+function inertia(mca::MCA)
+    inr = (
+        Raw = mca.C.I,
+        Unadjusted = mca.unadjusted_eigs,
+        Benzecri = mca.benzecri_eigs,
+        Greenacre = mca.greenacre_eigs,
+    )
+    return inr
+end
diff --git a/test/mca.jl b/test/mca.jl
new file mode 100644
index 0000000..196fe5b
--- /dev/null
+++ b/test/mca.jl
@@ -0,0 +1,201 @@
+using MultivariateStats, DataFrames, Test
+
+function mca_simple_data()
+    # Active variables
+    da = DataFrame(
+        :V1 => ["A", "A", "A", "A", "B", "B", "B", "B"],
+        :V2 => ["X", "X", "Y", "X", "X", "Y", "X", "X"],
+        :V3 => ["D", "D", "D", "C", "D", "C", "D", "C"],
+    )
+
+    # Passive variables
+    dp = DataFrame(
+        :V4 => [1, 2, 2, 2, 1, 1, 2, 1],
+        :V5 => [2, 1, 2, 2, 1, 2, 1, 2],
+    )
+
+    return da, dp
+end
+
+# These values come from Stata and/or R FactoMineR.
+function mca_simple_burt_correct()
+
+    # Eigenvalues
+    eig = [0.18724152, 0.11111111, 0.05473379]
+
+    # Variable coordinates in standard normalization
+    c1 = [1.0606602, -1.0606602, 0.35355339, -1.0606602, -1.5811388, 0.9486833]
+    c2 = [0.8660254, -0.8660254, -0.8660254, 2.5980762, 0, 0]
+    variable_standard = hcat(c1, c2)
+
+    # Variable coordinates in principal normalization
+    c1 = [0.45896265, -0.45896265, 0.15298755, -0.45896265, -0.68418112, 0.41050867]
+    c2 = [0.28867513, -0.28867513, -0.28867513, 0.8660254, 0, 0]
+    variable_principal = hcat(c1, c2)
+
+    # Passive variable coordinates in standard normalization
+    c1 = [-0.65213019, 0.65213019, 0.73080064, -0.43848039]
+    c2 = [-0.4330127, 0.4330127, -1.1547005, 0.69282032]
+    passive_standard = hcat(c1, c2)
+
+    # Passive variable coordinates in principal normalization
+    c1 = [-0.28218595, 0.28218595, 0.31622777, -0.18973666]
+    c2 = [-0.14433757, 0.14433757, -0.38490018, 0.23094011]
+    passive_principal = hcat(c1, c2)
+
+    # Relative column contributions
+    c1 = [0.187, 0.187, 0.031, 0.094, 0.312, 0.188]
+    c2 = [0.125, 0.125, 0.188, 0.562, 0, 0]
+    contrib = hcat(c1, c2)
+
+    return (eig = eig,
+            variable_principal = variable_principal,
+            variable_standard = variable_standard,
+            passive_principal = passive_principal,
+            passive_standard = passive_standard,
+            contrib = contrib)
+end
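+
+# Note: the Burt-method eigenvalues above are the squares of the indicator-method
+# eigenvalues below (e.g. 0.4327141^2 ≈ 0.18724152), since the Burt matrix is the
+# cross-product of the indicator matrix.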
+
+# These values come from Stata and/or R FactoMineR.
+function mca_simple_indicator_correct()
+
+    # Eigenvalues in principal normalization
+    eig = [0.4327141, 0.3333333, 0.2339525]
+
+    # Object coordinates in principal normalization
+    c1 = [
+        7.876323e-01,
+        7.876323e-01,
+        0.3162278,
+        -5.564176e-02,
+        0.08052551,
+        -1.2341531,
+        0.08052551,
+        -0.7627485,
+    ]
+    c2 = [0, 0, 1.1547005, 0, -0.57735027, 0.5773503, -0.57735027, -0.5773503]
+    object_principal = hcat(c1, c2)
+
+    # Variable coordinates in principal normalization
+    c1 = [0.6977130, -0.6977130, 0.232571, -0.6977130, -1.040089e+00, 6.240535e-01]
+    c2 = [0.5000000, -0.5000000, -0.500000, 1.5000000, 0, 0]
+    variable_principal = hcat(c1, c2)
+
+    # Variable coordinates in standard normalization
+    c1 = [1.0606602, -1.0606602, 0.35355339, -1.0606602, -1.5811388, 0.9486833]
+    c2 = [0.8660254, -0.8660254, -0.8660254, 2.5980762, 0, 0]
+    variable_standard = hcat(c1, c2)
+
+    # Passive variables in standard normalization
+    c1 = [-0.65213019, 0.65213019, 0.73080064, -0.43848039]
+    c2 = [-0.4330127, 0.4330127, -1.1547005, 0.69282032]
+    passive_standard = hcat(c1, c2)
+
+    # Passive variables in principal normalization
+    c1 = [-0.42897783, 0.42897783, 0.48072805, -0.28843683]
+    c2 = [-0.25, 0.25, -0.66666667, 0.4]
+    passive_principal = hcat(c1, c2)
+
+    # Squared column correlations
+    c1 = [0.487, 0.487, 0.162, 0.162, 0.649, 0.649]
+    c2 = [0.250, 0.250, 0.750, 0.750, 0, 0]
+    sqcorr = hcat(c1, c2)
+
+    # Relative column contributions
+    c1 = [0.123, 0.123, 0.021, 0.062, 0.206, 0.123]
+    c2 = [0.072, 0.072, 0.108, 0.325, 0, 0]
+    contrib = hcat(c1, c2)
+
+    return (eig = eig, object_principal = object_principal,
+            variable_principal = variable_principal,
+            variable_standard = variable_standard,
+            passive_standard = passive_standard,
+            passive_principal = passive_principal,
+            sqcorr = sqcorr, contrib = contrib)
+end
+
+@testset "Compare MCA to R FactoMineR and Stata using Burt method" begin
+
+    da, dp = mca_simple_data()
+    xc = mca_simple_burt_correct()
+
+    mp = fit(MCA, da; d=2, normalize = "principal", method = "burt")
+    ms = fit(MCA, da; d=2, normalize = "standard", method = "burt")
+
+    GPP = quali_passive(mp, dp; normalize = "principal")
+    GPS = quali_passive(ms, dp; normalize = "standard")
+
+    show(devnull, MIME("text/plain"), mp)
+    show(devnull, MIME("text/plain"), ms)
+
+    # Eigenvalues
+    eig = inertia(mp).Raw[1:3]
+    @test isapprox(eig, xc.eig, rtol = 1e-6, atol = 1e-6)
+
+    # Variable coordinates in principal normalization
+    GP = variable_coords(mp.C)
+    @test isapprox(GP, xc.variable_principal, atol = 1e-6, rtol = 1e-6)
+
+    # Variable coordinates in standard normalization
+    GS = variable_coords(ms.C)
+    @test isapprox(GS, xc.variable_standard, atol = 1e-6, rtol = 1e-6)
+
+    # Passive variable coordinates in principal normalization
+    @test isapprox(GPP.Coord, xc.passive_principal, atol = 1e-6, rtol = 1e-6)
+
+    # Passive variable coordinates in standard normalization
+    @test isapprox(GPS.Coord, xc.passive_standard, atol = 1e-6, rtol = 1e-6)
+
+    # Fit statistics
+    for m in [mp, ms]
+        stat = ca_stats(m.C)
+        @test isapprox(stat.relcontrib_col, xc.contrib, rtol = 1e-2, atol = 1e-2)
+    end
+end
+
+
+@testset "Compare MCA to R FactoMineR and Stata using indicator method" begin
+
+    da, dp = mca_simple_data()
+    xc = mca_simple_indicator_correct()
+
+    mp = fit(MCA, da; d=2, normalize = "principal", method = "indicator")
+    ms = fit(MCA, da; d=2, normalize = "standard", method = "indicator")
+
+    GPP = quali_passive(mp, dp, normalize = "principal")
"principal") + GPS = quali_passive(ms, dp, normalize = "standard") + + show(devnull, MIME("text/plain"), mp) + show(devnull, MIME("text/plain"), ms) + + # Eigenvalues + eig = inertia(mp).Raw[1:3] + @test isapprox(eig, xc.eig, rtol = 1e-6, atol = 1e-6) + + # Variable coordinates in principal normalization + G = variable_coords(mp.C; normalize = "principal") + @test isapprox(G, xc.variable_principal, atol = 1e-6, rtol = 1e-6) + + # Variable coordinates in standard normalization + GS = variable_coords(ms.C, normalize = "standard") + @test isapprox(GS, xc.variable_standard, atol = 1e-6, rtol = 1e-6) + + # Object coordinates in principal normalization + F = object_coords(mp; normalize = "principal") + @test isapprox(F, xc.object_principal, atol = 1e-6, rtol = 1e-6) + + # Coordinates of passive variables in principal coordinates + @test isapprox(GPP.Coord, xc.passive_principal, atol = 1e-6, rtol = 1e-6) + + # Coordinates of passive variables in standard coordinates + @test isapprox(GPS.Coord, xc.passive_standard, atol = 1e-6, rtol = 1e-6) + + # Fit statistics + for m in [mp, ms] + stat = ca_stats(m.C) + @test isapprox(stat.sqcorr_col, xc.sqcorr, atol = 1e-3, atol = 1e-3) + @test isapprox(stat.relcontrib_col, xc.contrib, atol = 1e-2, atol = 1e-2) + end +end diff --git a/test/runtests.jl b/test/runtests.jl index a551076..9798730 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -8,7 +8,8 @@ tests = ["lreg", "ica", "ppca", "kpca", - "fa" + "fa", + "mca", ] for test in tests