From c04fc79a6ed1ca5814963319c678c37351bb912f Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Mon, 5 Sep 2022 23:57:43 -0400 Subject: [PATCH 01/11] Add file --- src/MultivariateStats.jl | 10 +- src/mca.jl | 274 +++++++++++++++++++++++++++++++++++++++ test/mca.jl | 51 ++++++++ 3 files changed, 334 insertions(+), 1 deletion(-) create mode 100644 src/mca.jl create mode 100644 test/mca.jl diff --git a/src/MultivariateStats.jl b/src/MultivariateStats.jl index 13d33f6..1afc7e9 100644 --- a/src/MultivariateStats.jl +++ b/src/MultivariateStats.jl @@ -110,7 +110,14 @@ module MultivariateStats FactorAnalysis, # Type: the Factor Analysis model faem, # EM algorithm for factor analysis - facm # CM algorithm for factor analysis + facm, # CM algorithm for factor analysis + + ## CA, MCA + CA, + MCA, + objectscores, + variablescores, + inertia ## source files include("types.jl") @@ -126,6 +133,7 @@ module MultivariateStats include("lda.jl") include("ica.jl") include("fa.jl") + include("mca.jl") ## deprecations @deprecate indim(f) size(f,1) diff --git a/src/mca.jl b/src/mca.jl new file mode 100644 index 0000000..d9cdd4b --- /dev/null +++ b/src/mca.jl @@ -0,0 +1,274 @@ +# Correspondence Analysis and Multiple Correspondence Analysis + +using Printf, PyPlot + +#== +References: +https://personal.utdallas.edu/~herve/Abdi-MCA2007-pretty.pdf +https://www.stata.com/manuals/mvmca.pdf +https://www.stata.com/manuals/mvca.pdf +https://en.wikipedia.org/wiki/Multiple_correspondence_analysis +https://pca4ds.github.io +https://maths.cnam.fr/IMG/pdf/ClassMCA_cle825cfc.pdf +==# + +""" +Correspondence Analysis +""" +struct CA{T<:Real} <: LinearDimensionalityReduction + + # The data matrix + Z::Array{T} + + # The residuals + R::Array{T} + + # Row and column masses (means) + rm::Vector{T} + cm::Vector{T} + + # The standardized residuals + SR::Array{T} + + # Object scores + F::Array{T} + + # Variable scores + G::Array{T} + + # Inertia (eigenvalues of the indicator matrix) + I::Vector{T} +end + +function CA(X, d::Int) + + # Convert to proportions + X = X ./ sum(X) + + # Calculate row and column means + r = sum(X, dims = 2)[:] + c = sum(X, dims = 1)[:] + + # Center the data matrix to create residuals + R = X - r * c' + + # Standardize the data matrix to create standardized residuals + Wr = Diagonal(sqrt.(r)) + Wc = Diagonal(sqrt.(c)) + SR = Wr \ R / Wc + + # Get the object factor scores (F) and variable factor scores (G). + P, D, Q = svd(SR) + Dq = Diagonal(D)[:, 1:d] + F = Wr \ P * Dq + G = Wc \ Q * Dq + I = D .^ 2 + + return CA(X, R, r, c, SR, F, G, I) +end + +objectscores(ca::CA) = ca.F +variablescores(ca::CA) = ca.G +inertia(ca::CA) = ca.I + +""" +Multiple Correspondence Analysis +""" +struct MCA{T<:Real} <: LinearDimensionalityReduction + + # The underlying corresponence analysis + C::CA{T} + + # Variable names + vnames::Vector{String} + + # Map values to integer positions + rd::Vector{Dict} + + # Map integer positions to values + dr::Vector{Dict} + + # Split the variable scores into separate arrays for + # each variable. + Gv::Vector{Matrix{T}} + + # Eigenvalues + unadjusted_eigs::Vector{Float64} + benzecri_eigs::Vector{Float64} + greenacre_eigs::Vector{Float64} +end + +objectscores(mca::MCA) = mca.C.F +variablescores(mca::MCA) = mca.Gv + +# Split the variable scores to a separate array for each +# variable. 
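+# A minimal sketch of the intended behavior (hypothetical shapes): with two
+# variables having 2 and 3 levels, rd holds level maps of sizes 2 and 3, G
+# has 5 rows, and xsplit(G, rd) returns [G[1:2, :], G[3:5, :]].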
+function xsplit(G, rd) + K = [length(di) for di in rd] + Js = cumsum(K) + Js = vcat(1, 1 .+ Js) + Gv = Vector{Matrix{eltype(G)}}() + for j in eachindex(K) + g = G[Js[j]:Js[j+1]-1, :] + push!(Gv, g) + end + return Gv +end + +# Calculate the eigenvalues with different debiasings. +function get_eigs(I, K, J) + ben = zeros(length(I)) + gra = zeros(length(I)) + Ki = 1 / K + f = K / (K - 1) + for i in eachindex(I) + if I[i] > Ki + ben[i] = (f * (I[i] - Ki))^2 + end + end + + unadjusted = I ./ sum(I) + gt = f * (sum(abs2, I) - (J - K) / K^2) + + return unadjusted, ben ./ sum(ben), ben ./ gt +end + +function MCA(Z, d::Int; vnames = []) + + if length(vnames) == 0 + vnames = ["v$(j)" for j = 1:size(Z, 2)] + end + + # Get the indicator matrix + X, rd, dr = make_indicators(Z) + + C = CA(X, d) + + # Split the variable scores into separate arrays for each variable. + Gv = xsplit(C.G, rd) + + una, ben, gra = get_eigs(C.I, size(Z, 2), size(X, 2)) + + return MCA(C, vnames, rd, dr, Gv, una, ben, gra) +end + +# Create an indicator matrix corresponding to the distinct +# values in z. Also returns dictionaries mapping the unique +# values to column offsets, and mapping the column offsets +# to the unique values. +function make_single_indicator(z) + + n = length(z) + + # Unique values of the variable + uq = sort(unique(z)) + + if length(uq) > 50 + @warn("Nominal variable has more than 50 levels") + end + + # Recoding dictionary, maps each distinct value in z to + # an offset + rd = Dict{eltype(z),Int}() + for (j, v) in enumerate(uq) + if !ismissing(v) + rd[v] = j + end + end + + # Number of unique values of the variable excluding missing + m = length(rd) + + # The indicator matrix + X = zeros(n, m) + for (i, v) in enumerate(z) + if ismissing(v) + # Missing values are treated as uniform across the levels. + X[i, :] .= 1 / m + else + X[i, rd[v]] = 1 + end + end + + # Reverse the recoding dictionary + rdi = Dict{Int,eltype(z)}() + for (k, v) in rd + rdi[v] = k + end + + return X, rd, rdi +end + +# Create an indicator matrix for the nominal data matrix Z. +# In addition to the indicator matrix, return vectors of +# dictionaries mapping levels to positions and positions +# to levels for each variable. +function make_indicators(Z) + + rd, rdr = Dict[], Dict[] + XX = [] + for j = 1:size(Z, 2) + X, di, dir = make_single_indicator(Z[:, j]) + push!(rd, di) + push!(rdr, dir) + push!(XX, X) + end + I = hcat(XX...) + + return I, rd, rdr +end + +# Return a table summarizing the inertia. +function inertia(mca::MCA) + inr = ( + Raw = mca.C.I, + Unadjusted = mca.unadjusted_eigs, + Benzecri = mca.benzecri_eigs, + Greenacre = mca.greenacre_eigs, + ) + return inr +end + +# Plot the category scores for components numbered 'x' and 'y'. Ordered factors +# are connected with line segments. +function variable_plot(mca::MCA; x = 1, y = 2, vnames = [], ordered = [], kwargs...) + + fig = PyPlot.figure() + ax = fig.add_axes([0.1, 0.1, 0.8, 0.8]) + ax.grid(true) + + # Set up the colormap + cm = get(kwargs, :cmap, PyPlot.get_cmap("tab10")) + + # Set up the axis limits + mn = 1.2 * minimum(mca.C.G, dims = 1) + mx = 1.2 * maximum(mca.C.G, dims = 1) + xlim = get(kwargs, :xlim, [mn[x], mx[x]]) + ylim = get(kwargs, :ylim, [mn[y], mx[y]]) + ax.set_xlim(xlim...) + ax.set_ylim(ylim...) + + for (j, g) in enumerate(mca.Gv) + + if mca.vnames[j] in ordered + PyPlot.plot(g[:, x], g[:, y], "-", color = cm(j)) + end + + dr = mca.dr[j] + vn = length(vnames) > 0 ? 
vnames[j] : "" + for (k, v) in dr + if vn != "" + lb = "$(vn)-$(v)" + else + lb = v + end + ax.text(g[k, x], g[k, y], lb, color = cm(j), ha = "center", va = "center") + end + end + + inr = inertia(mca) + PyPlot.xlabel(@sprintf("Dimension %d (%.2f%%)", x, 100 * inr[x, :Greenacre])) + PyPlot.ylabel(@sprintf("Dimension %d (%.2f%%)", y, 100 * inr[y, :Greenacre])) + + return fig +end diff --git a/test/mca.jl b/test/mca.jl new file mode 100644 index 0000000..9706972 --- /dev/null +++ b/test/mca.jl @@ -0,0 +1,51 @@ +using MultivariateStats, DataFrames, Test + +@testset "Compare MCA to R FactoMiner" begin + + da = DataFrame( + :V1 => ["A", "A", "A", "A", "B", "B", "B", "B"], + :V2 => ["X", "X", "Y", "X", "X", "Y", "X", "X"], + :V3 => ["D", "D", "D", "C", "D", "C", "D", "C"], + ) + + m = MCA(da, 3; vnames = names(da)) + F = objectscores(m) + G = variablescores(m.C) + + # Eigenvalues + eig = inertia(m).Raw[1:3] + @test isapprox(eig, [0.4327141, 0.3333333, 0.2339525], rtol = 1e-6, atol = 1e-6) + + # Object coordinates + c1 = [ + -7.876323e-01, + -7.876323e-01, + -0.3162278, + 5.564176e-02, + -0.08052551, + 1.2341531, + -0.08052551, + 0.7627485, + ] + c2 = [0, 0, 1.1547005, 0, -0.57735027, 0.5773503, -0.57735027, -0.5773503] + c3 = [ + 1.551768e-01, + 1.551768e-01, + -0.3162278, + 9.984508e-01, + -0.55193003, + -0.1800605, + -0.55193003, + 0.2913440, + ] + coord = hcat(c1, c2, c3) + @test isapprox(coord, F, atol = 1e-6, rtol = 1e-6) + + # Variable coordinates + c1 = [-0.6977130, 0.6977130, -0.232571, 0.6977130, 1.040089e+00, -6.240535e-01] + c2 = [0.5000000, -0.5000000, -0.500000, 1.5000000, 0, 0] + c3 = [0.5130269, -0.5130269, 0.171009, -0.5130269, 7.647753e-01, -4.588652e-01] + coord = hcat(c1, c2, c3) + @test isapprox(coord, G, atol = 1e-6, rtol = 1e-6) + +end From 1070ba38aa48f0785f597e652fbd8132c498f947 Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Tue, 6 Sep 2022 09:28:33 -0400 Subject: [PATCH 02/11] Don't include plotting code for now --- src/mca.jl | 75 +++++++++++++++++++++++++++--------------------------- 1 file changed, 38 insertions(+), 37 deletions(-) diff --git a/src/mca.jl b/src/mca.jl index d9cdd4b..c6a460a 100644 --- a/src/mca.jl +++ b/src/mca.jl @@ -1,6 +1,7 @@ # Correspondence Analysis and Multiple Correspondence Analysis -using Printf, PyPlot +# Needed for the plotting function below +#using Printf, PyPlot #== References: @@ -231,44 +232,44 @@ end # Plot the category scores for components numbered 'x' and 'y'. Ordered factors # are connected with line segments. -function variable_plot(mca::MCA; x = 1, y = 2, vnames = [], ordered = [], kwargs...) +#function variable_plot(mca::MCA; x = 1, y = 2, vnames = [], ordered = [], kwargs...) - fig = PyPlot.figure() - ax = fig.add_axes([0.1, 0.1, 0.8, 0.8]) - ax.grid(true) +# fig = PyPlot.figure() +# ax = fig.add_axes([0.1, 0.1, 0.8, 0.8]) +# ax.grid(true) # Set up the colormap - cm = get(kwargs, :cmap, PyPlot.get_cmap("tab10")) +# cm = get(kwargs, :cmap, PyPlot.get_cmap("tab10")) # Set up the axis limits - mn = 1.2 * minimum(mca.C.G, dims = 1) - mx = 1.2 * maximum(mca.C.G, dims = 1) - xlim = get(kwargs, :xlim, [mn[x], mx[x]]) - ylim = get(kwargs, :ylim, [mn[y], mx[y]]) - ax.set_xlim(xlim...) - ax.set_ylim(ylim...) - - for (j, g) in enumerate(mca.Gv) - - if mca.vnames[j] in ordered - PyPlot.plot(g[:, x], g[:, y], "-", color = cm(j)) - end - - dr = mca.dr[j] - vn = length(vnames) > 0 ? 
vnames[j] : "" - for (k, v) in dr - if vn != "" - lb = "$(vn)-$(v)" - else - lb = v - end - ax.text(g[k, x], g[k, y], lb, color = cm(j), ha = "center", va = "center") - end - end - - inr = inertia(mca) - PyPlot.xlabel(@sprintf("Dimension %d (%.2f%%)", x, 100 * inr[x, :Greenacre])) - PyPlot.ylabel(@sprintf("Dimension %d (%.2f%%)", y, 100 * inr[y, :Greenacre])) - - return fig -end +# mn = 1.2 * minimum(mca.C.G, dims = 1) +# mx = 1.2 * maximum(mca.C.G, dims = 1) +# xlim = get(kwargs, :xlim, [mn[x], mx[x]]) +# ylim = get(kwargs, :ylim, [mn[y], mx[y]]) +# ax.set_xlim(xlim...) +# ax.set_ylim(ylim...) + +# for (j, g) in enumerate(mca.Gv) + +# if mca.vnames[j] in ordered +# PyPlot.plot(g[:, x], g[:, y], "-", color = cm(j)) +# end + +# dr = mca.dr[j] +# vn = length(vnames) > 0 ? vnames[j] : "" +# for (k, v) in dr +# if vn != "" +# lb = "$(vn)-$(v)" +# else +# lb = v +# end +# ax.text(g[k, x], g[k, y], lb, color = cm(j), ha = "center", va = "center") +# end +# end + +# inr = inertia(mca) +# PyPlot.xlabel(@sprintf("Dimension %d (%.2f%%)", x, 100 * inr[x, :Greenacre])) +# PyPlot.ylabel(@sprintf("Dimension %d (%.2f%%)", y, 100 * inr[y, :Greenacre])) + +# return fig +#end From 359de04c1610ad713d3318749d9ce89c4d3eb8a2 Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Tue, 6 Sep 2022 19:11:33 -0400 Subject: [PATCH 03/11] Restructure to match rest of package --- src/mca.jl | 103 ++++++++++++++++++++++++++++++++++++++++++++-------- test/mca.jl | 2 +- 2 files changed, 89 insertions(+), 16 deletions(-) diff --git a/src/mca.jl b/src/mca.jl index c6a460a..868685d 100644 --- a/src/mca.jl +++ b/src/mca.jl @@ -16,7 +16,7 @@ https://maths.cnam.fr/IMG/pdf/ClassMCA_cle825cfc.pdf """ Correspondence Analysis """ -struct CA{T<:Real} <: LinearDimensionalityReduction +mutable struct CA{T<:Real} <: LinearDimensionalityReduction # The data matrix Z::Array{T} @@ -41,7 +41,9 @@ struct CA{T<:Real} <: LinearDimensionalityReduction I::Vector{T} end -function CA(X, d::Int) +# Constructor + +function CA(X) # Convert to proportions X = X ./ sum(X) @@ -58,14 +60,41 @@ function CA(X, d::Int) Wc = Diagonal(sqrt.(c)) SR = Wr \ R / Wc + T = eltype(X) + return CA(X, R, r, c, SR, zeros(T, 0, 0), zeros(T, 0, 0), zeros(T, 0)) +end + +function fit!(ca::CA, d::Int) + # Get the object factor scores (F) and variable factor scores (G). - P, D, Q = svd(SR) + P, D, Q = svd(ca.SR) Dq = Diagonal(D)[:, 1:d] - F = Wr \ P * Dq - G = Wc \ Q * Dq - I = D .^ 2 - return CA(X, R, r, c, SR, F, G, I) + Wr = Diagonal(sqrt.(ca.rm)) + Wc = Diagonal(sqrt.(ca.cm)) + ca.F = Wr \ P * Dq + ca.G = Wc \ Q * Dq + + # Get the eigenvalues + ca.I = D .^ 2 +end + +function fit(::Type{CA}; X::AbstractMatrix, d::Int = 5) + ca = CA(X) + fit!(ca, d) + return ca::CA +end + +""" + ca + +Fit a correspondence analysis using the data array `X` whose rows are +the objects and columns are the variables. The first `d` components +are retained. +""" +function ca(X, d) + c = fit(CA, X, d) + return c end objectscores(ca::CA) = ca.F @@ -75,7 +104,7 @@ inertia(ca::CA) = ca.I """ Multiple Correspondence Analysis """ -struct MCA{T<:Real} <: LinearDimensionalityReduction +mutable struct MCA{T<:Real} <: LinearDimensionalityReduction # The underlying corresponence analysis C::CA{T} @@ -93,6 +122,12 @@ struct MCA{T<:Real} <: LinearDimensionalityReduction # each variable. 
Gv::Vector{Matrix{T}} + # Number of nominal variables + K::Int + + # Total number of categories in all variables + J::Int + # Eigenvalues unadjusted_eigs::Vector{Float64} benzecri_eigs::Vector{Float64} @@ -134,7 +169,9 @@ function get_eigs(I, K, J) return unadjusted, ben ./ sum(ben), ben ./ gt end -function MCA(Z, d::Int; vnames = []) +# constructor + +function MCA(Z; vnames = []) if length(vnames) == 0 vnames = ["v$(j)" for j = 1:size(Z, 2)] @@ -143,14 +180,50 @@ function MCA(Z, d::Int; vnames = []) # Get the indicator matrix X, rd, dr = make_indicators(Z) - C = CA(X, d) + # Create the underlying correspondence analysis value + C = CA(X) + + # Number of nominal variables + K = size(Z, 2) + + # Total number of categories in all variables + J = size(X, 2) + + return MCA(C, vnames, rd, dr, Matrix{Float64}[], K, J, zeros(0), zeros(0), zeros(0)) +end + +""" + mca + +Fit a multiple correspondence analysis using the columns of `Z` as the +variables. The first `d` components are retained. If `Z` is a +dataframe then the column names are used as variable names, otherwise +variable names may be provided as `vnames`. +""" +function mca(Z, d::Int; vnames = []) + m = MCA(Z; vnames) + fit!(m, d) + return m +end + +function fit(::Type{MCA}, Z::AbstractMatrix, d::Int; vnames = []) + return mca(Z, d; vnames) +end + +function fit!(mca::MCA, d::Int) + + fit!(mca.C, d) # Split the variable scores into separate arrays for each variable. - Gv = xsplit(C.G, rd) + mca.Gv = xsplit(mca.C.G, mca.rd) + + una, ben, gra = get_eigs(mca.C.I, mca.J, mca.K) - una, ben, gra = get_eigs(C.I, size(Z, 2), size(X, 2)) + mca.unadjusted_eigs = una + mca.benzecri_eigs = ben + mca.greenacre_eigs = gra - return MCA(C, vnames, rd, dr, Gv, una, ben, gra) + return mca end # Create an indicator matrix corresponding to the distinct @@ -238,10 +311,10 @@ end # ax = fig.add_axes([0.1, 0.1, 0.8, 0.8]) # ax.grid(true) - # Set up the colormap +# Set up the colormap # cm = get(kwargs, :cmap, PyPlot.get_cmap("tab10")) - # Set up the axis limits +# Set up the axis limits # mn = 1.2 * minimum(mca.C.G, dims = 1) # mx = 1.2 * maximum(mca.C.G, dims = 1) # xlim = get(kwargs, :xlim, [mn[x], mx[x]]) diff --git a/test/mca.jl b/test/mca.jl index 9706972..b497686 100644 --- a/test/mca.jl +++ b/test/mca.jl @@ -8,7 +8,7 @@ using MultivariateStats, DataFrames, Test :V3 => ["D", "D", "D", "C", "D", "C", "D", "C"], ) - m = MCA(da, 3; vnames = names(da)) + m = mca(da, 3; vnames = names(da)) F = objectscores(m) G = variablescores(m.C) From f5d16bc49d70e2354d1da90d6826aeca193ec288 Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Tue, 6 Sep 2022 19:58:10 -0400 Subject: [PATCH 04/11] check for repeated non-zero signular values pick f57ef58 check for repeated non-zero signular values --- src/MultivariateStats.jl | 139 +++++++++++++++++++-------------------- src/mca.jl | 6 ++ 2 files changed, 75 insertions(+), 70 deletions(-) diff --git a/src/MultivariateStats.jl b/src/MultivariateStats.jl index 1afc7e9..4bd977b 100644 --- a/src/MultivariateStats.jl +++ b/src/MultivariateStats.jl @@ -1,19 +1,24 @@ module MultivariateStats - using LinearAlgebra - using SparseArrays - using Statistics: middle - using Distributions: cdf, FDist - using StatsAPI: RegressionModel, HypothesisTest - using StatsBase: SimpleCovariance, CovarianceEstimator, AbstractDataTransform, - ConvergenceException, pairwise, pairwise!, CoefTable - - import Statistics: mean, var, cov, covm, cor - import Base: length, size, show - import StatsAPI: fit, predict, coef, weights, dof, r2, pvalue - import 
LinearAlgebra: eigvals, eigvecs - - export +using LinearAlgebra +using SparseArrays +using Statistics: middle +using StatsAPI: RegressionModel +using StatsBase: + SimpleCovariance, + CovarianceEstimator, + AbstractDataTransform, + ConvergenceException, + pairwise, + pairwise!, + CoefTable + +import Statistics: mean, var, cov, covm, cor +import Base: length, size, show +import StatsAPI: fit, predict, coef, weights, dof, r2 +import LinearAlgebra: eigvals, eigvecs + +export ## common evaluate, # evaluate discriminant function values @@ -38,7 +43,6 @@ module MultivariateStats # whiten Whitening, # Type: Whitening transformation - invsqrtm, # Compute inverse of matrix square root, i.e. inv(sqrtm(A)) cov_whitening, # Compute a whitening transform based on covariance cov_whitening!, # Compute a whitening transform based on covariance (input will be overwritten) @@ -46,19 +50,16 @@ module MultivariateStats ## pca PCA, # Type: Principal Component Analysis model - pcacov, # PCA based on covariance pcasvd, # PCA based on singular value decomposition of input data principalratio, # the ratio of variances preserved in the principal subspace principalvar, # the variance along a specific principal direction principalvars, # the variances along all principal directions - tprincipalvar, # total principal variance, i.e. sum(principalvars(M)) tresidualvar, # total residual variance ## ppca PPCA, # Type: the Probabilistic PCA model - ppcaml, # Maximum likelihood probabilistic PCA ppcaem, # EM algorithm for probabilistic PCA bayespca, # Bayesian PCA @@ -67,10 +68,7 @@ module MultivariateStats KernelPCA, # Type: the Kernel PCA model ## cca - CCA, # Type: Correlation Component Analysis model - WilksLambdaTest, # Wilks lambda statistics and tests - PillaiTraceTest, # Pillai trace statistics and tests - LawleyHotellingTest, # Lawley-Hotelling statistics and tests + CCA, # Type: Correlation Component Analysis model ccacov, # CCA based on covariances ccasvd, # CCA based on singular value decomposition of input data @@ -81,18 +79,17 @@ module MultivariateStats MetricMDS, classical_mds, # perform classical MDS over a given distance matrix stress, # stress evaluation - - gram2dmat, gram2dmat!, # Gram matrix => Distance matrix - dmat2gram, dmat2gram!, # Distance matrix => Gram matrix + gram2dmat, + gram2dmat!, # Gram matrix => Distance matrix + dmat2gram, + dmat2gram!, # Distance matrix => Gram matrix ## lda LinearDiscriminant, # Type: Linear Discriminant functional MulticlassLDAStats, # Type: Statistics required for training multi-class LDA MulticlassLDA, # Type: Multi-class LDA model SubspaceLDA, # Type: LDA model for high-dimensional spaces - ldacov, # Linear discriminant analysis based on covariances - classweights, # class-specific weights classmeans, # class-specific means withclass_scatter, # with-class scatter matrix @@ -103,56 +100,58 @@ module MultivariateStats ## ica ICA, # Type: the Fast ICA model - fastica!, # core algorithm function for the Fast ICA ## fa FactorAnalysis, # Type: the Factor Analysis model - faem, # EM algorithm for factor analysis facm, # CM algorithm for factor analysis ## CA, MCA - CA, - MCA, - objectscores, - variablescores, - inertia - - ## source files - include("types.jl") - include("common.jl") - include("lreg.jl") - include("whiten.jl") - include("pca.jl") - include("ppca.jl") - include("kpca.jl") - include("cca.jl") - include("cmds.jl") - include("mmds.jl") - include("lda.jl") - include("ica.jl") - include("fa.jl") - include("mca.jl") - - ## deprecations - @deprecate indim(f) 
size(f,1) - @deprecate outdim(f) size(f,2) - @deprecate transform(f, x) predict(f, x) - @deprecate indim(f::Whitening) length(f::Whitening) - @deprecate outdim(f::Whitening) length(f::Whitening) - @deprecate tvar(f::PCA) var(f::PCA) - @deprecate classical_mds(D::AbstractMatrix, p::Int) predict(fit(MDS, D, maxoutdim=p, distances=true)) - @deprecate transform(f::MDS) predict(f::MDS) - @deprecate xindim(M::CCA) size(M,1) - @deprecate yindim(M::CCA) size(M,2) - @deprecate outdim(M::CCA) size(M,3) - @deprecate correlations(M::CCA) cor(M) - @deprecate xmean(M::CCA) mean(M, :x) - @deprecate ymean(M::CCA) mean(M, :y) - @deprecate xprojection(M::CCA) projection(M, :x) - @deprecate yprojection(M::CCA) projection(M, :y) - @deprecate xtransform(M::CCA, x) predict(M, x, :x) - @deprecate ytransform(M::CCA, y) predict(M, y, :y) + CA, # Type: correspondence analysis + MCA, # Type: multiple correspondence analysis + ca, # fit and return a correspondence analysis + mca, # fit and return a multiple correspondence analysis + objectscores, # return the object scores or coordinates from CA or MCA + variablescores, # return the variable/category scores or coordinates from CA or MCA + inertia # return the inertia (derived from eigenvalues) for CA + +## source files +include("types.jl") +include("common.jl") +include("lreg.jl") +include("whiten.jl") +include("pca.jl") +include("ppca.jl") +include("kpca.jl") +include("cca.jl") +include("cmds.jl") +include("mmds.jl") +include("lda.jl") +include("ica.jl") +include("fa.jl") +include("mca.jl") + +## deprecations +@deprecate indim(f) size(f, 1) +@deprecate outdim(f) size(f, 2) +@deprecate transform(f, x) predict(f, x) +@deprecate indim(f::Whitening) length(f::Whitening) +@deprecate outdim(f::Whitening) length(f::Whitening) +@deprecate tvar(f::PCA) var(f::PCA) +@deprecate classical_mds(D::AbstractMatrix, p::Int) predict( + fit(MDS, D, maxoutdim = p, distances = true), +) +@deprecate transform(f::MDS) predict(f::MDS) +@deprecate xindim(M::CCA) size(M, 1) +@deprecate yindim(M::CCA) size(M, 2) +@deprecate outdim(M::CCA) size(M, 3) +@deprecate correlations(M::CCA) cor(M) +@deprecate xmean(M::CCA) mean(M, :x) +@deprecate ymean(M::CCA) mean(M, :y) +@deprecate xprojection(M::CCA) projection(M, :x) +@deprecate yprojection(M::CCA) projection(M, :y) +@deprecate xtransform(M::CCA, x) predict(M, x, :x) +@deprecate ytransform(M::CCA, y) predict(M, y, :y) end # module diff --git a/src/mca.jl b/src/mca.jl index 868685d..9428e1c 100644 --- a/src/mca.jl +++ b/src/mca.jl @@ -70,6 +70,12 @@ function fit!(ca::CA, d::Int) P, D, Q = svd(ca.SR) Dq = Diagonal(D)[:, 1:d] + # Check that there are no repeated non-zero eigenvalues. 
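+    # (With tied singular values the SVD factors are only determined up to a
+    # rotation of the corresponding columns, so the scores computed below
+    # would not be uniquely defined.)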
+ d = diff(D[D .> 1e-10]) + if maximum(d) >= -1e-10 + @warn("The indicator matrix has repeated non-zero eigenvalues") + end + Wr = Diagonal(sqrt.(ca.rm)) Wc = Diagonal(sqrt.(ca.cm)) ca.F = Wr \ P * Dq From fa98503ee0b1f17738ce8c82add63b1b91d1ec34 Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Tue, 20 Sep 2022 20:24:49 -0400 Subject: [PATCH 05/11] Update API to conform to package --- src/MultivariateStats.jl | 123 +++++++++++++++---------------- src/mca.jl | 154 +++++++++++++++++---------------------- test/mca.jl | 2 +- 3 files changed, 131 insertions(+), 148 deletions(-) diff --git a/src/MultivariateStats.jl b/src/MultivariateStats.jl index 4bd977b..ca7af4c 100644 --- a/src/MultivariateStats.jl +++ b/src/MultivariateStats.jl @@ -1,24 +1,19 @@ module MultivariateStats -using LinearAlgebra -using SparseArrays -using Statistics: middle -using StatsAPI: RegressionModel -using StatsBase: - SimpleCovariance, - CovarianceEstimator, - AbstractDataTransform, - ConvergenceException, - pairwise, - pairwise!, - CoefTable - -import Statistics: mean, var, cov, covm, cor -import Base: length, size, show -import StatsAPI: fit, predict, coef, weights, dof, r2 -import LinearAlgebra: eigvals, eigvecs - -export + using DataFrames + using LinearAlgebra + using SparseArrays + using Statistics: middle + using StatsAPI: RegressionModel + using StatsBase: SimpleCovariance, CovarianceEstimator, AbstractDataTransform, + ConvergenceException, pairwise, pairwise!, CoefTable + + import Statistics: mean, var, cov, covm, cor + import Base: length, size, show + import StatsAPI: fit, predict, coef, weights, dof, r2 + import LinearAlgebra: eigvals, eigvecs + + export ## common evaluate, # evaluate discriminant function values @@ -43,6 +38,7 @@ export # whiten Whitening, # Type: Whitening transformation + invsqrtm, # Compute inverse of matrix square root, i.e. inv(sqrtm(A)) cov_whitening, # Compute a whitening transform based on covariance cov_whitening!, # Compute a whitening transform based on covariance (input will be overwritten) @@ -50,16 +46,19 @@ export ## pca PCA, # Type: Principal Component Analysis model + pcacov, # PCA based on covariance pcasvd, # PCA based on singular value decomposition of input data principalratio, # the ratio of variances preserved in the principal subspace principalvar, # the variance along a specific principal direction principalvars, # the variances along all principal directions + tprincipalvar, # total principal variance, i.e. 
sum(principalvars(M)) tresidualvar, # total residual variance ## ppca PPCA, # Type: the Probabilistic PCA model + ppcaml, # Maximum likelihood probabilistic PCA ppcaem, # EM algorithm for probabilistic PCA bayespca, # Bayesian PCA @@ -79,17 +78,18 @@ export MetricMDS, classical_mds, # perform classical MDS over a given distance matrix stress, # stress evaluation - gram2dmat, - gram2dmat!, # Gram matrix => Distance matrix - dmat2gram, - dmat2gram!, # Distance matrix => Gram matrix + + gram2dmat, gram2dmat!, # Gram matrix => Distance matrix + dmat2gram, dmat2gram!, # Distance matrix => Gram matrix ## lda LinearDiscriminant, # Type: Linear Discriminant functional MulticlassLDAStats, # Type: Statistics required for training multi-class LDA MulticlassLDA, # Type: Multi-class LDA model SubspaceLDA, # Type: LDA model for high-dimensional spaces + ldacov, # Linear discriminant analysis based on covariances + classweights, # class-specific weights classmeans, # class-specific means withclass_scatter, # with-class scatter matrix @@ -100,15 +100,18 @@ export ## ica ICA, # Type: the Fast ICA model + fastica!, # core algorithm function for the Fast ICA ## fa FactorAnalysis, # Type: the Factor Analysis model + faem, # EM algorithm for factor analysis facm, # CM algorithm for factor analysis - ## CA, MCA + ## ca, mca CA, # Type: correspondence analysis + MCA, # Type: multiple correspondence analysis ca, # fit and return a correspondence analysis mca, # fit and return a multiple correspondence analysis @@ -116,42 +119,40 @@ export variablescores, # return the variable/category scores or coordinates from CA or MCA inertia # return the inertia (derived from eigenvalues) for CA -## source files -include("types.jl") -include("common.jl") -include("lreg.jl") -include("whiten.jl") -include("pca.jl") -include("ppca.jl") -include("kpca.jl") -include("cca.jl") -include("cmds.jl") -include("mmds.jl") -include("lda.jl") -include("ica.jl") -include("fa.jl") -include("mca.jl") - -## deprecations -@deprecate indim(f) size(f, 1) -@deprecate outdim(f) size(f, 2) -@deprecate transform(f, x) predict(f, x) -@deprecate indim(f::Whitening) length(f::Whitening) -@deprecate outdim(f::Whitening) length(f::Whitening) -@deprecate tvar(f::PCA) var(f::PCA) -@deprecate classical_mds(D::AbstractMatrix, p::Int) predict( - fit(MDS, D, maxoutdim = p, distances = true), -) -@deprecate transform(f::MDS) predict(f::MDS) -@deprecate xindim(M::CCA) size(M, 1) -@deprecate yindim(M::CCA) size(M, 2) -@deprecate outdim(M::CCA) size(M, 3) -@deprecate correlations(M::CCA) cor(M) -@deprecate xmean(M::CCA) mean(M, :x) -@deprecate ymean(M::CCA) mean(M, :y) -@deprecate xprojection(M::CCA) projection(M, :x) -@deprecate yprojection(M::CCA) projection(M, :y) -@deprecate xtransform(M::CCA, x) predict(M, x, :x) -@deprecate ytransform(M::CCA, y) predict(M, y, :y) + ## source files + include("types.jl") + include("common.jl") + include("lreg.jl") + include("whiten.jl") + include("pca.jl") + include("ppca.jl") + include("kpca.jl") + include("cca.jl") + include("cmds.jl") + include("mmds.jl") + include("lda.jl") + include("ica.jl") + include("fa.jl") + include("mca.jl") + + ## deprecations + @deprecate indim(f) size(f,1) + @deprecate outdim(f) size(f,2) + @deprecate transform(f, x) predict(f, x) + @deprecate indim(f::Whitening) length(f::Whitening) + @deprecate outdim(f::Whitening) length(f::Whitening) + @deprecate tvar(f::PCA) var(f::PCA) + @deprecate classical_mds(D::AbstractMatrix, p::Int) predict(fit(MDS, D, maxoutdim=p, distances=true)) + @deprecate 
transform(f::MDS) predict(f::MDS) + @deprecate xindim(M::CCA) size(M,1) + @deprecate yindim(M::CCA) size(M,2) + @deprecate outdim(M::CCA) size(M,3) + @deprecate correlations(M::CCA) cor(M) + @deprecate xmean(M::CCA) mean(M, :x) + @deprecate ymean(M::CCA) mean(M, :y) + @deprecate xprojection(M::CCA) projection(M, :x) + @deprecate yprojection(M::CCA) projection(M, :y) + @deprecate xtransform(M::CCA, x) predict(M, x, :x) + @deprecate ytransform(M::CCA, y) predict(M, y, :y) end # module diff --git a/src/mca.jl b/src/mca.jl index 9428e1c..abbce26 100644 --- a/src/mca.jl +++ b/src/mca.jl @@ -16,10 +16,10 @@ https://maths.cnam.fr/IMG/pdf/ClassMCA_cle825cfc.pdf """ Correspondence Analysis """ -mutable struct CA{T<:Real} <: LinearDimensionalityReduction +struct CA{T<:Real} <: LinearDimensionalityReduction # The data matrix - Z::Array{T} + X::Array{T} # The residuals R::Array{T} @@ -41,33 +41,44 @@ mutable struct CA{T<:Real} <: LinearDimensionalityReduction I::Vector{T} end -# Constructor +""" + fit(CA, X; ...) + +Peform a Correspondence Analysis using the data in `X`. + +**Keyword Arguments** -function CA(X) +- `d` the number of dimensions to retain. + +**Notes:** + +- The matrix `X` should contain numerical data for which it makes + sense to use chi^2 distance to compare rows. Most commonly this + is an indicator matrix in which each row contains a single 1 with + the remaining values all being 0. +- See `MCA` for a more general analysis that takes a dataframe + with multiple nominal columns as input and performs a CA on + its indicator matrix. +""" +function fit(::Type{CA}, X::AbstractMatrix{T}; d::Int = 5) where{T} # Convert to proportions X = X ./ sum(X) - # Calculate row and column means - r = sum(X, dims = 2)[:] - c = sum(X, dims = 1)[:] + # Calculate row and column margins + rm = sum(X, dims = 2)[:] + cm = sum(X, dims = 1)[:] # Center the data matrix to create residuals - R = X - r * c' + R = X - rm * cm' # Standardize the data matrix to create standardized residuals - Wr = Diagonal(sqrt.(r)) - Wc = Diagonal(sqrt.(c)) + Wr = Diagonal(sqrt.(rm)) + Wc = Diagonal(sqrt.(cm)) SR = Wr \ R / Wc - T = eltype(X) - return CA(X, R, r, c, SR, zeros(T, 0, 0), zeros(T, 0, 0), zeros(T, 0)) -end - -function fit!(ca::CA, d::Int) - - # Get the object factor scores (F) and variable factor scores (G). - P, D, Q = svd(ca.SR) + # Get the object scores (F) and variable scores (G). + P, D, Q = svd(SR) Dq = Diagonal(D)[:, 1:d] # Check that there are no repeated non-zero eigenvalues. @@ -76,31 +87,15 @@ function fit!(ca::CA, d::Int) @warn("The indicator matrix has repeated non-zero eigenvalues") end - Wr = Diagonal(sqrt.(ca.rm)) - Wc = Diagonal(sqrt.(ca.cm)) - ca.F = Wr \ P * Dq - ca.G = Wc \ Q * Dq + Wr = Diagonal(sqrt.(rm)) + Wc = Diagonal(sqrt.(cm)) + F = Wr \ P * Dq + G = Wc \ Q * Dq # Get the eigenvalues - ca.I = D .^ 2 -end - -function fit(::Type{CA}; X::AbstractMatrix, d::Int = 5) - ca = CA(X) - fit!(ca, d) - return ca::CA -end - -""" - ca + I = D .^ 2 -Fit a correspondence analysis using the data array `X` whose rows are -the objects and columns are the variables. The first `d` components -are retained. 
-""" -function ca(X, d) - c = fit(CA, X, d) - return c + return CA(X, R, rm, cm, SR, F, G, I) end objectscores(ca::CA) = ca.F @@ -110,7 +105,7 @@ inertia(ca::CA) = ca.I """ Multiple Correspondence Analysis """ -mutable struct MCA{T<:Real} <: LinearDimensionalityReduction +struct MCA{T<:Real} <: LinearDimensionalityReduction # The underlying corresponence analysis C::CA{T} @@ -175,68 +170,55 @@ function get_eigs(I, K, J) return unadjusted, ben ./ sum(ben), ben ./ gt end -# constructor +""" + fit(MCA, X; ...) + +Fit a multiple correspondence analysis using the columns of `X` as the variables. -function MCA(Z; vnames = []) +**Keyword Arguments** - if length(vnames) == 0 +- `d`: The number of dimensions to retain. +- `vnames`: Variable names, if `X` is a data frame then the column names are the + default variable names but will be replaced with these values if provided. + +**Notes:** + +- Missing values are recoded as fractional indicators, i.e. if there are k distinct + levels of a variable it is coded as 1/k, 1/k, ..., 1/k. +""" +function fit(::Type{MCA}, X; d::Int=5, vnames=[]) + + if length(vnames) == 0 && typeof(X) <: AbstractDataFrame + vnames = names(X) + elseif length(vnames) == 0 vnames = ["v$(j)" for j = 1:size(Z, 2)] end # Get the indicator matrix - X, rd, dr = make_indicators(Z) + XI, rd, dr = make_indicators(X) # Create the underlying correspondence analysis value - C = CA(X) + C = fit(CA, XI; d=d) # Number of nominal variables - K = size(Z, 2) + K = size(X, 2) # Total number of categories in all variables - J = size(X, 2) - - return MCA(C, vnames, rd, dr, Matrix{Float64}[], K, J, zeros(0), zeros(0), zeros(0)) -end - -""" - mca - -Fit a multiple correspondence analysis using the columns of `Z` as the -variables. The first `d` components are retained. If `Z` is a -dataframe then the column names are used as variable names, otherwise -variable names may be provided as `vnames`. -""" -function mca(Z, d::Int; vnames = []) - m = MCA(Z; vnames) - fit!(m, d) - return m -end - -function fit(::Type{MCA}, Z::AbstractMatrix, d::Int; vnames = []) - return mca(Z, d; vnames) -end - -function fit!(mca::MCA, d::Int) - - fit!(mca.C, d) + J = size(XI, 2) # Split the variable scores into separate arrays for each variable. - mca.Gv = xsplit(mca.C.G, mca.rd) - - una, ben, gra = get_eigs(mca.C.I, mca.J, mca.K) + Gv = xsplit(C.G, rd) - mca.unadjusted_eigs = una - mca.benzecri_eigs = ben - mca.greenacre_eigs = gra + una, ben, gra = get_eigs(C.I, J, K) - return mca + return MCA(C, vnames, rd, dr, Gv, K, J, una, ben, gra) end # Create an indicator matrix corresponding to the distinct -# values in z. Also returns dictionaries mapping the unique -# values to column offsets, and mapping the column offsets -# to the unique values. -function make_single_indicator(z) +# values in the vector 'z'. Also returns dictionaries mapping +# the unique values to column offsets, and mapping the column +# offsets to the unique values. 
+function make_single_indicator(z::Vector{T}) where{T} n = length(z) @@ -249,7 +231,7 @@ function make_single_indicator(z) # Recoding dictionary, maps each distinct value in z to # an offset - rd = Dict{eltype(z),Int}() + rd = Dict{T,Int}() for (j, v) in enumerate(uq) if !ismissing(v) rd[v] = j @@ -271,7 +253,7 @@ function make_single_indicator(z) end # Reverse the recoding dictionary - rdi = Dict{Int,eltype(z)}() + rdi = Dict{Int,T}() for (k, v) in rd rdi[v] = k end diff --git a/test/mca.jl b/test/mca.jl index b497686..dc5209e 100644 --- a/test/mca.jl +++ b/test/mca.jl @@ -8,7 +8,7 @@ using MultivariateStats, DataFrames, Test :V3 => ["D", "D", "D", "C", "D", "C", "D", "C"], ) - m = mca(da, 3; vnames = names(da)) + m = fit(MCA, da; d=3, vnames = names(da)) F = objectscores(m) G = variablescores(m.C) From cb97a3317717d45548d9f7dc3e91533913ea4956 Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Sun, 11 Dec 2022 15:23:42 -0500 Subject: [PATCH 06/11] Refactor interface, add many features pick ab82e6e Refactor interface, add many features pick 7901ecf Update API to conform to package --- Project.toml | 7 +- src/MultivariateStats.jl | 10 +- src/mca.jl | 450 +++++++++++++++++++++++++++++---------- test/mca.jl | 216 ++++++++++++++++--- test/runtests.jl | 3 +- 5 files changed, 533 insertions(+), 153 deletions(-) diff --git a/Project.toml b/Project.toml index a1bd61f..f07ec20 100644 --- a/Project.toml +++ b/Project.toml @@ -8,7 +8,7 @@ version = "0.10.3" [deps] Arpack = "7d9fca2a-8960-54d3-9f78-7d1dccf2cb97" -Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" +DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" @@ -17,11 +17,8 @@ StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" [compat] Arpack = "0.3, 0.4, 0.5" -Distributions = "0.25" -StableRNGs = "1" -Statistics = "1" StatsAPI = "^1.3" -StatsBase = "^0.33, 0.34" +StatsBase = "^0.33" julia = "1.1" [extras] diff --git a/src/MultivariateStats.jl b/src/MultivariateStats.jl index ca7af4c..0528b12 100644 --- a/src/MultivariateStats.jl +++ b/src/MultivariateStats.jl @@ -113,11 +113,11 @@ module MultivariateStats CA, # Type: correspondence analysis MCA, # Type: multiple correspondence analysis - ca, # fit and return a correspondence analysis - mca, # fit and return a multiple correspondence analysis - objectscores, # return the object scores or coordinates from CA or MCA - variablescores, # return the variable/category scores or coordinates from CA or MCA - inertia # return the inertia (derived from eigenvalues) for CA + object_coords, # return the object scores or coordinates from CA or MCA + variable_coords, # return the variable/category scores or coordinates from CA or MCA + inertia, # return the inertia (derived from eigenvalues) for CA + ca_stats, # fit statistics + quali_passive # handle qualitative passive variables ## source files include("types.jl") diff --git a/src/mca.jl b/src/mca.jl index abbce26..06df67f 100644 --- a/src/mca.jl +++ b/src/mca.jl @@ -1,8 +1,5 @@ # Correspondence Analysis and Multiple Correspondence Analysis -# Needed for the plotting function below -#using Printf, PyPlot - #== References: https://personal.utdallas.edu/~herve/Abdi-MCA2007-pretty.pdf @@ -16,29 +13,165 @@ https://maths.cnam.fr/IMG/pdf/ClassMCA_cle825cfc.pdf """ Correspondence Analysis """ -struct CA{T<:Real} <: LinearDimensionalityReduction +mutable struct CA{T<:Real} <: 
LinearDimensionalityReduction # The data matrix - X::Array{T} + X::Matrix{T} # The residuals - R::Array{T} + R::Matrix{T} - # Row and column masses (means) + # Row masses rm::Vector{T} + + # Column masses cm::Vector{T} # The standardized residuals - SR::Array{T} + SR::Matrix{T} - # Object scores - F::Array{T} + # Object coordinates in standard coordinates + FS::Matrix{T} - # Variable scores - G::Array{T} + # Variable coordinates in standard coordinates + GS::Matrix{T} # Inertia (eigenvalues of the indicator matrix) I::Vector{T} + + # Standard normalization or principal normalization + normalize::String + + # Use either the Burt method or the indicator method + method::String +end + +function show(io::IO, ca::CA) + nobs, nvar = size(ca.X) + print( + io, + "CA(nobs = $nobs, nvar = $nvar, method = $(ca.method), normalize = $(ca.normalize))", + ) +end + +function print_inertia(io, I) + xx = hcat(I, I, I) + xx[:, 2] = xx[:, 1] ./ sum(xx[:, 1]) + xx[:, 3] = cumsum(xx[:, 2]) + ii = findfirst(xx[:, 3] .>= 0.999) + xx = xx[1:ii, :] + vn = ["Inertia", "Prop inertia", "Cumulative inertia"] + cft = CoefTable(xx, vn, string.("", 1:ii)) + println(io, cft) +end + +function build_coeftable(ca, vc, stats, rn) + cn = ["Mass"] + xx = [ca.cm] + d = size(ca.GS, 2) + for j = 1:d + push!(cn, "Coord-$(j)") + push!(xx, vc[:, j]) + push!(cn, "SqCorr-$(j)") + push!(xx, stats.sqcorr_col[:, j]) + push!(cn, "RelContrib-$(j)") + push!(xx, stats.relcontrib_col[:, j]) + end + xx = hcat(xx...) + cft = CoefTable(xx, cn, rn) + return cft +end + +function show(io::IO, ::MIME"text/plain", ca::CA) + nobs, nvar = size(ca.X) + stats = ca_stats(ca) + + println( + io, + "CA(nobs = $nobs, nvar = $nvar, method = $(ca.method), normalize = $(ca.normalize))", + ) + print_inertia(io, ca.I) + vc = variable_coords(ca) + m, d = size(ca.GS) + println(io, "\nVariable coordinates:") + rn = string.("", 1:m) + cft = build_coeftable(ca, vc, stats, rn) + print(io, cft) + print(io, "\n\n") +end + +# Squared correlation of each variable with each component. +function sqcorr(ca::CA, axis) + if axis == :row + z = ca.rm ./ sum(abs2, ca.SR; dims = 2)[:] + p = length(ca.rm) + d = size(ca.GS, 2) + if ca.method == "burt" + # Not clear how to compute this so report zero for now + return zeros(p, d) + end + f = z * ones(d)' + return object_coords(ca; normalize = "principal") .^ 2 .* f + elseif axis == :col + z = ca.cm ./ sum(abs2, ca.SR; dims = 1)[:] + p = length(ca.cm) + d = size(ca.GS, 2) + f = z * ones(d)' + if ca.method == "burt" + # Not clear how to compute this so report zero for now + return zeros(p, d) + end + return variable_coords(ca; normalize = "principal") .^ 2 .* f + else + error("Unknown axis '$(axis)'") + end +end + +# Relative contributions of each variable to each component. +function relcontrib(ca::CA, axis) + e = ca.method == "indicator" ? 0.5 : 1 + if axis == :row + d = size(ca.GS, 2) + z = ca.rm * (ca.I[1:d] .^ -e)' + return object_coords(ca; normalize = "principal") .^ 2 .* z + elseif axis == :col + d = size(ca.GS, 2) + z = ca.cm * (ca.I[1:d] .^ -e)' + return variable_coords(ca; normalize = "principal") .^ 2 .* z + else + error("Unknown axis '$(axis)'") + end +end + +# Return several fit statistics. +function ca_stats(ca::CA) + cr = sqcorr(ca, :col) + rc = relcontrib(ca, :col) + return (sqcorr_col = cr, relcontrib_col = rc) +end + +# Multiply corresponding columns of Q by -1 as needed +# to make the first non-zero value in each column of Q +# positive. This is the method used by Stata to identify +# the coefficients. 
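+# For example (hypothetical values): if Q[:, 1] = [-0.6, 0.8], then both
+# Q[:, 1] and P[:, 1] are negated, so that Q[:, 1] becomes [0.6, -0.8].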
+function orient(P, Q, e = 1e-12) + for j = 1:size(Q, 2) + i = findfirst(abs.(Q[:, j]) .>= e) + if Q[i, j] < 0 + Q[:, j] .*= -1 + P[:, j] .*= -1 + end + end + return P, Q +end + +function mca_check_args(normalize, method) + if !(normalize in ["standard", "principal"]) + error("normalize = '$normalize' should be 'standard' or 'principal'") + end + if !(method in ["indicator", "burt"]) + error("method = '$method' should be 'indicator' or 'burt'") + end end """ @@ -49,6 +182,7 @@ Peform a Correspondence Analysis using the data in `X`. **Keyword Arguments** - `d` the number of dimensions to retain. +- `normalize` a coordinate normalization method, either 'standard' or 'principal'. **Notes:** @@ -60,7 +194,15 @@ Peform a Correspondence Analysis using the data in `X`. with multiple nominal columns as input and performs a CA on its indicator matrix. """ -function fit(::Type{CA}, X::AbstractMatrix{T}; d::Int = 5) where{T} +function fit( + ::Type{CA}, + X::AbstractMatrix{T}; + d::Int = 5, + normalize = "standard", + method = "indicator", +) where {T} + + mca_check_args(normalize, method) # Convert to proportions X = X ./ sum(X) @@ -77,39 +219,120 @@ function fit(::Type{CA}, X::AbstractMatrix{T}; d::Int = 5) where{T} Wc = Diagonal(sqrt.(cm)) SR = Wr \ R / Wc - # Get the object scores (F) and variable scores (G). + # Factor the standardized residual matrix P, D, Q = svd(SR) - Dq = Diagonal(D)[:, 1:d] # Check that there are no repeated non-zero eigenvalues. - d = diff(D[D .> 1e-10]) - if maximum(d) >= -1e-10 + di = diff(D[D.>1e-10]) + if maximum(di) >= -1e-10 @warn("The indicator matrix has repeated non-zero eigenvalues") end - Wr = Diagonal(sqrt.(rm)) - Wc = Diagonal(sqrt.(cm)) - F = Wr \ P * Dq - G = Wc \ Q * Dq - # Get the eigenvalues - I = D .^ 2 + I = method == "burt" ? D .^ 4 : D .^ 2 + + # Reduce to the selected dimension + P = P[:, 1:d] + D = D[1:d] + Q = Q[:, 1:d] + + # Flip the loading vectors to a standard orientation. + P, Q = orient(P, Q) + + # Get the object scores (F) and variable scores (G). These are the + # standard coordinates. + FS = Wr \ P + GS = Wc \ Q + + ca = CA(X, R, rm, cm, SR, FS, GS, I, normalize, method) + return ca +end + +# Calculate the standard coordinates of any qualitative passive variables. 
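+# (Each passive category is placed at the weighted average of the standard
+# coordinates of the active categories it co-occurs with, then rescaled by
+# the inverse eigenvalues: 1/I for the indicator method, 1/sqrt(I) for the
+# Burt method.)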
+function quali_passive(ca::CA, passive; normalize = "principal") + + if size(passive, 1) == 0 + return + end + + if length(size(passive)) != 2 + error("passive variable array must be two-dimensional") + end + + (; X, GS) = ca + PX = Matrix(passive) + + if size(PX, 1) != size(X, 1) + @error("Passive data must have same leading axis length as active data.") + end + + M = hcat(X, PX) + B = M' * M + B ./= sum(B) + p = size(X, 2) + B = B[p+1:end, 1:p] + + PGS = B * GS + for k = 1:size(B, 1) + PGS[k, :] ./= sum(B[k, :]) + end + d = size(PGS, 2) + if ca.method == "burt" + PGS = PGS * Diagonal(1 ./ sqrt.(ca.I[1:d])) + elseif ca.method == "indicator" + PGS = PGS * Diagonal(1 ./ ca.I[1:d]) + else + error("Unknown method '$(ca.method)'") + end + + if normalize == "standard" + return PGS + elseif normalize == "principal" + return PGS * Diagonal(sqrt.(ca.I[1:d])) + else + error("Unknown normalization '$(normalize)'") + end - return CA(X, R, rm, cm, SR, F, G, I) + return PGS +end + +function object_coords(ca::CA; normalize = ca.normalize) + if normalize == "standard" + ca.FS + elseif normalize == "principal" + d = size(ca.FS, 2) + return ca.FS * Diagonal(sqrt.(ca.I[1:d])) + else + error("Unknown normalization '$(normalize)'") + end end -objectscores(ca::CA) = ca.F -variablescores(ca::CA) = ca.G inertia(ca::CA) = ca.I +function variable_coords(ca::CA; normalize = ca.normalize) + (; GS) = ca + + d = size(GS, 2) + if normalize == "standard" + return GS + elseif normalize == "principal" + return GS * Diagonal(sqrt.(ca.I[1:d])) + else + error("Unknown normalization '$(normalize)'") + end +end + """ Multiple Correspondence Analysis """ -struct MCA{T<:Real} <: LinearDimensionalityReduction +mutable struct MCA{T<:Real} <: LinearDimensionalityReduction # The underlying corresponence analysis C::CA{T} + # Indicator matrix + Inds::Matrix{Float64} + # Variable names vnames::Vector{String} @@ -117,11 +340,7 @@ struct MCA{T<:Real} <: LinearDimensionalityReduction rd::Vector{Dict} # Map integer positions to values - dr::Vector{Dict} - - # Split the variable scores into separate arrays for - # each variable. - Gv::Vector{Matrix{T}} + dr::Vector{Vector} # Number of nominal variables K::Int @@ -135,21 +354,52 @@ struct MCA{T<:Real} <: LinearDimensionalityReduction greenacre_eigs::Vector{Float64} end -objectscores(mca::MCA) = mca.C.F -variablescores(mca::MCA) = mca.Gv - -# Split the variable scores to a separate array for each -# variable. 
-function xsplit(G, rd) - K = [length(di) for di in rd] - Js = cumsum(K) - Js = vcat(1, 1 .+ Js) - Gv = Vector{Matrix{eltype(G)}}() - for j in eachindex(K) - g = G[Js[j]:Js[j+1]-1, :] - push!(Gv, g) +function expand_names(vnames, dr) + names, levels = [], [] + for (j, v) in enumerate(vnames) + u = dr[j] + for k in eachindex(u) + push!(names, v) + push!(levels, u[k]) + end end - return Gv + return (Variable = names, Level = levels) +end + +object_coords(mca::MCA; normalize = "principal") = + object_coords(mca.C, normalize = normalize) + +function variable_coords(mca::MCA; normalize = "principal") + (; C, vnames, dr) = mca + na = expand_names(vnames, dr) + G = variable_coords(C, normalize = normalize) + return (Variable = na.Variable, Level = na.Level, Coord = G) +end + +function show(io::IO, mca::MCA) + nobs, ninds = size(mca.C.X) + nvar = length(mca.vnames) + print(io, "MCA(nobs = $nobs, nvar = $nvar, ninds = $ninds)") +end + +function show(io::IO, ::MIME"text/plain", mca::MCA) + nobs, ninds = size(mca.C.X) + stats = ca_stats(mca.C) + nvar = length(mca.vnames) + println(io, "MCA(nobs = $nobs, nvar = $nvar, ninds = $ninds)") + print_inertia(io, mca.C.I) + vc = variable_coords(mca.C; normalize = "principal") + d = size(vc, 2) + en = expand_names(mca.vnames, mca.dr) + rn = ["$(a) $(b)" for (a, b) in zip(en.Variable, en.Level)] + cft = build_coeftable(mca.C, vc, stats, rn) + println(io, "\nVariable coordinates (principal normalization):") + print(io, cft) + print(io, "\n\n") +end + +function ca_stats(mca::MCA) + return ca_stats(mca.C) end # Calculate the eigenvalues with different debiasings. @@ -186,19 +436,26 @@ Fit a multiple correspondence analysis using the columns of `X` as the variables - Missing values are recoded as fractional indicators, i.e. if there are k distinct levels of a variable it is coded as 1/k, 1/k, ..., 1/k. """ -function fit(::Type{MCA}, X; d::Int=5, vnames=[]) +function fit( + ::Type{MCA}, + X; + d::Int = 5, + normalize = "standard", + method = "indicator", + vnames = [], +) if length(vnames) == 0 && typeof(X) <: AbstractDataFrame vnames = names(X) elseif length(vnames) == 0 - vnames = ["v$(j)" for j = 1:size(Z, 2)] + vnames = ["v$j" for j = 1:size(X, 2)] end # Get the indicator matrix XI, rd, dr = make_indicators(X) # Create the underlying correspondence analysis value - C = fit(CA, XI; d=d) + C = fit(CA, XI; d = d, normalize = normalize, method = method) # Number of nominal variables K = size(X, 2) @@ -206,19 +463,38 @@ function fit(::Type{MCA}, X; d::Int=5, vnames=[]) # Total number of categories in all variables J = size(XI, 2) - # Split the variable scores into separate arrays for each variable. - Gv = xsplit(C.G, rd) - una, ben, gra = get_eigs(C.I, J, K) - return MCA(C, vnames, rd, dr, Gv, K, J, una, ben, gra) + mca = MCA(C, XI, vnames, rd, dr, K, J, una, ben, gra) + + return mca +end + +function quali_passive(mca::MCA, passive; normalize = "principal") + (; C) = mca + if size(passive, 1) != size(C.X, 1) + error("Wrong number of rows in passive data array") + end + + PI, _, drp = make_indicators(passive) + r = quali_passive(C, PI; normalize = normalize) + + vnames = if typeof(passive) <: AbstractDataFrame + names(passive) + else + m = length(drp) + string.("p", 1:m) + end + + v = expand_names(vnames, drp) + return (Variable = v.Variable, Level = v.Level, Coord = r) end # Create an indicator matrix corresponding to the distinct # values in the vector 'z'. 
Also returns dictionaries mapping # the unique values to column offsets, and mapping the column # offsets to the unique values. -function make_single_indicator(z::Vector{T}) where{T} +function make_single_indicator(z::Vector{T}) where {T} n = length(z) @@ -232,9 +508,11 @@ function make_single_indicator(z::Vector{T}) where{T} # Recoding dictionary, maps each distinct value in z to # an offset rd = Dict{T,Int}() + rdi = [] for (j, v) in enumerate(uq) if !ismissing(v) rd[v] = j + push!(rdi, v) end end @@ -252,12 +530,6 @@ function make_single_indicator(z::Vector{T}) where{T} end end - # Reverse the recoding dictionary - rdi = Dict{Int,T}() - for (k, v) in rd - rdi[v] = k - end - return X, rd, rdi end @@ -267,17 +539,21 @@ end # to levels for each variable. function make_indicators(Z) - rd, rdr = Dict[], Dict[] + if size(Z, 1) == 0 + return zeros(0, 0), Dict[], Vector[] + end + + rd, rdi = Dict[], Vector[] XX = [] for j = 1:size(Z, 2) - X, di, dir = make_single_indicator(Z[:, j]) - push!(rd, di) - push!(rdr, dir) + X, dv, di = make_single_indicator(Z[:, j]) + push!(rd, dv) + push!(rdi, di) push!(XX, X) end I = hcat(XX...) - return I, rd, rdr + return I, rd, rdi end # Return a table summarizing the inertia. @@ -290,47 +566,3 @@ function inertia(mca::MCA) ) return inr end - -# Plot the category scores for components numbered 'x' and 'y'. Ordered factors -# are connected with line segments. -#function variable_plot(mca::MCA; x = 1, y = 2, vnames = [], ordered = [], kwargs...) - -# fig = PyPlot.figure() -# ax = fig.add_axes([0.1, 0.1, 0.8, 0.8]) -# ax.grid(true) - -# Set up the colormap -# cm = get(kwargs, :cmap, PyPlot.get_cmap("tab10")) - -# Set up the axis limits -# mn = 1.2 * minimum(mca.C.G, dims = 1) -# mx = 1.2 * maximum(mca.C.G, dims = 1) -# xlim = get(kwargs, :xlim, [mn[x], mx[x]]) -# ylim = get(kwargs, :ylim, [mn[y], mx[y]]) -# ax.set_xlim(xlim...) -# ax.set_ylim(ylim...) - -# for (j, g) in enumerate(mca.Gv) - -# if mca.vnames[j] in ordered -# PyPlot.plot(g[:, x], g[:, y], "-", color = cm(j)) -# end - -# dr = mca.dr[j] -# vn = length(vnames) > 0 ? vnames[j] : "" -# for (k, v) in dr -# if vn != "" -# lb = "$(vn)-$(v)" -# else -# lb = v -# end -# ax.text(g[k, x], g[k, y], lb, color = cm(j), ha = "center", va = "center") -# end -# end - -# inr = inertia(mca) -# PyPlot.xlabel(@sprintf("Dimension %d (%.2f%%)", x, 100 * inr[x, :Greenacre])) -# PyPlot.ylabel(@sprintf("Dimension %d (%.2f%%)", y, 100 * inr[y, :Greenacre])) - -# return fig -#end diff --git a/test/mca.jl b/test/mca.jl index dc5209e..196fe5b 100644 --- a/test/mca.jl +++ b/test/mca.jl @@ -1,51 +1,201 @@ using MultivariateStats, DataFrames, Test -@testset "Compare MCA to R FactoMiner" begin - +function mca_simple_data() + # Active variables da = DataFrame( :V1 => ["A", "A", "A", "A", "B", "B", "B", "B"], :V2 => ["X", "X", "Y", "X", "X", "Y", "X", "X"], :V3 => ["D", "D", "D", "C", "D", "C", "D", "C"], ) - m = fit(MCA, da; d=3, vnames = names(da)) - F = objectscores(m) - G = variablescores(m.C) + # Passive variables + dp = DataFrame( + :V4 => [1, 2, 2, 2, 1, 1, 2, 1], + :V5 => [2, 1, 2, 2, 1, 2, 1, 2], + ) + + return da, dp +end + +# These values come from Stata and/or R FactoMineR. 
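+# (Note: the Burt-method eigenvalues below are the squares of the
+# indicator-method eigenvalues, e.g. 0.4327141^2 ≈ 0.18724152.)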
+function mca_simple_burt_correct()
+
+    # Eigenvalues
+    eig = [0.18724152, 0.11111111, 0.05473379]
+
+    # Variable coordinates in standard normalization
+    c1 = [1.0606602, -1.0606602, 0.35355339, -1.0606602, -1.5811388, 0.9486833]
+    c2 = [0.8660254, -0.8660254, -0.8660254, 2.5980762, 0, 0]
+    variable_standard = hcat(c1, c2)
+
+    # Variable coordinates in principal normalization
+    c1 = [0.45896265, -0.45896265, 0.15298755, -0.45896265, -0.68418112, 0.41050867]
+    c2 = [0.28867513, -0.28867513, -0.28867513, 0.8660254, 0, 0]
+    variable_principal = hcat(c1, c2)
+
+    # Passive variable coordinates in standard normalization
+    c1 = [-0.65213019, 0.65213019, 0.73080064, -0.43848039]
+    c2 = [-0.4330127, 0.4330127, -1.1547005, 0.69282032]
+    passive_standard = hcat(c1, c2)
+
+    # Passive variable coordinates in principal normalization
+    c1 = [-0.28218595, 0.28218595, 0.31622777, -0.18973666]
+    c2 = [-0.14433757, 0.14433757, -0.38490018, 0.23094011]
+    passive_principal = hcat(c1, c2)
+
+    # Relative column contributions
+    c1 = [0.187, 0.187, 0.031, 0.094, 0.312, 0.188]
+    c2 = [0.125, 0.125, 0.188, 0.562, 0, 0]
+    contrib = hcat(c1, c2)
+
+    return (eig = eig,
+            variable_principal = variable_principal,
+            variable_standard = variable_standard,
+            passive_principal = passive_principal,
+            passive_standard = passive_standard,
+            contrib = contrib)
+end
+
+
+# These values come from Stata and/or R FactoMineR.
+function mca_simple_indicator_correct()
+
+    # Eigenvalues in principal normalization
+    eig = [0.4327141, 0.3333333, 0.2339525]
+
+    # Object coordinates in principal normalization
+    c1 = [
+        7.876323e-01,
+        7.876323e-01,
+        0.3162278,
+        -5.564176e-02,
+        0.08052551,
+        -1.2341531,
+        0.08052551,
+        -0.7627485,
+    ]
+    c2 = [0, 0, 1.1547005, 0, -0.57735027, 0.5773503, -0.57735027, -0.5773503]
+    object_principal = hcat(c1, c2)
+
+    # Variable coordinates in principal normalization
+    c1 = [0.6977130, -0.6977130, 0.232571, -0.6977130, -1.040089e+00, 6.240535e-01]
+    c2 = [0.5000000, -0.5000000, -0.500000, 1.5000000, 0, 0]
+    variable_principal = hcat(c1, c2)
+
+    # Variable coordinates in standard normalization
+    c1 = [1.0606602, -1.0606602, 0.35355339, -1.0606602, -1.5811388, 0.9486833]
+    c2 = [0.8660254, -0.8660254, -0.8660254, 2.5980762, 0, 0]
+    variable_standard = hcat(c1, c2)
+
+    # Passive variables in standard normalization
+    c1 = [-0.65213019, 0.65213019, 0.73080064, -0.43848039]
+    c2 = [-0.4330127, 0.4330127, -1.1547005, 0.69282032]
+    passive_standard = hcat(c1, c2)
+
+    # Passive variables in principal normalization
+    c1 = [-0.42897783, 0.42897783, 0.48072805, -0.28843683]
+    c2 = [-0.25, 0.25, -0.66666667, 0.4]
+    passive_principal = hcat(c1, c2)
+
+    # Squared column correlations
+    c1 = [0.487, 0.487, 0.162, 0.162, 0.649, 0.649]
+    c2 = [0.250, 0.250, 0.750, 0.750, 0, 0]
+    sqcorr = hcat(c1, c2)
+
+    # Relative column 
+# These values come from Stata and/or R FactoMineR.
+function mca_simple_indicator_correct()
+
+    # Eigenvalues in principal normalization
+    eig = [0.4327141, 0.3333333, 0.2339525]
+
-    # Object coordinates
+    # Object coordinates in principal normalization
     c1 = [
-        -7.876323e-01,
-        -7.876323e-01,
-        -0.3162278,
-        5.564176e-02,
-        -0.08052551,
-        1.2341531,
-        -0.08052551,
-        0.7627485,
+        7.876323e-01,
+        7.876323e-01,
+        0.3162278,
+        -5.564176e-02,
+        0.08052551,
+        -1.2341531,
+        0.08052551,
+        -0.7627485,
     ]
     c2 = [0, 0, 1.1547005, 0, -0.57735027, 0.5773503, -0.57735027, -0.5773503]
-    c3 = [
-        1.551768e-01,
-        1.551768e-01,
-        -0.3162278,
-        9.984508e-01,
-        -0.55193003,
-        -0.1800605,
-        -0.55193003,
-        0.2913440,
-    ]
-    coord = hcat(c1, c2, c3)
-    @test isapprox(coord, F, atol = 1e-6, rtol = 1e-6)
+    object_principal = hcat(c1, c2)
 
-    # Variable coordinates
-    c1 = [-0.6977130, 0.6977130, -0.232571, 0.6977130, 1.040089e+00, -6.240535e-01]
+    # Variable coordinates in principal normalization
+    c1 = [0.6977130, -0.6977130, 0.232571, -0.6977130, -1.040089e+00, 6.240535e-01]
     c2 = [0.5000000, -0.5000000, -0.500000, 1.5000000, 0, 0]
-    c3 = [0.5130269, -0.5130269, 0.171009, -0.5130269, 7.647753e-01, -4.588652e-01]
-    coord = hcat(c1, c2, c3)
-    @test isapprox(coord, G, atol = 1e-6, rtol = 1e-6)
+    variable_principal = hcat(c1, c2)
+
+    # Variable coordinates in standard normalization
+    c1 = [1.0606602, -1.0606602, 0.35355339, -1.0606602, -1.5811388, 0.9486833]
+    c2 = [0.8660254, -0.8660254, -0.8660254, 2.5980762, 0, 0]
+    variable_standard = hcat(c1, c2)
+
+    # Passive variables in standard normalization
+    c1 = [-0.65213019, 0.65213019, 0.73080064, -0.43848039]
+    c2 = [-0.4330127, 0.4330127, -1.1547005, 0.69282032]
+    passive_standard = hcat(c1, c2)
+
+    # Passive variables in principal normalization
+    c1 = [-0.42897783, 0.42897783, 0.48072805, -0.28843683]
+    c2 = [-0.25, 0.25, -0.66666667, 0.4]
+    passive_principal = hcat(c1, c2)
+
+    # Squared column correlations
+    c1 = [0.487, 0.487, 0.162, 0.162, 0.649, 0.649]
+    c2 = [0.250, 0.250, 0.750, 0.750, 0, 0]
+    sqcorr = hcat(c1, c2)
+
+    # Relative column contributions
+    c1 = [0.123, 0.123, 0.021, 0.062, 0.206, 0.123]
+    c2 = [0.072, 0.072, 0.108, 0.325, 0, 0]
+    contrib = hcat(c1, c2)
+
+    return (eig = eig, object_principal = object_principal,
+            variable_principal = variable_principal,
+            variable_standard = variable_standard,
+            passive_standard = passive_standard,
+            passive_principal = passive_principal,
+            sqcorr = sqcorr, contrib = contrib)
+end
+
+@testset "Compare MCA to R FactoMineR and Stata using Burt method" begin
+
+    da, dp = mca_simple_data()
+    xc = mca_simple_burt_correct()
+
+    mp = fit(MCA, da; d=2, normalize = "principal", method = "burt")
+    ms = fit(MCA, da; d=2, normalize = "standard", method = "burt")
+
+    GPP = quali_passive(mp, dp; normalize = "principal")
+    GPS = quali_passive(ms, dp; normalize = "standard")
+
+    show(devnull, MIME("text/plain"), mp)
+    show(devnull, MIME("text/plain"), ms)
+
+    # Eigenvalues
+    eig = inertia(mp).Raw[1:3]
+    @test isapprox(eig, xc.eig, rtol = 1e-6, atol = 1e-6)
+
+    # Variable coordinates in principal normalization
+    GP = variable_coords(mp.C)
+    @test isapprox(GP, xc.variable_principal, atol = 1e-6, rtol = 1e-6)
+
+    # Variable coordinates in standard normalization
+    GS = variable_coords(ms.C)
+    @test isapprox(GS, xc.variable_standard, atol = 1e-6, rtol = 1e-6)
+
+    # Passive variable coordinates in principal normalization
+    #GPP = variable_coords(mp.C; type = "passive")
+    @test isapprox(GPP.Coord, xc.passive_principal, atol = 1e-6, rtol = 1e-6)
+
+    # Passive variable coordinates in standard normalization
+    #GPS = variable_coords(ms.C; type = "passive")
+    @test isapprox(GPS.Coord, xc.passive_standard, atol = 1e-6, rtol = 1e-6)
+
+    for m in [mp, ms]
+        stat = ca_stats(m.C)
+        @test isapprox(stat.relcontrib_col, xc.contrib, atol = 1e-2, rtol = 1e-2)
+    end
+end
+
+
+@testset "Compare MCA to R FactoMineR and Stata using indicator method" begin
+
+    da, dp = mca_simple_data()
+    xc = mca_simple_indicator_correct()
+
+    mp = fit(MCA, da; d=2, normalize = "principal", method = "indicator")
+    ms = fit(MCA, da; d=2, normalize = "standard", method = "indicator")
+
+    GPP = quali_passive(mp, dp, normalize = "principal")
+    GPS = quali_passive(ms, dp, normalize = "standard")
+
+    show(devnull, MIME("text/plain"), mp)
+    show(devnull, MIME("text/plain"), ms)
+
+    # Eigenvalues
+    eig = inertia(mp).Raw[1:3]
+    @test isapprox(eig, xc.eig, rtol = 1e-6, atol = 1e-6)
+
+    # Variable coordinates in principal normalization
+    G = variable_coords(mp.C; normalize = "principal")
+    @test isapprox(G, xc.variable_principal, atol = 1e-6, rtol = 1e-6)
+
+    # Variable coordinates in standard normalization
+    GS = variable_coords(ms.C, normalize = "standard")
+    @test isapprox(GS, xc.variable_standard, atol = 1e-6, rtol = 1e-6)
+
+    # Object coordinates in principal normalization
+    F = object_coords(mp; normalize = "principal")
+    @test isapprox(F, xc.object_principal, atol = 1e-6, rtol = 1e-6)
+
+    # Passive variable coordinates in principal normalization
+    @test isapprox(GPP.Coord, xc.passive_principal, atol = 1e-6, rtol = 1e-6)
+
+    # Passive variable coordinates in standard normalization
+    @test isapprox(GPS.Coord, xc.passive_standard, atol = 1e-6, rtol = 1e-6)
+
+    # Fit statistics
+    for m in [mp, ms]
+        stat = ca_stats(m.C)
+        @test isapprox(stat.sqcorr_col, xc.sqcorr, atol = 1e-3, rtol = 1e-3)
+        @test isapprox(stat.relcontrib_col, xc.contrib, atol = 1e-2, rtol = 1e-2)
+    end
 end
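# The two normalizations exercised above differ only by a per-dimension
# scaling (a sketch using the fixture values): principal coordinates are
# standard coordinates times the singular value, i.e. the square root of the
# principal inertia, of each dimension.
standard = [1.0606602, 0.8660254]       # first category, dimensions 1 and 2
lambda = [0.4327141, 0.3333333]         # indicator-method principal inertias
principal = standard .* sqrt.(lambda)   # ≈ [0.6977130, 0.5000000], as in the fixtures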
"ppca", "kpca", - "fa" + "fa", + "mca", ] for test in tests From be7bf00164ee7208532ff78a9c84dcd177ac83bd Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Mon, 12 Dec 2022 23:51:40 -0500 Subject: [PATCH 07/11] Remove unsupported destructuring syntax --- src/mca.jl | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/src/mca.jl b/src/mca.jl index 06df67f..2b32d65 100644 --- a/src/mca.jl +++ b/src/mca.jl @@ -259,7 +259,10 @@ function quali_passive(ca::CA, passive; normalize = "principal") error("passive variable array must be two-dimensional") end - (; X, GS) = ca + #(; X, GS) = ca + X = ca.X + GS = ca.GS + PX = Matrix(passive) if size(PX, 1) != size(X, 1) @@ -310,7 +313,8 @@ end inertia(ca::CA) = ca.I function variable_coords(ca::CA; normalize = ca.normalize) - (; GS) = ca + #(; GS) = ca + GS = ca.GS d = size(GS, 2) if normalize == "standard" @@ -370,7 +374,11 @@ object_coords(mca::MCA; normalize = "principal") = object_coords(mca.C, normalize = normalize) function variable_coords(mca::MCA; normalize = "principal") - (; C, vnames, dr) = mca + #(; C, vnames, dr) = mca + C = mca.C + vnames = mca.vnames + dr = mca.dr + na = expand_names(vnames, dr) G = variable_coords(C, normalize = normalize) return (Variable = na.Variable, Level = na.Level, Coord = G) @@ -471,7 +479,8 @@ function fit( end function quali_passive(mca::MCA, passive; normalize = "principal") - (; C) = mca + #(; C) = mca + C = mca.C if size(passive, 1) != size(C.X, 1) error("Wrong number of rows in passive data array") end From 7dbd4f75847e6c2f2054e56cd0970e14aedf33ee Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Tue, 17 Jan 2023 10:12:35 -0500 Subject: [PATCH 08/11] Add information about names/levels to MCA struct --- src/mca.jl | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/mca.jl b/src/mca.jl index 2b32d65..88dc34f 100644 --- a/src/mca.jl +++ b/src/mca.jl @@ -340,6 +340,9 @@ mutable struct MCA{T<:Real} <: LinearDimensionalityReduction # Variable names vnames::Vector{String} + # Variable+level names + names::NamedTuple + # Map values to integer positions rd::Vector{Dict} @@ -473,7 +476,9 @@ function fit( una, ben, gra = get_eigs(C.I, J, K) - mca = MCA(C, XI, vnames, rd, dr, K, J, una, ben, gra) + na = expand_names(vnames, dr) + + mca = MCA(C, XI, vnames, na, rd, dr, K, J, una, ben, gra) return mca end From 5a192dce9117fef961da1fc9e9b49d5051b7c41c Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Sun, 5 Feb 2023 15:01:48 -0500 Subject: [PATCH 09/11] Tolerate more types in one-hot coding --- src/mca.jl | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/mca.jl b/src/mca.jl index 88dc34f..a997709 100644 --- a/src/mca.jl +++ b/src/mca.jl @@ -463,7 +463,7 @@ function fit( end # Get the indicator matrix - XI, rd, dr = make_indicators(X) + XI, rd, dr = make_indicators(X, "active") # Create the underlying correspondence analysis value C = fit(CA, XI; d = d, normalize = normalize, method = method) @@ -490,7 +490,7 @@ function quali_passive(mca::MCA, passive; normalize = "principal") error("Wrong number of rows in passive data array") end - PI, _, drp = make_indicators(passive) + PI, _, drp = make_indicators(passive, "passive") r = quali_passive(C, PI; normalize = normalize) vnames = if typeof(passive) <: AbstractDataFrame @@ -508,19 +508,21 @@ end # values in the vector 'z'. Also returns dictionaries mapping # the unique values to column offsets, and mapping the column # offsets to the unique values. 
From 7dbd4f75847e6c2f2054e56cd0970e14aedf33ee Mon Sep 17 00:00:00 2001
From: Kerby Shedden
Date: Tue, 17 Jan 2023 10:12:35 -0500
Subject: [PATCH 08/11] Add information about names/levels to MCA struct

---
 src/mca.jl | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/mca.jl b/src/mca.jl
index 2b32d65..88dc34f 100644
--- a/src/mca.jl
+++ b/src/mca.jl
@@ -340,6 +340,9 @@ mutable struct MCA{T<:Real} <: LinearDimensionalityReduction
     # Variable names
     vnames::Vector{String}
 
+    # Variable+level names
+    names::NamedTuple
+
     # Map values to integer positions
     rd::Vector{Dict}
 
@@ -473,7 +476,9 @@ function fit(
 
     una, ben, gra = get_eigs(C.I, J, K)
 
-    mca = MCA(C, XI, vnames, rd, dr, K, J, una, ben, gra)
+    na = expand_names(vnames, dr)
+
+    mca = MCA(C, XI, vnames, na, rd, dr, K, J, una, ben, gra)
 
     return mca
 end

From 5a192dce9117fef961da1fc9e9b49d5051b7c41c Mon Sep 17 00:00:00 2001
From: Kerby Shedden
Date: Sun, 5 Feb 2023 15:01:48 -0500
Subject: [PATCH 09/11] Tolerate more types in one-hot coding

---
 src/mca.jl | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/src/mca.jl b/src/mca.jl
index 88dc34f..a997709 100644
--- a/src/mca.jl
+++ b/src/mca.jl
@@ -463,7 +463,7 @@ function fit(
     end
 
     # Get the indicator matrix
-    XI, rd, dr = make_indicators(X)
+    XI, rd, dr = make_indicators(X, "active")
 
     # Create the underlying correspondence analysis value
     C = fit(CA, XI; d = d, normalize = normalize, method = method)
@@ -490,7 +490,7 @@ function quali_passive(mca::MCA, passive; normalize = "principal")
         error("Wrong number of rows in passive data array")
     end
 
-    PI, _, drp = make_indicators(passive)
+    PI, _, drp = make_indicators(passive, "passive")
 
     r = quali_passive(C, PI; normalize = normalize)
 
     vnames = if typeof(passive) <: AbstractDataFrame
@@ -508,19 +508,21 @@ end
 # Create an indicator matrix corresponding to the distinct
 # values in the vector 'z'. Also returns dictionaries mapping
 # the unique values to column offsets, and mapping the column
 # offsets to the unique values.
-function make_single_indicator(z::Vector{T}) where {T}
+function make_single_indicator(z::AbstractVector, vtype::String, pos::Int)
 
     n = length(z)
 
     # Unique values of the variable
     uq = sort(unique(z))
 
-    if length(uq) > 50
-        @warn("Nominal variable has more than 50 levels")
+    # This situation usually results from user error, so warn.
+    if length(uq) > 20
+        @warn("$(titlecase(vtype)) variable in column $(pos) has more than 20 levels")
     end
 
     # Recoding dictionary, maps each distinct value in z to
     # an offset
+    T = eltype(z)
     rd = Dict{T,Int}()
     rdi = []
     for (j, v) in enumerate(uq)
@@ -551,7 +553,7 @@ end
 # In addition to the indicator matrix, return vectors of
 # dictionaries mapping levels to positions and positions
 # to levels for each variable.
-function make_indicators(Z)
+function make_indicators(Z, vtype::String)
 
     if size(Z, 1) == 0
         return zeros(0, 0), Dict[], Vector[]
     end
 
     rd, rdi = Dict[], Vector[]
     XX = []
     for j = 1:size(Z, 2)
-        X, dv, di = make_single_indicator(Z[:, j])
+        X, dv, di = make_single_indicator(Z[:, j], vtype, j)
         push!(rd, dv)
         push!(rdi, di)
         push!(XX, X)

From f4c5d3e0895c6cf4bfb868df1d6999bfe219059f Mon Sep 17 00:00:00 2001
From: kshedden
Date: Tue, 3 Sep 2024 20:36:11 -0400
Subject: [PATCH 10/11] Fix project.toml

---
 Project.toml | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/Project.toml b/Project.toml
index f07ec20..a1bd61f 100644
--- a/Project.toml
+++ b/Project.toml
@@ -8,7 +8,7 @@ version = "0.10.3"
 
 [deps]
 Arpack = "7d9fca2a-8960-54d3-9f78-7d1dccf2cb97"
-DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
+Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 
 [compat]
 Arpack = "0.3, 0.4, 0.5"
+Distributions = "0.25"
+StableRNGs = "1"
+Statistics = "1"
 StatsAPI = "^1.3"
-StatsBase = "^0.33"
+StatsBase = "^0.33, 0.34"
 julia = "1.1"
 
 [extras]
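# The Distributions dependency added above supports the F-based hypothesis
# tests exported in the next patch. A sketch of the pattern (the statistic
# and degrees of freedom here are illustrative only):
using Distributions: cdf, FDist
fstat, df1, df2 = 4.2, 3.0, 20.0
p = 1 - cdf(FDist(df1, df2), fstat)   # right-tail p-value for an F statistic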
From febc790c9f0c55498f39c75edd81b332c2c71a0e Mon Sep 17 00:00:00 2001
From: kshedden
Date: Tue, 3 Sep 2024 20:37:10 -0400
Subject: [PATCH 11/11] fix multivariatestats.jl

---
 src/MultivariateStats.jl | 24 ++++++++----------------
 1 file changed, 8 insertions(+), 16 deletions(-)

diff --git a/src/MultivariateStats.jl b/src/MultivariateStats.jl
index 0528b12..13d33f6 100644
--- a/src/MultivariateStats.jl
+++ b/src/MultivariateStats.jl
@@ -1,16 +1,16 @@
 module MultivariateStats
 
-    using DataFrames
     using LinearAlgebra
     using SparseArrays
     using Statistics: middle
-    using StatsAPI: RegressionModel
+    using Distributions: cdf, FDist
+    using StatsAPI: RegressionModel, HypothesisTest
     using StatsBase: SimpleCovariance, CovarianceEstimator, AbstractDataTransform,
                      ConvergenceException, pairwise, pairwise!, CoefTable
 
     import Statistics: mean, var, cov, covm, cor
     import Base: length, size, show
-    import StatsAPI: fit, predict, coef, weights, dof, r2
+    import StatsAPI: fit, predict, coef, weights, dof, r2, pvalue
     import LinearAlgebra: eigvals, eigvecs
 
     export
@@ -67,7 +67,10 @@ module MultivariateStats
         KernelPCA,           # Type: the Kernel PCA model
 
         ## cca
-        CCA,                 # Type: Correlation Component Analysis model
+        CCA,                 # Type: Correlation Component Analysis model
+        WilksLambdaTest,     # Wilks lambda statistics and tests
+        PillaiTraceTest,     # Pillai trace statistics and tests
+        LawleyHotellingTest, # Lawley-Hotelling statistics and tests
         ccacov,              # CCA based on covariances
         ccasvd,              # CCA based on singular value decomposition of input data
@@ -107,17 +110,7 @@ module MultivariateStats
         FactorAnalysis,      # Type: the Factor Analysis model
 
         faem,                # EM algorithm for factor analysis
-        facm,                # CM algorithm for factor analysis
-
-        ## ca, mca
-        CA,                  # Type: correspondence analysis
-
-        MCA,                 # Type: multiple correspondence analysis
-        object_coords,       # return the object scores or coordinates from CA or MCA
-        variable_coords,     # return the variable/category scores or coordinates from CA or MCA
-        inertia,             # return the inertia (derived from eigenvalues) for CA
-        ca_stats,            # fit statistics
-        quali_passive        # handle qualitative passive variables
+        facm                 # CM algorithm for factor analysis
 
     ## source files
     include("types.jl")
@@ -133,7 +126,6 @@ module MultivariateStats
     include("lda.jl")
     include("ica.jl")
     include("fa.jl")
-    include("mca.jl")
 
     ## deprecations
     @deprecate indim(f) size(f,1)