From c04fc79a6ed1ca5814963319c678c37351bb912f Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Mon, 5 Sep 2022 23:57:43 -0400 Subject: [PATCH 01/11] Add file --- src/MultivariateStats.jl | 10 +- src/mca.jl | 274 +++++++++++++++++++++++++++++++++++++++ test/mca.jl | 51 ++++++++ 3 files changed, 334 insertions(+), 1 deletion(-) create mode 100644 src/mca.jl create mode 100644 test/mca.jl diff --git a/src/MultivariateStats.jl b/src/MultivariateStats.jl index 13d33f6..1afc7e9 100644 --- a/src/MultivariateStats.jl +++ b/src/MultivariateStats.jl @@ -110,7 +110,14 @@ module MultivariateStats FactorAnalysis, # Type: the Factor Analysis model faem, # EM algorithm for factor analysis - facm # CM algorithm for factor analysis + facm, # CM algorithm for factor analysis + + ## CA, MCA + CA, + MCA, + objectscores, + variablescores, + inertia ## source files include("types.jl") @@ -126,6 +133,7 @@ module MultivariateStats include("lda.jl") include("ica.jl") include("fa.jl") + include("mca.jl") ## deprecations @deprecate indim(f) size(f,1) diff --git a/src/mca.jl b/src/mca.jl new file mode 100644 index 0000000..d9cdd4b --- /dev/null +++ b/src/mca.jl @@ -0,0 +1,274 @@ +# Correspondence Analysis and Multiple Correspondence Analysis + +using Printf, PyPlot + +#== +References: +https://personal.utdallas.edu/~herve/Abdi-MCA2007-pretty.pdf +https://www.stata.com/manuals/mvmca.pdf +https://www.stata.com/manuals/mvca.pdf +https://en.wikipedia.org/wiki/Multiple_correspondence_analysis +https://pca4ds.github.io +https://maths.cnam.fr/IMG/pdf/ClassMCA_cle825cfc.pdf +==# + +""" +Correspondence Analysis +""" +struct CA{T<:Real} <: LinearDimensionalityReduction + + # The data matrix + Z::Array{T} + + # The residuals + R::Array{T} + + # Row and column masses (means) + rm::Vector{T} + cm::Vector{T} + + # The standardized residuals + SR::Array{T} + + # Object scores + F::Array{T} + + # Variable scores + G::Array{T} + + # Inertia (eigenvalues of the indicator matrix) + I::Vector{T} +end + +function CA(X, d::Int) + + # Convert to proportions + X = X ./ sum(X) + + # Calculate row and column means + r = sum(X, dims = 2)[:] + c = sum(X, dims = 1)[:] + + # Center the data matrix to create residuals + R = X - r * c' + + # Standardize the data matrix to create standardized residuals + Wr = Diagonal(sqrt.(r)) + Wc = Diagonal(sqrt.(c)) + SR = Wr \ R / Wc + + # Get the object factor scores (F) and variable factor scores (G). + P, D, Q = svd(SR) + Dq = Diagonal(D)[:, 1:d] + F = Wr \ P * Dq + G = Wc \ Q * Dq + I = D .^ 2 + + return CA(X, R, r, c, SR, F, G, I) +end + +objectscores(ca::CA) = ca.F +variablescores(ca::CA) = ca.G +inertia(ca::CA) = ca.I + +""" +Multiple Correspondence Analysis +""" +struct MCA{T<:Real} <: LinearDimensionalityReduction + + # The underlying corresponence analysis + C::CA{T} + + # Variable names + vnames::Vector{String} + + # Map values to integer positions + rd::Vector{Dict} + + # Map integer positions to values + dr::Vector{Dict} + + # Split the variable scores into separate arrays for + # each variable. + Gv::Vector{Matrix{T}} + + # Eigenvalues + unadjusted_eigs::Vector{Float64} + benzecri_eigs::Vector{Float64} + greenacre_eigs::Vector{Float64} +end + +objectscores(mca::MCA) = mca.C.F +variablescores(mca::MCA) = mca.Gv + +# Split the variable scores to a separate array for each +# variable. 
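+# A minimal sketch of the intended behavior (hypothetical shapes): with two
+# variables having 2 and 3 levels, rd holds level maps of sizes 2 and 3, G
+# has 5 rows, and xsplit(G, rd) returns [G[1:2, :], G[3:5, :]].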
+function xsplit(G, rd) + K = [length(di) for di in rd] + Js = cumsum(K) + Js = vcat(1, 1 .+ Js) + Gv = Vector{Matrix{eltype(G)}}() + for j in eachindex(K) + g = G[Js[j]:Js[j+1]-1, :] + push!(Gv, g) + end + return Gv +end + +# Calculate the eigenvalues with different debiasings. +function get_eigs(I, K, J) + ben = zeros(length(I)) + gra = zeros(length(I)) + Ki = 1 / K + f = K / (K - 1) + for i in eachindex(I) + if I[i] > Ki + ben[i] = (f * (I[i] - Ki))^2 + end + end + + unadjusted = I ./ sum(I) + gt = f * (sum(abs2, I) - (J - K) / K^2) + + return unadjusted, ben ./ sum(ben), ben ./ gt +end + +function MCA(Z, d::Int; vnames = []) + + if length(vnames) == 0 + vnames = ["v$(j)" for j = 1:size(Z, 2)] + end + + # Get the indicator matrix + X, rd, dr = make_indicators(Z) + + C = CA(X, d) + + # Split the variable scores into separate arrays for each variable. + Gv = xsplit(C.G, rd) + + una, ben, gra = get_eigs(C.I, size(Z, 2), size(X, 2)) + + return MCA(C, vnames, rd, dr, Gv, una, ben, gra) +end + +# Create an indicator matrix corresponding to the distinct +# values in z. Also returns dictionaries mapping the unique +# values to column offsets, and mapping the column offsets +# to the unique values. +function make_single_indicator(z) + + n = length(z) + + # Unique values of the variable + uq = sort(unique(z)) + + if length(uq) > 50 + @warn("Nominal variable has more than 50 levels") + end + + # Recoding dictionary, maps each distinct value in z to + # an offset + rd = Dict{eltype(z),Int}() + for (j, v) in enumerate(uq) + if !ismissing(v) + rd[v] = j + end + end + + # Number of unique values of the variable excluding missing + m = length(rd) + + # The indicator matrix + X = zeros(n, m) + for (i, v) in enumerate(z) + if ismissing(v) + # Missing values are treated as uniform across the levels. + X[i, :] .= 1 / m + else + X[i, rd[v]] = 1 + end + end + + # Reverse the recoding dictionary + rdi = Dict{Int,eltype(z)}() + for (k, v) in rd + rdi[v] = k + end + + return X, rd, rdi +end + +# Create an indicator matrix for the nominal data matrix Z. +# In addition to the indicator matrix, return vectors of +# dictionaries mapping levels to positions and positions +# to levels for each variable. +function make_indicators(Z) + + rd, rdr = Dict[], Dict[] + XX = [] + for j = 1:size(Z, 2) + X, di, dir = make_single_indicator(Z[:, j]) + push!(rd, di) + push!(rdr, dir) + push!(XX, X) + end + I = hcat(XX...) + + return I, rd, rdr +end + +# Return a table summarizing the inertia. +function inertia(mca::MCA) + inr = ( + Raw = mca.C.I, + Unadjusted = mca.unadjusted_eigs, + Benzecri = mca.benzecri_eigs, + Greenacre = mca.greenacre_eigs, + ) + return inr +end + +# Plot the category scores for components numbered 'x' and 'y'. Ordered factors +# are connected with line segments. +function variable_plot(mca::MCA; x = 1, y = 2, vnames = [], ordered = [], kwargs...) + + fig = PyPlot.figure() + ax = fig.add_axes([0.1, 0.1, 0.8, 0.8]) + ax.grid(true) + + # Set up the colormap + cm = get(kwargs, :cmap, PyPlot.get_cmap("tab10")) + + # Set up the axis limits + mn = 1.2 * minimum(mca.C.G, dims = 1) + mx = 1.2 * maximum(mca.C.G, dims = 1) + xlim = get(kwargs, :xlim, [mn[x], mx[x]]) + ylim = get(kwargs, :ylim, [mn[y], mx[y]]) + ax.set_xlim(xlim...) + ax.set_ylim(ylim...) + + for (j, g) in enumerate(mca.Gv) + + if mca.vnames[j] in ordered + PyPlot.plot(g[:, x], g[:, y], "-", color = cm(j)) + end + + dr = mca.dr[j] + vn = length(vnames) > 0 ? 
vnames[j] : "" + for (k, v) in dr + if vn != "" + lb = "$(vn)-$(v)" + else + lb = v + end + ax.text(g[k, x], g[k, y], lb, color = cm(j), ha = "center", va = "center") + end + end + + inr = inertia(mca) + PyPlot.xlabel(@sprintf("Dimension %d (%.2f%%)", x, 100 * inr[x, :Greenacre])) + PyPlot.ylabel(@sprintf("Dimension %d (%.2f%%)", y, 100 * inr[y, :Greenacre])) + + return fig +end diff --git a/test/mca.jl b/test/mca.jl new file mode 100644 index 0000000..9706972 --- /dev/null +++ b/test/mca.jl @@ -0,0 +1,51 @@ +using MultivariateStats, DataFrames, Test + +@testset "Compare MCA to R FactoMiner" begin + + da = DataFrame( + :V1 => ["A", "A", "A", "A", "B", "B", "B", "B"], + :V2 => ["X", "X", "Y", "X", "X", "Y", "X", "X"], + :V3 => ["D", "D", "D", "C", "D", "C", "D", "C"], + ) + + m = MCA(da, 3; vnames = names(da)) + F = objectscores(m) + G = variablescores(m.C) + + # Eigenvalues + eig = inertia(m).Raw[1:3] + @test isapprox(eig, [0.4327141, 0.3333333, 0.2339525], rtol = 1e-6, atol = 1e-6) + + # Object coordinates + c1 = [ + -7.876323e-01, + -7.876323e-01, + -0.3162278, + 5.564176e-02, + -0.08052551, + 1.2341531, + -0.08052551, + 0.7627485, + ] + c2 = [0, 0, 1.1547005, 0, -0.57735027, 0.5773503, -0.57735027, -0.5773503] + c3 = [ + 1.551768e-01, + 1.551768e-01, + -0.3162278, + 9.984508e-01, + -0.55193003, + -0.1800605, + -0.55193003, + 0.2913440, + ] + coord = hcat(c1, c2, c3) + @test isapprox(coord, F, atol = 1e-6, rtol = 1e-6) + + # Variable coordinates + c1 = [-0.6977130, 0.6977130, -0.232571, 0.6977130, 1.040089e+00, -6.240535e-01] + c2 = [0.5000000, -0.5000000, -0.500000, 1.5000000, 0, 0] + c3 = [0.5130269, -0.5130269, 0.171009, -0.5130269, 7.647753e-01, -4.588652e-01] + coord = hcat(c1, c2, c3) + @test isapprox(coord, G, atol = 1e-6, rtol = 1e-6) + +end From 1070ba38aa48f0785f597e652fbd8132c498f947 Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Tue, 6 Sep 2022 09:28:33 -0400 Subject: [PATCH 02/11] Don't include plotting code for now --- src/mca.jl | 75 +++++++++++++++++++++++++++--------------------------- 1 file changed, 38 insertions(+), 37 deletions(-) diff --git a/src/mca.jl b/src/mca.jl index d9cdd4b..c6a460a 100644 --- a/src/mca.jl +++ b/src/mca.jl @@ -1,6 +1,7 @@ # Correspondence Analysis and Multiple Correspondence Analysis -using Printf, PyPlot +# Needed for the plotting function below +#using Printf, PyPlot #== References: @@ -231,44 +232,44 @@ end # Plot the category scores for components numbered 'x' and 'y'. Ordered factors # are connected with line segments. -function variable_plot(mca::MCA; x = 1, y = 2, vnames = [], ordered = [], kwargs...) +#function variable_plot(mca::MCA; x = 1, y = 2, vnames = [], ordered = [], kwargs...) - fig = PyPlot.figure() - ax = fig.add_axes([0.1, 0.1, 0.8, 0.8]) - ax.grid(true) +# fig = PyPlot.figure() +# ax = fig.add_axes([0.1, 0.1, 0.8, 0.8]) +# ax.grid(true) # Set up the colormap - cm = get(kwargs, :cmap, PyPlot.get_cmap("tab10")) +# cm = get(kwargs, :cmap, PyPlot.get_cmap("tab10")) # Set up the axis limits - mn = 1.2 * minimum(mca.C.G, dims = 1) - mx = 1.2 * maximum(mca.C.G, dims = 1) - xlim = get(kwargs, :xlim, [mn[x], mx[x]]) - ylim = get(kwargs, :ylim, [mn[y], mx[y]]) - ax.set_xlim(xlim...) - ax.set_ylim(ylim...) - - for (j, g) in enumerate(mca.Gv) - - if mca.vnames[j] in ordered - PyPlot.plot(g[:, x], g[:, y], "-", color = cm(j)) - end - - dr = mca.dr[j] - vn = length(vnames) > 0 ? 
vnames[j] : "" - for (k, v) in dr - if vn != "" - lb = "$(vn)-$(v)" - else - lb = v - end - ax.text(g[k, x], g[k, y], lb, color = cm(j), ha = "center", va = "center") - end - end - - inr = inertia(mca) - PyPlot.xlabel(@sprintf("Dimension %d (%.2f%%)", x, 100 * inr[x, :Greenacre])) - PyPlot.ylabel(@sprintf("Dimension %d (%.2f%%)", y, 100 * inr[y, :Greenacre])) - - return fig -end +# mn = 1.2 * minimum(mca.C.G, dims = 1) +# mx = 1.2 * maximum(mca.C.G, dims = 1) +# xlim = get(kwargs, :xlim, [mn[x], mx[x]]) +# ylim = get(kwargs, :ylim, [mn[y], mx[y]]) +# ax.set_xlim(xlim...) +# ax.set_ylim(ylim...) + +# for (j, g) in enumerate(mca.Gv) + +# if mca.vnames[j] in ordered +# PyPlot.plot(g[:, x], g[:, y], "-", color = cm(j)) +# end + +# dr = mca.dr[j] +# vn = length(vnames) > 0 ? vnames[j] : "" +# for (k, v) in dr +# if vn != "" +# lb = "$(vn)-$(v)" +# else +# lb = v +# end +# ax.text(g[k, x], g[k, y], lb, color = cm(j), ha = "center", va = "center") +# end +# end + +# inr = inertia(mca) +# PyPlot.xlabel(@sprintf("Dimension %d (%.2f%%)", x, 100 * inr[x, :Greenacre])) +# PyPlot.ylabel(@sprintf("Dimension %d (%.2f%%)", y, 100 * inr[y, :Greenacre])) + +# return fig +#end From 359de04c1610ad713d3318749d9ce89c4d3eb8a2 Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Tue, 6 Sep 2022 19:11:33 -0400 Subject: [PATCH 03/11] Restructure to match rest of package --- src/mca.jl | 103 ++++++++++++++++++++++++++++++++++++++++++++-------- test/mca.jl | 2 +- 2 files changed, 89 insertions(+), 16 deletions(-) diff --git a/src/mca.jl b/src/mca.jl index c6a460a..868685d 100644 --- a/src/mca.jl +++ b/src/mca.jl @@ -16,7 +16,7 @@ https://maths.cnam.fr/IMG/pdf/ClassMCA_cle825cfc.pdf """ Correspondence Analysis """ -struct CA{T<:Real} <: LinearDimensionalityReduction +mutable struct CA{T<:Real} <: LinearDimensionalityReduction # The data matrix Z::Array{T} @@ -41,7 +41,9 @@ struct CA{T<:Real} <: LinearDimensionalityReduction I::Vector{T} end -function CA(X, d::Int) +# Constructor + +function CA(X) # Convert to proportions X = X ./ sum(X) @@ -58,14 +60,41 @@ function CA(X, d::Int) Wc = Diagonal(sqrt.(c)) SR = Wr \ R / Wc + T = eltype(X) + return CA(X, R, r, c, SR, zeros(T, 0, 0), zeros(T, 0, 0), zeros(T, 0)) +end + +function fit!(ca::CA, d::Int) + # Get the object factor scores (F) and variable factor scores (G). - P, D, Q = svd(SR) + P, D, Q = svd(ca.SR) Dq = Diagonal(D)[:, 1:d] - F = Wr \ P * Dq - G = Wc \ Q * Dq - I = D .^ 2 - return CA(X, R, r, c, SR, F, G, I) + Wr = Diagonal(sqrt.(ca.rm)) + Wc = Diagonal(sqrt.(ca.cm)) + ca.F = Wr \ P * Dq + ca.G = Wc \ Q * Dq + + # Get the eigenvalues + ca.I = D .^ 2 +end + +function fit(::Type{CA}; X::AbstractMatrix, d::Int = 5) + ca = CA(X) + fit!(ca, d) + return ca::CA +end + +""" + ca + +Fit a correspondence analysis using the data array `X` whose rows are +the objects and columns are the variables. The first `d` components +are retained. +""" +function ca(X, d) + c = fit(CA, X, d) + return c end objectscores(ca::CA) = ca.F @@ -75,7 +104,7 @@ inertia(ca::CA) = ca.I """ Multiple Correspondence Analysis """ -struct MCA{T<:Real} <: LinearDimensionalityReduction +mutable struct MCA{T<:Real} <: LinearDimensionalityReduction # The underlying corresponence analysis C::CA{T} @@ -93,6 +122,12 @@ struct MCA{T<:Real} <: LinearDimensionalityReduction # each variable. 
Gv::Vector{Matrix{T}} + # Number of nominal variables + K::Int + + # Total number of categories in all variables + J::Int + # Eigenvalues unadjusted_eigs::Vector{Float64} benzecri_eigs::Vector{Float64} @@ -134,7 +169,9 @@ function get_eigs(I, K, J) return unadjusted, ben ./ sum(ben), ben ./ gt end -function MCA(Z, d::Int; vnames = []) +# constructor + +function MCA(Z; vnames = []) if length(vnames) == 0 vnames = ["v$(j)" for j = 1:size(Z, 2)] @@ -143,14 +180,50 @@ function MCA(Z, d::Int; vnames = []) # Get the indicator matrix X, rd, dr = make_indicators(Z) - C = CA(X, d) + # Create the underlying correspondence analysis value + C = CA(X) + + # Number of nominal variables + K = size(Z, 2) + + # Total number of categories in all variables + J = size(X, 2) + + return MCA(C, vnames, rd, dr, Matrix{Float64}[], K, J, zeros(0), zeros(0), zeros(0)) +end + +""" + mca + +Fit a multiple correspondence analysis using the columns of `Z` as the +variables. The first `d` components are retained. If `Z` is a +dataframe then the column names are used as variable names, otherwise +variable names may be provided as `vnames`. +""" +function mca(Z, d::Int; vnames = []) + m = MCA(Z; vnames) + fit!(m, d) + return m +end + +function fit(::Type{MCA}, Z::AbstractMatrix, d::Int; vnames = []) + return mca(Z, d; vnames) +end + +function fit!(mca::MCA, d::Int) + + fit!(mca.C, d) # Split the variable scores into separate arrays for each variable. - Gv = xsplit(C.G, rd) + mca.Gv = xsplit(mca.C.G, mca.rd) + + una, ben, gra = get_eigs(mca.C.I, mca.J, mca.K) - una, ben, gra = get_eigs(C.I, size(Z, 2), size(X, 2)) + mca.unadjusted_eigs = una + mca.benzecri_eigs = ben + mca.greenacre_eigs = gra - return MCA(C, vnames, rd, dr, Gv, una, ben, gra) + return mca end # Create an indicator matrix corresponding to the distinct @@ -238,10 +311,10 @@ end # ax = fig.add_axes([0.1, 0.1, 0.8, 0.8]) # ax.grid(true) - # Set up the colormap +# Set up the colormap # cm = get(kwargs, :cmap, PyPlot.get_cmap("tab10")) - # Set up the axis limits +# Set up the axis limits # mn = 1.2 * minimum(mca.C.G, dims = 1) # mx = 1.2 * maximum(mca.C.G, dims = 1) # xlim = get(kwargs, :xlim, [mn[x], mx[x]]) diff --git a/test/mca.jl b/test/mca.jl index 9706972..b497686 100644 --- a/test/mca.jl +++ b/test/mca.jl @@ -8,7 +8,7 @@ using MultivariateStats, DataFrames, Test :V3 => ["D", "D", "D", "C", "D", "C", "D", "C"], ) - m = MCA(da, 3; vnames = names(da)) + m = mca(da, 3; vnames = names(da)) F = objectscores(m) G = variablescores(m.C) From f5d16bc49d70e2354d1da90d6826aeca193ec288 Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Tue, 6 Sep 2022 19:58:10 -0400 Subject: [PATCH 04/11] check for repeated non-zero signular values pick f57ef58 check for repeated non-zero signular values --- src/MultivariateStats.jl | 139 +++++++++++++++++++-------------------- src/mca.jl | 6 ++ 2 files changed, 75 insertions(+), 70 deletions(-) diff --git a/src/MultivariateStats.jl b/src/MultivariateStats.jl index 1afc7e9..4bd977b 100644 --- a/src/MultivariateStats.jl +++ b/src/MultivariateStats.jl @@ -1,19 +1,24 @@ module MultivariateStats - using LinearAlgebra - using SparseArrays - using Statistics: middle - using Distributions: cdf, FDist - using StatsAPI: RegressionModel, HypothesisTest - using StatsBase: SimpleCovariance, CovarianceEstimator, AbstractDataTransform, - ConvergenceException, pairwise, pairwise!, CoefTable - - import Statistics: mean, var, cov, covm, cor - import Base: length, size, show - import StatsAPI: fit, predict, coef, weights, dof, r2, pvalue - import 
LinearAlgebra: eigvals, eigvecs - - export +using LinearAlgebra +using SparseArrays +using Statistics: middle +using StatsAPI: RegressionModel +using StatsBase: + SimpleCovariance, + CovarianceEstimator, + AbstractDataTransform, + ConvergenceException, + pairwise, + pairwise!, + CoefTable + +import Statistics: mean, var, cov, covm, cor +import Base: length, size, show +import StatsAPI: fit, predict, coef, weights, dof, r2 +import LinearAlgebra: eigvals, eigvecs + +export ## common evaluate, # evaluate discriminant function values @@ -38,7 +43,6 @@ module MultivariateStats # whiten Whitening, # Type: Whitening transformation - invsqrtm, # Compute inverse of matrix square root, i.e. inv(sqrtm(A)) cov_whitening, # Compute a whitening transform based on covariance cov_whitening!, # Compute a whitening transform based on covariance (input will be overwritten) @@ -46,19 +50,16 @@ module MultivariateStats ## pca PCA, # Type: Principal Component Analysis model - pcacov, # PCA based on covariance pcasvd, # PCA based on singular value decomposition of input data principalratio, # the ratio of variances preserved in the principal subspace principalvar, # the variance along a specific principal direction principalvars, # the variances along all principal directions - tprincipalvar, # total principal variance, i.e. sum(principalvars(M)) tresidualvar, # total residual variance ## ppca PPCA, # Type: the Probabilistic PCA model - ppcaml, # Maximum likelihood probabilistic PCA ppcaem, # EM algorithm for probabilistic PCA bayespca, # Bayesian PCA @@ -67,10 +68,7 @@ module MultivariateStats KernelPCA, # Type: the Kernel PCA model ## cca - CCA, # Type: Correlation Component Analysis model - WilksLambdaTest, # Wilks lambda statistics and tests - PillaiTraceTest, # Pillai trace statistics and tests - LawleyHotellingTest, # Lawley-Hotelling statistics and tests + CCA, # Type: Correlation Component Analysis model ccacov, # CCA based on covariances ccasvd, # CCA based on singular value decomposition of input data @@ -81,18 +79,17 @@ module MultivariateStats MetricMDS, classical_mds, # perform classical MDS over a given distance matrix stress, # stress evaluation - - gram2dmat, gram2dmat!, # Gram matrix => Distance matrix - dmat2gram, dmat2gram!, # Distance matrix => Gram matrix + gram2dmat, + gram2dmat!, # Gram matrix => Distance matrix + dmat2gram, + dmat2gram!, # Distance matrix => Gram matrix ## lda LinearDiscriminant, # Type: Linear Discriminant functional MulticlassLDAStats, # Type: Statistics required for training multi-class LDA MulticlassLDA, # Type: Multi-class LDA model SubspaceLDA, # Type: LDA model for high-dimensional spaces - ldacov, # Linear discriminant analysis based on covariances - classweights, # class-specific weights classmeans, # class-specific means withclass_scatter, # with-class scatter matrix @@ -103,56 +100,58 @@ module MultivariateStats ## ica ICA, # Type: the Fast ICA model - fastica!, # core algorithm function for the Fast ICA ## fa FactorAnalysis, # Type: the Factor Analysis model - faem, # EM algorithm for factor analysis facm, # CM algorithm for factor analysis ## CA, MCA - CA, - MCA, - objectscores, - variablescores, - inertia - - ## source files - include("types.jl") - include("common.jl") - include("lreg.jl") - include("whiten.jl") - include("pca.jl") - include("ppca.jl") - include("kpca.jl") - include("cca.jl") - include("cmds.jl") - include("mmds.jl") - include("lda.jl") - include("ica.jl") - include("fa.jl") - include("mca.jl") - - ## deprecations - @deprecate indim(f) 
size(f,1) - @deprecate outdim(f) size(f,2) - @deprecate transform(f, x) predict(f, x) - @deprecate indim(f::Whitening) length(f::Whitening) - @deprecate outdim(f::Whitening) length(f::Whitening) - @deprecate tvar(f::PCA) var(f::PCA) - @deprecate classical_mds(D::AbstractMatrix, p::Int) predict(fit(MDS, D, maxoutdim=p, distances=true)) - @deprecate transform(f::MDS) predict(f::MDS) - @deprecate xindim(M::CCA) size(M,1) - @deprecate yindim(M::CCA) size(M,2) - @deprecate outdim(M::CCA) size(M,3) - @deprecate correlations(M::CCA) cor(M) - @deprecate xmean(M::CCA) mean(M, :x) - @deprecate ymean(M::CCA) mean(M, :y) - @deprecate xprojection(M::CCA) projection(M, :x) - @deprecate yprojection(M::CCA) projection(M, :y) - @deprecate xtransform(M::CCA, x) predict(M, x, :x) - @deprecate ytransform(M::CCA, y) predict(M, y, :y) + CA, # Type: correspondence analysis + MCA, # Type: multiple correspondence analysis + ca, # fit and return a correspondence analysis + mca, # fit and return a multiple correspondence analysis + objectscores, # return the object scores or coordinates from CA or MCA + variablescores, # return the variable/category scores or coordinates from CA or MCA + inertia # return the inertia (derived from eigenvalues) for CA + +## source files +include("types.jl") +include("common.jl") +include("lreg.jl") +include("whiten.jl") +include("pca.jl") +include("ppca.jl") +include("kpca.jl") +include("cca.jl") +include("cmds.jl") +include("mmds.jl") +include("lda.jl") +include("ica.jl") +include("fa.jl") +include("mca.jl") + +## deprecations +@deprecate indim(f) size(f, 1) +@deprecate outdim(f) size(f, 2) +@deprecate transform(f, x) predict(f, x) +@deprecate indim(f::Whitening) length(f::Whitening) +@deprecate outdim(f::Whitening) length(f::Whitening) +@deprecate tvar(f::PCA) var(f::PCA) +@deprecate classical_mds(D::AbstractMatrix, p::Int) predict( + fit(MDS, D, maxoutdim = p, distances = true), +) +@deprecate transform(f::MDS) predict(f::MDS) +@deprecate xindim(M::CCA) size(M, 1) +@deprecate yindim(M::CCA) size(M, 2) +@deprecate outdim(M::CCA) size(M, 3) +@deprecate correlations(M::CCA) cor(M) +@deprecate xmean(M::CCA) mean(M, :x) +@deprecate ymean(M::CCA) mean(M, :y) +@deprecate xprojection(M::CCA) projection(M, :x) +@deprecate yprojection(M::CCA) projection(M, :y) +@deprecate xtransform(M::CCA, x) predict(M, x, :x) +@deprecate ytransform(M::CCA, y) predict(M, y, :y) end # module diff --git a/src/mca.jl b/src/mca.jl index 868685d..9428e1c 100644 --- a/src/mca.jl +++ b/src/mca.jl @@ -70,6 +70,12 @@ function fit!(ca::CA, d::Int) P, D, Q = svd(ca.SR) Dq = Diagonal(D)[:, 1:d] + # Check that there are no repeated non-zero eigenvalues. 
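+    # (With tied singular values the SVD factors are only determined up to a
+    # rotation of the corresponding columns, so the scores computed below
+    # would not be uniquely defined.)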
+ d = diff(D[D .> 1e-10]) + if maximum(d) >= -1e-10 + @warn("The indicator matrix has repeated non-zero eigenvalues") + end + Wr = Diagonal(sqrt.(ca.rm)) Wc = Diagonal(sqrt.(ca.cm)) ca.F = Wr \ P * Dq From fa98503ee0b1f17738ce8c82add63b1b91d1ec34 Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Tue, 20 Sep 2022 20:24:49 -0400 Subject: [PATCH 05/11] Update API to conform to package --- src/MultivariateStats.jl | 123 +++++++++++++++---------------- src/mca.jl | 154 +++++++++++++++++---------------------- test/mca.jl | 2 +- 3 files changed, 131 insertions(+), 148 deletions(-) diff --git a/src/MultivariateStats.jl b/src/MultivariateStats.jl index 4bd977b..ca7af4c 100644 --- a/src/MultivariateStats.jl +++ b/src/MultivariateStats.jl @@ -1,24 +1,19 @@ module MultivariateStats -using LinearAlgebra -using SparseArrays -using Statistics: middle -using StatsAPI: RegressionModel -using StatsBase: - SimpleCovariance, - CovarianceEstimator, - AbstractDataTransform, - ConvergenceException, - pairwise, - pairwise!, - CoefTable - -import Statistics: mean, var, cov, covm, cor -import Base: length, size, show -import StatsAPI: fit, predict, coef, weights, dof, r2 -import LinearAlgebra: eigvals, eigvecs - -export + using DataFrames + using LinearAlgebra + using SparseArrays + using Statistics: middle + using StatsAPI: RegressionModel + using StatsBase: SimpleCovariance, CovarianceEstimator, AbstractDataTransform, + ConvergenceException, pairwise, pairwise!, CoefTable + + import Statistics: mean, var, cov, covm, cor + import Base: length, size, show + import StatsAPI: fit, predict, coef, weights, dof, r2 + import LinearAlgebra: eigvals, eigvecs + + export ## common evaluate, # evaluate discriminant function values @@ -43,6 +38,7 @@ export # whiten Whitening, # Type: Whitening transformation + invsqrtm, # Compute inverse of matrix square root, i.e. inv(sqrtm(A)) cov_whitening, # Compute a whitening transform based on covariance cov_whitening!, # Compute a whitening transform based on covariance (input will be overwritten) @@ -50,16 +46,19 @@ export ## pca PCA, # Type: Principal Component Analysis model + pcacov, # PCA based on covariance pcasvd, # PCA based on singular value decomposition of input data principalratio, # the ratio of variances preserved in the principal subspace principalvar, # the variance along a specific principal direction principalvars, # the variances along all principal directions + tprincipalvar, # total principal variance, i.e. 
sum(principalvars(M)) tresidualvar, # total residual variance ## ppca PPCA, # Type: the Probabilistic PCA model + ppcaml, # Maximum likelihood probabilistic PCA ppcaem, # EM algorithm for probabilistic PCA bayespca, # Bayesian PCA @@ -79,17 +78,18 @@ export MetricMDS, classical_mds, # perform classical MDS over a given distance matrix stress, # stress evaluation - gram2dmat, - gram2dmat!, # Gram matrix => Distance matrix - dmat2gram, - dmat2gram!, # Distance matrix => Gram matrix + + gram2dmat, gram2dmat!, # Gram matrix => Distance matrix + dmat2gram, dmat2gram!, # Distance matrix => Gram matrix ## lda LinearDiscriminant, # Type: Linear Discriminant functional MulticlassLDAStats, # Type: Statistics required for training multi-class LDA MulticlassLDA, # Type: Multi-class LDA model SubspaceLDA, # Type: LDA model for high-dimensional spaces + ldacov, # Linear discriminant analysis based on covariances + classweights, # class-specific weights classmeans, # class-specific means withclass_scatter, # with-class scatter matrix @@ -100,15 +100,18 @@ export ## ica ICA, # Type: the Fast ICA model + fastica!, # core algorithm function for the Fast ICA ## fa FactorAnalysis, # Type: the Factor Analysis model + faem, # EM algorithm for factor analysis facm, # CM algorithm for factor analysis - ## CA, MCA + ## ca, mca CA, # Type: correspondence analysis + MCA, # Type: multiple correspondence analysis ca, # fit and return a correspondence analysis mca, # fit and return a multiple correspondence analysis @@ -116,42 +119,40 @@ export variablescores, # return the variable/category scores or coordinates from CA or MCA inertia # return the inertia (derived from eigenvalues) for CA -## source files -include("types.jl") -include("common.jl") -include("lreg.jl") -include("whiten.jl") -include("pca.jl") -include("ppca.jl") -include("kpca.jl") -include("cca.jl") -include("cmds.jl") -include("mmds.jl") -include("lda.jl") -include("ica.jl") -include("fa.jl") -include("mca.jl") - -## deprecations -@deprecate indim(f) size(f, 1) -@deprecate outdim(f) size(f, 2) -@deprecate transform(f, x) predict(f, x) -@deprecate indim(f::Whitening) length(f::Whitening) -@deprecate outdim(f::Whitening) length(f::Whitening) -@deprecate tvar(f::PCA) var(f::PCA) -@deprecate classical_mds(D::AbstractMatrix, p::Int) predict( - fit(MDS, D, maxoutdim = p, distances = true), -) -@deprecate transform(f::MDS) predict(f::MDS) -@deprecate xindim(M::CCA) size(M, 1) -@deprecate yindim(M::CCA) size(M, 2) -@deprecate outdim(M::CCA) size(M, 3) -@deprecate correlations(M::CCA) cor(M) -@deprecate xmean(M::CCA) mean(M, :x) -@deprecate ymean(M::CCA) mean(M, :y) -@deprecate xprojection(M::CCA) projection(M, :x) -@deprecate yprojection(M::CCA) projection(M, :y) -@deprecate xtransform(M::CCA, x) predict(M, x, :x) -@deprecate ytransform(M::CCA, y) predict(M, y, :y) + ## source files + include("types.jl") + include("common.jl") + include("lreg.jl") + include("whiten.jl") + include("pca.jl") + include("ppca.jl") + include("kpca.jl") + include("cca.jl") + include("cmds.jl") + include("mmds.jl") + include("lda.jl") + include("ica.jl") + include("fa.jl") + include("mca.jl") + + ## deprecations + @deprecate indim(f) size(f,1) + @deprecate outdim(f) size(f,2) + @deprecate transform(f, x) predict(f, x) + @deprecate indim(f::Whitening) length(f::Whitening) + @deprecate outdim(f::Whitening) length(f::Whitening) + @deprecate tvar(f::PCA) var(f::PCA) + @deprecate classical_mds(D::AbstractMatrix, p::Int) predict(fit(MDS, D, maxoutdim=p, distances=true)) + @deprecate 
transform(f::MDS) predict(f::MDS) + @deprecate xindim(M::CCA) size(M,1) + @deprecate yindim(M::CCA) size(M,2) + @deprecate outdim(M::CCA) size(M,3) + @deprecate correlations(M::CCA) cor(M) + @deprecate xmean(M::CCA) mean(M, :x) + @deprecate ymean(M::CCA) mean(M, :y) + @deprecate xprojection(M::CCA) projection(M, :x) + @deprecate yprojection(M::CCA) projection(M, :y) + @deprecate xtransform(M::CCA, x) predict(M, x, :x) + @deprecate ytransform(M::CCA, y) predict(M, y, :y) end # module diff --git a/src/mca.jl b/src/mca.jl index 9428e1c..abbce26 100644 --- a/src/mca.jl +++ b/src/mca.jl @@ -16,10 +16,10 @@ https://maths.cnam.fr/IMG/pdf/ClassMCA_cle825cfc.pdf """ Correspondence Analysis """ -mutable struct CA{T<:Real} <: LinearDimensionalityReduction +struct CA{T<:Real} <: LinearDimensionalityReduction # The data matrix - Z::Array{T} + X::Array{T} # The residuals R::Array{T} @@ -41,33 +41,44 @@ mutable struct CA{T<:Real} <: LinearDimensionalityReduction I::Vector{T} end -# Constructor +""" + fit(CA, X; ...) + +Peform a Correspondence Analysis using the data in `X`. + +**Keyword Arguments** -function CA(X) +- `d` the number of dimensions to retain. + +**Notes:** + +- The matrix `X` should contain numerical data for which it makes + sense to use chi^2 distance to compare rows. Most commonly this + is an indicator matrix in which each row contains a single 1 with + the remaining values all being 0. +- See `MCA` for a more general analysis that takes a dataframe + with multiple nominal columns as input and performs a CA on + its indicator matrix. +""" +function fit(::Type{CA}, X::AbstractMatrix{T}; d::Int = 5) where{T} # Convert to proportions X = X ./ sum(X) - # Calculate row and column means - r = sum(X, dims = 2)[:] - c = sum(X, dims = 1)[:] + # Calculate row and column margins + rm = sum(X, dims = 2)[:] + cm = sum(X, dims = 1)[:] # Center the data matrix to create residuals - R = X - r * c' + R = X - rm * cm' # Standardize the data matrix to create standardized residuals - Wr = Diagonal(sqrt.(r)) - Wc = Diagonal(sqrt.(c)) + Wr = Diagonal(sqrt.(rm)) + Wc = Diagonal(sqrt.(cm)) SR = Wr \ R / Wc - T = eltype(X) - return CA(X, R, r, c, SR, zeros(T, 0, 0), zeros(T, 0, 0), zeros(T, 0)) -end - -function fit!(ca::CA, d::Int) - - # Get the object factor scores (F) and variable factor scores (G). - P, D, Q = svd(ca.SR) + # Get the object scores (F) and variable scores (G). + P, D, Q = svd(SR) Dq = Diagonal(D)[:, 1:d] # Check that there are no repeated non-zero eigenvalues. @@ -76,31 +87,15 @@ function fit!(ca::CA, d::Int) @warn("The indicator matrix has repeated non-zero eigenvalues") end - Wr = Diagonal(sqrt.(ca.rm)) - Wc = Diagonal(sqrt.(ca.cm)) - ca.F = Wr \ P * Dq - ca.G = Wc \ Q * Dq + Wr = Diagonal(sqrt.(rm)) + Wc = Diagonal(sqrt.(cm)) + F = Wr \ P * Dq + G = Wc \ Q * Dq # Get the eigenvalues - ca.I = D .^ 2 -end - -function fit(::Type{CA}; X::AbstractMatrix, d::Int = 5) - ca = CA(X) - fit!(ca, d) - return ca::CA -end - -""" - ca + I = D .^ 2 -Fit a correspondence analysis using the data array `X` whose rows are -the objects and columns are the variables. The first `d` components -are retained. 
-""" -function ca(X, d) - c = fit(CA, X, d) - return c + return CA(X, R, rm, cm, SR, F, G, I) end objectscores(ca::CA) = ca.F @@ -110,7 +105,7 @@ inertia(ca::CA) = ca.I """ Multiple Correspondence Analysis """ -mutable struct MCA{T<:Real} <: LinearDimensionalityReduction +struct MCA{T<:Real} <: LinearDimensionalityReduction # The underlying corresponence analysis C::CA{T} @@ -175,68 +170,55 @@ function get_eigs(I, K, J) return unadjusted, ben ./ sum(ben), ben ./ gt end -# constructor +""" + fit(MCA, X; ...) + +Fit a multiple correspondence analysis using the columns of `X` as the variables. -function MCA(Z; vnames = []) +**Keyword Arguments** - if length(vnames) == 0 +- `d`: The number of dimensions to retain. +- `vnames`: Variable names, if `X` is a data frame then the column names are the + default variable names but will be replaced with these values if provided. + +**Notes:** + +- Missing values are recoded as fractional indicators, i.e. if there are k distinct + levels of a variable it is coded as 1/k, 1/k, ..., 1/k. +""" +function fit(::Type{MCA}, X; d::Int=5, vnames=[]) + + if length(vnames) == 0 && typeof(X) <: AbstractDataFrame + vnames = names(X) + elseif length(vnames) == 0 vnames = ["v$(j)" for j = 1:size(Z, 2)] end # Get the indicator matrix - X, rd, dr = make_indicators(Z) + XI, rd, dr = make_indicators(X) # Create the underlying correspondence analysis value - C = CA(X) + C = fit(CA, XI; d=d) # Number of nominal variables - K = size(Z, 2) + K = size(X, 2) # Total number of categories in all variables - J = size(X, 2) - - return MCA(C, vnames, rd, dr, Matrix{Float64}[], K, J, zeros(0), zeros(0), zeros(0)) -end - -""" - mca - -Fit a multiple correspondence analysis using the columns of `Z` as the -variables. The first `d` components are retained. If `Z` is a -dataframe then the column names are used as variable names, otherwise -variable names may be provided as `vnames`. -""" -function mca(Z, d::Int; vnames = []) - m = MCA(Z; vnames) - fit!(m, d) - return m -end - -function fit(::Type{MCA}, Z::AbstractMatrix, d::Int; vnames = []) - return mca(Z, d; vnames) -end - -function fit!(mca::MCA, d::Int) - - fit!(mca.C, d) + J = size(XI, 2) # Split the variable scores into separate arrays for each variable. - mca.Gv = xsplit(mca.C.G, mca.rd) - - una, ben, gra = get_eigs(mca.C.I, mca.J, mca.K) + Gv = xsplit(C.G, rd) - mca.unadjusted_eigs = una - mca.benzecri_eigs = ben - mca.greenacre_eigs = gra + una, ben, gra = get_eigs(C.I, J, K) - return mca + return MCA(C, vnames, rd, dr, Gv, K, J, una, ben, gra) end # Create an indicator matrix corresponding to the distinct -# values in z. Also returns dictionaries mapping the unique -# values to column offsets, and mapping the column offsets -# to the unique values. -function make_single_indicator(z) +# values in the vector 'z'. Also returns dictionaries mapping +# the unique values to column offsets, and mapping the column +# offsets to the unique values. 
+function make_single_indicator(z::Vector{T}) where{T} n = length(z) @@ -249,7 +231,7 @@ function make_single_indicator(z) # Recoding dictionary, maps each distinct value in z to # an offset - rd = Dict{eltype(z),Int}() + rd = Dict{T,Int}() for (j, v) in enumerate(uq) if !ismissing(v) rd[v] = j @@ -271,7 +253,7 @@ function make_single_indicator(z) end # Reverse the recoding dictionary - rdi = Dict{Int,eltype(z)}() + rdi = Dict{Int,T}() for (k, v) in rd rdi[v] = k end diff --git a/test/mca.jl b/test/mca.jl index b497686..dc5209e 100644 --- a/test/mca.jl +++ b/test/mca.jl @@ -8,7 +8,7 @@ using MultivariateStats, DataFrames, Test :V3 => ["D", "D", "D", "C", "D", "C", "D", "C"], ) - m = mca(da, 3; vnames = names(da)) + m = fit(MCA, da; d=3, vnames = names(da)) F = objectscores(m) G = variablescores(m.C) From cb97a3317717d45548d9f7dc3e91533913ea4956 Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Sun, 11 Dec 2022 15:23:42 -0500 Subject: [PATCH 06/11] Refactor interface, add many features pick ab82e6e Refactor interface, add many features pick 7901ecf Update API to conform to package --- Project.toml | 7 +- src/MultivariateStats.jl | 10 +- src/mca.jl | 450 +++++++++++++++++++++++++++++---------- test/mca.jl | 216 ++++++++++++++++--- test/runtests.jl | 3 +- 5 files changed, 533 insertions(+), 153 deletions(-) diff --git a/Project.toml b/Project.toml index a1bd61f..f07ec20 100644 --- a/Project.toml +++ b/Project.toml @@ -8,7 +8,7 @@ version = "0.10.3" [deps] Arpack = "7d9fca2a-8960-54d3-9f78-7d1dccf2cb97" -Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" +DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" @@ -17,11 +17,8 @@ StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" [compat] Arpack = "0.3, 0.4, 0.5" -Distributions = "0.25" -StableRNGs = "1" -Statistics = "1" StatsAPI = "^1.3" -StatsBase = "^0.33, 0.34" +StatsBase = "^0.33" julia = "1.1" [extras] diff --git a/src/MultivariateStats.jl b/src/MultivariateStats.jl index ca7af4c..0528b12 100644 --- a/src/MultivariateStats.jl +++ b/src/MultivariateStats.jl @@ -113,11 +113,11 @@ module MultivariateStats CA, # Type: correspondence analysis MCA, # Type: multiple correspondence analysis - ca, # fit and return a correspondence analysis - mca, # fit and return a multiple correspondence analysis - objectscores, # return the object scores or coordinates from CA or MCA - variablescores, # return the variable/category scores or coordinates from CA or MCA - inertia # return the inertia (derived from eigenvalues) for CA + object_coords, # return the object scores or coordinates from CA or MCA + variable_coords, # return the variable/category scores or coordinates from CA or MCA + inertia, # return the inertia (derived from eigenvalues) for CA + ca_stats, # fit statistics + quali_passive # handle qualitative passive variables ## source files include("types.jl") diff --git a/src/mca.jl b/src/mca.jl index abbce26..06df67f 100644 --- a/src/mca.jl +++ b/src/mca.jl @@ -1,8 +1,5 @@ # Correspondence Analysis and Multiple Correspondence Analysis -# Needed for the plotting function below -#using Printf, PyPlot - #== References: https://personal.utdallas.edu/~herve/Abdi-MCA2007-pretty.pdf @@ -16,29 +13,165 @@ https://maths.cnam.fr/IMG/pdf/ClassMCA_cle825cfc.pdf """ Correspondence Analysis """ -struct CA{T<:Real} <: LinearDimensionalityReduction +mutable struct CA{T<:Real} <: 
LinearDimensionalityReduction # The data matrix - X::Array{T} + X::Matrix{T} # The residuals - R::Array{T} + R::Matrix{T} - # Row and column masses (means) + # Row masses rm::Vector{T} + + # Column masses cm::Vector{T} # The standardized residuals - SR::Array{T} + SR::Matrix{T} - # Object scores - F::Array{T} + # Object coordinates in standard coordinates + FS::Matrix{T} - # Variable scores - G::Array{T} + # Variable coordinates in standard coordinates + GS::Matrix{T} # Inertia (eigenvalues of the indicator matrix) I::Vector{T} + + # Standard normalization or principal normalization + normalize::String + + # Use either the Burt method or the indicator method + method::String +end + +function show(io::IO, ca::CA) + nobs, nvar = size(ca.X) + print( + io, + "CA(nobs = $nobs, nvar = $nvar, method = $(ca.method), normalize = $(ca.normalize))", + ) +end + +function print_inertia(io, I) + xx = hcat(I, I, I) + xx[:, 2] = xx[:, 1] ./ sum(xx[:, 1]) + xx[:, 3] = cumsum(xx[:, 2]) + ii = findfirst(xx[:, 3] .>= 0.999) + xx = xx[1:ii, :] + vn = ["Inertia", "Prop inertia", "Cumulative inertia"] + cft = CoefTable(xx, vn, string.("", 1:ii)) + println(io, cft) +end + +function build_coeftable(ca, vc, stats, rn) + cn = ["Mass"] + xx = [ca.cm] + d = size(ca.GS, 2) + for j = 1:d + push!(cn, "Coord-$(j)") + push!(xx, vc[:, j]) + push!(cn, "SqCorr-$(j)") + push!(xx, stats.sqcorr_col[:, j]) + push!(cn, "RelContrib-$(j)") + push!(xx, stats.relcontrib_col[:, j]) + end + xx = hcat(xx...) + cft = CoefTable(xx, cn, rn) + return cft +end + +function show(io::IO, ::MIME"text/plain", ca::CA) + nobs, nvar = size(ca.X) + stats = ca_stats(ca) + + println( + io, + "CA(nobs = $nobs, nvar = $nvar, method = $(ca.method), normalize = $(ca.normalize))", + ) + print_inertia(io, ca.I) + vc = variable_coords(ca) + m, d = size(ca.GS) + println(io, "\nVariable coordinates:") + rn = string.("", 1:m) + cft = build_coeftable(ca, vc, stats, rn) + print(io, cft) + print(io, "\n\n") +end + +# Squared correlation of each variable with each component. +function sqcorr(ca::CA, axis) + if axis == :row + z = ca.rm ./ sum(abs2, ca.SR; dims = 2)[:] + p = length(ca.rm) + d = size(ca.GS, 2) + if ca.method == "burt" + # Not clear how to compute this so report zero for now + return zeros(p, d) + end + f = z * ones(d)' + return object_coords(ca; normalize = "principal") .^ 2 .* f + elseif axis == :col + z = ca.cm ./ sum(abs2, ca.SR; dims = 1)[:] + p = length(ca.cm) + d = size(ca.GS, 2) + f = z * ones(d)' + if ca.method == "burt" + # Not clear how to compute this so report zero for now + return zeros(p, d) + end + return variable_coords(ca; normalize = "principal") .^ 2 .* f + else + error("Unknown axis '$(axis)'") + end +end + +# Relative contributions of each variable to each component. +function relcontrib(ca::CA, axis) + e = ca.method == "indicator" ? 0.5 : 1 + if axis == :row + d = size(ca.GS, 2) + z = ca.rm * (ca.I[1:d] .^ -e)' + return object_coords(ca; normalize = "principal") .^ 2 .* z + elseif axis == :col + d = size(ca.GS, 2) + z = ca.cm * (ca.I[1:d] .^ -e)' + return variable_coords(ca; normalize = "principal") .^ 2 .* z + else + error("Unknown axis '$(axis)'") + end +end + +# Return several fit statistics. +function ca_stats(ca::CA) + cr = sqcorr(ca, :col) + rc = relcontrib(ca, :col) + return (sqcorr_col = cr, relcontrib_col = rc) +end + +# Multiply corresponding columns of Q by -1 as needed +# to make the first non-zero value in each column of Q +# positive. This is the method used by Stata to identify +# the coefficients. 
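+# For example (hypothetical values): if Q[:, 1] = [-0.6, 0.8], then both
+# Q[:, 1] and P[:, 1] are negated, so that Q[:, 1] becomes [0.6, -0.8].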
+function orient(P, Q, e = 1e-12) + for j = 1:size(Q, 2) + i = findfirst(abs.(Q[:, j]) .>= e) + if Q[i, j] < 0 + Q[:, j] .*= -1 + P[:, j] .*= -1 + end + end + return P, Q +end + +function mca_check_args(normalize, method) + if !(normalize in ["standard", "principal"]) + error("normalize = '$normalize' should be 'standard' or 'principal'") + end + if !(method in ["indicator", "burt"]) + error("method = '$method' should be 'indicator' or 'burt'") + end end """ @@ -49,6 +182,7 @@ Peform a Correspondence Analysis using the data in `X`. **Keyword Arguments** - `d` the number of dimensions to retain. +- `normalize` a coordinate normalization method, either 'standard' or 'principal'. **Notes:** @@ -60,7 +194,15 @@ Peform a Correspondence Analysis using the data in `X`. with multiple nominal columns as input and performs a CA on its indicator matrix. """ -function fit(::Type{CA}, X::AbstractMatrix{T}; d::Int = 5) where{T} +function fit( + ::Type{CA}, + X::AbstractMatrix{T}; + d::Int = 5, + normalize = "standard", + method = "indicator", +) where {T} + + mca_check_args(normalize, method) # Convert to proportions X = X ./ sum(X) @@ -77,39 +219,120 @@ function fit(::Type{CA}, X::AbstractMatrix{T}; d::Int = 5) where{T} Wc = Diagonal(sqrt.(cm)) SR = Wr \ R / Wc - # Get the object scores (F) and variable scores (G). + # Factor the standardized residual matrix P, D, Q = svd(SR) - Dq = Diagonal(D)[:, 1:d] # Check that there are no repeated non-zero eigenvalues. - d = diff(D[D .> 1e-10]) - if maximum(d) >= -1e-10 + di = diff(D[D.>1e-10]) + if maximum(di) >= -1e-10 @warn("The indicator matrix has repeated non-zero eigenvalues") end - Wr = Diagonal(sqrt.(rm)) - Wc = Diagonal(sqrt.(cm)) - F = Wr \ P * Dq - G = Wc \ Q * Dq - # Get the eigenvalues - I = D .^ 2 + I = method == "burt" ? D .^ 4 : D .^ 2 + + # Reduce to the selected dimension + P = P[:, 1:d] + D = D[1:d] + Q = Q[:, 1:d] + + # Flip the loading vectors to a standard orientation. + P, Q = orient(P, Q) + + # Get the object scores (F) and variable scores (G). These are the + # standard coordinates. + FS = Wr \ P + GS = Wc \ Q + + ca = CA(X, R, rm, cm, SR, FS, GS, I, normalize, method) + return ca +end + +# Calculate the standard coordinates of any qualitative passive variables. 
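+# (Each passive category is placed at the weighted average of the standard
+# coordinates of the active categories it co-occurs with, then rescaled by
+# the inverse eigenvalues: 1/I for the indicator method, 1/sqrt(I) for the
+# Burt method.)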
+function quali_passive(ca::CA, passive; normalize = "principal") + + if size(passive, 1) == 0 + return + end + + if length(size(passive)) != 2 + error("passive variable array must be two-dimensional") + end + + (; X, GS) = ca + PX = Matrix(passive) + + if size(PX, 1) != size(X, 1) + @error("Passive data must have same leading axis length as active data.") + end + + M = hcat(X, PX) + B = M' * M + B ./= sum(B) + p = size(X, 2) + B = B[p+1:end, 1:p] + + PGS = B * GS + for k = 1:size(B, 1) + PGS[k, :] ./= sum(B[k, :]) + end + d = size(PGS, 2) + if ca.method == "burt" + PGS = PGS * Diagonal(1 ./ sqrt.(ca.I[1:d])) + elseif ca.method == "indicator" + PGS = PGS * Diagonal(1 ./ ca.I[1:d]) + else + error("Unknown method '$(ca.method)'") + end + + if normalize == "standard" + return PGS + elseif normalize == "principal" + return PGS * Diagonal(sqrt.(ca.I[1:d])) + else + error("Unknown normalization '$(normalize)'") + end - return CA(X, R, rm, cm, SR, F, G, I) + return PGS +end + +function object_coords(ca::CA; normalize = ca.normalize) + if normalize == "standard" + ca.FS + elseif normalize == "principal" + d = size(ca.FS, 2) + return ca.FS * Diagonal(sqrt.(ca.I[1:d])) + else + error("Unknown normalization '$(normalize)'") + end end -objectscores(ca::CA) = ca.F -variablescores(ca::CA) = ca.G inertia(ca::CA) = ca.I +function variable_coords(ca::CA; normalize = ca.normalize) + (; GS) = ca + + d = size(GS, 2) + if normalize == "standard" + return GS + elseif normalize == "principal" + return GS * Diagonal(sqrt.(ca.I[1:d])) + else + error("Unknown normalization '$(normalize)'") + end +end + """ Multiple Correspondence Analysis """ -struct MCA{T<:Real} <: LinearDimensionalityReduction +mutable struct MCA{T<:Real} <: LinearDimensionalityReduction # The underlying corresponence analysis C::CA{T} + # Indicator matrix + Inds::Matrix{Float64} + # Variable names vnames::Vector{String} @@ -117,11 +340,7 @@ struct MCA{T<:Real} <: LinearDimensionalityReduction rd::Vector{Dict} # Map integer positions to values - dr::Vector{Dict} - - # Split the variable scores into separate arrays for - # each variable. - Gv::Vector{Matrix{T}} + dr::Vector{Vector} # Number of nominal variables K::Int @@ -135,21 +354,52 @@ struct MCA{T<:Real} <: LinearDimensionalityReduction greenacre_eigs::Vector{Float64} end -objectscores(mca::MCA) = mca.C.F -variablescores(mca::MCA) = mca.Gv - -# Split the variable scores to a separate array for each -# variable. 
-function xsplit(G, rd) - K = [length(di) for di in rd] - Js = cumsum(K) - Js = vcat(1, 1 .+ Js) - Gv = Vector{Matrix{eltype(G)}}() - for j in eachindex(K) - g = G[Js[j]:Js[j+1]-1, :] - push!(Gv, g) +function expand_names(vnames, dr) + names, levels = [], [] + for (j, v) in enumerate(vnames) + u = dr[j] + for k in eachindex(u) + push!(names, v) + push!(levels, u[k]) + end end - return Gv + return (Variable = names, Level = levels) +end + +object_coords(mca::MCA; normalize = "principal") = + object_coords(mca.C, normalize = normalize) + +function variable_coords(mca::MCA; normalize = "principal") + (; C, vnames, dr) = mca + na = expand_names(vnames, dr) + G = variable_coords(C, normalize = normalize) + return (Variable = na.Variable, Level = na.Level, Coord = G) +end + +function show(io::IO, mca::MCA) + nobs, ninds = size(mca.C.X) + nvar = length(mca.vnames) + print(io, "MCA(nobs = $nobs, nvar = $nvar, ninds = $ninds)") +end + +function show(io::IO, ::MIME"text/plain", mca::MCA) + nobs, ninds = size(mca.C.X) + stats = ca_stats(mca.C) + nvar = length(mca.vnames) + println(io, "MCA(nobs = $nobs, nvar = $nvar, ninds = $ninds)") + print_inertia(io, mca.C.I) + vc = variable_coords(mca.C; normalize = "principal") + d = size(vc, 2) + en = expand_names(mca.vnames, mca.dr) + rn = ["$(a) $(b)" for (a, b) in zip(en.Variable, en.Level)] + cft = build_coeftable(mca.C, vc, stats, rn) + println(io, "\nVariable coordinates (principal normalization):") + print(io, cft) + print(io, "\n\n") +end + +function ca_stats(mca::MCA) + return ca_stats(mca.C) end # Calculate the eigenvalues with different debiasings. @@ -186,19 +436,26 @@ Fit a multiple correspondence analysis using the columns of `X` as the variables - Missing values are recoded as fractional indicators, i.e. if there are k distinct levels of a variable it is coded as 1/k, 1/k, ..., 1/k. """ -function fit(::Type{MCA}, X; d::Int=5, vnames=[]) +function fit( + ::Type{MCA}, + X; + d::Int = 5, + normalize = "standard", + method = "indicator", + vnames = [], +) if length(vnames) == 0 && typeof(X) <: AbstractDataFrame vnames = names(X) elseif length(vnames) == 0 - vnames = ["v$(j)" for j = 1:size(Z, 2)] + vnames = ["v$j" for j = 1:size(X, 2)] end # Get the indicator matrix XI, rd, dr = make_indicators(X) # Create the underlying correspondence analysis value - C = fit(CA, XI; d=d) + C = fit(CA, XI; d = d, normalize = normalize, method = method) # Number of nominal variables K = size(X, 2) @@ -206,19 +463,38 @@ function fit(::Type{MCA}, X; d::Int=5, vnames=[]) # Total number of categories in all variables J = size(XI, 2) - # Split the variable scores into separate arrays for each variable. - Gv = xsplit(C.G, rd) - una, ben, gra = get_eigs(C.I, J, K) - return MCA(C, vnames, rd, dr, Gv, K, J, una, ben, gra) + mca = MCA(C, XI, vnames, rd, dr, K, J, una, ben, gra) + + return mca +end + +function quali_passive(mca::MCA, passive; normalize = "principal") + (; C) = mca + if size(passive, 1) != size(C.X, 1) + error("Wrong number of rows in passive data array") + end + + PI, _, drp = make_indicators(passive) + r = quali_passive(C, PI; normalize = normalize) + + vnames = if typeof(passive) <: AbstractDataFrame + names(passive) + else + m = length(drp) + string.("p", 1:m) + end + + v = expand_names(vnames, drp) + return (Variable = v.Variable, Level = v.Level, Coord = r) end # Create an indicator matrix corresponding to the distinct # values in the vector 'z'. 
Also returns dictionaries mapping # the unique values to column offsets, and mapping the column # offsets to the unique values. -function make_single_indicator(z::Vector{T}) where{T} +function make_single_indicator(z::Vector{T}) where {T} n = length(z) @@ -232,9 +508,11 @@ function make_single_indicator(z::Vector{T}) where{T} # Recoding dictionary, maps each distinct value in z to # an offset rd = Dict{T,Int}() + rdi = [] for (j, v) in enumerate(uq) if !ismissing(v) rd[v] = j + push!(rdi, v) end end @@ -252,12 +530,6 @@ function make_single_indicator(z::Vector{T}) where{T} end end - # Reverse the recoding dictionary - rdi = Dict{Int,T}() - for (k, v) in rd - rdi[v] = k - end - return X, rd, rdi end @@ -267,17 +539,21 @@ end # to levels for each variable. function make_indicators(Z) - rd, rdr = Dict[], Dict[] + if size(Z, 1) == 0 + return zeros(0, 0), Dict[], Vector[] + end + + rd, rdi = Dict[], Vector[] XX = [] for j = 1:size(Z, 2) - X, di, dir = make_single_indicator(Z[:, j]) - push!(rd, di) - push!(rdr, dir) + X, dv, di = make_single_indicator(Z[:, j]) + push!(rd, dv) + push!(rdi, di) push!(XX, X) end I = hcat(XX...) - return I, rd, rdr + return I, rd, rdi end # Return a table summarizing the inertia. @@ -290,47 +566,3 @@ function inertia(mca::MCA) ) return inr end - -# Plot the category scores for components numbered 'x' and 'y'. Ordered factors -# are connected with line segments. -#function variable_plot(mca::MCA; x = 1, y = 2, vnames = [], ordered = [], kwargs...) - -# fig = PyPlot.figure() -# ax = fig.add_axes([0.1, 0.1, 0.8, 0.8]) -# ax.grid(true) - -# Set up the colormap -# cm = get(kwargs, :cmap, PyPlot.get_cmap("tab10")) - -# Set up the axis limits -# mn = 1.2 * minimum(mca.C.G, dims = 1) -# mx = 1.2 * maximum(mca.C.G, dims = 1) -# xlim = get(kwargs, :xlim, [mn[x], mx[x]]) -# ylim = get(kwargs, :ylim, [mn[y], mx[y]]) -# ax.set_xlim(xlim...) -# ax.set_ylim(ylim...) - -# for (j, g) in enumerate(mca.Gv) - -# if mca.vnames[j] in ordered -# PyPlot.plot(g[:, x], g[:, y], "-", color = cm(j)) -# end - -# dr = mca.dr[j] -# vn = length(vnames) > 0 ? vnames[j] : "" -# for (k, v) in dr -# if vn != "" -# lb = "$(vn)-$(v)" -# else -# lb = v -# end -# ax.text(g[k, x], g[k, y], lb, color = cm(j), ha = "center", va = "center") -# end -# end - -# inr = inertia(mca) -# PyPlot.xlabel(@sprintf("Dimension %d (%.2f%%)", x, 100 * inr[x, :Greenacre])) -# PyPlot.ylabel(@sprintf("Dimension %d (%.2f%%)", y, 100 * inr[y, :Greenacre])) - -# return fig -#end diff --git a/test/mca.jl b/test/mca.jl index dc5209e..196fe5b 100644 --- a/test/mca.jl +++ b/test/mca.jl @@ -1,51 +1,201 @@ using MultivariateStats, DataFrames, Test -@testset "Compare MCA to R FactoMiner" begin - +function mca_simple_data() + # Active variables da = DataFrame( :V1 => ["A", "A", "A", "A", "B", "B", "B", "B"], :V2 => ["X", "X", "Y", "X", "X", "Y", "X", "X"], :V3 => ["D", "D", "D", "C", "D", "C", "D", "C"], ) - m = fit(MCA, da; d=3, vnames = names(da)) - F = objectscores(m) - G = variablescores(m.C) + # Passive variables + dp = DataFrame( + :V4 => [1, 2, 2, 2, 1, 1, 2, 1], + :V5 => [2, 1, 2, 2, 1, 2, 1, 2], + ) + + return da, dp +end + +# These values come from Stata and/or R FactoMineR. 
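+# (Note: the Burt-method eigenvalues below are the squares of the
+# indicator-method eigenvalues, e.g. 0.4327141^2 ≈ 0.18724152.)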
+function mca_simple_burt_correct()
+
+    # Eigenvalues
+    eig = [0.18724152, 0.11111111, 0.05473379]
+
+    # Variable coordinates in standard normalization
+    c1 = [1.0606602, -1.0606602, 0.35355339, -1.0606602, -1.5811388, 0.9486833]
+    c2 = [0.8660254, -0.8660254, -0.8660254, 2.5980762, 0, 0]
+    variable_standard = hcat(c1, c2)
+
+    # Variable coordinates in principal normalization
+    c1 = [0.45896265, -0.45896265, 0.15298755, -0.45896265, -0.68418112, 0.41050867]
+    c2 = [0.28867513, -0.28867513, -0.28867513, 0.8660254, 0, 0]
+    variable_principal = hcat(c1, c2)
+
+    # Passive variable coordinates in standard normalization
+    c1 = [-0.65213019, 0.65213019, 0.73080064, -0.43848039]
+    c2 = [-0.4330127, 0.4330127, -1.1547005, 0.69282032]
+    passive_standard = hcat(c1, c2)
+
+    # Passive variable coordinates in principal normalization
+    c1 = [-0.28218595, 0.28218595, 0.31622777, -0.18973666]
+    c2 = [-0.14433757, 0.14433757, -0.38490018, 0.23094011]
+    passive_principal = hcat(c1, c2)
+
+    # Relative column contributions
+    c1 = [0.187, 0.187, 0.031, 0.094, 0.312, 0.188]
+    c2 = [0.125, 0.125, 0.188, 0.562, 0, 0]
+    contrib = hcat(c1, c2)
+
+    return (eig = eig,
+            variable_principal = variable_principal,
+            variable_standard = variable_standard,
+            passive_principal = passive_principal,
+            passive_standard = passive_standard,
+            contrib = contrib)
+end
+
+
+# These values come from Stata and/or R FactoMineR.
+function mca_simple_indicator_correct()
+
+    # Eigenvalues in principal normalization
+    eig = [0.4327141, 0.3333333, 0.2339525]
+
+    # Object coordinates in principal normalization
+    c1 = [
+        7.876323e-01,
+        7.876323e-01,
+        0.3162278,
+        -5.564176e-02,
+        0.08052551,
+        -1.2341531,
+        0.08052551,
+        -0.7627485,
+    ]
+    c2 = [0, 0, 1.1547005, 0, -0.57735027, 0.5773503, -0.57735027, -0.5773503]
+    object_principal = hcat(c1, c2)
+
+    # Variable coordinates in principal normalization
+    c1 = [0.6977130, -0.6977130, 0.232571, -0.6977130, -1.040089e+00, 6.240535e-01]
+    c2 = [0.5000000, -0.5000000, -0.500000, 1.5000000, 0, 0]
+    variable_principal = hcat(c1, c2)
+
+    # Variable coordinates in standard normalization
+    c1 = [1.0606602, -1.0606602, 0.35355339, -1.0606602, -1.5811388, 0.9486833]
+    c2 = [0.8660254, -0.8660254, -0.8660254, 2.5980762, 0, 0]
+    variable_standard = hcat(c1, c2)
+
+    # Passive variables in standard normalization
+    c1 = [-0.65213019, 0.65213019, 0.73080064, -0.43848039]
+    c2 = [-0.4330127, 0.4330127, -1.1547005, 0.69282032]
+    passive_standard = hcat(c1, c2)
+
+    # Passive variables in principal normalization
+    c1 = [-0.42897783, 0.42897783, 0.48072805, -0.28843683]
+    c2 = [-0.25, 0.25, -0.66666667, 0.4]
+    passive_principal = hcat(c1, c2)
+
+    # Squared column correlations
+    c1 = [0.487, 0.487, 0.162, 0.162, 0.649, 0.649]
+    c2 = [0.250, 0.250, 0.750, 0.750, 0, 0]
+    sqcorr = hcat(c1, c2)
+
+    # Relative column 
+# These values come from Stata and/or R FactoMineR.
+function mca_simple_indicator_correct()
+
+    # Eigenvalues in principal normalization
+    eig = [0.4327141, 0.3333333, 0.2339525]
+
-    # Object coordinates
+    # Object coordinates in principal normalization
     c1 = [
-        -7.876323e-01,
-        -7.876323e-01,
-        -0.3162278,
-        5.564176e-02,
-        -0.08052551,
-        1.2341531,
-        -0.08052551,
-        0.7627485,
+        7.876323e-01,
+        7.876323e-01,
+        0.3162278,
+        -5.564176e-02,
+        0.08052551,
+        -1.2341531,
+        0.08052551,
+        -0.7627485,
     ]
     c2 = [0, 0, 1.1547005, 0, -0.57735027, 0.5773503, -0.57735027, -0.5773503]
-    c3 = [
-        1.551768e-01,
-        1.551768e-01,
-        -0.3162278,
-        9.984508e-01,
-        -0.55193003,
-        -0.1800605,
-        -0.55193003,
-        0.2913440,
-    ]
-    coord = hcat(c1, c2, c3)
-    @test isapprox(coord, F, atol = 1e-6, rtol = 1e-6)
+    object_principal = hcat(c1, c2)
 
-    # Variable coordinates
-    c1 = [-0.6977130, 0.6977130, -0.232571, 0.6977130, 1.040089e+00, -6.240535e-01]
+    # Variable coordinates in principal normalization
+    c1 = [0.6977130, -0.6977130, 0.232571, -0.6977130, -1.040089e+00, 6.240535e-01]
     c2 = [0.5000000, -0.5000000, -0.500000, 1.5000000, 0, 0]
-    c3 = [0.5130269, -0.5130269, 0.171009, -0.5130269, 7.647753e-01, -4.588652e-01]
-    coord = hcat(c1, c2, c3)
-    @test isapprox(coord, G, atol = 1e-6, rtol = 1e-6)
+    variable_principal = hcat(c1, c2)
+
+    # Variable coordinates in standard normalization
+    c1 = [1.0606602, -1.0606602, 0.35355339, -1.0606602, -1.5811388, 0.9486833]
+    c2 = [0.8660254, -0.8660254, -0.8660254, 2.5980762, 0, 0]
+    variable_standard = hcat(c1, c2)
+
+    # Passive variables in standard normalization
+    c1 = [-0.65213019, 0.65213019, 0.73080064, -0.43848039]
+    c2 = [-0.4330127, 0.4330127, -1.1547005, 0.69282032]
+    passive_standard = hcat(c1, c2)
+
+    # Passive variables in principal normalization
+    c1 = [-0.42897783, 0.42897783, 0.48072805, -0.28843683]
+    c2 = [-0.25, 0.25, -0.66666667, 0.4]
+    passive_principal = hcat(c1, c2)
+
+    # Squared column correlations
+    c1 = [0.487, 0.487, 0.162, 0.162, 0.649, 0.649]
+    c2 = [0.250, 0.250, 0.750, 0.750, 0, 0]
+    sqcorr = hcat(c1, c2)
+
+    # Relative column contributions
+    c1 = [0.123, 0.123, 0.021, 0.062, 0.206, 0.123]
+    c2 = [0.072, 0.072, 0.108, 0.325, 0, 0]
+    contrib = hcat(c1, c2)
+
+    return (eig = eig, object_principal = object_principal,
+            variable_principal = variable_principal,
+            variable_standard = variable_standard,
+            passive_standard = passive_standard,
+            passive_principal = passive_principal,
+            sqcorr = sqcorr, contrib = contrib)
+end
+
+@testset "Compare MCA to R FactoMineR and Stata using Burt method" begin
+
+    da, dp = mca_simple_data()
+    xc = mca_simple_burt_correct()
+
+    mp = fit(MCA, da; d=2, normalize = "principal", method = "burt")
+    ms = fit(MCA, da; d=2, normalize = "standard", method = "burt")
+
+    GPP = quali_passive(mp, dp; normalize = "principal")
+    GPS = quali_passive(ms, dp; normalize = "standard")
+
+    show(devnull, MIME("text/plain"), mp)
+    show(devnull, MIME("text/plain"), ms)
+
+    # Eigenvalues
+    eig = inertia(mp).Raw[1:3]
+    @test isapprox(eig, xc.eig, rtol = 1e-6, atol = 1e-6)
+
+    # Variable coordinates in principal normalization
+    GP = variable_coords(mp.C)
+    @test isapprox(GP, xc.variable_principal, atol = 1e-6, rtol = 1e-6)
+
+    # Variable coordinates in standard normalization
+    GS = variable_coords(ms.C)
+    @test isapprox(GS, xc.variable_standard, atol = 1e-6, rtol = 1e-6)
+
+    # Passive variable coordinates in principal normalization
+    #GPP = variable_coords(mp.C; type = "passive")
+    @test isapprox(GPP.Coord, xc.passive_principal, atol = 1e-6, rtol = 1e-6)
+
+    # Passive variable coordinates in standard normalization
+    #GPS = variable_coords(ms.C; type = "passive")
+    @test isapprox(GPS.Coord, xc.passive_standard, atol = 1e-6, rtol = 1e-6)
+
+    for m in [mp, ms]
+        stat = ca_stats(m.C)
+        @test isapprox(stat.relcontrib_col, xc.contrib, atol = 1e-2, rtol = 1e-2)
+    end
+end
+
+
+@testset "Compare MCA to R FactoMineR and Stata using indicator method" begin
+
+    da, dp = mca_simple_data()
+    xc = mca_simple_indicator_correct()
+
+    mp = fit(MCA, da; d=2, normalize = "principal", method = "indicator")
+    ms = fit(MCA, da; d=2, normalize = "standard", method = "indicator")
+
+    GPP = quali_passive(mp, dp, normalize = "principal")
+    GPS = quali_passive(ms, dp, normalize = "standard")
+
+    show(devnull, MIME("text/plain"), mp)
+    show(devnull, MIME("text/plain"), ms)
+
+    # Eigenvalues
+    eig = inertia(mp).Raw[1:3]
+    @test isapprox(eig, xc.eig, rtol = 1e-6, atol = 1e-6)
+
+    # Variable coordinates in principal normalization
+    G = variable_coords(mp.C; normalize = "principal")
+    @test isapprox(G, xc.variable_principal, atol = 1e-6, rtol = 1e-6)
+
+    # Variable coordinates in standard normalization
+    GS = variable_coords(ms.C, normalize = "standard")
+    @test isapprox(GS, xc.variable_standard, atol = 1e-6, rtol = 1e-6)
+
+    # Object coordinates in principal normalization
+    F = object_coords(mp; normalize = "principal")
+    @test isapprox(F, xc.object_principal, atol = 1e-6, rtol = 1e-6)
+
+    # Passive variable coordinates in principal normalization
+    @test isapprox(GPP.Coord, xc.passive_principal, atol = 1e-6, rtol = 1e-6)
+
+    # Passive variable coordinates in standard normalization
+    @test isapprox(GPS.Coord, xc.passive_standard, atol = 1e-6, rtol = 1e-6)
+
+    # Fit statistics
+    for m in [mp, ms]
+        stat = ca_stats(m.C)
+        @test isapprox(stat.sqcorr_col, xc.sqcorr, atol = 1e-3, rtol = 1e-3)
+        @test isapprox(stat.relcontrib_col, xc.contrib, atol = 1e-2, rtol = 1e-2)
+    end
 end
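# The two normalizations exercised above differ only by a per-dimension
# scaling (a sketch using the fixture values): principal coordinates are
# standard coordinates times the singular value, i.e. the square root of the
# principal inertia, of each dimension.
standard = [1.0606602, 0.8660254]       # first category, dimensions 1 and 2
lambda = [0.4327141, 0.3333333]         # indicator-method principal inertias
principal = standard .* sqrt.(lambda)   # ≈ [0.6977130, 0.5000000], as in the fixtures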
"ppca", "kpca", - "fa" + "fa", + "mca", ] for test in tests From be7bf00164ee7208532ff78a9c84dcd177ac83bd Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Mon, 12 Dec 2022 23:51:40 -0500 Subject: [PATCH 07/11] Remove unsupported destructuring syntax --- src/mca.jl | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/src/mca.jl b/src/mca.jl index 06df67f..2b32d65 100644 --- a/src/mca.jl +++ b/src/mca.jl @@ -259,7 +259,10 @@ function quali_passive(ca::CA, passive; normalize = "principal") error("passive variable array must be two-dimensional") end - (; X, GS) = ca + #(; X, GS) = ca + X = ca.X + GS = ca.GS + PX = Matrix(passive) if size(PX, 1) != size(X, 1) @@ -310,7 +313,8 @@ end inertia(ca::CA) = ca.I function variable_coords(ca::CA; normalize = ca.normalize) - (; GS) = ca + #(; GS) = ca + GS = ca.GS d = size(GS, 2) if normalize == "standard" @@ -370,7 +374,11 @@ object_coords(mca::MCA; normalize = "principal") = object_coords(mca.C, normalize = normalize) function variable_coords(mca::MCA; normalize = "principal") - (; C, vnames, dr) = mca + #(; C, vnames, dr) = mca + C = mca.C + vnames = mca.vnames + dr = mca.dr + na = expand_names(vnames, dr) G = variable_coords(C, normalize = normalize) return (Variable = na.Variable, Level = na.Level, Coord = G) @@ -471,7 +479,8 @@ function fit( end function quali_passive(mca::MCA, passive; normalize = "principal") - (; C) = mca + #(; C) = mca + C = mca.C if size(passive, 1) != size(C.X, 1) error("Wrong number of rows in passive data array") end From 7dbd4f75847e6c2f2054e56cd0970e14aedf33ee Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Tue, 17 Jan 2023 10:12:35 -0500 Subject: [PATCH 08/11] Add information about names/levels to MCA struct --- src/mca.jl | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/mca.jl b/src/mca.jl index 2b32d65..88dc34f 100644 --- a/src/mca.jl +++ b/src/mca.jl @@ -340,6 +340,9 @@ mutable struct MCA{T<:Real} <: LinearDimensionalityReduction # Variable names vnames::Vector{String} + # Variable+level names + names::NamedTuple + # Map values to integer positions rd::Vector{Dict} @@ -473,7 +476,9 @@ function fit( una, ben, gra = get_eigs(C.I, J, K) - mca = MCA(C, XI, vnames, rd, dr, K, J, una, ben, gra) + na = expand_names(vnames, dr) + + mca = MCA(C, XI, vnames, na, rd, dr, K, J, una, ben, gra) return mca end From 5a192dce9117fef961da1fc9e9b49d5051b7c41c Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Sun, 5 Feb 2023 15:01:48 -0500 Subject: [PATCH 09/11] Tolerate more types in one-hot coding --- src/mca.jl | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/mca.jl b/src/mca.jl index 88dc34f..a997709 100644 --- a/src/mca.jl +++ b/src/mca.jl @@ -463,7 +463,7 @@ function fit( end # Get the indicator matrix - XI, rd, dr = make_indicators(X) + XI, rd, dr = make_indicators(X, "active") # Create the underlying correspondence analysis value C = fit(CA, XI; d = d, normalize = normalize, method = method) @@ -490,7 +490,7 @@ function quali_passive(mca::MCA, passive; normalize = "principal") error("Wrong number of rows in passive data array") end - PI, _, drp = make_indicators(passive) + PI, _, drp = make_indicators(passive, "passive") r = quali_passive(C, PI; normalize = normalize) vnames = if typeof(passive) <: AbstractDataFrame @@ -508,19 +508,21 @@ end # values in the vector 'z'. Also returns dictionaries mapping # the unique values to column offsets, and mapping the column # offsets to the unique values. 
From 7dbd4f75847e6c2f2054e56cd0970e14aedf33ee Mon Sep 17 00:00:00 2001
From: Kerby Shedden
Date: Tue, 17 Jan 2023 10:12:35 -0500
Subject: [PATCH 08/11] Add information about names/levels to MCA struct

---
 src/mca.jl | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/mca.jl b/src/mca.jl
index 2b32d65..88dc34f 100644
--- a/src/mca.jl
+++ b/src/mca.jl
@@ -340,6 +340,9 @@ mutable struct MCA{T<:Real} <: LinearDimensionalityReduction
     # Variable names
     vnames::Vector{String}
 
+    # Variable+level names
+    names::NamedTuple
+
     # Map values to integer positions
     rd::Vector{Dict}
 
@@ -473,7 +476,9 @@ function fit(
 
     una, ben, gra = get_eigs(C.I, J, K)
 
-    mca = MCA(C, XI, vnames, rd, dr, K, J, una, ben, gra)
+    na = expand_names(vnames, dr)
+
+    mca = MCA(C, XI, vnames, na, rd, dr, K, J, una, ben, gra)
 
     return mca
 end

From 5a192dce9117fef961da1fc9e9b49d5051b7c41c Mon Sep 17 00:00:00 2001
From: Kerby Shedden
Date: Sun, 5 Feb 2023 15:01:48 -0500
Subject: [PATCH 09/11] Tolerate more types in one-hot coding

---
 src/mca.jl | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/src/mca.jl b/src/mca.jl
index 88dc34f..a997709 100644
--- a/src/mca.jl
+++ b/src/mca.jl
@@ -463,7 +463,7 @@ function fit(
     end
 
     # Get the indicator matrix
-    XI, rd, dr = make_indicators(X)
+    XI, rd, dr = make_indicators(X, "active")
 
     # Create the underlying correspondence analysis value
     C = fit(CA, XI; d = d, normalize = normalize, method = method)
@@ -490,7 +490,7 @@ function quali_passive(mca::MCA, passive; normalize = "principal")
         error("Wrong number of rows in passive data array")
     end
 
-    PI, _, drp = make_indicators(passive)
+    PI, _, drp = make_indicators(passive, "passive")
 
     r = quali_passive(C, PI; normalize = normalize)
 
     vnames = if typeof(passive) <: AbstractDataFrame
@@ -508,19 +508,21 @@ end
 # Create an indicator matrix corresponding to the distinct
 # values in the vector 'z'. Also returns dictionaries mapping
 # the unique values to column offsets, and mapping the column
 # offsets to the unique values.
-function make_single_indicator(z::Vector{T}) where {T}
+function make_single_indicator(z::AbstractVector, vtype::String, pos::Int)
 
     n = length(z)
 
     # Unique values of the variable
     uq = sort(unique(z))
 
-    if length(uq) > 50
-        @warn("Nominal variable has more than 50 levels")
+    # This situation usually results from user error, so warn.
+    if length(uq) > 20
+        @warn("$(titlecase(vtype)) variable in column $(pos) has more than 20 levels")
     end
 
     # Recoding dictionary, maps each distinct value in z to
     # an offset
+    T = eltype(z)
     rd = Dict{T,Int}()
     rdi = []
     for (j, v) in enumerate(uq)
@@ -551,7 +553,7 @@ end
 # In addition to the indicator matrix, return vectors of
 # dictionaries mapping levels to positions and positions
 # to levels for each variable.
-function make_indicators(Z)
+function make_indicators(Z, vtype::String)
 
     if size(Z, 1) == 0
         return zeros(0, 0), Dict[], Vector[]
     end
 
     rd, rdi = Dict[], Vector[]
     XX = []
     for j = 1:size(Z, 2)
-        X, dv, di = make_single_indicator(Z[:, j])
+        X, dv, di = make_single_indicator(Z[:, j], vtype, j)
         push!(rd, dv)
         push!(rdi, di)
         push!(XX, X)

From f4c5d3e0895c6cf4bfb868df1d6999bfe219059f Mon Sep 17 00:00:00 2001
From: kshedden
Date: Tue, 3 Sep 2024 20:36:11 -0400
Subject: [PATCH 10/11] Fix project.toml

---
 Project.toml | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/Project.toml b/Project.toml
index f07ec20..a1bd61f 100644
--- a/Project.toml
+++ b/Project.toml
@@ -8,7 +8,7 @@ version = "0.10.3"
 
 [deps]
 Arpack = "7d9fca2a-8960-54d3-9f78-7d1dccf2cb97"
-DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
+Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 
 [compat]
 Arpack = "0.3, 0.4, 0.5"
+Distributions = "0.25"
+StableRNGs = "1"
+Statistics = "1"
 StatsAPI = "^1.3"
-StatsBase = "^0.33"
+StatsBase = "^0.33, 0.34"
 julia = "1.1"
 
 [extras]
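# The Distributions dependency added above supports the F-based hypothesis
# tests exported in the next patch. A sketch of the pattern (the statistic
# and degrees of freedom here are illustrative only):
using Distributions: cdf, FDist
fstat, df1, df2 = 4.2, 3.0, 20.0
p = 1 - cdf(FDist(df1, df2), fstat)   # right-tail p-value for an F statistic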
From febc790c9f0c55498f39c75edd81b332c2c71a0e Mon Sep 17 00:00:00 2001
From: kshedden
Date: Tue, 3 Sep 2024 20:37:10 -0400
Subject: [PATCH 11/11] fix multivariatestats.jl

---
 src/MultivariateStats.jl | 24 ++++++++----------------
 1 file changed, 8 insertions(+), 16 deletions(-)

diff --git a/src/MultivariateStats.jl b/src/MultivariateStats.jl
index 0528b12..13d33f6 100644
--- a/src/MultivariateStats.jl
+++ b/src/MultivariateStats.jl
@@ -1,16 +1,16 @@
 module MultivariateStats
 
-    using DataFrames
     using LinearAlgebra
     using SparseArrays
     using Statistics: middle
-    using StatsAPI: RegressionModel
+    using Distributions: cdf, FDist
+    using StatsAPI: RegressionModel, HypothesisTest
     using StatsBase: SimpleCovariance, CovarianceEstimator, AbstractDataTransform,
                      ConvergenceException, pairwise, pairwise!, CoefTable
 
     import Statistics: mean, var, cov, covm, cor
     import Base: length, size, show
-    import StatsAPI: fit, predict, coef, weights, dof, r2
+    import StatsAPI: fit, predict, coef, weights, dof, r2, pvalue
     import LinearAlgebra: eigvals, eigvecs
 
     export
@@ -67,7 +67,10 @@ module MultivariateStats
         KernelPCA,           # Type: the Kernel PCA model
 
         ## cca
-        CCA,                 # Type: Correlation Component Analysis model
+        CCA,                 # Type: Correlation Component Analysis model
+        WilksLambdaTest,     # Wilks lambda statistics and tests
+        PillaiTraceTest,     # Pillai trace statistics and tests
+        LawleyHotellingTest, # Lawley-Hotelling statistics and tests
         ccacov,              # CCA based on covariances
         ccasvd,              # CCA based on singular value decomposition of input data
@@ -107,17 +110,7 @@ module MultivariateStats
         FactorAnalysis,      # Type: the Factor Analysis model
 
         faem,                # EM algorithm for factor analysis
-        facm,                # CM algorithm for factor analysis
-
-        ## ca, mca
-        CA,                  # Type: correspondence analysis
-
-        MCA,                 # Type: multiple correspondence analysis
-        object_coords,       # return the object scores or coordinates from CA or MCA
-        variable_coords,     # return the variable/category scores or coordinates from CA or MCA
-        inertia,             # return the inertia (derived from eigenvalues) for CA
-        ca_stats,            # fit statistics
-        quali_passive        # handle qualitative passive variables
+        facm                 # CM algorithm for factor analysis
 
     ## source files
     include("types.jl")
@@ -133,7 +126,6 @@ module MultivariateStats
     include("lda.jl")
     include("ica.jl")
     include("fa.jl")
-    include("mca.jl")
 
     ## deprecations
     @deprecate indim(f) size(f,1)