rggplot2ellipse

stat_ellipse() in MCA plot does not cover jittered points / extends far beyond the data


I am creating a Multiple Correspondence Analysis (MCA) plot in R using FactoMineR, factoextra, and ggplot2. The goal is to add confidence ellipses around the archetype categories in the MCA space.

The ellipses produced by stat_ellipse() do not match the distribution of the points:

example

How can I generate ellipses in an MCA plot that accurately reflect the distribution of the points?

Code:

pacman::p_load(FactoMineR, factoextra, dplyr, gridExtra, tidyr)

# MCA with template as supplementary
mca_input <- all_df |> select(sector, type, template)
mca_res <- MCA(mca_input, quali.sup = 3, graph = FALSE)

# Extract coordinates
mca_coords <- as.data.frame(mca_res$ind$coord)
mca_coords$archetype <- all_df$template

# Test 1: Original variable associations (Fisher)
fish_type <- fisher.test(table(all_df$template, all_df$type), simulate.p.value = TRUE)
fish_sector <- fisher.test(table(all_df$template, all_df$sector), simulate.p.value = TRUE)

# Test 2: MCA dimensional separation (Kruskal-Wallis)
kw_dim1 <- kruskal.test(`Dim 1` ~ archetype, data = mca_coords)
kw_dim2 <- kruskal.test(`Dim 2` ~ archetype, data = mca_coords)

# Plot 1: MCA biplot
p1 <- ggplot() +
  geom_hline(yintercept = 0, color = "grey50", linewidth = 0.5, linetype = "dashed") +
  geom_vline(xintercept = 0, color = "grey50", linewidth = 0.5, linetype = "dashed") +
  geom_jitter(data = mca_coords, 
              aes(x = `Dim 1`, y = `Dim 2`, color = archetype),
              size = 3, alpha = 0.6, width = 0.03, height = 0.03) +
  stat_ellipse(data = mca_coords,
               aes(x = `Dim 1`, y = `Dim 2`, color = archetype),
               level = 0.68, linewidth = 0.7) +
  labs(title = "(A) Archetype Clustering in Feature Space",
       x = paste0("Dim 1: Essential ↔ Non-essential (", round(mca_res$eig[1,2], 1), "%)"),
       y = paste0("Dim 2: Retail/Commercial ↔ Industrial (", round(mca_res$eig[2,2], 1), "%)"),
       color = "Archetype") +
  theme_minimal() +
  theme(panel.grid = element_blank(),
        legend.position = "bottom")

p1

Dataset:

> dput(all_df)
structure(list(city = c("amsterdam", "ba", "berlin", "brisbane", 
"cairo", "caracas", "dallas", "delhi", "dubai", "frankfurt", 
"guangzhou", "istanbul", "johannesburg", "la", "lima", "london", 
"madrid", "manchester", "melbourne", "milan", "mumbai", "munich", 
"nairobi", "paris", "pune", "rio", "rome", "santiago", "shanghai", 
"shenzhen", "sydney", "vienna", "almaty", "amsterdam", "ba", 
"baku", "caracas", "chicago", "dallas", "johannesburg", "la", 
"lima", "madrid", "manchester", "melbourne", "mexico", "milan", 
"ny", "paris", "abu", "almaty", "amsterdam", "athens", "ba", 
"baku", "beijing", "berlin", "brisbane", "cairo", "cape", "caracas", 
"chicago", "dallas", "delhi", "dubai", "frankfurt", "guangzhou", 
"hk", "istanbul", "jeddah", "johannesburg", "la", "lahore", "lima", 
"london", "madrid", "manchester", "melbourne", "mexico", "milan", 
"mumbai", "munich", "nairobi", "ny", "paris", "pune", "rio", 
"riyadh", "rome", "santiago", "shanghai", "shenzhen", "sp", "sydney", 
"vienna", "wash", "wuhan"), template = c("Chronic decline", "Resilient", 
"Chronic decline", "Resilient", "Full recovery", "Resilient", 
"Resilient", "Full recovery", "Full recovery", "Chronic decline", 
"Partial recovery", "Chronic decline", "Chronic decline", "Full recovery", 
"Resilient", "Chronic decline", "Full recovery", "Chronic decline", 
"Partial recovery", "Chronic decline", "Full recovery", "Chronic decline", 
"Full recovery", "Chronic decline", "Resilient", "Full recovery", 
"Chronic decline", "Resilient", "Chronic decline", "Resilient", 
"Partial recovery", "Chronic decline", "Resilient", "Chronic decline", 
"Resilient", "Resilient", "Resilient", "Full recovery", "Resilient", 
"Chronic decline", "Resilient", "Resilient", "Full recovery", 
"Chronic decline", "Partial recovery", "Full recovery", "Chronic decline", 
"Resilient", "Chronic decline", "Chronic decline", "Partial recovery", 
"Chronic decline", "Full recovery", "Resilient", "Resilient", 
"Resilient", "Chronic decline", "Resilient", "Partial recovery", 
"Chronic decline", "Resilient", "Partial recovery", "Resilient", 
"Full recovery", "Full recovery", "Chronic decline", "Partial recovery", 
"Full recovery", "Chronic decline", "Chronic decline", "Chronic decline", 
"Partial recovery", "Partial recovery", "Resilient", "Chronic decline", 
"Full recovery", "Chronic decline", "Full recovery", "Full recovery", 
"Chronic decline", "Resilient", "Chronic decline", "Partial recovery", 
"Resilient", "Chronic decline", "Resilient", "Full recovery", 
"Full recovery", "Full recovery", "Resilient", "Chronic decline", 
"Resilient", "Resilient", "Partial recovery", "Chronic decline", 
"Partial recovery", "Resilient"), type = c("non-essential", "mix", 
"non-essential", "mix", "mix", "mix", "mix", "mix", "non-essential", 
"non-essential", "non-essential", "non-essential", "mix", "mix", 
"non-essential", "non-essential", "mix", "non-essential", "mix", 
"non-essential", "non-essential", "non-essential", "mix", "non-essential", 
"non-essential", "mix", "non-essential", "mix", "non-essential", 
"non-essential", "mix", "non-essential", "essential", "non-essential", 
"mix", "essential", "mix", "mix", "mix", "non-essential", "mix", 
"essential", "mix", "non-essential", "mix", "non-essential", 
"non-essential", "mix", "non-essential", "mix", "mix", "non-essential", 
"mix", "mix", "mix", "essential", "non-essential", "mix", "non-essential", 
"non-essential", "essential", "mix", "mix", "mix", "non-essential", 
"non-essential", "non-essential", "mix", "non-essential", "non-essential", 
"non-essential", "mix", "mix", "mix", "non-essential", "mix", 
"non-essential", "mix", "mix", "non-essential", "mix", "non-essential", 
"non-essential", "non-essential", "mix", "mix", "mix", "non-essential", 
"mix", "essential", "non-essential", "non-essential", "mix", 
"non-essential", "non-essential", "non-essential", "mix"), sector = c("Commercial", 
"Commercial", "Commercial", "Commercial", "Commercial", "Commercial", 
"Commercial", "Commercial", "Commercial", "Commercial", "Commercial", 
"Commercial", "Commercial", "Commercial", "Commercial", "Commercial", 
"Commercial", "Commercial", "Commercial", "Commercial", "Commercial", 
"Commercial", "Commercial", "Commercial", "Commercial", "Commercial", 
"Commercial", "Commercial", "Commercial", "Commercial", "Commercial", 
"Commercial", "Retail", "Retail", "Retail", "Retail", "Retail", 
"Retail", "Retail", "Retail", "Retail", "Retail", "Retail", "Retail", 
"Retail", "Retail", "Retail", "Retail", "Retail", "Industrial", 
"Industrial", "Industrial", "Industrial", "Industrial", "Industrial", 
"Industrial", "Industrial", "Industrial", "Industrial", "Industrial", 
"Industrial", "Industrial", "Industrial", "Industrial", "Industrial", 
"Industrial", "Industrial", "Industrial", "Industrial", "Industrial", 
"Industrial", "Industrial", "Industrial", "Industrial", "Industrial", 
"Industrial", "Industrial", "Industrial", "Industrial", "Industrial", 
"Industrial", "Industrial", "Industrial", "Industrial", "Industrial", 
"Industrial", "Industrial", "Industrial", "Industrial", "Industrial", 
"Industrial", "Industrial", "Industrial", "Industrial", "Industrial", 
"Industrial", "Industrial")), class = "data.frame", row.names = c(NA, 
-97L))

Session Info:

R version 4.5.2 (2025-10-31 ucrt)
Platform: x86_64-w64-mingw32/x64
Running under: Windows 11 x64 (build 26200)

Matrix products: default
  LAPACK version 3.12.1

locale:
[1] LC_COLLATE=English_United States.utf8  LC_CTYPE=English_United States.utf8    LC_MONETARY=English_United States.utf8
[4] LC_NUMERIC=C                           LC_TIME=English_United States.utf8    

time zone: Europe/Bucharest
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] tidyr_1.3.1      gridExtra_2.3    dplyr_1.1.4      factoextra_1.0.7 ggplot2_4.0.1    FactoMineR_2.12 

loaded via a namespace (and not attached):
 [1] utf8_1.2.6           sandwich_3.1-1       generics_0.1.4       lattice_0.22-7       digest_0.6.38        magrittr_2.0.4      
 [7] grid_4.5.2           estimability_1.5.1   RColorBrewer_1.1-3   mvtnorm_1.3-3        fastmap_1.2.0        Matrix_1.7-4        
[13] ggrepel_0.9.6        Formula_1.2-5        survival_3.8-3       multcomp_1.4-29      purrr_1.2.0          scales_1.4.0        
[19] TH.data_1.1-5        isoband_0.2.7        codetools_0.2-20     abind_1.4-8          cli_3.6.5            rlang_1.1.6         
[25] scatterplot3d_0.3-44 splines_4.5.2        leaps_3.2            withr_3.0.2          tools_4.5.2          multcompView_0.1-10 
[31] coda_0.19-4.1        DT_0.34.0            flashClust_1.01-2    vctrs_0.6.5          R6_2.6.1             zoo_1.8-14          
[37] lifecycle_1.0.4      emmeans_2.0.0        car_3.1-3            htmlwidgets_1.6.4    MASS_7.3-65          cluster_2.1.8.1     
[43] pkgconfig_2.0.3      pillar_1.11.1        gtable_0.3.6         glue_1.8.0           Rcpp_1.1.0           tibble_3.3.0        
[49] tidyselect_1.2.1     rstudioapi_0.17.1    dichromat_2.0-0.1    farver_2.1.2         xtable_1.8-4         htmltools_0.5.8.1   
[55] carData_3.0-5        labeling_0.4.3       compiler_4.5.2       S7_0.2.1

Solution

  • My solution was to use convex hull instead of ellipses:

    library(ggforce)
    
    p1 <- ggplot(mca_coords, aes(x = `Dim 1`, y = `Dim 2`, color = archetype)) +
      geom_hline(yintercept = 0, color = "grey50", linewidth = 0.5, linetype = "dashed") +
      geom_vline(xintercept = 0, color = "grey50", linewidth = 0.5, linetype = "dashed") +
      geom_mark_hull(aes(fill = archetype), alpha = 0.15, concavity = 1, 
                     expand = unit(2, "mm"), radius = unit(2, "mm")) +
      geom_jitter(size = 3, alpha = 0.6, width = 0.03, height = 0.03) +
      labs(title = "(A) Archetype Clustering in Feature Space",
           x = paste0("Dim 1: Essential ↔ Non-essential (", round(mca_res$eig[1,2], 1), "%)"),
           y = paste0("Dim 2: Retail/Commercial ↔ Industrial (", round(mca_res$eig[2,2], 1), "%)"),
           color = "Archetype",
           fill = "Archetype") +
      theme_minimal() +
      theme(panel.grid = element_blank(),
            legend.position = "bottom") + theme(panel.grid = element_blank(),
                                                legend.position = "bottom",
                                                axis.line = element_line(linewidth = 1))
    
    p1
    

    enter image description here