rggplot2pca

Vectors on "handmade" PCA don't match the output biplot


I used the R package PCAtools to generate a PCA of gene count metagenomic data. When visualizing the biplot using PCAtools, the loadings are "appropriately" sized for the data dispersion (I understand vector length is representative of impact on dispersal). See below: ex is a normalized gene count table subset. dput(ex) is below.

library(PCAtools)
p <- pca(ex,removeVar = 0.1)
biplot(p,lab=NULL,hline = 0, vline = 0,
       showLoadings = T)
# this gives a figure with 3 loadings, automatically made from PCAtools.

I then extracted the rotated data and variable loadings to generate my own PCA plot with ggplot2. However, the loading vector lengths on my plot are minuscule and don't match the PCAtools biplot.

# select matching loadings
loadplot <- c("K02469","K10118","K20444")
# extract PC scores and loadings
PC <- p$rotated
load <-p$loadings
load <- na.omit(load[loadplot,])

# image with ggplot2
p.pcaNORM <- ggplot(data = PC, aes(x = PC1, y = PC2)) +
  geom_point(aes(),shape = 21,colour = "black", size = 5) +
  geom_text(data=load, aes(x=PC1, y=PC2, label=row.names(load)),
            inherit.aes = FALSE, size=3) +
  geom_segment(data=load, aes(x=0, xend=load$PC1, y=0, yend=load$PC2), inherit.aes = FALSE,
               arrow = arrow(length = unit(0.25, "cm")), colour = "grey") +
  labs(x = "PC1 27.8%", y = "PC2 12.3%", title="PCA of Gene Count Differential Abundances") +
  geom_hline(yintercept = 0, linetype="dashed",color="black") +
  geom_vline(xintercept=0, linetype="dashed",color="black") +
  theme_linedraw() +
  theme(legend.position  = "bottom", 
        legend.box       = "horizontal",
        legend.direction = "horizontal",
        panel.background = element_blank(),
        axis.text.x      = element_text(size=8),
        axis.text.y      = element_text(size=8),
        plot.title       = element_text(size = 10))
print(p.pcaNORM)

You may then see, for example, the loading "K20444" ends between -1 and -2.5 in the PCAtools biplot but it ends between 0 and -1 in my ggplot2 biplot. Does anyone know what is happening?

> dput(ex) structure(list(Ga0598246 = c(6.56539918955656, 6.07685285958297, 
9.80857543461825, 7.45380707677968, 5.50253732907563, 0, 1.00547341301176, 
9.57886787328451, 5.91763530872237, 9.13761113968035), Ga0598239 = c(5.37714039215198, 
4.44627250098116, 9.12527549535166, 7.37818173071451, 4.22343869037075,  0, 0, 9.41186693570199, 2.04265725161515, 9.06211714113395), 
    Ga0598242 = c(7.23329507168685, 3.30011916965994, 9.14571484611697, 
    6.63420100118826, 5.33389579265697, 0, 0, 9.23317518972483, 
    4.83457149887647, 9.02564207931812), Ga0598241 = c(6.42805923331295, 
    4.41343389152249, 9.17585857319441, 6.53724813980237, 5.95254094365946, 
    0, 1.55299220383005, 9.34416529172944, 5.9067619585435, 9.35272687344133
    ), Ga0598244 = c(6.51891536576828, 4.56535598153963, 9.40792503580225, 
    6.60909974507472, 5.37234302403178, 0, 0.989803762135471, 
    9.61255610440809, 5.76126935753228, 9.26497019795582), Ga0598240 = c(6.59592709583215, 
    5.0116546308929, 9.10583727581318, 6.45845808171092, 6.32419402380456, 
    0, 1.97474341675587, 9.59380242030398, 6.25206560327315, 
    9.37141719913754), Ga0598243 = c(6.48685136970719, 4.54940912022517, 
    8.95163946868835, 6.2487649672149, 6.21130294335921, 0, 1.97225049345733, 
    9.55911910161097, 6.0719500604804, 9.61218376859803), Ga0598247 = c(3.42017839411825, 
    5.22468821931174, 9.43811837974567, 8.49700801179772, 5.02416088318049, 
    0, 0, 10.2938603383254, 0, 8.70421802755197), Ga0598259 = c(6.85309401728228, 
    5.08842002313584, 9.16500402830579, 6.20640673224555, 5.24441419037537, 
    0, 0, 9.46180201100927, 5.5134145812396, 9.53162068178508
    ), Ga0598260 = c(6.7006085700961, 4.80510960111303, 9.06424555064897, 
    7.05413314135196, 5.56105321976388, 0, 0, 9.94666671304116, 
    5.99044411371985, 9.14986983395412)), row.names = c("K20444",  "K01711", "K10119", "K10118", "K02469", "K03046", "K14358", "K00945",  "K12454", "K01129"), class = "data.frame")

Solution

  • The issue is that biplot scales the loadings under the hood, i.e. first the loadings vectors are scaled to the range of the rotated data. Second, the length of the vectors are scaled to the value of the lengthLoadingsArrowsFactor= parameter which defaults to 1.5.

    library(PCAtools)
    library(ggplot2)
    
    lengthLoadingsArrowsFactor <- 1.5
    
    r <- min(
      (max(p$rotated[["PC1"]]) - min(p$rotated[["PC1"]]) /
        (max(p$loadings[["PC1"]]) - min(p$loadings[["PC1"]]))),
      (max(p$rotated[["PC2"]]) - min(p$rotated[["PC2"]]) /
        (max(p$loadings[["PC2"]]) - min(p$loadings[["PC2"]])))
    )
    
    ggplot(data = PC, aes(x = PC1, y = PC2)) +
      geom_point(aes(), shape = 21, colour = "black", size = 5) +
      geom_text(
        data = load, aes(
          x = PC1 * r * lengthLoadingsArrowsFactor,
          y = PC2 * r * lengthLoadingsArrowsFactor,
          label = row.names(load)
        ),
        inherit.aes = FALSE, size = 3
      ) +
      geom_segment(
        data = load, aes(
          x = 0, xend = PC1 * r * lengthLoadingsArrowsFactor,
          y = 0, yend = PC2 * r * lengthLoadingsArrowsFactor
        ), inherit.aes = FALSE,
        arrow = arrow(length = unit(0.25, "cm")), colour = "grey"
      ) +
      labs(x = "PC1 27.8%", y = "PC2 12.3%", title = "PCA of Gene Count Differential Abundances") +
      geom_hline(yintercept = 0, linetype = "dashed", color = "black") +
      geom_vline(xintercept = 0, linetype = "dashed", color = "black") +
      theme_linedraw() +
      theme(
        legend.position = "bottom",
        legend.box = "horizontal",
        legend.direction = "horizontal",
        panel.background = element_blank(),
        axis.text.x = element_text(size = 8),
        axis.text.y = element_text(size = 8),
        plot.title = element_text(size = 10)
      ) +
      xlim(-2.5, 6.25) +
      ylim(-2.5, 1.5)
    

    
    biplot(p,
      lab = NULL, hline = 0, vline = 0, showLoadings = TRUE
    )