rvariancegeometric-mean

DESEQ2: varianceStabilizingTransformation Error: every gene contains at least one zero, cannot compute log geometric means


I have a data-set comprising of an OTU table, where rows = samples and columns = OTUs, like this:

Otus <- data.frame(OTU_1 = c(0, 0, 1), OTU_2 = c(12, 0, 5), OTU_3 = c(0, 5, 3), row.names = c("S_1", "S_2", "S_3"))

I moved it from phyloseq over to DESeq2 using the phyloseq_to_deseq2 command without issue. Now that it's a DESeq2object, I want to normalize the OTU table using the variance stabilizing transformation with the following command:

vsd <- varianceStabilizingTransformation(DESeq2_Object, blind = TRUE)

However, I keep getting this error:

Error in estimateSizeFactorsForMatrix(counts(object), locfunc = locfunc, : every gene contains at least one zero, cannot compute log geometric means

After doing extensive research to understand this error, I found that what it means is that if each of my columns (OTU_1, OTU_2, OTU_3, etc.) contain a zero, DESeq2 cannot compute its geometric means. This is indeed my case, as all of my columns do contain at least 1 zero, for each column. However, I have found a work around to the exact same error with a different command used to compute differential expression. The work-around in this case is to apply the sfType = poscounts within the DESeqcommand, like this:

Diffs <- DESeq(DESeq2_Object, test = "Wald", fitType = "parametric", sfType = "poscounts")

However, this command does not populate the variance stabilization matrix that I am after, it only computes differential expression from the raw OTU table.

I have looked into the vignette and read up on the varianceStabilizingTransformationcommand (using R) but it does not have the sfType flag to overcome this issue.

Is there a way around this error, so I can obtain a variance stabilized matrix?


Solution

  • What is your intended purpose for the variance-stabilized matrix? Are you then going to use the matrix to calculate differential abundance? In this tutorial, the authors (Susan Holmes and Joey McMurdie) refer to this tutorial for conducting differential abundance testing using the altered geometric mean formula, e.g.

    #BiocManager::install("phyloseq")
    #BiocManager::install("DESeq2")
    library(phyloseq)
    library(DESeq2)
    #> Loading required package: S4Vectors
    #> Loading required package: stats4
    #> Loading required package: BiocGenerics
    #> Loading required package: parallel
    #> Loading required package: IRanges
    #> Loading required package: GenomicRanges
    #> Loading required package: GenomeInfoDb
    #> Loading required package: SummarizedExperiment
    #> Loading required package: MatrixGenerics
    #> Loading required package: matrixStats
    
    
    # Load example data
    otufile = system.file("extdata", "GP_otu_table_rand_short.txt.gz", package = "phyloseq")
    mapfile = system.file("extdata", "master_map.txt", package = "phyloseq")
    trefile = system.file("extdata", "GP_tree_rand_short.newick.gz", package = "phyloseq")
    qiimex = import_qiime(otufile, mapfile, trefile, showProgress = FALSE)
    #> Processing map file...
    #> Processing otu/tax file...
    #> Reading file into memory prior to parsing...
    #> Detecting first header line...
    #> Header is on line 2  
    #> Converting input file to a table...
    #> Defining OTU table... 
    #> Parsing taxonomy table...
    #> Processing phylogenetic tree...
    #>  /Library/Frameworks/R.framework/Versions/4.1/Resources/library/phyloseq/extdata/GP_tree_rand_short.newick.gz ...
    #> Warning in import_qiime(otufile, mapfile, trefile, showProgress = FALSE):
    #> treefilename failed import. It will not be included.
    qiimex
    #> phyloseq-class experiment-level object
    #> otu_table()   OTU Table:         [ 500 taxa and 26 samples ]
    #> sample_data() Sample Data:       [ 26 samples by 7 sample variables ]
    #> tax_table()   Taxonomy Table:    [ 500 taxa by 7 taxonomic ranks ]
    
    # Select samples
    qiimex@sam_data$SampleType
    #>  [1] "Freshwater (creek)" "Freshwater (creek)" "Freshwater (creek)"
    #>  [4] "Soil"               "Soil"               "Mock"              
    #>  [7] "Mock"               "Mock"               "Skin"              
    #> [10] "Freshwater"         "Feces"              "Skin"              
    #> [13] "Tongue"             "Feces"              "Skin"              
    #> [16] "Tongue"             "Ocean"              "Ocean"             
    #> [19] "Ocean"              "Freshwater"         "Soil"              
    #> [22] "Sediment (estuary)" "Sediment (estuary)" "Sediment (estuary)"
    #> [25] "Feces"              "Feces"
    qiimex_subset <- subset_samples(qiimex, SampleType == "Soil" | SampleType == "Mock")
    
    # Convert to DESeq2 object
    DESeq2_Object <- phyloseq_to_deseq2(qiimex_subset, ~ SampleType)
    #> converting counts to integer mode
    #> Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
    #> design formula are characters, converting to factors
    
    # calculate geometric means prior to estimate size factors
    gm_mean = function(x, na.rm=TRUE){
      exp(sum(log(x[x > 0]), na.rm=na.rm) / length(x))
    }
    geoMeans = apply(counts(DESeq2_Object), 1, gm_mean)
    DESeq2_dds = estimateSizeFactors(DESeq2_Object, geoMeans = geoMeans)
    DESeq2_dds = DESeq(DESeq2_dds, fitType="local")
    #> using pre-existing size factors
    #> estimating dispersions
    #> gene-wise dispersion estimates
    #> mean-dispersion relationship
    #> final dispersion estimates
    #> fitting model and testing
    
    # Explore results
    res = results(DESeq2_dds)
    res = res[order(res$padj, na.last=NA), ]
    alpha = 0.01
    sigtab = res[(res$padj < alpha), ]
    sigtab = cbind(as(sigtab, "data.frame"), as(tax_table(qiimex_subset)[rownames(sigtab), ], "matrix"))
    head(sigtab)
    #>         baseMean log2FoldChange    lfcSE      stat       pvalue         padj
    #> 184403 3316.1856     -11.604876 1.446460 -8.022948 1.032372e-15 1.827298e-13
    #> 193343  820.0846     -10.147175 1.317754 -7.700354 1.356898e-14 1.200855e-12
    #> 10113   316.3938      -9.216756 1.313276 -7.018143 2.248365e-12 1.326535e-10
    #> 234421  238.0599     -11.167192 1.657220 -6.738510 1.600195e-11 7.080861e-10
    #> 526081  259.6722      -9.346145 1.471139 -6.352999 2.111570e-10 7.474959e-09
    #> 313166 1322.7565      -9.911298 1.593579 -6.219519 4.986800e-10 1.471106e-08
    #>         Kingdom         Phylum               Class             Order
    #> 184403 Bacteria     Firmicutes          Clostridia     Clostridiales
    #> 193343 Bacteria     Firmicutes          Clostridia     Clostridiales
    #> 10113  Bacteria Proteobacteria Gammaproteobacteria Enterobacteriales
    #> 234421 Bacteria     Firmicutes          Clostridia     Clostridiales
    #> 526081 Bacteria     Firmicutes          Clostridia     Clostridiales
    #> 313166 Bacteria  Bacteroidetes         Bacteroidia     Bacteroidales
    #>                    Family        Genus            Species
    #> 184403    Lachnospiraceae         <NA>               <NA>
    #> 193343    Lachnospiraceae    Roseburia Eubacteriumrectale
    #> 10113  Enterobacteriaceae         <NA>               <NA>
    #> 234421    Lachnospiraceae Ruminococcus               <NA>
    #> 526081    Lachnospiraceae Oribacterium               <NA>
    #> 313166     Bacteroidaceae  Bacteroides               <NA>
    posigtab = sigtab[sigtab[, "log2FoldChange"] > 0, ]
    posigtab = posigtab[, c("baseMean", "log2FoldChange", "lfcSE", "padj", "Phylum", "Class", "Family", "Genus")]
    posigtab
    #>          baseMean log2FoldChange    lfcSE         padj          Phylum
    #> 303295 201.335216       6.506674 1.204792 1.305775e-06  Proteobacteria
    #> 5552   196.834531       6.309543 1.178371 1.518862e-06  Proteobacteria
    #> 236550  69.552022       8.770931 1.815173 1.840288e-05  Proteobacteria
    #> 113626 844.163976       7.207704 1.503844 2.078799e-05   Acidobacteria
    #> 97627  171.891263       6.169004 1.307597 2.812963e-05  Proteobacteria
    #> 89763   52.875334       8.375284 1.795633 3.426524e-05  Actinobacteria
    #> 306684 149.608143       5.964643 1.346820 9.322811e-05   Bacteroidetes
    #> 218985  50.742401       8.315832 1.882803 9.334941e-05     Chloroflexi
    #> 102382 335.108981       6.747414 1.555112 1.267530e-04  Proteobacteria
    #> 206632 332.777997       7.486585 1.747926 1.553274e-04 Verrucomicrobia
    #> 573607 178.082806       6.729167 1.614098 2.360541e-04   Acidobacteria
    #> 593006 146.954357       7.392336 1.781983 2.469451e-04  Proteobacteria
    #> 279590 151.624630       6.576054 1.637287 3.873396e-04   Acidobacteria
    #> 89337   56.919233       6.692225 1.663102 3.873396e-04  Actinobacteria
    #> 218035  43.876643       7.133246 1.793490 4.373432e-04  Actinobacteria
    #> 368027  18.296878       6.844054 1.791678 7.384186e-04  Actinobacteria
    #> 572242  62.103316       6.140545 1.616489 7.802034e-04  Planctomycetes
    #> 588604 211.108277       6.589139 1.738856 7.862979e-04  Proteobacteria
    #> 212596 183.109699       6.931159 1.846832 8.501324e-04   Acidobacteria
    #> 237963 113.390704       7.549096 2.013483 8.501324e-04  Proteobacteria
    #> 548077  23.508540       7.205060 1.921985 8.501324e-04  Actinobacteria
    #> 222914 348.580255       7.393221 2.040394 1.319435e-03 Verrucomicrobia
    #> 265094 519.413992       5.885380 1.636179 1.389556e-03  Actinobacteria
    #> 341901  26.562097       7.381503 2.068411 1.512002e-03  Actinobacteria
    #> 104310  11.372452       6.157420 1.746257 1.715256e-03  Actinobacteria
    #> 561842  51.066519       5.459834 1.549686 1.715256e-03  Proteobacteria
    #> 248299  16.577945       6.700539 1.906851 1.736686e-03  Actinobacteria
    #> 166835 868.093822       6.189342 1.766318 1.762950e-03  Proteobacteria
    #> 369734  32.758200       6.711075 1.919352 1.775080e-03  Proteobacteria
    #> 295422  15.101303       6.565460 1.913842 2.221501e-03  Proteobacteria
    #> 137099  25.876135       6.367122 1.867180 2.254420e-03 Verrucomicrobia
    #> 254706  13.039735       6.358471 1.862435 2.254420e-03   Acidobacteria
    #> 204462 137.941401       6.613294 1.963268 2.572456e-03   Acidobacteria
    #> 160337  18.708823       5.893400 1.783191 3.172144e-03  Actinobacteria
    #> 144065  23.284353       7.191156 2.184408 3.260211e-03  Proteobacteria
    #> 113959 894.081156       6.136411 1.870146 3.325954e-03   Acidobacteria
    #> 223948  10.185674       6.003017 1.833057 3.341251e-03   Acidobacteria
    #> 160135   8.669323       5.764444 1.764057 3.366660e-03  Actinobacteria
    #> 244840   8.505259       5.736708 1.759948 3.404914e-03  Planctomycetes
    #> 240591 179.115203       5.662464 1.749499 3.628632e-03  Proteobacteria
    #> 547752  22.387044       6.156040 1.918045 3.921972e-03  Proteobacteria
    #> 163176  17.574738       5.803532 1.823396 4.230077e-03  Proteobacteria
    #> 321885  24.312098       6.270669 2.013227 5.172611e-03  Planctomycetes
    #> 329327  16.600143       5.717837 1.839840 5.212980e-03   Fibrobacteres
    #> 262115  25.392319       7.316622 2.362343 5.319945e-03             SC4
    #> 512616  15.661404       5.632695 1.854032 6.385192e-03  Proteobacteria
    #> 261663  25.664154       4.953787 1.643615 6.812189e-03     Chloroflexi
    #> 257151  16.884984       4.909367 1.658737 8.015407e-03   Bacteroidetes
    #> 238929  25.265076       4.509092 1.544313 8.856346e-03 Verrucomicrobia
    #> 322045   6.504135       5.355636 1.834044 8.856346e-03  Proteobacteria
    #>                      Class                      Family                Genus
    #> 303295 Deltaproteobacteria               Haliangiaceae                 <NA>
    #> 5552   Alphaproteobacteria            Caulobacteraceae     Phenylobacterium
    #> 236550  Betaproteobacteria            Oxalobacteraceae             Massilia
    #> 113626  Chloracidobacteria                        <NA>                 <NA>
    #> 97627   Betaproteobacteria              Comamonadaceae          Hylemonella
    #> 89763       Actinobacteria                        <NA>                 <NA>
    #> 306684     Sphingobacteria                        <NA>                 <NA>
    #> 218985              SOGA31                        <NA>                 <NA>
    #> 102382  Betaproteobacteria            Oxalobacteraceae             Massilia
    #> 206632      Spartobacteria          Spartobacteriaceae                 MC18
    #> 573607  Chloracidobacteria                        <NA>                 <NA>
    #> 593006 Gammaproteobacteria             Sinobacteraceae                 <NA>
    #> 279590       Acidobacteria                        <NA>                 <NA>
    #> 89337       Actinobacteria          Pseudonocardiaceae    Actinoalloteichus
    #> 218035      Actinobacteria           Microbacteriaceae                 <NA>
    #> 368027      Actinobacteria                        <NA>                 <NA>
    #> 572242        Planctomycea                 Gemmataceae                 <NA>
    #> 588604 Alphaproteobacteria           Rhodospirillaceae                 <NA>
    #> 212596        Solibacteres             Solibacteraceae CandidatusSolibacter
    #> 237963 Deltaproteobacteria                        <NA>                 <NA>
    #> 548077      Actinobacteria                        <NA>                 <NA>
    #> 222914    Verrucomicrobiae Verrucomicrobiasubdivision3                 <NA>
    #> 265094      Actinobacteria          Micromonosporaceae       Micromonospora
    #> 341901      Actinobacteria             Nocardioidaceae                 <NA>
    #> 104310      Actinobacteria                 Frankiaceae                 <NA>
    #> 561842 Alphaproteobacteria                        <NA>                 <NA>
    #> 248299      Actinobacteria                        <NA>                 <NA>
    #> 166835  Betaproteobacteria            Burkholderiaceae         Burkholderia
    #> 369734 Alphaproteobacteria           Rhodospirillaceae                 <NA>
    #> 295422  Betaproteobacteria              Alcaligenaceae           Sutterella
    #> 137099    Verrucomicrobiae                        <NA>                 <NA>
    #> 254706  Chloracidobacteria                        <NA>                 <NA>
    #> 204462        Solibacteres             Solibacteraceae CandidatusSolibacter
    #> 160337      Actinobacteria                 Frankiaceae                 <NA>
    #> 144065 Alphaproteobacteria            Acetobacteraceae                 <NA>
    #> 113959                <NA>                        <NA>                 <NA>
    #> 223948  Chloracidobacteria                        <NA>                 <NA>
    #> 160135      Actinobacteria           Streptomycetaceae         Streptomyces
    #> 244840        Planctomycea               Pirellulaceae            Pirellula
    #> 240591 Alphaproteobacteria          Phyllobacteriaceae                 <NA>
    #> 547752  Betaproteobacteria            Burkholderiaceae         Burkholderia
    #> 163176  Betaproteobacteria              Rhodocyclaceae    Methyloversatilis
    #> 321885        Planctomycea                 Gemmataceae                 <NA>
    #> 329327       Fibrobacteres            Fibrobacteraceae          Fibrobacter
    #> 262115                <NA>                        <NA>                 <NA>
    #> 512616 Gammaproteobacteria             Sinobacteraceae                 <NA>
    #> 261663             Bljii12                     Dolo_23                 <NA>
    #> 257151     Sphingobacteria                        <NA>                 <NA>
    #> 238929    Verrucomicrobiae                        <NA>                 <NA>
    #> 322045  Betaproteobacteria            Oxalobacteraceae             Massilia
    
    # Plot the results
    library(ggplot2)
    theme_set(theme_bw())
    sigtabgen = subset(sigtab, !is.na(Genus))
    # Phylum order
    x = tapply(sigtabgen$log2FoldChange, sigtabgen$Phylum, function(x) max(x))
    x = sort(x, TRUE)
    sigtabgen$Phylum = factor(as.character(sigtabgen$Phylum), levels=names(x))
    # Genus order
    x = tapply(sigtabgen$log2FoldChange, sigtabgen$Genus, function(x) max(x))
    x = sort(x, TRUE)
    sigtabgen$Genus = factor(as.character(sigtabgen$Genus), levels=names(x))
    ggplot(sigtabgen, aes(y=Genus, x=log2FoldChange, color=Phylum)) + 
      geom_vline(xintercept = 0.0, color = "gray", size = 0.5) +
      geom_point(size=6) + 
      theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust=0.5))
    

    Created on 2021-10-08 by the reprex package (v2.0.1)

    The tutorial was published in May 2021, so it's ~up-to-date, and they basically skip the 'intermediate step' your question is related to, but if you need a variance-stabilized matrix for some other reason a potential solution is to apply Anscombe's variance stabilizing transformation using Varistran (which also provides some nice plotting features), e.g.

    #devtools::install_github("MonashBioinformaticsPlatform/varistran")
    #BiocManager::install("edgeR")
    library(varistran)
    #> Loading required package: grid
    
    counts <- DESeq2_Object@assays@data$counts
    design <- model.matrix(~ qiimex_subset@sam_data$SampleType)
    
    y <- vst(counts, design=design)
    #> Dispersion estimated as 0.5672283
    
    # "y" contains the transformed counts
    y
    #>               CC1        CL3      Even1      Even2      Even3        SV1
    #> 338272 -0.3192927 -0.3192927 -0.3192927 -0.3192927 -0.3192927  0.7855066
    #> 289855 -0.3192927 -0.3192927 -0.3192927 -0.3192927 -0.3192927  0.7855066
    #> 287759  0.2967411 -0.3192927 -0.3192927 -0.3192927  1.5903105  4.0624033
    #> 334839 -0.3192927 -0.3192927 -0.3192927 -0.3192927 -0.3192927 -0.3192927
    #> 144065  3.2103181  6.8071310 -0.3192927 -0.3192927 -0.3192927  0.7855066
    #> 18643  -0.3192927 -0.3192927 -0.3192927 -0.3192927 -0.3192927 -0.3192927
    #> 218985  6.1359268  3.1727994 -0.3192927 -0.3192927 -0.3192927  8.2069561
    #> 238997 -0.3192927  0.5037921 -0.3192927 -0.3192927 -0.3192927 -0.3192927
    #> 245893 -0.3192927 -0.3192927 -0.3192927 -0.3192927 -0.3192927 -0.3192927
    #> 131765 -0.3192927 -0.3192927 -0.3192927 -0.3192927  1.5903105  0.7855066
    #> 536982 -0.3192927 -0.3192927 -0.3192927 -0.3192927 -0.3192927 -0.3192927
    #> 351629  5.0182785  4.5937241  5.4609218  4.0571176  2.8729479  5.3692361
    #> 321069 -0.3192927 -0.3192927 -0.3192927 -0.3192927 -0.3192927 -0.3192927
    #> 512157 -0.3192927 -0.3192927 -0.3192927 -0.3192927 -0.3192927 -0.3192927
    #> ...
    #> attr(,"lib.size")
    #>       CC1       CL3     Even1     Even2     Even3       SV1 
    #> 84395.682 57904.079  1560.595  4272.039 15659.007 38299.983 
    #> attr(,"true.lib.size")
    #>   CC1   CL3 Even1 Even2 Even3   SV1 
    #> 31125 37942 10184  7169  6597 34353 
    #> attr(,"lib.size.method")
    #> [1] "TMM normalization"
    #> attr(,"cpm")
    #> [1] FALSE
    #> attr(,"method")
    #> [1] "anscombe.nb"
    #> attr(,"dispersion")
    #> [1] 0.5672283
    plot_stability(y, counts, design=design)
    #> Warning: Transformation introduced infinite values in continuous x-axis
    #> Warning: Transformation introduced infinite values in continuous x-axis
    

    plot_biplot(y)
    

    plot_heatmap(y, n=50, cluster_samples = TRUE)
    #> Registered S3 method overwritten by 'seriation':
    #>   method         from 
    #>   reorder.hclust vegan
    

    Created on 2021-10-08 by the reprex package (v2.0.1)