I have a data-set comprising of an OTU table, where rows = samples and columns = OTUs, like this:
Otus <- data.frame(OTU_1 = c(0, 0, 1), OTU_2 = c(12, 0, 5), OTU_3 = c(0, 5, 3), row.names = c("S_1", "S_2", "S_3"))
I moved it from phyloseq
over to DESeq2
using the phyloseq_to_deseq2
command without issue. Now that it's a DESeq2
object, I want to normalize the OTU table using the variance stabilizing transformation with the following command:
vsd <- varianceStabilizingTransformation(DESeq2_Object, blind = TRUE)
However, I keep getting this error:
Error in estimateSizeFactorsForMatrix(counts(object), locfunc = locfunc, : every gene contains at least one zero, cannot compute log geometric means
After doing extensive research to understand this error, I found that what it means is that if each of my columns (OTU_1, OTU_2, OTU_3, etc.) contain a zero, DESeq2 cannot compute its geometric means. This is indeed my case, as all of my columns do contain at least 1 zero, for each column. However, I have found a work around to the exact same error with a different command used to compute differential expression. The work-around in this case is to apply the sfType = poscounts
within the DESeq
command, like this:
Diffs <- DESeq(DESeq2_Object, test = "Wald", fitType = "parametric", sfType = "poscounts")
However, this command does not populate the variance stabilization matrix that I am after, it only computes differential expression from the raw OTU table.
I have looked into the vignette and read up on the varianceStabilizingTransformation
command (using R
) but it does not have the sfType
flag to overcome this issue.
Is there a way around this error, so I can obtain a variance stabilized matrix?
What is your intended purpose for the variance-stabilized matrix? Are you then going to use the matrix to calculate differential abundance? In this tutorial, the authors (Susan Holmes and Joey McMurdie) refer to this tutorial for conducting differential abundance testing using the altered geometric mean formula, e.g.
#BiocManager::install("phyloseq")
#BiocManager::install("DESeq2")
library(phyloseq)
library(DESeq2)
#> Loading required package: S4Vectors
#> Loading required package: stats4
#> Loading required package: BiocGenerics
#> Loading required package: parallel
#> Loading required package: IRanges
#> Loading required package: GenomicRanges
#> Loading required package: GenomeInfoDb
#> Loading required package: SummarizedExperiment
#> Loading required package: MatrixGenerics
#> Loading required package: matrixStats
# Load example data
otufile = system.file("extdata", "GP_otu_table_rand_short.txt.gz", package = "phyloseq")
mapfile = system.file("extdata", "master_map.txt", package = "phyloseq")
trefile = system.file("extdata", "GP_tree_rand_short.newick.gz", package = "phyloseq")
qiimex = import_qiime(otufile, mapfile, trefile, showProgress = FALSE)
#> Processing map file...
#> Processing otu/tax file...
#> Reading file into memory prior to parsing...
#> Detecting first header line...
#> Header is on line 2
#> Converting input file to a table...
#> Defining OTU table...
#> Parsing taxonomy table...
#> Processing phylogenetic tree...
#> /Library/Frameworks/R.framework/Versions/4.1/Resources/library/phyloseq/extdata/GP_tree_rand_short.newick.gz ...
#> Warning in import_qiime(otufile, mapfile, trefile, showProgress = FALSE):
#> treefilename failed import. It will not be included.
qiimex
#> phyloseq-class experiment-level object
#> otu_table() OTU Table: [ 500 taxa and 26 samples ]
#> sample_data() Sample Data: [ 26 samples by 7 sample variables ]
#> tax_table() Taxonomy Table: [ 500 taxa by 7 taxonomic ranks ]
# Select samples
qiimex@sam_data$SampleType
#> [1] "Freshwater (creek)" "Freshwater (creek)" "Freshwater (creek)"
#> [4] "Soil" "Soil" "Mock"
#> [7] "Mock" "Mock" "Skin"
#> [10] "Freshwater" "Feces" "Skin"
#> [13] "Tongue" "Feces" "Skin"
#> [16] "Tongue" "Ocean" "Ocean"
#> [19] "Ocean" "Freshwater" "Soil"
#> [22] "Sediment (estuary)" "Sediment (estuary)" "Sediment (estuary)"
#> [25] "Feces" "Feces"
qiimex_subset <- subset_samples(qiimex, SampleType == "Soil" | SampleType == "Mock")
# Convert to DESeq2 object
DESeq2_Object <- phyloseq_to_deseq2(qiimex_subset, ~ SampleType)
#> converting counts to integer mode
#> Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
#> design formula are characters, converting to factors
# calculate geometric means prior to estimate size factors
gm_mean = function(x, na.rm=TRUE){
exp(sum(log(x[x > 0]), na.rm=na.rm) / length(x))
}
geoMeans = apply(counts(DESeq2_Object), 1, gm_mean)
DESeq2_dds = estimateSizeFactors(DESeq2_Object, geoMeans = geoMeans)
DESeq2_dds = DESeq(DESeq2_dds, fitType="local")
#> using pre-existing size factors
#> estimating dispersions
#> gene-wise dispersion estimates
#> mean-dispersion relationship
#> final dispersion estimates
#> fitting model and testing
# Explore results
res = results(DESeq2_dds)
res = res[order(res$padj, na.last=NA), ]
alpha = 0.01
sigtab = res[(res$padj < alpha), ]
sigtab = cbind(as(sigtab, "data.frame"), as(tax_table(qiimex_subset)[rownames(sigtab), ], "matrix"))
head(sigtab)
#> baseMean log2FoldChange lfcSE stat pvalue padj
#> 184403 3316.1856 -11.604876 1.446460 -8.022948 1.032372e-15 1.827298e-13
#> 193343 820.0846 -10.147175 1.317754 -7.700354 1.356898e-14 1.200855e-12
#> 10113 316.3938 -9.216756 1.313276 -7.018143 2.248365e-12 1.326535e-10
#> 234421 238.0599 -11.167192 1.657220 -6.738510 1.600195e-11 7.080861e-10
#> 526081 259.6722 -9.346145 1.471139 -6.352999 2.111570e-10 7.474959e-09
#> 313166 1322.7565 -9.911298 1.593579 -6.219519 4.986800e-10 1.471106e-08
#> Kingdom Phylum Class Order
#> 184403 Bacteria Firmicutes Clostridia Clostridiales
#> 193343 Bacteria Firmicutes Clostridia Clostridiales
#> 10113 Bacteria Proteobacteria Gammaproteobacteria Enterobacteriales
#> 234421 Bacteria Firmicutes Clostridia Clostridiales
#> 526081 Bacteria Firmicutes Clostridia Clostridiales
#> 313166 Bacteria Bacteroidetes Bacteroidia Bacteroidales
#> Family Genus Species
#> 184403 Lachnospiraceae <NA> <NA>
#> 193343 Lachnospiraceae Roseburia Eubacteriumrectale
#> 10113 Enterobacteriaceae <NA> <NA>
#> 234421 Lachnospiraceae Ruminococcus <NA>
#> 526081 Lachnospiraceae Oribacterium <NA>
#> 313166 Bacteroidaceae Bacteroides <NA>
posigtab = sigtab[sigtab[, "log2FoldChange"] > 0, ]
posigtab = posigtab[, c("baseMean", "log2FoldChange", "lfcSE", "padj", "Phylum", "Class", "Family", "Genus")]
posigtab
#> baseMean log2FoldChange lfcSE padj Phylum
#> 303295 201.335216 6.506674 1.204792 1.305775e-06 Proteobacteria
#> 5552 196.834531 6.309543 1.178371 1.518862e-06 Proteobacteria
#> 236550 69.552022 8.770931 1.815173 1.840288e-05 Proteobacteria
#> 113626 844.163976 7.207704 1.503844 2.078799e-05 Acidobacteria
#> 97627 171.891263 6.169004 1.307597 2.812963e-05 Proteobacteria
#> 89763 52.875334 8.375284 1.795633 3.426524e-05 Actinobacteria
#> 306684 149.608143 5.964643 1.346820 9.322811e-05 Bacteroidetes
#> 218985 50.742401 8.315832 1.882803 9.334941e-05 Chloroflexi
#> 102382 335.108981 6.747414 1.555112 1.267530e-04 Proteobacteria
#> 206632 332.777997 7.486585 1.747926 1.553274e-04 Verrucomicrobia
#> 573607 178.082806 6.729167 1.614098 2.360541e-04 Acidobacteria
#> 593006 146.954357 7.392336 1.781983 2.469451e-04 Proteobacteria
#> 279590 151.624630 6.576054 1.637287 3.873396e-04 Acidobacteria
#> 89337 56.919233 6.692225 1.663102 3.873396e-04 Actinobacteria
#> 218035 43.876643 7.133246 1.793490 4.373432e-04 Actinobacteria
#> 368027 18.296878 6.844054 1.791678 7.384186e-04 Actinobacteria
#> 572242 62.103316 6.140545 1.616489 7.802034e-04 Planctomycetes
#> 588604 211.108277 6.589139 1.738856 7.862979e-04 Proteobacteria
#> 212596 183.109699 6.931159 1.846832 8.501324e-04 Acidobacteria
#> 237963 113.390704 7.549096 2.013483 8.501324e-04 Proteobacteria
#> 548077 23.508540 7.205060 1.921985 8.501324e-04 Actinobacteria
#> 222914 348.580255 7.393221 2.040394 1.319435e-03 Verrucomicrobia
#> 265094 519.413992 5.885380 1.636179 1.389556e-03 Actinobacteria
#> 341901 26.562097 7.381503 2.068411 1.512002e-03 Actinobacteria
#> 104310 11.372452 6.157420 1.746257 1.715256e-03 Actinobacteria
#> 561842 51.066519 5.459834 1.549686 1.715256e-03 Proteobacteria
#> 248299 16.577945 6.700539 1.906851 1.736686e-03 Actinobacteria
#> 166835 868.093822 6.189342 1.766318 1.762950e-03 Proteobacteria
#> 369734 32.758200 6.711075 1.919352 1.775080e-03 Proteobacteria
#> 295422 15.101303 6.565460 1.913842 2.221501e-03 Proteobacteria
#> 137099 25.876135 6.367122 1.867180 2.254420e-03 Verrucomicrobia
#> 254706 13.039735 6.358471 1.862435 2.254420e-03 Acidobacteria
#> 204462 137.941401 6.613294 1.963268 2.572456e-03 Acidobacteria
#> 160337 18.708823 5.893400 1.783191 3.172144e-03 Actinobacteria
#> 144065 23.284353 7.191156 2.184408 3.260211e-03 Proteobacteria
#> 113959 894.081156 6.136411 1.870146 3.325954e-03 Acidobacteria
#> 223948 10.185674 6.003017 1.833057 3.341251e-03 Acidobacteria
#> 160135 8.669323 5.764444 1.764057 3.366660e-03 Actinobacteria
#> 244840 8.505259 5.736708 1.759948 3.404914e-03 Planctomycetes
#> 240591 179.115203 5.662464 1.749499 3.628632e-03 Proteobacteria
#> 547752 22.387044 6.156040 1.918045 3.921972e-03 Proteobacteria
#> 163176 17.574738 5.803532 1.823396 4.230077e-03 Proteobacteria
#> 321885 24.312098 6.270669 2.013227 5.172611e-03 Planctomycetes
#> 329327 16.600143 5.717837 1.839840 5.212980e-03 Fibrobacteres
#> 262115 25.392319 7.316622 2.362343 5.319945e-03 SC4
#> 512616 15.661404 5.632695 1.854032 6.385192e-03 Proteobacteria
#> 261663 25.664154 4.953787 1.643615 6.812189e-03 Chloroflexi
#> 257151 16.884984 4.909367 1.658737 8.015407e-03 Bacteroidetes
#> 238929 25.265076 4.509092 1.544313 8.856346e-03 Verrucomicrobia
#> 322045 6.504135 5.355636 1.834044 8.856346e-03 Proteobacteria
#> Class Family Genus
#> 303295 Deltaproteobacteria Haliangiaceae <NA>
#> 5552 Alphaproteobacteria Caulobacteraceae Phenylobacterium
#> 236550 Betaproteobacteria Oxalobacteraceae Massilia
#> 113626 Chloracidobacteria <NA> <NA>
#> 97627 Betaproteobacteria Comamonadaceae Hylemonella
#> 89763 Actinobacteria <NA> <NA>
#> 306684 Sphingobacteria <NA> <NA>
#> 218985 SOGA31 <NA> <NA>
#> 102382 Betaproteobacteria Oxalobacteraceae Massilia
#> 206632 Spartobacteria Spartobacteriaceae MC18
#> 573607 Chloracidobacteria <NA> <NA>
#> 593006 Gammaproteobacteria Sinobacteraceae <NA>
#> 279590 Acidobacteria <NA> <NA>
#> 89337 Actinobacteria Pseudonocardiaceae Actinoalloteichus
#> 218035 Actinobacteria Microbacteriaceae <NA>
#> 368027 Actinobacteria <NA> <NA>
#> 572242 Planctomycea Gemmataceae <NA>
#> 588604 Alphaproteobacteria Rhodospirillaceae <NA>
#> 212596 Solibacteres Solibacteraceae CandidatusSolibacter
#> 237963 Deltaproteobacteria <NA> <NA>
#> 548077 Actinobacteria <NA> <NA>
#> 222914 Verrucomicrobiae Verrucomicrobiasubdivision3 <NA>
#> 265094 Actinobacteria Micromonosporaceae Micromonospora
#> 341901 Actinobacteria Nocardioidaceae <NA>
#> 104310 Actinobacteria Frankiaceae <NA>
#> 561842 Alphaproteobacteria <NA> <NA>
#> 248299 Actinobacteria <NA> <NA>
#> 166835 Betaproteobacteria Burkholderiaceae Burkholderia
#> 369734 Alphaproteobacteria Rhodospirillaceae <NA>
#> 295422 Betaproteobacteria Alcaligenaceae Sutterella
#> 137099 Verrucomicrobiae <NA> <NA>
#> 254706 Chloracidobacteria <NA> <NA>
#> 204462 Solibacteres Solibacteraceae CandidatusSolibacter
#> 160337 Actinobacteria Frankiaceae <NA>
#> 144065 Alphaproteobacteria Acetobacteraceae <NA>
#> 113959 <NA> <NA> <NA>
#> 223948 Chloracidobacteria <NA> <NA>
#> 160135 Actinobacteria Streptomycetaceae Streptomyces
#> 244840 Planctomycea Pirellulaceae Pirellula
#> 240591 Alphaproteobacteria Phyllobacteriaceae <NA>
#> 547752 Betaproteobacteria Burkholderiaceae Burkholderia
#> 163176 Betaproteobacteria Rhodocyclaceae Methyloversatilis
#> 321885 Planctomycea Gemmataceae <NA>
#> 329327 Fibrobacteres Fibrobacteraceae Fibrobacter
#> 262115 <NA> <NA> <NA>
#> 512616 Gammaproteobacteria Sinobacteraceae <NA>
#> 261663 Bljii12 Dolo_23 <NA>
#> 257151 Sphingobacteria <NA> <NA>
#> 238929 Verrucomicrobiae <NA> <NA>
#> 322045 Betaproteobacteria Oxalobacteraceae Massilia
# Plot the results
library(ggplot2)
theme_set(theme_bw())
sigtabgen = subset(sigtab, !is.na(Genus))
# Phylum order
x = tapply(sigtabgen$log2FoldChange, sigtabgen$Phylum, function(x) max(x))
x = sort(x, TRUE)
sigtabgen$Phylum = factor(as.character(sigtabgen$Phylum), levels=names(x))
# Genus order
x = tapply(sigtabgen$log2FoldChange, sigtabgen$Genus, function(x) max(x))
x = sort(x, TRUE)
sigtabgen$Genus = factor(as.character(sigtabgen$Genus), levels=names(x))
ggplot(sigtabgen, aes(y=Genus, x=log2FoldChange, color=Phylum)) +
geom_vline(xintercept = 0.0, color = "gray", size = 0.5) +
geom_point(size=6) +
theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust=0.5))
Created on 2021-10-08 by the reprex package (v2.0.1)
The tutorial was published in May 2021, so it's ~up-to-date, and they basically skip the 'intermediate step' your question is related to, but if you need a variance-stabilized matrix for some other reason a potential solution is to apply Anscombe's variance stabilizing transformation using Varistran (which also provides some nice plotting features), e.g.
#devtools::install_github("MonashBioinformaticsPlatform/varistran")
#BiocManager::install("edgeR")
library(varistran)
#> Loading required package: grid
counts <- DESeq2_Object@assays@data$counts
design <- model.matrix(~ qiimex_subset@sam_data$SampleType)
y <- vst(counts, design=design)
#> Dispersion estimated as 0.5672283
# "y" contains the transformed counts
y
#> CC1 CL3 Even1 Even2 Even3 SV1
#> 338272 -0.3192927 -0.3192927 -0.3192927 -0.3192927 -0.3192927 0.7855066
#> 289855 -0.3192927 -0.3192927 -0.3192927 -0.3192927 -0.3192927 0.7855066
#> 287759 0.2967411 -0.3192927 -0.3192927 -0.3192927 1.5903105 4.0624033
#> 334839 -0.3192927 -0.3192927 -0.3192927 -0.3192927 -0.3192927 -0.3192927
#> 144065 3.2103181 6.8071310 -0.3192927 -0.3192927 -0.3192927 0.7855066
#> 18643 -0.3192927 -0.3192927 -0.3192927 -0.3192927 -0.3192927 -0.3192927
#> 218985 6.1359268 3.1727994 -0.3192927 -0.3192927 -0.3192927 8.2069561
#> 238997 -0.3192927 0.5037921 -0.3192927 -0.3192927 -0.3192927 -0.3192927
#> 245893 -0.3192927 -0.3192927 -0.3192927 -0.3192927 -0.3192927 -0.3192927
#> 131765 -0.3192927 -0.3192927 -0.3192927 -0.3192927 1.5903105 0.7855066
#> 536982 -0.3192927 -0.3192927 -0.3192927 -0.3192927 -0.3192927 -0.3192927
#> 351629 5.0182785 4.5937241 5.4609218 4.0571176 2.8729479 5.3692361
#> 321069 -0.3192927 -0.3192927 -0.3192927 -0.3192927 -0.3192927 -0.3192927
#> 512157 -0.3192927 -0.3192927 -0.3192927 -0.3192927 -0.3192927 -0.3192927
#> ...
#> attr(,"lib.size")
#> CC1 CL3 Even1 Even2 Even3 SV1
#> 84395.682 57904.079 1560.595 4272.039 15659.007 38299.983
#> attr(,"true.lib.size")
#> CC1 CL3 Even1 Even2 Even3 SV1
#> 31125 37942 10184 7169 6597 34353
#> attr(,"lib.size.method")
#> [1] "TMM normalization"
#> attr(,"cpm")
#> [1] FALSE
#> attr(,"method")
#> [1] "anscombe.nb"
#> attr(,"dispersion")
#> [1] 0.5672283
plot_stability(y, counts, design=design)
#> Warning: Transformation introduced infinite values in continuous x-axis
#> Warning: Transformation introduced infinite values in continuous x-axis
plot_biplot(y)
plot_heatmap(y, n=50, cluster_samples = TRUE)
#> Registered S3 method overwritten by 'seriation':
#> method from
#> reorder.hclust vegan
Created on 2021-10-08 by the reprex package (v2.0.1)