I have the following R code, borrowed here, that generates a reproducible tibble
table:
# Install/load packages only if needed
# ************************************
if (!require("pacman")) install.packages("pacman")
pacman::p_load(dplyr, expss, ggplot2, grid, purrr, rlang, tibble)
# Data Generation
# ***************
# Set the seed for reproducibility
set.seed(123)
# Generate random data
n <- 490
PTSD <- sample(c(1, 2, NA), n, replace = TRUE) #class(PTSD) = "numeric"
ANX <- sample(c(1, 2, NA), n, replace = TRUE) #class(ANX) = "numeric"
DEP <- sample(c(1, 2, NA), n, replace = TRUE) #class(DEP) = "numeric"
# Create the data frame
df <- data.frame(PTSD, ANX, DEP) #class(df) = "data.frame"
# Label the values: 1 = Low, 2 = High
expss::val_lab(df$PTSD) = expss::num_lab("1 Low\n2 High")
expss::val_lab(df$ANX) = expss::num_lab("1 Low\n2 High")
expss::val_lab(df$DEP) = expss::num_lab("1 Low\n2 High")
# Create a list of tables for each variable to count 1s, 2s, and NAs
count_results <- list(
PTSD = table(df$PTSD, useNA = "ifany"),
ANX = table(df$ANX, useNA = "ifany"),
DEP = table(df$DEP, useNA = "ifany")
)
# Frequency count and data summary
# ********************************
# Combine the count tables into a single table
count_table <- do.call(rbind, count_results)
# Initialize empty vectors to store results
variable_names <- character()
sample_sizes <- numeric()
# Loop through the test results and extract relevant information
for (variable_name in names(count_results)) {
sample_sizes <- c(sample_sizes, sum(count_results[[variable_name]]))
variable_names <- c(variable_names, variable_name)
}
# Create summary data frame
summary_df <- data.frame(
Variable = variable_names,
N = sample_sizes
)
# Combine the count table and chi-squared summary table by columns
final_result <- cbind(count_table, summary_df)
# Remove Variable column in the middle of the table
final_result <- subset(final_result, select = -c(Variable))
# Combination of CMDs (CMD ≥ 1)
# *****************************
cmd <- c("PTSD","ANX","DEP")
combs <- map(seq_along(cmd),\(n)combn(cmd,n,simplify = FALSE)) |> purrr::flatten()
filts <- rlang::parse_exprs(map_chr(combs,\(x)paste0(x ,'== 2',collapse=' & ')))
filtsnames <- rlang::parse_exprs(map_chr(combs,\(x)paste0(x ,collapse=' + ')))
names(filts) <- filtsnames
output <- purrr::map_int(filts,\(x){
df %>%
mutate(id = row_number())%>%
filter(!!(x))%>%
summarise(
n = n())
} |> pull(n)
)
tibble::enframe(output)
The output of the tibble
table is supposed to show how many out of N = 490
have the following common mental disorders (CMDs), that is PTSD only, ANX only, DEP only, both PTSD and ANX, both PTSD and DEP, both ANX and DEP, and all 3 CMDs:
# A tibble: 7 × 2
name value
<chr> <int>
1 PTSD 167
2 ANX 156
3 DEP 156
4 PTSD + ANX 56
5 PTSD + DEP 52
6 ANX + DEP 51
7 PTSD + ANX + DEP 23
I wanted to visualise the table graphically so I thought about generating a Venn diagram. What I expected to see in the diagram is the following.
Expectation list:
However, whilst none of the codes (examples below) generated any technical errors (i.e., R code errors) none of the packages I tried (VennDiagram
and ggVennDiagram
) showed the expected results (see Expectation list
).
Here below are the 4 codes used to generate 4 different Venn diagrams, none of which gave the results outlined in Expectation list
:
Using package VennDiagram
Version 1
pacman::p_load(VennDiagram)
# Move to new plotting page
grid::grid.newpage()
# Calculate percentages
total_samples <- nrow(df)
percentages <- output / total_samples * 100
venn.plot <- VennDiagram::draw.triple.venn(
area1 = output["PTSD"],
area2 = output["ANX"],
area3 = output["DEP"],
n12 = output["PTSD + ANX"],
n23 = output["ANX + DEP"],
n13 = output["PTSD + DEP"],
n123 = output["PTSD + ANX + DEP"],
category = c("PTSD", "ANX", "DEP"),
fill = c("red", "green", "blue"),
lty = "blank",
cex = rep(1.5,7),
cat.cex = rep(1.5,3),
cat.pos = c(-20,-40,-60),
cat.dist = c(0.05,0.05,0.05),
ind = TRUE,
euler.d =TRUE,
)
grid.draw(venn.plot)
Using package VennDiagram
2
pacman::p_load(VennDiagram)
# Move to new plotting page
grid::grid.newpage()
# Use pre-calculated values from 'output'
VennDiagram::draw.triple.venn(
area1 = output["PTSD"],
area2 = output["ANX"],
area3 = output["DEP"],
n12 = output["PTSD + ANX"] + output["PTSD + ANX + DEP"], # Adjust for overlaps
n23 = output["ANX + DEP"] + output["PTSD + ANX + DEP"], # Adjust for overlaps
n13 = output["PTSD + DEP"] + output["PTSD + ANX + DEP"], # Adjust for overlaps
n123 = output["PTSD + ANX + DEP"],
category = c("PTSD", "ANX", "DEP"),
col = "Red", fill = c("Green", "Yellow", "Blue"),
cex = 1.5, cat.cex = 1.5, cat.pos = c(-20, 20, 180)
)
Using package VennDiagram
3
pacman::p_load(VennDiagram)
# Calculate exclusive counts for Venn diagram
ptsd_only <- output["PTSD"] - output["PTSD + ANX"] - output["PTSD + DEP"] + output["PTSD + ANX + DEP"]
anx_only <- output["ANX"] - output["PTSD + ANX"] - output["ANX + DEP"] + output["PTSD + ANX + DEP"]
dep_only <- output["DEP"] - output["PTSD + DEP"] - output["ANX + DEP"] + output["PTSD + ANX + DEP"]
ptsd_anx <- output["PTSD + ANX"] - output["PTSD + ANX + DEP"]
ptsd_dep <- output["PTSD + DEP"] - output["PTSD + ANX + DEP"]
anx_dep <- output["ANX + DEP"] - output["PTSD + ANX + DEP"]
ptsd_anx_dep <- output["PTSD + ANX + DEP"]
# Move to new plotting page
grid::grid.newpage()
# Create Venn diagram with 3 sets using adjusted values
VennDiagram::draw.triple.venn(
area1 = ptsd_only,
area2 = anx_only,
area3 = dep_only,
n12 = ptsd_anx,
n23 = anx_dep,
n13 = ptsd_dep,
n123 = ptsd_anx_dep,
category = c("PTSD", "ANX", "DEP"),
col = "Red", fill = c("Green", "Yellow", "Blue"),
cex = 1.5, cat.cex = 1.5, cat.pos = c(-20, 20, 180)
)
Using package ggVennDiagram
pacman::p_load(ggVennDiagram)
# Prepare data for Venn diagram
venn_data <- list(
PTSD = which(df$PTSD == 2),
ANX = which(df$ANX == 2),
DEP = which(df$DEP == 2)
)
# Create Venn diagram with ggVennDiagram
ggVennDiagram(venn_data) +
ggplot2::scale_fill_gradient(low = "white", high = "darkgrey") +
theme_void()
My question: Short of drawing by hand the diagram, is there a way to generate a Venn diagram with R that reflects the same results as those found in the tibble
table (see figure below)?
Conditions: The code that generates the tibble
table (tibble::enframe(output)
) should remain the same. The Venn diagram should reflect the results of the tibble
table.
Caveat: Perhaps I am missing the point about Venn diagram and what they represent...
For the draw.triple.venn
function, you need to add the individual totals to get the correct sizes of the sets. Try the following:
venn.plot <- VennDiagram::draw.triple.venn(
area1 = output["PTSD"]+output["PTSD + ANX"]+output["PTSD + DEP"]+output["PTSD + ANX + DEP"],
area2 = output["ANX"]+output["PTSD + ANX"]+output["ANX + DEP"]+output["PTSD + ANX + DEP"],
area3 = output["DEP"]+output["ANX + DEP"]+output["PTSD + DEP"]+output["PTSD + ANX + DEP"],
n12 = output["PTSD + ANX"]+output["PTSD + ANX + DEP"],
n23 = output["ANX + DEP"]+output["PTSD + ANX + DEP"],
n13 = output["PTSD + DEP"]+output["PTSD + ANX + DEP"],
n123 = output["PTSD + ANX + DEP"],
category = c("PTSD", "ANX", "DEP"),
fill = c("red", "green", "blue"),
lty = "blank",
#cex = rep(1.5,7),
cat.cex = rep(1.5,3),
#cat.pos = c(-20,-40,-60),
#cat.dist = c(0.05,0.05,0.05),
ind = TRUE,
euler.d =TRUE
)
If you want to start with the raw data, then you can skip the calculations.
# Create a list for each column, and determine `which` rows have values>=1.
df.lst <- lapply(as.list(df), \(x) which(x>=1))
venn.diagram(df.lst, filename = 'Venn_3set_simple.tiff')
This differs from the expected, probably due to the way you calculated the totals and intersections.