I have a dataset df that I want to analyze and visualize with a scatterplot (to show the correlation) together with histograms of each variable.
Moreover, I want to color the data points blue for one variable and yellow for the other, but pairs where both values are non-zero should be green. For those green points, I want to compute a correlation. The zeros are actually NAs,
but I would like to keep them in the plot. Below is a script that generates an example:
# Simulate two correlated variables in which a share of the observations is
# replaced by zeros, then draw a base-graphics scatterplot and histograms.
n <- 300
percent_zeros <- 0.2
# Round so that `sample()` always receives a whole number of indices.
n_zeros <- round(percent_zeros * n)

set.seed(123)
x <- runif(n, min = 0, max = 0.6)
noise <- runif(n, min = -0.05, max = 0.05)
y <- 0.8 * x + noise
y <- pmax(0, pmin(0.6, y)) # clamp y to [0, 0.6]

# Zero out n_zeros positions in x, then n_zeros *other* positions in y.
zero_indices_x <- sample(seq_len(n), n_zeros)
zero_indices_y <- sample(setdiff(seq_len(n), zero_indices_x), n_zeros)
x[zero_indices_x] <- 0
y[zero_indices_y] <- 0

# Rows where both values are non-zero; reused for colouring and the green fit.
non_zero_indices <- which(x != 0 & y != 0)

# Later assignments overwrite earlier ones: yellow = x non-zero,
# blue = y non-zero, green = both non-zero. Rows with x == 0 and y == 0
# keep NA as their colour.
df <- data.frame(x, y, color = NA_character_)
df$color[x != 0] <- "yellow"
df$color[y != 0] <- "blue"
df$color[non_zero_indices] <- "green"

plot(
  df$x, df$y,
  main = NULL, xlab = NULL, ylab = NULL, pch = 19, col = df$color
)
# Red: linear fit through all rows, zeros included.
abline(lm(y ~ x, data = df), col = "red", lwd = 2)
# Green: linear fit through the non-zero pairs only.
abline(
  lm(y ~ x, data = df, subset = non_zero_indices),
  col = "green", lwd = 2
)

hist(x, breaks = 50, col = "yellow", border = "white", xlab = NULL, main = NULL)
hist(y, breaks = 50, col = "blue", border = "white", xlab = NULL, main = NULL)
However, I would like it to look something like that, and the green line should be the correlation (linear fit) for the non-zero pairs only; it would be perfect if the plot could also show the correlation coefficient and p-value. Can anyone help me with the code for a nice ggplot for that?
Here is one option to achieve your desired result, which creates the three plots using ggplot2 and combines them using patchwork. Additionally, I added the correlation coefficient and p-value using ggpubr::stat_cor:
library(ggplot2)
library(patchwork)
library(ggpubr)
# Scatterplot of all observations, coloured by the precomputed `color` column
# (`I()` makes ggplot2 use the colour names as given instead of mapping them).
p1 <- ggplot(df, aes(x, y, color = I(color))) +
  geom_point() +
  # Red line: linear fit through all rows, zeros included.
  geom_smooth(se = FALSE, method = "lm", color = "red") +
  # Green line: linear fit restricted to pairs where both values are non-zero.
  geom_smooth(
    data = ~ subset(., x > 0 & y > 0), se = FALSE,
    method = "lm", color = "green"
  ) +
  # Correlation coefficient and p-value for the same non-zero pairs.
  # `stat_cor()` has no `se` parameter, so none is passed here.
  ggpubr::stat_cor(
    data = ~ subset(., x > 0 & y > 0),
    color = "green",
    label.x.npc = .25,
    geom = "label"
  )
# Histogram of y, drawn sideways (y mapped to the vertical axis). The count
# axis is reversed so the bars grow away from the plot's right edge.
# `bins = 50` mirrors the `breaks = 50` used in the base-graphics version
# instead of relying on the default of 30 bins.
p2 <- ggplot(df, aes(y = y)) +
  geom_histogram(fill = "yellow", bins = 50) +
  scale_x_reverse()
# Histogram of x with the count axis reversed, so the bars hang downwards.
p3 <- ggplot(df, aes(x)) +
  geom_histogram(fill = "blue", bins = 50) +
  scale_y_reverse()
# Layout: the letters refer to the plots in list order (A = p1, B = p2,
# C = p3) and `#` leaves a cell empty, so p2 sits left of the scatterplot
# and p3 below it.
design <-
"
BA
#C
"
list(p1, p2, p3) |>
  wrap_plots(design = design)
#> `geom_smooth()` using formula = 'y ~ x'
#> `geom_smooth()` using formula = 'y ~ x'