I would like to create a map showing the bi-variate spatial correlation between two variables. This could be done either by doing a LISA map of bivariate Moran's I spatial correlation or using the L index proposed by Lee (2001).
The bi-variate Moran's I is not implemented in the spdep
library, but the L index is, so here is what I've tried without success using the L index. A answer showing a solution based on Moran's I would also be very welcomed !
As you can see from the reproducible example below, I've manged so far to calculate the local L indexes. What I would like to do is to estimate the pseudo p-values and create a map of the results like those maps we use in LISA spatial clusters with high-high, high-low, ..., low-low.
In this example, the goal is to create a map with bi-variate Lisa association between black and white population. The map should be created in ggplot2
, showing the clusters:
library(UScensus2000tract)
library(ggplot2)
library(spdep)
library(sf)
# load data
data("oregon.tract")
# plot Census Tract map
plot(oregon.tract)
# Variables to use in the correlation: white and black population in each census track
x <- scale(oregon.tract$white)
y <- scale(oregon.tract$black)
# create Queen contiguity matrix and Spatial weights matrix
nb <- poly2nb(oregon.tract)
lw <- nb2listw(nb)
# Lee index
Lxy <-lee(x, y, lw, length(x), zero.policy=TRUE)
# Lee’s L statistic (Global)
Lxy[1]
#> -0.1865688811
# 10k permutations to estimate pseudo p-values
LMCxy <- lee.mc(x, y, nsim=10000, lw, zero.policy=TRUE, alternative="less")
# quik plot of local L
Lxy[[2]] %>% density() %>% plot() # Lee’s local L statistic (Local)
LMCxy[[7]] %>% density() %>% lines(col="red") # plot values simulated 10k times
# get confidence interval of 95% ( mean +- 2 standard deviations)
two_sd_above <- mean(LMCxy[[7]]) + 2 * sd(LMCxy[[7]])
two_sd_below <- mean(LMCxy[[7]]) - 2 * sd(LMCxy[[7]])
# convert spatial object to sf class for easier/faster use
oregon_sf <- st_as_sf(oregon.tract)
# add L index values to map object
oregon_sf$Lindex <- Lxy[[2]]
# identify significant local results
oregon_sf$sig <- if_else( oregon_sf$Lindex < 2*two_sd_below, 1, if_else( oregon_sf$Lindex > 2*two_sd_above, 1, 0))
# Map of Local L index but only the significant results
ggplot() + geom_sf(data=oregon_sf, aes(fill=ifelse( sig==T, Lindex, NA)), color=NA)
What about this?
I'm using the regular Moran's I instead of that Lee Index you suggest. But I think the underlying reasoning is pretty much the same.
As you may see bellow -- the results produced this way look very much alike those comming from GeoDA
library(dplyr)
library(ggplot2)
library(sf)
library(spdep)
library(rgdal)
library(stringr)
library(UScensus2000tract)
#======================================================
# load data
data("oregon.tract")
# Variables to use in the correlation: white and black population in each census track
x <- oregon.tract$white
y <- oregon.tract$black
#======================================================
# Programming some functions
# Bivariate Moran's I
moran_I <- function(x, y = NULL, W){
if(is.null(y)) y = x
xp <- (x - mean(x, na.rm=T))/sd(x, na.rm=T)
yp <- (y - mean(y, na.rm=T))/sd(y, na.rm=T)
W[which(is.na(W))] <- 0
n <- nrow(W)
global <- (xp%*%W%*%yp)/(n - 1)
local <- (xp*W%*%yp)
list(global = global, local = as.numeric(local))
}
# Permutations for the Bivariate Moran's I
simula_moran <- function(x, y = NULL, W, nsims = 1000){
if(is.null(y)) y = x
n = nrow(W)
IDs = 1:n
xp <- (x - mean(x, na.rm=T))/sd(x, na.rm=T)
W[which(is.na(W))] <- 0
global_sims = NULL
local_sims = matrix(NA, nrow = n, ncol=nsims)
ID_sample = sample(IDs, size = n*nsims, replace = T)
y_s = y[ID_sample]
y_s = matrix(y_s, nrow = n, ncol = nsims)
y_s <- (y_s - apply(y_s, 1, mean))/apply(y_s, 1, sd)
global_sims <- as.numeric( (xp%*%W%*%y_s)/(n - 1) )
local_sims <- (xp*W%*%y_s)
list(global_sims = global_sims,
local_sims = local_sims)
}
#======================================================
# Adjacency Matrix (Queen)
nb <- poly2nb(oregon.tract)
lw <- nb2listw(nb, style = "B", zero.policy = T)
W <- as(lw, "symmetricMatrix")
W <- as.matrix(W/rowSums(W))
W[which(is.na(W))] <- 0
#======================================================
# Calculating the index and its simulated distribution
# for global and local values
m <- moran_I(x, y, W)
m[[1]] # global value
m_i <- m[[2]] # local values
local_sims <- simula_moran(x, y, W)$local_sims
# Identifying the significant values
alpha <- .05 # for a 95% confidence interval
probs <- c(alpha/2, 1-alpha/2)
intervals <- t( apply(local_sims, 1, function(x) quantile(x, probs=probs)))
sig <- ( m_i < intervals[,1] ) | ( m_i > intervals[,2] )
#======================================================
# Preparing for plotting
oregon.tract <- st_as_sf(oregon.tract)
oregon.tract$sig <- sig
# Identifying the LISA patterns
xp <- (x-mean(x))/sd(x)
yp <- (y-mean(y))/sd(y)
patterns <- as.character( interaction(xp > 0, W%*%yp > 0) )
patterns <- patterns %>%
str_replace_all("TRUE","High") %>%
str_replace_all("FALSE","Low")
patterns[oregon.tract$sig==0] <- "Not significant"
oregon.tract$patterns <- patterns
# Plotting
ggplot() + geom_sf(data=oregon.tract, aes(fill=patterns), color="NA") +
scale_fill_manual(values = c("red", "pink", "light blue", "dark blue", "grey95")) +
theme_minimal()
You can get results closer (but not identical) to that of GeoDa by making changes in the confidence interval (e.g. using 90% instead of 95%).
I suppose the remaining discrepancies come from a slightly different method of calculating the Moran's I. My version gives the same values of that function moran
available in the package spdep
. But GeoDa probably uses another approach.