I'm using MatchIt to achieve greater balance between study groups. Most methods appear to work well, reducing differences in covariates across groups (I'm including `method = "optimal"` here as an example of a method that works well). However, genetic matching dramatically increases imbalance. It's also worth noting that the results from genetic matching appear to be exactly the same as those from nearest neighbor matching, which suggests to me that something is going wrong. Why/how could this be happening?
library(MatchIt)
# Example data set for the matching comparison below: one logical treatment
# indicator plus three covariates (x1, x2, x3) used in the matching formula.
data <- data.frame(
# treatment_group is stored run-length style: rep() expands the alternating
# TRUE/FALSE pattern, repeating each value by the matching count in the
# integer vector (so the first 104 rows are TRUE, the next 3 FALSE, ...).
treatment_group = rep(
rep(c(TRUE, FALSE), 88),
c(104L, 3L, 6L, 2L, 1L, 2L, 1L, 1L, 1L, 6L, 2L, 1L, 1L, 1L, 1L,
1L, 2L, 1L, 8L, 1L, 1L, 1L, 2L, 6L, 2L, 3L, 1L, 1L, 1L, 1L, 1L,
3L, 1L, 1L, 1L, 5L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 5L, 3L, 1L,
1L, 2L, 2L, 1L, 1L, 2L, 1L, 5L, 1L, 2L, 2L, 23L, 1L, 1L, 9L,
1L, 2L, 2L, 2L, 2L, 6L, 1L, 1L, 1L, 5L, 1L, 5L, 1L, 2L, 2L, 4L,
1L, 2L, 1L, 6L, 5L, 1L, 1L, 1L, 2L, 3L, 3L, 2L, 2L, 3L, 2L, 2L,
6L, 2L, 6L, 3L, 1L, 2L, 1L, 1L, 1L, 1L, 4L, 1L, 4L, 1L, 2L, 1L,
1L, 3L, 5L, 2L, 1L, 1L, 1L, 1L, 9L, 2L, 1L, 2L, 1L, 4L, 4L, 2L,
1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 6L, 1L, 5L, 1L, 2L, 5L,
2L, 3L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 5L, 7L, 1L, 1L,
2L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 4L, 3L, 1L, 1L, 5L, 1L, 1L, 2L,
1L, 1L)
),
# x1: numeric (double) covariate.
x1 = c(
5.5, 4.5, 4, 5.5, 2, 3, 5, 5, 3, 3.5, 4.5, 3, 3.5, 4, 4, 4.5, 5, 4.5, 4.5,
2.5, 3, 5.5, 5.5, 5.5, 5.5, 4.5, 4.5, 5, 6, 3, 6, 2, 5, 4, 3.5, 3, 4, 4, 4,
4.5, 6, 5, 3, 4, 5, 2, 1.5, 3.5, 2, 2.5, 4.5, 3, 2.5, 4, 5.5, 3, 4.5, 4, 2,
3.5, 5.5, 5, 1, 5, 6, 3.5, 4, 4, 4.5, 2.5, 4.5, 3.5, 5, 3, 5.5, 4.5, 6, 4,
3.5, 4.5, 5, 5, 6, 2.5, 4.5, 6, 6, 3.5, 2, 5, 3.5, 4, 4, 4, 6, 6, 5, 5, 4, 6,
4, 5, 2.5, 3, 3, 4.5, 2.5, 3, 6, 3, 5, 5.5, 3.5, 5, 3.5, 4.5, 5, 5, 3.5, 5.5,
4.5, 5, 5, 3.5, 5, 5, 4.5, 3, 3, 6, 5, 6, 3.5, 4.5, 4, 2.5, 4.5, 1, 5.5, 6,
4.5, 5, 6, 4, 1.5, 4.5, 3.5, 2.5, 3, 4.5, 5, 5, 3, 4, 5, 4.5, 4.5, 4.5, 6, 5,
4.5, 5, 4.5, 5, 6, 4, 5, 6, 5, 5.5, 6, 4.5, 4, 3, 6, 2.5, 5, 2, 4.5, 5.5, 4.5,
3.5, 5, 4.5, 5.5, 4.5, 5.5, 4, 4, 5, 4.5, 2, 5, 5, 6, 2.5, 2.5, 3.5, 5, 2,
3.5, 5.5, 5, 2.5, 6, 6, 6, 1.5, 6, 5, 6, 4, 3, 4.5, 1.5, 2, 3, 4, 3, 3, 4, 5,
6, 6, 3.5, 3.5, 5, 5, 6, 4, 3, 6, 5, 6, 3, 4.5, 4, 4.5, 5, 5, 3, 1.5, 3.5, 6,
5, 4, 6, 6, 4.5, 4, 5, 5.5, 3, 4, 3, 6, 2.5, 4.5, 3, 3, 3.5, 4, 4, 4, 5.5,
4.5, 4.5, 4.5, 1.5, 6, 3.5, 3, 5, 3.5, 4.5, 3.5, 6, 3, 2, 5.5, 5, 5.5, 5, 5,
5, 5, 3.5, 3, 3, 4, 4, 5, 5, 6, 5, 5, 3.5, 5, 5, 3.5, 1.5, 2.5, 4.5, 5, 3, 2,
4, 4.5, 3.5, 4, 4.5, 3.5, 4, 5, 4, 6, 4, 5.5, 4.5, 5, 4.5, 4, 4.5, 4, 5, 5,
2.5, 5, 3, 2, 4.5, 3, 4.5, 5, 4.5, 5, 5, 5, 3, 5.5, 5, 6, 2.5, 4.5, 3.5, 4, 5,
1, 2.5, 2, 4, 1.5, 5, 5.5, 5, 5, 6, 5, 6, 2.5, 5.5, 2.5, 5, 5, 2.5, 6, 4, 5,
3, 4.5, 5, 2.5, 5, 2, 4, 2.5, 5, 4, 5, 3.5, 4, 4.5, 3, 5, 2, 5, 5, 4.5, 6, 5,
5, 4, 5, 3, 6, 6, 5.5, 5, 3, 5, 5.5, 4.5, 5, 3, 4, 3, 3.5, 3, 3.5, 4, 2.5, 4,
3.5, 6, 2, 5.5, 3, 4.5, 5, 4.5, 3, 2.5, 5, 2, 5.5, 5, 5, 2, 6, 6, 3, 4, 1, 3,
5, 3.5, 4.5, 4.5, 5.5, 3.5, 3, 3.5, 4.5, 3.5, 6, 4, 4, 4.5, 6, 4.5, 2, 2, 6,
4, 4.5, 4, 5, 5, 3, 4, 4.5, 3.5, 2, 3.5, 5, 2, 5, 5, 2.5, 2.5, 5, 5, 4, 4, 3,
5, 5, 5.5, 5.5, 4.5, 6, 5.5, 2.5, 2, 4.5, 3.5, 5, 6, 5, 5, 4.5, 4, 6, 3.5,
1.5, 2.5, 5, 4.5, 2, 5
),
# x2: numeric (double) covariate.
x2 = c(
3, 4, 2, 4, 2.5, 4, 4, 4, 2.5, 2.5, 2.5, 3, 3, 3, 2.5, 3.5, 4, 3.5, 2.5, 3, 2,
4, 3.5, 1.5, 3, 4, 3.5, 3, 3.5, 3.5, 3.5, 1.5, 3.5, 3.5, 1.5, 2.5, 3.5, 3.5,
3.5, 3.5, 4, 3.5, 3, 3, 2, 2.5, 4, 3.5, 3, 1.5, 2, 2, 2, 1.5, 4, 3, 4, 2, 3,
1.5, 3, 3, 1, 2.5, 2, 2.5, 3.5, 3, 3, 3, 3, 3, 2, 4, 3.5, 3, 3, 2, 2, 2.5,
2.5, 4, 2.5, 3, 3.5, 4, 3, 3.5, 2, 2.5, 3.5, 4, 2.5, 3, 4, 3, 3.5, 2, 1.5,
3.5, 3, 3.5, 3.5, 4, 1.5, 3, 2, 2, 4, 3, 3.5, 4, 1.5, 2.5, 3.5, 3, 4, 2, 3,
2.5, 3, 3, 4, 3, 4, 2, 3, 2.5, 2, 3.5, 2, 2.5, 3, 3, 3, 4, 3, 3.5, 2.5, 3.5,
3.5, 3.5, 3, 3.5, 3, 2.5, 3.5, 2, 3, 2, 3, 2.5, 3.5, 3, 3, 3, 3, 3, 2.5, 3, 4,
3, 3.5, 2, 3.5, 3, 3, 4, 2, 3.5, 3, 2, 3, 3, 2, 4, 3.5, 1.5, 3, 2, 3, 4, 3, 2,
3.5, 3, 1.5, 2.5, 3.5, 2, 3, 3, 3, 3, 3, 2, 2, 2, 2, 3, 4, 4, 3, 3.5, 2.5, 4,
4, 4, 3, 2, 4, 3.5, 4, 3, 2, 4, 2, 3, 2.5, 2.5, 3.5, 2, 3, 4, 3, 4, 4, 2.5, 2,
3, 3, 4, 3, 4, 2.5, 3.5, 3.5, 3.5, 2, 3.5, 2.5, 2.5, 3, 2, 3, 4, 3, 3, 3, 2,
4, 3.5, 3, 3, 2.5, 3, 3.5, 3, 3, 4, 2.5, 2, 4, 3, 3, 2, 2, 3, 3, 4, 1, 2.5, 4,
1, 3, 3, 2, 4, 2.5, 3.5, 3, 2, 3, 2.5, 3, 3, 2, 2, 4, 1.5, 2.5, 2.5, 2, 3, 3,
4, 3, 3, 3, 2.5, 3, 3, 3, 3.5, 2, 2.5, 3, 3.5, 3, 2, 3, 3.5, 3, 1.5, 3, 4, 3,
3.5, 3.5, 3, 2.5, 3, 3, 3, 3.5, 3.5, 1.5, 3, 2.5, 3.5, 2, 2, 3.5, 2, 3.5, 3,
3, 2, 3, 1.5, 3, 3, 4, 2, 2.5, 4, 2, 4, 3.5, 4, 2, 3, 3, 1.5, 3, 3, 3, 2, 3,
3.5, 1.5, 2.5, 3.5, 3, 3.5, 3.5, 3, 3, 2, 2.5, 2, 1.5, 3.5, 1.5, 3, 2.5, 2.5,
3, 2, 2, 2, 3.5, 4, 3.5, 2.5, 3.5, 3, 3, 4, 2.5, 2, 3, 3, 2, 4, 4, 2, 2.5, 1,
3.5, 4, 3, 3, 4, 2.5, 2, 3, 1.5, 2.5, 4, 2, 3.5, 3, 4, 3, 3, 3, 3, 3, 2, 2, 2,
3, 3, 4, 2, 3.5, 3, 3.5, 2, 2, 2, 3, 3, 3, 3, 3.5, 2, 3, 2, 4, 2, 3, 3, 2.5,
2, 2.5, 4, 3, 2.5, 2, 3, 2.5, 2, 4, 3.5, 3, 2.5, 3, 2.5, 2.5, 3.5, 2.5, 2.5,
2.5, 3.5, 4, 3, 2, 3, 3, 4, 2.5, 2.5, 4, 3, 1.5, 2, 2.5, 1.5, 3.5, 4, 3, 1.5,
1.5, 3, 4, 3, 2.5, 4, 2, 2.5, 3, 2.5, 3, 3.5, 2, 3, 3, 2.5
),
# x3: integer covariate (values written with the L suffix).
x3 = c(5L, 5L, 5L, 6L, 4L, 6L, 3L, 6L, 2L, 5L, 5L, 5L, 5L, 1L, 5L,
6L, 6L, 3L, 4L, 6L, 3L, 6L, 3L, 1L, 5L, 6L, 6L, 3L, 6L, 3L, 6L,
3L, 5L, 4L, 6L, 5L, 5L, 5L, 4L, 4L, 3L, 5L, 3L, 5L, 5L, 5L, 5L,
5L, 6L, 3L, 5L, 3L, 3L, 2L, 5L, 3L, 5L, 3L, 5L, 4L, 5L, 5L, 1L,
3L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 5L, 3L, 5L, 5L, 5L, 5L, 5L, 4L,
4L, 5L, 4L, 6L, 6L, 5L, 6L, 6L, 5L, 1L, 6L, 5L, 4L, 5L, 5L, 5L,
5L, 5L, 6L, 2L, 6L, 4L, 5L, 2L, 5L, 4L, 4L, 3L, 2L, 6L, 4L, 4L,
6L, 2L, 4L, 3L, 6L, 4L, 5L, 5L, 5L, 3L, 6L, 5L, 5L, 6L, 5L, 5L,
6L, 3L, 6L, 2L, 2L, 3L, 3L, 5L, 6L, 5L, 4L, 1L, 6L, 2L, 2L, 5L,
2L, 3L, 5L, 1L, 3L, 5L, 4L, 5L, 2L, 5L, 4L, 5L, 3L, 5L, 4L, 6L,
1L, 5L, 6L, 5L, 5L, 6L, 3L, 4L, 2L, 2L, 5L, 5L, 5L, 6L, 5L, 4L,
1L, 3L, 3L, 4L, 6L, 5L, 4L, 4L, 3L, 5L, 4L, 3L, 3L, 1L, 3L, 6L,
1L, 5L, 5L, 5L, 6L, 3L, 5L, 3L, 2L, 4L, 5L, 5L, 3L, 4L, 2L, 6L,
4L, 6L, 4L, 6L, 5L, 4L, 5L, 5L, 2L, 4L, 4L, 2L, 3L, 5L, 4L, 2L,
5L, 5L, 5L, 6L, 3L, 4L, 6L, 5L, 5L, 2L, 6L, 5L, 5L, 5L, 5L, 5L,
3L, 5L, 3L, 5L, 5L, 5L, 6L, 5L, 3L, 3L, 5L, 5L, 6L, 3L, 5L, 4L,
6L, 2L, 5L, 5L, 6L, 4L, 2L, 5L, 4L, 4L, 3L, 2L, 5L, 3L, 6L, 1L,
4L, 5L, 3L, 3L, 5L, 6L, 5L, 3L, 6L, 5L, 4L, 3L, 4L, 4L, 5L, 3L,
1L, 5L, 1L, 2L, 6L, 4L, 6L, 6L, 6L, 6L, 5L, 5L, 2L, 5L, 3L, 4L,
6L, 3L, 5L, 5L, 4L, 5L, 5L, 3L, 4L, 4L, 5L, 5L, 6L, 2L, 3L, 6L,
4L, 5L, 5L, 3L, 6L, 5L, 6L, 3L, 5L, 4L, 3L, 6L, 5L, 5L, 5L, 4L,
5L, 6L, 4L, 4L, 4L, 5L, 5L, 3L, 3L, 5L, 4L, 5L, 6L, 2L, 2L, 3L,
2L, 5L, 1L, 5L, 5L, 3L, 3L, 5L, 5L, 6L, 6L, 6L, 5L, 6L, 5L, 5L,
5L, 2L, 5L, 5L, 3L, 5L, 5L, 2L, 5L, 6L, 3L, 3L, 4L, 4L, 2L, 3L,
5L, 5L, 5L, 5L, 6L, 1L, 3L, 4L, 3L, 5L, 4L, 2L, 5L, 5L, 5L, 3L,
6L, 5L, 6L, 6L, 6L, 3L, 5L, 2L, 5L, 4L, 4L, 3L, 2L, 5L, 6L, 2L,
4L, 5L, 4L, 5L, 6L, 4L, 5L, 3L, 2L, 5L, 6L, 5L, 3L, 3L, 4L, 6L,
3L, 1L, 1L, 5L, 5L, 1L, 5L, 5L, 3L, 4L, 6L, 5L, 3L, 6L, 6L, 5L,
4L, 5L, 6L, 6L, 5L, 5L, 4L, 6L, 6L, 4L, 4L, 4L, 4L, 5L, 6L, 3L,
5L, 6L, 5L, 3L, 5L, 5L, 5L, 5L, 6L, 5L, 4L, 1L, 5L, 5L, 5L, 4L,
3L, 6L, 5L, 3L, 3L, 4L, 4L, 5L, 6L, 5L, 4L, 3L, 3L, 6L, 5L, 3L,
1L, 4L, 3L, 2L, 3L)
)
# Fit the same matching model three times; only `method` differs between
# the calls, so any difference in balance comes from the matching method.

# Optimal matching.
out_optimal <- matchit(
  formula = treatment_group ~ x1 + x2 + x3,
  data = data,
  method = "optimal"
)

# Nearest neighbor matching.
out_nearest <- matchit(
  formula = treatment_group ~ x1 + x2 + x3,
  data = data,
  method = "nearest"
)

# Genetic matching.
out_genetic <- matchit(
  formula = treatment_group ~ x1 + x2 + x3,
  data = data,
  method = "genetic"
)

# Balance summaries. Reported Std. Mean Diff. values, all data -> matched data:
#   optimal: -.05, -.05, +.01  ->  -.03, +.00, -.02
#   nearest: -.05, -.05, +.01  ->  -.51, -.50, +.05
#   genetic: -.05, -.05, +.01  ->  -.51, -.50, +.05 (same as nearest)
summary(out_optimal)
summary(out_nearest)
summary(out_genetic)
These matching methods don't work well when there are more treated than control units. If you change the estimand to the ATC (which effectively switches the treated and control groups), all methods do well. This is because nearest neighbor matching (and genetic matching, which is just nearest neighbor matching) by default matches from those with the largest propensity score to the smallest. That means the hardest treated units to match get matched first, and the easiest get matched last, and when there are more treated units than control units, the easiest treated units to match don't get matched at all.
Optimal matching and nearest neighbor matching with `m.order = "closest"` do not suffer from this problem because the units that are easiest to match are matched first. When you set `estimand = "ATC"`, all control units receive a match, including both the easiest and hardest to match.
Alternatively, when you have more treated than control units, you can do matching with replacement (either nearest neighbor or genetic matching). Setting `replace = TRUE` makes this happen and yields excellent balance.