rt-testcombn

combn to run a t.test on only one row in a data.table, based on the grouping of another row


For this question, I'm using example data from another question: (Using combn() in R to find all possible t-test relationships, how to access the variables compared?)

I am trying to run t.tests on the data in one column (vdem_media_bias) based on the grouping and averaging according to the like rows in column group. To do this, I made a matrix (col_vec) where all the possible combinations of rows in column group are given in columns. Then I looped through the combination columns and tried to use the values in col_vec to index trust_news and run t.tests on the values in vdem_media_bias.

library(tidyr)
library(dplyr)
library(purrr)
library(broom)
library(data.table)

trust_news <- data.table(group  =   c(  1,  2,  3,  5,  1,  2,  3,  5   ),
                         mean   =   c(  2.68,   2.8,    3.22,   2.96,   2.16739457271907,   2.52420459002153,   2.41568263469884,   3.203042892 ),
                         polity2    =   c(  11.0065624592239,   11.5173963819866,   10.9690935682988,   7.4505271137667,    12.5039982916238,   10.2183026540883,   11.5528854439886,   8.951439457 ),
                         web    =   c(  87.2661,    94.8967,    89.7391,    74.3872,    99.429653354408,    86.651554228033,    88.7724076776387,   82.71520214 ),
                         rsf    =   c(  16.4114055821914,   16.7094853023036,   29.5703741065912,   23.8887382488302,   15.6296493773739,   17.6154322886768,   28.7548757022008,   19.67113906 ),
                         civil_liberties    =   c(  0.0825935016192092, 0.775101009726579,  0.0505530810447113, 0.302537382248755,  0.4563913292031,    0.759215853597412,  0.172427084833363,  0.772689046 ),
                         freedom_of_expression  =   c(  0.828613177733605,  0.312013322438901,  0.788670591087342,  0.192519523553583,  0.599717513191781,  0.682591150447179,  0.0926750357710926, 0.777797913 ),
                         vdem_gov_censorship_effort =   c(  0.138795469801477,  0.609063884646593,  0.726359178306941,  0.889397969637812,  0.935589374225135,  0.5665276966565,    0.939565934607974,  0.383976181 ),
                         vdem_self_censorship_effort    =   c(  0.396389956859171,  0.368909768892093,  0.795891881421097,  0.635097442795347,  0.818482469632694,  0.236225542529557,  0.615626792133531,  0.035815441 ),
                         vdem_freedom_of_expression =   c(  0.270293421459824,  0.0390741902396886, 0.0430150919275922, 0.451150387704898,  0.477508000166212,  0.841653725881213,  0.885557800737999,  0.205165414 ),
                         ciri_freedom_of_speech_and_press   =   c(  0.316629196080121,  0.473252307510063,  0.782615827564335,  0.600531400120434,  0.710143072640082,  0.0105031020419695, 0.348075169050439,  0.282137779 ),
                         media_integrity    =   c(  0.763527559879246,  0.32401086386208,   0.72357993656577,   0.00400993472498878,    0.804790656378296,  0.752078255508056,  0.12450323855886,   0.784842369 ),
                         vdem_critical_press    =   c(  0.381934476222942,  0.34180259848916,   0.446445782446012,  0.101622162644166,  0.69273703605482,   0.674447773690032,  0.593981628888233,  0.942639773 ),
                         vdem_media_perspective =   c(  0.0121195504624632, 0.55722585253704,   0.759939717080726,  0.422261093911245,  0.425138202329788,  0.0440513028866172, 0.128858393736017,  0.242947985 ),
                         vdem_media_bias    =   c(  0.0386091435387627, 0.149227444223962,  0.109819706451143,  0.79759885890908,   0.421084356961021,  0.269048253813396,  0.661346417250035,  0.051339441 ),
                         vdem_media_corruption  =   c(  0.188072265767419,  0.70446056263869,   0.988731124980027,  0.659578974943737,  0.924096790161402,  0.465803086735264,  0.0389306657957701, 0.865528541 ),
                         vdem_media_freedom =   c(  0.710150348587615,  0.253187878925348,  0.436362652570953,  0.471900738683268,  0.302672853555399,  0.88559075326591,   0.786474651280836,  0.234080239 )
                         )
              
#Get the unique names of the rows in a column
condition_vec = unique(trust_news$group)

#get combinations of rows in the 'group' column
col_vec <- names(combn(condition_vec, 2, FUN = paste))

tab_ttest_list = list()

for(c in col_vec) {
tmp_ttest <- t.test(trust_news[group == combn(condition_vec, 2)[1,c], vdem_media_bias],
                      trust_news[group == combn(condition_vec, 2)[2,c], vdem_media_bias])
res_tab_1 = data.table(condition1=combn(condition_vec, 2)[1,c],
                       condition2=combn(condition_vec, 2)[2,c],
                       t_statistic=tmp_ttest$statistic,
                       df=tmp_ttest$parameter,
                       p_value=tmp_ttest$p.value,
                       mean_cond1=tmp_ttest$estimate[1],
                       mean_cond2=tmp_ttest$estimate[2],
                       conf_low=tmp_ttest$conf.low,
                       conf_high=tmp_ttest$conf.high,
                       method=tmp_ttest$method)
res_tab = rbind(res_tab_1)
tab_ttest_list[[c]] = res_tab
print(c)
}

ttest_compList = rbindlist(tab_ttest_list)

Here's the thing: When I run this and let it sit and think for a while, I get no error. It seems like it has a problem processing it. In R-studio, if I run col_vec or ttest_compList, it comes up completely empty.

Which programming god did I upset to make this happen and how do I appease it? Thank you kindly.

**UPDATE: ** When I run one t.test using the code below:

#Get the unique names of the rows in a column
condition_vec = unique(trust_news$group)

#get list of combinations of rows in the 'rows' column
col_vec <- combn(condition_vec, 2, FUN = paste)

  tmp_ttest <- t.test(trust_news[group == combn(condition_vec, 2)[1,1], vdem_media_bias],
                      trust_news[group == combn(condition_vec, 2)[2,1], vdem_media_bias])
  res_tab_1 = data.table(condition1=combn(condition_vec, 2)[1,1],
                         condition2=combn(condition_vec, 2)[2,1],
                         t_statistic=tmp_ttest$statistic,
                         df=tmp_ttest$parameter,
                         p_value=tmp_ttest$p.value,
                         mean_cond1=tmp_ttest$estimate[1],
                         mean_cond2=tmp_ttest$estimate[2],
                         conf_low=tmp_ttest$conf.low,
                         conf_high=tmp_ttest$conf.high,
                         method=tmp_ttest$method)

I get results that look like this when I run res_tab_1:

enter image description here

UPDATE Adding as.numeric() to col_vec seems to fix the no 'dimnames' attribute for array issue, so it is no longer character, but there is another issue. I only get t.tests for some of my comparisons, as shown below.

col_vec <- as.numeric(combn(condition_vec, 2, FUN = paste))
   condition1 condition2 t_statistic       df   p_value mean_cond1 mean_cond2
1:          1          2   0.1033366 1.194413 0.9322750  0.2298468  0.2091378
2:          1          3  -0.4640742 1.781169 0.6930649  0.2298468  0.3855831
3:          1          5  -0.4641799 1.491450 0.7012347  0.2298468  0.4244691
4:          2          5  -0.5697971 1.051526 0.6664883  0.2091378  0.4244691
                    method
1: Welch Two Sample t-test
2: Welch Two Sample t-test
3: Welch Two Sample t-test
4: Welch Two Sample t-test

col_vec looks like this, so there should be 6 comparisons:

> col_vec
     [,1] [,2] [,3] [,4] [,5] [,6]
[1,] "1"  "1"  "1"  "2"  "2"  "3" 
[2,] "2"  "3"  "5"  "3"  "5"  "5"

UPDATE I think I found something that works! But please let me know if there is something better.

#Get the unique names of the rows in a column
condition_vec = as.numeric(unique(trust_news$group))

#get list of combinations of rows in the 'rows' column
col_vec <- combn(condition_vec, 2, FUN = paste)

tab_ttest_list = list()

for(c in 1:ncol(col_vec)) {
tmp_ttest <- t.test(trust_news[group == combn(condition_vec, 2)[1,c], vdem_media_bias],
                      trust_news[group == combn(condition_vec, 2)[2,c], vdem_media_bias])
res_tab_1 = data.table(condition1=combn(condition_vec, 2)[1,c],
                       condition2=combn(condition_vec, 2)[2,c],
                       t_statistic=tmp_ttest$statistic,
                       df=tmp_ttest$parameter,
                       p_value=tmp_ttest$p.value,
                       mean_cond1=tmp_ttest$estimate[1],
                       mean_cond2=tmp_ttest$estimate[2],
                       conf_low=tmp_ttest$conf.low,
                       conf_high=tmp_ttest$conf.high,
                       method=tmp_ttest$method)
res_tab = rbind(res_tab_1)
tab_ttest_list[[c]] = res_tab
print(c)
}

ttest_compList = rbindlist(tab_ttest_list)
print(ttest_compList)

This gives me:

   condition1 condition2 t_statistic       df   p_value mean_cond1 mean_cond2
1:          1          2  0.10333665 1.194413 0.9322750  0.2298468  0.2091378
2:          1          3 -0.46407420 1.781169 0.6930649  0.2298468  0.3855831
3:          1          5 -0.46417992 1.491450 0.7012347  0.2298468  0.4244691
4:          2          3 -0.62525727 1.094188 0.6368321  0.2091378  0.3855831
5:          2          5 -0.56979705 1.051526 0.6664883  0.2091378  0.4244691
6:          3          5 -0.08381105 1.841388 0.9414162  0.3855831  0.4244691
                    method
1: Welch Two Sample t-test
2: Welch Two Sample t-test
3: Welch Two Sample t-test
4: Welch Two Sample t-test
5: Welch Two Sample t-test
6: Welch Two Sample t-test

Solution

  • Here is a base R that you can generalize and play with:

    multiple_t_test <- function(form, data){
      fn <- function(x, g){
        f <- function(cmbs){
          t_object <- do.call(t.test, unname(split(x, g %in% cmbs)))
          nms <- paste0(cmbs, collapse = ":")
          cbind(grp = nms, broom::tidy(t_object))}
        levs <- levels(as.factor(g))
        do.call(rbind, combn(levs, 2, f, simplify = FALSE))
      }
      
      model <- model.frame(form[c(1,3, 2)], data)
      grp_fct <- model.response(model)
      grp_fct_name <- deparse1(form[[3]])
      Xnames <- attr(terms(model), "term.labels")
      res <- Map(fn, data[Xnames], MoreArgs = list(g = grp_fct))
      res <- do.call(rbind, res)
      data.frame(group =   sub("[.0-9]+$", "", rownames(res)), 
                        res, row.names = NULL)
      
    }
    
    multiple_t_test(mean~group, trust_news)
    
    
    # A tibble: 6 × 12
      group grp   estim…¹ estim…² estim…³ stati…⁴ p.value param…⁵ conf.…⁶ conf.…⁷ method
      <chr> <chr>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl> <chr> 
    1 mean  1:2    0.407     2.95    2.54  1.75     0.135    5.50  -0.175   0.989 Welch…
    2 mean  1:3    0.251     2.87    2.62  0.941    0.389    5.07  -0.432   0.934 Welch…
    3 mean  1:5   -0.0126    2.74    2.75 -0.0442   0.966    5.74  -0.719   0.694 Welch…
    4 mean  2:3    0.0126    2.75    2.74  0.0442   0.966    5.74  -0.694   0.719 Welch…
    5 mean  2:5   -0.251     2.62    2.87 -0.941    0.389    5.07  -0.934   0.432 Welch…
    6 mean  3:5   -0.407     2.54    2.95 -1.75     0.135    5.50  -0.989   0.175 Welch
    

    You can then use it as:

     multiple_t_test(mean+news~group, trust_news)# for only mean and news
    
     multiple_t_test(.~group, trust_news) # for everything else
    

    Note that the grouping factor is just 1 variable. The notion of having 2 does not work. but it can be optimized to use two