rfor-loopvcf-variant-call-format

Calculation the minor allele frequency from variant dosage matrix in R


I have a variant dosage matrix and want to calculate the Minor Allele Frequency (MAF) for each variant (row) of the data frame dose_df.

I'd like to ask you wheather it is correct to say that the Allele Frequency (AF) of a variant will be calculated by considering the sum of each values in a row divided by two times of the total number of individuals. Then if the AF value was less than 0.5 it is going to be considered as MAF, otherwise 1-AF_value will be the MAF value.And if yes, whether the below for-loop does do the job.

Following the assumption above, here is a chunk of the dosage matrix called dose_df:

dose_df <- structure(list(CHR = c("chr22", "chr22", "chr22", "chr22", "chr22", 
"chr22", "chr22", "chr22", "chr22", "chr22", "chr22", "chr22", 
"chr22", "chr22", "chr22", "chr22", "chr22", "chr22", "chr22", 
"chr22"), POS = c(10519389L, 10526179L, 10526296L, 10527090L, 
10530609L, 10557732L, 10557819L, 10557824L, 10557840L, 10558138L, 
10559721L, 10559769L, 10560849L, 10560850L, 10560850L, 10560915L, 
10561980L, 10562747L, 10562991L, 10563056L), ID = c("chr22_10519389_T_C_b38", 
"chr22_10526179_G_A_b38", "chr22_10526296_G_A_b38", "chr22_10527090_C_T_b38", 
"chr22_10530609_C_T_b38", "chr22_10557732_G_A_b38", "chr22_10557819_C_G_b38", 
"chr22_10557824_AC_A_b38", "chr22_10557840_G_A_b38", "chr22_10558138_CT_C_b38", 
"chr22_10559721_AG_A_b38", "chr22_10559769_G_T_b38", "chr22_10560849_C_T_b38", 
"chr22_10560850_G_A_b38", "chr22_10560850_G_C_b38", "chr22_10560915_C_T_b38", 
"chr22_10561980_A_T_b38", "chr22_10562747_AGTTTT_A_b38", "chr22_10562991_G_T_b38", 
"chr22_10563056_C_T_b38"), `GTEX-1122O` = c(0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `GTEX-11EM3` = c(0, 
0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1), `GTEX-11EMC` = c(1, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `GTEX-11EQ9` = c(1, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `GTEX-11I78` = c(1, 
0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0), `GTEX-11VI4` = c(0, 
0, 2, 0, 0, 2, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0), `GTEX-11ZTT` = c(2, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0), `GTEX-1211K` = c(2, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0), `GTEX-1212Z` = c(2, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `GTEX-12696` = c(2, 
0, 0, 0, 2, 0, 2, 0, 0, 2, 0, 0, 2, 0, 0, 2, 2, 1, 0, 2), `GTEX-1269C` = c(2, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `GTEX-12C56` = c(2, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0), `GTEX-12WSJ` = c(2, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0), `GTEX-12WSL` = c(2, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0), `GTEX-12WSN` = c(0, 
2, 0, 1, 0, 1, 0, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 0), `GTEX-13111` = c(2, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0), `GTEX-1399R` = c(2, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0)), row.names = c(NA, 
-20L), class = c("data.table", 
"data.frame"))

I tried to do the job through a loop:

for (i in 1:nrow(dose_df)){
            dose_df$maf[i] <- sum(dose_df[i, -c(1,2,3)])/(2 * length(dose_df[, -c(1,2,3)]))
                  if (dose_df$maf[i] < 0.5){
                      dose_df$maf[i] <- dose_df$maf[i]
                  } else if (dose_df$maf[i] > 0.5)  {
                             dose_df$maf[i] <- 1 - dose_df$maf[i]
                        }
}

Here is the dose_df$maf:

> dose_df$maf
 [1] 0.32352941 0.07434641 0.07434641 0.04656863 0.10212418 0.10212418 0.07434641 0.04656863 0.12990196 0.07434641 0.01879085
[12] 0.12990196 0.10212418 0.12990196 0.04656863 0.10212418 0.10212418 0.26879085 0.07434641 0.10212418

Solution

  • This may be vectorized with rowSums and ifelse instead of looping over each row

    tmp <- rowSums(dose_df[,-(1:3)])/(2 * (ncol(dose_df) - 3))
    ifelse(tmp > 0.5, 1 - tmp, tmp)
    

    -output

    [1] 0.32352941 0.05882353 0.05882353 0.02941176 0.08823529 0.08823529 0.05882353 0.02941176 0.11764706 0.05882353 0.00000000 0.11764706 0.08823529 0.11764706 0.02941176 0.08823529
    [17] 0.08823529 0.26470588 0.05882353 0.08823529
    

    The for loop output is not correct as a column is added maf in the first iteration. So, when we take the length, it will be incremented by 1 (after the first iteration). One way to prevent this is by taking the length initially before we loop

    tmp1 <- dose_df[, -c(1,2,3)]
    l1 <- ncol(tmp1)
    dose_df$maf <- NA_real_
    for (i in 1:nrow(dose_df)){
                dose_df$maf[i] <- sum(tmp1[i,])/(2 * l1)
                      if (dose_df$maf[i] < 0.5){
                          dose_df$maf[i] <- dose_df$maf[i]
                      } else if (dose_df$maf[i] > 0.5)  {
                                 dose_df$maf[i] <- 1 - dose_df$maf[i]
                            }
    }
    

    -output

    > dose_df$maf
     [1] 0.32352941 0.05882353 0.05882353 0.02941176 0.08823529 0.08823529 0.05882353 0.02941176 0.11764706 0.05882353 0.00000000 0.11764706 0.08823529 0.11764706 0.02941176 0.08823529
    [17] 0.08823529 0.26470588 0.05882353 0.08823529