I have a variant dosage matrix and want to calculate the Minor Allele Frequency (MAF) for each variant (row) of the data frame dose_df
.
I'd like to ask you wheather it is correct to say that the Allele Frequency (AF) of a variant will be calculated by considering the sum
of each values in a row divided by two times of the total number of individuals. Then if the AF value was less than 0.5 it is going to be considered as MAF, otherwise 1-AF_value
will be the MAF value.And if yes, whether the below for-loop does do the job.
Following the assumption above, here is a chunk of the dosage matrix called dose_df
:
dose_df <- structure(list(CHR = c("chr22", "chr22", "chr22", "chr22", "chr22",
"chr22", "chr22", "chr22", "chr22", "chr22", "chr22", "chr22",
"chr22", "chr22", "chr22", "chr22", "chr22", "chr22", "chr22",
"chr22"), POS = c(10519389L, 10526179L, 10526296L, 10527090L,
10530609L, 10557732L, 10557819L, 10557824L, 10557840L, 10558138L,
10559721L, 10559769L, 10560849L, 10560850L, 10560850L, 10560915L,
10561980L, 10562747L, 10562991L, 10563056L), ID = c("chr22_10519389_T_C_b38",
"chr22_10526179_G_A_b38", "chr22_10526296_G_A_b38", "chr22_10527090_C_T_b38",
"chr22_10530609_C_T_b38", "chr22_10557732_G_A_b38", "chr22_10557819_C_G_b38",
"chr22_10557824_AC_A_b38", "chr22_10557840_G_A_b38", "chr22_10558138_CT_C_b38",
"chr22_10559721_AG_A_b38", "chr22_10559769_G_T_b38", "chr22_10560849_C_T_b38",
"chr22_10560850_G_A_b38", "chr22_10560850_G_C_b38", "chr22_10560915_C_T_b38",
"chr22_10561980_A_T_b38", "chr22_10562747_AGTTTT_A_b38", "chr22_10562991_G_T_b38",
"chr22_10563056_C_T_b38"), `GTEX-1122O` = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `GTEX-11EM3` = c(0,
0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1), `GTEX-11EMC` = c(1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `GTEX-11EQ9` = c(1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `GTEX-11I78` = c(1,
0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0), `GTEX-11VI4` = c(0,
0, 2, 0, 0, 2, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0), `GTEX-11ZTT` = c(2,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0), `GTEX-1211K` = c(2,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0), `GTEX-1212Z` = c(2,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `GTEX-12696` = c(2,
0, 0, 0, 2, 0, 2, 0, 0, 2, 0, 0, 2, 0, 0, 2, 2, 1, 0, 2), `GTEX-1269C` = c(2,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `GTEX-12C56` = c(2,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0), `GTEX-12WSJ` = c(2,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0), `GTEX-12WSL` = c(2,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0), `GTEX-12WSN` = c(0,
2, 0, 1, 0, 1, 0, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 0), `GTEX-13111` = c(2,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0), `GTEX-1399R` = c(2,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0)), row.names = c(NA,
-20L), class = c("data.table",
"data.frame"))
I tried to do the job through a loop:
for (i in 1:nrow(dose_df)){
dose_df$maf[i] <- sum(dose_df[i, -c(1,2,3)])/(2 * length(dose_df[, -c(1,2,3)]))
if (dose_df$maf[i] < 0.5){
dose_df$maf[i] <- dose_df$maf[i]
} else if (dose_df$maf[i] > 0.5) {
dose_df$maf[i] <- 1 - dose_df$maf[i]
}
}
Here is the dose_df$maf
:
> dose_df$maf
[1] 0.32352941 0.07434641 0.07434641 0.04656863 0.10212418 0.10212418 0.07434641 0.04656863 0.12990196 0.07434641 0.01879085
[12] 0.12990196 0.10212418 0.12990196 0.04656863 0.10212418 0.10212418 0.26879085 0.07434641 0.10212418
This may be vectorized with rowSums
and ifelse
instead of looping over each row
tmp <- rowSums(dose_df[,-(1:3)])/(2 * (ncol(dose_df) - 3))
ifelse(tmp > 0.5, 1 - tmp, tmp)
-output
[1] 0.32352941 0.05882353 0.05882353 0.02941176 0.08823529 0.08823529 0.05882353 0.02941176 0.11764706 0.05882353 0.00000000 0.11764706 0.08823529 0.11764706 0.02941176 0.08823529
[17] 0.08823529 0.26470588 0.05882353 0.08823529
The for
loop output is not correct as a column is added maf
in the first iteration. So, when we take the length
, it will be incremented by 1 (after the first iteration). One way to prevent this is by taking the length
initially before we loop
tmp1 <- dose_df[, -c(1,2,3)]
l1 <- ncol(tmp1)
dose_df$maf <- NA_real_
for (i in 1:nrow(dose_df)){
dose_df$maf[i] <- sum(tmp1[i,])/(2 * l1)
if (dose_df$maf[i] < 0.5){
dose_df$maf[i] <- dose_df$maf[i]
} else if (dose_df$maf[i] > 0.5) {
dose_df$maf[i] <- 1 - dose_df$maf[i]
}
}
-output
> dose_df$maf
[1] 0.32352941 0.05882353 0.05882353 0.02941176 0.08823529 0.08823529 0.05882353 0.02941176 0.11764706 0.05882353 0.00000000 0.11764706 0.08823529 0.11764706 0.02941176 0.08823529
[17] 0.08823529 0.26470588 0.05882353 0.08823529