rdataframestatisticscorrelationpearson

replacement length error which doing correlation in R using cor function


I am trying to use cor function in R to do correlation for genes across many samples. I have two input files: actual expression and predicted expression. Both these files have 5 rows which correspond to genes and columns as sample. When i use cor function its giving an error: Basically for each gene i want to get a cor value across all the samples between actual and predicted.

    library(readr)
    actual_expr = read.table(file = "actual_expr.txt", header = TRUE, sep = "\t" )
    pred_expr = read.table(file = "predict_Expr.txt", header = TRUE, sep = "\t" )
    pred_expr=pred_expr[,-1]
    pred_expr <- as.data.frame(t(pred_expr))
    colnames(pred_expr) <- pred_expr[1,]
    pred_expr <- pred_expr[-c(1),]
    actual_expr <- as.data.frame(t(actual_expr))
    
    #filter columns of actual based on pred:
    #pred_expr <- pred_expr[,colnames(actual_expr)%in%colnames(pred_expr)]
    y <- colnames(actual_expr)
    z <- which(colnames(pred_expr) %in% y)
    pred_expr <- pred_expr[,z]
##convert chr to numeric for cor function
    for(i in 1:205){
    pred_expr[,i] <- as.numeric((as.character(pred_expr[,i])))
    #actual_expr[,i] <- as.numeric((as.character(actual_expr[,i])))
    }
    cor_vec <- NULL
    for(i in 1:5){ #for first-five gene
       if(rownames(pred_expr)[i] %in% rownames(actual_expr)=="TRUE"){
           z <- which(rownames(actual_expr) %in% rownames(pred_expr)[i])
           index3=which(rownames(pred_expr)[i] %in%  rownames(actual_expr)[z])
           index4=which(rownames(actual_expr) %in% rownames(pred_expr)[i])
           data1_1<<- pred_expr[index3,]
           data2_1<<- actual_expr[index4,]
           cor_vec[i] = cor(data1_1[i,],data2_1[i,])
           }
       }

The error is:

Warning messages:
1: In cor_vec[i] <- cor(data1_1[i, ], data2_1[i, ]) :
  number of items to replace is not a multiple of replacement length
2: In cor_vec[i] <- cor(data1_1[i, ], data2_1[i, ]) :
  number of items to replace is not a multiple of replacement length

And cor_Vec has NA NA NA NA NA values. The files are:

dput(data1_1[1, c(1, 5)])
structure(list(`GTEX-1117F` = -1.1059008, `GTEX-1192X` = -1.9839559), row.names = "ENSG00000278558.4", class = "data.frame")
> dput(data2_1[1, c(1, 5)])
structure(list(`GTEX-1117F` = 0.0294928231667479, `GTEX-1192X` = -0.294937186047513), row.names = "ENSG00000278558.4", class = "data.frame")
dput(pred_expr[1:2,c(1:5)])
structure(list(`GTEX-1117F` = c(-1.1059008, -0.7702356), `GTEX-111FC` = c(-1.9839559, 
-0.3895127), `GTEX-1128S` = c(-1.9839559, -0.3895127), `GTEX-117XS` = c(-0.9919779, 
-0.1947563), `GTEX-1192X` = c(-1.9839559, -0.3895127)), row.names = c("ENSG00000278558.4", 
"ENSG00000275793.1"), class = "data.frame")
dput(actual_expr[1:2,c(1:5)])
structure(list(`GTEX-1117F` = c(0.0205417435855037, -0.257328654412256
), `GTEX-111FC` = c(-0.196491626008295, -0.564355420678903), 
    `GTEX-1128S` = c(-0.259273634395018, -0.53496548376192), 
    `GTEX-117XS` = c(-0.1764185747562, -0.765050266632035), `GTEX-1192X` = c(-0.185980126179191, 
    -0.479964998120876)), row.names = c("ENSG00000227232.5", 
"ENSG00000268903.1"), class = "data.frame")

Does anyone know why I am getting this error. Thank you.


Solution

  • Inside the loop, use unlist to return a vector from the data.frame as subsetting with a single row, still returns the data.frame

    > str(data1_1)
    'data.frame':   1 obs. of  2 variables:
     $ GTEX-1117F: num -1.11
     $ GTEX-1192X: num -1.98
    > str(data1_1[1,])
    'data.frame':   1 obs. of  2 variables:
     $ GTEX-1117F: num -1.11
     $ GTEX-1192X: num -1.98
    > str(unlist(data1_1[1,]))
     Named num [1:2] -1.11 -1.98
     - attr(*, "names")= chr [1:2] "GTEX-1117F" "GTEX-1192X"
    

    So we use

    cor(unlist(data1_1[1,]), unlist(data2_1[1,]))
    

    within the loop