r, machine-learning, svm, supervised-learning, one-class-classification

One Class Classification in R language. What am I doing wrong when generating the confusion matrix?


I am trying to understand and implement one-class classifiers in R using several UCI datasets, one of which is the Chronic Kidney Disease dataset (http://archive.ics.uci.edu/ml/datasets/Chronic_Kidney_Disease).

When I try to print a confusion matrix, I get the error “all arguments must have the same length”.

What am I doing wrong?

library(caret)
library(dplyr)
library(e1071)
library(NLP)
library(tm)

# Load the kidney disease data; the first row of the CSV holds the column names.
ds = read.csv('kidney_disease.csv', 
              header = TRUE)

# Remove unusable columns: drop `age` and keep only the rows labelled 'ckd'.
ds <- subset(ds, select = -c(age), classification =='ckd' )

x <- subset(ds, select = -classification) # predictors: every column except the label
y <- ds$classification # label vector (dependent variable)

# test on the whole set
#pred <- predict(model, subset(ds, select=-classification))


# NOTE(review): `testnegative` is assigned the label vector `y`, not a set of
# non-ckd rows; after the subset above `ds` only contains 'ckd' rows.
trainPositive<-x
testnegative<-y

# Sample row positions for the training part (p = 0.6, returned as a matrix).
inTrain<-createDataPartition(1:nrow(trainPositive),p=0.6,list=FALSE)

# Columns 1-4 of `x` are used as predictors and column 6 as the training labels.
# NOTE(review): `classification` was removed from `x`, so column 6 is a feature
# column rather than the class label - confirm this is intended.
trainpredictors<-trainPositive[inTrain,1:4]
trainLabels<-trainPositive[inTrain,6]

# Rows not sampled for training, with `testnegative` appended via rbind().
# NOTE(review): this binds a data frame with a plain vector - verify the result.
testPositive<-trainPositive[-inTrain,]
testPosNeg<-rbind(testPositive,testnegative)

# Same column positions as for the training set.
testpredictors<-testPosNeg[,1:4]
testLabels<-testPosNeg[,6]

# Fit a one-class SVM on the training predictors only (no response is passed).
svm.model<-svm(trainpredictors,y=NULL,
               type='one-classification',
               nu=0.10,
               scale=TRUE,
               kernel="radial")

# Predict on the training and on the test predictors.
svm.predtrain<-predict(svm.model,trainpredictors)
svm.predtest<-predict(svm.model,testpredictors)

# confusionMatrixTable<-table(Predicted=svm.pred,Reference=testLabels)
# confusionMatrix(confusionMatrixTable,positive='TRUE')

# Cross-tabulate predictions against the label vectors; table() requires both
# arguments to have the same length.
confTrain <- table(Predicted=svm.predtrain,Reference=trainLabels)
confTest <- table(Predicted=svm.predtest,Reference=testLabels)

# Summary statistics for the test table, treating 'TRUE' as the positive class.
confusionMatrix(confTest,positive='TRUE')


print(confTrain)
print(confTest)

#grid

Here are some of the first lines of the dataset I'm using:

 id bp    sg al su    rbc       pc        pcc         ba bgr bu  sc sod pot hemo pcv   wc
1  0 80 1.020  1  0          normal notpresent notpresent 121 36 1.2  NA  NA 15.4  44 7800
2  1 50 1.020  4  0          normal notpresent notpresent  NA 18 0.8  NA  NA 11.3  38 6000
3  2 80 1.010  2  3 normal   normal notpresent notpresent 423 53 1.8  NA  NA  9.6  31 7500
4  3 70 1.005  4  0 normal abnormal    present notpresent 117 56 3.8 111 2.5 11.2  32 6700
5  4 80 1.010  2  0 normal   normal notpresent notpresent 106 26 1.4  NA  NA 11.6  35 7300
6  5 90 1.015  3  0                 notpresent notpresent  74 25 1.1 142 3.2 12.2  39 7800
   rc htn  dm cad appet  pe ane classification
1 5.2 yes yes  no  good  no  no            ckd
2      no  no  no  good  no  no            ckd
3      no yes  no  poor  no yes            ckd
4 3.9 yes  no  no  poor yes yes            ckd
5 4.6  no  no  no  good  no  no            ckd
6 4.4 yes yes  no  good yes  no            ckd

The error log:

> confTrain <- table(Predicted = svm.predtrain, Reference = trainLabels)
Error in table(Predicted = svm.predtrain, Reference = trainLabels) :
  all arguments must have the same length
> confTest <- table(Predicted = svm.predtest, Reference = testLabels)
Error in table(Predicted = svm.predtest, Reference = testLabels) :
  all arguments must have the same length
>
> confusionMatrix(confTest, positive = 'TRUE')
Error in confusionMatrix(confTest, positive = "TRUE") :
  object 'confTest' not found
>
>
> print(confTrain)
Error in print(confTrain) : object 'confTrain' not found
> print(confTest)
Error in print(confTest) : object 'confTest' not found



Solution

  • I see a number of issues. First, it seems that a lot of your data is of class character, whereas the classifier requires numeric input. Let's pick some columns and convert them to numeric. I will use data.table because fread is very convenient.

    library(caret)
    library(e1071)
    library(data.table)
    # Turn ds into a data.table
    setDT(ds)
    # Columns that have to be numeric
    mycols <- c("id", "bp", "sg", "al", "su")
    # Coerce each of the chosen columns to numeric, updating ds in place
    ds[, (mycols) := lapply(.SD, as.numeric), .SDcols = mycols]

    # Keep four predictors and recode classification as a logical that is
    # TRUE where the label equals "ckd"
    data <- ds[, .(bp, sg, al, su, classification = classification == "ckd")]
    data
         bp    sg al su classification
      1: 80 1.020  1  0           TRUE
      2: 50 1.020  4  0           TRUE
      3: 80 1.010  2  3           TRUE
      4: 70 1.005  4  0           TRUE
      5: 80 1.010  2  0           TRUE
     ---                              
    396: 80 1.020  0  0          FALSE
    397: 70 1.025  0  0          FALSE
    398: 80 1.020  0  0          FALSE
    399: 60 1.025  0  0          FALSE
    400: 80 1.025  0  0          FALSE
    

    Once the data is cleaned up, you can sample a training and test set with createDataPartition as in your original code.

    # Sample data for training and test set.
    # createDataPartition() is asked for about 60% of the row positions
    # (p = 0.6); list = FALSE returns them as a matrix instead of a list.
    # seq_len() replaces 1:nrow(data) so that an empty table produces an
    # empty index vector instead of c(1, 0).
    inTrain <- createDataPartition(seq_len(nrow(data)), p = 0.6, list = FALSE)
    # Sampled rows form the training set, all remaining rows the test set
    train <- data[inTrain, ]
    test <- data[-inTrain, ]
    

    Then we can create the model and make the predictions.

    # Fit a one-class SVM with a radial kernel on the training set, using
    # bp, sg, al and su as (scaled) predictors.
    svm.model <- svm(
      classification ~ bp + sg + al + su,
      data = train,
      type = 'one-classification',
      nu = 0.10,
      scale = TRUE,
      kernel = "radial"
    )

    # Perform predictions on the training set and on the held-out test set
    svm.predtrain <- predict(svm.model, newdata = train)
    svm.predtest <- predict(svm.model, newdata = test)
    

    Your main issue with the cross table was that the model can only predict for cases that don't have any NAs, so you have to subset the classification levels to those with predictions. Then you can evaluate confusionMatrix:

    # The names of each prediction vector are converted to integer row
    # positions and used to pick the reference labels that have a prediction.
    train_ref <- train$classification[as.integer(names(svm.predtrain))]
    test_ref <- test$classification[as.integer(names(svm.predtest))]

    # Cross-tabulate predictions against the matching reference labels
    confTrain <- table(Predicted = svm.predtrain, Reference = train_ref)
    confTest <- table(Predicted = svm.predtest, Reference = test_ref)

    # Evaluate the test table, treating 'TRUE' as the positive class
    confusionMatrix(confTest, positive = 'TRUE')
    
    Confusion Matrix and Statistics
    
             Reference
    Predicted FALSE TRUE
        FALSE     0   17
        TRUE     55   64
    
                   Accuracy : 0.4706         
                     95% CI : (0.3845, 0.558)
        No Information Rate : 0.5956         
        P-Value [Acc > NIR] : 0.9988         
    
                      Kappa : -0.2361        
    
     Mcnemar's Test P-Value : 1.298e-05      
    
                Sensitivity : 0.7901         
                Specificity : 0.0000         
             Pos Pred Value : 0.5378         
             Neg Pred Value : 0.0000         
                 Prevalence : 0.5956         
             Detection Rate : 0.4706         
       Detection Prevalence : 0.8750         
          Balanced Accuracy : 0.3951         
    
           'Positive' Class : TRUE           
    

    Data

    library(archive)
    library(data.table)
    tf1 <- tempfile(fileext = ".rar")
    # Download data file. The archive is binary, so request a binary transfer
    # with mode = "wb"; the default text mode can corrupt it on Windows.
    download.file("http://archive.ics.uci.edu/ml/machine-learning-databases/00336/Chronic_Kidney_Disease.rar", tf1, mode = "wb")
    tf2 <- tempfile()
    # Un-rar file into the temporary directory tf2
    archive_extract(tf1, tf2)
    # Read in data. skip is given as a character string, so fread searches for
    # the first line containing "48" and starts reading there
    # (presumably the first data row after the ARFF header - confirm).
    ds <- fread(file.path(tf2, "Chronic_Kidney_Disease", "chronic_kidney_disease.arff"),
                fill = TRUE, skip = "48")
    # Remove erroneous last column
    ds[, V26 := NULL]
    # Set column names (from header)
    # NOTE(review): the first ARFF attribute appears to be age rather than id -
    # confirm before relying on the "id" column.
    setnames(ds,c("id","bp","sg","al","su","rbc","pc","pcc","ba","bgr","bu","sc","sod","pot","hemo","pcv","wc","rc","htn","dm","cad","appet","pe","ane","classification"))
    # Replace "?" with NA
    ds[ds == "?"] <- NA