I am trying to split my data into training and testing sets using the caret package. I have 77 rows with complete data in every column. However, the function `createDataPartition` gives me 4 rows of training data and 73 rows of testing data, which doesn't seem right for `p = 0.8`. Any help would be appreciated. Here is my code:
> library(caret)
> # Split data into train and test
> set.seed(123)
> data.full <- data.full %>% select(fasting_status, a1c, glu, uc_ratio)
> training.samples <- data.full %>%
+ createDataPartition(p = 0.8, list = FALSE)
Warning messages:
1: In createDataPartition(., p = 0.8, list = FALSE) :
Some classes have no records ( ) and these will be ignored
2: In createDataPartition(., p = 0.8, list = FALSE) :
Some classes have a single record ( ) and these will be selected for the sample
> train.data <- data.full[training.samples, ]
> test.data <- data.full[-training.samples, ]
Here are my reproducible data:
> dput(data.full)
structure(list(fasting_status = structure(c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L), levels = c("1", "2"), class = "factor"),
a1c = c(4.3, 4.5, 4.4, 2.9, 4.3, 4.4, 4.2, 4.5, 4.2, 4.2,
4.5, 4.5, 4.8, 4.5, 5.2, 4.9, 4.6, 4.2, 4.4, 4.9, 4.6, 4.5,
4.4, 4.8, 4.5, 4.1, 3.8, 3.1, 4.3, 4.6, 4.7, 4.9, 4.6, 4.4,
3.1, 4.6, 4.4, 4.2, 4.4, 5.2, 4.4, 5.1, 4.6, 4.7, 5.2, 4.7,
4.7, 4.6, 4.4, 4.4, 4.2, 4.5, 4.6, 4.4, 3.2, 4.8, 5.2, 5.2,
4.6, 4.9, 5.6, 4.6, 4.9, 4.5, 5.1, 4.6, 4.9, 4.6, 4.3, 4.6,
4.6, 4.3, 4.6, 4.3, 4.6, 6.5, 4.8), glu = c(88.5, 98, 117.5,
53, 108.5, 106, 105, 101, 91, 99.5, 128.5, 113, 114, 121.5,
121, 131.5, 160.5, 96, 110, 140, 119.5, 115.3, 112, 143.5,
116.5, 116.5, 111, 139.5, 123.5, 131, 113, 137, 114, 98.5,
124.5, 123.5, 111.5, 111, 127, 123, 137.5, 119, 107, 130.5,
142.5, 115, 133.5, 119, 148.3, 125.5, 138.5, 106.5, 153.5,
126.5, 179, 145, 143, 124.5, 134, 146.5, 127.5, 124.5, 123,
129, 145.3, 125.5, 146.5, 153.5, 115.5, 128, 110.5, 131,
139.5, 124, 154, 94, 76.3), uc_ratio = c(30.65603924, 15.32801962,
60.59075991, 7.39973361, 57.84661317, 27.46781116, 16.0944206,
6.131207848, 94.61568474, 19.50838861, 7.803355443, 19.41549152,
7.464079119, 19.67095851, 29.50643777, 62.94706724, 80.472103,
25.75107296, 73.57449418, 39.01677721, 41.13018598, 10.62933697,
7.803355443, 30.04291845, 32.75355771, 49.52129416, 5.969860273,
22.72153497, 7.153075823, 75.61823012, 23.50296342, 53.64806867,
11.19611891, 38.25340549, 88.36152487, 51.50214592, 9.196811772,
41.98544505, 6.35828962, 9.196811772, 94.87237407, 12.87553648,
6.035407725, 7.39973361, 10.72961373, 11.70503316, 9.035464197,
16.34988759, 11.68917269, 35.11509949, 61.85306741, 11.36076748,
12.2624157, 7.153075823, 14.30615165, 10.40447392, 3.901677721,
52.11526671, 21.45922747, 30.49469166, 81.06819266, 1.950838861,
34.33476395, 8.0472103, 24.94635193, 9.754194304, 64.3776824,
9.196811772, 11.92179304, 34.87124464, 74.39198856, 124.4635193,
13.79521766, 5.722460658, 66.76204101, 69.9757432, 19.50838861
)), row.names = c(NA, -77L), class = "data.frame")
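`createDataPartition()` expects a vector as its first argument, not a data frame. Piping `data.full` into it passes the whole data frame as that vector, which is why you get the warnings about classes and a nonsensical 4/73 split. Partition a vector instead, typically an index vector of row numbers, and use the result to subset the data frame: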
# createDataPartition() expects a vector as its first argument, so partition
# the row indices rather than the data frame itself.
# seq_len() is used instead of 1:nrow() so that a zero-row data frame yields
# an empty index vector rather than c(1, 0).
row_ids <- seq_len(nrow(data.full))
splits <- createDataPartition(row_ids, p = 0.8, list = FALSE)
# With list = FALSE the result is a matrix; column 1 holds the training rows.
training.sample <- data.full[splits[, 1], ]
# Every row that was not selected for training goes to the test set.
test.sample <- data.full[setdiff(row_ids, splits[, 1]), ]
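Or, as an alternative way to build the test set, take every row of the full data that is not in the training sample: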
# anti_join() keeps the rows of data.full that have no match in
# training.sample. Naming the join columns explicitly avoids the
# "Joining with `by = ...`" message that the implicit form prints.
# Caveat: rows are matched by value, so a test row that exactly duplicates a
# training row would be dropped as well; index-based subsetting avoids this.
test.sample <- dplyr::anti_join(
  data.full,
  training.sample,
  by = names(data.full)
)
dim(training.sample)
#[1] 64 4
dim(test.sample)
#[1] 13 4