I have two data.tables in this format (the actual tables have about a million rows in each):
library(data.table)
dt1 <- data.table(
code=c("A001", "A002","A003","A004","A005"),
x=c(65,92,25,450,12),
y=c(98,506,72,76,15),
no1=c(010101, 010156, 028756, 372576,367383),
no2=c(876362,"",682973,78269,"")
)
dt2 <- data.table(
code=c("A003", "A004","A005","A006","A007","A008","A009"),
x=c(25,126,12,55,34,134,55),
y=c(72,76,890,568,129,675,989),
no1=c(028756, 372576,367383,234876, 287156, 123348, 198337),
no2=c(682973,78269,65378,"","","",789165)
)
I would like to combine the two together and keep only unique rows based on all column entries being unique. This is what I have but I assume there is a better way of doing it:
dt3 <- rbindlist(list(dt1, dt2))
dt3 <- unique(dt3, by = c("code", "x", "y", "no1", "no2"))
Once I have this single dataset I would like to give any duplicate 'code' records some attribute information (version number and a comment about what's different in that version to the previous one). The output I am looking for would be this:
dt4 <- data.table(
code=c("A001", "A002","A003","A004","A005", "A004","A005","A006","A007","A008","A009"),
x=c(65,92,25,450,12,126,12,55,34,134,55),
y=c(98,506,72,76,15,76,890,568,129,675,989),
no1=c(010101, 010156, 028756, 372576,367383, 372576,367383,234876, 287156, 123348, 198337),
no2=c(876362,"",682973,78269,"",78269,65378,"","","",789165),
version = c("V1","V1","V1","V1","V1","V2","V2","V1","V1","V1","V1"),
unique_version=c("A001_V1", "A002_V1","A003_V1","A004_V1","A005_V1", "A004_V2","A005_V2","A006_V1","A007_V1","A008_V1","A009_V1"),
comment = c("First_entry","First_entry","First_entry","First_entry","First_entry","New_x", "New_y_and_no2","First_entry","First_entry","First_entry","First_entry")
)
I'm not sure how to achieve dt4 (and in an efficient way considering the size of the real dataset will be over a million rows).
Edit
Having applied @Chase's solution to my real data I noticed my dt3 example varies slightly from the type of result I am getting. This looks more like my real data:
dt6 <- data.table(
code=c("A111", "A111","A111","A111","A111", "A111","A111","A234", "A234","A234","A234","A234", "A234","A234"),
x=c("",126,126,"",836,843,843,126,126,"",127,836,843,843),
y=c("",76,76,"",456,465,465,76,76,"",77,456,465,465),
no1=c(028756, 028756,028756,057756, 057756, 057756, 057756,028756, 028756,057756,057756, 057756, 057756, 057756),
no2=c("","",034756,"","","",789165,"",034756,"","","","",789165)
)
comp_cols <- c("x", "y", "no1", "no2")
#grabs the names of the mismatching values and formats them how you did
f <- function(x,y) {
n_x <- names(x)
diff <- x != y
paste0("New_", paste0(n_x[diff], collapse = "_and_"))
}
dt6[, version := paste0("V", 1:.N), by = code]
dt6[, unique_version := paste(code, version, sep = "_")]
dt6[, comment := ifelse(version == "V1", "First_entry", f(.SD[1], .SD[2])), by = code, .SDcols = comp_cols]
As you can see the suggested solution to create the comment column seems to be returning only the first change between the first and second versions (and not the changes better V2 and V3 etc.)
Here's one solution - the first two are trivial, the comment takes a little more thought:
dt5 <- copy(dt3)
comp_cols <- c("x", "y", "no1", "no2")
#grabs the names of the mismatching values and formats them how you did
f <- function(x,y) {
n_x <- names(x)
diff <- x != y
paste0("New_", paste0(n_x[diff], collapse = "_and"))
}
dt5[, version := paste0("V", 1:.N), by = code]
dt5[, unique_version := paste(code, version, sep = "_")]
dt5[, comment := ifelse(version == "V1", "First_entry", f(.SD[1], .SD[2])), by = code, .SDcols = comp_cols]
End up yielding this:
> dt5
code x y no1 no2 version unique_version comment
1: A001 65 98 10101 876362 V1 A001_V1 First_entry
2: A002 92 506 10156 V1 A002_V1 First_entry
3: A003 25 72 28756 682973 V1 A003_V1 First_entry
4: A004 450 76 372576 78269 V1 A004_V1 First_entry
5: A005 12 15 367383 V1 A005_V1 First_entry
6: A004 126 76 372576 78269 V2 A004_V2 New_x
7: A005 12 890 367383 65378 V2 A005_V2 New_y_andno2
8: A006 55 568 234876 V1 A006_V1 First_entry
9: A007 34 129 287156 V1 A007_V1 First_entry
10: A008 134 675 123348 V1 A008_V1 First_entry
11: A009 55 989 198337 789165 V1 A009_V1 First_entry