I am generating a dataframe with dummy variables using the fastDummies
for later modeling using two character variables.
The original looking something like this:
PID Flag1 Flag2 Var_1 Var_2
0001 0 0 AB AB
0001 0 0 CD CD
0001 1 0 EF EF
0001 0 0 GH GH
0001 0 0 IJ IJ
0001 0 1 KL KL
0001 0 0 MN MN
0001 1 0 OP OP
0001 0 0 QR QR
0001 0 1 ST ST
0001 0 0 UV UV
0001 1 0 WX WX
0001 0 0 YZ YZ
And fastDummies
will generate an output of Var_1
and Var_2
like the following:
PID Flag1 Flag2 Var_1 Var_2 Var_1_AB Var_1_CD Var_1_EF Var_1_GH Var_1_IJ Var_1_KL Var_1_MN Var_1_OP Var_1_QR Var_1_ST Var_1_UV Var_1_WX Var_1_YZ Var_2_AB Var_2_CD Var_2_EF Var_2_GH Var_2_IJ Var_2_KL Var_2_MN Var_2_OP Var_2_QR Var_2_ST Var_2_UV Var_2_WX Var_2_YZ
0001 0 0 AB AB 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
0001 0 0 CD CD 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
0001 1 0 EF EF 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
0001 0 0 GH GH 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
0001 0 0 IJ IJ 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
0001 0 1 KL KL 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
0001 0 0 MN MN 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
0001 1 0 OP OP 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
0001 0 0 QR QR 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
0001 0 1 ST ST 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
0001 0 0 UV UV 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
0001 1 0 WX WX 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0
0001 0 0 YZ YZ 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1
However, the data contains flags for encoding whether or not a variable needs to be inverted (in this case, flipped from 0 to 1 or vise-verse), indicated with Flag_1
and Flag_2
.
Which would need to look something like this:
PID Flag1 Flag2 Var_1 Var_2 Var_1_AB Var_1_CD Var_1_EF Var_1_GH Var_1_IJ Var_1_KL Var_1_MN Var_1_OP Var_1_QR Var_1_ST Var_1_UV Var_1_WX Var_1_YZ Var_2_AB Var_2_CD Var_2_EF Var_2_GH Var_2_IJ Var_2_KL Var_2_MN Var_2_OP Var_2_QR Var_2_ST Var_2_UV Var_2_WX Var_2_YZ
1 0 0 AB AB 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
1 0 0 CD CD 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
1 1 0 EF EF 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
1 0 0 GH GH 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
1 0 0 IJ IJ 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
1 0 1 KL KL 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1 0 0 MN MN 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
1 1 0 OP OP 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
1 0 0 QR QR 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
1 0 1 ST ST 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1 0 0 UV UV 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
1 1 0 WX WX 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
1 0 0 YZ YZ 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1
Where a 1 on Flag_1
indicates a flip from 0 to 1 on the dummy variable created using Var_1
and a 1 on Flag_2
is indicative of a 0 to 1 flip on the dummy variable created using Var_2
.
I was originally going to create a function to do this that reads in the dataframe and parses values based on strings and column names and deploy it via lapply
, but I feel like there's a way to do this using dplyr/tidyverse
via mutate_at
or mutate_all
that can be manipulated with the standard variable names that I'm just missing.
Any suggestions at all on how to accomplish this in a simpler manner would be appreciated.
Some code to generate the aforementioned dataframe:
list.of.packages <-
c("data.table", "tidyverse", "stringr","janitor","fastDummies")
new.packages <-
list.of.packages[!(list.of.packages %in% installed.packages()[, "Package"])]
if (length(new.packages))
install.packages(new.packages)
lapply(list.of.packages, require, character.only = TRUE)
df = data.frame(PID = c(0001,0001,0001,0001,0001,0001,0001,0001,0001,0001,0001,0001,0001), Flag1 = c(0,0,1,0,0,0,0,1,0,0,0,1,0), Flag2 = c(0,0,0,0,0,1,0,0,0,1,0,0,0), Var_1 = c("AB","CD","EF","GH","IJ","KL","MN","OP","QR","ST","UV","WX","YZ"), Var_2 = c("AB","CD","EF","GH","IJ","KL","MN","OP","QR","ST","UV","WX","YZ"))
df <- dummy_cols(df, select_columns = "Var_1", remove_selected_columns=FALSE, omit_colname_prefix=FALSE)
df <- dummy_cols(df, select_columns = "Var_2", remove_selected_columns=FALSE, omit_colname_prefix=FALSE)
Try a two-across
method:
library(dplyr)
dat1new <- dat1 %>%
mutate(
across(starts_with("Var_1_"),
~ if_else(Var_1 == sub(".*_", "", cur_column()) & Flag1 > 0, +!., .)),
across(starts_with("Var_2_"),
~ if_else(Var_2 == sub(".*_", "", cur_column()) & Flag2 > 0, +!., .))
)
dat1new
# PID Flag1 Flag2 Var_1 Var_2 Var_1_AB Var_1_CD Var_1_EF Var_1_GH Var_1_IJ Var_1_KL Var_1_MN Var_1_OP Var_1_QR Var_1_ST Var_1_UV Var_1_WX Var_1_YZ Var_2_AB Var_2_CD Var_2_EF Var_2_GH Var_2_IJ Var_2_KL Var_2_MN Var_2_OP Var_2_QR Var_2_ST Var_2_UV Var_2_WX Var_2_YZ
# 1 1 0 0 AB AB 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
# 2 1 0 0 CD CD 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
# 3 1 1 0 EF EF 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
# 4 1 0 0 GH GH 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
# 5 1 0 0 IJ IJ 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
# 6 1 0 1 KL KL 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# 7 1 0 0 MN MN 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
# 8 1 1 0 OP OP 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
# 9 1 0 0 QR QR 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
# 10 1 0 1 ST ST 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# 11 1 0 0 UV UV 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
# 12 1 1 0 WX WX 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
# 13 1 0 0 YZ YZ 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1
all.equal(dat1new, dat2)
# [1] TRUE
Walkthrough:
across
is the preferred method over mutate_at
, it should look similar-enough._
in starts_with("Var_1_")
should match only the columns we want and skip the literal Var_1
column.sub(...)
converts each column name to just the two-letter code found in Var_1
Var_1
) match and Flag1
is set, then use +!.
which is a shortcut way to convert an integer to logical, invert it, then back to integer._2_
variablesData: dat1
is your original (second data code block), dat2
is your desired output (third data code block).
dat1 <- structure(list(PID = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), Flag1 = c(0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 0L), Flag2 = c(0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L), Var_1 = c("AB", "CD", "EF", "GH", "IJ", "KL", "MN", "OP", "QR", "ST", "UV", "WX", "YZ"), Var_2 = c("AB", "CD", "EF", "GH", "IJ", "KL", "MN", "OP", "QR", "ST", "UV", "WX", "YZ"), Var_1_AB = c(1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Var_1_CD = c(0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Var_1_EF = c(0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Var_1_GH = c(0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Var_1_IJ = c(0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Var_1_KL = c(0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Var_1_MN = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L), Var_1_OP = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L), Var_1_QR = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L), Var_1_ST = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L), Var_1_UV = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L), Var_1_WX = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L), Var_1_YZ = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L), Var_2_AB = c(1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Var_2_CD = c(0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Var_2_EF = c(0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Var_2_GH = c(0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Var_2_IJ = c(0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Var_2_KL = c(0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Var_2_MN = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L), Var_2_OP = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L), Var_2_QR = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L), Var_2_ST = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L), Var_2_UV = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L), Var_2_WX = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L), Var_2_YZ = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L)), class = "data.frame", row.names = c(NA, -13L))
dat2 <- structure(list(PID = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), Flag1 = c(0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 0L), Flag2 = c(0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L), Var_1 = c("AB", "CD", "EF", "GH", "IJ", "KL", "MN", "OP", "QR", "ST", "UV", "WX", "YZ"), Var_2 = c("AB", "CD", "EF", "GH", "IJ", "KL", "MN", "OP", "QR", "ST", "UV", "WX", "YZ"), Var_1_AB = c(1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Var_1_CD = c(0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Var_1_EF = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Var_1_GH = c(0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Var_1_IJ = c(0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Var_1_KL = c(0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Var_1_MN = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L), Var_1_OP = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Var_1_QR = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L), Var_1_ST = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L), Var_1_UV = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L), Var_1_WX = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Var_1_YZ = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L), Var_2_AB = c(1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Var_2_CD = c(0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Var_2_EF = c(0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Var_2_GH = c(0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Var_2_IJ = c(0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Var_2_KL = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Var_2_MN = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L), Var_2_OP = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L), Var_2_QR = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L), Var_2_ST = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Var_2_UV = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L), Var_2_WX = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L), Var_2_YZ = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L)), class = "data.frame", row.names = c(NA, -13L))