I have a data.frame
with about 66k rows and 150 columns. For the purposes of this question, this is a sample. All columns are date objects.
structure(list(fixed_date = structure(c(19267, 19239, 19120,
18035, 19253, 19211, 19120, 19778, 19120, 19211, 18224, 18224,
19624, 19211, 19211, 19211, 18409, 18409, 18558, 19876, 19267
), class = "Date"), BEGIN1 = structure(c(19240, 19212, 19086,
18008, 19240, 19177, 19086, 19751, 19086, 19177, 18197, 18197,
19597, 19177, 19177, 18197, 18197, 18197, 18477, 18477, 19240
), class = "Date"), END1 = structure(c(19267, 19239, 19120, 18035,
19253, 19211, 19120, 19778, 19120, 19211, 18224, 18224, 19624,
19211, 19211, 18224, 18224, 18224, 18488, 18488, 19267), class = "Date"),
BEGIN2 = structure(c(NA, NA, 18043, NA, NA, NA, NA, NA, 18232,
NA, NA, NA, NA, NA, 18232, 18232, 18232, 18232, 18512, 19849,
18484), class = "Date"), END2 = structure(c(NA, NA, 18070,
NA, NA, NA, NA, NA, 18280, NA, NA, NA, NA, NA, 18280, 18280,
18280, 18280, 18523, 19876, 18495), class = "Date"), BEGIN3 = structure(c(NA,
NA, 18897, NA, NA, NA, NA, NA, 18323, NA, NA, NA, NA, NA,
18354, 18354, 18396, 18396, 18547, 19793, 18519), class = "Date"),
END3 = structure(c(NA, NA, 18924, NA, NA, NA, NA, NA, 18350,
NA, NA, NA, NA, NA, 18381, 18381, 18409, 18409, 18558, 19827,
18530), class = "Date"), BEGIN4 = structure(c(NA, NA, 18113,
NA, NA, NA, NA, NA, 18687, NA, NA, NA, NA, NA, 18722, 18372,
NA, NA, NA, NA, 18554), class = "Date"), END4 = structure(c(NA,
NA, 18140, NA, NA, NA, NA, NA, 18700, NA, NA, NA, NA, NA,
18749, 18399, NA, NA, NA, NA, 18565), class = "Date"), BEGIN5 = structure(c(NA,
NA, 18148, NA, NA, NA, NA, NA, 18407, NA, NA, NA, NA, NA,
18407, 18407, NA, NA, NA, NA, NA), class = "Date"), END5 = structure(c(NA,
NA, 18175, NA, NA, NA, NA, NA, 18434, NA, NA, NA, NA, NA,
18434, 18434, NA, NA, NA, NA, NA), class = "Date")), row.names = c(NA,
-21L), class = c("tbl_df", "tbl", "data.frame"))
What I want is a column that gives me the name of the column across columns 2 and 11 which is equal to the value of column 1. This runs with no errors but no column is created:
sample$match_date_<- apply(sample[,c(2:11)], 1,
function(x) names(which(x == sample[,1])))
I don't know why this isn't working as expected and also I want to achieve a faster solution since I guess using apply will be slow. Any advice will be much appreciated.
As a tidyverse
/ dplyr
package user, my suggestion is to use the column names as "variable" for easier manipulations, with pivot_longer()
. If you anticipate that there will be a single match on every row, you can use filter()
. I don't think that it will slow, some seconds on bigger dataset maybe.
library(tidyverse)
df = structure(list(fixed_date = structure(c(19267, 19239, 19120,
18035, 19253, 19211, 19120, 19778, 19120, 19211, 18224, 18224,
19624, 19211, 19211, 19211, 18409, 18409, 18558, 19876, 19267
), class = "Date"), BEGIN1 = structure(c(19240, 19212, 19086,
18008, 19240, 19177, 19086, 19751, 19086, 19177, 18197, 18197,
19597, 19177, 19177, 18197, 18197, 18197, 18477, 18477, 19240
), class = "Date"), END1 = structure(c(19267, 19239, 19120, 18035,
19253, 19211, 19120, 19778, 19120, 19211, 18224, 18224, 19624,
19211, 19211, 18224, 18224, 18224, 18488, 18488, 19267), class = "Date"),
BEGIN2 = structure(c(NA, NA, 18043, NA, NA, NA, NA, NA, 18232,
NA, NA, NA, NA, NA, 18232, 18232, 18232, 18232, 18512, 19849,
18484), class = "Date"), END2 = structure(c(NA, NA, 18070,
NA, NA, NA, NA, NA, 18280, NA, NA, NA, NA, NA, 18280, 18280,
18280, 18280, 18523, 19876, 18495), class = "Date"), BEGIN3 = structure(c(NA,
NA, 18897, NA, NA, NA, NA, NA, 18323, NA, NA, NA, NA, NA,
18354, 18354, 18396, 18396, 18547, 19793, 18519), class = "Date"),
END3 = structure(c(NA, NA, 18924, NA, NA, NA, NA, NA, 18350,
NA, NA, NA, NA, NA, 18381, 18381, 18409, 18409, 18558, 19827,
18530), class = "Date"), BEGIN4 = structure(c(NA, NA, 18113,
NA, NA, NA, NA, NA, 18687, NA, NA, NA, NA, NA, 18722, 18372,
NA, NA, NA, NA, 18554), class = "Date"), END4 = structure(c(NA,
NA, 18140, NA, NA, NA, NA, NA, 18700, NA, NA, NA, NA, NA,
18749, 18399, NA, NA, NA, NA, 18565), class = "Date"), BEGIN5 = structure(c(NA,
NA, 18148, NA, NA, NA, NA, NA, 18407, NA, NA, NA, NA, NA,
18407, 18407, NA, NA, NA, NA, NA), class = "Date"), END5 = structure(c(NA,
NA, 18175, NA, NA, NA, NA, NA, 18434, NA, NA, NA, NA, NA,
18434, 18434, NA, NA, NA, NA, NA), class = "Date")), row.names = c(NA,
-21L), class = c("tbl_df", "tbl", "data.frame"))
df %>% mutate(id = 1:n() ) %>%
pivot_longer(-c(fixed_date, id) ) %>%
group_by(id) %>%
filter(fixed_date ==value)
#> # A tibble: 20 × 4
#> # Groups: id [20]
#> fixed_date id name value
#> <date> <int> <chr> <date>
#> 1 2022-10-02 1 END1 2022-10-02
#> 2 2022-09-04 2 END1 2022-09-04
#> 3 2022-05-08 3 END1 2022-05-08
#> 4 2019-05-19 4 END1 2019-05-19
#> 5 2022-09-18 5 END1 2022-09-18
#> 6 2022-08-07 6 END1 2022-08-07
#> 7 2022-05-08 7 END1 2022-05-08
#> 8 2024-02-25 8 END1 2024-02-25
#> 9 2022-05-08 9 END1 2022-05-08
#> 10 2022-08-07 10 END1 2022-08-07
#> 11 2019-11-24 11 END1 2019-11-24
#> 12 2019-11-24 12 END1 2019-11-24
#> 13 2023-09-24 13 END1 2023-09-24
#> 14 2022-08-07 14 END1 2022-08-07
#> 15 2022-08-07 15 END1 2022-08-07
#> 16 2020-05-27 17 END3 2020-05-27
#> 17 2020-05-27 18 END3 2020-05-27
#> 18 2020-10-23 19 END3 2020-10-23
#> 19 2024-06-02 20 END2 2024-06-02
#> 20 2022-10-02 21 END1 2022-10-02
Created on 2024-06-13 with reprex v2.1.0