rdplyrdataset

How to get a value from one dataset to fill in the missing values in another dataset


I having a dataset with some incomplete data


start_station_name start_station_id start_lat end_lng
NA NA 41.9 -87.6
NA NA 41.9 -87.6
NA NA 41.9 -87.7
NA NA 41.9 -87.7
NA NA 41.9 -87.6
NA NA 41.9 -87.6
NA NA 41.9 -87.7
NA NA 41.9 -87.6
NA NA 41.9 -87.6
NA NA 42.0 -87.6

as you can see I have the start_lat and the end_lng but name and id is NA.

I created a data set from some of the complete rows that have the start_station_name, start_station_id, start_lat, and start_lng


start_station_name start_station_id start_lat end_lng
Yates Blvd & 93rd St 20237 41.7 -87.6
Yates Blvd & 75th St KA1503000024 41.8 -87.6
Woodlawn Ave & Lake Park Ave KA1503000065 41.8 -87.6
Woodlawn Ave & 75th St 569 41.8 -87.6
Woodlawn Ave & 58th St 22002 41.8 -87.6
Woodlawn Ave & 55th St TA1307000164 41.8 -87.6
Woodlawn & 103rd 20133 41.7 -87.6
Wood St & Webster Ave TA1307000138 41.9 -87.7
Wood St & Taylor St (Temp) 13285 41.9 -87.7
Wood St & Milwaukee Ave 13221 41.9 -87.6

The issue that I am having is trying to fill in the missing data with the reference dataset. Here is what I have right now


#get all bike station data
bike_share_data <- read_csv(cyclistic_file)

# get data regarding each distinct start_station
station_data <- bike_share_data %>% 
  distinct(start_station_name,.keep_all = TRUE) %>%
  select(start_station_name,start_station_id,start_lat,end_lng)

# Fill in missing values using merge (dplyr) by matching the lat and lng

bike_share_data2 <- bike_share_data %>%
  mutate(start_station_name=ifelse(is.na(start_station_name),(
                                   filter(data=station_data,station_data$start_lat==bike_share_data$start_lat && station_data$start_lng==bike_share_data$start_lng )%>%
                                      select(station_data$start_station_name))
                                   ,
                                   start_station_name)
)

What I was expecting was to get one row from the filtering then getting the start_station_name by using select.

what I got was

Caused by error in station_data$start_lat == bike_share_data$start_lat && station_data$start_lng == bike_share_data$start_lng: ! 'length = 5734381' in coercion to 'logical(1)'


Solution

  • Using rows_patch, with appropriate arguments and column conversions:

    library(dplyr)
    
    rows_patch(
      x=mutate(df, 
               start_station_name=as.character(start_station_name),
               start_station_id=as.character(start_station_id)),
      y=distinct(df_complete, start_lat, end_lng, .keep_all = TRUE), 
      by=c("start_lat", "end_lng"), unmatched = "ignore")
    

       start_station_name      start_station_id start_lat end_lng
       <chr>                   <chr>                <dbl>   <dbl>
     1 Wood St & Milwaukee Ave 13221                 41.9   -87.6
     2 Wood St & Milwaukee Ave 13221                 41.9   -87.6
     3 Wood St & Webster Ave   TA1307000138          41.9   -87.7
     4 Wood St & Webster Ave   TA1307000138          41.9   -87.7
     5 Wood St & Milwaukee Ave 13221                 41.9   -87.6
     6 Wood St & Milwaukee Ave 13221                 41.9   -87.6
     7 Wood St & Webster Ave   TA1307000138          41.9   -87.7
     8 Wood St & Milwaukee Ave 13221                 41.9   -87.6
     9 Wood St & Milwaukee Ave 13221                 41.9   -87.6
    10 NA                      NA                    42     -87.6
    

    The last row remains unpatched since there is no corresponding row in df_complete.


    Data:

    df <- structure(list(start_station_name = c(NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA), start_station_id = c(NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA), start_lat = c(41.9, 41.9, 41.9, 41.9, 41.9, 
    41.9, 41.9, 41.9, 41.9, 42), end_lng = c(-87.6, -87.6, -87.7, 
    -87.7, -87.6, -87.6, -87.7, -87.6, -87.6, -87.6)), class = c("tbl_df", 
    "tbl", "data.frame"), row.names = c(NA, -10L))
    
    df_complete <- structure(list(start_station_name = c("Yates Blvd & 93rd St", 
    "Yates Blvd & 75th St", "Woodlawn Ave & Lake Park Ave", "Woodlawn Ave & 75th St", 
    "Woodlawn Ave & 58th St", "Woodlawn Ave & 55th St", "Woodlawn & 103rd", 
    "Wood St & Webster Ave", "Wood St & Taylor St (Temp)", "Wood St & Milwaukee Ave"
    ), start_station_id = c("20237", "KA1503000024", "KA1503000065", 
    "569", "22002", "TA1307000164", "20133", "TA1307000138", "13285", 
    "13221"), start_lat = c(41.7, 41.8, 41.8, 41.8, 41.8, 41.8, 41.7, 
    41.9, 41.9, 41.9), end_lng = c(-87.6, -87.6, -87.6, -87.6, -87.6, 
    -87.6, -87.6, -87.7, -87.7, -87.6)), class = c("tbl_df", "tbl", 
    "data.frame"), row.names = c(NA, -10L))