I having a dataset with some incomplete data
start_station_name | start_station_id | start_lat | end_lng |
---|---|---|---|
NA | NA | 41.9 | -87.6 |
NA | NA | 41.9 | -87.6 |
NA | NA | 41.9 | -87.7 |
NA | NA | 41.9 | -87.7 |
NA | NA | 41.9 | -87.6 |
NA | NA | 41.9 | -87.6 |
NA | NA | 41.9 | -87.7 |
NA | NA | 41.9 | -87.6 |
NA | NA | 41.9 | -87.6 |
NA | NA | 42.0 | -87.6 |
as you can see I have the start_lat and the end_lng but name and id is NA.
I created a data set from some of the complete rows that have the start_station_name, start_station_id, start_lat, and start_lng
start_station_name | start_station_id | start_lat | end_lng |
---|---|---|---|
Yates Blvd & 93rd St | 20237 | 41.7 | -87.6 |
Yates Blvd & 75th St | KA1503000024 | 41.8 | -87.6 |
Woodlawn Ave & Lake Park Ave | KA1503000065 | 41.8 | -87.6 |
Woodlawn Ave & 75th St | 569 | 41.8 | -87.6 |
Woodlawn Ave & 58th St | 22002 | 41.8 | -87.6 |
Woodlawn Ave & 55th St | TA1307000164 | 41.8 | -87.6 |
Woodlawn & 103rd | 20133 | 41.7 | -87.6 |
Wood St & Webster Ave | TA1307000138 | 41.9 | -87.7 |
Wood St & Taylor St (Temp) | 13285 | 41.9 | -87.7 |
Wood St & Milwaukee Ave | 13221 | 41.9 | -87.6 |
The issue that I am having is trying to fill in the missing data with the reference dataset. Here is what I have right now
#get all bike station data
bike_share_data <- read_csv(cyclistic_file)
# get data regarding each distinct start_station
station_data <- bike_share_data %>%
distinct(start_station_name,.keep_all = TRUE) %>%
select(start_station_name,start_station_id,start_lat,end_lng)
# Fill in missing values using merge (dplyr) by matching the lat and lng
bike_share_data2 <- bike_share_data %>%
mutate(start_station_name=ifelse(is.na(start_station_name),(
filter(data=station_data,station_data$start_lat==bike_share_data$start_lat && station_data$start_lng==bike_share_data$start_lng )%>%
select(station_data$start_station_name))
,
start_station_name)
)
What I was expecting was to get one row from the filtering then getting the start_station_name by using select.
what I got was
Caused by error in station_data$start_lat == bike_share_data$start_lat && station_data$start_lng == bike_share_data$start_lng
:
! 'length = 5734381' in coercion to 'logical(1)'
Using rows_patch
, with appropriate arguments and column conversions:
library(dplyr)
rows_patch(
x=mutate(df,
start_station_name=as.character(start_station_name),
start_station_id=as.character(start_station_id)),
y=distinct(df_complete, start_lat, end_lng, .keep_all = TRUE),
by=c("start_lat", "end_lng"), unmatched = "ignore")
start_station_name start_station_id start_lat end_lng
<chr> <chr> <dbl> <dbl>
1 Wood St & Milwaukee Ave 13221 41.9 -87.6
2 Wood St & Milwaukee Ave 13221 41.9 -87.6
3 Wood St & Webster Ave TA1307000138 41.9 -87.7
4 Wood St & Webster Ave TA1307000138 41.9 -87.7
5 Wood St & Milwaukee Ave 13221 41.9 -87.6
6 Wood St & Milwaukee Ave 13221 41.9 -87.6
7 Wood St & Webster Ave TA1307000138 41.9 -87.7
8 Wood St & Milwaukee Ave 13221 41.9 -87.6
9 Wood St & Milwaukee Ave 13221 41.9 -87.6
10 NA NA 42 -87.6
The last row remains unpatched since there is no corresponding row in df_complete
.
Data:
df <- structure(list(start_station_name = c(NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA), start_station_id = c(NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA), start_lat = c(41.9, 41.9, 41.9, 41.9, 41.9,
41.9, 41.9, 41.9, 41.9, 42), end_lng = c(-87.6, -87.6, -87.7,
-87.7, -87.6, -87.6, -87.7, -87.6, -87.6, -87.6)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -10L))
df_complete <- structure(list(start_station_name = c("Yates Blvd & 93rd St",
"Yates Blvd & 75th St", "Woodlawn Ave & Lake Park Ave", "Woodlawn Ave & 75th St",
"Woodlawn Ave & 58th St", "Woodlawn Ave & 55th St", "Woodlawn & 103rd",
"Wood St & Webster Ave", "Wood St & Taylor St (Temp)", "Wood St & Milwaukee Ave"
), start_station_id = c("20237", "KA1503000024", "KA1503000065",
"569", "22002", "TA1307000164", "20133", "TA1307000138", "13285",
"13221"), start_lat = c(41.7, 41.8, 41.8, 41.8, 41.8, 41.8, 41.7,
41.9, 41.9, 41.9), end_lng = c(-87.6, -87.6, -87.6, -87.6, -87.6,
-87.6, -87.6, -87.7, -87.7, -87.6)), class = c("tbl_df", "tbl",
"data.frame"), row.names = c(NA, -10L))