I have a dataframe column consisting of a string of attributes in the format "attribute1=>value1,attribute2=>value2,..."
. Entries have different attributes in different orders. For example, row 1 may have attributes A, B, and C while row 2 may have attributes C, B, D, and E.
I want to filter the attribute list by string match and then expand this column into multiple columns where each column name is the attribute name. Missing values should be populated by NA.
I have a solution using lots of lapply
statements but I expect there is a more efficient one, either through simplifying the current solution, or using rrapply
or another function/package (such as separate_wider_delim
). The dataframe will have ~500,000 rows.
my_ids = c(1066,1990,578,2299,725,219,1229,2089,383,544,1243,1164,723)
my_attributes = c("addr:city=>Shanklin,addr:country=>GB,addr:housenumber=>36,38,addr:postcode=>PO37 6JY,addr:street=>High Street,amenity=>restaurant,level=>0",
"addr:city=>Ryde,addr:housenumber=>78,addr:street=>West Street,amenity=>doctors,healthcare=>doctor",
"addr:city=>Ryde,addr:country=>GB,addr:housenumber=>134,addr:postcode=>PO33 2RJ,addr:street=>High Street,amenity=>community_centre,check_date:opening_hours=>2023-09-16,fhrs:id=>1647467,level=>0,opening_hours:signed=>no",
"addr:city=>Ventnor,addr:country=>GB,addr:housename=>Froghill Farm,addr:postcode=>PO38 3AN,addr:street=>Shanklin Road,addr:village=>Godshill,amenity=>fuel,brand=>Gulf,brand:wikidata=>Q5617505,car_wash=>no,fhrs:id=>540417",
"addr:city=>Freshwater,addr:country=>GB,addr:housename=>The Apple Farm,addr:postcode=>PO40 9XR,addr:street=>Newport Road,amenity=>cafe,fhrs:id=>540510,source:addr=>FHRS Open Data",
"access=>yes,addr:city=>Ventnor,addr:postcode=>PO38 1UF,addr:street=>Steephill Road,amenity=>parking,capacity=>10,charge=>£3 per day,contact:email=>steephillcarpark@gmail.com,fee=>yes,park_ride=>no,parking=>surface,payment:coins=>yes,surface=>unpaved",
"addr:city=>Ventnor,addr:country=>GB,addr:postcode=>PO38 1UF,addr:street=>Steephill Road,amenity=>pub,fhrs:id=>541580,source:addr=>FHRS Open Data,wheelchair=>yes",
"addr:city=>Ventnor,addr:country=>GB,addr:housename=>Ventnor Park,addr:postcode=>PO38 1JZ,addr:street=>Park Avenue,amenity=>cafe,drinking_water:refill=>yes,drinking_water:refill:network=>Refill,fhrs:id=>540763",
"addr:city=>Ventnor,addr:country=>GB,addr:postcode=>PO38 3LY,addr:street=>Newport Road,addr:village=>Godshill,fhrs:id=>541337,shop=>farm",
"addr:city=>Newport,addr:country=>GB,addr:postcode=>PO30 2PD,addr:street=>Briddlesford Road,amenity=>recycling,opening_hours=>Nov-Mar: 10:00-18:00; Apr-Oct: 7:00-20:00; Dec 25 off,owner=>Isle of Wight Council,ownership=>municipal,recycling:batteries=>yes,recycling:bulky_waste=>yes,recycling:cans=>yes,recycling:car_batteries=>yes,recycling:cardboard=>yes,recycling:clothes=>yes,recycling:cooking_oil=>yes,recycling:electrical_appliances=>yes,recycling:electrical_items=>yes,recycling:engine_oil=>yes,recycling:fluorescent_tubes=>yes,recycling:glass=>yes,recycling:green_waste=>yes,recycling:paper=>yes,recycling:plastic=>yes,recycling:plastic_bottles=>yes,recycling:residual_waste=>yes,recycling:scrap_metal=>yes,recycling:small_appliances=>yes,recycling:white_goods=>yes,recycling:wood=>yes,recycling_type=>centre,ref:GB:uprn=>10024247154,website=>https://www.iow.gov.uk/Residents/Environment-Planning-and-Waste/Waste-and-Recycling/Recycling-Centres/Lynnbottom-HWRC,wikidata=>Q106093911",
"addr:city=>Carisbrooke,addr:country=>GB,addr:housename=>Carisbrooke Castle,addr:postcode=>PO30 1XY,addr:street=>Castle Hill,amenity=>cafe,bar=>no,check_date=>2025-05-04,cuisine=>coffee_shop,diet:vegetarian=>yes,fhrs:id=>540472,outdoor_seating=>yes,payment:cash=>yes,payment:credit_cards=>yes,payment:debit_cards=>yes,takeaway=>yes",
"addr:city=>Gatcombe,addr:country=>GB,addr:housename=>Little Gatcombe Farm,addr:postcode=>PO30 3EQ,addr:street=>Newbarn Lane,amenity=>cafe,fhrs:id=>540818,operator=>Little Gatcombe Farm",
"addr:city=>Freshwater,addr:country=>GB,addr:county=>Isle of Wight,addr:postcode=>PO40 9DT,addr:street=>Queens Road,amenity=>doctors,healthcare=>doctor,healthcare:speciality=>general,opening_hours=>Mo-Fr 08:30-18:00,operator=>West Wight Medical Partnerships,website=>https://brooksidehealthcentre.nhs.uk/"
)
datadf = data.frame(id = my_ids,
attribute_tags = my_attributes)
id | addr:city | addr:country | addr:housenumber | addr:postcode | addr:street | addr:housename | addr:village | addr:county |
---|---|---|---|---|---|---|---|---|
1066 | Shanklin | GB | 36 | PO37 6JY | High Street | NA | NA | NA |
1990 | Ryde | NA | 78 | NA | West Street | NA | NA | NA |
578 | Ryde | GB | 134 | PO33 2RJ | High Street | NA | NA | NA |
2299 | Ventnor | GB | NA | PO38 3AN | Shanklin Road | Froghill Farm | Godshill | NA |
725 | Freshwater | GB | NA | PO40 9XR | Newport Road | The Apple Farm | NA | NA |
219 | Ventnor | NA | NA | PO38 1UF | Steephill Road | NA | NA | NA |
1229 | Ventnor | GB | NA | PO38 1UF | Steephill Road | NA | NA | NA |
2089 | Ventnor | GB | NA | PO38 1JZ | Park Avenue | Ventnor Park | NA | NA |
383 | Ventnor | GB | NA | PO38 3LY | Newport Road | NA | Godshill | NA |
544 | Newport | GB | NA | PO30 2PD | Briddlesford Road | NA | NA | NA |
1243 | Carisbrooke | GB | NA | PO30 1XY | Castle Hill | Carisbrooke Castle | NA | NA |
1164 | Gatcombe | GB | NA | PO30 3EQ | Newbarn Lane | Little Gatcombe Farm | NA | NA |
723 | Freshwater | GB | NA | PO40 9DT | Queens Road | NA | NA | Isle of Wight |
datadf = data.frame(
id = sample(3000, 13),
attributes = a1[57:69])
tags_sep = function(myname) { # turn into named list
mystr = strsplit(myname, '=>')
mylist = mystr[[1]][2]
names(mylist) = mystr[[1]][1]
return(mylist)
}
tmp1 = rapply(as.list(datadf$attribute_tags), function(x) strsplit(x, ','), how = 'list')
tmp2 = rapply(tmp1, function(x) subset(x, grepl('addr:', x)), how = 'list') # get attributes with 'addr' in the name
tmp3 = unlist(tmp2, recursive = F)
tmp4 = lapply(tmp3, function(x) (lapply(x, function(y) tags_sep(y)))) # create named list
tmp5 = lapply(tmp4, function(x) unlist(x, recursive = F))
tmp6 = bind_rows(tmp5)
outputdf = cbind(datadf['id'], tmp6)
rrapply
library(rrapply)
tmp1 = lapply(as.list(datadf$attribute_tags), function(x) as.list(strsplit(x, ',')[[1]])) # get into list of lists
tmp2 = rrapply(tmp1,
condition = function(x) grepl('addr:', x),
how = "prune")
tmp3 = rrapply(tmp2,
f = function(x) tags_sep(x),
how = 'melt')
Unfortunately, the attribute names are not present in the dataframe. A similar problem arises when using how = 'bind'
. The resulting dataframe does not have attribute names as column names and does not properly handle missing attributes or attributes in a different order.
I have seen questions and vignettes and how to handle these issues but these are always starting with a named list. I am trying to create a named list and then a dataframe.
In base R you could use some regex to accomplish this. The goal of regex is to make the string to a compatible dcf file
a <- gsub(",?([^ ,]+=>)", "\n\\1", datadf$attribute_tags)
b <- gsub("(.*?):(?=.*=>)", "\\1_", a, perl = TRUE)
d <- read.dcf(textConnection(gsub("=>", ":", b)), all = TRUE)
cbind(datadf[1], subset(d, select = startsWith(names(d), "addr")))
id addr_city addr_country addr_housenumber addr_postcode addr_street addr_housename addr_village addr_county
1 1066 Shanklin GB 36,38 PO37 6JY High Street <NA> <NA> <NA>
2 1990 Ryde <NA> 78 <NA> West Street <NA> <NA> <NA>
3 578 Ryde GB 134 PO33 2RJ High Street <NA> <NA> <NA>
4 2299 Ventnor GB <NA> PO38 3AN Shanklin Road Froghill Farm Godshill <NA>
5 725 Freshwater GB <NA> PO40 9XR Newport Road The Apple Farm <NA> <NA>
6 219 Ventnor <NA> <NA> PO38 1UF Steephill Road <NA> <NA> <NA>
7 1229 Ventnor GB <NA> PO38 1UF Steephill Road <NA> <NA> <NA>
8 2089 Ventnor GB <NA> PO38 1JZ Park Avenue Ventnor Park <NA> <NA>
9 383 Ventnor GB <NA> PO38 3LY Newport Road <NA> Godshill <NA>
10 544 Newport GB <NA> PO30 2PD Briddlesford Road <NA> <NA> <NA>
11 1243 Carisbrooke GB <NA> PO30 1XY Castle Hill Carisbrooke Castle <NA> <NA>
12 1164 Gatcombe GB <NA> PO30 3EQ Newbarn Lane Little Gatcombe Farm <NA> <NA>
13 723 Freshwater GB <NA> PO40 9DT Queens Road <NA> <NA> Isle of Wight
If speed is necessary, then only read the required fields:
field <- c("city", "country", "housenumber", "postcode", "street", "housename",
"village", "county")
fields <- paste0("addr_", field)
a <- gsub(",?([^ ,]+=>)", "\n\\1", datadf$attribute_tags)
b <- gsub("(.*?):(?=.*=>)", "\\1_", a, perl = TRUE)
d <- read.dcf(textConnection(gsub("=>", ":", b)), fields)
cbind(datadf[1], d)