I need to do fuzzy matching in the following pattern: table A contains strings with addresses (already preformatted by me, e.g. whitespace removed), whose correctness I have to verify. Table B contains all the possible addresses (formatted the same way as table A). So I don't want to just match row 1 from table A to row 1 from table B and so on, but to compare each row from table A against the whole of table B and find the closest match for each.
From what I have checked, adist and agrep work on a row-to-row basis by default, and when I try to use them I also immediately get an out-of-memory message. Is it even possible to do this in R with only 8 GB of RAM?
I have found example code for a similar problem and based my solution on it, but performance is still an issue. It works fine on a sample of 600 rows in table A and 2,000 rows in table B, but the complete dataset is 600,000 and 900,000 rows, respectively.
# compute the full edit-distance matrix between every address in A and every address in B
adresy_odl <- adist(TableA$Adres, TableB$Adres, partial = FALSE, ignore.case = TRUE)

# for each row of A, the smallest distance to any address in B
min_odl <- apply(adresy_odl, 1, min)

match.s1.s2 <- NULL
for (i in 1:nrow(adresy_odl)) {
  # column index of the closest match in B for row i of A
  s2.i <- match(min_odl[i], adresy_odl[i, ])
  s1.i <- i
  match.s1.s2 <- rbind(
    data.frame(s2.i = s2.i, s1.i = s1.i,
               s2name = TableB[s2.i, ]$Adres,
               s1name = TableA[s1.i, ]$Adres,
               adist = min_odl[i]),
    match.s1.s2)
}
The memory error occurs right away, at the very first step (the adist call):
Error: cannot allocate vector of size 1897.0 Gb
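That size is expected: adist materializes the entire |A| x |B| distance matrix in a single allocation, so memory scales with the product of the two row counts. A rough back-of-the-envelope check with the (approximate) row counts above:

cells <- 600000 * 900000     # 5.4e11 matrix cells
cells * 4 / 2^30             # ~2012 GiB for a 4-byte integer matrix,
                             # the same order as the 1897.0 Gb in the error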
Below is an example (CSV) of the data I use. TableA and TableB look exactly the same; the only difference is that TableB has all the possible combinations of Zipcode, Street and City, while TableA mostly contains either wrong zipcodes or misspelled street names.
TableA:
"","Zipcode","Street","City","Adres"
"33854","80-221","Traugutta","Gdańsk","80-221TrauguttaGdańsk"
"157093","80-276","KsBernardaSychty","Gdańsk","80-276KsBernardaSychtyGdańsk"
"200115","80-339","Grunwaldzka","Gdańsk","80-339GrunwaldzkaGdańsk"
"344514","80-318","Wąsowicza","Gdańsk","80-318WąsowiczaGdańsk"
"355415","80-625","Stryjewskiego","Gdańsk","80-625StryjewskiegoGdańsk"
"356414","80-452","Kilińskiego","Gdańsk","80-452KilińskiegoGdańsk"
TableB:
"","Zipcode","Street","City","Adres"
"47204","80-180","11Listopada","Gdańsk","80-18011ListopadaGdańsk"
"47205","80-041","3BrygadySzczerbca","Gdańsk","80-0413BrygadySzczerbcaGdańsk"
"47206","80-802","3Maja","Gdańsk","80-8023MajaGdańsk"
"47207","80-299","Achillesa","Gdańsk","80-299AchillesaGdańsk"
"47208","80-316","AdamaAsnyka","Gdańsk","80-316AdamaAsnykaGdańsk"
"47209","80-405","AdamaMickiewicza","Gdańsk","80-405AdamaMickiewiczaGdańsk"
"47210","80-425","AdamaMickiewicza","Gdańsk","80-425AdamaMickiewiczaGdańsk"
"47211","80-456","AdolfaDygasińskiego","Gdańsk","80-456AdolfaDygasińskiegoGdańsk"
The first few rows of my code's results:
"","s2.i","s1.i","s2name","s1name","adist"
"1",1333,614,"80-152PowstańcówWarszawskichGdańsk","80-158PowstańcówWarszawskichGdańsk",1
"2",257,613,"80-180CzerskaGdańsk","80-180ZEUSAGdańsk",3
"3",1916,612,"80-119WojskiegoGdańsk","80-355BeniowskiegoGdańsk",8
"4",1916,611,"80-119WojskiegoGdańsk","80-180PorębskiegoGdańsk",6
"5",181,610,"80-204BraciŚniadeckichGdańsk","80-210ŚniadeckichGdańsk",7
"6",181,609,"80-204BraciŚniadeckichGdańsk","80-210ŚniadeckichGdańsk",7
"7",21,608,"80-401alGenJózefaHalleraGdańsk","80-401GenJózefaHalleraGdańsk",2
"8",1431,607,"80-264RomanaDmowskiegoGdańsk","80-264DmowskiegoGdańsk",6
"9",1610,606,"80-239StefanaCzarnieckiegoGdańsk","80-239StefanaCzarnieckiegoGdańsk",0
I would try the awesome fuzzyjoin package by @drob of StackOverflow:
library(dplyr)

# dictionary of valid addresses (table B in the question)
dict_df <- tibble::tribble(
  ~ID, ~Zipcode, ~Street, ~City, ~Adres,
  "33854", "80-221", "Traugutta", "Gdańsk", "80-221TrauguttaGdańsk",
  "157093", "80-276", "KsBernardaSychty", "Gdańsk", "80-276KsBernardaSychtyGdańsk",
  "200115", "80-339", "Grunwaldzka", "Gdańsk", "80-339GrunwaldzkaGdańsk",
  "344514", "80-318", "Wąsowicza", "Gdańsk", "80-318WąsowiczaGdańsk",
  "355415", "80-625", "Stryjewskiego", "Gdańsk", "80-625StryjewskiegoGdańsk",
  "356414", "80-452", "Kilińskiego", "Gdańsk", "80-452KilińskiegoGdańsk") %>%
  select(ID, Adres)
# noisy addresses to verify (table A in the question)
noise_df <- tibble::tribble(
  ~Zipcode, ~Street, ~City, ~Adres,
  "80-221", "Trauguta", "Gdansk", "80-221TraugutaGdansk",
  "80-211", "Traugguta", "Gdansk", "80-211TrauggutaGdansk",
  "80-276", "KsBernardaSychty", "Gdańsk", "80-276KsBernardaSychtyGdańsk",
  "80-267", "KsBernardaSyschty", "Gdańsk", "80-276KsBernardaSyschtyGdańsk",
  "80-339", "Grunwaldzka", "Gdańsk", "80-339GrunwaldzkaGdańsk",
  "80-399", "Grunwaldzka", "dansk", "80-399Grunwaldzkadańsk",
  "80-318", "Wasowicza", "Gdańsk", "80-318WasowiczaGdańsk",
  "80-625", "Stryjewskiego", "Gdańsk", "80-625StryjewskiegoGdańsk",
  "80-625", "Stryewskogo", "Gdansk", "80-625StryewskogoGdansk",
  "80-452", "Kilinskiego", "Gdańsk", "80-452KilinskiegoGdańsk")
library(fuzzyjoin)

noise_df %>%
  # using jaccard with max_dist = 0.5; try other distance methods and
  # different max_dist values to reduce memory use
  stringdist_left_join(dict_df, by = "Adres", distance_col = "dist",
                       method = "jaccard", max_dist = 0.5) %>%
  select(Adres.x, ID, Adres.y, dist) %>%
  group_by(Adres.x) %>%
  # keep the best (lowest-distance) match per address
  top_n(-1, dist)
The results table consists of the original address (Adres.x) and the best match from the dictionary (ID and Adres.y), together with the string distance.
# A tibble: 10 x 4
# Groups: Adres.x [10]
Adres.x ID Adres.y dist
<chr> <chr> <chr> <dbl>
1 80-221TraugutaGdansk 33854 80-221TrauguttaGdańsk 0.11764706
2 80-211TrauggutaGdansk 33854 80-221TrauguttaGdańsk 0.11764706
3 80-276KsBernardaSychtyGdańsk 157093 80-276KsBernardaSychtyGdańsk 0.00000000
4 80-276KsBernardaSyschtyGdańsk 157093 80-276KsBernardaSychtyGdańsk 0.00000000
5 80-339GrunwaldzkaGdańsk 200115 80-339GrunwaldzkaGdańsk 0.00000000
6 80-399Grunwaldzkadańsk 200115 80-339GrunwaldzkaGdańsk 0.00000000
7 80-318WasowiczaGdańsk 344514 80-318WąsowiczaGdańsk 0.05555556
8 80-625StryjewskiegoGdańsk 355415 80-625StryjewskiegoGdańsk 0.00000000
9 80-625StryewskogoGdansk 355415 80-625StryjewskiegoGdańsk 0.17391304
10 80-452KilinskiegoGdańsk 356414 80-452KilińskiegoGdańsk 0.05263158
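For intuition about the dist values: with method = "jaccard" and stringdist's default q = 1, the distance is one minus the Jaccard similarity of the two strings' character sets, which is why rows 4 and 6 score exactly 0 despite spelling differences: each pair draws on the same set of characters. A quick check of row 1, as a sketch using the stringdist package that fuzzyjoin builds on:

library(stringdist)

# jaccard with q = 1 compares sets of single characters:
# dist = 1 - |intersection| / |union|
stringdist("80-221TraugutaGdansk", "80-221TrauguttaGdańsk",
           method = "jaccard", q = 1)
# 0.1176471: the sets share 15 characters out of 17 in the union
# (they differ only in "n" vs "ń"), i.e. 1 - 15/17 = 2/17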
I found fuzzy matching works best when you convert everything to lower-case ASCII first (iconv() and tolower()).
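A minimal sketch of that normalization, assuming the tables from above (the helper name is mine, and the exact output of iconv's "ASCII//TRANSLIT" can vary by platform):

library(dplyr)

# transliterate diacritics (e.g. "ń" -> "n") and lower-case the join key
to_ascii_lower <- function(x) {
  tolower(iconv(x, from = "UTF-8", to = "ASCII//TRANSLIT"))
}

dict_df  <- dict_df  %>% mutate(Adres = to_ascii_lower(Adres))
noise_df <- noise_df %>% mutate(Adres = to_ascii_lower(Adres))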
UPDATE: This might have a smaller memory footprint:
library(purrr)
library(dplyr)

noise_df %>%
  split(.$Adres) %>%
  # using jaccard with max_dist = 0.5; try other distance methods and
  # different max_dist values to reduce memory use
  map_df(~ stringdist_left_join(.x, dict_df, by = "Adres", distance_col = "dist",
                                method = "jaccard", max_dist = 0.5,
                                ignore_case = TRUE) %>%
           select(Adres.x, ID, Adres.y, dist) %>%
           group_by(Adres.x) %>%
           # select the best-fitting record
           top_n(-1, dist))
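A related variant (my assumption, not part of the original answer): if one group per distinct address creates too much per-group overhead, joining in fixed-size chunks keeps each intermediate join small while doing fewer of them. chunk_size is an arbitrary knob to tune against available RAM:

library(purrr)
library(dplyr)
library(fuzzyjoin)

chunk_size <- 1000   # arbitrary; lower it if memory is still tight
noise_df %>%
  mutate(.chunk = (row_number() - 1) %/% chunk_size) %>%
  split(.$.chunk) %>%
  map_df(~ stringdist_left_join(select(.x, -.chunk), dict_df, by = "Adres",
                                distance_col = "dist",
                                method = "jaccard", max_dist = 0.5) %>%
           select(Adres.x, ID, Adres.y, dist) %>%
           group_by(Adres.x) %>%
           top_n(-1, dist))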
UPDATE2: when using the "lv" distance algorithm you get too many missing values and NAs. In certain cases, when no match is found, stringdist_left_join drops the distance column you created. That's why the rest of the pipe fails, first at select and later at top_n. To see what's going on, take a small sample of your data, change map_df to map, and explore the resulting list, as in the sketch below.
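For example, a small debugging sketch along those lines (the sample of 20 rows and max_dist = 3 are arbitrary choices):

library(purrr)
library(dplyr)
library(fuzzyjoin)

# keep the raw per-address join results as a list instead of row-binding,
# so groups with no match (or a missing dist column) are easy to spot
res <- noise_df %>%
  head(20) %>%
  split(.$Adres) %>%
  map(~ stringdist_left_join(.x, dict_df, by = "Adres",
                             distance_col = "dist",
                             method = "lv", max_dist = 3))
str(res, max.level = 1)   # which elements lack the dist column?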