rdata-cleaningspatialstring-matching

How to match German province names between 2 data sets in R?


I'm working with two datasets for German NUTS-3 level regions:

A shapefile from Eurostat via the giscoR package:

> library(giscoR)
> nuts3_germany <- gisco_get_nuts(country = "Germany", year = "2021", resolution = "03", nuts_level = "3")
> glimpse(nuts3_germany)
Rows: 401
Columns: 10
$ NUTS_ID    <chr> "DEG0P", "DE21M", "DE21N", "DE248", "DE249", "DE24A", "DE24B", "DE24C", "DE24D", "DE251", "DE252", "DE253", "…
$ LEVL_CODE  <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3…
$ URBN_TYPE  <dbl> 3, 3, 2, 2, 2, 3, 3, 3, 3, 3, 1, 1, 1, 1, 3, 1, 1, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 3, 2, 3, 2, 3, 3, 3, 2, 3, 3…
$ CNTR_CODE  <chr> "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE", "…
$ NAME_LATN  <chr> "Wartburgkreis", "Traunstein", "Weilheim-Schongau", "Forchheim", "Hof, Landkreis", "Kronach", "Kulmbach", "Li…
$ NUTS_NAME  <chr> "Wartburgkreis", "Traunstein", "Weilheim-Schongau", "Forchheim", "Hof, Landkreis", "Kronach", "Kulmbach", "Li…
$ MOUNT_TYPE <dbl> 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 4, 4, 4…
$ COAST_TYPE <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 2, 3…
$ geo        <chr> "DEG0P", "DE21M", "DE21N", "DE248", "DE249", "DE24A", "DE24B", "DE24C", "DE24D", "DE251", "DE252", "DE253", "…
$ geometry   <MULTIPOLYGON [°]> MULTIPOLYGON (((10.09469 50..., MULTIPOLYGON (((12.84167 48..., MULTIPOLYGON (((10.91209 47..., …

And a PM2.5 dataset for the year 2021:

> library(dplyr)
> library(stringr)
> library(readr)
> df_2021 <- read_delim(
+   "APExpose_DE__2003-2022__nogeo.csv",
+   delim = ";",
+   locale = locale(encoding = "UTF-8")
+ ) %>%
+   mutate(
+     kreis = str_replace_all(kreis, c("ü" = "ü", "ö" = "ö", "ä" = "ä", "Ã\u009f" = "ß"))
+   ) %>%
+   filter(year == 2021, scenario == "average") %>%
+   select(kreis, PM2.5_annualMean)
Rows: 24120 Columns: 15                                                                                                           
── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ";"
chr  (2): kreis, scenario
dbl (13): year, NO2_annualMean, NO2_hrOver200, NO_annualMean, O3_annualMean, O3_daysOver120, O3_dailyMaxAnnualMean, O3_dailyHo...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
> glimpse(df_2021)
Rows: 402
Columns: 2
$ kreis            <chr> "Freiburg im Breisgau", "Dillingen an der Donau", "Nürnberg", "Neumarkt in der Oberpfalz", "Rosenheim",…
$ PM2.5_annualMean <dbl> 7.252013, 9.232584, 12.186710, 9.281277, 10.353246, 8.548126, 9.311748, 14.496520, 9.368583, 9.300551, …

Problem:

The kreis names in df_2021 do not exactly match the NUTS_NAME entries in nuts3_germany:

> setdiff(nuts3_germany$NUTS_NAME, df_2021$kreis)
  [1] "Hof, Landkreis"                               "Wunsiedel i. Fichtelgebirge"                 
  [3] "Ansbach, Kreisfreie Stadt"                    "Erlangen, Kreisfreie Stadt"                  
  [5] "Fürth, Kreisfreie Stadt"                      "Nürnberg, Kreisfreie Stadt"                  
  [7] "Schwabach, Kreisfreie Stadt"                  "Ansbach, Landkreis"                          
  [9] "Fürth, Landkreis"                             "Eisenach, Kreisfreie Stadt"                  
 [11] "Brandenburg an der Havel, Kreisfreie Stadt"   "Cottbus, Kreisfreie Stadt"                   
 [13] "Frankfurt (Oder), Kreisfreie Stadt"           "Potsdam, Kreisfreie Stadt"                   
 [15] "Kassel, Landkreis"                            "Rostock, Kreisfreie Stadt"                   
 [17] "Schwerin, Kreisfreie Stadt"                   "Landkreis Rostock"                           
 [19] "Lüneburg, Landkreis"                          "Braunschweig, Kreisfreie Stadt"              
 [21] "Salzgitter, Kreisfreie Stadt"                 "Wolfsburg, Kreisfreie Stadt"                 
 [23] "Dortmund, Kreisfreie Stadt"                   "Hagen, Kreisfreie Stadt"                     
 [25] "Hamm, Kreisfreie Stadt"                       "Herne, Kreisfreie Stadt"                     
 [27] "Delmenhorst, Kreisfreie Stadt"                "Pforzheim, Stadtkreis"                       
 [29] "Erfurt, Kreisfreie Stadt"                     "Gera, Kreisfreie Stadt"                      
 [31] "Koblenz, Kreisfreie Stadt"                    "Trier, Kreisfreie Stadt"                     
 [33] "Bottrop, Kreisfreie Stadt"                    "Gelsenkirchen, Kreisfreie Stadt"             
 [35] "Münster, Kreisfreie Stadt"                    "Bielefeld, Kreisfreie Stadt"                 
 [37] "Frankenthal (Pfalz), Kreisfreie Stadt"        "Kaiserslautern, Kreisfreie Stadt"            
 [39] "Landau in der Pfalz, Kreisfreie Stadt"        "Ludwigshafen am Rhein, Kreisfreie Stadt"     
 [41] "Mainz, Kreisfreie Stadt"                      "Neustadt an der Weinstraße, Kreisfreie Stadt"
 [43] "Pirmasens, Kreisfreie Stadt"                  "Speyer, Kreisfreie Stadt"                    
 [45] "Worms, Kreisfreie Stadt"                      "Zweibrücken, Kreisfreie Stadt"               
 [47] "Kaiserslautern, Landkreis"                    "Heidelberg, Stadtkreis"                      
 [49] "Mannheim, Stadtkreis"                         "Flensburg, Kreisfreie Stadt"                 
 [51] "Kiel, Kreisfreie Stadt"                       "Lübeck, Kreisfreie Stadt"                    
 [53] "Neumünster, Kreisfreie Stadt"                 "Freiburg im Breisgau, Stadtkreis"            
 [55] "Rosenheim, Landkreis"                         "Mühldorf a. Inn"                             
 [57] "München, Landkreis"                           "Pfaffenhofen a. d. Ilm"                      
 [59] "Offenbach, Landkreis"                         "Gießen, Landkreis"                           
 [61] "Kassel, Kreisfreie Stadt"                     "Stuttgart, Stadtkreis"                       
 [63] "Augsburg, Kreisfreie Stadt"                   "Kaufbeuren, Kreisfreie Stadt"                
 [65] "Kempten (Allgäu), Kreisfreie Stadt"           "Memmingen, Kreisfreie Stadt"                 
 [67] "Augsburg, Landkreis"                          "Dillingen a.d. Donau"                        
 [69] "Wilhelmshaven, Kreisfreie Stadt"              "Dresden, Kreisfreie Stadt"                   
 [71] "Friesland (DE)"                               "Oldenburg, Landkreis"                        
 [73] "Osnabrück, Landkreis"                         "Jena, Kreisfreie Stadt"                      
 [75] "Suhl, Kreisfreie Stadt"                       "Weimar, Kreisfreie Stadt"                    
 [77] "Bochum, Kreisfreie Stadt"                     "Oberhausen, Kreisfreie Stadt"                
 [79] "Remscheid, Kreisfreie Stadt"                  "Solingen, Kreisfreie Stadt"                  
 [81] "Wuppertal, Kreisfreie Stadt"                  "Dessau-Roßlau, Kreisfreie Stadt"             
 [83] "Halle (Saale), Kreisfreie Stadt"              "Magdeburg, Kreisfreie Stadt"                 
 [85] "Emden, Kreisfreie Stadt"                      "Oldenburg (Oldenburg), Kreisfreie Stadt"     
 [87] "Osnabrück, Kreisfreie Stadt"                  "Bonn, Kreisfreie Stadt"                      
 [89] "Köln, Kreisfreie Stadt"                       "Leverkusen, Kreisfreie Stadt"                
 [91] "Landshut, Kreisfreie Stadt"                   "Passau, Kreisfreie Stadt"                    
 [93] "Straubing, Kreisfreie Stadt"                  "Landshut, Landkreis"                         
 [95] "Passau, Landkreis"                            "Chemnitz, Kreisfreie Stadt"                  
 [97] "Neustadt a. d. Aisch-Bad Windsheim"           "Leipzig, Kreisfreie Stadt"                   
 [99] "Coburg, Landkreis"                            "Aschaffenburg, Kreisfreie Stadt"             
[101] "Schweinfurt, Kreisfreie Stadt"                "Würzburg, Kreisfreie Stadt"                  
[103] "Aschaffenburg, Landkreis"                     "Schweinfurt, Landkreis"                      
[105] "Würzburg, Landkreis"                          "Tübingen, Landkreis"                         
[107] "Ulm, Stadtkreis"                              "Ingolstadt, Kreisfreie Stadt"                
[109] "München, Kreisfreie Stadt"                    "Rosenheim, Kreisfreie Stadt"                 
[111] "Heilbronn, Stadtkreis"                        "Heilbronn, Landkreis"                        
[113] "Bremen, Kreisfreie Stadt"                     "Bremerhaven, Kreisfreie Stadt"               
[115] "Darmstadt, Kreisfreie Stadt"                  "Frankfurt am Main, Kreisfreie Stadt"         
[117] "Offenbach am Main, Kreisfreie Stadt"          "Wiesbaden, Kreisfreie Stadt"                 
[119] "Baden-Baden, Stadtkreis"                      "Karlsruhe, Stadtkreis"                       
[121] "Neumarkt i. d. OPf."                          "Karlsruhe, Landkreis"                        
[123] "Amberg, Kreisfreie Stadt"                     "Regensburg, Kreisfreie Stadt"                
[125] "Weiden i. d. Opf, Kreisfreie Stadt"           "Neustadt a. d. Waldnaab"                     
[127] "Regensburg, Landkreis"                        "Bamberg, Kreisfreie Stadt"                   
[129] "Bayreuth, Kreisfreie Stadt"                   "Coburg, Kreisfreie Stadt"                    
[131] "Hof, Kreisfreie Stadt"                        "Bamberg, Landkreis"                          
[133] "Bayreuth, Landkreis"                          "Düsseldorf, Kreisfreie Stadt"                
[135] "Duisburg, Kreisfreie Stadt"                   "Essen, Kreisfreie Stadt"                     
[137] "Krefeld, Kreisfreie Stadt"                    "Mönchengladbach, Kreisfreie Stadt"           
[139] "Mülheim an der Ruhr, Kreisfreie Stadt"       
> setdiff(df_2021$kreis, nuts3_germany$NUTS_NAME)
  [1] "Freiburg im Breisgau"                "Dillingen an der Donau"              "Nürnberg"                           
  [4] "Neumarkt in der Oberpfalz"           "Rosenheim"                           "Wilhelmshaven"                      
  [7] "Amberg"                              "Fürth"                               "Regensburg"                         
 [10] "Rostock"                             "Baden-Baden"                         "Heilbronn"                          
 [13] "Weiden in der Oberpfalz"             "Koblenz"                             "Bamberg"                            
 [16] "Gera"                                "Coburg"                              "Bremen"                             
 [19] "Bremerhaven"                         "Essen"                               "Kaiserslautern"                     
 [22] "Neustadt an der Weinstraße"          "Pirmasens"                           "Heidelberg"                         
 [25] "Karlsruhe"                           "Schwabach"                           "Gießen"                             
 [28] "Oldenburg"                           "Lübeck"                              "Hof"                                
 [31] "Landshut"                            "Passau"                              "Salzgitter"                         
 [34] "Pforzheim"                           "Aschaffenburg"                       "Erlangen"                           
 [37] "Schweinfurt"                         "Wunsiedel im Fichtelgebirge"         "Bielefeld"                          
 [40] "Leverkusen"                          "Ludwigshafen am Rhein"               "Wolfsburg"                          
 [43] "Schwerin"                            "Braunschweig"                        "Düsseldorf"                         
 [46] "Köln"                                "Speyer"                              "Offenbach am Main"                  
 [49] "Oberhausen"                          "Osnabrück"                           "Chemnitz"                           
 [52] "Kiel"                                "Memmingen"                           "Flensburg"                          
 [55] "Ansbach"                             "Herne"                               "Krefeld"                            
 [58] "Mönchengladbach"                     "Suhl"                                "Würzburg"                           
 [61] "Bochum"                              "Bottrop"                             "Frankenthal (Pfalz)"                
 [64] "Halle (Saale)"                       "Magdeburg"                           "Emden"                              
 [67] "Wuppertal"                           "Worms"                               "Neumünster"                         
 [70] "Mannheim"                            "Augsburg"                            "Bayreuth"                           
 [73] "Potsdam"                             "Tübingen"                            "Ulm"                                
 [76] "Neustadt an der Aisch-Bad Windsheim" "Frankfurt am Main"                   "Gelsenkirchen"                      
 [79] "Ingolstadt"                          "Kempten (Allgäu)"                    "Straubing"                          
 [82] "Brandenburg an der Havel"            "Frankfurt (Oder)"                    "Zweibrücken"                        
 [85] "München"                             "Darmstadt"                           "Landau in der Pfalz"                
 [88] "Pfaffenhofen an der Ilm"             "Offenbach"                           "Weimar"                             
 [91] "Friesland"                           "Dortmund"                            "Hamm"                               
 [94] "Münster"                             "Eisenach"                            "Stuttgart"                          
 [97] "Kassel"                              "Lüneburg"                            "Osterode am Harz"                   
[100] "Duisburg"                            "Mainz"                               "Dresden"                            
[103] "Kaufbeuren"                          "Mühldorf am Inn"                     "Cottbus"                            
[106] "Hagen"                               "Mülheim an der Ruhr"                 "Erfurt"                             
[109] "Delmenhorst"                         "Dessau-Roßlau"                       "Neustadt an der Waldnaab"           
[112] "Wiesbaden"                           "Bonn"                                "Jena"                               
[115] "Remscheid"                           "Solingen"                            "Trier"                              

This causes left_join() to fail, and even string normalization with fuzzy matching is unreliable due to many systematic but nontrivial differences (abbreviations, inclusion of terms like "Kreisfreie Stadt", use of alternative suffixes, etc.).

Question:

What is the best practice or reliable method to match German NUTS-3 names across datasets, given theses differences in spelling, abbreviation, and naming conventions?


Solution

  • After days of trial and failure, I solved the problem. Below is how I did it:

    The complete code is shown below:

    # -----------------------------
    # 0. Load Required Packages
    # -----------------------------
    library(stringr)
    library(stringi)
    library(giscoR)
    library(scales)
    library(readxl)
    library(dplyr)
    library(readr)
    library(sf)
    
    # -----------------------------
    # 1. Read and Clean PM2.5 Data
    # -----------------------------
    
    # 2019 data (used for coordinates and kreis_code mapping)
    df_2019 <- read_delim("APExpose_DE__2010-2019.csv", delim = ";", locale = locale(encoding = "UTF-8")) %>%
      mutate(kreis = str_replace_all(kreis, c("ü" = "ü", "ö" = "ö", "ä" = "ä", "Ã\u009f" = "ß"))) %>%
      filter(year == 2019, scenario == "average") %>%
      select(kreis, kreis_code, ISO_code, Kreis_Scluessel, PM2.5_annualMean, Lon, Lat)
    
    # 2021 PM2.5 values only
    df_2021 <- read_delim("APExpose_DE__2003-2022__nogeo.csv", delim = ";", locale = locale(encoding = "UTF-8")) %>%
      mutate(kreis = str_replace_all(kreis, c("ü" = "ü", "ö" = "ö", "ä" = "ä", "Ã\u009f" = "ß"))) %>%
      filter(year == 2021, scenario == "average") %>%
      select(kreis_code, kreis, PM2.5_annualMean) %>%
      rename(PM2.5_2021 = PM2.5_annualMean)
    
    # Merge 2021 PM2.5 into 2019 data using kreis_code (shared ID across both)
    df_merged <- df_2019 %>%
      left_join(df_2021, by = "kreis_code")
    
    # -----------------------------
    # 2. Add NUTS-3 Codes
    # -----------------------------
    
    # Read Schluesselnummer-to-NUTS3 mapping
    kreise_clean <- read_excel("04-kreise.xlsx", sheet = "Kreisfreie Städte u. Landkreise", skip = 3) %>%
      select(Schluesselnummer = `Schlüssel-nummer`, NUTS3) %>%
      filter(str_detect(Schluesselnummer, "^\\d{5}$"))  # only valid codes
    
    # Join NUTS3 code to merged data
    df_final <- df_merged %>%
      left_join(kreise_clean, by = c("Kreis_Scluessel" = "Schluesselnummer"))
    
    # Report unmatched cases
    missing_nuts <- df_final %>% filter(is.na(NUTS3))
    cat("Unmatched Kreise (due to legacy codes):\n")
    print(missing_nuts %>% select(kreis.x, Kreis_Scluessel))
    
    # -----------------------------
    # 3. Get Spatial Data & Merge
    # -----------------------------
    
    nuts3_germany <- gisco_get_nuts(country = "DE", year = "2021", resolution = "01", nuts_level = "3")
    
    map_data <- nuts3_germany %>%
      left_join(df_final, by = c("NUTS_ID" = "NUTS3"))