I'm working with two datasets for German NUTS-3 level regions:
A shapefile from Eurostat via the giscoR
package:
> library(giscoR)
> nuts3_germany <- gisco_get_nuts(country = "Germany", year = "2021", resolution = "03", nuts_level = "3")
> glimpse(nuts3_germany)
Rows: 401
Columns: 10
$ NUTS_ID <chr> "DEG0P", "DE21M", "DE21N", "DE248", "DE249", "DE24A", "DE24B", "DE24C", "DE24D", "DE251", "DE252", "DE253", "…
$ LEVL_CODE <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3…
$ URBN_TYPE <dbl> 3, 3, 2, 2, 2, 3, 3, 3, 3, 3, 1, 1, 1, 1, 3, 1, 1, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 3, 2, 3, 2, 3, 3, 3, 2, 3, 3…
$ CNTR_CODE <chr> "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE", "…
$ NAME_LATN <chr> "Wartburgkreis", "Traunstein", "Weilheim-Schongau", "Forchheim", "Hof, Landkreis", "Kronach", "Kulmbach", "Li…
$ NUTS_NAME <chr> "Wartburgkreis", "Traunstein", "Weilheim-Schongau", "Forchheim", "Hof, Landkreis", "Kronach", "Kulmbach", "Li…
$ MOUNT_TYPE <dbl> 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 4, 4, 4…
$ COAST_TYPE <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 2, 3…
$ geo <chr> "DEG0P", "DE21M", "DE21N", "DE248", "DE249", "DE24A", "DE24B", "DE24C", "DE24D", "DE251", "DE252", "DE253", "…
$ geometry <MULTIPOLYGON [°]> MULTIPOLYGON (((10.09469 50..., MULTIPOLYGON (((12.84167 48..., MULTIPOLYGON (((10.91209 47..., …
And a PM2.5 dataset for the year 2021:
> library(dplyr)
> library(stringr)
> library(readr)
> df_2021 <- read_delim(
+ "APExpose_DE__2003-2022__nogeo.csv",
+ delim = ";",
+ locale = locale(encoding = "UTF-8")
+ ) %>%
+ mutate(
+ kreis = str_replace_all(kreis, c("ü" = "ü", "ö" = "ö", "ä" = "ä", "Ã\u009f" = "ß"))
+ ) %>%
+ filter(year == 2021, scenario == "average") %>%
+ select(kreis, PM2.5_annualMean)
Rows: 24120 Columns: 15
── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ";"
chr (2): kreis, scenario
dbl (13): year, NO2_annualMean, NO2_hrOver200, NO_annualMean, O3_annualMean, O3_daysOver120, O3_dailyMaxAnnualMean, O3_dailyHo...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
> glimpse(df_2021)
Rows: 402
Columns: 2
$ kreis <chr> "Freiburg im Breisgau", "Dillingen an der Donau", "Nürnberg", "Neumarkt in der Oberpfalz", "Rosenheim",…
$ PM2.5_annualMean <dbl> 7.252013, 9.232584, 12.186710, 9.281277, 10.353246, 8.548126, 9.311748, 14.496520, 9.368583, 9.300551, …
Problem:
The kreis
names in df_2021
do not exactly match the NUTS_NAME
entries in nuts3_germany
:
> setdiff(nuts3_germany$NUTS_NAME, df_2021$kreis)
[1] "Hof, Landkreis" "Wunsiedel i. Fichtelgebirge"
[3] "Ansbach, Kreisfreie Stadt" "Erlangen, Kreisfreie Stadt"
[5] "Fürth, Kreisfreie Stadt" "Nürnberg, Kreisfreie Stadt"
[7] "Schwabach, Kreisfreie Stadt" "Ansbach, Landkreis"
[9] "Fürth, Landkreis" "Eisenach, Kreisfreie Stadt"
[11] "Brandenburg an der Havel, Kreisfreie Stadt" "Cottbus, Kreisfreie Stadt"
[13] "Frankfurt (Oder), Kreisfreie Stadt" "Potsdam, Kreisfreie Stadt"
[15] "Kassel, Landkreis" "Rostock, Kreisfreie Stadt"
[17] "Schwerin, Kreisfreie Stadt" "Landkreis Rostock"
[19] "Lüneburg, Landkreis" "Braunschweig, Kreisfreie Stadt"
[21] "Salzgitter, Kreisfreie Stadt" "Wolfsburg, Kreisfreie Stadt"
[23] "Dortmund, Kreisfreie Stadt" "Hagen, Kreisfreie Stadt"
[25] "Hamm, Kreisfreie Stadt" "Herne, Kreisfreie Stadt"
[27] "Delmenhorst, Kreisfreie Stadt" "Pforzheim, Stadtkreis"
[29] "Erfurt, Kreisfreie Stadt" "Gera, Kreisfreie Stadt"
[31] "Koblenz, Kreisfreie Stadt" "Trier, Kreisfreie Stadt"
[33] "Bottrop, Kreisfreie Stadt" "Gelsenkirchen, Kreisfreie Stadt"
[35] "Münster, Kreisfreie Stadt" "Bielefeld, Kreisfreie Stadt"
[37] "Frankenthal (Pfalz), Kreisfreie Stadt" "Kaiserslautern, Kreisfreie Stadt"
[39] "Landau in der Pfalz, Kreisfreie Stadt" "Ludwigshafen am Rhein, Kreisfreie Stadt"
[41] "Mainz, Kreisfreie Stadt" "Neustadt an der Weinstraße, Kreisfreie Stadt"
[43] "Pirmasens, Kreisfreie Stadt" "Speyer, Kreisfreie Stadt"
[45] "Worms, Kreisfreie Stadt" "Zweibrücken, Kreisfreie Stadt"
[47] "Kaiserslautern, Landkreis" "Heidelberg, Stadtkreis"
[49] "Mannheim, Stadtkreis" "Flensburg, Kreisfreie Stadt"
[51] "Kiel, Kreisfreie Stadt" "Lübeck, Kreisfreie Stadt"
[53] "Neumünster, Kreisfreie Stadt" "Freiburg im Breisgau, Stadtkreis"
[55] "Rosenheim, Landkreis" "Mühldorf a. Inn"
[57] "München, Landkreis" "Pfaffenhofen a. d. Ilm"
[59] "Offenbach, Landkreis" "Gießen, Landkreis"
[61] "Kassel, Kreisfreie Stadt" "Stuttgart, Stadtkreis"
[63] "Augsburg, Kreisfreie Stadt" "Kaufbeuren, Kreisfreie Stadt"
[65] "Kempten (Allgäu), Kreisfreie Stadt" "Memmingen, Kreisfreie Stadt"
[67] "Augsburg, Landkreis" "Dillingen a.d. Donau"
[69] "Wilhelmshaven, Kreisfreie Stadt" "Dresden, Kreisfreie Stadt"
[71] "Friesland (DE)" "Oldenburg, Landkreis"
[73] "Osnabrück, Landkreis" "Jena, Kreisfreie Stadt"
[75] "Suhl, Kreisfreie Stadt" "Weimar, Kreisfreie Stadt"
[77] "Bochum, Kreisfreie Stadt" "Oberhausen, Kreisfreie Stadt"
[79] "Remscheid, Kreisfreie Stadt" "Solingen, Kreisfreie Stadt"
[81] "Wuppertal, Kreisfreie Stadt" "Dessau-Roßlau, Kreisfreie Stadt"
[83] "Halle (Saale), Kreisfreie Stadt" "Magdeburg, Kreisfreie Stadt"
[85] "Emden, Kreisfreie Stadt" "Oldenburg (Oldenburg), Kreisfreie Stadt"
[87] "Osnabrück, Kreisfreie Stadt" "Bonn, Kreisfreie Stadt"
[89] "Köln, Kreisfreie Stadt" "Leverkusen, Kreisfreie Stadt"
[91] "Landshut, Kreisfreie Stadt" "Passau, Kreisfreie Stadt"
[93] "Straubing, Kreisfreie Stadt" "Landshut, Landkreis"
[95] "Passau, Landkreis" "Chemnitz, Kreisfreie Stadt"
[97] "Neustadt a. d. Aisch-Bad Windsheim" "Leipzig, Kreisfreie Stadt"
[99] "Coburg, Landkreis" "Aschaffenburg, Kreisfreie Stadt"
[101] "Schweinfurt, Kreisfreie Stadt" "Würzburg, Kreisfreie Stadt"
[103] "Aschaffenburg, Landkreis" "Schweinfurt, Landkreis"
[105] "Würzburg, Landkreis" "Tübingen, Landkreis"
[107] "Ulm, Stadtkreis" "Ingolstadt, Kreisfreie Stadt"
[109] "München, Kreisfreie Stadt" "Rosenheim, Kreisfreie Stadt"
[111] "Heilbronn, Stadtkreis" "Heilbronn, Landkreis"
[113] "Bremen, Kreisfreie Stadt" "Bremerhaven, Kreisfreie Stadt"
[115] "Darmstadt, Kreisfreie Stadt" "Frankfurt am Main, Kreisfreie Stadt"
[117] "Offenbach am Main, Kreisfreie Stadt" "Wiesbaden, Kreisfreie Stadt"
[119] "Baden-Baden, Stadtkreis" "Karlsruhe, Stadtkreis"
[121] "Neumarkt i. d. OPf." "Karlsruhe, Landkreis"
[123] "Amberg, Kreisfreie Stadt" "Regensburg, Kreisfreie Stadt"
[125] "Weiden i. d. Opf, Kreisfreie Stadt" "Neustadt a. d. Waldnaab"
[127] "Regensburg, Landkreis" "Bamberg, Kreisfreie Stadt"
[129] "Bayreuth, Kreisfreie Stadt" "Coburg, Kreisfreie Stadt"
[131] "Hof, Kreisfreie Stadt" "Bamberg, Landkreis"
[133] "Bayreuth, Landkreis" "Düsseldorf, Kreisfreie Stadt"
[135] "Duisburg, Kreisfreie Stadt" "Essen, Kreisfreie Stadt"
[137] "Krefeld, Kreisfreie Stadt" "Mönchengladbach, Kreisfreie Stadt"
[139] "Mülheim an der Ruhr, Kreisfreie Stadt"
> setdiff(df_2021$kreis, nuts3_germany$NUTS_NAME)
[1] "Freiburg im Breisgau" "Dillingen an der Donau" "Nürnberg"
[4] "Neumarkt in der Oberpfalz" "Rosenheim" "Wilhelmshaven"
[7] "Amberg" "Fürth" "Regensburg"
[10] "Rostock" "Baden-Baden" "Heilbronn"
[13] "Weiden in der Oberpfalz" "Koblenz" "Bamberg"
[16] "Gera" "Coburg" "Bremen"
[19] "Bremerhaven" "Essen" "Kaiserslautern"
[22] "Neustadt an der Weinstraße" "Pirmasens" "Heidelberg"
[25] "Karlsruhe" "Schwabach" "Gießen"
[28] "Oldenburg" "Lübeck" "Hof"
[31] "Landshut" "Passau" "Salzgitter"
[34] "Pforzheim" "Aschaffenburg" "Erlangen"
[37] "Schweinfurt" "Wunsiedel im Fichtelgebirge" "Bielefeld"
[40] "Leverkusen" "Ludwigshafen am Rhein" "Wolfsburg"
[43] "Schwerin" "Braunschweig" "Düsseldorf"
[46] "Köln" "Speyer" "Offenbach am Main"
[49] "Oberhausen" "Osnabrück" "Chemnitz"
[52] "Kiel" "Memmingen" "Flensburg"
[55] "Ansbach" "Herne" "Krefeld"
[58] "Mönchengladbach" "Suhl" "Würzburg"
[61] "Bochum" "Bottrop" "Frankenthal (Pfalz)"
[64] "Halle (Saale)" "Magdeburg" "Emden"
[67] "Wuppertal" "Worms" "Neumünster"
[70] "Mannheim" "Augsburg" "Bayreuth"
[73] "Potsdam" "Tübingen" "Ulm"
[76] "Neustadt an der Aisch-Bad Windsheim" "Frankfurt am Main" "Gelsenkirchen"
[79] "Ingolstadt" "Kempten (Allgäu)" "Straubing"
[82] "Brandenburg an der Havel" "Frankfurt (Oder)" "Zweibrücken"
[85] "München" "Darmstadt" "Landau in der Pfalz"
[88] "Pfaffenhofen an der Ilm" "Offenbach" "Weimar"
[91] "Friesland" "Dortmund" "Hamm"
[94] "Münster" "Eisenach" "Stuttgart"
[97] "Kassel" "Lüneburg" "Osterode am Harz"
[100] "Duisburg" "Mainz" "Dresden"
[103] "Kaufbeuren" "Mühldorf am Inn" "Cottbus"
[106] "Hagen" "Mülheim an der Ruhr" "Erfurt"
[109] "Delmenhorst" "Dessau-Roßlau" "Neustadt an der Waldnaab"
[112] "Wiesbaden" "Bonn" "Jena"
[115] "Remscheid" "Solingen" "Trier"
This causes left_join()
to fail, and even string normalization with fuzzy matching is unreliable due to many systematic but nontrivial differences (abbreviations, inclusion of terms like "Kreisfreie Stadt", use of alternative suffixes, etc.).
Question:
What is the best practice or reliable method to match German NUTS-3 names across datasets, given theses differences in spelling, abbreviation, and naming conventions?
After days of trial and failure, I solved the problem. Below is how I did it:
I found the 2019 dataset wherein the Kreis Schluessel
are already included, along with kreis_code
.
I used kreis_code
as stable key to match/add 2021 PM2.5 data to the 2019 dataset.
Then I mapped to NUTS3
via the variable Kreis_Schluessel
.
The complete code is shown below:
# -----------------------------
# 0. Load Required Packages
# -----------------------------
library(stringr)
library(stringi)
library(giscoR)
library(scales)
library(readxl)
library(dplyr)
library(readr)
library(sf)
# -----------------------------
# 1. Read and Clean PM2.5 Data
# -----------------------------
# 2019 data (used for coordinates and kreis_code mapping)
df_2019 <- read_delim("APExpose_DE__2010-2019.csv", delim = ";", locale = locale(encoding = "UTF-8")) %>%
mutate(kreis = str_replace_all(kreis, c("ü" = "ü", "ö" = "ö", "ä" = "ä", "Ã\u009f" = "ß"))) %>%
filter(year == 2019, scenario == "average") %>%
select(kreis, kreis_code, ISO_code, Kreis_Scluessel, PM2.5_annualMean, Lon, Lat)
# 2021 PM2.5 values only
df_2021 <- read_delim("APExpose_DE__2003-2022__nogeo.csv", delim = ";", locale = locale(encoding = "UTF-8")) %>%
mutate(kreis = str_replace_all(kreis, c("ü" = "ü", "ö" = "ö", "ä" = "ä", "Ã\u009f" = "ß"))) %>%
filter(year == 2021, scenario == "average") %>%
select(kreis_code, kreis, PM2.5_annualMean) %>%
rename(PM2.5_2021 = PM2.5_annualMean)
# Merge 2021 PM2.5 into 2019 data using kreis_code (shared ID across both)
df_merged <- df_2019 %>%
left_join(df_2021, by = "kreis_code")
# -----------------------------
# 2. Add NUTS-3 Codes
# -----------------------------
# Read Schluesselnummer-to-NUTS3 mapping
kreise_clean <- read_excel("04-kreise.xlsx", sheet = "Kreisfreie Städte u. Landkreise", skip = 3) %>%
select(Schluesselnummer = `Schlüssel-nummer`, NUTS3) %>%
filter(str_detect(Schluesselnummer, "^\\d{5}$")) # only valid codes
# Join NUTS3 code to merged data
df_final <- df_merged %>%
left_join(kreise_clean, by = c("Kreis_Scluessel" = "Schluesselnummer"))
# Report unmatched cases
missing_nuts <- df_final %>% filter(is.na(NUTS3))
cat("Unmatched Kreise (due to legacy codes):\n")
print(missing_nuts %>% select(kreis.x, Kreis_Scluessel))
# -----------------------------
# 3. Get Spatial Data & Merge
# -----------------------------
nuts3_germany <- gisco_get_nuts(country = "DE", year = "2021", resolution = "01", nuts_level = "3")
map_data <- nuts3_germany %>%
left_join(df_final, by = c("NUTS_ID" = "NUTS3"))