Tags: r, loops, dismo

Spatial observational download loop error


I have this script to download observational data for a species from GBIF.org in R. Afterwards, the data is plotted on a map of Europe to make a map of the species' distribution.

# PACKAGES

library(sf)
library(rgbif)
library(dismo)
library(tidyverse)
library(eurostat)
library(leaflet)
library(scales)
library(cowplot)
library(ggthemes)
library(ggplot2)

# GBIF EXAMPLE
  # Get data from GBIF
  data <- gbif("Psilocybe", "semilanceata", geo = TRUE)

  # Filter out rows with missing coordinates
  data_filtered <- data[complete.cases(data$lon, data$lat), ]

  # Create a spatial dataframe
  data_sf <- st_as_sf(data_filtered, coords = c("lon", "lat"), crs = 4326)


# MAP
  # Get map data
  SHP_0 <- get_eurostat_geospatial(resolution = 10, 
                                   nuts_level = 0, 
                                   year = 2016)
# PLOT OBSERVATIONS + MAP
# Simple  
  species_plot <- ggplot() +
    geom_sf(data = SHP_0, color = 'black', fill = 'grey90', linewidth = 0.20) +
    scale_x_continuous(limits = c(-10, 30)) +
    scale_y_continuous(limits = c(35, 70)) + 
    geom_sf(data = data_sf, color = "cornflowerblue", size = 1.2, alpha = 0.25) +
    theme_void()
  # Save the ggplot as a .jpg file with the species name
  species_name <- "Psilocybe_semilanceata"
  ggsave(paste0(species_name, ".jpg"), plot = species_plot, width = 10, height = 8, units = "in", dpi = 300)
  

I have a complete dataset with a few hundred species for which I want to make distribution maps. So, I loaded the dataset with the Latin species names:

> str(bird_names)
 chr [1:280] "Prunella modularis" "Myiopsitta monachus" ...
> head(bird_names)
[1] "Prunella modularis"  "Myiopsitta monachus" "Pyrrhura perlata"   
[4] "Tyto alba"           "Panurus biarmicus"   "Merops apiaster"    

Therefore, I rewrote the first script with a for-loop to download the data for all species from GBIF.org and plot them:

library(sf)
library(rgbif)
library(dismo)
library(tidyverse)
library(eurostat)
library(leaflet)
library(scales)
library(cowplot)
library(ggthemes)
library(ggplot2)


# Create a directory for storing bird maps
dir.create("BIRD_MAPS", showWarnings = FALSE)


# Get map data once (the same shapefile is used for every species)
SHP_0 <- get_eurostat_geospatial(resolution = 10, nuts_level = 0, year = 2016)

# Loop through each bird species
for (species in bird_names) {
  # Split the species name into genus and species
  species_parts <- strsplit(species, " ")[[1]]
  genus <- species_parts[1]
  species_name <- species_parts[2]
  
  # Get data from GBIF for the current species
  data <- gbif(genus, species_name, geo = TRUE)
  
  # Filter out rows with missing coordinates
  data_filtered <- data[complete.cases(data$lon, data$lat), ]
  
  # Create a spatial dataframe
  data_sf <- st_as_sf(data_filtered, coords = c("lon", "lat"), crs = 4326)
  
  # Plot observations + map
  species_plot <- ggplot() +
    geom_sf(data = SHP_0, color = 'black', fill = 'grey90', linewidth = 0.20) +
    scale_x_continuous(limits = c(-10, 30)) +
    scale_y_continuous(limits = c(35, 70)) + 
    geom_sf(data = data_sf, color = "cornflowerblue", size = 1.2, alpha = 0.25) +
    theme_void()
  
  # Save the ggplot as a .jpg file with the species name in BIRD_MAPS directory
  ggsave(paste0("BIRD_MAPS/", gsub(" ", "_", species), ".jpg"), plot = species_plot, width = 10, height = 8, units = "in", dpi = 300)
}

However, I got this error:

2790578 records found
Error in gbif(genus, species_name, geo = TRUE) : 
  The number of records is larger than the maximum for download via this service (200,000)

Is it possible to overcome this? Do I get the error because the script tries to download all observations at once instead of the observations of one species at a time? I only want the observations from Europe, so that could be a way to filter out some of the excess records.
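
To illustrate that last idea, this is roughly what I had in mind: dismo's gbif() has an ext argument that takes an Extent object from the raster package, so the query could be limited to Europe up front. I don't know whether that alone stays under the 200,000-record limit for the more common species:

# Restrict the GBIF query to a rough European bounding box before downloading
# (extent values are the same as the plot limits; raster is a dependency of dismo)
europe_ext <- raster::extent(-10, 30, 35, 70)   # xmin, xmax, ymin, ymax
data <- gbif("Prunella", "modularis", ext = europe_ext, geo = TRUE)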


Solution

  • As L Tyrone mentioned, you can use the rgbif package. Below is my standard workflow example to get the GBIF occurrences for a species (including synonyms). Feel free to use it as a guide:

    1. Prepare the auxiliary data file: check for synonyms and get occ_count for them:
    spc <- "Cynodon dactylon (L.) Pers."
    output_file <- "cynodon_dactylon.csv"
    
    if(!file.exists(output_file)) {
      a <- rgbif::name_backbone(name = spc) |>
        subset(select = c("usageKey", "species", "scientificName", "status")) |>
        dplyr::mutate(acceptedKey = usageKey)
      
      b <- rgbif::name_usage(key = a$usageKey, data = "synonyms")$data |>
        subset(select = c("key", "species", "scientificName", "taxonomicStatus", "acceptedKey")) |>
        dplyr::rename(c(usageKey = key, status = taxonomicStatus))
      
      a <- a |>
        rbind(b)
      
      for (i in 1:nrow(a)) {
        key <- as.integer(a[i,"usageKey"])
        occCount <- rgbif::occ_count(taxonKey = key)
        a[a$usageKey == key, "occCount"] <- occCount
        message(i, " ", a[i, "scientificName"], ", count = ", occCount)
      }
      readr::write_csv(a, file = output_file)
    } else {
      a <- readr::read_csv(file = output_file, show_col_types = FALSE)
    }
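
    Since the question loops over a few hundred bird species, here is a rough sketch of how the same backbone lookup could be run across the whole bird_names vector (accepted names only; synonyms would then be handled per species as in the step above, and it assumes every name matches the GBIF backbone):

    keys <- lapply(bird_names, function(spc) {
      bb <- rgbif::name_backbone(name = spc)   # assumes the name matches the backbone
      data.frame(species  = spc,
                 usageKey = bb$usageKey,
                 occCount = rgbif::occ_count(taxonKey = bb$usageKey))
    })
    keys <- do.call(rbind, keys)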
    
    2. Now, having all the synonyms and the number of their occurrences, we can download the GBIF data. First we keep only those species/synonyms which have at least 1 occurrence in GBIF. Please note, rgbif uses the GBIF API, therefore you have to create an account on GBIF (see the note on credentials after this step).
    if(file.exists(output_file)) {
      a <- readr::read_csv(file = output_file, show_col_types = FALSE)
      b <- a |>
        subset(occCount >= 1L)
      
      for (i in 1:nrow(b)) {
        if(!("occDownloadKey" %in% names(b)) || is.na(b[i, "occDownloadKey"])) {
          message(i, " ", b[i, "scientificName"])
          calKey <- as.integer(b[i, "usageKey"])
          res <- rgbif::occ_download(
            rgbif::pred('taxonKey', calKey),
            rgbif::pred('hasCoordinate', TRUE),
            user = "your_user_name",
            pwd = "your_password",
            email = "your_email"
          )
          repeat {
            ala <- rgbif::occ_download_meta(res)
            message(paste(calKey, Sys.time(), ala$status, sep = " - "))
            if (ala$status == "SUCCEEDED") {
              Sys.sleep(3)
              data <- rgbif::occ_download_get(key = ala$key, path = "data/gbif", overwrite = FALSE)
              a[a$usageKey == calKey, "occDownloadKey"] <- ala$key
              break
            }
            Sys.sleep(60)
          }
        } else {
          message(i, " ", b[i, "scientificName"], " ", b[i, "occDownloadKey"])
        }
      }
      readr::write_csv(a, file = output_file)
    }
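
    A side note on the credentials: instead of hard-coding user, pwd and email in every occ_download() call, rgbif also reads them from the GBIF_USER, GBIF_PWD and GBIF_EMAIL environment variables (set once in ~/.Renviron, for example), so the call above reduces to:

    # Credentials picked up from GBIF_USER, GBIF_PWD and GBIF_EMAIL
    res <- rgbif::occ_download(
      rgbif::pred('taxonKey', calKey),
      rgbif::pred('hasCoordinate', TRUE)
    )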
    
    3. Having all the data downloaded (it takes a while), we can proceed further. I usually call the CoordinateCleaner package to clean up the data, like below (a short plotting sketch follows at the end):
    a <- readr::read_csv(file = output_file, show_col_types = FALSE)
    b <- a |>
      subset(!is.na(occDownloadKey))
    
    d <- data.frame()
    for (i in 1:nrow(b)) {
      message(i, " of ", nrow(b), " - ", b[i, "scientificName"])
      dd <- rgbif::occ_download_import(rgbif::as.download(paste0("data/gbif/",b[i,"occDownloadKey"],".zip"))) |>
        subset(select = c(gbifID, species, decimalLongitude, decimalLatitude, countryCode, individualCount,
               family, taxonRank, coordinateUncertaintyInMeters, year,
               basisOfRecord, institutionCode, datasetName)) |>
        subset(!is.na(decimalLongitude) & !is.na(decimalLatitude))
      
      dd$gbifID <- as.character(dd$gbifID)
      d <- dd |>
        rbind(d)
    }
    
    d$countryCode <- countrycode::countrycode(d$countryCode, origin =  'iso2c', destination = 'iso3c')
    d <- data.frame(d)
    
    rf <- CoordinateCleaner::clean_coordinates(d,
                                               lon = "decimalLongitude",
                                               lat = "decimalLatitude",
                                               countries = "countryCode",
                                               species = "species",
                                               tests = c("capitals", "centroids", 
                                                         "equal", "gbif", 
                                                         "institutions", 
                                                         "outliers", "seas",
                                                         "zeros", "countries")
    )
    
    rf <- rf |>
      sf::st_as_sf(coords = c("decimalLongitude", "decimalLatitude"), crs = "EPSG:4326") |>
      subset(.summary == TRUE)
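
    Finally, to tie this back to the question's goal, a quick plotting sketch that puts the cleaned points in rf onto the eurostat map from the original script (same extent and styling; coord_sf is used here to crop the view):

    SHP_0 <- eurostat::get_eurostat_geospatial(resolution = 10, nuts_level = 0, year = 2016)

    ggplot2::ggplot() +
      ggplot2::geom_sf(data = SHP_0, color = 'black', fill = 'grey90', linewidth = 0.20) +
      ggplot2::geom_sf(data = rf, color = 'cornflowerblue', size = 1.2, alpha = 0.25) +
      ggplot2::coord_sf(xlim = c(-10, 30), ylim = c(35, 70)) +
      ggplot2::theme_void()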