Tags: r, loops, dismo

Spatial observational download loop error


I have this script to download observational data for a species from GBIF.org in R. Afterwards, the data is plotted on a map of Europe to make a map of the species' distribution.

# PACKAGES

library(sf)
library(rgbif)
library(dismo)
library(tidyverse)
library(eurostat)
library(leaflet)
library(scales)
library(cowplot)
library(ggthemes)
library(ggplot2)

# GBIF EXAMPLE
  # Get data from GBIF
  data <- gbif("Psilocybe", "semilanceata", geo = TRUE)

  # Filter out rows with missing coordinates
  data_filtered <- data[complete.cases(data$lon, data$lat), ]

  # Create a spatial dataframe
  data_sf <- st_as_sf(data_filtered, coords = c("lon", "lat"), crs = 4326)


# MAP
  # Get map data
  SHP_0 <- get_eurostat_geospatial(resolution = 10, 
                                   nuts_level = 0, 
                                   year = 2016)
# PLOT OBSERVATIONS + MAP
# Simple  
  species_plot <- ggplot() +
    geom_sf(data = SHP_0, color = 'black', fill = 'grey90', linewidth = 0.20) +
    scale_x_continuous(limits = c(-10, 30)) +
    scale_y_continuous(limits = c(35, 70)) + 
    geom_sf(data = data_sf, color = "cornflowerblue", size = 1.2, alpha = 0.25) +
    theme_void()
  # Save the ggplot as a .jpg file with the species name
  species_name <- "Psilocybe_semilanceata"
  ggsave(paste0(species_name, ".jpg"), plot = species_plot, width = 10, height = 8, units = "in", dpi = 300)
  

I have a complete dataset with a few hundred species for which I want to make distribution maps. So, I loaded the dataset with the Latin species names:

> str(bird_names)
 chr [1:280] "Prunella modularis" "Myiopsitta monachus" ...
> head(bird_names)
[1] "Prunella modularis"  "Myiopsitta monachus" "Pyrrhura perlata"   
[4] "Tyto alba"           "Panurus biarmicus"   "Merops apiaster"    

Therefore, I rewrote the first script with a for-loop to download the data for all species from GBIF.org and plot them:

library(sf)
library(rgbif)
library(dismo)
library(tidyverse)
library(eurostat)
library(leaflet)
library(scales)
library(cowplot)
library(ggthemes)
library(ggplot2)


# Create a directory for storing bird maps
dir.create("BIRD_MAPS", showWarnings = FALSE)


# Get map data once (the same shapefile is used for every species)
SHP_0 <- get_eurostat_geospatial(resolution = 10, nuts_level = 0, year = 2016)

# Loop through each bird species
for (species in bird_names) {
  # Split the species name into genus and species
  species_parts <- strsplit(species, " ")[[1]]
  genus <- species_parts[1]
  species_name <- species_parts[2]
  
  # Get data from GBIF for the current species
  data <- gbif(genus, species_name, geo = TRUE)
  
  # Filter out rows with missing coordinates
  data_filtered <- data[complete.cases(data$lon, data$lat), ]
  
  # Create a spatial dataframe
  data_sf <- st_as_sf(data_filtered, coords = c("lon", "lat"), crs = 4326)
  
  # Plot observations + map
  species_plot <- ggplot() +
    geom_sf(data = SHP_0, color = 'black', fill = 'grey90', linewidth = 0.20) +
    scale_x_continuous(limits = c(-10, 30)) +
    scale_y_continuous(limits = c(35, 70)) + 
    geom_sf(data = data_sf, color = "cornflowerblue", size = 1.2, alpha = 0.25) +
    theme_void()
  
  # Save the ggplot as a .jpg file with the species name in BIRD_MAPS directory
  ggsave(paste0("BIRD_MAPS/", gsub(" ", "_", species), ".jpg"), plot = species_plot, width = 10, height = 8, units = "in", dpi = 300)
}

However, I got this error:

2790578 records found
Error in gbif(genus, species_name, geo = TRUE) : 
  The number of records is larger than the maximum for download via this service (200,000)

Is it possible to overcome this? Do I get the error because the script tries to download all observations at once instead of the observations of one species at a time? I only want the observations from Europe, so that could be a way to filter out some of the excess records.
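
To illustrate that last idea, this is roughly what I had in mind: dismo's gbif() has an ext argument that takes an Extent object from the raster package, so the query could be limited to Europe up front. I don't know whether that alone stays under the 200,000-record limit for the more common species:

# Restrict the GBIF query to a rough European bounding box before downloading
# (extent values are the same as the plot limits; raster is a dependency of dismo)
europe_ext <- raster::extent(-10, 30, 35, 70)   # xmin, xmax, ymin, ymax
data <- gbif("Prunella", "modularis", ext = europe_ext, geo = TRUE)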


Solution

  • As L Tyrone mentioned, you can use the rgbif package. Below is my standard workflow example to get the GBIF occurrences for a species (including synonyms). Feel free to use it as a guide:

    1. Prepare the auxiliary data file: check for synonyms and get occ_count for them:
    spc <- "Cynodon dactylon (L.) Pers."
    output_file <- "cynodon_dactylon.csv"
    
    if(!file.exists(output_file)) {
      a <- rgbif::name_backbone(name = spc) |>
        subset(select = c("usageKey", "species", "scientificName", "status")) |>
        dplyr::mutate(acceptedKey = usageKey)
      
      b <- rgbif::name_usage(key = a$usageKey, data = "synonyms")$data |>
        subset(select = c("key", "species", "scientificName", "taxonomicStatus", "acceptedKey")) |>
        dplyr::rename(c(usageKey = key, status = taxonomicStatus))
      
      a <- a |>
        rbind(b)
      
      for (i in 1:nrow(a)) {
        key <- as.integer(a[i,"usageKey"])
        occCount <- rgbif::occ_count(taxonKey = key)
        a[a$usageKey == key, "occCount"] <- occCount
        message(i, " ", a[i, "scientificName"], ", count = ", occCount)
      }
      readr::write_csv(a, file = output_file)
    } else {
      a <- readr::read_csv(file = output_file, show_col_types = FALSE)
    }
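
    Since the question loops over a few hundred bird species, here is a rough sketch of how the same backbone lookup could be run across the whole bird_names vector (accepted names only; synonyms would then be handled per species as in the step above, and it assumes every name matches the GBIF backbone):

    keys <- lapply(bird_names, function(spc) {
      bb <- rgbif::name_backbone(name = spc)   # assumes the name matches the backbone
      data.frame(species  = spc,
                 usageKey = bb$usageKey,
                 occCount = rgbif::occ_count(taxonKey = bb$usageKey))
    })
    keys <- do.call(rbind, keys)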
    
    2. Now, having all the synonyms and the number of their occurrences, we can download the GBIF data. First we keep only those species/synonyms which have at least 1 occurrence in GBIF. Please note, rgbif uses the GBIF API, therefore you have to create an account on GBIF (see the note on credentials after this step).
    if(file.exists(output_file)) {
      a <- readr::read_csv(file = output_file, show_col_types = FALSE)
      b <- a |>
        subset(occCount >= 1L)
      
      for (i in 1:nrow(b)) {
        if(!("occDownloadKey" %in% names(b)) || is.na(b[i, "occDownloadKey"])) {
          message(i, " ", b[i, "scientificName"])
          calKey <- as.integer(b[i, "usageKey"])
          res <- rgbif::occ_download(
            rgbif::pred('taxonKey', calKey),
            rgbif::pred('hasCoordinate', TRUE),
            user = "your_user_name",
            pwd = "your_password",
            email = "your_email"
          )
          repeat {
            ala <- rgbif::occ_download_meta(res)
            message(paste(calKey, Sys.time(), ala$status, sep = " - "))
            if (ala$status == "SUCCEEDED") {
              Sys.sleep(3)
              data <- rgbif::occ_download_get(key = ala$key, path = "data/gbif", overwrite = FALSE)
              a[a$usageKey == calKey, "occDownloadKey"] <- ala$key
              break
            }
            Sys.sleep(60)
          }
        } else {
          message(i, " ", b[i, "scientificName"], " ", b[i, "occDownloadKey"])
        }
      }
      readr::write_csv(a, file = output_file)
    }
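
    A side note on the credentials: instead of hard-coding user, pwd and email in every occ_download() call, rgbif also reads them from the GBIF_USER, GBIF_PWD and GBIF_EMAIL environment variables (set once in ~/.Renviron, for example), so the call above reduces to:

    # Credentials picked up from GBIF_USER, GBIF_PWD and GBIF_EMAIL
    res <- rgbif::occ_download(
      rgbif::pred('taxonKey', calKey),
      rgbif::pred('hasCoordinate', TRUE)
    )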
    
    3. Having all the data downloaded (it takes a while), we can proceed further. I usually call the CoordinateCleaner package to clean up the data, like below (a short plotting sketch follows at the end):
    a <- readr::read_csv(file = output_file, show_col_types = FALSE)
    b <- a |>
      subset(!is.na(occDownloadKey))
    
    d <- data.frame()
    for (i in 1:nrow(b)) {
      message(i, " of ", nrow(b), " - ", b[i, "scientificName"])
      dd <- rgbif::occ_download_import(rgbif::as.download(paste0("data/gbif/",b[i,"occDownloadKey"],".zip"))) |>
        subset(select = c(gbifID, species, decimalLongitude, decimalLatitude, countryCode, individualCount,
               family, taxonRank, coordinateUncertaintyInMeters, year,
               basisOfRecord, institutionCode, datasetName)) |>
        subset(!is.na(decimalLongitude) & !is.na(decimalLatitude))
      
      dd$gbifID <- as.character(dd$gbifID)
      d <- dd |>
        rbind(d)
    }
    
    d$countryCode <- countrycode::countrycode(d$countryCode, origin =  'iso2c', destination = 'iso3c')
    d <- data.frame(d)
    
    rf <- CoordinateCleaner::clean_coordinates(d,
                                               lon = "decimalLongitude",
                                               lat = "decimalLatitude",
                                               countries = "countryCode",
                                               species = "species",
                                               tests = c("capitals", "centroids", 
                                                         "equal", "gbif", 
                                                         "institutions", 
                                                         "outliers", "seas",
                                                         "zeros", "countries")
    )
    
    rf <- rf |>
      sf::st_as_sf(coords = c("decimalLongitude", "decimalLatitude"), crs = "EPSG:4326") |>
      subset(.summary == TRUE)
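
    Finally, to tie this back to the question's goal, a quick plotting sketch that puts the cleaned points in rf onto the eurostat map from the original script (same extent and styling; coord_sf is used here to crop the view):

    SHP_0 <- eurostat::get_eurostat_geospatial(resolution = 10, nuts_level = 0, year = 2016)

    ggplot2::ggplot() +
      ggplot2::geom_sf(data = SHP_0, color = 'black', fill = 'grey90', linewidth = 0.20) +
      ggplot2::geom_sf(data = rf, color = 'cornflowerblue', size = 1.2, alpha = 0.25) +
      ggplot2::coord_sf(xlim = c(-10, 30), ylim = c(35, 70)) +
      ggplot2::theme_void()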