sqlrsqlitedbplyr

With dbplyr, how to add a row to a table unless the row already exists? rows_insert() is too strict and rows_append() too lax


In R, is there a way to use dbplyr functions, without added SQL, to add a row to a database table unless the row already exists, thus avoiding duplication? In the example below, I add red apples to a table but then cannot add green apples with rows_insert() because apples already exist. rows_append() would let me add green apples, but multiple times. Some SQL for how to add a row unless it already exists is shown, but can it be done with regular dbplyr functions?

suppressPackageStartupMessages({
  library(dplyr)
  library(dbplyr)
  library(DBI)
})

con <- DBI::dbConnect(RSQLite::SQLite())

# Create database with one table and three columns: id, fruit, colour
create_db <- "CREATE TABLE fruits(id INTEGER PRIMARY KEY AUTOINCREMENT, fruit TEXT, colour TEXT)" |> as.sql(con)
DBI::dbExecute(con, create_db)
#> [1] 0

fruits <- tbl(con, "fruits")

# Apples can be red
red_apple <- tibble(fruit = "apple", colour = "red")

# Insert into the database; primary key autoincrements and id will be assigned
rows_insert(fruits, red_apple, copy = TRUE, conflict = "ignore", in_place = TRUE)
#> Matching, by = "fruit"
fruits
#> # Source:   table<fruits> [1 x 3]
#> # Database: sqlite 3.41.2 []
#>      id fruit colour
#>   <int> <chr> <chr>
#> 1     1 apple red

# Inserting the same colour fruit doesn't work.
rows_insert(fruits, red_apple, copy = TRUE, conflict = "ignore", in_place = TRUE)
#> Matching, by = "fruit"
fruits
#> # Source:   table<fruits> [1 x 3]
#> # Database: sqlite 3.41.2 []
#>      id fruit colour
#>   <int> <chr> <chr>
#> 1     1 apple red

# Apples can also be green
green_apple <- tibble(fruit = "apple", colour = "green")

# I would like this new row to be added to the database, but this doesn't do it.
rows_insert(fruits, green_apple, copy = TRUE, conflict = "ignore", in_place = TRUE)
#> Matching, by = "fruit"
fruits
#> # Source:   table<fruits> [1 x 3]
#> # Database: sqlite 3.41.2 []
#>      id fruit colour
#>   <int> <chr> <chr>
#> 1     1 apple red

# Cherries can be red, and this works, even though the colour is duplicated.
red_cherry <- tibble(fruit = "cherry", colour = "red")
rows_insert(fruits, red_cherry, copy = TRUE, conflict = "ignore", in_place = TRUE)
#> Matching, by = "fruit"
fruits
#> # Source:   table<fruits> [2 x 3]
#> # Database: sqlite 3.41.2 []
#>      id fruit  colour
#>   <int> <chr>  <chr>
#> 1     1 apple  red
#> 2     2 cherry red

# To be added with rows_insert() the fruit must be different from anything there
yellow_lemon <- tibble(fruit = "lemon", colour = "yellow")
rows_insert(fruits, yellow_lemon, copy = TRUE, conflict = "ignore", in_place = TRUE)
#> Matching, by = "fruit"
fruits
#> # Source:   table<fruits> [3 x 3]
#> # Database: sqlite 3.41.2 []
#>      id fruit colour
#>   <int> <chr> <chr>
#> 1     1 apple red
#> 2     2 cherry red
#> 3     3 lemon yellow

# rows_append() will add the green apple ...
rows_append(fruits, green_apple, copy = TRUE, in_place = TRUE)
fruits
#> # Source:   table<fruits> [4 x 3]
#> # Database: sqlite 3.41.2 []
#>      id fruit  colour
#>   <int> <chr>  <chr>
#> 1     1 apple  red
#> 2     2 cherry red
#> 3     3 lemon  yellow
#> 4     4 apple  green

# ... but it will allow adding it multiple times, creating duplicates.
rows_append(fruits, green_apple, copy = TRUE, in_place = TRUE)
rows_append(fruits, green_apple, copy = TRUE, in_place = TRUE)
fruits
#> # Source:   table<fruits> [6 x 3]
#> # Database: sqlite 3.41.2 []
#>      id fruit  colour
#>   <int> <chr>  <chr>
#> 1     1 apple  red
#> 2     2 cherry red
#> 3     3 lemon  yellow
#> 4     4 apple  green
#> 5     5 apple  green
#> 6     6 apple  green

# This SQL statement will add a yellow apple once ...
add_yellow_apple <- "INSERT INTO fruits (fruit, colour) SELECT 'apple', 'yellow' WHERE NOT EXISTS (SELECT 1 FROM fruits WHERE fruit = 'apple' AND colour = 'yellow');" |> as.sql(con)
DBI::dbExecute(con, add_yellow_apple)
#> [1] 1

fruits
#> # Source:   table<fruits> [7 x 3]
#> # Database: sqlite 3.41.2 []
#>      id fruit  colour
#>   <int> <chr>  <chr>
#> 1     1 apple  red
#> 2     2 cherry red
#> 3     3 lemon  yellow
#> 4     4 apple  green
#> 5     5 apple  green
#> 6     6 apple  green
#> 7     7 apple  yellow

# ... and only once, as desired, never creating a duplicate.
DBI::dbExecute(con, add_yellow_apple)
#> [1] 0

fruits
#> # Source:   table<fruits> [7 x 3]
#> # Database: sqlite 3.41.2 []
#>      id fruit  colour
#>   <int> <chr>  <chr>
#> 1     1 apple  red
#> 2     2 cherry red
#> 3     3 lemon  yellow
#> 4     4 apple  green
#> 5     5 apple  green
#> 6     6 apple  green
#> 7     7 apple  yellow

Solution

  • I think you need specify the by to get the correct key in a way that dplyr can see it.

    suppressPackageStartupMessages({
      library(dplyr)
      library(dbplyr)
      library(DBI)
    })
    
    con <- DBI::dbConnect(RSQLite::SQLite())
    
    # Create database with one table and three columns: id, fruit, colour
    create_db <- "CREATE TABLE fruits(id INTEGER PRIMARY KEY AUTOINCREMENT, fruit TEXT, colour TEXT)" |> as.sql(con)
    DBI::dbExecute(con, create_db)
    #> [1] 0
    
    fruits <- tbl(con, "fruits")
    
    red_apple <- tibble(fruit = "apple", colour = "red")
    green_apple <- tibble(fruit = "apple", colour = "green")
    orange <- tibble(fruit = "orange", colour = "orange")
    red_cherry <- tibble(fruit = "cherry", colour = "red")
    
    # Insert into the database; primary key autoincrements and id will be assigned
    in_place <- TRUE
    
    rows_insert(fruits, red_apple, , by = c("fruit", "colour"),
                copy = TRUE, conflict = "ignore", in_place = in_place)
    rows_insert(fruits, orange, by = c("fruit", "colour"),
                copy = TRUE, conflict = "ignore", in_place = in_place)
    rows_insert(fruits, green_apple, by = c("fruit", "colour"),
                copy = TRUE, conflict = "ignore", in_place = in_place)
    rows_insert(fruits, green_apple, by = c("fruit", "colour"),
                copy = TRUE, conflict = "ignore", in_place = in_place)
    rows_insert(fruits, red_cherry, by = c("fruit", "colour"),
                copy = TRUE, conflict = "ignore", in_place = in_place)
    
    #> Matching, by = "fruit"
    fruits
    #> # Source:   table<fruits> [4 x 3]
    #> # Database: sqlite 3.41.2 []
    #>      id fruit  colour
    #>   <int> <chr>  <chr> 
    #> 1     1 apple  red   
    #> 2     2 orange orange
    #> 3     3 apple  green 
    #> 4     4 cherry red
    

    Created on 2023-06-07 with reprex v2.0.2

    Note that if you switch in_place to FALSE and use show_query(), you can see the SQL that dbplyr is generating to implement this. While this doesn't work to create the table for some reason, you can see why setting the correct key is necessary.

    > rows_insert(fruits, red_cherry, by = c("fruit", "colour"),
    +             copy = TRUE, conflict = "ignore", in_place = in_place) %>% show_query()
    <SQL>
    SELECT *
    FROM `fruits`
    UNION ALL
    SELECT NULL AS `id`, *
    FROM (
      SELECT *
      FROM `dbplyr_080`
      WHERE NOT EXISTS (
        SELECT 1 FROM `fruits`
        WHERE (
          `dbplyr_080`.`fruit` = `fruits`.`fruit` AND
          `dbplyr_080`.`colour` = `fruits`.`colour`
        )
      )
    )