In R, is there a way to use dbplyr
functions, without added SQL, to add a row to a database table unless the row already exists, thus avoiding duplication? In the example below, I add red apples to a table but then cannot add green apples with rows_insert()
because apples already exist. rows_append()
would let me add green apples, but multiple times. Some SQL for how to add a row unless it already exists is shown, but can it be done with regular dbplyr
functions?
suppressPackageStartupMessages({
library(dplyr)
library(dbplyr)
library(DBI)
})
con <- DBI::dbConnect(RSQLite::SQLite())
# Create database with one table and three columns: id, fruit, colour
create_db <- "CREATE TABLE fruits(id INTEGER PRIMARY KEY AUTOINCREMENT, fruit TEXT, colour TEXT)" |> as.sql(con)
DBI::dbExecute(con, create_db)
#> [1] 0
fruits <- tbl(con, "fruits")
# Apples can be red
red_apple <- tibble(fruit = "apple", colour = "red")
# Insert into the database; primary key autoincrements and id will be assigned
rows_insert(fruits, red_apple, copy = TRUE, conflict = "ignore", in_place = TRUE)
#> Matching, by = "fruit"
fruits
#> # Source: table<fruits> [1 x 3]
#> # Database: sqlite 3.41.2 []
#> id fruit colour
#> <int> <chr> <chr>
#> 1 1 apple red
# Inserting the same colour fruit doesn't work.
rows_insert(fruits, red_apple, copy = TRUE, conflict = "ignore", in_place = TRUE)
#> Matching, by = "fruit"
fruits
#> # Source: table<fruits> [1 x 3]
#> # Database: sqlite 3.41.2 []
#> id fruit colour
#> <int> <chr> <chr>
#> 1 1 apple red
# Apples can also be green
green_apple <- tibble(fruit = "apple", colour = "green")
# I would like this new row to be added to the database, but this doesn't do it.
rows_insert(fruits, green_apple, copy = TRUE, conflict = "ignore", in_place = TRUE)
#> Matching, by = "fruit"
fruits
#> # Source: table<fruits> [1 x 3]
#> # Database: sqlite 3.41.2 []
#> id fruit colour
#> <int> <chr> <chr>
#> 1 1 apple red
# Cherries can be red, and this works, even though the colour is duplicated.
red_cherry <- tibble(fruit = "cherry", colour = "red")
rows_insert(fruits, red_cherry, copy = TRUE, conflict = "ignore", in_place = TRUE)
#> Matching, by = "fruit"
fruits
#> # Source: table<fruits> [2 x 3]
#> # Database: sqlite 3.41.2 []
#> id fruit colour
#> <int> <chr> <chr>
#> 1 1 apple red
#> 2 2 cherry red
# To be added with rows_insert() the fruit must be different from anything there
yellow_lemon <- tibble(fruit = "lemon", colour = "yellow")
rows_insert(fruits, yellow_lemon, copy = TRUE, conflict = "ignore", in_place = TRUE)
#> Matching, by = "fruit"
fruits
#> # Source: table<fruits> [3 x 3]
#> # Database: sqlite 3.41.2 []
#> id fruit colour
#> <int> <chr> <chr>
#> 1 1 apple red
#> 2 2 cherry red
#> 3 3 lemon yellow
# rows_append() will add the green apple ...
rows_append(fruits, green_apple, copy = TRUE, in_place = TRUE)
fruits
#> # Source: table<fruits> [4 x 3]
#> # Database: sqlite 3.41.2 []
#> id fruit colour
#> <int> <chr> <chr>
#> 1 1 apple red
#> 2 2 cherry red
#> 3 3 lemon yellow
#> 4 4 apple green
# ... but it will allow adding it multiple times, creating duplicates.
rows_append(fruits, green_apple, copy = TRUE, in_place = TRUE)
rows_append(fruits, green_apple, copy = TRUE, in_place = TRUE)
fruits
#> # Source: table<fruits> [6 x 3]
#> # Database: sqlite 3.41.2 []
#> id fruit colour
#> <int> <chr> <chr>
#> 1 1 apple red
#> 2 2 cherry red
#> 3 3 lemon yellow
#> 4 4 apple green
#> 5 5 apple green
#> 6 6 apple green
# This SQL statement will add a yellow apple once ...
add_yellow_apple <- "INSERT INTO fruits (fruit, colour) SELECT 'apple', 'yellow' WHERE NOT EXISTS (SELECT 1 FROM fruits WHERE fruit = 'apple' AND colour = 'yellow');" |> as.sql(con)
DBI::dbExecute(con, add_yellow_apple)
#> [1] 1
fruits
#> # Source: table<fruits> [7 x 3]
#> # Database: sqlite 3.41.2 []
#> id fruit colour
#> <int> <chr> <chr>
#> 1 1 apple red
#> 2 2 cherry red
#> 3 3 lemon yellow
#> 4 4 apple green
#> 5 5 apple green
#> 6 6 apple green
#> 7 7 apple yellow
# ... and only once, as desired, never creating a duplicate.
DBI::dbExecute(con, add_yellow_apple)
#> [1] 0
fruits
#> # Source: table<fruits> [7 x 3]
#> # Database: sqlite 3.41.2 []
#> id fruit colour
#> <int> <chr> <chr>
#> 1 1 apple red
#> 2 2 cherry red
#> 3 3 lemon yellow
#> 4 4 apple green
#> 5 5 apple green
#> 6 6 apple green
#> 7 7 apple yellow
I think you need specify the by
to get the correct key in a way that dplyr
can see it.
suppressPackageStartupMessages({
library(dplyr)
library(dbplyr)
library(DBI)
})
con <- DBI::dbConnect(RSQLite::SQLite())
# Create database with one table and three columns: id, fruit, colour
create_db <- "CREATE TABLE fruits(id INTEGER PRIMARY KEY AUTOINCREMENT, fruit TEXT, colour TEXT)" |> as.sql(con)
DBI::dbExecute(con, create_db)
#> [1] 0
fruits <- tbl(con, "fruits")
red_apple <- tibble(fruit = "apple", colour = "red")
green_apple <- tibble(fruit = "apple", colour = "green")
orange <- tibble(fruit = "orange", colour = "orange")
red_cherry <- tibble(fruit = "cherry", colour = "red")
# Insert into the database; primary key autoincrements and id will be assigned
in_place <- TRUE
rows_insert(fruits, red_apple, , by = c("fruit", "colour"),
copy = TRUE, conflict = "ignore", in_place = in_place)
rows_insert(fruits, orange, by = c("fruit", "colour"),
copy = TRUE, conflict = "ignore", in_place = in_place)
rows_insert(fruits, green_apple, by = c("fruit", "colour"),
copy = TRUE, conflict = "ignore", in_place = in_place)
rows_insert(fruits, green_apple, by = c("fruit", "colour"),
copy = TRUE, conflict = "ignore", in_place = in_place)
rows_insert(fruits, red_cherry, by = c("fruit", "colour"),
copy = TRUE, conflict = "ignore", in_place = in_place)
#> Matching, by = "fruit"
fruits
#> # Source: table<fruits> [4 x 3]
#> # Database: sqlite 3.41.2 []
#> id fruit colour
#> <int> <chr> <chr>
#> 1 1 apple red
#> 2 2 orange orange
#> 3 3 apple green
#> 4 4 cherry red
Created on 2023-06-07 with reprex v2.0.2
Note that if you switch in_place
to FALSE
and use show_query()
, you can see the SQL that dbplyr
is generating to implement this. While this doesn't work to create the table for some reason, you can see why setting the correct key is necessary.
> rows_insert(fruits, red_cherry, by = c("fruit", "colour"),
+ copy = TRUE, conflict = "ignore", in_place = in_place) %>% show_query()
<SQL>
SELECT *
FROM `fruits`
UNION ALL
SELECT NULL AS `id`, *
FROM (
SELECT *
FROM `dbplyr_080`
WHERE NOT EXISTS (
SELECT 1 FROM `fruits`
WHERE (
`dbplyr_080`.`fruit` = `fruits`.`fruit` AND
`dbplyr_080`.`colour` = `fruits`.`colour`
)
)
)