pythonduckdbexecutemany

Inserting to a (temp) table from an insert statement with returning clause in duckdb


I am working with inserting and manipulating data in a DuckDB database using Python. The incoming data is staged in several temporary tables, before it is moved to permanent tables (largely for duplication checking). From an INSERT statement with a RETURNING clause I need to insert the returned values (primary key produced by the insert statement plus some business keys) into a temporary "mapping" table used for mapping the new primary key from the permanent table, to some of the data in the other temporary tables. Because I am dealing with large amounts of data at once (10,000s of rows) I'd prefer to not return, fetch, and insert separately.

Below is the code I attempted to use:

import duckdb

conn = duckdb.connect("duckdb.db")
conn.execute(
    """ 
   CREATE SEQUENCE seq_id START 1;
   CREATE TABLE Data(
      id INTEGER DEFAULT nextval('seq_id') PRIMARY KEY,
      col1 VARCHAR,
      col2 INTEGER,
      col3 VARCHAR,
      col4 VARCHAR);"""
)
conn.execute(
    """
   CREATE TEMP TABLE Data_temp(
      col1 VARCHAR,
      col2 INTEGER,
      col3 VARCHAR,
      col4 VARCHAR);"""
)

conn.execute(
    """
   CREATE TABLE Other_data(
      id INTEGER REFERENCES DATA(id),
      cola FLOAT,
      colb FLOAT);"""
)
conn.execute(
    """
   CREATE TEMP TABLE Other_data_temp(
      cola FLOAT,
      colb FLOAT,
      col1 VARCHAR,
      col2 INTEGER);"""
)

conn.execute(
    """
   CREATE TEMP TABLE temp_mapping(
      id   INTEGER,
      col1 VARCHAR,
      col2 INTEGER);"""
)

input_data = [
    ["green", 3, "round", "sweet"],
    ["red", 3, "square", "sweet"],
    ["blue", 2, "square", "bitter"],
]

input_other_data = [
    [1.43, 4.23, "green", 3],
    [6.45, 9.0, "red", 3],
    [4.8, 0.2, "blue", 2],
]

conn.executemany(
    """
   INSERT INTO Data_temp(col1, col2, col3, col4)
   VALUES (?,?,?,?);""",
    input_data,
)

conn.executemany(
    """
   INSERT INTO Other_data_temp
   VALUES (?,?,?,?);""",
    input_other_data,
)

mapping = conn.execute(
    """
   INSERT INTO Data(
      col1, col2, col3, col4)
   SELECT 
      T.col1, T.col2, T.col3, T.col4
   FROM Data_temp AS T
   RETURNING id, col1, col2;"""
)

conn.executemany(
    """
   INSERT INTO temp_mapping(
      id, col1, col2) 
   VALUES (?, ?, ?);""",
    mapping,
)

### Exception: duckdb.duckdb.InvalidInputException: Invalid Input Error: executemany requires a list of parameter sets to be provided


conn.execute(
    """
   INSERT INTO Other_data(id, cola, colb)
   SELECT TM.id, ODT.cola, ODT.colb
   FROM Other_data_temp AS ODT
   JOIN temp_mapping AS TM
      ON TM.col1 = ODT.col1
      AND TM.col2 = ODT.col2;
"""
)
conn.commit()
count = conn.execute("SELECT COUNT(*) FROM Other_data;")

print(count.fetchall()[0][0])

conn.close()

I have an SQLite version of the code (near identical, except how the autogenerated primary key is handled), where I can pass the RETURNING cursor object directly as input for executemany, which works perfectly:

import sqlite3

conn = sqlite3.connect("sqlite.db")
conn.execute(
    """ 
   CREATE TABLE Data(
      id INTEGER PRIMARY KEY AUTOINCREMENT,
      col1 VARCHAR,
      col2 INTEGER,
      col3 VARCHAR,
      col4 VARCHAR);"""
)
conn.execute(
    """
   CREATE TEMP TABLE Data_temp(
      col1 VARCHAR,
      col2 INTEGER,
      col3 VARCHAR,
      col4 VARCHAR);"""
)

conn.execute(
    """
   CREATE TABLE Other_data(
      id INTEGER REFERENCES DATA(id),
      cola FLOAT,
      colb FLOAT);"""
)
conn.execute(
    """
   CREATE TEMP TABLE Other_data_temp(
      cola FLOAT,
      colb FLOAT,
      col1 VARCHAR,
      col2 INTEGER);"""
)

conn.execute(
    """
   CREATE TEMP TABLE temp_mapping(
      id   INTEGER,
      col1 VARCHAR,
      col2 INTEGER);"""
)

input_data = [
    ["green", 3, "round", "sweet"],
    ["red", 3, "square", "sweet"],
    ["blue", 2, "square", "bitter"],
]

input_other_data = [
    [1.43, 4.23, "green", 3],
    [6.45, 9.0, "red", 3],
    [4.8, 0.2, "blue", 2],
]

conn.executemany(
    """
   INSERT INTO Data_temp(col1, col2, col3, col4)
   VALUES (?,?,?,?);""",
    input_data,
)

conn.executemany(
    """
   INSERT INTO Other_data_temp
   VALUES (?,?,?,?);""",
    input_other_data,
)

mapping = conn.execute(
    """
   INSERT INTO Data(
      col1, col2, col3, col4)
   SELECT 
      T.col1, T.col2, T.col3, T.col4
   FROM Data_temp AS T
   RETURNING id, col1, col2;"""
)

conn.executemany(
    """
   INSERT INTO temp_mapping(
      id, col1, col2) 
   VALUES (?, ?, ?);""",
    mapping,
)
conn.execute(
    """
   INSERT INTO Other_data(id, cola, colb)
   SELECT TM.id, ODT.cola, ODT.colb
   FROM Other_data_temp AS ODT
   JOIN temp_mapping AS TM
      ON TM.col1 = ODT.col1
      AND TM.col2 = ODT.col2;
"""
)
conn.commit()
count = conn.execute("SELECT COUNT(*) FROM Other_data;")

print(count.fetchall()[0][0])
# 3
conn.close()

Are there any alternatives to this in DuckDB, one where I can avoid fetching the mapping data into memory first?


Solution

  • Add a .fetchall() to your conn.execute("""INSERT INTO Data … RETURNING …;"""):

    mapping = conn.execute(
        """
       INSERT INTO Data
       …
       RETURNING id, col1, col2;"""
    ).fetchall()
    

    (tested with Python 3.13.7 / DuckDB 1.3.2)

    Why

    The failure of:

    mapping = conn.execute(
        """
       INSERT INTO Data
       …
       RETURNING id, col1, col2;"""
    )
    

    to return results seems related to the doc mentioning a "smart" feature that differentiates between SELECTs and others ("If it is a SELECT statement, create a relation object from the given SQL query, otherwise run the query as-is."),
    with code in DuckDB's Python bridge that has this "SELECT-only" implementation.

    This failure to take the polymorphic nature of INSERT … RETURNING (acts both as an INSERT and a SELECT) into account could be worth a new issue on DuckDB,
    but note that it's deeply rooted in DuckDB, because even pure SQL queries that I think would work on PostgreSQL and SQLite:

    SELECT * FROM (
       INSERT INTO Data
       …
       RETURNING id, col1, col2
    ) AS x;
    

    or:

    WITH i AS (
       INSERT INTO Data
       …
       RETURNING id, col1, col2
    )
    SELECT * FROM i;
    

    fail with respectively:

    So, as even DuckDB's internal SQL parser fails at recognizing INSERT … RETURNING's ability to return, you're lucky that the Python bridge at least offers the possibility to explicitely trigger the results output (instead of relying on execute and the like "automagical" spotting of SELECTs).