pythonsql-serverpandasdataframeseries

Loading data into SQL Server using Pandas dataframe


I'm trying to load data from parquet file to a SQL Server table using Pandas.

This is the error I'm getting inside cursor.execute().

Traceback (most recent call last):
  File "C:\\Users\\path\\redshift.py", line 138, in <module>
    cursor.execute(insert_query, row)
  File "src\\pymssql\\_pymssql.pyx", line 447, in pymssql._pymssql.Cursor.execute
  File "src\\pymssql\\_mssql.pyx", line 1125, in pymssql._mssql.MSSQLConnection.execute_query
  File "src\\pymssql\\_mssql.pyx", line 1156, in pymssql._mssql.MSSQLConnection.execute_query
  File "src\\pymssql\\_mssql.pyx", line 1273, in pymssql._mssql.MSSQLConnection.format_and_run_query
  File "src\\pymssql\\_mssql.pyx", line 1295, in pymssql._mssql.MSSQLConnection.format_sql_command
  File "src\\pymssql\\_mssql.pyx", line 2084, in pymssql._mssql._substitute_params
  File "src\\pymssql\\_mssql.pyx", line 2069, in pymssql._mssql._quote_data
  File "src\\pymssql\\_mssql.pyx", line 1986, in pymssql._mssql._quote_simple_value
  File "C:\Users\pwcsreddy\AppData\Roaming\Python\Python312\site-packages\pandas\core\generic.py", line 1577, in __nonzero__

raise ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

The error looks like it's related to Pandas but it's throwing the error while inserting into the table and not while executing Pandas code.

This is the code I have.

import pymssql
import pandas as pd
import duckdb
csv_file_path = "C:\\path\\pp2.snappy.parquet"
df = pd.read_parquet(csv_file_path)
columns = df.columns.tolist()
df = df[columns]
df = df.fillna(0)
column_names = ', '.join(columns)
placeholders = ', '.join(['?'] * len(columns))
conn = pymssql.connect()
cursor = conn.cursor()

datetime_columns = []
df2 = duckdb.sql("describe select * from read_parquet('pp2.snappy.parquet')").df()
for i, j in zip(df2["column_name"], df2["column_type"]):
    if "TIMESTAMP" in j:
        datetime_columns.append(i)

for index, row in df[columns].iterrows():
    insert_query = f"""
    INSERT INTO dbo.table_name ({column_names})
    VALUES ( {placeholders} );
    """
    for dt_col in datetime_columns:
        value = getattr(row, dt_col)
        if not pd.isna(value):
            setattr(row, dt_col, pd.to_datetime(value))
        else:
            setattr(row, dt_col, 0)

    print(insert_query, dict(row))
    cursor.execute(insert_query, row)
    conn.commit()

This is the output I'm getting for print statement.

INSERT INTO dbo.tablename (id, contact, createuserid, publicid, active, beanversion, archivepartition, retired, createtime, lastupdatetime, account, updateuserid, updatetime, subtype, temporarylastupdatetime, frozensetid)
    VALUES ( ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? );
{'id': 501, 'contact': 503, 'createuserid': 3, 'publicid': 'pc:SCPdF-JQE54GgYqF5JYxm', 'active': True, 'beanversion': 0, 'archivepartition': 0.0, 'retired': 0, 'createtime': Timestamp('2024-08-05 08:28:34.417000+0000', tz='UTC'), 'lastupdatetime': Timestamp('2024-08-05 08:28:34.417000+0000', tz='UTC'), 'account': 301, 'updateuserid': 3, 'updatetime': Timestamp('2024-08-05 08:28:34.417000+0000', tz='UTC'), 'subtype': 1, 'temporarylastupdatetime': Timestamp('1970-01-01 00:00:00'), 'frozensetid': 0.0}

If I try to provide dict(row) to cursor.execute(), I'm getting different error.

Traceback (most recent call last):
  File "src\\pymssql\\_pymssql.pyx", line 447, in pymssql._pymssql.Cursor.execute
  File "src\\pymssql\\_mssql.pyx", line 1125, in pymssql._mssql.MSSQLConnection.execute_query
  File "src\\pymssql\\_mssql.pyx", line 1156, in pymssql._mssql.MSSQLConnection.execute_query
  File "src\\pymssql\\_mssql.pyx", line 1289, in pymssql._mssql.MSSQLConnection.format_and_run_query
  File "src\\pymssql\\_mssql.pyx", line 1855, in pymssql._mssql.check_cancel_and_raise
  File "src\\pymssql\\_mssql.pyx", line 1901, in pymssql._mssql.raise_MSSQLDatabaseException
pymssql._mssql.MSSQLDatabaseException: (102, b"Incorrect syntax near '?'.DB-Lib error message 20018, severity 15:\nGeneral SQL Server error: Check messages from the SQL Server\n")

Solution

  • Pass dictionary to .execute function:

    cursor.execute(insert_query, dict(row))
    

    (Same as in your print)

    Most likely, pymssql checks whether your row is empty somewhat simplistically with if row. Which doesn't work for pandas series (hence the error).