I'm trying to load data from parquet file to a SQL Server table using Pandas.
This is the error I'm getting inside cursor.execute()
.
Traceback (most recent call last): File "C:\\Users\\path\\redshift.py", line 138, in <module> cursor.execute(insert_query, row) File "src\\pymssql\\_pymssql.pyx", line 447, in pymssql._pymssql.Cursor.execute File "src\\pymssql\\_mssql.pyx", line 1125, in pymssql._mssql.MSSQLConnection.execute_query File "src\\pymssql\\_mssql.pyx", line 1156, in pymssql._mssql.MSSQLConnection.execute_query File "src\\pymssql\\_mssql.pyx", line 1273, in pymssql._mssql.MSSQLConnection.format_and_run_query File "src\\pymssql\\_mssql.pyx", line 1295, in pymssql._mssql.MSSQLConnection.format_sql_command File "src\\pymssql\\_mssql.pyx", line 2084, in pymssql._mssql._substitute_params File "src\\pymssql\\_mssql.pyx", line 2069, in pymssql._mssql._quote_data File "src\\pymssql\\_mssql.pyx", line 1986, in pymssql._mssql._quote_simple_value File "C:\Users\pwcsreddy\AppData\Roaming\Python\Python312\site-packages\pandas\core\generic.py", line 1577, in __nonzero__ raise ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
The error looks like it's related to Pandas but it's throwing the error while inserting into the table and not while executing Pandas code.
This is the code I have.
import pymssql
import pandas as pd
import duckdb
csv_file_path = "C:\\path\\pp2.snappy.parquet"
df = pd.read_parquet(csv_file_path)
columns = df.columns.tolist()
df = df[columns]
df = df.fillna(0)
column_names = ', '.join(columns)
placeholders = ', '.join(['?'] * len(columns))
conn = pymssql.connect()
cursor = conn.cursor()
datetime_columns = []
df2 = duckdb.sql("describe select * from read_parquet('pp2.snappy.parquet')").df()
for i, j in zip(df2["column_name"], df2["column_type"]):
if "TIMESTAMP" in j:
datetime_columns.append(i)
for index, row in df[columns].iterrows():
insert_query = f"""
INSERT INTO dbo.table_name ({column_names})
VALUES ( {placeholders} );
"""
for dt_col in datetime_columns:
value = getattr(row, dt_col)
if not pd.isna(value):
setattr(row, dt_col, pd.to_datetime(value))
else:
setattr(row, dt_col, 0)
print(insert_query, dict(row))
cursor.execute(insert_query, row)
conn.commit()
This is the output I'm getting for print statement.
INSERT INTO dbo.tablename (id, contact, createuserid, publicid, active, beanversion, archivepartition, retired, createtime, lastupdatetime, account, updateuserid, updatetime, subtype, temporarylastupdatetime, frozensetid)
VALUES ( ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? );
{'id': 501, 'contact': 503, 'createuserid': 3, 'publicid': 'pc:SCPdF-JQE54GgYqF5JYxm', 'active': True, 'beanversion': 0, 'archivepartition': 0.0, 'retired': 0, 'createtime': Timestamp('2024-08-05 08:28:34.417000+0000', tz='UTC'), 'lastupdatetime': Timestamp('2024-08-05 08:28:34.417000+0000', tz='UTC'), 'account': 301, 'updateuserid': 3, 'updatetime': Timestamp('2024-08-05 08:28:34.417000+0000', tz='UTC'), 'subtype': 1, 'temporarylastupdatetime': Timestamp('1970-01-01 00:00:00'), 'frozensetid': 0.0}
If I try to provide dict(row)
to cursor.execute()
, I'm getting different error.
Traceback (most recent call last):
File "src\\pymssql\\_pymssql.pyx", line 447, in pymssql._pymssql.Cursor.execute
File "src\\pymssql\\_mssql.pyx", line 1125, in pymssql._mssql.MSSQLConnection.execute_query
File "src\\pymssql\\_mssql.pyx", line 1156, in pymssql._mssql.MSSQLConnection.execute_query
File "src\\pymssql\\_mssql.pyx", line 1289, in pymssql._mssql.MSSQLConnection.format_and_run_query
File "src\\pymssql\\_mssql.pyx", line 1855, in pymssql._mssql.check_cancel_and_raise
File "src\\pymssql\\_mssql.pyx", line 1901, in pymssql._mssql.raise_MSSQLDatabaseException
pymssql._mssql.MSSQLDatabaseException: (102, b"Incorrect syntax near '?'.DB-Lib error message 20018, severity 15:\nGeneral SQL Server error: Check messages from the SQL Server\n")
Pass dictionary to .execute
function:
cursor.execute(insert_query, dict(row))
(Same as in your print
)
Most likely, pymssql
checks whether your row
is empty somewhat simplistically with if row
. Which doesn't work for pandas series (hence the error).