pythoncsvmysql-connector

I can't recognise the error in my python script for posting a csv into a MySQL DataBase


I'm attempting to send a CSV file with a large amount of columns to a MySQL database. Upon running my code I am met with this error message:

Traceback (most recent call last):
   File "/home/dan-brown/apollo/src/cicflowmeter/datbase_handling.py", line 24, in <module>
     db_post()
   File "/home/dan-brown/apollo/src/cicflowmeter/datbase_handling.py", line 22, in db_post
     mycursor.execute(query, (row,))
   File "/home/dan-brown/PythonVenv/my-venv/lib/python3.12/site-packages/mysql/connector/cursor_cext.py", line 350, in execute
     stmt = RE_PY_PARAM.sub(psub, stmt)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^
   File "/home/dan-brown/PythonVenv/my-venv/lib/python3.12/site-packages/mysql/connector/cursor_cext.py", line 115, in __call__
     raise ProgrammingError( mysql.connector.errors.ProgrammingError: Not enough parameters for the SQL statement

It is to my understanding that this means that the number of variables I have requested in the query versus the amount read on the CSV file do not match but I have double checked this numerous times only to find the same 84 features. Here is the code:

import mysql.connector
    
#connect to database
db = mysql.connector.connect(
    host="localhost",
    user="root",
    passwd="root",
    database="Apollo"
)
#active user
mycursor = db.cursor()

#setup table. Should be called on install
def db_setup():
  mycursor.execute("CREATE TABLE NetworkLogs (id INT AUTO_INCREMENT PRIMARY KEY, src_ip VARCHAR(15), dst_ip VARCHAR(15), src_port INT, dst_port INT, src_mac VARCHAR(17), dst_mac VARCHAR(17), protocol INT, timestamp DATETIME, flow_duration FLOAT, flow_byts_s FLOAT, flow_pkts_s FLOAT, fwd_pkts_s FLOAT, bwd_pkts_s FLOAT, tot_fwd_pkts INT, tot_bwd_pkts INT, totlen_fwd_pkts INT, totlen_bwd_pkts INT, fwd_pkt_len_max FLOAT, fwd_pkt_len_min FLOAT, fwd_pkt_len_mean FLOAT, fwd_pkt_len_std FLOAT, bwd_pkt_len_max FLOAT, bwd_pkt_len_min FLOAT, bwd_pkt_len_mean FLOAT, bwd_pkt_len_std FLOAT, pkt_len_max FLOAT, pkt_len_min FLOAT, pkt_len_mean FLOAT, pkt_len_std FLOAT, pkt_len_var FLOAT, fwd_header_len INT, bwd_header_len INT, fwd_seg_size_min INT, fwd_act_data_pkts INT, flow_iat_mean FLOAT, flow_iat_max FLOAT, flow_iat_min FLOAT, flow_iat_std FLOAT, fwd_iat_tot FLOAT, fwd_iat_max FLOAT, fwd_iat_min FLOAT, fwd_iat_mean FLOAT, fwd_iat_std FLOAT, bwd_iat_tot FLOAT, bwd_iat_max FLOAT, bwd_iat_min FLOAT, bwd_iat_mean FLOAT, bwd_iat_std FLOAT, fwd_psh_flags INT, bwd_psh_flags INT, fwd_urg_flags INT, bwd_urg_flags INT, fin_flag_cnt INT, syn_flag_cnt INT, rst_flag_cnt INT, psh_flag_cnt INT, ack_flag_cnt INT, urg_flag_cnt INT, ece_flag_cnt INT, down_up_ratio FLOAT, pkt_size_avg FLOAT, init_fwd_win_byts INT, init_bwd_win_byts INT, active_max FLOAT, active_min FLOAT, active_mean FLOAT, active_std FLOAT, idle_max FLOAT, idle_min FLOAT, idle_mean FLOAT, idle_std FLOAT, fwd_byts_b_avg FLOAT, fwd_pkts_b_avg FLOAT, bwd_byts_b_avg FLOAT, bwd_pkts_b_avg FLOAT, fwd_blk_rate_avg FLOAT, bwd_blk_rate_avg FLOAT, fwd_seg_size_avg FLOAT, bwd_seg_size_avg FLOAT, cwe_flag_count INT, subflow_fwd_pkts INT, subflow_bwd_pkts INT, subflow_fwd_byts INT, subflow_bwd_byts INT)")

#inserts recorded flows to db
def db_post():
  with open('src/cicflowmeter/test1.csv', 'r') as f:
      for row in f:
        query = "INSERT INTO NetworkLogs (src_ip, dst_ip, src_port, dst_port, src_mac, dst_mac, protocol, timestamp, flow_duration, flow_byts_s, flow_pkts_s, fwd_pkts_s, bwd_pkts_s, tot_fwd_pkts, tot_bwd_pkts, totlen_fwd_pkts, totlen_bwd_pkts, fwd_pkt_len_max, fwd_pkt_len_min, fwd_pkt_len_mean, fwd_pkt_len_std, bwd_pkt_len_max, bwd_pkt_len_min, bwd_pkt_len_mean, bwd_pkt_len_std, pkt_len_max, pkt_len_min, pkt_len_mean, pkt_len_std, pkt_len_var, fwd_header_len, bwd_header_len, fwd_seg_size_min, fwd_act_data_pkts, flow_iat_mean, flow_iat_max, flow_iat_min, flow_iat_std, fwd_iat_tot, fwd_iat_max, fwd_iat_min, fwd_iat_mean, fwd_iat_std, bwd_iat_tot, bwd_iat_max, bwd_iat_min, bwd_iat_mean, bwd_iat_std, fwd_psh_flags, bwd_psh_flags, fwd_urg_flags, bwd_urg_flags, fin_flag_cnt, syn_flag_cnt, rst_flag_cnt, psh_flag_cnt, ack_flag_cnt, urg_flag_cnt, ece_flag_cnt, down_up_ratio, pkt_size_avg, init_fwd_win_byts, init_bwd_win_byts, active_max, active_min, active_mean, active_std, idle_max, idle_min, idle_mean, idle_std, fwd_byts_b_avg, fwd_pkts_b_avg, bwd_byts_b_avg, bwd_pkts_b_avg, fwd_blk_rate_avg, bwd_blk_rate_avg, fwd_seg_size_avg, bwd_seg_size_avg, cwe_flag_count, subflow_fwd_pkts, subflow_bwd_pkts, subflow_fwd_byts, subflow_bwd_byts) VALUES ('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')"
        mycursor.execute(query, (row,))
  db.commit()
db_post()

And this is the CSV file that is being read:

192.168.1.16,192.168.1.255,59222,15600,64:e7:d8:33:f6:76,ff:ff:ff:ff:ff:ff,17,2024-10-05 18:07:59,0.0,0,0,0,0,1,0,77,0,77.0,77.0,77.0,0.0,0.0,0.0,0.0,0.0,77,77,77.0,0.0,0.0,8,0,8,1,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,1,0,0,0,0,0,0,0.0,77.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,77.0,0.0,0,1,0,77,0

I have tried multiple forum posts but I am yet to find an answer.

I apologize if this is laid out incorrectly or is a simple mistake. I am not an experienced programmer, any help would be greatly appreciated.
Thank you.


Solution

  • There are multiple problems:

    1. You're not splitting the CSV into separate values for each field. Use csv.reader() for this.
    2. You're wrapping row in a tuple, so it's just a single parameter value. Just use row by itself.
    3. The %s placeholders in the SQL should not be quoted. That makes them literal, and they won't be replaced with the parameter values.
    #inserts recorded flows to db
    def db_post():
        query = "INSERT INTO NetworkLogs (src_ip, dst_ip, src_port, dst_port, src_mac, dst_mac, protocol, timestamp, flow_duration, flow_byts_s, flow_pkts_s, fwd_pkts_s, bwd_pkts_s, tot_fwd_pkts, tot_bwd_pkts, totlen_fwd_pkts, totlen_bwd_pkts, fwd_pkt_len_max, fwd_pkt_len_min, fwd_pkt_len_mean, fwd_pkt_len_std, bwd_pkt_len_max, bwd_pkt_len_min, bwd_pkt_len_mean, bwd_pkt_len_std, pkt_len_max, pkt_len_min, pkt_len_mean, pkt_len_std, pkt_len_var, fwd_header_len, bwd_header_len, fwd_seg_size_min, fwd_act_data_pkts, flow_iat_mean, flow_iat_max, flow_iat_min, flow_iat_std, fwd_iat_tot, fwd_iat_max, fwd_iat_min, fwd_iat_mean, fwd_iat_std, bwd_iat_tot, bwd_iat_max, bwd_iat_min, bwd_iat_mean, bwd_iat_std, fwd_psh_flags, bwd_psh_flags, fwd_urg_flags, bwd_urg_flags, fin_flag_cnt, syn_flag_cnt, rst_flag_cnt, psh_flag_cnt, ack_flag_cnt, urg_flag_cnt, ece_flag_cnt, down_up_ratio, pkt_size_avg, init_fwd_win_byts, init_bwd_win_byts, active_max, active_min, active_mean, active_std, idle_max, idle_min, idle_mean, idle_std, fwd_byts_b_avg, fwd_pkts_b_avg, bwd_byts_b_avg, bwd_pkts_b_avg, fwd_blk_rate_avg, bwd_blk_rate_avg, fwd_seg_size_avg, bwd_seg_size_avg, cwe_flag_count, subflow_fwd_pkts, subflow_bwd_pkts, subflow_fwd_byts, subflow_bwd_byts) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"    with open('src/cicflowmeter/test1.csv', 'r') as f:
            csv_f = csv.reader(f)
            for row in csv_f:
                mycursor.execute(query, row)
        db.commit()
    

    There's also no need to re-assign query each time through the loop, it never changes.

    You can also do this more efficiently using mycursor.executemany(). It takes a 2-dimensional iterable and does the looping itself. It will batch up multiple rows to reduce the number of queries sent to the DB.

    #inserts recorded flows to db
    def db_post():
        query = "INSERT INTO NetworkLogs (src_ip, dst_ip, src_port, dst_port, src_mac, dst_mac, protocol, timestamp, flow_duration, flow_byts_s, flow_pkts_s, fwd_pkts_s, bwd_pkts_s, tot_fwd_pkts, tot_bwd_pkts, totlen_fwd_pkts, totlen_bwd_pkts, fwd_pkt_len_max, fwd_pkt_len_min, fwd_pkt_len_mean, fwd_pkt_len_std, bwd_pkt_len_max, bwd_pkt_len_min, bwd_pkt_len_mean, bwd_pkt_len_std, pkt_len_max, pkt_len_min, pkt_len_mean, pkt_len_std, pkt_len_var, fwd_header_len, bwd_header_len, fwd_seg_size_min, fwd_act_data_pkts, flow_iat_mean, flow_iat_max, flow_iat_min, flow_iat_std, fwd_iat_tot, fwd_iat_max, fwd_iat_min, fwd_iat_mean, fwd_iat_std, bwd_iat_tot, bwd_iat_max, bwd_iat_min, bwd_iat_mean, bwd_iat_std, fwd_psh_flags, bwd_psh_flags, fwd_urg_flags, bwd_urg_flags, fin_flag_cnt, syn_flag_cnt, rst_flag_cnt, psh_flag_cnt, ack_flag_cnt, urg_flag_cnt, ece_flag_cnt, down_up_ratio, pkt_size_avg, init_fwd_win_byts, init_bwd_win_byts, active_max, active_min, active_mean, active_std, idle_max, idle_min, idle_mean, idle_std, fwd_byts_b_avg, fwd_pkts_b_avg, bwd_byts_b_avg, bwd_pkts_b_avg, fwd_blk_rate_avg, bwd_blk_rate_avg, fwd_seg_size_avg, bwd_seg_size_avg, cwe_flag_count, subflow_fwd_pkts, subflow_bwd_pkts, subflow_fwd_byts, subflow_bwd_byts) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
        with open('src/cicflowmeter/test1.csv', 'r') as f:
            csv_f = csv.reader(f)
            mycursor.executemany(query, csv_f)
        db.commit()