tbls = ['tbl1' , 'tbl2', 'tbl3']
for tbl in tbls:
apidata = []
offset = 0
limit = 3000
while True:
print("----")
baseurl = f'someurl'
url = baseurl + tbl + '?sysparm_offset=' + '{}'.format(offset) + '&sysparm_limit=' + '{}'.format(limit)
print("Requesting", url)
response = requests.get(url, auth=(apiuser, apipwd), headers=headers )
data = response.json()
json_string = json.dumps(data)
df = pd.DataFrame(data)
try:
with cx_Oracle.connect(dbuser, dbpass, dsn, encoding='UTF-8') as connection:
cursor = connection.cursor()
sql = 'insert into ' + tbl + '(req_data) values (:req_data)'
cursor.execute(sql, [json_string]) #works fine with this sql but super slow
#cursor.executemany(sql, ((i,) for i in data['result'])) #TypeError: parameters should be a list of sequences/dictionaries or an integer specifying the number of times to execute the statement
#cursor.executemany(sql, df.values.tolist()) #Python value of type dict not supported.
connection.commit()
except cx_Oracle.Error as error:
print('CX_Oracle Error occurred:')
print(error)
if len(data['result']) == 0:
break
else:
apidata.extend(data['result'])
offset = offset + 3000
continue
I am trying to pull api data, that is in json format and then trying to insert into a json table in Oracle. My code goes through each tbl from the tbls list and pull paginated api data 3000 rows at a time and loops the api table through until it finishes pulling. Once it finishes pulling, it does the same for remaining 'tbl' in tbls. This works fine when used cursor.execute but extremely slow(the api tables have >150k rows).
When I try to use executemany, it just doesn't seem to work for me. I know for executemany, I have to provide a list, which I tried with ((i,) for i in data['result']) and also pandas data frame, df.values.tolist(). I am still a Python noob and would really appreciate if you could point out what I am doing wrong here. I went through a lot of articles/examples on executemnay but I am still having difficulty with this.
Sample of apidata {'result': [{'owner': {'link': 'https://someurl.com/api/now/table/sys_user/39aec4146fedda00f3ab4ecbbb3ee4ec', 'value': '39aec4146fedda00f3ab4ecbbb3ee4ec'}, 'sys_id': '00c67bbbdbbcb2c01e05fb541d96196b', 'sys_updated_by': 'xa0380', 'sys_created_on': '2017-02-14 22:25:04', 'document': {'link': 'https://someurl.com/api/now/table/vtb_task/b1b6bb7bdbbcb2c01e05fb541d961923', 'value': 'b1b6bb7bdbbcb2c01e05fb541d961923'}, 'name': '', 'sys_mod_count': '0', 'sys_updated_on': '2017-02-14 22:25:04', 'sys_tags': '', 'sys_created_by': 'xa0380', 'table': 'vtb_task'}]}
The length of apidata varies for different tables but since I am inserting it as json in oracle, it doesn't matter.
Oracle json table ddl: create table tbl1 ( req_data blob check (req_data is json) );
Oracle version - 19c CX_Oracle version - 8.3
Thank you in advance. Please let me know if you need any additional information.
Try binding with executemany()
like this:
import cx_Oracle as oracledb
import os
import sys
if sys.platform.startswith('darwin'):
oracledb.init_oracle_client(lib_dir=os.environ.get('HOME')+'/Downloads/instantclient_19_8')
un = os.environ.get('PYTHON_USERNAME')
pw = os.environ.get('PYTHON_PASSWORD')
cs = os.environ.get('PYTHON_CONNECTSTRING')
connection = oracledb.connect(user=un, password=pw, dsn=cs)
# For the SELECT statement
def output_type_handler(cursor, name, default_type, size, precision, scale):
if default_type == oracledb.CLOB:
return cursor.var(oracledb.LONG_STRING, arraysize=cursor.arraysize)
if default_type == oracledb.BLOB:
return cursor.var(oracledb.LONG_BINARY, arraysize=cursor.arraysize)
connection.outputtypehandler = output_type_handler
d1 = """{'result': [{'owner': {'link': 'https://someurl.com/api/now/table/sys_user/39aec4146fedda00f3ab4ecbbb3ee4ec', 'value': '39aec4146fedda00f3ab4ecbbb3ee4ec'}, 'sys_id': '00c67bbbdbbcb2c01e05fb541d96196b', 'sys_updated_by': 'xa0380', 'sys_created_on': '2017-02-14 22:25:04', 'document': {'link': 'https://someurl.com/api/now/table/vtb_task/b1b6bb7bdbbcb2c01e05fb541d961923', 'value': 'b1b6bb7bdbbcb2c01e05fb541d961923'}, 'name': '', 'sys_mod_count': '0', 'sys_updated_on': '2017-02-14 22:25:04', 'sys_tags': '', 'sys_created_by': 'xa0380', 'table': 'vtb_task'}]}"""
d2 = """{'result': [{'owner': {'link': 'https://someurl2.com/api/now/table/sys_user/xyz', 'value': 'xyz'}, 'sys_id': 'xyz', 'sys_updated_by': 'xa0380', 'sys_created_on': '2017-02-14 22:25:04', 'document': {'link': 'https://someurl.com/api/now/table/vtb_task/b1b6bb7bdbbcb2c01e05fb541d961923', 'value': 'xyz'}, 'name': '', 'sys_mod_count': '0', 'sys_updated_on': '2017-02-14 22:25:04', 'sys_tags': '', 'sys_created_by': 'xa0380', 'table': 'vtb_task'}]}"""
data = [
{"req_data": d1},
{"req_data": d2}
]
sql = 'insert into tbl1 (req_data) values (:req_data)'
with connection.cursor() as cursor:
cursor.executemany(sql, data)
with connection.cursor() as cursor:
cursor.execute("select * from tbl1")
r = cursor.fetchall()
print(r)
Or you could change the statement to use a named bind variable, and then construct the data like:
data = [
(d1,),
(d2,)
]
sql = 'insert into tbl1 (req_data) values (:1)'
Other things to speed up your app:
while
loop.