I have the following code which is written to parse VCF (Variant call format) file:
Python code:
import vcf
import psycopg2
datalist = []
def create_snp_tables() :
drop_table_query = 'DROP TABLE IF EXISTS sampletable;'
cursor.execute(drop_table_query)
create_value_table_query = '''CREATE TABLE IF NOT EXISTS sampletable (as_ID INT, as_NM TEXT, as_DT_ID INT, as_DT_NM TEXT, VCF_ID TEXT, SAMPLE_ID TEXT, VARIANT_ID TEXT, as_DT_LINE_SEQ INT, DATE_START DATE, DATE_END DATE, as_DT_VAL_SEQ INT, as_DT_VA_NM TEXT, as_DT_VALUE TEXT); '''
cursor.execute(create_value_table_query)
conn.commit()
def createtupplelist(as_id, vcf_id,as_dt_nm, sample_id, as_dt_va_nm, as_dt_value, as_va_seq, as_dt_va_line_seq):
variant_id= 'variant_id'
as_nm = 'as_name'
datalist.append("({0},'{1}','{2}','{3}','{4}','{5}','{6}','{7}','{8}','{9}','{10}')".format(as_id,str(as_nm),'1',as_dt_nm,vcf_id,sample_id,variant_id,as_dt_va_line_seq,as_va_seq,as_dt_va_nm,variable_value))
if len(datalist)==20:
insertdata()
def insertdata():
global datalist
iter_datalist=iter(datalist)
args_str = ','.join(iter_datalist)
cursor.execute("INSERT INTO sampletable(as_ID,as_NM,as_DT_ID,as_DT_NM,VCF_ID,SAMPLE_ID,Variant_ID,as_DT_LINE_SEQ,as_DT_VAL_SEQ,as_DT_VA_NM,as_DT_VALUE) VALUES "+args_str)
print("inserted")
conn.commit()
datalist=[]
#read vcf file using pyvcf library
file_name = 'sample.vcf'
vcf_reader = vcf.Reader(open(file_name, 'r'))
conn = psycopg2.connect(host="localhost",database="mydb", user="postgres", password="pgAdmin")
cursor = conn.cursor()
create_snp_tables()
line_index = 0
as_dt_variant = 'Variant'
index = 0
for record in vcf_reader :
index=index+1
line_index += 1
sample_name = ''
variable_value = record.CHROM
variable_name = "CHROM"
createtupplelist('1', file_name, as_dt_variant, sample_name, variable_name, variable_value, str(index), str(line_index))
This is a sample file which I pass through the script:
Sample VCF File:
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT BA12878.40x.S7508
chr1 10069 . A AC 136.17 RF AC=1;AN=2;CRF=0.817;DP=7;GC=0.505;MQ=70.3709;MQ0=770;NS=7;QD=1.945;STR_LENGTH=90;STR_PERIOD=6 GT:GQ:DP:MQ:PS:PQ:AD:ADP:AF:ARF:BQ:FRF:MC:MF:SB:RFQUAL:FT 1|0:136:70:36:10069:99:309:944:0.327:0.046:.:0.97:164:0.165:0.000:2.45:RF
chr1 10069 . A AC 136.17 RF AC=1;AN=2;CRF=0.817;DP=7;GC=0.505;MQ=70.3709;MQ0=770;NS=7;QD=1.945;STR_LENGTH=90;STR_PERIOD=6 GT:GQ:DP:MQ:PS:PQ:AD:ADP:AF:ARF:BQ:FRF:MC:MF:SB:RFQUAL:FT 1|0:136:70:36:10069:99:309:944:0.327:0.046:.:0.97:164:0.165:0.000:2.45:RF
chr1 10069 . A AC 136.17 RF AC=1;AN=2;CRF=0.817;DP=7;GC=0.505;MQ=70.3709;MQ0=770;NS=7;QD=1.945;STR_LENGTH=90;STR_PERIOD=6 GT:GQ:DP:MQ:PS:PQ:AD:ADP:AF:ARF:BQ:FRF:MC:MF:SB:RFQUAL:FT 1|0:136:70:36:10069:99:309:944:0.327:0.046:.:0.97:164:0.165:0.000:2.45:RF
chr1 10069 . A AC 136.17 RF AC=1;AN=2;CRF=0.817;DP=7;GC=0.505;MQ=70.3709;MQ0=770;NS=7;QD=1.945;STR_LENGTH=90;STR_PERIOD=6 GT:GQ:DP:MQ:PS:PQ:AD:ADP:AF:ARF:BQ:FRF:MC:MF:SB:RFQUAL:FT 1|0:136:70:36:10069:99:309:944:0.327:0.046:.:0.97:164:0.165:0.000:2.45:RF
chr1 10069 . A AC 136.17 RF AC=1;AN=2;CRF=0.817;DP=7;GC=0.505;MQ=70.3709;MQ0=770;NS=7;QD=1.945;STR_LENGTH=90;STR_PERIOD=6 GT:GQ:DP:MQ:PS:PQ:AD:ADP:AF:ARF:BQ:FRF:MC:MF:SB:RFQUAL:FT 1|0:136:70:36:10069:99:309:944:0.327:0.046:.:0.97:164:0.165:0.000:2.45:RF
OUTPUT in my Postgres table - sampletable
as_id as_nm as_dt_id as_dt_nm vcf_id sample_id variant_id as_dt_line_seq date_start date_end as_dt_val_seq as_dt_va_nm as_dt_value
1 as_name 2 Variant sample.vcf "" variant_id 1 null null 1 CHROM chr1
1 as_name 2 Variant sample.vcf "" variant_id 1 null null 1 POS 10069
1 as_name 2 Variant sample.vcf "" variant_id 1 null null 1 ID None
1 as_name 2 Variant sample.vcf "" variant_id 1 null null 1 REF A
1 as_name 2 Variant sample.vcf "" variant_id 1 null null 1 ALT AC
1 as_name 2 Variant sample.vcf "" variant_id 1 null null 1 QUAL 136.17
1 as_name 2 Variant sample.vcf "" variant_id 1 null null 1 FILTER RF
My Python code is working very slow. It is inserting about 1000 records in 5 minutes. I have more than 5 million records.
I am looking for some help to optimize the Python code to insert it faster. Please suggest.
INSERT INTO XXX (column list) VALUES (values list) is not the optimal I replace this by syntax "INSERT INTO XXX VALUES", formatted list
It improve drastically performances 10 minutes ==> under 15 sec
(I did it in python 3.7.5)
# list have to be a string
datalist.append("({0},'{1}',{2},'{3}','{4}','{5}','{6}',...)".format(...))
def insertdata():
global datalist
iter_datalist=iter(datalist)
args_str = ','.join(iter_datalist)
cursor.execute("INSERT INTO sampletable VALUES "+args_str)
conn.commit()
datalist=[]