I've an input VCF file that looks like this:
# ...
6 29911899 . A ACC . . ACMG=US;acmg_atav=NA;af_absent=0;af_rare=1;benign_cv=0;cases_af=0.1667;ccds_transcript=true;clingen=0;clinvar_clinrevstars=NA;clinvar_clinrevstat=NA;clinvar_clinsig=NA;clinvar_disease=NA;clinvar_disease_name=NA;clinvar_patho_SNV_missense_count=NA;clinvar_patho_SNV_nonsense_count=NA;clinvar_patho_SNV_splice_count=NA;clinvar_patho_cnv_count=NA;clinvar_patho_indel_9bp=0;clinvar_patho_indel_count=NA;clinvar_pmid=NA;clinvar_rs=NA;controls_af=NA;dom=patho5;effect=frameshift_variant;evs_gts=NA;evs_maf=0;exac_afr_af=0;exac_afr_gts=NA;exac_amr_af=0;exac_amr_gts=NA;exac_eas_af=0;exac_eas_gts=NA;exac_fin_af=0;exac_fin_gts=NA;exac_global_af=0;exac_global_gts=NA;exac_nfe_af=0;exac_nfe_gts=NA;exac_oth_af=0;exac_oth_gts=NA;exac_sas_af=0;exac_sas_gts=NA;father_dp_bin=50;father_gt=hom ref;gcs_score=0.619719801;gene_name_correct=HLA-A;gerp_score=NA;gnomad_ex_afr_af=NA;gnomad_ex_amr_af=NA;gnomad_ex_asj_af=NA;gnomad_ex_controls_af=NA;gnomad_ex_controls_afr_af=NA;gnomad_ex_controls_amr_af=NA;gnomad_ex_controls_an=NA;gnomad_ex_controls_asj_af=NA;gnomad_ex_controls_eas_af=NA;gnomad_ex_controls_fin_af=NA;gnomad_ex_controls_nfe_af=NA;gnomad_ex_controls_nhemi=NA;gnomad_ex_controls_nhet=NA;gnomad_ex_controls_nhomalt=NA;gnomad_ex_controls_sas_af=NA;gnomad_ex_eas_af=NA;gnomad_ex_filter=NA;gnomad_ex_fin_af=NA;gnomad_ex_global_af=NA;gnomad_ex_global_an=NA;gnomad_ex_global_nhemi=NA;gnomad_ex_global_nhet=NA;gnomad_ex_global_nhomalt=NA;gnomad_ex_nfe_af=NA;gnomad_ex_sas_af=NA;gnomad_gen_afr_af=0.0071;gnomad_gen_amr_af=0.0084;gnomad_gen_asj_af=0;gnomad_gen_controls_af=0.0027;gnomad_gen_controls_afr_af=0.0078;gnomad_gen_controls_amr_af=0.0048;gnomad_gen_controls_an=9916;gnomad_gen_controls_asj_af=0;gnomad_gen_controls_eas_af=0.0025;gnomad_gen_controls_fin_af=0;gnomad_gen_controls_nfe_af=0.0015;gnomad_gen_controls_nhemi=NA;gnomad_gen_controls_nhet=27;gnomad_gen_controls_nhomalt=0;gnomad_gen_eas_af=0.0029;gnomad_gen_filter=RF;gnomad_gen_fin_af=0;gnomad_gen_global_af=0.0026;gnomad_g
en_global_an=28524;gnomad_gen_global_nhemi=NA;gnomad_gen_global_nhet=74;gnomad_gen_global_nhomalt=0;gnomad_gen_nfe_af=8.162E-004;het_cases=1;hgmd_9bp=3;hgmd_class=DM;hgmd_disease=HLA-A_null_allele;hgmd_flank_2bp=NA;hgmd_flank_9bp=NA;hgmd_m1=NA;hgmd_m2=NA;hgmd_p1=NA;hgmd_p2=NA;hgmd_pmid=30156062;hgmd_site=NA;hgvs_c=c.626_627dupCC;hgvs_p=p.Lys210fs;hom_cases=0;humdiv_cat=NA;humdiv_score=NA;humvar_cat=NA;humvar_score=NA;mother_dp_bin=50;mother_gt=hom ref;mtr=NA;mtr_centile=NA;mtr_domain_centile=53.7664;mtr_exon_centile=47.8182;mtr_fdr=NA;nc_gerp_centile=88.06733103;oe_evs=95.0373404;oe_exac=97.72123894;omim={Hypersensitivity syndrome carbamazepine-induced susceptibility to} 608579 (3);patho=1;pc_gerp_centile=88.90086207;pli=0.132779665;pli_rec=0.862664913;pred=NA;pz_comp_het=NA;pz_hom=NA;pz_single_het=CPMPWES1100065;rs_name=NA;subrvis_domain_centile=95.105;subrvis_domain_name=HLA-A:143322:143322_0;subrvis_exon_centile=95.7367;subrvis_exon_name=HLA-A:E4:E4;synonymous=NA;transcript_name=ENST00000376802;trap=NA;trap_score=NA;var_name=6-29911899-A-ACC;var_type=indel;
6 43034839 . T TC . . ACMG=US;acmg_atav=NA;af_absent=1;af_rare=NA;benign_cv=0;cases_af=0.1667;ccds_transcript=true;clingen=0;clinvar_clinrevstars=NA;clinvar_clinrevstat=NA;clinvar_clinsig=NA;clinvar_disease=NA;clinvar_disease_name=NA;clinvar_patho_SNV_missense_count=NA;clinvar_patho_SNV_nonsense_count=NA;clinvar_patho_SNV_splice_count=NA;clinvar_patho_cnv_count=NA;clinvar_patho_indel_9bp=0;clinvar_patho_indel_count=NA;clinvar_pmid=NA;clinvar_rs=NA;controls_af=NA;dom=lof0;effect=frameshift_variant;evs_gts=NA;evs_maf=0;exac_afr_af=0;exac_afr_gts=NA;exac_amr_af=0;exac_amr_gts=NA;exac_eas_af=0;exac_eas_gts=NA;exac_fin_af=0;exac_fin_gts=NA;exac_global_af=0;exac_global_gts=NA;exac_nfe_af=0;exac_nfe_gts=NA;exac_oth_af=0;exac_oth_gts=NA;exac_sas_af=0;exac_sas_gts=NA;father_dp_bin=50;father_gt=hom ref;gcs_score=1.113375375;gene_name_correct=KLC4;gerp_score=NA;gnomad_ex_afr_af=NA;gnomad_ex_amr_af=NA;gnomad_ex_asj_af=NA;gnomad_ex_controls_af=NA;gnomad_ex_controls_afr_af=NA;gnomad_ex_controls_amr_af=NA;gnomad_ex_controls_an=NA;gnomad_ex_controls_asj_af=NA;gnomad_ex_controls_eas_af=NA;gnomad_ex_controls_fin_af=NA;gnomad_ex_controls_nfe_af=NA;gnomad_ex_controls_nhemi=NA;gnomad_ex_controls_nhet=NA;gnomad_ex_controls_nhomalt=NA;gnomad_ex_controls_sas_af=NA;gnomad_ex_eas_af=NA;gnomad_ex_filter=NA;gnomad_ex_fin_af=NA;gnomad_ex_global_af=NA;gnomad_ex_global_an=NA;gnomad_ex_global_nhemi=NA;gnomad_ex_global_nhet=NA;gnomad_ex_global_nhomalt=NA;gnomad_ex_nfe_af=NA;gnomad_ex_sas_af=NA;gnomad_gen_afr_af=NA;gnomad_gen_amr_af=NA;gnomad_gen_asj_af=NA;gnomad_gen_controls_af=NA;gnomad_gen_controls_afr_af=NA;gnomad_gen_controls_amr_af=NA;gnomad_gen_controls_an=NA;gnomad_gen_controls_asj_af=NA;gnomad_gen_controls_eas_af=NA;gnomad_gen_controls_fin_af=NA;gnomad_gen_controls_nfe_af=NA;gnomad_gen_controls_nhemi=NA;gnomad_gen_controls_nhet=NA;gnomad_gen_controls_nhomalt=NA;gnomad_gen_eas_af=NA;gnomad_gen_filter=NA;gnomad_gen_fin_af=NA;gnomad_gen_global_af=NA;gnomad_gen_global_an=NA;gnomad_gen_global_nh
emi=NA;gnomad_gen_global_nhet=NA;gnomad_gen_global_nhomalt=NA;gnomad_gen_nfe_af=NA;het_cases=1;hgmd_9bp=0;hgmd_class=NA;hgmd_disease=NA;hgmd_flank_2bp=NA;hgmd_flank_9bp=NA;hgmd_m1=NA;hgmd_m2=NA;hgmd_p1=NA;hgmd_p2=NA;hgmd_pmid=NA;hgmd_site=NA;hgvs_c=c.901dupC;hgvs_p=p.His301fs;hom_cases=0;humdiv_cat=NA;humdiv_score=NA;humvar_cat=NA;humvar_score=NA;mother_dp_bin=50;mother_gt=hom ref;mtr=NA;mtr_centile=NA;mtr_domain_centile=NA;mtr_exon_centile=NA;mtr_fdr=NA;nc_gerp_centile=13.25450509;oe_evs=80.84196579;oe_exac=51.53761062;omim=NA;patho=0;pc_gerp_centile=22.37607759;pli=0.236002385;pli_rec=0.763936873;pred=NA;pz_comp_het=NA;pz_hom=NA;pz_single_het=CPMPWES1100502;rs_name=NA;subrvis_domain_centile=NA;subrvis_domain_name=NA;subrvis_exon_centile=NA;subrvis_exon_name=NA;synonymous=NA;transcript_name=ENST00000458460;trap=NA;trap_score=NA;var_name=6-43034839-T-TC;var_type=indel;
6 16327915 . A ATGCTGCTGC . . ACMG=US;acmg_atav=NA;af_absent=1;af_rare=NA;benign_cv=0;cases_af=0.3333;ccds_transcript=true;clingen=0;clinvar_clinrevstars=NA;clinvar_clinrevstat=NA;clinvar_clinsig=NA;clinvar_disease=NA;clinvar_disease_name=NA;clinvar_patho_SNV_missense_count=0;clinvar_patho_SNV_nonsense_count=0;clinvar_patho_SNV_splice_count=0;clinvar_patho_cnv_count=1;clinvar_patho_indel_9bp=0;clinvar_patho_indel_count=0;clinvar_pmid=NA;clinvar_rs=NA;controls_af=NA;dom=pred0;effect=disruptive_inframe_insertion;evs_gts=NA;evs_maf=0;exac_afr_af=0;exac_afr_gts=NA;exac_amr_af=0;exac_amr_gts=NA;exac_eas_af=0;exac_eas_gts=NA;exac_fin_af=0;exac_fin_gts=NA;exac_global_af=0;exac_global_gts=NA;exac_nfe_af=0;exac_nfe_gts=NA;exac_oth_af=0;exac_oth_gts=NA;exac_sas_af=0;exac_sas_gts=NA;father_dp_bin=200;father_gt=hom ref;gcs_score=2.1738605;gene_name_correct=ATXN1;gerp_score=NA;gnomad_ex_afr_af=NA;gnomad_ex_amr_af=NA;gnomad_ex_asj_af=NA;gnomad_ex_controls_af=NA;gnomad_ex_controls_afr_af=NA;gnomad_ex_controls_amr_af=NA;gnomad_ex_controls_an=NA;gnomad_ex_controls_asj_af=NA;gnomad_ex_controls_eas_af=NA;gnomad_ex_controls_fin_af=NA;gnomad_ex_controls_nfe_af=NA;gnomad_ex_controls_nhemi=NA;gnomad_ex_controls_nhet=NA;gnomad_ex_controls_nhomalt=NA;gnomad_ex_controls_sas_af=NA;gnomad_ex_eas_af=NA;gnomad_ex_filter=NA;gnomad_ex_fin_af=NA;gnomad_ex_global_af=NA;gnomad_ex_global_an=NA;gnomad_ex_global_nhemi=NA;gnomad_ex_global_nhet=NA;gnomad_ex_global_nhomalt=NA;gnomad_ex_nfe_af=NA;gnomad_ex_sas_af=NA;gnomad_gen_afr_af=NA;gnomad_gen_amr_af=NA;gnomad_gen_asj_af=NA;gnomad_gen_controls_af=NA;gnomad_gen_controls_afr_af=NA;gnomad_gen_controls_amr_af=NA;gnomad_gen_controls_an=NA;gnomad_gen_controls_asj_af=NA;gnomad_gen_controls_eas_af=NA;gnomad_gen_controls_fin_af=NA;gnomad_gen_controls_nfe_af=NA;gnomad_gen_controls_nhemi=NA;gnomad_gen_controls_nhet=NA;gnomad_gen_controls_nhomalt=NA;gnomad_gen_eas_af=NA;gnomad_gen_filter=NA;gnomad_gen_fin_af=NA;gnomad_gen_global_af=NA;gnomad_gen_global_an=NA;gnomad
_gen_global_nhemi=NA;gnomad_gen_global_nhet=NA;gnomad_gen_global_nhomalt=NA;gnomad_gen_nfe_af=NA;het_cases=2;hgmd_9bp=0;hgmd_class=NA;hgmd_disease=NA;hgmd_flank_2bp=0;hgmd_flank_9bp=0;hgmd_m1=NA;hgmd_m2=NA;hgmd_p1=NA;hgmd_p2=NA;hgmd_pmid=NA;hgmd_site=NA;hgvs_c=c.618_626dupGCAGCAGCA;hgvs_p=p.Gln206_Gln208dup;hom_cases=0;humdiv_cat=NA;humdiv_score=NA;humvar_cat=NA;humvar_score=NA;mother_dp_bin=200;mother_gt=hom ref;mtr=NA;mtr_centile=NA;mtr_domain_centile=36.7757;mtr_exon_centile=31.5215;mtr_fdr=NA;nc_gerp_centile=5.28218103;oe_evs=25.62033245;oe_exac=23.18030973;omim=SPINOCEREBELLAR ATAXIA 1; SCA1 [AD];patho=0;pc_gerp_centile=42.23060345;pli=0.398661149;pli_rec=0.599425706;pred=1;pz_comp_het=NA;pz_hom=NA;pz_single_het=CPMPWES1100065;rs_name=NA;subrvis_domain_centile=76.9664;subrvis_domain_name=ATXN1:-:-_2;subrvis_exon_centile=0.4705;subrvis_exon_name=ATXN1:E1:E1;synonymous=0;transcript_name=ENST00000244769;trap=0;trap_score=NA;var_name=6-16327915-A-ATGCTGCTGC;var_type=indel;
# ...
What I want to get is the value of a particular sub-header in column 8 for each line, e.g. af_absent.
I tried to use an awk
script (called through awk -f <myscript.awk> <input-file>
) but I couldn't work out how to "nest" awk commands, i.e. how to access the sub-fields inside a column. E.g. with:
# Attempt at pulling a single key=value entry out of column 8.
# Records are initially split on tabs; OFS is set but print is never
# called with multiple arguments, so it has no visible effect here.
BEGIN {FS="\t"; OFS="\t"}
{
# This only REFERENCES info[$8]: it creates an element whose key is the
# whole column-8 string and whose value is empty. Nothing is split.
info[$8]
# Iterates over every key created so far (from all records read up to
# now); awk does not define the iteration order.
for (i in info)
{
print i
# Assigning FS here does not re-split the current record; the new
# separator only applies to records read afterwards. No value is ever
# stored in info, so cats is always the empty string.
FS=";"; cats=info[$3]
print cats
# Jumps to the next input record, so the loop body runs at most once
# per record.
next
}
}
END{}
### OUTPUT I got:
ACMG=US;acmg_atav=NA;af_absent=0;af_rare=1;benign_cv=0;cases_af=0.3333;ccds_transcript=true;clingen=0;clinvar_clinrevstars=NA;clinvar_clinrevstat=NA;clinvar_clinsig=NA;clinvar_disease=NA;clinvar_disease_name=NA;clinvar_patho_SNV_missense_count=2;clinvar_patho_SNV_nonsense_count=0;clinvar_patho_SNV_splice_count=0;clinvar_patho_cnv_count=0;clinvar_patho_indel_9bp=0;clinvar_patho_indel_count=1;clinvar_pmid=NA;clinvar_rs=NA;controls_af=NA;dom=lof5;effect=frameshift_variant;evs_gts=NA;evs_maf=0;exac_afr_af=3.015E-004;exac_afr_gts='4972/3/0';exac_amr_af=0;exac_amr_gts='5566/0/0';exac_eas_af=0;exac_eas_gts='4320/0/0';exac_fin_af=0;exac_fin_gts='2908/0/0';exac_global_af=3.694E-005;exac_global_gts='54133/4/0';exac_nfe_af=1.768E-005;exac_nfe_gts='28275/1/0';exac_oth_af=0;exac_oth_gts='417/0/0';exac_sas_af=0;exac_sas_gts='7675/0/0';father_dp_bin=50;father_gt=het;gcs_score=0.27914964;gene_name_correct=RHD;gerp_score=NA;gnomad_ex_afr_af=5.799E-004;gnomad_ex_amr_af=0;gnomad_ex_asj_af=4.363E-004;gnomad_ex_controls_af=3.044E-005;gnomad_ex_controls_afr_af=2.927E-004;gnomad_ex_controls_amr_af=0;gnomad_ex_controls_an=98554;gnomad_ex_controls_asj_af=0;gnomad_ex_controls_eas_af=0;gnomad_ex_controls_fin_af=0;gnomad_ex_controls_nfe_af=2.777E-005;gnomad_ex_controls_nhemi=NA;gnomad_ex_controls_nhet=3;gnomad_ex_controls_nhomalt=0;gnomad_ex_controls_sas_af=0;gnomad_ex_eas_af=0;gnomad_ex_filter=PASS;gnomad_ex_fin_af=0;gnomad_ex_global_af=6.219E-005;gnomad_ex_global_an=225102;gnomad_ex_global_nhemi=NA;gnomad_ex_global_nhet=10;gnomad_ex_global_nhomalt=2;gnomad_ex_nfe_af=1.044E-005;gnomad_ex_sas_af=0;gnomad_gen_afr_af=0.0013;gnomad_gen_amr_af=0;gnomad_gen_asj_af=0;gnomad_gen_controls_af=4.374E-004;gnomad_gen_controls_afr_af=0.0017;gnomad_gen_controls_amr_af=0;gnomad_gen_controls_an=9144;gnomad_gen_controls_asj_af=0;gnomad_gen_controls_eas_af=0;gnomad_gen_controls_fin_af=0;gnomad_gen_controls_nfe_af=0;gnomad_gen_controls_nhemi=NA;gnomad_gen_controls_nhet=4;gnomad_gen_controls_nhomalt=0;gnomad_gen_
eas_af=0;gnomad_gen_filter=PASS;gnomad_gen_fin_af=0;gnomad_gen_global_af=3.752E-004;gnomad_gen_global_an=26652;gnomad_gen_global_nhemi=NA;gnomad_gen_global_nhet=10;gnomad_gen_global_nhomalt=0;gnomad_gen_nfe_af=0;het_cases=0;hgmd_9bp=1;hgmd_class=NA;hgmd_disease=NA;hgmd_flank_2bp=NA;hgmd_flank_9bp=NA;hgmd_m1=NA;hgmd_m2=NA;hgmd_p1=NA;hgmd_p2=NA;hgmd_pmid=NA;hgmd_site=NA;hgvs_c=c.1056_1057insT;hgvs_p=p.Gly353fs;hom_cases=1;humdiv_cat=NA;humdiv_score=NA;humvar_cat=NA;humvar_score=NA;mother_dp_bin=30;mother_gt=hom ref;mtr=NA;mtr_centile=NA;mtr_domain_centile=70.953;mtr_exon_centile=42.8145;mtr_fdr=NA;nc_gerp_centile=50.65833635;oe_evs=80.5227656;oe_exac=83.16371681;omim=[Rh-negative blood type] (3);patho=0;pc_gerp_centile=91.29310345;pli=0.0060601;pli_rec=0.97290792;pred=NA;pz_comp_het=NA;pz_hom=NA;pz_single_het=CPMPWES1100065;rs_name=NA;subrvis_domain_centile=99.923;subrvis_domain_name=RHD:250217:250217_0;subrvis_exon_centile=96.4592;subrvis_exon_name=RHD:E7:E7;synonymous=NA;transcript_name=ENST00000328664;trap=NA;trap_score=NA;var_name=1-25633203-C-CT;var_type=indel;
clingen=0
af_absent=0
af_absent=0
af_absent=0
# ... rest of file with "af_absent=0"
I know that is wrong, but I couldn't find a solution in awk blogs/questions on the web.
PS: the sub-header is not necessarily always at the same position within the column, so I need to look it up by its name (e.g. af_absent). Moreover, I'd need a loop in order to access multiple values by sub-header name.
Other performant approaches (other than awk
) would be appreciated!
Thanks a lot in advance for any help!
# Split column 8 on ";" only and match the "tag=" prefix, so a ";" or "=" inside some other value cannot shift the key/value pairing.
$ awk -F'\t' -v tag='af_absent' '{n=split($8,kv,";"); for (i=1;i<=n;i++) if (index(kv[i],tag "=")==1) print substr(kv[i],length(tag)+2)}' file
0
1
1
but more usefully in the long term:
$ cat tst.awk
# Print selected tags from column 8 ("key=value;key=value;...") of each
# tab-separated input line, tab-separated, in the order listed in print.
BEGIN { FS=OFS="\t" }
{
    # Empty the lookup table for every record (portable idiom for clearing
    # an array) so a tag absent from this line prints as empty instead of
    # silently repeating the previous line's value.
    split("", f)

    # Split on ";" only, then on the FIRST "=" of each piece, so "=" inside
    # a value cannot shift the key/value pairing.
    n = split($8, tmp, /;/)
    key = ""
    for (i=1; i<=n; i++) {
        eq = index(tmp[i], "=")
        if (eq > 1) {
            key = substr(tmp[i], 1, eq-1)
            f[key] = substr(tmp[i], eq+1)
        }
        else if (tmp[i] != "" && key != "") {
            # No "=": treat the piece as the tail of a value that itself
            # contained ";" (e.g. omim=SPINOCEREBELLAR ATAXIA 1; SCA1 [AD]).
            # NOTE: a bare flag tag without "=" would also land here and be
            # appended to the preceding value; the sample data has none.
            f[key] = f[key] ";" tmp[i]
        }
    }
    print f["af_absent"], f["ACMG"], f["dom"]
}
$ awk -f tst.awk file
0 US patho5
1 US lof0
1 US pred0