bashawkbioinformaticsvcf-variant-call-formatbed

Merging multiple files with two common columns, and replace the blank to 0


I extremely appreciate if anyone could me help to merge multiple files (up to 8) with two common columns ($1$2). I want to get all values of $3 and replace the blank with 0. Here are the samples from 4 files

File1:

chr1 111001 234
chr2 22099  108

File2:

chr1 111001 42
chr1 430229 267

File3:

chr1 111001 92
chr5 663800 311

File4:

chr1 111001 129
chr2 22099  442

Desired output

chr1 111001 234 42 92 129
chr1 430229  0  267 0  0
chr2 22099  108 0  0  442
chr5 663800  0  0 311  0

I tried

awk '{ a[$1 OFS $2 FS] = a[$1 OFS $2 FS] ( a[$1 OFS $2 FS] == "" ? "" : OFS) $3 }END{ for (i in a){print i,"0",a[i]} }' OFS="\t"  file1.txt file2.txt file3.txt file4.txt | sort -k1

output

chr1    111001  0   234 42  92  129
chr1    430229  0   267
chr2    22099   0   108 442
chr5    663800  0   311

Thank very much in advance


Solution

  • One more variant, could you please try following, written and teste with shown samples.

    awk '
    {
      if(!a[FILENAME]++){
         file[++count]=FILENAME
      }
      b[$1 OFS $2 OFS FILENAME]=$NF
      c[$1 OFS $2]++
      if(!d[$1 OFS $2]++){
        e[++count1]=$1 OFS $2
      }
    }
    END{
      for(i=1;i<=length(c);i++){
        printf("%s ",e[i])
        for(j=1;j<=count;j++){
          printf("%s %s",(b[e[i] OFS file[j]]!=""?b[e[i] OFS file[j]]:0),j==count?ORS:OFS)
        }
      }
    }
    ' file{1..4} | sort -k1
    

    Output will be as follows.

    chr1 111001 234  42  92  129
    chr1 430229 0  267  0  0
    chr2 22099 108  0  0  442
    chr5 663800 0  0  311  0
    

    Explanation: Adding detailed explanation for above.

    awk '                                        ##Starting awk program from here.
    {
      if(!a[FILENAME]++){                        ##Checking condition if FILENAME is present in a then do following.
         file[++count]=FILENAME                  ##Creating file with index of count and value is current file name.
      }
      b[$1 OFS $2 OFS FILENAME]=$NF              ##Creating array b with index of 1st 2nd and filename and which has value as last field.
      c[$1 OFS $2]++                             ##Creating array c with index of 1st and 2nd field and keep increasing its value with 1.
      if(!d[$1 OFS $2]++){                       ##Checking condition if 1st and 2nd field are NOT present in d then do following.
        e[++count1]=$1 OFS $2                    ##Creating e with index of count1 with increasing value of 1 and which has first and second fields here.
      }
    }
    END{                                         ##Starting END block of this awk program from here.
      for(i=1;i<=length(c);i++){                 ##Starting for loop which runs from i=1 to till length of c here.
        printf("%s ",e[i])                       ##Printing value of array e with index i here.
        for(j=1;j<=count;j++){                   ##Starting for loop till value of count here.
          printf("%s %s",(b[e[i] OFS file[j]]!=""?b[e[i] OFS file[j]]:0),j==count?ORS:OFS)   ##Printing value of b with index of e[i] OFS file[j] if it present then print else print 0, print new line if j==count or print space.
        }
      }
    }
    ' file{1..4} | sort -k1                      ##Mentioning Input_files 1 to 4 here and sorting output with 1st field here.
    


    EDIT: As per GREAT regex GURU @anubhava sir's comments adding solution with ARGC and ARGV with GNU awk.

    awk '
    {
      b[$1 OFS $2 OFS FILENAME]=$NF
      c[$1 OFS $2]++
      if(!d[$1 OFS $2]++){
        e[++count1]=$1 OFS $2
      }
    }
    END{
      count=(ARGC-1)
      for(i=1;i<=length(c);i++){
        printf("%s ",e[i])
        for(j=1;j<=(ARGC-1);j++){
          printf("%s %s",(b[e[i] OFS ARGV[j]]!=""?b[e[i] OFS ARGV[j]]:0),j==count?ORS:OFS)
        }
      }
    }
    ' file{1..4} | sort -k1