I extremely appreciate if anyone could me help to merge multiple files (up to 8) with two common columns ($1$2). I want to get all values of $3 and replace the blank with 0. Here are the samples from 4 files
File1:
chr1 111001 234
chr2 22099 108
File2:
chr1 111001 42
chr1 430229 267
File3:
chr1 111001 92
chr5 663800 311
File4:
chr1 111001 129
chr2 22099 442
Desired output
chr1 111001 234 42 92 129
chr1 430229 0 267 0 0
chr2 22099 108 0 0 442
chr5 663800 0 0 311 0
I tried
awk '{ a[$1 OFS $2 FS] = a[$1 OFS $2 FS] ( a[$1 OFS $2 FS] == "" ? "" : OFS) $3 }END{ for (i in a){print i,"0",a[i]} }' OFS="\t" file1.txt file2.txt file3.txt file4.txt | sort -k1
output
chr1 111001 0 234 42 92 129
chr1 430229 0 267
chr2 22099 0 108 442
chr5 663800 0 311
Thank very much in advance
One more variant, could you please try following, written and teste with shown samples.
awk '
{
if(!a[FILENAME]++){
file[++count]=FILENAME
}
b[$1 OFS $2 OFS FILENAME]=$NF
c[$1 OFS $2]++
if(!d[$1 OFS $2]++){
e[++count1]=$1 OFS $2
}
}
END{
for(i=1;i<=length(c);i++){
printf("%s ",e[i])
for(j=1;j<=count;j++){
printf("%s %s",(b[e[i] OFS file[j]]!=""?b[e[i] OFS file[j]]:0),j==count?ORS:OFS)
}
}
}
' file{1..4} | sort -k1
Output will be as follows.
chr1 111001 234 42 92 129
chr1 430229 0 267 0 0
chr2 22099 108 0 0 442
chr5 663800 0 0 311 0
Explanation: Adding detailed explanation for above.
awk ' ##Starting awk program from here.
{
if(!a[FILENAME]++){ ##Checking condition if FILENAME is present in a then do following.
file[++count]=FILENAME ##Creating file with index of count and value is current file name.
}
b[$1 OFS $2 OFS FILENAME]=$NF ##Creating array b with index of 1st 2nd and filename and which has value as last field.
c[$1 OFS $2]++ ##Creating array c with index of 1st and 2nd field and keep increasing its value with 1.
if(!d[$1 OFS $2]++){ ##Checking condition if 1st and 2nd field are NOT present in d then do following.
e[++count1]=$1 OFS $2 ##Creating e with index of count1 with increasing value of 1 and which has first and second fields here.
}
}
END{ ##Starting END block of this awk program from here.
for(i=1;i<=length(c);i++){ ##Starting for loop which runs from i=1 to till length of c here.
printf("%s ",e[i]) ##Printing value of array e with index i here.
for(j=1;j<=count;j++){ ##Starting for loop till value of count here.
printf("%s %s",(b[e[i] OFS file[j]]!=""?b[e[i] OFS file[j]]:0),j==count?ORS:OFS) ##Printing value of b with index of e[i] OFS file[j] if it present then print else print 0, print new line if j==count or print space.
}
}
}
' file{1..4} | sort -k1 ##Mentioning Input_files 1 to 4 here and sorting output with 1st field here.
EDIT: As per GREAT regex GURU @anubhava sir's comments adding solution with ARGC
and ARGV
with GNU awk
.
awk '
{
b[$1 OFS $2 OFS FILENAME]=$NF
c[$1 OFS $2]++
if(!d[$1 OFS $2]++){
e[++count1]=$1 OFS $2
}
}
END{
count=(ARGC-1)
for(i=1;i<=length(c);i++){
printf("%s ",e[i])
for(j=1;j<=(ARGC-1);j++){
printf("%s %s",(b[e[i] OFS ARGV[j]]!=""?b[e[i] OFS ARGV[j]]:0),j==count?ORS:OFS)
}
}
}
' file{1..4} | sort -k1