c bash awk tac

Annotate last use of variables in (c) source code with (g)awk


I have a file with C source code such as

func2(&x5, &x6, x4, (arg1[3]));
func2(&x7, &x8, x4, (arg1[2]));  
func(&x13, &x14, 0x0, x12, x9);     
func(&x17, &x18, x16, x8, x5);
uint64_t x19 = (x18 + x6); 
func2(&x20, &x21, x11, 0xff);
func2(&x24, &x25, x11, 0xff11));
func(&x26, &x27, 0x0, x25, x22);
uint64_t x28 = (x27 + x23);
func(&x29, &x30, 0x0, x11, x24);
func(&x31, &x32, x30, x13, x26);
func(&x33, &x34, x32, x15, x28);

I would like to annotate the last usage of each variable, such as:

func2(&x5, &x6, x4, (arg1[3]));
func2(&x7, &x8, x4, (arg1[2]));   // 4,7
func(&x17, &x18, x16, x8, x5);    // 5,8,16,17
uint64_t x19 = (x18 + x6);        // 6,19
func2(&x20, &x21, x11, 0xff);     // 21,20
func2(&x24, &x25, x11, 0xff11));   
func(&x26, &x27, 0x0, x25, x22);  // 25,22
uint64_t x28 = (x27 + x23);       // 23,27,28
func(&x29, &x30, 0x0, x11, x24);  // 24,11,29
func(&x31, &x32, x30, x13, x26);  // 26,13,30,31

The comments list all the variables that are not used anywhere below that line. (The rationale: those variables could be reused/freed afterwards.)

The variables follow the regex /x([0-9]){1,3}/. I tried to feed the file through tac into gawk with the following command: tac file.c | gawk ' match($0,/x([0-9]){1,3}/,a) && ! seen[a[0]] {printf "%s// %s\n",$0,a[0];seen[a[0]]=1;}{print}' |tac which produces

func2(&x5, &x6, x4, (arg1[3]));
func2(&x5, &x6, x4, (arg1[3]));// x5
func2(&x7, &x8, x4, (arg1[2]));  
func2(&x7, &x8, x4, (arg1[2]));  // x7
func(&x13, &x14, 0x0, x12, x9);     
func(&x13, &x14, 0x0, x12, x9);     // x13
func(&x17, &x18, x16, x8, x5);
func(&x17, &x18, x16, x8, x5);// x17
uint64_t x19 = (x18 + x6); 
uint64_t x19 = (x18 + x6); // x19
func2(&x20, &x21, x11, 0xff);
func2(&x20, &x21, x11, 0xff);// x20
func2(&x24, &x25, x11, 0xff11));
func2(&x24, &x25, x11, 0xff11));// x24
func(&x26, &x27, 0x0, x25, x22);
func(&x26, &x27, 0x0, x25, x22);// x26
uint64_t x28 = (x27 + x23);
uint64_t x28 = (x27 + x23);// x28
func(&x29, &x30, 0x0, x11, x24);
func(&x29, &x30, 0x0, x11, x24);// x29
func(&x31, &x32, x30, x13, x26);
func(&x31, &x32, x30, x13, x26);// x31
func(&x33, &x34, x32, x15, x28);
func(&x33, &x34, x32, x15, x28);// x33

I am quite close already, but obviously this is not what I want.


Solution

  • The big issue is that match() only finds the first match of a regular expression. You have to loop through each line repeatedly to find all the variables in it.

    If you read the file once to find uses of variables, and then read it again to print out the last-used entries based on the data collected in the first pass, it's possible to do it in just gawk. This shell script wraps it for convenience so you don't have to manually specify the source file twice:

    #!/bin/sh
    # lastvars - annotate each line of a source file with the variables
    # (identifiers of the form x<digits>) whose final occurrence in the file
    # is on that line.
    #
    # Usage: lastvars FILE
    #
    # FILE is handed to gawk twice: the first pass records the last line on
    # which each variable occurs, the second pass prints every line and
    # appends "<TAB>// n1,n2,..." naming the variables last seen there.
    # Requires gawk (three-argument match() and the \< \> word anchors).

    # Without a file operand gawk would read stdin once and print nothing.
    if [ "$#" -ne 1 ]; then
        echo "usage: $0 FILE" >&2
        exit 2
    fi

    gawk '
    # Pass 1 (first copy of the file): remember, per variable number, the
    # highest line number on which it occurs.
    NR == FNR {
        rest = $0
        # match() reports only the leftmost hit, so consume the line piece
        # by piece to visit every variable on it.
        while (match(rest, /\<x([0-9]+)\>/, m)) {
            seen[m[1]] = FNR
            rest = substr(rest, RSTART + RLENGTH)
        }
        next
    }
    # Pass 2 (second copy): print each line, annotated with the variables
    # whose last occurrence is on this very line.
    {
        rest = $0
        lasts = ""
        while (match(rest, /\<x([0-9]+)\>/, m)) {
            id = m[1]
            if (seen[id] == FNR) {
                lasts = (lasts == "") ? id : lasts "," id
                # Mark as reported (FNR is never 0) so that a variable
                # occurring several times on this line is listed once.
                seen[id] = 0
            }
            rest = substr(rest, RSTART + RLENGTH)
        }
        if (lasts == "")
            print $0
        else
            printf "%s\t// %s\n", $0, lasts
    }
    ' "$1" "$1"
    

    Example usage:

    $ ./lastvars foo.c
    func2(&x5, &x6, x4, (arg1[3]));
    func2(&x7, &x8, x4, (arg1[2]));     // 7,4
    func(&x13, &x14, 0x0, x12, x9);         // 14,12,9
    func(&x17, &x18, x16, x8, x5);  // 17,16,8,5
    uint64_t x19 = (x18 + x6);  // 19,18,6
    func2(&x20, &x21, x11, 0xff);   // 20,21
    func2(&x24, &x25, x11, 0xff11));
    func(&x26, &x27, 0x0, x25, x22);    // 25,22
    uint64_t x28 = (x27 + x23); // 27,23
    func(&x29, &x30, 0x0, x11, x24);    // 29,11,24
    func(&x31, &x32, x30, x13, x26);    // 31,30,13,26
    func(&x33, &x34, x32, x15, x28);    // 33,34,32,15,28