Tags: python, ssh, diff, difflib, ldd

Using python difflib to compare more than two files


I would like to get an overview of, e.g., the ldd dependency lists of multiple (3+) computers by comparing them with each other and highlighting the differences. For example, if I have a dict that looks as follows:

my_ldd_outputs = {
  01:"<ldd_output>",
  02:"<ldd_output>", 
  ...
  09:"<ldd_output>",
  10:"<ldd_output>"
}

I would like the output to look something like

<identical line 1>
<identical line 2>
<identical line 3>
<differing line 4> (computer 01 02)
<differing line 4> (computer 04 05 06 07)
<differing line 4> (computer 08 09 10)
<identical line 5>
<identical line 6>
...

My first approach involved python difflib, where my idea was to first get to a data structure in which all the ldd_output lists (just the result split on \n) from the above-mentioned my_ldd_outputs dictionary have the same length, and any line that is missing from one ldd_output but exists in another is filled in with a placeholder string. So if two files looked like this:

# Example input: raw output of the first machine; it contains <extra line 5>.
ldd_1 = """
<identical line 1>
<identical line 2>
<differing line 3>
<identical line 4>
<extra line 5>
<identical line 6>
"""

# Example input: raw output of the second machine; <extra line 5> is absent.
ldd_2 = """
<identical line 1>
<identical line 2>
<differing line 3>
<identical line 4>
<identical line 6>
"""

My goal was to store those files as

# Desired aligned form of the first output: unchanged, it already has every line.
ldd_1 = """
<identical line 1>
<identical line 2>
<differing line 3>
<identical line 4>
<extra line 5>
<identical line 6>
"""

# Desired aligned form of the second output: a <None> placeholder stands in for
# the missing <extra line 5>, so both outputs have the same number of lines.
ldd_2 = """
<identical line 1>
<identical line 2>
<differing line 3>
<identical line 4>
<None>
<identical line 6>
"""

And ultimately just iterate over every line of the converted files (which now all have the same length) and compare each line in terms of their differences and ignore any <None> entries so the diff can be printed consecutively.

I created a function that uses python difflib to fill the lines missing from one file with a <None> string. However, I am not sure how to expand this function to handle an arbitrary number of files:

def generate_diff(file_1, file_2):
    """Align two ldd outputs line by line, padding gaps with ``"<None>"``.

    Both inputs are normalised (hash values and parenthesised addresses are
    replaced by ``"<>"``), diffed with ``difflib.ndiff`` and turned into two
    lists of equal length. Lines present in both inputs appear in both lists
    at the same index; a line present in only one input is paired with the
    placeholder ``"<None>"`` in the other list. Blank lines are ignored.

    Args:
        file_1: Complete text of the first ldd output.
        file_2: Complete text of the second ldd output.

    Returns:
        A tuple ``(list_1, list_2)`` of stripped lines, ``len(list_1) ==
        len(list_2)``.
    """
    def remove_hashvalues(text):
        # differing hashvalues from ldd can be ignored, we only care about version and path
        return re.sub(r"([a-zA-Z0-9_.-]{32}\/|\([a-zA-Z0-9_.-]*\))", "<>", text)

    list_1 = []
    list_2 = []
    # Lines of the change group currently being collected: removed from
    # file_1 ("- ") and added in file_2 ("+ ").
    lost = []
    gained = []

    def flush():
        # Emit the pending change group, padding the shorter side so both
        # result lists stay the same length.
        shortfall = len(gained) - len(lost)
        if shortfall > 0:
            lost.extend(["<None>"] * shortfall)
        elif shortfall < 0:
            gained[:0] = ["<None>"] * -shortfall
        list_1.extend(lost)
        list_2.extend(gained)
        lost.clear()
        gained.clear()

    diff = difflib.ndiff(
        remove_hashvalues(file_1).splitlines(),
        remove_hashvalues(file_2).splitlines(),
    )
    for entry in diff:
        # Every ndiff line starts with a two-character tag; read the tag
        # before stripping so the leading "  " of common lines is not lost.
        tag, content = entry[:2], entry[2:].strip()
        if tag == "? " or not content:
            # "? " lines are intraline hints, not content; blank lines are noise.
            continue
        if tag == "- ":
            if gained:
                # A removal after additions starts a new change group.
                flush()
            lost.append(content)
        elif tag == "+ ":
            gained.append(content)
        else:
            flush()
            list_1.append(content)
            list_2.append(content)
    flush()
    return list_1, list_2

I also found this tool that allows the comparison of multiple files, but unfortunately it's not designed to compare code.

EDIT: I adjusted the solution suggestion of @AyoubKaanich to create a more simplified version that does what I want:

from collections import defaultdict
import re
def transform(input):
    """Normalise one ldd output and return its lines sorted alphabetically.

    Any 32-character run of ``[a-zA-Z0-9_.-]`` followed by ``/`` and any
    parenthesised token (e.g. a load address) is replaced by ``"<>"``.

    Args:
        input: Complete text of one ldd output.

    Returns:
        The normalised lines as a sorted list of strings.
    """
    # differing hashvalues can be ignored, we only care about version and path
    # Raw string: "\/" and "\(" are invalid escapes in a normal string literal.
    normalised = re.sub(r"([a-zA-Z0-9_.-]{32}\/|\([a-zA-Z0-9_.-]*\))", "<>", input)
    return sorted(normalised.splitlines())
def generate_diff(outputs: dict):
    """Build a merged, alphabetically sorted overview of several ldd outputs.

    Args:
        outputs: Maps a target name to the text of its ldd output.

    Returns:
        A list with one entry per distinct (normalised, stripped) line.
        A line contained in every output is stored as a plain string. Any
        other line is stored as a tuple ``(colored_line, targets)`` where
        ``targets`` is the set of targets containing it; consecutive
        differing lines that share the same first space-separated token
        get the same ANSI color.
    """
    targets_by_line = defaultdict(set)
    for target, output in outputs.items():
        for raw_line in transform(output):
            targets_by_line[raw_line.strip()].add(target)

    total = len(outputs)
    result = []
    group_token = None
    color_index = 0
    for line in sorted(targets_by_line):
        owners = targets_by_line[line]
        if len(owners) == total:
            # A line shared by everyone ends the current color group.
            if group_token:
                group_token = None
            result.append(line)
            continue
        first_token = line.split(" ")[0]
        if first_token != group_token:
            # New dependency name: switch to the next color.
            group_token = first_token
            color_index += 1
        result.append((f"\033[3{color_index%6+1}m{line}\033[0m", owners))
    return result

The only downside is that this does not handle diffs where the string varies in an arbitrary section rather than just at the beginning, which is what difflib is good at detecting. However, in the case of ldd, since the dependency name is always listed first, sorting alphabetically and comparing the first section of each line works.


Solution

  • Pure Python solution, no libraries or extra dependencies.

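    Note: this solution relies on some assumptions: the order of lines within each output does not matter (they are sorted before comparison), and a line repeated within one output counts only once (targets are collected in sets).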

    
    from collections import defaultdict
    import re
    
    def transform(input):
        """Normalise one ldd output and return its lines sorted alphabetically.

        Any 32-character run of ``[a-zA-Z0-9_.-]`` followed by ``/`` and any
        parenthesised token (e.g. a load address) is replaced by ``"<>"``.

        Args:
            input: Complete text of one ldd output.

        Returns:
            The normalised lines as a sorted list of strings.
        """
        # differing hashvalues from ldd can be ignored, we only care about version and path
        # Raw string: "\/" and "\(" are invalid escapes in a normal string literal.
        normalised = re.sub(r"([a-zA-Z0-9_.-]{32}\/|\([a-zA-Z0-9_.-]*\))", "<>", input)
        return sorted(normalised.splitlines())
    
    def generate_diff(outputs: dict, common_threshold = 0):
        """
        Print a merged, alphabetically sorted diff of several ldd outputs.

        Each distinct normalised line is printed once, prefixed with:
          '  '  the line occurs in every output
          '- '  the line occurs in at least common_threshold outputs; the
                outputs that lack it are listed in parentheses
          '+ '  the line occurs in fewer than common_threshold outputs; the
                outputs that contain it are listed in parentheses
        The names in parentheses are sorted so the printed result is the
        same on every run.

        outputs: maps an output name to the text of its ldd output
        common_threshold: how many outputs need to contain line to consider it common
            and mark outputs that do not have it as missing
        """
        assert common_threshold <= len(outputs), "common_threshold exceeds the number of outputs"

        mapping = defaultdict(set)
        for target, output in outputs.items():
            for line in transform(output):
                mapping[line].add(target)

        all_targets = set(outputs)
        for line in sorted(mapping):
            found = mapping[line]
            if len(outputs) == len(found):
                print('  ' + line)
            elif len(found) >= common_threshold:
                # Sets iterate in arbitrary order; sort the names for stable output.
                missed_str = ",".join(sorted(map(str, all_targets - found)))
                print(f'- {line}  ({missed_str})')
            else:
                added_str = ",".join(sorted(map(str, found)))
                print(f'+ {line}  ({added_str})')
    
    

    Sample execution

    
    # Sample ldd outputs of five targets, keyed by target name.
    my_ldd_outputs = {
    'A': """
    linux-vdso.so.1 (0x00007ffde4f09000)
    libtinfo.so.6 => /lib/x86_64-linux-gnu/libtinfo.so.6 (0x00007fe0594f3000)
    libc.so.6 => /lib/x86_64-linux-gnu/libc.so.6 (0x00007fe0592cb000)
    /lib64/ld-linux-x86-64.so.2 (0x00007fe059690000)
    """,
    'B': """
    linux-vdso.so.1 (0x00007fff697b6000)
    libc.so.6 => /lib/x86_64-linux-gnu/libc.so.6 (0x00007f1c54045000)
    /lib64/ld-linux-x86-64.so.2 (0x00007f1c54299000)
    """,
    'C': """
    linux-vdso.so.1 (0x00007fffd61f9000)
    libcrypto.so.3 => /lib/x86_64-linux-gnu/libcrypto.so.3 (0x00007f08a51a3000)
    libc.so.6 => /lib/x86_64-linux-gnu/libc.so.6 (0x00007f08a4f7b000)
    /lib64/ld-linux-x86-64.so.2 (0x00007f08a5612000)
    """,
    'D': """
    linux-vdso.so.1 (0x00007ffcf9ddd000)
    libcrypto.so.3 => /lib/x86_64-linux-gnu/libcrypto.so.3 (0x00007fa2e381b000)
    libselinux.so.1 => /lib/x86_64-linux-gnu/libselinux.so.1 (0x00007fa2e37ef000)
    libc.so.6 => /lib/x86_64-linux-gnu/libc.so.6 (0x00007fa2e35c7000)
    libpcre2-8.so.0 => /lib/x86_64-linux-gnu/libpcre2-8.so.0 (0x00007fa2e3530000)
    /lib64/ld-linux-x86-64.so.2 (0x00007fa2e3cd7000)
    """,
    'E': """
    linux-vdso.so.1 (0x00007ffc2deab000)
    libcrypto.so.3 => /lib/x86_64-linux-gnu/libcrypto.so.3 (0x00007f31fed91000)
    libz.so.1 => /lib/x86_64-linux-gnu/libz.so.1 (0x00007f31fed75000)
    libselinux.so.1 => /lib/x86_64-linux-gnu/libselinux.so.1 (0x00007f31fed49000)
    libgssapi_krb5.so.2 => /lib/x86_64-linux-gnu/libgssapi_krb5.so.2 (0x00007f31fecf5000)
    libc.so.6 => /lib/x86_64-linux-gnu/libc.so.6 (0x00007f31feacd000)
    libpcre2-8.so.0 => /lib/x86_64-linux-gnu/libpcre2-8.so.0 (0x00007f31fea34000)
    /lib64/ld-linux-x86-64.so.2 (0x00007f31ff2af000)
    libkrb5.so.3 => /lib/x86_64-linux-gnu/libkrb5.so.3 (0x00007f31fe969000)
    libk5crypto.so.3 => /lib/x86_64-linux-gnu/libk5crypto.so.3 (0x00007f31fe93a000)
    libcom_err.so.2 => /lib/x86_64-linux-gnu/libcom_err.so.2 (0x00007f31fe934000)
    libkrb5support.so.0 => /lib/x86_64-linux-gnu/libkrb5support.so.0 (0x00007f31fe926000)
    libkeyutils.so.1 => /lib/x86_64-linux-gnu/libkeyutils.so.1 (0x00007f31fe91f000)
    libresolv.so.2 => /lib/x86_64-linux-gnu/libresolv.so.2 (0x00007f31fe909000)
    """
    }
    # common_threshold=2: a line found in at least two (but not all) outputs is
    # reported with '-' and the targets lacking it; rarer lines get '+'.
    generate_diff(my_ldd_outputs, 2)
    
    

    Outputs

      /lib64/ld-linux-x86-64.so.2 <>
      libc.so.6 => /lib/x86_64-linux-gnu/libc.so.6 <>
    + libcom_err.so.2 => /lib/x86_64-linux-gnu/libcom_err.so.2 <>  (E)
    - libcrypto.so.3 => /lib/x86_64-linux-gnu/libcrypto.so.3 <>  (A,B)
    + libgssapi_krb5.so.2 => /lib/x86_64-linux-gnu/libgssapi_krb5.so.2 <>  (E)
    + libk5crypto.so.3 => /lib/x86_64-linux-gnu/libk5crypto.so.3 <>  (E)
    + libkeyutils.so.1 => /lib/x86_64-linux-gnu/libkeyutils.so.1 <>  (E)
    + libkrb5.so.3 => /lib/x86_64-linux-gnu/libkrb5.so.3 <>  (E)
    + libkrb5support.so.0 => /lib/x86_64-linux-gnu/libkrb5support.so.0 <>  (E)
    - libpcre2-8.so.0 => /lib/x86_64-linux-gnu/libpcre2-8.so.0 <>  (A,B,C)
    + libresolv.so.2 => /lib/x86_64-linux-gnu/libresolv.so.2 <>  (E)
    - libselinux.so.1 => /lib/x86_64-linux-gnu/libselinux.so.1 <>  (A,B,C)
    + libtinfo.so.6 => /lib/x86_64-linux-gnu/libtinfo.so.6 <>  (A)
    + libz.so.1 => /lib/x86_64-linux-gnu/libz.so.1 <>  (E)
      linux-vdso.so.1 <>