Comparing Two .Txt Files Using Difflib in Python

Comparing two files using difflib in python

Check whether each element of the diff starts with a + or - (which marks the line as having changed):

import difflib

# Read both files fully; the handles are closed as soon as the block exits.
with open("compare.txt") as f, open("test.txt") as g:
    flines = f.readlines()
    glines = g.readlines()

d = difflib.Differ()
# Keep only the diff rows whose first character is '+' or '-', i.e. the
# lines that were added or removed; every other row is dropped.
diffs = [x for x in d.compare(flines, glines) if x[0] in ('+', '-')]
if diffs:
    # all rows with changes
    for row in diffs:
        # Rows normally keep the newline read from the file, so add none here.
        print(row, end='')
else:
    print('No changes')

Comparing two .txt files: the difflib module tells me that a line is unique ('-') when in fact it is present in both files

A diff doesn't check whether a line is unique across the whole file; it checks whether the line is in the same place in the other file - so you should sort the lines first.

But if you want to check whether lines exist in both files, or whether they are unique to one file, it is better to convert the lines to set() and compare the sets.


Minimal working code

# Two sample "files", each represented as a list of lines.
a = ['A', 'B', 'C']
b = ['A', 'C', 'D']

print('a:', a)
print('b:', b)

# Sets ignore position and repetition, so membership is all that is compared.
set_a = set(a)
set_b = set(b)

print('--- duplicated ---')

# Intersection: lines present in both inputs.
duplicated = set_a & set_b

# Sets are unordered, so sort for stable output.
for item in sorted(duplicated):
    print(item)

print('--- unique a ---')

# Difference: lines present only in `a`.
unique_a = set_a - set_b

for item in sorted(unique_a):
    print(item)

print('--- unique b ---')

# Difference: lines present only in `b`.
unique_b = set_b - set_a

for item in sorted(unique_b):
    print(item)

Result

a: ['A', 'B', 'C']
b: ['A', 'C', 'D']
--- duplicated ---
A
C
--- unique a ---
B
--- unique b ---
D

Comparing two .txt files using difflib in Python

For starters, you need to pass strings to difflib.SequenceMatcher, not files:

# Pass the two strings to compare (not file objects), like so
difflib.SequenceMatcher(None, str1, str2)

# Or, given two open file objects, just read their contents in and compare those
difflib.SequenceMatcher(None, file1.read(), file2.read())

That'll fix your error.

To get the first non-matching string, see the difflib documentation.

Using python difflib to compare more than two files

Pure Python solution using only the standard library - no third-party libraries or extra dependencies.

Note: this solution relies on some assumptions:

  • The order of lines does not matter
  • A line either exists, or is missing (no logic to check similarity between lines)

from collections import defaultdict
import re

def transform(input):
    """Normalise multi-line text into a sorted list of comparable lines.

    Two kinds of substring are replaced by the placeholder ``<>``:

    * a run of exactly 32 characters from ``[a-zA-Z0-9_.-]`` followed by ``/``;
    * any parenthesised token made only of ``[a-zA-Z0-9_.-]`` characters,
      e.g. ``(0x00007ffde4f09000)``.

    Args:
        input: The raw text to normalise. The name shadows the builtin but is
            kept so existing keyword callers continue to work.

    Returns:
        The normalised lines, sorted, without line terminators.
    """
    # differing hash values from ldd can be ignored, we only care about
    # version and path. A raw string keeps `\/` and `\(` as regex escapes
    # instead of invalid string escapes.
    normalised = re.sub(r"([a-zA-Z0-9_.-]{32}\/|\([a-zA-Z0-9_.-]*\))", "<>", input)
    return sorted(normalised.splitlines())

def generate_diff(outputs: dict, common_threshold: int = 0) -> None:
    """Print a merged, diff-style summary of several multi-line outputs.

    Each output is normalised with ``transform``; every distinct line is then
    printed once, in sorted order, with one of these prefixes:

    * ``' '``  - the line occurs in every output;
    * ``'- '`` - the line occurs in at least ``common_threshold`` outputs but
      not in all of them; the outputs lacking it are listed in parentheses;
    * ``'+ '`` - the line occurs in fewer than ``common_threshold`` outputs;
      the outputs containing it are listed in parentheses.

    Args:
        outputs: Mapping of a label to the multi-line text for that label.
        common_threshold: how many outputs need to contain a line to consider
            it common and mark outputs that do not have it as missing.

    Raises:
        AssertionError: If ``common_threshold`` is larger than the number of
            outputs.
    """
    assert common_threshold <= len(outputs), (
        "common_threshold cannot exceed the number of outputs"
    )

    # Map each normalised line to the set of labels whose output contains it.
    mapping = defaultdict(set)
    for target, output in outputs.items():
        for line in transform(output):
            mapping[line].add(target)

    # Built once; used to work out which labels lack a given line.
    all_targets = set(outputs.keys())

    for line in sorted(mapping):
        found = mapping[line]
        if len(outputs) == len(found):
            print(' ' + line)
        elif len(found) >= common_threshold:
            # NOTE: sets are unordered, so the order of the labels inside the
            # parentheses is arbitrary and may differ between runs.
            missed_str = ",".join(map(str, all_targets - found))
            print(f'- {line} ({missed_str})')
        else:
            added_str = ",".join(map(str, found))
            print(f'+ {line} ({added_str})')

Sample execution


# Sample input: five multi-line ldd-style listings, keyed by the labels 'A'-'E'.
# Each value starts and ends with a newline because of the triple-quoted layout.
my_ldd_outputs = {
'A': """
linux-vdso.so.1 (0x00007ffde4f09000)
libtinfo.so.6 => /lib/x86_64-linux-gnu/libtinfo.so.6 (0x00007fe0594f3000)
libc.so.6 => /lib/x86_64-linux-gnu/libc.so.6 (0x00007fe0592cb000)
/lib64/ld-linux-x86-64.so.2 (0x00007fe059690000)
""",
'B': """
linux-vdso.so.1 (0x00007fff697b6000)
libc.so.6 => /lib/x86_64-linux-gnu/libc.so.6 (0x00007f1c54045000)
/lib64/ld-linux-x86-64.so.2 (0x00007f1c54299000)
""",
'C': """
linux-vdso.so.1 (0x00007fffd61f9000)
libcrypto.so.3 => /lib/x86_64-linux-gnu/libcrypto.so.3 (0x00007f08a51a3000)
libc.so.6 => /lib/x86_64-linux-gnu/libc.so.6 (0x00007f08a4f7b000)
/lib64/ld-linux-x86-64.so.2 (0x00007f08a5612000)
""",
'D': """
linux-vdso.so.1 (0x00007ffcf9ddd000)
libcrypto.so.3 => /lib/x86_64-linux-gnu/libcrypto.so.3 (0x00007fa2e381b000)
libselinux.so.1 => /lib/x86_64-linux-gnu/libselinux.so.1 (0x00007fa2e37ef000)
libc.so.6 => /lib/x86_64-linux-gnu/libc.so.6 (0x00007fa2e35c7000)
libpcre2-8.so.0 => /lib/x86_64-linux-gnu/libpcre2-8.so.0 (0x00007fa2e3530000)
/lib64/ld-linux-x86-64.so.2 (0x00007fa2e3cd7000)
""",
'E': """
linux-vdso.so.1 (0x00007ffc2deab000)
libcrypto.so.3 => /lib/x86_64-linux-gnu/libcrypto.so.3 (0x00007f31fed91000)
libz.so.1 => /lib/x86_64-linux-gnu/libz.so.1 (0x00007f31fed75000)
libselinux.so.1 => /lib/x86_64-linux-gnu/libselinux.so.1 (0x00007f31fed49000)
libgssapi_krb5.so.2 => /lib/x86_64-linux-gnu/libgssapi_krb5.so.2 (0x00007f31fecf5000)
libc.so.6 => /lib/x86_64-linux-gnu/libc.so.6 (0x00007f31feacd000)
libpcre2-8.so.0 => /lib/x86_64-linux-gnu/libpcre2-8.so.0 (0x00007f31fea34000)
/lib64/ld-linux-x86-64.so.2 (0x00007f31ff2af000)
libkrb5.so.3 => /lib/x86_64-linux-gnu/libkrb5.so.3 (0x00007f31fe969000)
libk5crypto.so.3 => /lib/x86_64-linux-gnu/libk5crypto.so.3 (0x00007f31fe93a000)
libcom_err.so.2 => /lib/x86_64-linux-gnu/libcom_err.so.2 (0x00007f31fe934000)
libkrb5support.so.0 => /lib/x86_64-linux-gnu/libkrb5support.so.0 (0x00007f31fe926000)
libkeyutils.so.1 => /lib/x86_64-linux-gnu/libkeyutils.so.1 (0x00007f31fe91f000)
libresolv.so.2 => /lib/x86_64-linux-gnu/libresolv.so.2 (0x00007f31fe909000)
"""
}
# common_threshold=2: a line found in at least two of the outputs (but not all)
# is printed with '-' and the labels lacking it; a line found in only one
# output is printed with '+' and the label containing it.
generate_diff(my_ldd_outputs, 2)

Outputs

  /lib64/ld-linux-x86-64.so.2 <>
libc.so.6 => /lib/x86_64-linux-gnu/libc.so.6 <>
+ libcom_err.so.2 => /lib/x86_64-linux-gnu/libcom_err.so.2 <> (E)
- libcrypto.so.3 => /lib/x86_64-linux-gnu/libcrypto.so.3 <> (B,A)
+ libgssapi_krb5.so.2 => /lib/x86_64-linux-gnu/libgssapi_krb5.so.2 <> (E)
+ libk5crypto.so.3 => /lib/x86_64-linux-gnu/libk5crypto.so.3 <> (E)
+ libkeyutils.so.1 => /lib/x86_64-linux-gnu/libkeyutils.so.1 <> (E)
+ libkrb5.so.3 => /lib/x86_64-linux-gnu/libkrb5.so.3 <> (E)
+ libkrb5support.so.0 => /lib/x86_64-linux-gnu/libkrb5support.so.0 <> (E)
- libpcre2-8.so.0 => /lib/x86_64-linux-gnu/libpcre2-8.so.0 <> (C,B,A)
+ libresolv.so.2 => /lib/x86_64-linux-gnu/libresolv.so.2 <> (E)
- libselinux.so.1 => /lib/x86_64-linux-gnu/libselinux.so.1 <> (C,B,A)
+ libtinfo.so.6 => /lib/x86_64-linux-gnu/libtinfo.so.6 <> (A)
+ libz.so.1 => /lib/x86_64-linux-gnu/libz.so.1 <> (E)
linux-vdso.so.1 <>

Compare two files in Python, ignoring commented lines

I would eliminate the #-marked lines using a list comprehension:

...
with open('testfile1') as text1, open('testfile2') as text2:
    # Filter out lines that begin with '#' before diffing. Both list
    # comprehensions are evaluated here, while the files are still open, so
    # the diff can be consumed after the block exits.
    diff = difflib.ndiff(
        [line for line in text1 if not line.startswith('#')],
        [line for line in text2 if not line.startswith('#')],
    )
...


Related Topics



Leave a reply



Submit