Compare two files and report the differences in Python
import difflib
# Two sample "files", each held as a list of lines without trailing newlines.
lines1 = '''
dog
cat
bird
buffalo
gophers
hound
horse
'''.strip().splitlines()
lines2 = '''
cat
dog
bird
buffalo
gopher
horse
mouse
'''.strip().splitlines()
# Changes:
# swapped positions of cat and dog
# changed gophers to gopher
# removed hound
# added mouse

# lineterm='' keeps the generated header lines free of a trailing newline,
# matching the input lines, which carry none either.
for line in difflib.unified_diff(lines1, lines2, fromfile='file1', tofile='file2', lineterm=''):
    # A parenthesised single argument prints the same on Python 2 and 3.
    print(line)
Outputs the following:
--- file1
+++ file2
@@ -1,7 +1,7 @@
+cat
dog
-cat
bird
buffalo
-gophers
-hound
+gopher
horse
+mouse
This diff gives you context -- surrounding lines to help make it clear how the file is different. You can see "cat" here twice, because it was removed from below "dog" and added above it.
You can use n=0 to remove the context.
# n=0 requests zero lines of context around each change.
for line in difflib.unified_diff(lines1, lines2, fromfile='file1', tofile='file2', lineterm='', n=0):
    # A parenthesised single argument prints the same on Python 2 and 3.
    print(line)
Outputting this:
--- file1
+++ file2
@@ -0,0 +1 @@
+cat
@@ -2 +2,0 @@
-cat
@@ -5,2 +5 @@
-gophers
-hound
+gopher
@@ -7,0 +7 @@
+mouse
But now it's full of the "@@" lines telling you the position in the file that has changed. Let's remove the extra lines to make it more readable.
diff_lines = list(difflib.unified_diff(lines1, lines2, fromfile='file1', tofile='file2', lineterm='', n=0))
# The first two lines are always the '---'/'+++' file headers, so drop them
# by position.  Testing for a '---'/'+++' prefix instead would also discard
# genuine changes whose text begins with '--' or '++' (e.g. a removed line
# "-- note" is emitted as "--- note").
for line in diff_lines[2:]:
    # Every content line starts with '+', '-' or ' ', so only hunk headers
    # can start with '@@'.
    if not line.startswith('@@'):
        print(line)
Giving us this output:
+cat
-cat
-gophers
-hound
+gopher
+mouse
Now what do you want it to do?
If you ignore all removed lines, then you won't see that "hound" was removed.
If you're happy just showing the additions to the file, then you could do this:
diff = difflib.unified_diff(lines1, lines2, fromfile='file1', tofile='file2', lineterm='', n=0)
# Skip the two file-header lines ('--- file1' and '+++ file2').
lines = list(diff)[2:]
# Strip the one-character '+' / '-' marker to recover the original text.
added = [line[1:] for line in lines if line.startswith('+')]
removed = [line[1:] for line in lines if line.startswith('-')]

print('additions:')
for line in added:
    print(line)
# print('') emits one blank line on both Python 2 and 3; a bare `print`
# does nothing on Python 3.
print('')
print('additions, ignoring position:')
# A line that was both added and removed has only moved; leave it out.
removed_lookup = set(removed)
for line in added:
    if line not in removed_lookup:
        print(line)
Outputting:
additions:
cat
gopher
mouse
additions, ignoring position:
gopher
mouse
You can probably tell by now that there are various ways to "print the differences" of two files, so you will need to be very specific if you want more help.
Python - Compare 2 files and output differences
This is working for me:
def compare(File1, File2, output='file3.txt'):
    """Write the lines of File1 that do not occur in File2 to ``output``.

    Lines are compared as complete strings, including their line
    terminator.  Each differing line is written once, in the order of its
    first appearance in File1, so the result is deterministic.

    Args:
        File1: path of the file whose unique lines are reported.
        File2: path of the file to compare against.
        output: path of the result file; it is created or truncated.
            Defaults to 'file3.txt'.
    """
    with open(File2, 'r') as f:
        other_lines = set(f)

    already_written = set()
    # Mode 'w' creates/truncates the output, so no separate "create" step
    # followed by an append is needed.
    with open(File1, 'r') as f, open(output, 'w') as out:
        for line in f:
            if line not in other_lines and line not in already_written:
                already_written.add(line)
                out.write(line)
The lines of both files are read into sets, and the set difference gives the lines of the first file that are not present in the second file. Those lines are then written to the new file.
Compare two different files line by line in python
This solution reads both files in one pass, excludes blank lines, and prints common lines regardless of their position in the file:
# Read both inputs and keep only the lines that occur in each of them.
with open('some_file_1.txt', 'r') as file1, open('some_file_2.txt', 'r') as file2:
    same = set(file1).intersection(file2)

# A lone newline is an empty line; leave it out of the result.
same.discard('\n')

# Emit the common lines (set iteration order, i.e. not file order).
with open('some_output_file.txt', 'w') as file_out:
    file_out.writelines(same)
Python : Compare two csv files and print out differences
The problem is that you are comparing each line in fileone to the same line in filetwo. As soon as there is an extra line in one file you will find that the lines are never equal again. Try this:
with open('old.csv', 'r') as t1, open('new.csv', 'r') as t2:
    fileone = t1.readlines()
    filetwo = t2.readlines()

# Build the lookup once: set membership is O(1), whereas `in fileone` on a
# list rescans the whole old file for every line of the new file.
known_lines = set(fileone)

# Write every line of new.csv that does not appear anywhere in old.csv,
# keeping the order (and any repeats) of new.csv.
with open('update.csv', 'w') as outFile:
    for line in filetwo:
        if line not in known_lines:
            outFile.write(line)
comparing two text files - line by line comparison (involves masking) - python
This is the answer - finally cracked it myself -:)
import os
import sys
import re
import webbrowser
Comparison function - does it line by line:
def CompareFiles(str_file1, str_file2):
    '''
    Compare two multi-line strings and return the lines found in only one.

    Both texts are split on "\\n".  Empty lines are ignored.  A line counts
    as unique to one text when it does not appear anywhere in the other,
    regardless of position.

    Returns a pair (unique_file1, unique_file2) of lists; each list keeps
    the order, and any repeats, of the text it was taken from.
    '''
    file1_lines = str_file1.split("\n")
    file2_lines = str_file2.split("\n")

    # Sets give O(1) membership tests; the lists are kept for ordering.
    file1_lookup = set(file1_lines)
    file2_lookup = set(file2_lines)

    unique_file1 = [line for line in file1_lines
                    if line != '' and line not in file2_lookup]
    unique_file2 = [line for line in file2_lines
                    if line != '' and line not in file1_lookup]
    return unique_file1, unique_file2
Use this function to mask:
def Masker(pattern_lines, file2mask):
    '''
    Return file2mask with every value captured by pattern_lines masked.

    pattern_lines is an iterable of compiled regular expressions.  Each
    value returned by pattern.findall(file2mask) - a string, or a tuple of
    strings when the pattern has several groups - is replaced everywhere in
    the text by the dummy value 'xxxxxxxxxx'.

    The text is returned unchanged when nothing matches.
    '''
    dummy = 'x' * 10
    # Accumulate the replacements: starting each replace from the original
    # text would keep only the very last substitution.
    masked_file = file2mask
    for pattern in pattern_lines:
        for value in pattern.findall(file2mask):
            if isinstance(value, str):
                captured = (value,)
            elif isinstance(value, tuple):
                captured = value
            else:
                continue
            for text in captured:
                # An empty capture must be skipped: replacing '' would
                # insert the dummy value between every character.
                if text:
                    masked_file = masked_file.replace(str(text), dummy)
    return masked_file
Open the files:
# Read each input completely; the with-blocks close the files even if
# read() raises.
with open("file1.txt", "r") as f1:
    data1 = f1.read()

with open("file2.txt", "r") as f3:
    data3 = f3.read()
Create a folder to store the output file (optional):
# Directory "outputs" next to this script holds the generated report.
save_path = os.path.join(os.path.dirname(__file__), 'outputs')
# Opening a file for writing does not create missing parent directories,
# so make sure the folder exists first.
if not os.path.isdir(save_path):
    os.makedirs(save_path)
filename = os.path.normpath(os.path.join(save_path, "interim.txt"))
Pattern lines for masking:
# Source text of the masking patterns; each one is compiled with re.M.
# NOTE(review): the dots in "3000.3422." are unescaped, so they match any
# character rather than only a literal "." - confirm that this is intended.
_pattern_sources = (
    r'\- This file is located in 3000.3422.(.*) description \"(.*)\"',
    r'\- City address of file is \"(.*)\"',
    r'\- Country of file is (.*)',
)
pattern_lines = [re.compile(source, re.M) for source in _pattern_sources]
Mask the two files:
# Run the contents of both files through Masker with the same pattern list.
data1_masked = Masker(pattern_lines,data1)
data3_masked = Masker(pattern_lines,data3)
compare the two files and return the unique lines for both
# CompareFiles returns a pair of lists; unpack one per input text.
unique1, unique2 = CompareFiles(data1_masked, data3_masked)
Reporting - you can write it into a function:
# Write the comparison report.  The with-block closes the file even if a
# write fails, and the handle is not called `file`, so the Python 2 builtin
# of that name is no longer shadowed.
with open(filename, 'w') as report:
    report.write("-------------------------\n")
    report.write("\nONLY in FILE ONE\n")
    report.write("\n-------------------------\n")
    report.write('\n'.join(unique1))
    report.write("\n-------------------------\n")
    report.write("\nONLY in FILE TWO\n")
    report.write("\n-------------------------\n")
    report.write('\n'.join(unique2))
And finally open the comparison output file:
# Hand the report path to the webbrowser module so it gets displayed.
webbrowser.open(filename)
Comparing two files in python
You don't need to use range(len(fp1)). You can iterate over fp1 directly. That should fix the error.
def compareString(line1, line2):  # sub function to compare strings of files
    """Print the position of the first character where the lines differ.

    Nothing is printed when the two strings are equal.  When one string is
    a prefix of the other, the mismatch is reported at the end of the
    shorter one; the printed character is then taken from line1 and is
    empty if line1 is the shorter string.
    """
    # zip stops at the shorter string, so no index can run past either end.
    for i, (char1, char2) in enumerate(zip(line1, line2)):
        if char1 != char2:
            print('Mismatch at character ', i, char1)
            return
    if len(line1) != len(line2):
        i = min(len(line1), len(line2))
        # Slicing yields '' instead of raising when line1 ends at i.
        print('Mismatch at character ', i, line1[i:i + 1])
def compareMain():
    """Ask for two file names and compare the files line by line.

    Line k of the first file is compared with line k of the second file via
    compareString.  The comparison stops at the end of the shorter file, so
    surplus lines in the longer file are not reported.
    """
    file1 = input('Enter the name of the first file: ')  # input file1 name
    file2 = input('Enter the name of the second file: ')  # input file2 name
    # The with-block closes both files even if a comparison raises.
    with open(file1, 'r') as fp1, open(file2, 'r') as fp2:
        # zip pairs the lines up.  A nested loop would use up every line of
        # the second file while still on the first line of the first file.
        for line1, line2 in zip(fp1, fp2):
            compareString(line1, line2)  # Call compare function


compareMain()  # Execute
see if two files have the same content in python
Yes, I think hashing the files would be the best way if you have to compare several files and store the hashes for later comparison. Since hashes can collide, a byte-by-byte comparison may additionally be done, depending on the use case.
Generally a byte-by-byte comparison is sufficient and efficient, and the filecmp module already does this, among other things.
See http://docs.python.org/library/filecmp.html
e.g.
>>> import filecmp
>>> filecmp.cmp('file1.txt', 'file1.txt')
True
>>> filecmp.cmp('file1.txt', 'file2.txt')
False
Speed consideration:
Usually, if only two files have to be compared, hashing them and comparing the hashes is slower than a simple byte-by-byte comparison, provided the latter is done efficiently. For example, the code below times hashing against a byte-by-byte comparison.
Disclaimer: this is not the best way of timing or comparing two algorithms, and there is room for improvement, but it does give a rough idea. If you think it should be improved, tell me and I will change it.
import random
import string
import hashlib
import time
def getRandText(N):
    """Return a string of N characters picked at random from string.printable.

    range() is used instead of xrange(), which does not exist on Python 3.
    """
    return "".join(random.choice(string.printable) for _ in range(N))
# Generate the two benchmark inputs by calling getRandText(N) twice.
N=1000000
randText1 = getRandText(N)
randText2 = getRandText(N)
def cmpHash(text1, text2):
    """Return True when the MD5 hex digests of the two texts are equal.

    Each argument may be bytes or text.  Text is encoded as UTF-8 first,
    because hashlib only accepts bytes-like input on Python 3.
    """
    def md5_hexdigest(text):
        if not isinstance(text, bytes):
            text = text.encode('utf-8')
        return hashlib.md5(text).hexdigest()

    return md5_hexdigest(text1) == md5_hexdigest(text2)
def cmpByteByByte(text1, text2):
    """Return the result of comparing the two texts directly with ``==``."""
    texts_match = text1 == text2
    return texts_match
# Time ten calls of each comparison function on the same pair of texts.
for cmpFunc in (cmpHash, cmpByteByByte):
    st = time.time()
    for i in range(10):
        cmpFunc(randText1, randText2)
    # __name__ works on Python 2 and 3 (func_name is Python 2 only), and a
    # single pre-formatted argument keeps the print call portable too.
    print('%s %s' % (cmpFunc.__name__, time.time() - st))
and the output is
cmpHash 0.234999895096
cmpByteByByte 0.0
compare whether two python files result in same byte code (are code wise identical)
You might try using Python's built-in compile function, which can compile from a string (read in from a file in your case). For example, the code below compiles and compares the resulting code objects from two equivalent programs and one almost equivalent program, and then, just for demo purposes (something you would not want to do), executes a couple of the code objects:
import hashlib
import marshal
def compute_hash(code):
    """Return the SHA-1 hex digest of the marshal serialization of ``code``."""
    return hashlib.sha1(marshal.dumps(code)).hexdigest()
# Three small programs: source1 and source2 are the same statements laid out
# differently, source3 uses the name `a` where the others use `x`.
source1 = """x = 3
y = 4
z = x * y
print(z)
"""
source2 = "x=3;y=4;z=x*y;print(z)"
source3 = "a=3;y=4;z=a*y;print(z)"

# Compile each source string to a code object in 'exec' mode.
obj1, obj2, obj3 = [
    compile(text, '<string>', 'exec', dont_inherit=True)
    for text in (source1, source2, source3)
]

# Compare the first code object against each of the other two.
# NOTE(review): the result of obj1 == obj2 may depend on the interpreter
# version, since the two sources differ in line layout - confirm on the
# target Python.
for other in (obj2, obj3):
    print(obj1 == other)

# Demo only: run two of the compiled programs.
for code_obj in (obj1, obj3):
    exec(code_obj)
# Show the SHA-1 hex digest that compute_hash derives for obj1.
print(compute_hash(obj1))
Prints:
True
False
12
12
48632a1b64357e9d09d19e765d3dc6863ee67ab9
This will save you from having to copy py files, create pyc files, compare pyc files, etc.
Note:
The compute_hash function is for when you need a hash that is repeatable, i.e. one that returns the same value for the same code object when computed in successive program runs.
Related Topics
Multiprocessing - Pipe VS Queue
How to Convert Escaped Characters
Why Do Two Identical Lists Have a Different Memory Footprint
Attributeerror: 'Client' Object Has No Attribute 'Send_Message' (Discord Bot)
Error When Loading Cookies into a Python Request Session
How to Enable MySQL Client Auto Re-Connect with MySQLdb
How Would I Make a Method Which Is Run Every Time a Frame Is Shown in Tkinter
Gunicorn Autoreload on Source Change
Import Module Works in Terminal But Not in Idle
How to Sort a Pandas Dataframe by Index
Python Regular Expression Pattern * Is Not Working as Expected
How to Unimport a Python Module Which Is Already Imported
How to Clone a Python Generator Object
Opencv Python: Cv2.Findcontours - Valueerror: Too Many Values to Unpack
Beautiful Soup 4 Find_All Don't Find Links That Beautiful Soup 3 Finds
Tkinter: Mouse Drag a Window Without Borders, Eg. Overridedirect(1)
Using Monotonically_Increasing_Id() for Assigning Row Number to Pyspark Dataframe