Linux Awk Comparing Two CSV Files and Creating a New File with a Flag

linux awk comparing two csv files and creating a new file with a flag

This might work for you:

diff  -W999 --side-by-side OLD NEW |
sed '/^[^\t]*\t\s*|\t\(.*\)/{s//\1 U/;b};/^\([^\t]*\)\t*\s*<$/{s//\1 D/;b};/^.*>\t\(.*\)/{s//\1 N/;b};d'
DTL,11111111,1111111111111111,11111111111,Y,Y,xx,xx U
DTL,22222222,2222222222222222,22222222222,Y,N,cc,cc U
DTL,33333333,3333333333333333,33333333333,Y,Y,dd,dd D
DTL,77777777,7777777777777777,77777777777,N,N,ee,ee N

an awk solution along the same lines:

diff -W999 --side-by-side OLD NEW |
awk '/[|][\t]/{split($0,a,"[|][\t]");print a[2]" U"};/[\t] *<$/{split($0,a,"[\t]* *<$");print a[1]" D"};/>[\t]/{split($0,a,">[\t]");print a[2]" N"}'
DTL,11111111,1111111111111111,11111111111,Y,Y,xx,xx U
DTL,22222222,2222222222222222,22222222222,Y,N,cc,cc U
DTL,33333333,3333333333333333,33333333333,Y,Y,dd,dd D
DTL,77777777,7777777777777777,77777777777,N,N,ee,ee N

Comparing two csv files and updating a field according to the comparison using awk

You can use awk like this:

cat csv.awk

BEGIN{FS=OFS=","}
FNR==NR {
a[$1 SUBSEP $5]
next
}
{
$5 = (($1 SUBSEP $5) in a)?"Match":"No Match"
}
1

Then use it as:

awk -f csv.awk file2.csv file1.csv

ID, Name, Address, Phone,Match
1, Jim, 12 Main,123-456-7890,No Match
2, Kim,11 Bush,987-654-3210,Match
3, Tim,33 Main,111-111-1111,Match
4, Zim,66 Main,111-555-1111,No Match

and also:

awk -f csv.awk file1.csv file2.csv

ID, Name, Address, Phone,Match, Blood Type, Left/Right Handed
1, Jon, 122 Main,333-456-7890,No Match, A Pos, Right
2, Tom,111 Bush,999-654-3210,Match, A Neg, Right
3, Tam,333 Main,111-222-1111,Match, O Neg, Left
4, Zam,99 Main,555-555-1111,No Match, A Pos, Left

Update: If you want to do it a single awk command then here it is.

cat csv.awk 

BEGIN{FS=OFS=","}
{
key = $1 SUBSEP $5
}
FNR == NR {
a[key]
next
}
FILENAME == ARGV[2] {
if (key in a) {
$5 = "Match"
b[key]
}
else
$5 = "No Match"
print > "_" ARGV[2]
next
}
{
$5 = (key in b)?"Match":"No Match"
print > "_" ARGV[3]
}

Then execute it as:

awk -f csv.awk file2.csv file1.csv file2.csv &&
mv _file1.csv file1.csv &&
mv _filw2.csv file2.csv

Compare two text files and if the second file has a row which contains both the columns of first file delete that row

This assumes the pairs in file1 never have the same value in both fields:

$ cat tst.awk
NR==FNR {
pairs1[NR] = $1
pairs2[NR] = $2
next
}
{
orig = $0
gsub(/[[:space:],]+/," ")
delete vals
for (i=1; i<=NF; i++) {
vals[$i]
}
for (nr in pairs1) {
if ( (pairs1[nr] in vals) && (pairs2[nr] in vals) ) {
next
}
}
print orig
}


$ awk -f tst.awk file1 file2
2002, 5052, 7001, 1500, 2500
2003, 5051, 3500, 4500, 4952

Compare two columns from two files and append the output in any of the two files as a new column

Could you please try following, written and tested with shown samples in GNU awk.

awk '
BEGIN{
FS=OFS=","
}
FNR==NR{
arr[$2]=$NF
next
}
{
print $1,(($1 in arr)?($NF>arr[$1]?"Old_file":"New_File"):"No_Match")
}
' Input_file1 Input_file2

Explanation: Adding detailed explanation for above.

awk '                           ##Starting awk program from here.
BEGIN{ ##Starting BEGIN section of this program from here.
FS=OFS="," ##Setting field separator and outpout field separator as comma.
}
FNR==NR{ ##Checking condition if FNR==NR when Input_file1 is being read.
arr[$2]=$NF ##Creating arr with index of 2nd field and having value as last field.
next ##next will skip further statements from here.
}
{
print $1,($1 in arr)?($NF>arr[$1]?"Old_file":"New_File"):"No_Match"
##printing 1st field and checking condition if 1st field is present in arr then check
##if last field is greater than arr value then print Old_file else print
##New_file OR if 1st field is NOT in arr then print No_Match.
}
' Input_file1 Input_file2 ##Mentioning Input_file names here.

Shell script: Check if data in columns `X` from two CSV files are matched

[ "$(cut -d' ' -f1 a.txt)" = "$(cut -d' ' -f1 b.txt)" ]; echo $?

Explanation:

  1. [ "string1" = "string2" ] - The test command. If the string1 equals to the string2, it returns 0, else 1. See man test for another information.
  2. cut -d' ' -f1 a.txt - cut the first column from the file a.txt.

    • -d' ' - set the field delimiter to the space.
    • -f1 - select only the field number 1. You can use a variable, instead of the number 1 in this case, like the num=1; [ "$(cut -d' ' -f$num a.txt)" = "$(cut -d' ' -f$num b.txt)" ]; echo $?.
  3. echo $? - print the exit status of the last executed program.

Match common IDs between two huge csv files

Can do this with standard join utility

file1.txt

1 a
2 b
3 c

file2.txt

1 002
2 x90
5 d07

join example

join -1 1 -2 1 -o 1.1,1.2,2.2 file1.txt file2.txt

here join is joining from file1.field1 to file2.field2 and outputting the fields specified with the -o flag

output

1 a 002
2 b x90

Compare two files and output results according to conditionals

Using any awk in any shell on every Unix box and assuming that your quoted fields don't contain ;s and that you don't care about the output order:

$ cat tst.awk
BEGIN { FS=";" }
NR==FNR {
first[$1]
next
}
{
print
delete first[$1]
}
END {
for ( i in first ) {
print i
}
}


$ awk -f tst.awk file1.csv file2.csv
"router51";"DatacenterA - cab1";"Prod - Tenant12"
"switch33";"DatacenterB - cab14";"Prod - Tenant4"
"switch2";"DatacenterA - cab3";"Dev - Tenant5"
"router44";"DatacenterC - cab2";"Test - Tenant2"
"switch6"
"router12"
"switch10"

If you do care about the output order you could use the decorate/sort/undecorate idiom with any awk+sort+cut:

$ cat tst.awk
BEGIN { FS=OFS=";" }
NR==FNR {
first[$1]
next
}
{
prt($0)
delete first[$1]
}
END {
for ( i in first ) {
prt(i)
}
}

function prt(str, arr, alpha, numeric) {
split(str,arr)
alpha = numeric = arr[1]
sub(/[0-9].*/,"",alpha)
gsub(/[^0-9]/,"",numeric)
print alpha, numeric, str
}


$ awk -f tst.awk file1.csv file2.csv | sort -t';' -k1,1 -k2,2n | cut -d';' -f3-
"router12"
"router44";"DatacenterC - cab2";"Test - Tenant2"
"router51";"DatacenterA - cab1";"Prod - Tenant12"
"switch2";"DatacenterA - cab3";"Dev - Tenant5"
"switch6"
"switch10"
"switch33";"DatacenterB - cab14";"Prod - Tenant4"


Related Topics



Leave a reply



Submit