Comparing Two Unsorted Lists in Linux, Listing the Unique in the Second File

Comparing two unsorted lists in linux, listing the unique in the second file

grep -Fxv -f first-file.txt second-file.txt

Basically looks for all lines in second-file.txt which don't match any line in first-file.txt. Might be slow if the files are large.

Also, once you sort the files (use sort -n if they are numeric), comm should also work. Try this:

comm -23 second-file-sorted.txt first-file-sorted.txt

Difference between two lists using Bash

Use the comm(1) command to compare the two files. They both need to be sorted, which you can do beforehand if they are large, or you can do it inline with bash process substitution.

comm can take a combination of the flags -1, -2 and -3 indicating which file to suppress lines from (unique to file 1, unique to file 2 or common to both).

To get the lines only in the old file:

comm -23 <(sort /tmp/oldList) <(sort /tmp/newList)

To get the lines only in the new file:

comm -13 <(sort /tmp/oldList) <(sort /tmp/newList)

You can feed that into a while read loop to process each line:

while read old ; do
...do stuff with $old
done < <(comm -23 <(sort /tmp/oldList) <(sort /tmp/newList))

and similarly for the new lines.

Compare two unsorted files and print unique elements from each file

You want sort -n and uniq -u:

$ sort -n file1 file2 | uniq -u

5 6 7 8
13 14 15 16
21 22 23 24

# Redirect to file3
$ sort -n file1 file2 | uniq -u > file3

Edit:

$ awk '{u[$0]++}END{for(k in u)if(u[k]==1)print k}' file1 file2

5 6 7 8
21 22 23 24
13 14 15 16

Here u is the name of an associative array; you could name it anything (I chose u, short for unique). The keys (k) in the array are the lines in the files, so every time a duplicate line is seen its count is increased. After the array is built we loop through it and print a key only if it was seen exactly once. This code should help clear it up:

$ awk '{uniq[$0]++}END{for (key in uniq)print uniq[key]": "key}' file1 file2
2: 9 10 11 12
1: 5 6 7 8
1: 21 22 23 24
1: 13 14 15 16
2: 17 18 19 20
2: 1 2 3 4

How to compare two text files with unsorted, and slightly different lines

Why not write a little program for the job, so it works on both platforms? It's easily accomplished in some platform-independent C code:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* One line of an input file, carried alongside its tokenized form so
 * lines can be compared word-by-word instead of byte-by-byte. */
typedef struct Line
{
char *line;   /* verbatim line text as read (newline included), heap-owned */
char *tokens; /* second heap copy of the line, mutated in place by strtok() */
size_t nwords; /* number of entries in words[] */
const char **words; /* pointers INTO tokens, one per leading word; freed separately */
} Line;

/* Duplicate a NUL-terminated string into freshly allocated storage
 * (equivalent to POSIX strdup). Exits the whole program on allocation
 * failure, so the return value is never NULL; the caller owns and must
 * free() the copy. */
char *copyString(const char *s)
{
    size_t size = strlen(s) + 1;  /* include the terminating NUL */
    char *r = malloc(size);
    if (!r) exit(EXIT_FAILURE);
    memcpy(r, s, size);  /* length already known: memcpy avoids strcpy's rescan */
    return r;
}

/* qsort()-compatible comparator ordering Lines by their word arrays.
 * Words are compared pairwise with strcmp; if all shared words match,
 * the line with fewer words (a strict prefix) sorts first. */
int compareLines(const void *a, const void *b)
{
    const Line *la = a;
    const Line *lb = b;
    size_t shared = la->nwords < lb->nwords ? la->nwords : lb->nwords;

    for (size_t i = 0; i < shared; ++i)
    {
        int cmp = strcmp(la->words[i], lb->words[i]);
        if (cmp != 0) return cmp;
    }
    /* every shared word matched: the longer line orders after the shorter */
    if (la->nwords == lb->nwords) return 0;
    return la->nwords > lb->nwords ? 1 : -1;
}

/* Read every line of f into a heap array of Line, sorted by the first
 * wordCount words (see compareLines). Each entry keeps the verbatim
 * line plus an index of up to wordCount whitespace-separated words.
 * NOTE: lines longer than 1023 bytes are split across entries (fgets
 * buffer limit). Exits the program on allocation failure. Stores the
 * array through linesptr (never NULL) and returns its length; the
 * caller releases it with freeLines(). */
size_t readFile(Line **linesptr, FILE *f, size_t wordCount)
{
    size_t cap = 256;
    size_t n = 0;
    char buf[1024];

    Line *lines = malloc(cap * sizeof *lines);
    if (!lines) exit(EXIT_FAILURE);

    while (fgets(buf, sizeof buf, f))
    {
        if (n == cap)
        {
            /* grow via a temporary so the old pointer is not clobbered */
            cap *= 2;
            Line *grown = realloc(lines, cap * sizeof *lines);
            if (!grown) exit(EXIT_FAILURE);
            lines = grown;
        }
        lines[n].line = copyString(buf);
        lines[n].tokens = copyString(buf); /* strtok() destroys this copy */
        lines[n].words = malloc(wordCount * sizeof *lines[n].words);
        if (!lines[n].words) exit(EXIT_FAILURE);

        size_t c = 0;
        char *word = strtok(lines[n].tokens, " \t");
        while (word && c < wordCount)
        {
            lines[n].words[c++] = word;
            if (c == wordCount) break; /* key complete; don't tokenize further */
            word = strtok(0, " \t");
        }
        lines[n].nwords = c;
        /* shrink words[] to the words actually found; skip when c == 0,
         * since realloc(p, 0) may legitimately return NULL and must not
         * be mistaken for an allocation failure */
        if (c > 0)
        {
            const char **shrunk = realloc(lines[n].words, c * sizeof *shrunk);
            if (!shrunk) exit(EXIT_FAILURE);
            lines[n].words = shrunk;
        }
        ++n;
    }
    /* trim the array itself; an empty file gives n == 0, where
     * realloc(lines, 0) could return NULL without being an error */
    if (n > 0)
    {
        Line *trimmed = realloc(lines, n * sizeof *lines);
        if (!trimmed) exit(EXIT_FAILURE);
        lines = trimmed;
    }
    qsort(lines, n, sizeof *lines, compareLines);
    *linesptr = lines;
    return n;
}

/* Release all per-line allocations made by readFile(), then the array
 * itself. Safe ownership mirror of readFile: three buffers per entry. */
void freeLines(Line *lines, size_t n)
{
    while (n-- > 0)
    {
        free(lines[n].words);
        free(lines[n].tokens);
        free(lines[n].line);
    }
    free(lines);
}

/* Entry point: compare two files by the first n words of each line.
 * Both files are loaded and sorted (readFile), then merged in
 * lock-step like comm(1); lines unique to either file are printed
 * prefixed with that file's name, lines whose first n words match are
 * suppressed as common. */
int main(int argc, char **argv)
{
    if (argc != 4)
    {
        fprintf(stderr, "Usage: %s [n] [file1] [file2]\n", argv[0]);
        return EXIT_FAILURE;
    }

    /* strtol instead of atoi: atoi silently accepts "3x" and negative
     * counts — a negative value would convert to a huge size_t below */
    char *end;
    errno = 0;
    long nval = strtol(argv[1], &end, 10);
    if (errno != 0 || end == argv[1] || *end != '\0' || nval <= 0)
    {
        fprintf(stderr, "%s: invalid word count '%s'\n", argv[0], argv[1]);
        return EXIT_FAILURE;
    }
    size_t nwords = (size_t)nval;

    FILE *f1 = fopen(argv[2], "r");
    if (!f1)
    {
        perror(argv[2]);
        return EXIT_FAILURE;
    }
    FILE *f2 = fopen(argv[3], "r");
    if (!f2)
    {
        perror(argv[3]);
        fclose(f1); /* don't leak the first handle */
        return EXIT_FAILURE;
    }

    Line *f1lines = 0;
    size_t nf1lines = readFile(&f1lines, f1, nwords);
    if (!f1lines)
    {
        fclose(f1);
        fclose(f2);
        return EXIT_FAILURE;
    }

    Line *f2lines = 0;
    size_t nf2lines = readFile(&f2lines, f2, nwords);
    if (!f2lines)
    {
        freeLines(f1lines, nf1lines);
        fclose(f1);
        fclose(f2);
        return EXIT_FAILURE;
    }

    fclose(f1);
    fclose(f2);

    size_t f1pos = 0;
    size_t f2pos = 0;

    /* merge step: both arrays are sorted, so always advance the side
     * holding the smaller line; equal keys are common and skipped */
    while (f1pos < nf1lines && f2pos < nf2lines)
    {
        int cmp = compareLines(f1lines + f1pos, f2lines + f2pos);
        if (cmp < 0)
        {
            printf("%s: %s", argv[2], f1lines[f1pos++].line);
        }
        else if (cmp > 0)
        {
            printf("%s: %s", argv[3], f2lines[f2pos++].line);
        }
        else
        {
            ++f1pos;
            ++f2pos;
        }
    }

    /* leftovers on either side are unique by construction */
    while (f1pos < nf1lines)
    {
        printf("%s: %s", argv[2], f1lines[f1pos++].line);
    }
    while (f2pos < nf2lines)
    {
        printf("%s: %s", argv[3], f2lines[f2pos++].line);
    }

    freeLines(f1lines, nf1lines);
    freeLines(f2lines, nf2lines);

    return EXIT_SUCCESS;
}

If you use gcc, compile for example with

gcc -s -g0 -O2 -std=c11 -Wall -Wextra -pedantic -ofinduniq finduniq.c

Demo:

$ ./finduniq 4 test1.txt test2.txt
test2.txt: an apple a day keeps the doctor away
test1.txt: lorem ipsum dolor

$ ./finduniq 6 test1.txt test2.txt
test2.txt: an apple a day keeps the doctor away
test2.txt: jumps over the very lazy *chicken*
test1.txt: jumps over the very lazy dog
test1.txt: lorem ipsum dolor
test2.txt: the quick brown fox *swims*
test1.txt: the quick brown fox jumps

How to get the difference (only additions) between two files in linux

diff and then grep for the edit type you want.

diff -u A1 A2 | grep -E "^\+"

Fast way of finding lines in one file that are not in another?

You can achieve this by controlling the formatting of the old/new/unchanged lines in GNU diff output:

diff --new-line-format="" --unchanged-line-format=""  file1 file2

The input files should be sorted for this to work. With bash (and zsh) you can sort in-place with process substitution <( ):

diff --new-line-format="" --unchanged-line-format="" <(sort file1) <(sort file2)

In the above new and unchanged lines are suppressed, so only changed (i.e. removed lines in your case) are output. You may also use a few diff options that other solutions don't offer, such as -i to ignore case, or various whitespace options (-E, -b, -v etc) for less strict matching.


Explanation

The options --new-line-format, --old-line-format and --unchanged-line-format let you control the way diff formats the differences, similar to printf format specifiers. These options format new (added), old (removed) and unchanged lines respectively. Setting one to empty "" prevents output of that kind of line.

If you are familiar with unified diff format, you can partly recreate it with:

diff --old-line-format="-%L" --unchanged-line-format=" %L" \
--new-line-format="+%L" file1 file2

The %L specifier is the line in question, and we prefix each with "+" "-" or " ", like diff -u
(note that it only outputs differences, it lacks the --- +++ and @@ lines at the top of each grouped change).
You can also use this to do other useful things like number each line with %dn.


The diff method (along with other suggestions comm and join) only produce the expected output with sorted input, though you can use <(sort ...) to sort in place. Here's a simple awk (nawk) script (inspired by the scripts linked-to in Konsolebox's answer) which accepts arbitrarily ordered input files, and outputs the missing lines in the order they occur in file1.

# output lines in file1 that are not in file2
BEGIN { FS="" } # preserve whitespace
(NR==FNR) { ll1[FNR]=$0; nl1=FNR; } # file1, index by lineno
(NR!=FNR) { ss2[$0]++; } # file2, index by string
END {
for (ll=1; ll<=nl1; ll++) if (!(ll1[ll] in ss2)) print ll1[ll]
}

This stores the entire contents of file1 line by line in a line-number indexed array ll1[], and the entire contents of file2 line by line in a line-content indexed associative array ss2[]. After both files are read, iterate over ll1 and use the in operator to determine if the line in file1 is present in file2. (This will have different output to the diff method if there are duplicates.)

In the event that the files are sufficiently large that storing them both causes a memory problem, you can trade CPU for memory by storing only file1 and deleting matches along the way as file2 is read.

BEGIN { FS="" }
(NR==FNR) { # file1, index by lineno and string
ll1[FNR]=$0; ss1[$0]=FNR; nl1=FNR;
}
(NR!=FNR) { # file2
if ($0 in ss1) { delete ll1[ss1[$0]]; delete ss1[$0]; }
}
END {
for (ll=1; ll<=nl1; ll++) if (ll in ll1) print ll1[ll]
}

The above stores the entire contents of file1 in two arrays, one indexed by line number ll1[], one indexed by line content ss1[]. Then as file2 is read, each matching line is deleted from ll1[] and ss1[]. At the end the remaining lines from file1 are output, preserving the original order.

In this case, with the problem as stated, you can also divide and conquer using GNU split (filtering is a GNU extension), repeated runs with chunks of file1 and reading file2 completely each time:

split -l 20000 --filter='gawk -f linesnotin.awk - file2' < file1

Note the use and placement of - meaning stdin on the gawk command line. This is provided by split from file1, in chunks of 20,000 lines per invocation.

For users on non-GNU systems, there is almost certainly a GNU coreutils package you can obtain, including on OSX as part of the Apple Xcode tools which provides GNU diff, awk, though only a POSIX/BSD split rather than a GNU version.

How to efficiently compare two unordered lists (not sets)?

O(n): The Counter() method is best (if your objects are hashable):

def compare(s, t):
return Counter(s) == Counter(t)

O(n log n): The sorted() method is next best (if your objects are orderable):

def compare(s, t):
return sorted(s) == sorted(t)

O(n * n): If the objects are neither hashable, nor orderable, you can use equality:

def compare(s, t):
t = list(t) # make a mutable copy
try:
for elem in s:
t.remove(elem)
except ValueError:
return False
return not t

Comparing two files in linux terminal

Here is my solution for this :

mkdir temp
mkdir results
cp /usr/share/dict/american-english ~/temp/american-english-dictionary
cp /usr/share/dict/british-english ~/temp/british-english-dictionary
cat ~/temp/american-english-dictionary | wc -l > ~/results/count-american-english-dictionary
cat ~/temp/british-english-dictionary | wc -l > ~/results/count-british-english-dictionary
grep -Fxf ~/temp/american-english-dictionary ~/temp/british-english-dictionary > ~/results/common-english
grep -Fxvf ~/results/common-english ~/temp/american-english-dictionary > ~/results/unique-american-english
grep -Fxvf ~/results/common-english ~/temp/british-english-dictionary > ~/results/unique-british-english

Comparing two unsorted text files to find intersection file

Here is a simple GNU awk that does what (I think) you are trying to do:

$ gawk 'BEGIN{FS=OFS=", "} NR==FNR{db[$1 $2 $3 $4 $5 $6]; next}
$1~/ARFCN: / && !($1 $2 $3 $4 $5 $6 in db)' db.txt rec.txt

Prints:

ARFCN:   56, Freq:  946.2M, CID:     0, LAC:     0, MCC:   0, MNC:   0, Pwr: -50
ARFCN: 100, Freq: 955.0M, CID: 667, LAC: 1007, MCC: 410, MNC: 3, Pwr: -27

This works by comparing the first 6 fields and ignoring Pwr since that might be variable depending on conditions.

If all fields are relevant, you can simplify to:

$ gawk 'BEGIN{FS=OFS=", "} NR==FNR{db[$0]; next}
$1~/ARFCN: / && !($0 in db)' db.txt rec.txt

Prints:

ARFCN: 1004, Freq:  931.0M, CID: 36231, LAC:  7713, MCC: 410, MNC:   4, Pwr: -34
ARFCN: 1008, Freq: 931.8M, CID: 47103, LAC: 7713, MCC: 410, MNC: 4, Pwr: -30
ARFCN: 10, Freq: 937.0M, CID: 30911, LAC: 10470, MCC: 410, MNC: 1, Pwr: -33
ARFCN: 49, Freq: 944.8M, CID: 15535, LAC: 52207, MCC: 410, MNC: 3, Pwr: -28
ARFCN: 56, Freq: 946.2M, CID: 0, LAC: 0, MCC: 0, MNC: 0, Pwr: -50
ARFCN: 79, Freq: 950.8M, CID: 10003, LAC: 470, MCC: 410, MNC: 6, Pwr: -43
ARFCN: 82, Freq: 951.4M, CID: 10002, LAC: 470, MCC: 410, MNC: 6, Pwr: -36
ARFCN: 100, Freq: 955.0M, CID: 667, LAC: 1007, MCC: 410, MNC: 3, Pwr: -27

Or if you only want to compare the ARFCN field:

$ gawk 'BEGIN{FS=OFS=", "} NR==FNR{db[$1]; next}
$1~/ARFCN: / && !($1 in db)' db.txt rec.txt
ARFCN: 100, Freq: 955.0M, CID: 667, LAC: 1007, MCC: 410, MNC: 3, Pwr: -27

You get the idea...



Related Topics



Leave a reply



Submit