Comparing two unsorted lists in Linux, listing the lines unique to the second file
grep -Fxv -f first-file.txt second-file.txt
Basically looks for all lines in second-file.txt
which don't match any line in first-file.txt
. It might be slow if the files are large.
Also, once you sort the files (Use sort -n
if they are numeric), then comm
should also have worked. What error does it give? Try this:
comm -23 second-file-sorted.txt first-file-sorted.txt
Difference between two lists using Bash
Use the comm(1)
command to compare the two files. They both need to be sorted, which you can do beforehand if they are large, or you can do it inline with bash process substitution.
comm
can take a combination of the flags -1
, -2
and -3
indicating which file to suppress lines from (unique to file 1, unique to file 2 or common to both).
To get the lines only in the old file:
comm -23 <(sort /tmp/oldList) <(sort /tmp/newList)
To get the lines only in the new file:
comm -13 <(sort /tmp/oldList) <(sort /tmp/newList)
You can feed that into a while read
loop to process each line:
while read old ; do
...do stuff with $old
done < <(comm -23 <(sort /tmp/oldList) <(sort /tmp/newList))
and similarly for the new lines.
Compare two unsorted files and print unique elements from each file
You want sort -n
and uniq -u
:
$ sort -n file1 file2 | uniq -u
5 6 7 8
13 14 15 16
21 22 23 24
# Redirect to file3
$ sort -n file1 file2 | uniq -u > file3
Edit:
$ awk '{u[$0]++}END{for(k in u)if(u[k]==1)print k}' file1 file2
5 6 7 8
21 22 23 24
13 14 15 16
Here u
is the name of an associative array; you could name it anything (I chose u, short for unique). The keys (k) in the array are the lines of the files, so each time a duplicate line is seen its count is increased. After the array is built, we loop through it and print a key only if it was seen exactly once. This code should help clear it up:
$ awk '{uniq[$0]++}END{for (key in uniq)print uniq[key]": "key}' file1 file2
2: 9 10 11 12
1: 5 6 7 8
1: 21 22 23 24
1: 13 14 15 16
2: 17 18 19 20
2: 1 2 3 4
How to compare two text files with unsorted, and slightly different lines
Why not write a little program for the job, so it works on both platforms? It's easily accomplished in some platform-independent C code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* One input line, stored twice: once verbatim for printing and once
 * tokenized for word-wise comparison. All pointers are heap-owned by
 * this struct and released in freeLines(). */
typedef struct Line
{
char *line;   /* verbatim copy of the line, including the trailing '\n' */
char *tokens; /* second copy, mutated in place by strtok() during tokenizing */
size_t nwords; /* number of valid entries in words[] (capped by wordCount) */
const char **words; /* pointers into tokens: the first nwords whitespace-separated words */
} Line;
/* Returns a heap-allocated duplicate of the NUL-terminated string s.
 * On allocation failure the whole program is aborted (same policy as
 * the rest of this tool). Caller owns and must free() the result. */
char *copyString(const char *s)
{
    size_t size = strlen(s) + 1;        /* include the terminating NUL */
    char *dup = malloc(size);
    if (!dup) exit(EXIT_FAILURE);
    memcpy(dup, s, size);               /* length is known; memcpy beats strcpy */
    return dup;
}
/* qsort()-compatible comparator for Line records.
 * Orders lexicographically over the tokenized word arrays; when one
 * line is a strict prefix of the other (fewer words, all equal), the
 * shorter line sorts first. Returns <0, 0 or >0 like strcmp(). */
int compareLines(const void *a, const void *b)
{
    const Line *x = a;
    const Line *y = b;
    size_t common = (x->nwords < y->nwords) ? x->nwords : y->nwords;

    for (size_t i = 0; i < common; ++i)
    {
        int d = strcmp(x->words[i], y->words[i]);
        if (d != 0) return d;
    }

    /* All shared words equal: the line with more words compares greater. */
    if (x->nwords == y->nwords) return 0;
    return (x->nwords > common) ? 1 : -1;
}
/* Reads every line of f into a freshly-allocated, sorted array of Line
 * records, storing at most wordCount tokenized words per line.
 * On success stores the array in *linesptr and returns its length;
 * any allocation failure aborts the program.
 *
 * NOTE(review): lines longer than 1023 characters are split by fgets()
 * into multiple records — acceptable for this tool's inputs, but worth
 * confirming against expected data. */
size_t readFile(Line **linesptr, FILE *f, size_t wordCount)
{
    size_t cap = 256;
    size_t n = 0;
    char buf[1024];
    Line *lines = malloc(cap * sizeof *lines);
    if (!lines) exit(EXIT_FAILURE);
    while (fgets(buf, sizeof buf, f))
    {
        if (n == cap)
        {
            /* Grow via a temporary so the old pointer isn't clobbered
             * (idiomatic realloc pattern; see CERT MEM12-C). */
            cap *= 2;
            Line *grown = realloc(lines, cap * sizeof *lines);
            if (!grown) exit(EXIT_FAILURE);
            lines = grown;
        }
        lines[n].line = copyString(buf);
        lines[n].tokens = copyString(buf);
        lines[n].words = malloc(wordCount * sizeof *lines[n].words);
        if (!lines[n].words) exit(EXIT_FAILURE);
        size_t c = 0;
        char *word = strtok(lines[n].tokens, " \t");
        while (word && c < wordCount)
        {
            lines[n].words[c++] = word;
            if (c == wordCount) break; /* ignore any words beyond the limit */
            word = strtok(0, " \t");
        }
        lines[n].nwords = c;
        /* Shrink words[] to the words actually found. BUG FIX: only
         * realloc when c > 0 — realloc(p, 0) may legally return NULL,
         * which the old code treated as fatal failure. */
        if (c > 0)
        {
            const char **shrunk = realloc(lines[n].words, c * sizeof *shrunk);
            if (!shrunk) exit(EXIT_FAILURE);
            lines[n].words = shrunk;
        }
        ++n;
    }
    /* BUG FIX: for an empty input file (n == 0), realloc(lines, 0) may
     * return NULL; the old code then exited with failure. Keep the
     * original allocation in that case. */
    if (n > 0)
    {
        Line *fit = realloc(lines, n * sizeof *fit);
        if (!fit) exit(EXIT_FAILURE);
        lines = fit;
    }
    qsort(lines, n, sizeof *lines, compareLines);
    *linesptr = lines;
    return n;
}
/* Releases an array of n Line records produced by readFile(), including
 * each record's owned buffers, then the array itself. */
void freeLines(Line *lines, size_t n)
{
    for (size_t i = 0; i < n; ++i)
    {
        Line *rec = &lines[i];
        free(rec->words);
        free(rec->tokens);
        free(rec->line);
    }
    free(lines);
}
/* Usage: finduniq n file1 file2
 * Reads both files, sorts their lines by the first n words, then walks
 * the two sorted arrays in lockstep (merge-style) printing each line
 * that appears in only one of the files, prefixed with its filename. */
int main(int argc, char **argv)
{
    if (argc != 4)
    {
        fprintf(stderr, "Usage: %s [n] [file1] [file2]\n", argv[0]);
        return EXIT_FAILURE;
    }
    /* BUG FIX: atoi() accepted negative values (only 0 was rejected),
     * and a negative count converted to a huge size_t in readFile().
     * strtol() with end-pointer checks rejects garbage and requires a
     * strictly positive count. */
    char *end = NULL;
    long nwords = strtol(argv[1], &end, 10);
    if (end == argv[1] || *end != '\0' || nwords < 1)
    {
        fprintf(stderr, "%s: invalid word count '%s'\n", argv[0], argv[1]);
        return EXIT_FAILURE;
    }
    FILE *f1 = fopen(argv[2], "r");
    if (!f1)
    {
        perror(argv[2]);
        return EXIT_FAILURE;
    }
    FILE *f2 = fopen(argv[3], "r");
    if (!f2)
    {
        perror(argv[3]);
        fclose(f1);
        return EXIT_FAILURE;
    }
    /* readFile() aborts on allocation failure, so no NULL checks needed. */
    Line *f1lines = 0;
    size_t nf1lines = readFile(&f1lines, f1, (size_t)nwords);
    Line *f2lines = 0;
    size_t nf2lines = readFile(&f2lines, f2, (size_t)nwords);
    fclose(f1);
    fclose(f2);
    size_t f1pos = 0;
    size_t f2pos = 0;
    /* Merge walk: equal lines advance both cursors; an unmatched line is
     * unique to its file and is printed (lines keep their '\n'). */
    while (f1pos < nf1lines && f2pos < nf2lines)
    {
        int cmp = compareLines(f1lines + f1pos, f2lines + f2pos);
        if (cmp < 0)
        {
            printf("%s: %s", argv[2], f1lines[f1pos++].line);
        }
        else if (cmp > 0)
        {
            printf("%s: %s", argv[3], f2lines[f2pos++].line);
        }
        else
        {
            ++f1pos;
            ++f2pos;
        }
    }
    /* Drain whichever file still has lines left — all unique. */
    while (f1pos < nf1lines)
    {
        printf("%s: %s", argv[2], f1lines[f1pos++].line);
    }
    while (f2pos < nf2lines)
    {
        printf("%s: %s", argv[3], f2lines[f2pos++].line);
    }
    freeLines(f1lines, nf1lines);
    freeLines(f2lines, nf2lines);
    return EXIT_SUCCESS;
}
If you use gcc, compile for example with
gcc -s -g0 -O2 -std=c11 -Wall -Wextra -pedantic -ofinduniq finduniq.c
Demo:
$ ./finduniq 4 test1.txt test2.txt
test2.txt: an apple a day keeps the doctor away
test1.txt: lorem ipsum dolor
$ ./finduniq 6 test1.txt test2.txt
test2.txt: an apple a day keeps the doctor away
test2.txt: jumps over the very lazy *chicken*
test1.txt: jumps over the very lazy dog
test1.txt: lorem ipsum dolor
test2.txt: the quick brown fox *swims*
test1.txt: the quick brown fox jumps
How to get the difference (only additions) between two files in linux
diff
and then grep
for the edit type you want.
diff -u A1 A2 | grep -E "^\+"
Fast way of finding lines in one file that are not in another?
You can achieve this by controlling the formatting of the old/new/unchanged lines in GNU diff
output:
diff --new-line-format="" --unchanged-line-format="" file1 file2
The input files should be sorted for this to work. With bash
(and zsh
) you can sort in-place with process substitution <( )
:
diff --new-line-format="" --unchanged-line-format="" <(sort file1) <(sort file2)
In the above new and unchanged lines are suppressed, so only changed (i.e. removed lines in your case) are output. You may also use a few diff
options that other solutions don't offer, such as -i
to ignore case, or various whitespace options (-E
, -b
, -v
etc) for less strict matching.
Explanation
The options --new-line-format
, --old-line-format
and --unchanged-line-format
let you control the way diff
formats the differences, similar to printf
format specifiers. These options format new (added), old (removed) and unchanged lines respectively. Setting one to empty "" prevents output of that kind of line.
If you are familiar with unified diff format, you can partly recreate it with:
diff --old-line-format="-%L" --unchanged-line-format=" %L" \
--new-line-format="+%L" file1 file2
The %L
specifier is the line in question, and we prefix each with "+" "-" or " ", like diff -u
(note that it only outputs differences, it lacks the ---
+++
and @@
lines at the top of each grouped change).
You can also use this to do other useful things like number each line with %dn
.
The diff
method (along with other suggestions comm
and join
) only produce the expected output with sorted input, though you can use <(sort ...)
to sort in place. Here's a simple awk
(nawk) script (inspired by the scripts linked-to in Konsolebox's answer) which accepts arbitrarily ordered input files, and outputs the missing lines in the order they occur in file1.
# output lines in file1 that are not in file2
BEGIN { FS="" } # preserve whitespace
(NR==FNR) { ll1[FNR]=$0; nl1=FNR; } # file1, index by lineno
(NR!=FNR) { ss2[$0]++; } # file2, index by string
END {
for (ll=1; ll<=nl1; ll++) if (!(ll1[ll] in ss2)) print ll1[ll]
}
This stores the entire contents of file1 line by line in a line-number indexed array ll1[]
, and the entire contents of file2 line by line in a line-content indexed associative array ss2[]
. After both files are read, iterate over ll1
and use the in
operator to determine if the line in file1 is present in file2. (This will produce different output from the diff
method if there are duplicates.)
In the event that the files are sufficiently large that storing them both causes a memory problem, you can trade CPU for memory by storing only file1 and deleting matches along the way as file2 is read.
BEGIN { FS="" }
(NR==FNR) { # file1, index by lineno and string
ll1[FNR]=$0; ss1[$0]=FNR; nl1=FNR;
}
(NR!=FNR) { # file2
if ($0 in ss1) { delete ll1[ss1[$0]]; delete ss1[$0]; }
}
END {
for (ll=1; ll<=nl1; ll++) if (ll in ll1) print ll1[ll]
}
The above stores the entire contents of file1 in two arrays, one indexed by line number ll1[]
, one indexed by line content ss1[]
. Then as file2 is read, each matching line is deleted from ll1[]
and ss1[]
. At the end the remaining lines from file1 are output, preserving the original order.
In this case, with the problem as stated, you can also divide and conquer using GNU split
(filtering is a GNU extension), repeated runs with chunks of file1 and reading file2 completely each time:
split -l 20000 --filter='gawk -f linesnotin.awk - file2' < file1
Note the use and placement of -
meaning stdin
on the gawk
command line. This is provided by split
from file1 in chunks of 20,000 lines per invocation.
For users on non-GNU systems, there is almost certainly a GNU coreutils package you can obtain, including on OSX as part of the Apple Xcode tools which provides GNU diff
, awk
, though only a POSIX/BSD split
rather than a GNU version.
How to efficiently compare two unordered lists (not sets)?
O(n): The Counter() method is best (if your objects are hashable):
def compare(s, t):
return Counter(s) == Counter(t)
O(n log n): The sorted() method is next best (if your objects are orderable):
def compare(s, t):
return sorted(s) == sorted(t)
O(n * n): If the objects are neither hashable, nor orderable, you can use equality:
def compare(s, t):
t = list(t) # make a mutable copy
try:
for elem in s:
t.remove(elem)
except ValueError:
return False
return not t
Comparing two files in linux terminal
Here is my solution for this :
mkdir temp
mkdir results
cp /usr/share/dict/american-english ~/temp/american-english-dictionary
cp /usr/share/dict/british-english ~/temp/british-english-dictionary
cat ~/temp/american-english-dictionary | wc -l > ~/results/count-american-english-dictionary
cat ~/temp/british-english-dictionary | wc -l > ~/results/count-british-english-dictionary
grep -Fxf ~/temp/american-english-dictionary ~/temp/british-english-dictionary > ~/results/common-english
grep -Fxvf ~/results/common-english ~/temp/american-english-dictionary > ~/results/unique-american-english
grep -Fxvf ~/results/common-english ~/temp/british-english-dictionary > ~/results/unique-british-english
Comparing two unsorted text files to find intersection file
Here is a simple GNU awk that does what (I think) you are trying to do:
$ gawk 'BEGIN{FS=OFS=", "} NR==FNR{db[$1 $2 $3 $4 $5 $6]; next}
$1~/ARFCN: / && !($1 $2 $3 $4 $5 $6 in db)' db.txt rec.txt
Prints:
ARFCN: 56, Freq: 946.2M, CID: 0, LAC: 0, MCC: 0, MNC: 0, Pwr: -50
ARFCN: 100, Freq: 955.0M, CID: 667, LAC: 1007, MCC: 410, MNC: 3, Pwr: -27
This works by comparing the first 6 fields and ignoring Pwr
since that might be variable depending on conditions.
If all fields are relevant, you can simplify to:
$ gawk 'BEGIN{FS=OFS=", "} NR==FNR{db[$0]; next}
$1~/ARFCN: / && !($0 in db)' db.txt rec.txt
Prints:
ARFCN: 1004, Freq: 931.0M, CID: 36231, LAC: 7713, MCC: 410, MNC: 4, Pwr: -34
ARFCN: 1008, Freq: 931.8M, CID: 47103, LAC: 7713, MCC: 410, MNC: 4, Pwr: -30
ARFCN: 10, Freq: 937.0M, CID: 30911, LAC: 10470, MCC: 410, MNC: 1, Pwr: -33
ARFCN: 49, Freq: 944.8M, CID: 15535, LAC: 52207, MCC: 410, MNC: 3, Pwr: -28
ARFCN: 56, Freq: 946.2M, CID: 0, LAC: 0, MCC: 0, MNC: 0, Pwr: -50
ARFCN: 79, Freq: 950.8M, CID: 10003, LAC: 470, MCC: 410, MNC: 6, Pwr: -43
ARFCN: 82, Freq: 951.4M, CID: 10002, LAC: 470, MCC: 410, MNC: 6, Pwr: -36
ARFCN: 100, Freq: 955.0M, CID: 667, LAC: 1007, MCC: 410, MNC: 3, Pwr: -27
Or if you only want to compare the ARFCN
field:
$ gawk 'BEGIN{FS=OFS=", "} NR==FNR{db[$1]; next}
$1~/ARFCN: / && !($1 in db)' db.txt rec.txt
ARFCN: 100, Freq: 955.0M, CID: 667, LAC: 1007, MCC: 410, MNC: 3, Pwr: -27
You get the idea...
Related Topics
How to Move All Files Including Hidden Files into Parent Directory via *
Difference Between Unix Domain Stream and Datagram Sockets
In Linux Determine If a .A Library/Archive 32-Bit or 64-Bit
How to Calculate System Memory Usage from /Proc/Meminfo (Like Htop)
Get Current Time in Hours and Minutes
Shell Script Error Expecting "Do"
How to Find My Shell Version Using a Linux Command
Maximum Number of Inodes in a Directory
How to List All Vhosts in Nginx
Any Way to Exit Bash Script, But Not Quitting the Terminal
Preserve Colouring After Piping Grep to Grep
How to Find a File/Directory That Could Be Anywhere on Linux Command Line
How the Util of iOStat Is Computed
Ubuntu: Using Curl to Download an Image