Python: Simple List Merging Based on Intersections

Python: simple list merging based on intersections

My attempt:

def merge(lsts):
    sets = [set(lst) for lst in lsts if lst]
    merged = True
    while merged:
        merged = False
        results = []
        while sets:
            common, rest = sets[0], sets[1:]
            sets = []
            for x in rest:
                if x.isdisjoint(common):
                    sets.append(x)
                else:
                    merged = True
                    common |= x
            results.append(common)
        sets = results
    return sets

lst = [[65, 17, 5, 30, 79, 56, 48, 62],
       [6, 97, 32, 93, 55, 14, 70, 32],
       [75, 37, 83, 34, 9, 19, 14, 64],
       [43, 71],
       [],
       [89, 49, 1, 30, 28, 3, 63],
       [35, 21, 68, 94, 57, 94, 9, 3],
       [16],
       [29, 9, 97, 43],
       [17, 63, 24]]
print merge(lst)

Benchmark:

import random

# adapt parameters to your own usage scenario
class_count = 50
class_size = 1000
list_count_per_class = 100
large_list_sizes = list(range(100, 1000))
small_list_sizes = list(range(0, 100))
large_list_probability = 0.5

if False:  # change to true to generate the test data file (takes a while)
    with open("/tmp/test.txt", "w") as f:
        lists = []
        classes = [
            range(class_size * i, class_size * (i + 1)) for i in range(class_count)
        ]
        for c in classes:
            # distribute each class across ~300 lists
            for i in xrange(list_count_per_class):
                lst = []
                if random.random() < large_list_probability:
                    size = random.choice(large_list_sizes)
                else:
                    size = random.choice(small_list_sizes)
                nums = set(c)
                for j in xrange(size):
                    x = random.choice(list(nums))
                    lst.append(x)
                    nums.remove(x)
                random.shuffle(lst)
                lists.append(lst)
        random.shuffle(lists)
        for lst in lists:
            f.write(" ".join(str(x) for x in lst) + "\n")

setup = """
# Niklas'
def merge_niklas(lsts):
    sets = [set(lst) for lst in lsts if lst]
    merged = 1
    while merged:
        merged = 0
        results = []
        while sets:
            common, rest = sets[0], sets[1:]
            sets = []
            for x in rest:
                if x.isdisjoint(common):
                    sets.append(x)
                else:
                    merged = 1
                    common |= x
            results.append(common)
        sets = results
    return sets

# Rik's
def merge_rik(data):
    sets = (set(e) for e in data if e)
    results = [next(sets)]
    for e_set in sets:
        to_update = []
        for i, res in enumerate(results):
            if not e_set.isdisjoint(res):
                to_update.insert(0, i)

        if not to_update:
            results.append(e_set)
        else:
            last = results[to_update.pop(-1)]
            for i in to_update:
                last |= results[i]
                del results[i]
            last |= e_set
    return results

# katrielalex's
def pairs(lst):
    i = iter(lst)
    first = prev = item = i.next()
    for item in i:
        yield prev, item
        prev = item
    yield item, first

import networkx

def merge_katrielalex(lsts):
    g = networkx.Graph()
    for lst in lsts:
        for edge in pairs(lst):
            g.add_edge(*edge)
    return networkx.connected_components(g)

# agf's (optimized)
from collections import deque

def merge_agf_optimized(lists):
    sets = deque(set(lst) for lst in lists if lst)
    results = []
    disjoint = 0
    current = sets.pop()
    while True:
        merged = False
        newsets = deque()
        for _ in xrange(disjoint, len(sets)):
            this = sets.pop()
            if not current.isdisjoint(this):
                current.update(this)
                merged = True
                disjoint = 0
            else:
                newsets.append(this)
                disjoint += 1
        if sets:
            newsets.extendleft(sets)
        if not merged:
            results.append(current)
            try:
                current = newsets.pop()
            except IndexError:
                break
            disjoint = 0
        sets = newsets
    return results

# agf's (simple)
def merge_agf_simple(lists):
    newsets, sets = [set(lst) for lst in lists if lst], []
    while len(sets) != len(newsets):
        sets, newsets = newsets, []
        for aset in sets:
            for eachset in newsets:
                if not aset.isdisjoint(eachset):
                    eachset.update(aset)
                    break
            else:
                newsets.append(aset)
    return newsets

# alexis'
def merge_alexis(data):
    bins = range(len(data))  # Initialize each bin[n] == n
    nums = dict()

    data = [set(m) for m in data]  # Convert to sets
    for r, row in enumerate(data):
        for num in row:
            if num not in nums:
                # New number: tag it with a pointer to this row's bin
                nums[num] = r
                continue
            else:
                dest = locatebin(bins, nums[num])
                if dest == r:
                    continue  # already in the same bin

                if dest > r:
                    dest, r = r, dest  # always merge into the smallest bin

                data[dest].update(data[r])
                data[r] = None
                # Update our indices to reflect the move
                bins[r] = dest
                r = dest

    # Filter out the empty bins
    have = [m for m in data if m]
    return have

def locatebin(bins, n):
    while bins[n] != n:
        n = bins[n]
    return n

lsts = []
size = 0
num = 0
max = 0
for line in open("/tmp/test.txt", "r"):
    lst = [int(x) for x in line.split()]
    size += len(lst)
    if len(lst) > max:
        max = len(lst)
    num += 1
    lsts.append(lst)
"""

setup += """
print "%i lists, {class_count} equally distributed classes, average size %i, max size %i" % (num, size/num, max)
""".format(class_count=class_count)

import timeit
print "niklas"
print timeit.timeit("merge_niklas(lsts)", setup=setup, number=3)
print "rik"
print timeit.timeit("merge_rik(lsts)", setup=setup, number=3)
print "katrielalex"
print timeit.timeit("merge_katrielalex(lsts)", setup=setup, number=3)
print "agf (1)"
print timeit.timeit("merge_agf_optimized(lsts)", setup=setup, number=3)
print "agf (2)"
print timeit.timeit("merge_agf_simple(lsts)", setup=setup, number=3)
print "alexis"
print timeit.timeit("merge_alexis(lsts)", setup=setup, number=3)

These timings are obviously dependent on the specific parameters to the benchmark, like number of classes, number of lists, list size, etc. Adapt those parameters to your need to get more helpful results.

Below are some example outputs on my machine for different parameters. They show that all the algorithms have their strength and weaknesses, depending on the kind of input they get:

=====================
# many disjoint classes, large lists
class_count = 50
class_size = 1000
list_count_per_class = 100
large_list_sizes = list(range(100, 1000))
small_list_sizes = list(range(0, 100))
large_list_probability = 0.5
=====================

niklas
5000 lists, 50 equally distributed classes, average size 298, max size 999
4.80084705353
rik
5000 lists, 50 equally distributed classes, average size 298, max size 999
9.49251699448
katrielalex
5000 lists, 50 equally distributed classes, average size 298, max size 999
21.5317108631
agf (1)
5000 lists, 50 equally distributed classes, average size 298, max size 999
8.61671280861
agf (2)
5000 lists, 50 equally distributed classes, average size 298, max size 999
5.18117713928
=> alexis
=> 5000 lists, 50 equally distributed classes, average size 298, max size 999
=> 3.73504281044

===================
# less number of classes, large lists
class_count = 15
class_size = 1000
list_count_per_class = 300
large_list_sizes = list(range(100, 1000))
small_list_sizes = list(range(0, 100))
large_list_probability = 0.5
===================

niklas
4500 lists, 15 equally distributed classes, average size 296, max size 999
1.79993700981
rik
4500 lists, 15 equally distributed classes, average size 296, max size 999
2.58237695694
katrielalex
4500 lists, 15 equally distributed classes, average size 296, max size 999
19.5465381145
agf (1)
4500 lists, 15 equally distributed classes, average size 296, max size 999
2.75445604324
=> agf (2)
=> 4500 lists, 15 equally distributed classes, average size 296, max size 999
=> 1.77850699425
alexis
4500 lists, 15 equally distributed classes, average size 296, max size 999
3.23530197144

===================
# less number of classes, smaller lists
class_count = 15
class_size = 1000
list_count_per_class = 300
large_list_sizes = list(range(100, 1000))
small_list_sizes = list(range(0, 100))
large_list_probability = 0.1
===================

niklas
4500 lists, 15 equally distributed classes, average size 95, max size 997
0.773697137833
rik
4500 lists, 15 equally distributed classes, average size 95, max size 997
1.0523750782
katrielalex
4500 lists, 15 equally distributed classes, average size 95, max size 997
6.04466891289
agf (1)
4500 lists, 15 equally distributed classes, average size 95, max size 997
1.20285701752
=> agf (2)
=> 4500 lists, 15 equally distributed classes, average size 95, max size 997
=> 0.714507102966
alexis
4500 lists, 15 equally distributed classes, average size 95, max size 997
1.1286110878

Python merge multiple list with intersection

This works, but maybe isn't very elegant:

def merge_lists(l):
        s=map(set, l)
        i, n=0, len(s)
        while i < n-1:
                for j in xrange(i+1, n):
                        if s[i].intersection(s[j]):
                                s[i].update(s[j])
                                del s[j]
                                n-=1
                                break
                else:
                        i+=1
        return [sorted(i) for i in s]

merging sets which have even one element in common

It sounds like you're looking for a disjoint-set datastructure.

Given your set of id's, your categories separate them into disjoint subsets. A disjoint-set datastructure represents each category by choosing a representative id, which will be returned by a query of any of its members. (isolated id's form one category apiece, and return themselves)

Updates to a disjoint-set datastructure combine the categories of any two id's, so that future queries return the same representative for members of both subsets. (if the two id's are already members of the same category, the update is functionally a no-op)

The usual method is to represent each category as a reverse-tree: each id has a parent link, but no child links. The "representative element" is the root of the tree, which is easy to query by following the parent links. An update requires finding the root of the trees of both id's, and (if they are different) merging the trees by making one root the parent of the other.

By adding a couple of simple optimizations (queries "collapse" the query path to point directly to the root, and updates always choose the root of the deepest tree as the merge parent), this algorithm becomes extremely efficient, running in "almost-O(1)" amortized time.

If you want online access to the complete list of id's in each category, you should maintain a cumulative list attached to each category root, and concatenate them in each merge. In general, it can be convenient to maintain any number of statistics about your categories this way.

How to find list intersection?

If order is not important and you don't need to worry about duplicates then you can use set intersection:

>>> a = [1,2,3,4,5]
>>> b = [1,3,5,6]
>>> list(set(a) & set(b))
[1, 3, 5]

Fastest way to intersect and merge many dictionaries

I will focus on the second loop (merging the intersect_n dicts), give some general advice, and leave the rest as an exercise. My final result is so much shorter than the original that I feel the need to break down the process into several steps. Hopefully you will learn many useful techniques along the way.

After removing blank lines, comments (we'll rewrite those later anyway) and debug traces, and cleaning up a bit of whitespace we are starting with:

for iteration in range(1, 1000):
    itnext = iteration + 1
    globals()['combine_%s%s' % (iteration, itnext)] = defaultdict(list)
    for k, v in globals()['intersect_%s' % iteration].items():
        innr = []
        combine = defaultdict(list)
        for key in set(list(globals()['intersect_%s' % iteration][k].keys()) + list(globals()['intersect_%s' % itnext][k].keys())):
            if (isinstance(globals()['intersect_%s' % iteration][k].get(key, 99999), int) and isinstance(globals()['intersect_%s' % itnext][k].get(key, 99999), int)):
                combine[key] = [globals()['intersect_%s' % iteration][k].get(key, 99999)] + [globals()['intersect_%s' % itnext][k].get(key, 99999)]
            if (isinstance(globals()['intersect_%s' % iteration][k].get(key, 99999), list) and isinstance(globals()['intersect_%s' % itnext][k].get(key, 99999), int)):
                combine[key] = globals()['intersect_%s' % iteration][k].get(key, 99999) + [globals()['intersect_%s' % itnext][k].get(key, 99999)]
        globals()['combine_%s%s' % (iteration, itnext)][k] = combine
    globals()['intersect_%s' % itnext] = copy.copy(globals()['combine_%s%s' % (iteration, itnext)])
    del globals()['intersect_%s' % iteration]
    del globals()['combine_%s%s' % (iteration, itnext)]
    gc.collect()

Now we can get to work.

1. Properly structuring the data

Trying to create variable variables is generally a bad idea. It also has a performance cost:

$ python -m timeit -s "global x_0; x_0 = 'test'" "globals()['x_%s' % 0]"
2000000 loops, best of 5: 193 nsec per loop
$ python -m timeit -s "global x; x = ['test']" "x[0]"
10000000 loops, best of 5: 29.1 nsec per loop

Yes, we're talking about nanoseconds, but the existing code is doing it constantly, for nearly every access. But more importantly, visually simplified code is easier to analyze for subsequent improvements.

Clearly we already know how to manipulate nested data structures; adding one more level of nesting isn't an issue. To store the intersect_n results, rather than having 1000 dynamically named variables, the obvious solution is to just make a 1000-element list, where each element is one of those results. (Note that we will start counting them from 0 rather than 1, of course.) As for globals()['combine_%s%s' % (iteration, itnext)] - that makes no sense; we don't need to create a new variable name each time through, because we're going to throw that data away at the end of the loop anyway. So let's just use a constant name.

Once the first loop is modified to give the right data (which will also look much simpler in that part), the access looks much simpler here:

for iteration in range(999):
    itnext = iteration + 1
    combine_overall = defaultdict(list)
    for k, v in intersect[iteration].items():
        combine = defaultdict(list)
        for key in set(list(intersect[iteration][k].keys()) + list(intersection[itnext][k].keys())):
            if (isinstance(intersect[iteration][k].get(key, 99999), int) and isinstance(intersect[itnext][k].get(key, 99999), int)):
                combine[key] = [intersect[iteration][k].get(key, 99999)] + [intersect[itnext][k].get(key, 99999)]
            if (isinstance(intersect[iteration][k].get(key, 99999), list) and isinstance(intersect[itnext][k].get(key, 99999), int)):
                combine[key] = intersect[iteration][k].get(key, 99999) + [intersect[itnext][k].get(key, 99999)]
        combine_overall[k] = combine
    intersect[itnext] = copy.copy(combine_overall)

You'll notice I removed the memory management stuff at the end. I'll discuss a better approach for that later. The del for the iteration value would mess up iterating over that list, and we don't need to delete combine_overall because we'll just replace it with a new empty defaultdict. I also sneakily removed innr = [], because the value is simply unused. Like I said: visually simpler code is easier to analyze.

2. Unnecessary type checks

All this isinstance stuff is hard to read, and time consuming especially considering all the repeated access:

$ python -m timeit -s "global x; x = {'a': {'b': {'c': 0}}}" "isinstance(x['a']['b'].get('c', 0), int)"
2000000 loops, best of 5: 138 nsec per loop
$ python -m timeit -s "global x; x = {'a': {'b': {'c': 0}}}" "x['a']['b'].get('c', 0)"
5000000 loops, best of 5: 83.9 nsec per loop

We know the exact conditions under which intersect[itnext][k].get(key, 99999) should be an int: always, or else the data is simply corrupted. (We can worry about that later, and probably by doing exception handling in the calling code.) We know the conditions under which intersect[iteration][k].get(key, 99999) should be an int or a list: it will be an int (or missing) the first time through, and a list (or missing) every other time. Fixing this will also make it easier to understand the next step.

for iteration in range(999):
    itnext = iteration + 1
    combine_overall = defaultdict(list)
    for k, v in intersect[iteration].items():
        combine = defaultdict(list)
        for key in set(list(intersect[iteration][k].keys()) + list(intersection[itnext][k].keys())):
            if iteration == 0:
                combine[key] = [intersect[iteration][k].get(key, 99999)] + [intersect[itnext][k].get(key, 99999)]
            else:
                combine[key] = intersect[iteration][k].get(key, [99999]) + [intersect[itnext][k].get(key, 99999)]
        combine_overall[k] = combine
    intersect[itnext] = copy.copy(combine_overall)

Notice how, when the key is either a list or missing, we use a list as the default value. That's the trick to preserving type consistency and making it possible to write the code this way.

3. An unnecessary copy and unnecessary pair-wise iteration

Since combine_overall isn't referenced anywhere else, we don't actually need to copy it over the intersect[itnext] value - we could just reassign it without any aliasing issues. But better yet is to just leave it where it is. Instead of considering adjacent pairs of iteration values that we merge together pairwise, we just merge everything into combine_overall, one at a time (and set up an initial defaultdict once instead of overwriting it).

This does mean we'll have to do some setup work - instead of special-casing the first merge, we'll "merge" intersect[0] by itself into the initial state of combine_overall.

combine_overall = defaultdict(list)
for k, v in intersect[0].items():
    combine = defaultdict(list)
    for key, value in v.keys():
        combine[key] = [value]
    combine_overall[k] = combine

for iteration in range(999):
    itnext = iteration + 1
    for k, v in combine_overall.items():
        combine = defaultdict(list)
        for key in set(list(combine_overall[k].keys()) + list(intersection[itnext][k].keys())):
            combine[key] = combine_overall[k].get(key, [99999]) + [intersect[itnext][k].get(key, 99999)]
        combine_overall[k] = combine

Notice how much more simply we can do the initial step - we know which keys we're working with, so no .gets are necessary; and there's only one dict, so no merging of key-sets is necessary. But we aren't done...

4. Some miscellaneous cleanup

Looking at this version, we can more easily notice:

The iteration loop doesn't use iteration at all, but only itnext - so we can fix that. Also, there's no reason to use indexes like this for a simple loop - we should directly iterate over elements.
combine_overall will hold dicts, not lists (as we assign the values from combine, which is a defaultdict); so defaultdict(list) makes no sense.
Instead of using a temporary combine to build a replacement for combine_overall[k] and then assigning it back, we could just directly modify combine_overall[k]. In this way, we would actually get benefit from the defaultdict behaviour. We actually want the default values to be defaultdicts themselves - not completely straightforward, but very doable.
Since we no longer need to make a distinction between the overall combined result and individual results, we can rename combine_overall to just combine to look a little cleaner.

combine = defaultdict(lambda: defaultdict(list))
for k, v in intersect[0].items():
    for key, value in v.keys():
        combine[k][key] = [value]

for to_merge in intersect[1:]:
    for k, v in combine.items():
        for key in set(list(combine[k].keys()) + list(to_merge[k].keys())):
            combine[k][key] = combine[k].get(key, [99999]) + [to_merge[k].get(key, 99999)]

5. Oops, there was a bug all along. Also, "special cases aren't special enough to break the rules"

Hopefully, this looks a little strange to you. Why are we using .get on a defaultdict? Why would we have this single-item placeholder value, rather than an empty list? Why do we have to do this complicated check for the possible keys to use? And do we really need to handle the first intersect value differently?

Consider what happens on the following data (using the original naming conventions):

intersect_1 = {'1': {'1': 1}}
intersect_2 = {'1': {'1': 1}}
intersect_3 = {'1': {'1': 1, '2': 1}}

With the original approach, I get a result like:

$ python -i tmp.py 
>>> intersect_3
defaultdict(<class 'list'>, {'1': defaultdict(<class 'list'>, {'2': [99999, 1], '1': [1, 1, 1]})})

Oops. intersect_3['1']['2'] only has two elements ([99999, 1]), and thus doesn't match up with intersect_3['1']['1']. That defeats the purpose of the 99999 placeholder values. The problem is that, because the value was missing multiple times at the start, multiple 99999 values should have been inserted, but only one was - the one that came from creating the initial list, when the isinstance check reported an int rather than a list, when it retrieved the 99999 with .get. That lost information: we couldn't distinguish between a missing int and a missing list.

How do we work around this? Simple: we use the same, overall key set each time - the total set of keys that should be present, which we get from grid_density_original[k]. Whenever one of those entries is missing in any of the intersect results, we write the placeholder value instead. Now we are also handling each intersect result the same way - instead of doing special setup with the first value, and merging everything else in, we are merging everything in to an empty initial state.

Instead of iterating over the .items of combine (and expecting to_merge to have the same keys), we iterate over to_merge, which makes a lot more sense. Instead of creating and assigning a list for combine[k][key], we simply append a value to the existing list (and we know there is one available, because we are using defaultdicts properly now).

Thus:

combine = defaultdict(lambda: defaultdict(list))
for to_merge in intersect:
    for k, v in to_merge.items():
        # BTW, you typo'd this as "orignal" in the original code
        for key in grid_density_original[k].keys():
            combine[k][key].append(v.get(key, 99999))

(This does mean that, if none of the intersect dicts contain a particular key, the result will contain a list of 1000 99999 values, instead of omitting the key completely. I hope that isn't an issue.)

6. Okay, but weren't you going to do something about the memory usage?

Oh, right. Take a moment to write the corresponding code for the first loop, please.

Got it? Okay, now. Set up combine first; and then each time you compute one of the would-be elements of intersect, merge it in (using the two inner loops shown here) instead of building the actual intersect list.

Oh, and I think you'll find that - since we're going to iterate over grid_density_original[k].keys() anyway - the preprocessing to remove other keys from the g_den results isn't actually needed at all now.

Merge multiple list with intersection and add the items

If order does not matter you can try

>>> l = [[2, 15.0], [3, 15.0], [1, 20.0], [3, 18.0], [1, 50.0, u'pass'], [2, 10.0, u'fail'], [3, 30.0, u'pass']]
>>> temp = {}
>>> for i in l:
...      if i[0] in temp:
...            temp[i[0]].extend(i[1:])
...      else:
...            temp[i[0]] = i[1:]
... 
>>> temp
{1: [20.0, 50.0, u'pass'], 2: [15.0, 10.0, u'fail'], 3: [15.0, 18.0, 30.0, u'pass']}
>>> new_l = [[i]+temp[i] for i in temp]
>>> new_l
[[1, 20.0, 50.0, u'pass'], [2, 15.0, 10.0, u'fail'], [3, 15.0, 18.0, 30.0, u'pass']]

Here you create a dictionary and put the numbers as keys. After that you add the list as values to those keys. Finally you can get the desired output using a list comprehension

Code -

l = [[2, 15.0], [3, 15.0], [1, 20.0], [3, 18.0], [1, 50.0, u'pass'], [2, 10.0, u'fail'], [3, 30.0, u'pass']]
temp = {}
for i in l:
    if i[0] in temp:
          temp[i[0]].extend(i[1:])
    else:
          temp[i[0]] = i[1:]
new_l = [[i]+temp[i] for i in temp]

Easiest way to merge sets with non-empty intersection

The problem arises because two sets might not start to intersect until after you check them for intersection.

Suppose you have sets A, B, and C, where C intersects A and B. You do the following.

Check A and B for intersection. No intersection.
Check A and C for intersection. They intersect, so you replace A with A.union(C) and remove C. Now A intersects B and C is gone.
You're done with A. You move on to B, but there's nothing left to compare it against. You stop.

The simple, quick fix is to keep checking a set for intersections with the other sets until you get through all the other sets without finding any.

Python: Simple List Merging Based on Intersections