Flattening a Delimited Composite Column

df <- data.frame(users = 1:3, items = c("23 77 49", "10 18 28", "20 31 84"))  # input assumed from the output below
items <- strsplit(df$items, " ")
data.frame(user = rep(df$users, sapply(items, length)), item = unlist(items))

##   user item
## 1    1   23
## 2    1   77
## 3    1   49
## 4    2   10
## 5    2   18
## 6    2   28
## 7    3   20
## 8    3   31
## 9    3   84

Or, using data.table:

library(data.table)

DT <- data.table(df)
DT[, list(item = unlist(strsplit(items, " "))), by = users]

##    users item
## 1:     1   23
## 2:     1   77
## 3:     1   49
## 4:     2   10
## 5:     2   18
## 6:     2   28
## 7:     3   20
## 8:     3   31
## 9:     3   84

How to flatten a hierarchical index in columns

I think the easiest way to do this would be to set the columns to the top level:

df.columns = df.columns.get_level_values(0)

Note: if the top level has a name, you can also access it by that name rather than by 0.


If you want to combine/join your MultiIndex into one Index (assuming you have just string entries in your columns), you could:

df.columns = [' '.join(col).strip() for col in df.columns.values]

Note: we must strip the whitespace for columns where there is no second level.

In [11]: [' '.join(col).strip() for col in df.columns.values]
Out[11]:
['USAF',
 'WBAN',
 'day',
 'month',
 's_CD sum',
 's_CL sum',
 's_CNT sum',
 's_PC sum',
 'tempf amax',
 'tempf amin',
 'year']
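
As a self-contained sketch (the two-level frame below is invented for illustration; only the two flattening lines come from the answer):

import pandas as pd

# Hypothetical frame with two-level columns, standing in for df above.
df = pd.DataFrame(
    [[70.1, 50.3, 2012], [68.4, 48.9, 2012]],
    columns=pd.MultiIndex.from_tuples(
        [('tempf', 'amax'), ('tempf', 'amin'), ('year', '')]),
)

# Option 1: keep only the top level (can leave duplicate names).
top = df.copy()
top.columns = top.columns.get_level_values(0)
print(list(top.columns))     # ['tempf', 'tempf', 'year']

# Option 2: join the levels, stripping the trailing space left
# by empty second levels.
joined = df.copy()
joined.columns = [' '.join(col).strip() for col in joined.columns.values]
print(list(joined.columns))  # ['tempf amax', 'tempf amin', 'year']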

Normalize data frame with list column

stack would be perfect for this:

df <- data.frame(id = 1:3); df$versions <- list(c(1, 2, 4), 1, c(3, 4))  # input assumed from the output below
stack(setNames(df$versions, df$id))
#   values ind
# 1      1   1
# 2      2   1
# 3      4   1
# 4      1   2
# 5      3   3
# 6      4   3

Opposite of 'summarise' in dplyr: turn one row into many

This appears to be impossible with dplyr alone at the moment, but it is under active discussion by the developers, with a target version of 0.5.

Note that data.table currently allows this (see @akrun's comment), and also lets you produce arbitrarily sized group outputs from arbitrarily sized group inputs, whereas the solution being discussed for dplyr seems to require all groups to be the same size. Here is an example:

> data.table(a=1:3)[, paste(a, seq(a), sep=":"), by=a]
   a  V1
1: 1 1:1
2: 2 2:1
3: 2 2:2
4: 3 3:1
5: 3 3:2
6: 3 3:3

Additionally, based on @AlexBrown's comment, you could do:

library(tidyr)
unnest(testdf, a3)

for your specific example, but that does not seem to work with the group_by / summarise workflow for the reasons described above (i.e. you can't create testdf directly with just dplyr::group_by, AFAIK).

MSSQL 2008: Get unique values from a many-to-many table

You can add an additional check in your correlated subquery, like this: AND a.id1 > a2.id1

select distinct
     a.id1 as [ID]
    ,stuff(
        (
            select ', ' + a2.id1
            from #associations a2
            where a2.id2 = a.id1
              and a.id1 > a2.id1
            for xml path('')
        ), 1, 2, '') as [Relationship]  -- strip the leading ', ' (two characters)
from #associations a

This will filter out the duplicate, mirror-image relationships.

You may additionally have to UNION #associations with its column-swapped mirror before running the relationship query, like this:

SELECT ID1, ID2 FROM #associations
UNION
SELECT ID2 AS ID1, ID1 as ID2 FROM #associations

and use the result in your query in place of #associations.

Creating a tree/deeply nested dict from an indented text file in python

Here is a recursive solution. First, transform the input in the following way.

Input (indented with tabs in the actual file):

person:
    address:
        street1: 123 Bar St
        street2:
        city: Madison
        state: WI
        zip: 55555
    web:
        email: boo@baz.com

First-step output:

[{'name':'person','value':'','level':0},
 {'name':'address','value':'','level':1},
 {'name':'street1','value':'123 Bar St','level':2},
 {'name':'street2','value':'','level':2},
 {'name':'city','value':'Madison','level':2},
 {'name':'state','value':'WI','level':2},
 {'name':'zip','value':55555,'level':2},
 {'name':'web','value':'','level':1},
 {'name':'email','value':'boo@baz.com','level':2}]

This is easy to accomplish with split(':') and by counting the number of leading tabs:

def tab_level(astr):
    """Count the number of leading tabs in a string."""
    return len(astr) - len(astr.lstrip('\t'))
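
The answer doesn't spell that first transformation out in full; here is a minimal sketch, assuming one name: value pair per line of tab-indented input (the text_to_ttree name is mine, not the original's):

def text_to_ttree(text):
    """Turn raw tab-indented text into the flat list-of-dicts form above.
    Sketch only: assumes one 'name: value' pair per line; all values come
    out as strings (the original shows zip as an int)."""
    ttree = []
    for line in text.splitlines():
        if not line.strip():
            continue                     # skip blank lines
        name, _, value = line.strip().partition(':')
        ttree.append({'name': name.strip(),
                      'value': value.strip(),
                      'level': tab_level(line)})  # count tabs on the raw line
    return ttree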

Then feed the first-step output into the following function:

def ttree_to_json(ttree, level=0):
    result = {}
    for i in range(0, len(ttree)):
        cn = ttree[i]
        try:
            nn = ttree[i + 1]
        except IndexError:
            nn = {'level': -1}

        # Edge cases
        if cn['level'] > level:
            continue
        if cn['level'] < level:
            return result

        # Recursion
        if nn['level'] == level:
            dict_insert_or_append(result, cn['name'], cn['value'])
        elif nn['level'] > level:
            rr = ttree_to_json(ttree[i + 1:], level=nn['level'])
            dict_insert_or_append(result, cn['name'], rr)
        else:
            dict_insert_or_append(result, cn['name'], cn['value'])
            return result
    return result

where:

def dict_insert_or_append(adict, key, val):
    """Insert a value in dict at key if one does not exist.
    Otherwise, convert the existing value to a list and append.
    """
    if key in adict:
        if type(adict[key]) != list:
            adict[key] = [adict[key]]
        adict[key].append(val)
    else:
        adict[key] = val
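
Putting the pieces together; the file name and the text_to_ttree helper sketched above are assumptions, not part of the original answer:

import json

with open('input.txt') as f:           # hypothetical file name
    ttree = text_to_ttree(f.read())    # first-step transform sketched above

tree = ttree_to_json(ttree)
print(json.dumps(tree, indent=2))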

Ignore cascade on foreign key update?

The solution, which we arrived at in chat:

/* Tables */

CREATE TABLE `emails` (
  `group_id` bigint(20) NOT NULL,
  `email` varchar(500) NOT NULL,
  UNIQUE KEY `group_id` (`group_id`,`email`) USING BTREE,
  CONSTRAINT `emails_ibfk_1` FOREIGN KEY (`group_id`) REFERENCES `entities` (`group_id`) ON DELETE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=latin1

CREATE TABLE `hashes` (
  `group_id` bigint(20) NOT NULL,
  `hash` varchar(128) NOT NULL,
  `repeat_count` int(11) NOT NULL DEFAULT '0',
  UNIQUE KEY `hash` (`hash`),
  KEY `group_id` (`group_id`),
  CONSTRAINT `hashes_ibfk_1` FOREIGN KEY (`group_id`) REFERENCES `entities` (`group_id`) ON DELETE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=latin1

CREATE TABLE `entities` (
  `group_id` bigint(20) NOT NULL,
  `entity_id` bigint(20) NOT NULL,
  PRIMARY KEY (`group_id`),
  KEY `entity_id` (`entity_id`)
) ENGINE=InnoDB DEFAULT CHARSET=latin1

CREATE TABLE `entity_lookup` (
  `entity_id` bigint(20) NOT NULL,
  PRIMARY KEY (`entity_id`) USING HASH
) ENGINE=MyISAM DEFAULT CHARSET=latin1

/* Inserting */

START TRANSACTION;

/* Determine next group ID */
SET @next_group_id = (SELECT MAX(group_id) + 1 FROM entities);

/* Determine next entity ID */
SET @next_entity_id = (SELECT MAX(entity_id) + 1 FROM entities);

/* Merge any entity ids */
INSERT IGNORE INTO entity_lookup SELECT entity_id FROM entities JOIN hashes USING(group_id) WHERE `hash` IN (...);
UPDATE entities JOIN entity_lookup USING(entity_id) SET entity_id = @next_entity_id;
TRUNCATE TABLE entity_lookup;

/* Add the new group ID to entity_id */
INSERT INTO entities(group_id, entity_id) VALUES(@next_group_id, @next_entity_id);

/* Add new values into hashes */
INSERT INTO hashes (group_id, `hash`) VALUES
    (@next_group_id, ...)
ON DUPLICATE KEY UPDATE
    repeat_count = repeat_count + 1;

/* Add other new values */
INSERT IGNORE INTO emails (group_id, email) VALUES
    (@next_group_id, "email1");

COMMIT;

