Unpacking and Merging Lists in a Column in Data.Frame

Here's a possible data.table approach:

library(data.table)
setDT(dat)[, .(name = c(name, unlist(altNames))), by = id]
#     id  name
#  1: 1001  Joan
#  2: 1002  Jane
#  3: 1002 Janie
#  4: 1002 Janet
#  5: 1002   Jan
#  6: 1003  John
#  7: 1003   Jon
#  8: 1004  Bill
#  9: 1004  Will
# 10: 1005   Tom
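
For comparison, the same unpack can be done in pandas. This is a sketch with a hypothetical reconstruction of `dat`, assuming it has an `id` column, a primary `name`, and a list column `altNames`:

```python
import pandas as pd

# Hypothetical reconstruction of `dat`: one primary name per id,
# plus a list column of alternative names (possibly empty).
dat = pd.DataFrame({
    "id": [1001, 1002, 1003, 1004, 1005],
    "name": ["Joan", "Jane", "John", "Bill", "Tom"],
    "altNames": [[], ["Janie", "Janet", "Jan"], ["Jon"], ["Will"], []],
})

# Prepend the primary name to its alternatives, then explode to long format.
long = (dat.assign(name=dat.apply(lambda r: [r["name"]] + r["altNames"], axis=1))
           .explode("name")[["id", "name"]]
           .reset_index(drop=True))
print(long)
```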

Unpack an R data frame column of lists

I would do:

setDT(data)

dfcol = "Subdocuments"
othcols = setdiff(names(data), dfcol)

subs = rbindlist(data[[dfcol]], idcol = TRUE)
subs[, (othcols) := data[.id, othcols, with = FALSE]]

If you don't want to setDT(data), change the last line's subsetting to data[.id, othcols].

Unpacking lists within a data frame into multiple TensorFlow inputs

You can try merging the lists with pandas:

import pandas as pd

df = pd.DataFrame({'col1': [[0, 1, 0], [1, 0, 0]], 'col2': [1, 0], 'col3': [0, 1]})

df['col1-1'], df['col1-2'], df['col1-3'] = zip(*list(df['col1'].values))
df = df.drop('col1', axis=1)
print(df)
   col2  col3  col1-1  col1-2  col1-3
0     1     0       0       1       0
1     0     1       1       0       0
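
An alternative sketch that avoids the tuple unpacking: build all the new columns at once from the list column and join them back. The `col1-*` naming just mirrors the example above:

```python
import pandas as pd

df = pd.DataFrame({'col1': [[0, 1, 0], [1, 0, 0]], 'col2': [1, 0], 'col3': [0, 1]})

# Expand the list column into a DataFrame in one shot, then join.
expanded = pd.DataFrame(df['col1'].tolist(), index=df.index)
expanded.columns = [f'col1-{i + 1}' for i in expanded.columns]
df = df.drop('col1', axis=1).join(expanded)
print(df)
```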

Or with numpy:

import pandas as pd
import numpy as np

df = pd.DataFrame({'col1': [[0, 1, 0], [1, 0, 0]], 'col2': [1, 0], 'col3': [0, 1]})

col1 = np.vstack(df['col1'].values)
col23 = df[['col2', 'col3']].values

data = np.concatenate([col1, col23], axis=-1)
print(data)
[[0 1 0 1 0]
 [1 0 0 0 1]]

Unpacking list of lists of dicts column in Pandas dataframe

Use a dictionary comprehension with concat, and DataFrame.pop to extract the column:

df1 = pd.concat({k: pd.DataFrame(x) for k, x in df_in.pop('B').items()})
print(df1)
      B1  B2  B3
0 0    1   2   3
  1    4   5   6
  2    7   8   9
1 0   10  11  12
2 0   13  14  15
  1   16  17  18

Add the original data with DataFrame.join; for the correct column order, extract and re-append the C column:

df = df_in.join(df1.reset_index(level=1, drop=True)).reset_index(drop=True)
df['C'] = df.pop('C')
print(df)
   A  B1  B2  B3  C
0  1   1   2   3  a
1  1   4   5   6  a
2  1   7   8   9  a
3  2  10  11  12  b
4  3  13  14  15  c
5  3  16  17  18  c
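
The input `df_in` isn't shown above, so here is a hypothetical reconstruction consistent with the printed output, assuming column B holds dicts of equal-length lists, run end-to-end through the solution:

```python
import pandas as pd

# Hypothetical `df_in`: B holds one dict of equal-length lists per row.
df_in = pd.DataFrame({
    'A': [1, 2, 3],
    'B': [{'B1': [1, 4, 7], 'B2': [2, 5, 8], 'B3': [3, 6, 9]},
          {'B1': [10], 'B2': [11], 'B3': [12]},
          {'B1': [13, 16], 'B2': [14, 17], 'B3': [15, 18]}],
    'C': ['a', 'b', 'c'],
})

# Pop B, expand each dict to a frame, and stitch everything back together.
df1 = pd.concat({k: pd.DataFrame(x) for k, x in df_in.pop('B').items()})
df = df_in.join(df1.reset_index(level=1, drop=True)).reset_index(drop=True)
df['C'] = df.pop('C')
print(df)
```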

An alternative solution with DataFrame.assign; DataFrame.insert is used to get the correct column order:

df1 = pd.concat([pd.DataFrame(v['B']).assign(A=v['A'], C=v['C'])
                 for k, v in df_in.to_dict('index').items()], ignore_index=True)
df1.insert(0, 'A', df1.pop('A'))
print(df1)
   A  B1  B2  B3  C
0  1   1   2   3  a
1  1   4   5   6  a
2  1   7   8   9  a
3  2  10  11  12  b
4  3  13  14  15  c
5  3  16  17  18  c

Unpacking a list in an R dataframe

If the final goal is to get the data in long format, we can use unnest from tidyr:

tidyr::unnest(dat, cols = df_list)

#    id      some_date  df_list  
#    <chr>   <chr>      <chr>    
#  1 509935  2/09/1967  018084131
#  2 727889  28/04/1976 062197171
#  3 727889  28/04/1976 062171593
#  4 864607  22/12/2017 064601923
#  5 864607  22/12/2017 068994009
#  6 864607  22/12/2017 069831651
#  7 1234243 7/02/2006  071141584
#  8 1234243 7/02/2006  073129537
#  9 1020959 10/03/2019 061498574
# 10 1020959 10/03/2019 065859718
# 11 1020959 10/03/2019 067251995
# 12 1020959 10/03/2019 069447806
# 13 221975  21/10/1935 064623976
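
For the same long-format result in pandas, a sketch using the built-in explode on a hypothetical frame with the same column names:

```python
import pandas as pd

# Hypothetical frame mirroring the R data: df_list is a list column.
dat = pd.DataFrame({
    'id': ['509935', '727889'],
    'some_date': ['2/09/1967', '28/04/1976'],
    'df_list': [['018084131'], ['062197171', '062171593']],
})

# explode gives one row per list element, like tidyr::unnest.
long = dat.explode('df_list').reset_index(drop=True)
print(long)
```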

Unpacking a column of a list of pairs into two columns - pandas

Another way: using .explode and .str.split followed by join.

Assuming = was a typo, we can handle it with str.replace:

df1 = df.join(df['li'].str.replace('=', '-').str.split(',')
              .explode().str.split('-', expand=True)
              .groupby(level=0).agg(','.join)
              .rename(columns={0: 'start', 1: 'end'})).drop('li', axis=1)

print(df1)

     ID                      start                       end
0   ld1           1205369, 1206384           1205491,1206570
1   ld2                111,113,117               112,114,119
2   ld3                444,765,777               445,785,779
3   ld5  1203843, 1204033, 1204398   1203967,1204235,1204485
4  ld28                666,756,896               777,788,909
5  ld37    999,1001,1112,1234,1999  1000,1111,1119,1278,2007
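
The input df isn't shown; here is a minimal self-contained run on a made-up li column (including one = typo) to show each step of the chain:

```python
import pandas as pd

# Minimal made-up input; the second row contains the '=' typo.
df = pd.DataFrame({'ID': ['ld1', 'ld2'],
                   'li': ['111-112,113-114', '444-445,765=785']})

# replace the typo, split pairs apart, explode to one pair per row,
# split each pair into start/end, then re-aggregate per original row.
df1 = df.join(df['li'].str.replace('=', '-').str.split(',')
              .explode().str.split('-', expand=True)
              .groupby(level=0).agg(','.join)
              .rename(columns={0: 'start', 1: 'end'})).drop('li', axis=1)
print(df1)
```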

Pandas - Unpack column of lists of varying lengths of tuples

In [38]: (df.groupby('ID')['list']
            .apply(lambda x: pd.DataFrame(x.iloc[0], columns=['A', 'B', 'C', 'D']))
            .reset_index())
Out[38]:
   ID  level_1    A  B  C    D
0   1        0    0  1  2    3
1   1        1    1  2  3    4
2   1        2    2  3  4  NaN
3   2        0  NaN  1  2    3
4   2        1    9  2  3    4
5   3        0  NaN  1  2    3
6   3        1    9  2  3    4
7   3        2    A  b  9    c
8   3        3    $  *  k    0
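
The question's df isn't reproduced above; a hypothetical reconstruction with one list of tuples per ID, run through the same groupby/apply, looks like this:

```python
import pandas as pd

# Hypothetical input: one row per ID, each holding a list of tuples,
# possibly of varying lengths (shorter tuples pad with NaN).
df = pd.DataFrame({'ID': [1, 2],
                   'list': [[(0, 1, 2, 3), (1, 2, 3, 4), (2, 3, 4)],
                            [(None, 1, 2, 3), (9, 2, 3, 4)]]})

out = (df.groupby('ID')['list']
         .apply(lambda x: pd.DataFrame(x.iloc[0], columns=['A', 'B', 'C', 'D']))
         .reset_index())
print(out)
```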

Unpacking cells containing list of lists in Pandas DataFrame into separate rows and columns of a new DataFrame

Using groupby + apply with pd.DataFrame:

df = (df.groupby(['a', 'b'])
        .apply(lambda x: pd.DataFrame(x['c'].tolist()[0], columns=['c', 'd']))
        .reset_index([0, 1]).reset_index(drop=True))

print(df)
    a  b  c  d
0   7  5 -4  7
1   7  5 -5  6
2  13  5 -9  4
3  13  5 -3  7

Explanation:

Each value in column c is a list of lists. To unpack them into separate columns, we take x['c'].tolist(), which carries an extra pair of brackets ([[[values], [values]]]) that is not useful, so x['c'].tolist()[0] gives [[values], [values]]. That is passed as data to pd.DataFrame with columns ['c', 'd'], and finally reset_index on levels [0, 1] turns the group keys back into columns ['a', 'b'].

print(pd.DataFrame([[-4, 7], [-5, 6]], columns=['c', 'd']))
    c  d
0  -4  7
1  -5  6

print(df.groupby(['a', 'b'])
        .apply(lambda x: pd.DataFrame(x['c'].tolist()[0], columns=['c', 'd'])))
         c  d
a  b
7  5  0 -4  7
      1 -5  6
13 5  0 -9  4
      1 -3  7

Unpack the list element of DataFrame

I think you can use numpy.repeat to repeat values by the lengths from str.len, and flatten the nested lists with chain:

from itertools import chain

import numpy as np
import pandas as pd

df1 = pd.DataFrame({
    "l1": np.repeat(df.l1.values, df.l2.str.len()),
    "l2": list(chain.from_iterable(df.l2))})
print(df1)
  l1  l2
0  a   x
1  b  y1
2  b  y2
3  b  y3
4  c   z
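
In modern pandas (0.25+), the built-in DataFrame.explode does the same unpack directly; scalars in l2, like the x and z here, pass through unchanged:

```python
import pandas as pd

# Same shape as the example above: a mix of scalars and lists in l2.
df = pd.DataFrame({'l1': ['a', 'b', 'c'],
                   'l2': ['x', ['y1', 'y2', 'y3'], 'z']})

df1 = df.explode('l2').reset_index(drop=True)
print(df1)
```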

Timings:

# [100000 rows x 2 columns]
import string

np.random.seed(10)
N = 100000
l1 = ['a', 'b', 'c']
l1 = np.random.choice(l1, N)
l2 = [list(string.ascii_letters[:np.random.randint(1, 10)]) for _ in np.arange(N)]
df = pd.DataFrame({"l1": l1, "l2": l2})
df.l2 = df.l2.apply(lambda x: x if len(x) != 1 else x[0])
# print(df)

In [91]: %timeit (pd.DataFrame([(left, right) for outer in zip(l1, l2) for left, right in zip_longest(*outer, fillvalue=outer[0])]))
1 loop, best of 3: 242 ms per loop

In [92]: %timeit (pd.DataFrame({ "l1": np.repeat(df.l1.values, df.l2.str.len()), "l2": list(chain.from_iterable(df.l2))}))
10 loops, best of 3: 84.6 ms per loop

Conclusion:

numpy.repeat is about 3 times faster than the zip_longest solution on the larger df.

EDIT:

To compare with the loop version, a smaller df is necessary because it is very slow:

# [1000 rows x 2 columns]
np.random.seed(10)
N = 1000
l1 = ['a', 'b', 'c']
l1 = np.random.choice(l1, N)
l2 = [list(string.ascii_letters[:np.random.randint(1, 10)]) for _ in np.arange(N)]
df = pd.DataFrame({"l1": l1, "l2": l2})
df.l2 = df.l2.apply(lambda x: x if len(x) != 1 else x[0])
# print(df)
def alexey(df):
    # Note: df.ix and set_value are APIs from older pandas versions.
    df2 = pd.DataFrame(columns=df.columns, index=df.index)[0:0]

    for idx in df.index:
        new_row = df.loc[idx, :].copy()
        for res in df.ix[idx, 'l2']:
            new_row.set_value('l2', res)
            df2.loc[len(df2)] = new_row
    return df2

print (alexey(df))

In [20]: %timeit (alexey(df))
1 loop, best of 3: 11.4 s per loop

In [21]: %timeit pd.DataFrame([(left, right) for outer in zip(l1, l2) for left, right in zip_longest(*outer, fillvalue=outer[0])])
100 loops, best of 3: 2.57 ms per loop

In [22]: %timeit pd.DataFrame({ "l1": np.repeat(df.l1.values, df.l2.str.len()), "l2": list(chain.from_iterable(df.l2))})
The slowest run took 4.42 times longer than the fastest. This could mean that an intermediate result is being cached.
1000 loops, best of 3: 1.41 ms per loop

