Split Pandas Dataframe Based on Groupby

Split pandas dataframe based on groupby

# Group the rows of df by the values found in column 'ZZ'.
gb = df.groupby('ZZ')
# One sub-DataFrame per group key, in the order the keys appear in gb.groups.
list(map(gb.get_group, gb.groups))

Split dataframe based on pandas groupby and generate multiple PDFs

I would write the groupby like this:

for EmpNo, data in df.groupby("EmpNo"):

For each group, the groupby yields the value it grouped on (here, the employee number) and the sub-dataframe containing the rows that match that value.

Next, I would extract the first row of that dataframe. This is to make it easier to get the name and similar attributes.

first_row = data.iloc[0]

(What's the difference between iloc and loc?)

Since we have the employee ID already, we can skip looking it up in the dataframe. For other attributes, we can look it up like first_row['First Name'].

# The employee ID is the groupby key, so it does not need a dataframe lookup.
pdf.cell(80, 6, 'Employee ID: ' + str(EmpNo), 0, 1, 'L')
# ...
# Unlike cell(), multi_cell() has no `ln` argument: the alignment comes
# directly after the border, so it is called with (w, h, txt, border, align).
pdf.multi_cell(160, 5, 'Dear ' + str(first_row['First Name']) + ' ' + str(first_row['Last Name']) + ', Please find below your Plan.', 0, 'L')

Next, in this loop which loops over the subset, I would use .iterrows() to do the loop instead of using range() and .loc. This is easier and won't break if the index of your dataframe doesn't start with zero. (After grouping, the second group's index won't start with zero anymore.)

Here is the final source code after the changes:

import pandas as pd
# Sample data: one row per (employee, activity) registration.
# Repeated values are written as multiplied lists to make the three
# employee groups (3, 4 and 3 rows) easy to see.
data = {
    'EmpNo': ['123'] * 3 + ['456'] * 4 + ['789'] * 3,
    'First Name': ['John'] * 3 + ['Jane'] * 4 + ['Danny'] * 3,
    'Last Name': ['Doe'] * 7 + ['Roberts'] * 3,
    'Activity Code': [
        'HR-CONF-1', 'HR-Field-NH-ONB', 'COEATT-2021',
        'HR-HBK-CA-1', 'HR-WD-EMP', 'HR-LIST-1', 'HS-Guide-3',
        'HR-WD-EMP', 'HR-LIST-1', 'HS-Guide-3',
    ],
    'RegistrationDate': ['11/22/2021'] * 10,
}
# The column order is passed explicitly so it does not depend on dict ordering.
df = pd.DataFrame(
    data=data,
    columns=['EmpNo', 'First Name', 'Last Name', 'Activity Code', 'RegistrationDate'],
)


from fpdf import FPDF

class PDF(FPDF):
    """FPDF subclass that adds a 'Plan' title header and a page-number footer."""

    def header(self):
        """Draw the page header: the title 'Plan' in bold Helvetica, size 15."""
        self.set_font('Helvetica', style='B', size=15)
        # An empty 80-wide cell moves the title to the right
        self.cell(80)
        # Title, centered inside its own cell
        self.cell(42, 2, 'Plan', border=0, ln=0, align='C')
        # Line break below the title
        self.ln(20)

    def footer(self):
        """Draw the page footer: 'Page <number>/{nb}' in italic Helvetica, size 8."""
        # Position at 1.5 cm from bottom
        self.set_y(-15)
        self.set_font('Helvetica', style='I', size=8)
        # '{nb}' is emitted literally; only the current page number is filled in here
        page_label = 'Page {}/{{nb}}'.format(self.page_no())
        self.cell(0, 10, page_label, border=0, ln=0, align='C')
        # Footer image First is horizontal, second is vertical, third is size

# Write one PDF per employee: groupby yields the employee number together
# with the sub-dataframe holding that employee's rows.
for EmpNo, data in df.groupby("EmpNo"):
    # All rows of a group belong to the same employee, so the first row is
    # enough to look up per-employee attributes such as the name.
    first_row = data.iloc[0]
    full_name = str(first_row['First Name']) + ' ' + str(first_row['Last Name'])

    # Instantiation of inherited class
    pdf = PDF()
    pdf.alias_nb_pages()
    pdf.add_page()
    pdf.set_font('Helvetica', '', 11)
    pdf.cell(80, 6, 'Employee ID: ' + str(EmpNo), 0, 1, 'L')
    pdf.ln(2.5)
    # Unlike cell(), multi_cell() has no `ln` argument: the alignment comes
    # directly after the border, so it is called with (w, h, txt, border, align).
    pdf.multi_cell(160, 5, 'Dear ' + full_name + ', Please find below your Plan.', 0, 'L')
    pdf.cell(80, 6, '', 0, 1, 'C')

    # Table heading
    pdf.set_font('Helvetica', 'B', 13)
    pdf.cell(80, 6, 'Name', 0, 0, 'L')
    pdf.cell(40, 6, 'Date', 0, 0, 'L')
    pdf.cell(40, 6, 'Link', 0, 1, 'L')
    pdf.cell(80, 6, '', 0, 1, 'C')

    # One table line per activity. iterrows() is used instead of range()/.loc
    # because the index of a group does not necessarily start at zero.
    for _, row in data.iterrows():
        # Reset to the regular font; the previous line ended in underlined text
        pdf.set_font('Helvetica', '', 8)
        pdf.cell(80, 6, row['Activity Code'], 0, 0, 'L')
        #pdf.cell(40, 6, row['Activity Link'], 0, 1, 'L')
        pdf.cell(40, 6, row['RegistrationDate'], 0, 0, 'L')
        pdf.set_font('Helvetica', 'U', 8)
        pdf.cell(40, 6, 'Click Here', 0, 1, 'L', link = 'www.google.com')

    # Closing note
    pdf.set_font('Helvetica', 'B', 10)
    pdf.cell(80, 6, '', 0, 1, 'C')
    pdf.cell(80, 6, 'IF YOU REQUIRE ANY HELP, PLEASE CONTACT US', 0, 0, 'L')

    # The output file is named after the employee, e.g. "John Doe.pdf"
    pdf.output(full_name + '.pdf', 'F')

Tested, and it works.

How to divide the dataframe in pandas into multiple dataframes based on the group by results?

You can use groupby to create a dictionary of DataFrames, keyed by group value:

# Map every distinct 'Date_A' value to the sub-DataFrame of rows carrying it.
d = {key: group for key, group in df.groupby('Date_A')}
print(d['09/01/2016'])
       Date_A      Date_B      Date_C       Amount
0  09/01/2016  09/01/2016  01/01/1800   2405814.36
1  09/01/2016  09/01/2016  09/01/2016  11347445.71
2  09/01/2016  10/01/2016  01/01/1800   5005106.94
3  09/01/2016  10/01/2016  09/02/2016    391679.95
4  09/01/2016  10/01/2016  10/01/2016  15982401.76
5  09/01/2016  11/01/2016  01/01/1800   3625649.29
6  09/01/2016  11/01/2016  10/14/2016    200730.30
7  09/01/2016  11/01/2016  11/01/2016   3801867.77
8  09/01/2016  01/01/2017  01/01/2017    398961.22

EDIT:

For looping use items():

# Use a dedicated loop variable for each group: naming it `df` would rebind
# the original DataFrame to the last group once the loop finishes.
for key, group in d.items():
    print(key)
    print(group)

Pandas dataframe split based on a filter on groupby

Here is a slightly different approach that uses .duplicated instead of groupby/filter, which can be really slow if you have a large dataframe (dft below). Note keep=False: it marks every row of a duplicated set as a duplicate, instead of leaving the first occurrence unmarked, which is the default behavior.

import pandas as pd
import numpy as np

num_rows = 100

# Fixed seed so the random test data is reproducible
np.random.seed(1)

# Test df: each column is filled with random integers from [low, high).
# The columns are drawn in the order listed here.
column_bounds = {
    'time': (5, 25),
    'O': (1, 4),
    'A': (1, 4),
    'N': (1, 4),
    'value': (10, 100),
    'value_next': (-10, 40),
}
dft = pd.DataFrame({
    name: np.random.randint(low, high, num_rows)
    for name, (low, high) in column_bounds.items()
})

# Mask that is True for every row whose key columns occur more than once
# (keep=False marks all members of a duplicated set), False otherwise
key_columns = ['O', 'A', 'N', 'value_next']
is_dup = dft.duplicated(subset=key_columns, keep=False)

# df2 receives the duplicated rows, df1 the rows that are unique
df2 = dft[is_dup]
df1 = dft[~is_dup]

print(df2)

# Quick check that a row in df2 was originally duplicated
dft[(dft['O'] == 2) & (dft['A'] == 3) & (dft['N'] == 1) & (dft['value_next'] == 8)]

Split dataframe based on multiple columns pandas groupby

You can extract the unique values of both the a and b columns and use each one as a filter. For example,

import pandas as pd

# Each row is (a, b, c, d): two colour labels followed by two integer values.
rows = [
    ("red", "green", 1, 2),
    ("brown", "red", 4, 5),
    ("black", "grey", 0, 0),
    ("red", "blue", 6, 1),
    ("green", "blue", 0, 3),
    ("black", "brown", 2, 8),
    ("red", "grey", 4, 6),
]
df = pd.DataFrame(rows, columns=["a", "b", "c", "d"])

# Distinct colours that occur in column a or column b
color_values = df[['a', 'b']].values
colors = pd.unique(color_values.ravel('K'))

>>> colors
    array(['red', 'brown', 'black', 'green', 'grey', 'blue'], dtype=object)

Iterate over each color and perform your operation on the resulting current_df after filtering.

df_list = []
for color in colors:
    # Rows mentioning this colour in either column, renumbered from 0 so that
    # a row's name equals its position in current_df
    has_color = (df.a == color) | (df.b == color)
    current_df = df[has_color].copy().reset_index(drop=True)

    # c is summed where the colour is in column a, d where it is in column b
    rows_a = current_df[current_df.a == color]
    rows_b = current_df[current_df.b == color]
    n_rows = current_df.shape[0]

    def mean_of_following(row):
        """Sum the c/d values of all rows after `row`, divided by the number of rows after it."""
        start = row.name + 1
        total = rows_a.loc[start:, "c"].sum() + rows_b.loc[start:, "d"].sum()
        return total / (n_rows - row.name - 1)

    current_df["e"] = current_df.apply(mean_of_following, axis=1)
    df_list.append(current_df)

(current_df.shape[0] - x.name - 1) becomes the number of values that were added, because x.name is the row number and current_df.shape[0] is the total number of rows of the current filtered df. This is equivalent to:

df_list = []
for color in colors:
    # Rows mentioning this colour in either column; the original index labels are kept
    has_color = (df.a == color) | (df.b == color)
    current_df = df[has_color].copy()

    # c is used where the colour is in column a, d where it is in column b
    rows_a = current_df[current_df.a == color]
    rows_b = current_df[current_df.b == color]

    def mean_of_following(row):
        """Sum the c/d values selected from index label row.name + 1 onwards, divided by how many were selected."""
        following_c = rows_a.loc[row.name + 1:, "c"]
        following_d = rows_b.loc[row.name + 1:, "d"]
        total = following_c.sum() + following_d.sum()
        return total / (following_c.size + following_d.size)

    current_df["e"] = current_df.apply(mean_of_following, axis=1)
    df_list.append(current_df)

Result for red:

>>> df_list[0]
           a      b  c  d    e
    0    red  green  1  2  5.0
    1  brown    red  4  5  5.0
    3    red   blue  6  1  4.0
    6    red   grey  4  6  NaN

How to split groupby dataframe in a given ratio?

Starting from Pandas 1.1.0, you can use groupby.sample. Something like this:

# random_state for repeatability
# remove if not needed
# The sampled rows must be kept in `train`: the next line needs their index.
train = df.groupby('A').sample(frac=0.7, random_state=43)
# Every row that was not sampled into train goes to test
# (assumes df's index labels are unique, since rows are dropped by label).
test = df.drop(train.index)

Update: If you just want the top rows, you can do:

def _head_fraction(group):
    """Return the leading 70% of a group's rows, with the row count truncated to an integer."""
    keep = int(len(group) * 0.7)
    return group.head(keep)


# Take the top rows of every group, then drop the first index level
# so the result is indexed like df again.
per_group_head = df.groupby('A').apply(_head_fraction)
train = per_group_head.reset_index(level=0, drop=True)
# Whatever is not in train goes to test
test = df.drop(train.index)

Or you can do lazy groupby and boolean indexing, which is a bit faster but more verbose:

groups = df.groupby('A')
# 0-based position of every row inside its own group
row_nums = groups.cumcount()
# Size of the group each row belongs to, aligned with df's index.
# A GroupBy object is not callable, so the column is selected with [].
sizes = groups['A'].transform('size')

# Number of leading rows per group that go to train (70%, truncated to an
# integer). Because row_nums starts at 0, the comparison has to be strict:
# `<=` would put one extra row of every group into train.
cutoff = (sizes * 0.7).astype(int)
train = df[row_nums < cutoff]
test = df[row_nums >= cutoff]

How to split a dataframe by unique groups and save to a csv

You can obtain the unique values by calling unique, then iterate over them, build a filename for each one, and write the matching rows out with to_csv:

# Distinct values of the 'Gene' column, one output file per value
genes = df['Gene'].unique()
for gene in genes:
    # The file name is the gene name plus the '.pdf' suffix (content is CSV text)
    outfilename = gene + '.pdf'
    print(outfilename)
    # Only the rows belonging to this gene are written
    gene_rows = df[df['Gene'] == gene]
    gene_rows.to_csv(outfilename)
HAPPY.pdf
SAD.pdf
LEG.pdf

A more pandas-thonic method is to groupby on 'Gene' and then iterate over the groups:

gp = df.groupby('Gene')
# gp.groups is a dict with 'Gene':indices as k:v pair; only the indices are needed
for row_labels in gp.groups.values():
    print(df.loc[row_labels])
    
    chr  start  end   Gene  Value  MoreData
0  chr1    123  123  HAPPY   41.1       3.4
1  chr1    125  129  HAPPY   45.9       4.5
2  chr1    140  145  HAPPY   39.3       4.1
    chr  start  end Gene  Value  MoreData
3  chr1    342  355  SAD   34.2       9.0
4  chr1    360  361  SAD   44.3       8.1
5  chr1    390  399  SAD   29.0       7.2
6  chr1    400  411  SAD   35.6       6.5
    chr  start  end Gene  Value  MoreData
7  chr1    462  470  LEG     20       2.7

Split Pandas Dataframe Based on Groupby