Split Pandas Dataframe Based on Groupby

Split pandas dataframe based on groupby

gb = df.groupby('ZZ')    
[gb.get_group(x) for x in gb.groups]

Split dataframe based on pandas groupby and generate multiple PDFs

I would write the groupby like this:

for EmpNo, data in df.groupby("EmpNo"):

For each group, the groupby will return the variable it groups on, and the dataframe which matches that variable.

Next, I would extract the first row of that dataframe. This is to make it easier to get the name and similar attributes.

first_row = data.iloc[0]

(What's the difference between iloc and loc?)

Since we have the employee ID already, we can skip looking it up in the dataframe. For other attributes, we can look it up like first_row['First Name'].

pdf.cell(80, 6, 'Employee ID: ' + str(EmpNo), 0, 1, 'L')
# ...
pdf.multi_cell(160, 5, 'Dear ' + str(first_row['First Name']) + ' ' + str(first_row['Last Name']) + ', Please find below your Plan.', 0, 1, 'L')

Next, in this loop which loops over the subset, I would use .iterrows() to do the loop instead of using range() and .loc. This is easier and won't break if the index of your dataframe doesn't start with zero. (After grouping, the second group's index won't start with zero anymore.)

Here is the final source code after the changes:

import pandas as pd
data = {'EmpNo': ['123','123','123','456','456', '456','456','789','789','789'],
'First Name': ['John', 'John', 'John', 'Jane', 'Jane', 'Jane', 'Jane', 'Danny', 'Danny', 'Danny'],
'Last Name': ['Doe', 'Doe' ,'Doe', 'Doe' ,'Doe', 'Doe', 'Doe', 'Roberts', 'Roberts', 'Roberts'],
'Activity Code': ['HR-CONF-1', 'HR-Field-NH-ONB','COEATT-2021','HR-HBK-CA-1','HR-WD-EMP','HR-LIST-1','HS-Guide-3','HR-WD-EMP','HR-LIST-1','HS-Guide-3'],
'RegistrationDate': ['11/22/2021', '11/22/2021', '11/22/2021', '11/22/2021', '11/22/2021', '11/22/2021','11/22/2021', '11/22/2021', '11/22/2021','11/22/2021']}
df = pd.DataFrame(data = data, columns = ['EmpNo','First Name', 'Last Name', 'Activity Code', 'RegistrationDate'])


from fpdf import FPDF

class PDF(FPDF):
    """FPDF subclass that draws a 'Plan' title header and a page-number footer."""

    # Page header
    def header(self):
        # Helvetica bold 15
        self.set_font('Helvetica', 'B', 15)
        # Move to the right
        self.cell(80)
        # Title
        self.cell(42, 2, 'Plan', 0, 0, 'C')
        # Line break
        self.ln(20)

    # Page footer
    def footer(self):
        # Position at 1.5 cm from bottom
        self.set_y(-15)
        # Helvetica italic 8
        self.set_font('Helvetica', 'I', 8)
        # Page number; '/{nb}' is emitted literally here (the caller enables
        # alias_nb_pages() on the instance before adding pages)
        self.cell(0, 10, 'Page ' + str(self.page_no()) + '/{nb}', 0, 0, 'C')
# Footer image First is horizontal, second is vertical, third is size

# Build one PDF per employee: each group holds all rows sharing an EmpNo.
for EmpNo, data in df.groupby("EmpNo"):
    # Get first row of grouped dataframe (used for the name fields)
    first_row = data.iloc[0]

    # Instantiation of inherited class
    pdf = PDF()
    pdf.alias_nb_pages()
    pdf.add_page()
    pdf.set_font('Helvetica', '', 11)
    pdf.cell(80, 6, 'Employee ID: ' + str(EmpNo), 0, 1, 'L')
    pdf.ln(2.5)
    pdf.multi_cell(160, 5, 'Dear ' + str(first_row['First Name']) + ' ' + str(first_row['Last Name']) + ', Please find below your Plan.', 0, 1, 'L')
    pdf.cell(80, 6, '', 0, 1, 'C')

    # Table heading row
    pdf.set_font('Helvetica', 'B', 13)
    pdf.cell(80, 6, 'Name', 0, 0, 'L')
    pdf.cell(40, 6, 'Date', 0, 0, 'L')
    pdf.cell(40, 6, 'Link', 0, 1, 'L')
    pdf.cell(80, 6, '', 0, 1, 'C')
    pdf.set_font('Helvetica', '', 8)

    # One table line per row of this employee's group; iterrows() avoids
    # relying on the group's index starting at zero.
    for _, row in data.iterrows():
        pdf.set_font('Helvetica', '', 8)
        pdf.cell(80, 6, row['Activity Code'], 0, 0, 'L')
        #pdf.cell(40, 6, row['Activity Link'], 0, 1, 'L')
        pdf.cell(40, 6, row['RegistrationDate'], 0, 0, 'L')
        # Underlined font for the link cell
        pdf.set_font('Helvetica', 'U', 8)
        pdf.cell(40, 6, 'Click Here', 0, 1, 'L', link = 'www.google.com')

    # Closing line, then write the file named after the employee
    pdf.set_font('Helvetica', 'B', 10)
    pdf.cell(80, 6, '', 0, 1, 'C')
    pdf.cell(80, 6, 'IF YOU REQUIRE ANY HELP, PLEASE CONTACT US', 0, 0, 'L')
    pdf.output(str(first_row['First Name']) + ' ' + str(first_row['Last Name'])+ '.pdf', 'F')

Tested, and it works.

How to divide the dataframe in pandas into multiple dataframes based on the group by results?

You can use groupby to create a dictionary of DataFrames:

d = dict(tuple(df.groupby('Date_A')))
print (d['09/01/2016'])
Date_A Date_B Date_C Amount
0 09/01/2016 09/01/2016 01/01/1800 2405814.36
1 09/01/2016 09/01/2016 09/01/2016 11347445.71
2 09/01/2016 10/01/2016 01/01/1800 5005106.94
3 09/01/2016 10/01/2016 09/02/2016 391679.95
4 09/01/2016 10/01/2016 10/01/2016 15982401.76
5 09/01/2016 11/01/2016 01/01/1800 3625649.29
6 09/01/2016 11/01/2016 10/14/2016 200730.30
7 09/01/2016 11/01/2016 11/01/2016 3801867.77
8 09/01/2016 01/01/2017 01/01/2017 398961.22

EDIT:

For looping, use items():

# Walk the {group key: sub-DataFrame} dictionary; the loop variable is named
# group_df so it does not overwrite the original df.
for key, group_df in d.items():
    print(key)
    print(group_df)

Pandas dataframe split based on a filter on groupby

Here is a slightly different approach using .duplicated instead of groupby/filter, which can be really slow if you have a large dft. Note keep=False, which marks all duplicate rows instead of ignoring the first instance of a duplicate (the default behavior).

import pandas as pd
import numpy as np

# Number of rows in the synthetic test frame
num_rows = 100

# Fixed seed so the random test data is reproducible
np.random.seed(1)

# Creating a test df of random integers (the column order below is also the
# order in which the random draws are made)
dft = pd.DataFrame(
    {
        'time': np.random.randint(5, 25, num_rows),
        'O': np.random.randint(1, 4, num_rows),
        'A': np.random.randint(1, 4, num_rows),
        'N': np.random.randint(1, 4, num_rows),
        'value': np.random.randint(10, 100, num_rows),
        'value_next': np.random.randint(-10, 40, num_rows),
    }
)

# Columns whose combined values decide whether a row counts as a duplicate
key_cols = ['O', 'A', 'N', 'value_next']

# Mask that is True for every row whose key combination occurs more than once
# (keep=False flags all occurrences, including the first)
is_dup = dft.duplicated(subset=key_cols, keep=False)

# df1: rows with a unique key combination; df2: rows that share one
df1 = dft.loc[~is_dup]
df2 = dft.loc[is_dup]

print(df2)

# Quick check that a row in df2 was originally duplicated
key_match = (
    dft['O'].eq(2)
    & dft['A'].eq(3)
    & dft['N'].eq(1)
    & dft['value_next'].eq(8)
)
dft[key_match]

Split dataframe based on multiple columns pandas groupby

You can extract the unique values of both the a and b columns and use each one as a filter. For example,

import pandas as pd

df = pd.DataFrame(
[
["red", "green", 1, 2],
["brown", "red", 4, 5],
["black", "grey", 0, 0],
["red", "blue", 6, 1],
["green", "blue", 0, 3],
["black", "brown", 2, 8],
["red", "grey", 4, 6],
],
columns=["a", "b", "c", "d"]
)

colors = pd.unique(df[['a', 'b']].values.ravel('K'))

>>> colors
array(['red', 'brown', 'black', 'green', 'grey', 'blue'], dtype=object)

Iterate over each color and perform your operation on the resulting current_df after filtering.

df_list = []
for color in colors:
    # Rows where the color appears in either column, renumbered 0..n-1 so that
    # x.name below is the row's position within this filtered frame.
    current_df = df[(df.a == color) | (df.b == color)].copy().reset_index(drop=True)
    # "e": for each row, the sum over the rows after it of "c" (where a is the
    # color) and "d" (where b is the color), divided by the number of rows
    # after it.
    current_df["e"] = current_df.apply(
        lambda x: (
            current_df[(current_df.a == color)].loc[x.name + 1 :, "c"].sum()
            + current_df[(current_df.b == color)].loc[x.name + 1 :, "d"].sum()
        )
        / (current_df.shape[0] - x.name - 1),
        axis=1
    )
    df_list.append(current_df)

(current_df.shape[0] - x.name - 1) becomes the number of values that were added, because x.name is the row number and current_df.shape[0] is the total number of rows of the current filtered df. This is equivalent to:

df_list = []
for color in colors:
    # Rows where the color appears in either column; the original index labels
    # are kept, so x.name below is the row's label in df.
    current_df = df[(df.a == color) | (df.b == color)].copy()
    # "e": sum of the later "c"/"d" values belonging to the color, divided by
    # how many such values there are (counted explicitly with .size).
    current_df["e"] = current_df.apply(
        lambda x: (
            current_df[(current_df.a == color)].loc[x.name + 1 :, "c"].sum()
            + current_df[(current_df.b == color)].loc[x.name + 1 :, "d"].sum()
        )
        / (
            current_df[(current_df.a == color)].loc[x.name + 1 :, "c"].size
            + current_df[(current_df.b == color)].loc[x.name + 1 :, "d"].size
        ),
        axis=1,
    )
    df_list.append(current_df)

Result for red:

>>> df_list[0]
a b c d e
0 red green 1 2 5.0
1 brown red 4 5 5.0
3 red blue 6 1 4.0
6 red grey 4 6 NaN

How to split groupby dataframe in a given ratio?

Starting from Pandas 1.1.0, you can use groupby.sample. Something like this:

# random_state for repeatability
# remove if not needed
# Sample 70% of the rows of every 'A' group; the result must be bound to
# `train` because the next line uses its index.
train = df.groupby('A').sample(frac=0.7, random_state=43)
# Everything that was not sampled becomes the test set
test = df.drop(train.index)

Update: If you just want the top rows, you can do:

train = (df.groupby('A')
.apply(lambda x: x.head(int(len(x) * 0.7) ) )
.reset_index(level=0, drop=True)
)
test = df.drop(train.index)

Or you can do lazy groupby and boolean indexing, which is a bit faster but more verbose:

groups = df.groupby('A')
# 0-based position of each row within its group
row_nums = groups.cumcount()
# Size of the group each row belongs to; select the column with [] --
# a GroupBy object is not callable
sizes = groups['A'].transform('size')

# Split each group at 70% of its size using boolean masks
train = df[row_nums <= sizes * 0.7]
test = df[row_nums > sizes * 0.7]

How to split a dataframe by unique groups and save to a csv

You can obtain the unique values by calling unique, iterate over them, build the filename, and write each subset out to CSV:

genes = df['Gene'].unique()
for gene in genes:
    # File name is built from the gene value
    # NOTE(review): the content written is CSV although the name ends in '.pdf'
    outfilename = gene + '.pdf'
    print(outfilename)
    # Write only the rows belonging to this gene
    df[df['Gene'] == gene].to_csv(outfilename)
HAPPY.pdf
SAD.pdf
LEG.pdf

A more pandas-thonic method is to groupby on 'Gene' and then iterate over the groups:

gp = df.groupby('Gene')
# gp.groups is a dict with 'Gene':indices as k:v pairs
for g in gp.groups.items():
    # g is a (gene, indices) tuple; g[1] selects that gene's rows
    print(df.loc[g[1]])

chr start end Gene Value MoreData
0 chr1 123 123 HAPPY 41.1 3.4
1 chr1 125 129 HAPPY 45.9 4.5
2 chr1 140 145 HAPPY 39.3 4.1
chr start end Gene Value MoreData
3 chr1 342 355 SAD 34.2 9.0
4 chr1 360 361 SAD 44.3 8.1
5 chr1 390 399 SAD 29.0 7.2
6 chr1 400 411 SAD 35.6 6.5
chr start end Gene Value MoreData
7 chr1 462 470 LEG 20 2.7


Related Topics



Leave a reply



Submit