Merge Multiple Column Values into One Column in Python Pandas

Merge multiple column values into one column in python pandas

You can call apply, passing axis=1 to apply row-wise, then convert the dtype to str and join:

In [153]:
df['ColumnA'] = df[df.columns[1:]].apply(
    lambda x: ','.join(x.dropna().astype(str)),
    axis=1
)
df

Out[153]:
  Column1  Column2  Column3  Column4  Column5  ColumnA
0       a        1        2        3        4  1,2,3,4
1       a        3        4        5      NaN    3,4,5
2       b        6        7        8      NaN    6,7,8
3       c        7        7      NaN      NaN      7,7

Here I call dropna to get rid of the NaN values; however, because columns containing NaN are upcast to float, we need to cast to int as well so we don't end up with floats as str (e.g. '3.0').
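For example, a variant that routes through int first (a sketch, assuming the non-null values are whole numbers):

df['ColumnA'] = df[df.columns[1:]].apply(
    lambda x: ','.join(x.dropna().astype(int).astype(str)),
    axis=1
)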

Pandas merge multiple value columns into a value and type column

You could do that with the following:

pd.melt(df, value_vars=['a','b','c'], value_name='count', var_name='type')
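For example, with a small hypothetical frame (column names a, b, c as in the call above):

import pandas as pd

df = pd.DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [5, 6]})
print(pd.melt(df, value_vars=['a', 'b', 'c'], value_name='count', var_name='type'))
#   type  count
# 0    a      1
# 1    a      2
# 2    b      3
# 3    b      4
# 4    c      5
# 5    c      6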

Merge multiple columns into one column in Python

Replace empty strings with missing values, then back-fill the missing values along each row and select the first column; finally, convert to integers if necessary and to a one-column DataFrame:

import numpy as np
import pandas as pd

data = {'Column1': [3, 2, 1, ""],
        'Column2': ["", "", "6", "7"],
        'Column3': ["", "", "", 13]}

df = pd.DataFrame(data)

df1 = df.replace('', np.nan).bfill(axis=1).iloc[:, 0].astype(int).to_frame('Column A')
print (df1)
   Column A
0         3
1         2
2         1
3         7
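An equivalent sketch using stack, which drops the NaN values, then taking the first remaining value per row:

df1 = (df.replace('', np.nan)
         .stack()
         .groupby(level=0)
         .first()
         .astype(int)
         .to_frame('Column A'))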

How to concatenate multiple column values into a single column in Pandas dataframe based on start and end time

Let's do this in a few steps.

First, let's make sure your Timestamp is a datetime.

df['Timestamp'] = pd.to_datetime(df['Timestamp'])
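The question's sample data isn't reproduced here; a hypothetical input consistent with the outputs below (one row per day, one numeric usage column per appliance) would be:

import pandas as pd

df = pd.DataFrame({
    'Timestamp': pd.date_range('2013-02-01', periods=8),
    'A': [5, 2, 3, 1, 0, 4, 2, 0],
    'B': [0, 1, 0, 2, 3, 1, 5, 0],
    'C': [0, 4, 6, 2, 1, 3, 2, 0],
})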

Then we can create a new dataframe based on the min and max values of your timestamp.

df1 = pd.DataFrame({'start_time' : pd.date_range(df['Timestamp'].min(), df['Timestamp'].max())})

df1['end_time'] = df1['start_time'] + pd.DateOffset(days=1)

start_time end_time
0 2013-02-01 2013-02-02
1 2013-02-02 2013-02-03
2 2013-02-03 2013-02-04
3 2013-02-04 2013-02-05
4 2013-02-05 2013-02-06
5 2013-02-06 2013-02-07
6 2013-02-07 2013-02-08
7 2013-02-08 2013-02-09

Now we need to create a dataframe to merge onto your start_time column.

Let's filter out any values that are less than or equal to zero and create a list of active appliances:

df = df.set_index('Timestamp')

# The remaining columns must be numeric for this to work,
# or you'll need to subselect them.
df2 = (df.mask(df.le(0))
         .stack()
         .reset_index(1)
         .groupby(level=0)
         .agg(active_appliances=('level_1', list))
         .reset_index(0))

# Change .agg(active_appliances=('level_1', list))
# to     .agg(active_appliances=('level_1', ','.join))
# if you prefer strings.

Timestamp active_appliances
0 2013-02-01 [A]
1 2013-02-02 [A, B, C]
2 2013-02-03 [A, C]
3 2013-02-04 [A, B, C]
4 2013-02-05 [B, C]
5 2013-02-06 [A, B, C]
6 2013-02-07 [A, B, C]

Then we can merge:

final = (pd.merge(df1, df2, left_on='start_time', right_on='Timestamp', how='left')
           .drop(columns='Timestamp'))

start_time end_time active_appliances
0 2013-02-01 2013-02-02 [A]
1 2013-02-02 2013-02-03 [A, B, C]
2 2013-02-03 2013-02-04 [A, C]
3 2013-02-04 2013-02-05 [A, B, C]
4 2013-02-05 2013-02-06 [B, C]
5 2013-02-06 2013-02-07 [A, B, C]
6 2013-02-07 2013-02-08 [A, B, C]
7 2013-02-08 2013-02-09 NaN
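If you'd rather show an empty list than NaN for days with no active appliances, an optional post-processing sketch:

final['active_appliances'] = final['active_appliances'].apply(
    lambda v: v if isinstance(v, list) else [])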

How to combine multiple dataframe columns into one given each column has nan values

With stack:

df["XYZ"] = df.stack().values

to get

>>> df

       X      Y      Z    XYZ
0    NaN    NaN  ZVal1  ZVal1
1    NaN    NaN  ZVal2  ZVal2
2  XVal1    NaN    NaN  XVal1
3    NaN  YVal1    NaN  YVal1

since you guarantee only one non-NaN value per row and stack drops NaNs by default.


Another way with fancy indexing:

df["XYZ"] = df.to_numpy()[np.arange(len(df)),
df.columns.get_indexer(df.notna().idxmax(axis=1))]

which, for each row, looks at the index of the non-NaN value and selects it.
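Both approaches can be verified on a small frame matching the output above (a sketch with the same hypothetical placeholder values):

import numpy as np
import pandas as pd

df = pd.DataFrame({
    'X': [np.nan, np.nan, 'XVal1', np.nan],
    'Y': [np.nan, np.nan, np.nan, 'YVal1'],
    'Z': ['ZVal1', 'ZVal2', np.nan, np.nan],
})
cols = df[['X', 'Y', 'Z']]

# stack() drops the NaNs, leaving exactly one value per row
df['XYZ'] = cols.stack().values

# equivalent: index each row at its single non-NaN column
df['XYZ'] = cols.to_numpy()[np.arange(len(cols)),
                            cols.columns.get_indexer(cols.notna().idxmax(axis=1))]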

Combine different values of multiple columns into one column

Thanks to the comments (in the polars issues) from @cannero and @ritchie46, I was able to make it work.

This is a working version (Float64):

use polars::prelude::*;

fn my_black_box_function(a: f64, b: f64) -> f64 {
    // do something
    a
}

fn apply_multiples(lf: LazyFrame) -> Result<DataFrame> {
    let ergebnis = lf
        .select([col("struct_col").map(
            |s| {
                let ca = s.struct_()?;

                let a = ca.field_by_name("a")?;
                let b = ca.field_by_name("b")?;
                let a = a.f64()?;
                let b = b.f64()?;

                let out: Float64Chunked = a
                    .into_iter()
                    .zip(b.into_iter())
                    .map(|(opt_a, opt_b)| match (opt_a, opt_b) {
                        (Some(a), Some(b)) => Some(my_black_box_function(a, b)),
                        _ => None,
                    })
                    .collect();

                Ok(out.into_series())
            },
            GetOutput::from_type(DataType::Float64),
        )])
        .collect();

    ergebnis
}

fn main() {
    // We start with a normal DataFrame
    let df = df![
        "a" => [1.0, 2.0, 3.0],
        "b" => [3.0, 5.1, 0.3]
    ]
    .unwrap();

    // We CONVERT the df into a StructChunked and WRAP this into a new LazyFrame
    let lf = df![
        "struct_col" => df.into_struct("StructChunked")
    ]
    .unwrap()
    .lazy();

    let processed = apply_multiples(lf);

    match processed {
        Ok(..) => println!("We did it"),
        Err(e) => println!("{:?}", e),
    }
}

Here is a version for my initial question (String):

use polars::prelude::*;

fn my_fruit_box(fruit: String, color: String) -> String {
    // do something
    format!("{} has {} color", fruit, color)
}

fn apply_multiples(lf: LazyFrame) -> Result<DataFrame> {
    let ergebnis = lf
        .select([col("struct_col").map(
            |s| {
                let ca = s.struct_()?;

                let fruit = ca.field_by_name("Fruit")?;
                let color = ca.field_by_name("Color")?;
                let fruit = fruit.utf8()?;
                let color = color.utf8()?;

                let out: Utf8Chunked = fruit
                    .into_iter()
                    .zip(color.into_iter())
                    .map(|(opt_fruit, opt_color)| match (opt_fruit, opt_color) {
                        (Some(fruit), Some(color)) => {
                            Some(my_fruit_box(fruit.to_string(), color.to_string()))
                        }
                        _ => None,
                    })
                    .collect();

                Ok(out.into_series())
            },
            GetOutput::from_type(DataType::Utf8),
        )])
        .collect();

    ergebnis
}

fn main() {
    // We start with a normal DataFrame
    let s1 = Series::new("Fruit", &["Apple", "Apple", "Pear"]);
    let s2 = Series::new("Color", &["Red", "Yellow", "Green"]);

    let df = DataFrame::new(vec![s1, s2]).unwrap();

    // We CONVERT the df into a StructChunked and WRAP this into a new LazyFrame
    let lf = df![
        "struct_col" => df.into_struct("StructChunked")
    ]
    .unwrap()
    .lazy();

    let processed = apply_multiples(lf);

    match processed {
        Ok(..) => println!("We did it"),
        Err(e) => println!("{:?}", e),
    }
}
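For comparison, a rough Python-polars sketch of the same struct-and-map pattern (assuming a recent polars release where Expr.map_elements is available; older versions call this apply):

import polars as pl

df = pl.DataFrame({
    "Fruit": ["Apple", "Apple", "Pear"],
    "Color": ["Red", "Yellow", "Green"],
})

# Pack the two columns into a struct, then map over it row by row
out = df.lazy().select(
    pl.struct(["Fruit", "Color"])
      .map_elements(lambda row: f"{row['Fruit']} has {row['Color']} color",
                    return_dtype=pl.Utf8)
      .alias("combined")
).collect()
print(out)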

Joining or merging multiple columns within one dataframe and keeping all data
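The question's input isn't shown in this snippet; hypothetically, a frame consistent with the output below would be:

import pandas as pd

# Hypothetical input consistent with the output shown below
df = pd.DataFrame({
    'Position1': [1, 2, 3], 'Count1': [55, 35, 45],
    'Position2': [2, 4, 7], 'Count2': [35, 15, 75],
    'Position3': [3, 5, 6], 'Count3': [45, 95, 105],
})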

Here is a way to do what you've asked:

df = (df[['Position1', 'Count1']].rename(columns={'Position1': 'Positions'})
      .join(df[['Position2', 'Count2']].set_index('Position2'), on='Positions', how='outer')
      .join(df[['Position3', 'Count3']].set_index('Position3'), on='Positions', how='outer')
      .sort_values(by=['Positions'])
      .reset_index(drop=True))

Output:

   Positions  Count1  Count2  Count3
0          1    55.0     NaN     NaN
1          2    35.0    35.0     NaN
2          3    45.0     NaN    45.0
3          4     NaN    15.0     NaN
4          5     NaN     NaN    95.0
5          6     NaN     NaN   105.0
6          7     NaN    75.0     NaN

Explanation:

  • Use join first on Position1, Count1 and Position2, Count2 (with Position1 renamed to Positions), then join that result with Position3, Count3.
  • Sort by Positions and use reset_index to create a new integer range index (ascending with no gaps).

Merge multiple column into one column as list in python pandas

It is not clear whether the input data uses a MultiIndex or a plain Index, so the number of index levels is counted with range:

df['ColumnA'] = df.stack().astype(int).groupby(level=list(range(df.index.nlevels))).agg(list)
print (df)
       Column1  Column2  Column3  Column4       ColumnA
Index
0 a          1        2      3.0      4.0  [1, 2, 3, 4]
1 a          3        4      5.0      NaN     [3, 4, 5]
2 b          6        7      8.0      NaN     [6, 7, 8]
3 c          7        7      NaN      NaN        [7, 7]

Or:

df['ColumnA'] = df.apply(lambda x: x.dropna().astype(int).tolist(), axis=1)

If there are empty strings:

print (df)
       Column1  Column2  Column3  Column4
Index
0 a          1        2      3.0      4.0
1 a          3        4      5.0
2 b          6        7      8.0
3 c          7        7

df['ColumnA'] = df.apply(lambda x: x[x != ''].astype(int).tolist(), axis=1)

print (df)
       Column1  Column2  Column3  Column4       ColumnA
Index
0 a          1        2      3.0      4.0  [1, 2, 3, 4]
1 a          3        4      5.0              [3, 4, 5]
2 b          6        7      8.0              [6, 7, 8]
3 c          7        7                          [7, 7]

Merge multiple values of a column after group by into one column in python pandas

Using GroupBy.agg with a lambda function is the most general solution:

df1 = df.groupby(['ID1','ID2'])['Status'].agg(lambda x: ','.join(x.dropna())).reset_index()
print (df1)
   ID1 ID2 Status
0    1   a      1
1    2   b      1
2    3   c    2,1

Another idea is to remove the missing values in a first step, but if some group contains only missing values it is dropped from the output, so further processing such as a merge is necessary:

# first group contains only NaNs
df = pd.DataFrame({'ID1': [1, 1, 2, 2, 3, 3, 3],
                   'ID2': ['a', 'a', 'b', 'b', 'c', 'c', 'c'],
                   'Status': pd.Series([np.nan, np.nan, np.nan, '1', '2', np.nan, '1'],
                                       dtype="category")})

# the first group is dropped
df11 = (df.dropna(subset=['Status'])
          .groupby(['ID1', 'ID2'])['Status']
          .agg(','.join)
          .reset_index())
print (df11)
   ID1 ID2 Status
0    2   b      1
1    3   c    2,1

# add the missing pairs and convert NaNs to empty strings
df2 = (df.drop_duplicates(['ID1', 'ID2'])[['ID1', 'ID2']]
         .merge(df11, how='left')
         .fillna(''))
print (df2)
   ID1 ID2 Status
0    1   a
1    2   b      1
2    3   c    2,1

The first solution handles this case directly:

df1 = df.groupby(['ID1','ID2'])['Status'].agg(lambda x: ','.join(x.dropna())).reset_index()
print (df1)
   ID1 ID2 Status
0    1   a
1    2   b      1
2    3   c    2,1

