How to Get Mean of Every N Rows and Keep the Date Index

How can I get mean of every n rows and keep the date index?

A short one-liner solution with data.table:

library(data.table)

# Average `val` over consecutive pairs of rows. The grouping key is
# `year - 0:1`: the length-2 vector 0:1 is recycled along `year`, so the
# second row of each pair is shifted back by one and shares the first row's key.
# NOTE(review): assumes one row per consecutive year and an even number of
# rows -- confirm against the actual data.
setDT(df)[,.(val=mean(val)), year-0:1]
# year val
# 1: 1990 75.5
# 2: 1992 105.0
# 3: 1994 105.5
# 4: 1996 164.5
# 5: 1998 164.5
# 6: 2000 119.5
# 7: 2002 213.5
# 8: 2004 223.0
# 9: 2006 164.5
#10: 2008 208.5
#11: 2010 157.0
#12: 2012 116.5

How to average column values every n rows in pandas

If I understand correctly, you can use DataFrame.melt and then compute the mean for each site with GroupBy.mean:

# df_tmp = df_tmp.astype(int) # get correct result
# Unpivot every non-'site' column into a single 'value' column (long format),
# then average 'value' within each site.
df_tmp.melt('site').groupby('site')['value'].mean()

Or:

# df_tmp = df_tmp.astype(int) # get correct result
# Move 'site' into the index, stack the remaining columns into one Series,
# then average per site (index level 0).
df_tmp.set_index('site').stack().groupby(level=0).mean()
#df_tmp.set_index('site').stack().mean(level=0) # .mean(level=0) deprecated

Output

site
1 3.333333
2 7.333333
Name: value, dtype: float64

Calculate average of every n rows from a csv file

You can use integer division of the index by step to label consecutive groups of rows, then pass those labels to groupby and aggregate with mean:

# Number of consecutive rows averaged together.
step = 30
m_df = pd.read_csv(m_path, usecols=['Col-01'])
# Integer-dividing the row labels by `step` gives one bucket id per run of
# `step` rows; read_csv without index_col yields the default 0..n-1 index.
bucket = m_df.index // step
df = m_df.groupby(bucket).mean()

Or:

# Positional bucket ids (0, 0, ..., 1, 1, ...) built from the row count, so the
# grouping does not depend on the values of the DataFrame's index.
df = m_df.groupby(np.arange(len(m_df)) // step).mean()

Sample data:

# Sample run: average every 3 consecutive rows.
step = 3
row_bucket = m_df.index // step
df = m_df.groupby(row_bucket).mean()
print(df)
H
0 3
1 1
2 2

Calculate the mean of every 13 rows in data frame

Here's a solution using aggregate() and rep().

# Example data: 12 rows, two integer columns.
df <- data.frame(a = 1:12, b = 13:24)
df
## a b
## 1 1 13
## 2 2 14
## 3 3 15
## 4 4 16
## 5 5 17
## 6 6 18
## 7 7 19
## 8 8 20
## 9 9 21
## 10 10 22
## 11 11 23
## 12 12 24
# Rows per group.
n <- 5
# Group ids 1,1,1,1,1,2,...; length.out trims the id vector to nrow(df), so a
# short final group (here rows 11-12) is handled without padding.
group_id <- rep(seq_len(nrow(df) %/% n + 1), each = n, length.out = nrow(df))
# Column means per group; [-1] drops the grouping column aggregate() prepends.
aggregate(df, by = list(group_id), FUN = mean)[-1]
## a b
## 1 3.0 15.0
## 2 8.0 20.0
## 3 11.5 23.5

The important part of this solution that handles the issue of non-divisibility of nrow(df) by n is specifying the len parameter (actually the full parameter name is length.out) of rep(), which automatically caps the group vector to the appropriate length.

Python Pandas: Mean every n rows in a new column repeated n times

Use GroupBy.transform with integer division by index or helper array by length of DataFrame:

# Number of consecutive rows that share one mean.
nrow = 3

#if default RangeIndex
# transform('mean') returns the group mean aligned to every original row,
# so each mean is repeated for all rows of its group.
df['c3'] = df.groupby(df.index // nrow)['c2'].transform('mean')

#alternative if not default RangeIndex
#df['c3'] = df.groupby(np.arange(len(df)) // nrow)['c2'].transform('mean')
print(df)

c1 c2 c3
0 A 1 2
1 B 2 2
2 C 3 2
3 D 3 4
4 E 4 4
5 F 5 4

Average and median of each n rows, for all columns of a dataframe, while keeping a datetime object as index

For this data:

> dput(df)
# Sample data (dput output): 6 rows; Year, Day, Hour and Min are integer,
# Sec. and the E1.S1..E2.S4 columns are numeric.
df <- structure(list(Year = c(2000L, 2000L, 2000L, 2000L, 2000L, 2000L
), Day = c(122L, 122L, 122L, 122L, 122L, 122L), Hour = c(0L,
0L, 0L, 0L, 0L, 0L), Min = c(1L, 1L, 2L, 2L, 2L, 2L), Sec. = c(38.01,
50.1, 2.19, 14.28, 26.38, 38.47), E1.S1 = c(3.31, 1.98, 1.98,
2.65, 3.97, 2.65), E1.S2 = c(0.662, 3.31, 1.32, 1.32, 6.62, 0.662
), E1.S3 = c(0.662, 1.98, 3.97, 2.65, 0.662, 3.31), E1.S4 = c(2.65,
1.98, 1.98, 3.31, 3.31, 1.98), E1.S5 = c(1.32, 1.98, 1.32, 2.65,
3.31, 1.32), E1.S6 = c(0, 1.32, 0.662, 1.32, 4.63, 1.98), E1.S7 = c(3.31,
4.63, 0.662, 3.97, 5.29, 1.98), E1.S8 = c(1.32, 1.32, 3.97, 2.65,
1.98, 2.65), E2.S1 = c(1.98, 1.32, 1.32, 2.65, 0, 0.662), E2.S2 = c(1.98,
0.662, 0.662, 0, 0, 1.32), E2.S3 = c(0.662, 0, 1.32, 0.662, 1.98,
1.98), E2.S4 = c(0, 3.31, 0.662, 2.65, 0.662, 1.32)), .Names = c("Year",
"Day", "Hour", "Min", "Sec.", "E1.S1", "E1.S2", "E1.S3", "E1.S4",
"E1.S5", "E1.S6", "E1.S7", "E1.S8", "E2.S1", "E2.S2", "E2.S3",
"E2.S4"), class = "data.frame", row.names = c("1", "2", "3",
"4", "5", "6"))

This works:

# Rows 1-5 get chunk id 1, rows 6-10 id 2, ...; then take column means per chunk.
chunk_id <- ceiling(seq_len(nrow(df)) / 5)
lapply(split(df, chunk_id), colMeans)
# $`1`
# Year Day Hour Min Sec. E1.S1 E1.S2 E1.S3 E1.S4 E1.S5 E1.S6 E1.S7
# 2000.0000 122.0000 0.0000 1.6000 26.1920 2.7780 2.6464 1.9848 2.6460 2.1160 1.5864 3.5724
# E1.S8 E2.S1 E2.S2 E2.S3 E2.S4
# 2.2480 1.4540 0.6608 0.9248 1.4568
#
# $`2`
# Year Day Hour Min Sec. E1.S1 E1.S2 E1.S3 E1.S4 E1.S5 E1.S6 E1.S7 E1.S8
# 2000.000 122.000 0.000 2.000 38.470 2.650 0.662 3.310 1.980 1.320 1.980 1.980 2.650
# E2.S1 E2.S2 E2.S3 E2.S4
# 0.662 1.320 1.980 1.320
#

Then you can just bind them:

# Column means for each chunk of 5 rows, stacked into a matrix (one row per chunk).
chunk_means <- lapply(split(df, ceiling(seq_len(nrow(df)) / 5)), colMeans)
do.call(rbind, chunk_means)
# Year Day Hour Min Sec. E1.S1 E1.S2 E1.S3 E1.S4 E1.S5 E1.S6 E1.S7 E1.S8 E2.S1 E2.S2 E2.S3 E2.S4
# 1 2000 122 0 1.6 26.192 2.778 2.6464 1.9848 2.646 2.116 1.5864 3.5724 2.248 1.454 0.6608 0.9248 1.4568
# 2 2000 122 0 2.0 38.470 2.650 0.6620 3.3100 1.980 1.320 1.9800 1.9800 2.650 0.662 1.3200 1.9800 1.3200

Note: It also helps to check if all the columns you want to take mean of are either integers or numeric by doing:

> sapply(df, class)
# Year Day Hour Min Sec. E1.S1 E1.S2 E1.S3 E1.S4 E1.S5 E1.S6 E1.S7
# "integer" "integer" "integer" "integer" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
# E1.S8 E2.S1 E2.S2 E2.S3 E2.S4
# "numeric" "numeric" "numeric" "numeric" "numeric"

Edit: Following OP's comment:

# Chunk id for every row of dd: rows 1-5 -> 1, rows 6-10 -> 2, ...
idx <- ceiling(seq_len(nrow(dd)) / 5)
# do colMeans on all columns except last one.
res <- lapply(split(dd[-(ncol(dd))], idx), colMeans, na.rm = TRUE)
# assign first value of "datetime" in each 5-er group as names to list
# (index over dd itself so the number of names matches the number of chunks)
names(res) <- dd$datetime[seq(1, nrow(dd), by = 5)]
# bind them to give a matrix
res <- do.call(rbind, res)

Alternatively, if you want a data.frame and datetime as a column:

# Chunk id for every row of dd: rows 1-5 -> 1, rows 6-10 -> 2, ...
idx <- ceiling(seq_len(nrow(dd)) / 5)
# Column means per chunk, computed on every column except the last one.
group_means <- lapply(split(dd[-ncol(dd)], idx), colMeans, na.rm = TRUE)
res <- as.data.frame(do.call(rbind, group_means))
# Attach the first datetime of each chunk as a regular column.
res$datetime <- dd$datetime[seq(1, nrow(dd), by = 5)]


Related Topics



Leave a reply



Submit