How can I get mean of every n rows and keep the date index?
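A short one-liner solution with data.table. Grouping by year - 0:1 recycles 0:1 down the column, so (assuming one row per consecutive year) each pair of adjacent rows shares the same key and the first year of the pair is kept as the index: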
library(data.table)
setDT(df)[,.(val=mean(val)), year-0:1]
# year val
# 1: 1990 75.5
# 2: 1992 105.0
# 3: 1994 105.5
# 4: 1996 164.5
# 5: 1998 164.5
# 6: 2000 119.5
# 7: 2002 213.5
# 8: 2004 223.0
# 9: 2006 164.5
#10: 2008 208.5
#11: 2010 157.0
#12: 2012 116.5
How to average column values every n rows in pandas
IIUC, use DataFrame.melt and then take the mean for each site with GroupBy.mean:
# df_tmp = df_tmp.astype(int) # get correct result
df_tmp.melt('site').groupby('site')['value'].mean()
Or:
# df_tmp = df_tmp.astype(int) # get correct result
df_tmp.set_index('site').stack().groupby(level=0).mean()
#df_tmp.set_index('site').stack().mean(level=0) # .mean(level=0) deprecated
Output
site
1 3.333333
2 7.333333
Name: value, dtype: float64
Calculate average of every n rows from a csv file
You can use integer division of the index by step to build consecutive groups, then pass the result to groupby and aggregate with mean:
step = 30
m_df = pd.read_csv(m_path, usecols=['Col-01'])
df = m_df.groupby(m_df.index // step).mean()
Or:
df = m_df.groupby(np.arange(len(m_df)) // step).mean()
Sample data:
step = 3
df = m_df.groupby(m_df.index // step).mean()
print (df)
H
0 3
1 1
2 2
Calculate the mean of every 13 rows in data frame
Here's a solution using aggregate() and rep().
df <- data.frame(a=1:12, b=13:24 );
df;
## a b
## 1 1 13
## 2 2 14
## 3 3 15
## 4 4 16
## 5 5 17
## 6 6 18
## 7 7 19
## 8 8 20
## 9 9 21
## 10 10 22
## 11 11 23
## 12 12 24
n <- 5;
aggregate(df, list(rep(1:(nrow(df) %/% n + 1), each = n, len = nrow(df))), mean)[-1];
## a b
## 1 3.0 15.0
## 2 8.0 20.0
## 3 11.5 23.5
The important part of this solution, which handles the case where nrow(df) is not divisible by n, is specifying the len parameter of rep() (the full parameter name is length.out), which automatically caps the group vector to the appropriate length.
Python Pandas: Mean every n rows in a new column repeated n times
Use GroupBy.transform with integer division of the index, or of a helper array as long as the DataFrame:
nrow = 3
#if default RangeIndex
df['c3'] = df.groupby(df.index // nrow)['c2'].transform('mean')
#alternative if not default RangeIndex
#df['c3'] = df.groupby(np.arange(len(df)) // nrow)['c2'].transform('mean')
print(df)
c1 c2 c3
0 A 1 2
1 B 2 2
2 C 3 2
3 D 3 4
4 E 4 4
5 F 5 4
Average and median of each n rows, for all columns of a dataframe, while keeping a datetime object as index
For this data:
> dput(df)
df <- structure(list(Year = c(2000L, 2000L, 2000L, 2000L, 2000L, 2000L
), Day = c(122L, 122L, 122L, 122L, 122L, 122L), Hour = c(0L,
0L, 0L, 0L, 0L, 0L), Min = c(1L, 1L, 2L, 2L, 2L, 2L), Sec. = c(38.01,
50.1, 2.19, 14.28, 26.38, 38.47), E1.S1 = c(3.31, 1.98, 1.98,
2.65, 3.97, 2.65), E1.S2 = c(0.662, 3.31, 1.32, 1.32, 6.62, 0.662
), E1.S3 = c(0.662, 1.98, 3.97, 2.65, 0.662, 3.31), E1.S4 = c(2.65,
1.98, 1.98, 3.31, 3.31, 1.98), E1.S5 = c(1.32, 1.98, 1.32, 2.65,
3.31, 1.32), E1.S6 = c(0, 1.32, 0.662, 1.32, 4.63, 1.98), E1.S7 = c(3.31,
4.63, 0.662, 3.97, 5.29, 1.98), E1.S8 = c(1.32, 1.32, 3.97, 2.65,
1.98, 2.65), E2.S1 = c(1.98, 1.32, 1.32, 2.65, 0, 0.662), E2.S2 = c(1.98,
0.662, 0.662, 0, 0, 1.32), E2.S3 = c(0.662, 0, 1.32, 0.662, 1.98,
1.98), E2.S4 = c(0, 3.31, 0.662, 2.65, 0.662, 1.32)), .Names = c("Year",
"Day", "Hour", "Min", "Sec.", "E1.S1", "E1.S2", "E1.S3", "E1.S4",
"E1.S5", "E1.S6", "E1.S7", "E1.S8", "E2.S1", "E2.S2", "E2.S3",
"E2.S4"), class = "data.frame", row.names = c("1", "2", "3",
"4", "5", "6"))
This works:
lapply(split(df, ceiling(seq_len(nrow(df)) / 5)), colMeans)
# $`1`
# Year Day Hour Min Sec. E1.S1 E1.S2 E1.S3 E1.S4 E1.S5 E1.S6 E1.S7
# 2000.0000 122.0000 0.0000 1.6000 26.1920 2.7780 2.6464 1.9848 2.6460 2.1160 1.5864 3.5724
# E1.S8 E2.S1 E2.S2 E2.S3 E2.S4
# 2.2480 1.4540 0.6608 0.9248 1.4568
#
# $`2`
# Year Day Hour Min Sec. E1.S1 E1.S2 E1.S3 E1.S4 E1.S5 E1.S6 E1.S7 E1.S8
# 2000.000 122.000 0.000 2.000 38.470 2.650 0.662 3.310 1.980 1.320 1.980 1.980 2.650
# E2.S1 E2.S2 E2.S3 E2.S4
# 0.662 1.320 1.980 1.320
#
Then you can just bind them:
do.call(rbind, lapply(split(df, ceiling(seq_len(nrow(df)) / 5)), colMeans))
# Year Day Hour Min Sec. E1.S1 E1.S2 E1.S3 E1.S4 E1.S5 E1.S6 E1.S7 E1.S8 E2.S1 E2.S2 E2.S3 E2.S4
# 1 2000 122 0 1.6 26.192 2.778 2.6464 1.9848 2.646 2.116 1.5864 3.5724 2.248 1.454 0.6608 0.9248 1.4568
# 2 2000 122 0 2.0 38.470 2.650 0.6620 3.3100 1.980 1.320 1.9800 1.9800 2.650 0.662 1.3200 1.9800 1.3200
Note: It also helps to check that all the columns you want to take the mean of are either integer or numeric, by doing:
> sapply(df, class)
# Year Day Hour Min Sec. E1.S1 E1.S2 E1.S3 E1.S4 E1.S5 E1.S6 E1.S7
# "integer" "integer" "integer" "integer" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
# E1.S8 E2.S1 E2.S2 E2.S3 E2.S4
# "numeric" "numeric" "numeric" "numeric" "numeric"
Edit: Following OP's comment:
idx <- ceiling(seq_len(nrow(dd)) / 5)
# do colMeans on all columns except last one.
res <- lapply(split(dd[-(ncol(dd))], idx), colMeans, na.rm = TRUE)
# assign first value of "datetime" in each 5-er group as names to list
names(res) <- dd$datetime[seq(1, nrow(dd), by=5)]
# bind them to give a matrix
res <- do.call(rbind, res)
Alternatively, if you want a data.frame with datetime as a column:
idx <- ceiling(seq_len(nrow(dd)) / 5)
res <- as.data.frame(do.call(rbind, lapply(split(dd[-(ncol(dd))], idx),
colMeans, na.rm = TRUE)))
res$datetime <- dd$datetime[seq(1, nrow(dd), by=5)]
Related Topics
Is There a Package or Technique Available for Calculating Large Factorials in R
In R, Switch Uppercase to Lowercase and Vice-Versa in a String
Simulate an Ar(1) Process with Uniform Innovations
Web Scraping a Tableauviz into an R Dataframe
Cumsum Reset at Certain Values
Check Which Elements of a Vector Is Between the Elements of Another One in R
How to Do a Glm When "Contrasts Can Be Applied Only to Factors with 2 or More Levels"
Warnings When Running an Lmer in R
Terms of a Sum in a R Expression
Unexpected Date When Converting Posixct Date-Time to Date - Timezone Issue
For Loop Within Custom Function to Create Ggplot Time Series Plots
Efficient Method to Filter and Add Based on Certain Conditions (3 Conditions in This Case)
Error in Running Factor() on a Column of a Data Frame