R: Calculating 5 Year Averages in Panel Data

R: Calculating 5 year averages in panel data

This is the stuff aggregate is made for. :

Df <- data.frame(
    year=rep(1951:1970,2),
    country=rep(c("Arg","Ven"),each=20),
    var1 = c(1:20,51:70),
    var2 = c(20:1,70:51)
)

Level <-cut(Df$year,seq(1951,1971,by=5),right=F)
id <- c("var1","var2")

> aggregate(Df[id],list(Df$country,Level),mean)
  Group.1     Group.2 var1 var2
1     Arg [1951,1956)    3   18
2     Ven [1951,1956)   53   68
3     Arg [1956,1961)    8   13
4     Ven [1956,1961)   58   63
5     Arg [1961,1966)   13    8
6     Ven [1961,1966)   63   58
7     Arg [1966,1971)   18    3
8     Ven [1966,1971)   68   53

The only thing you might want to do, is to rename the categories and the variable names.

How can I create 5 year averages for multiple countries in a panel data set?

To create the averages, with dplyr you can do:

df %>%
 arrange(Country, Year) %>%
 group_by(Country) %>%
 mutate(Period = gl(ceiling(n() / 5), 5, length = n())) %>%
 group_by(Country, Country_Code, Period) %>%
 summarise_all(mean)

  Country       Country_Code Period  Year  GINI Trade_Openness Financial_Openness   HMI
  <chr>         <chr>        <fct>  <dbl> <dbl>          <dbl>              <dbl> <dbl>
1 Argentina     ARG          1      1993   46.6           55.8               88.4  5.2 
2 Argentina     ARG          2      1996.  48.8           55.3               86.8  4.7 
3 United_States USA          1      1993   57.7           57.6               79.6  5.87
4 United_States USA          2      1996.  56.8           60.9               78.9  6.15

Sample data:

df <- read.table(text = "
    Country     Country_Code  Year  GINI  Trade_Openness  Financial_Openness  HMI  
                  United_States  USA           1991  60.4           45.71                81.4  5.56 
                  United_States  USA           1992  50.6           65.1                 80.7  6.00 
                  United_States  USA           1993  56.1           61.0                 79.1   6.1  
                  United_States  USA           1994  59.1           62.07                80.7   5.7 
                  United_States  USA           1995  62.5           54.34                76.0   6.0 
                  United_States  USA           1996  56.9           60.09                79.8   6.4 
                  United_States  USA           1997  56.8           61.7                 78.1   5.9 
                  Argentina      ARG           1991  45.9            54.8                91.2   4.6 
                  Argentina      ARG           1992  47.1            56.1                87.1   4.7 
                  Argentina      ARG           1993  42.7            56.9                89.8   6.1 
                  Argentina      ARG           1994  47.1            55.1                88.1   5.9 
                  Argentina      ARG           1995  50.0            56.1                85.7   4.7 
                  Argentina      ARG           1996  49.0            54.0                87.2   4.5 
                  Argentina      ARG           1997  48.7            56.6                86.5   4.9 ", 
                 header = TRUE,
                 stringsAsFactors = FALSE)

calculate 5 year average of panel data with factors kept

The warnning results from that summarise_all(mean) calculates averages not only on var1 & var2 but on state & country. If you want to keep state and country as grouping columns, you should put them into group_by():

library(dplyr)

df %>%
  group_by(county, state, country,
           period = cut(year, seq(2011, 2021, by = 5), right = FALSE)) %>%
  summarise_all(mean) %>%
  ungroup()

# # A tibble: 10 × 7
#    county state country period       year  var1  var2
#    <chr>  <chr> <chr>   <fct>       <dbl> <dbl> <dbl>
#  1 a      A     AA      [2011,2016)  2013  33.1  69.7
#  2 a      A     AA      [2016,2021)  2018  24.7  73.6
#  3 b      B     BB      [2011,2016)  2013  27.6  72.3
#  4 b      B     BB      [2016,2021)  2018  24.7  83.1
#  5 c      C     CC      [2011,2016)  2013  38.7  75.7
#  6 c      C     CC      [2016,2021)  2018  22.8  66.8
#  7 d      D     DD      [2011,2016)  2013  33.8  72.2
#  8 d      D     DD      [2016,2021)  2018  20.0  83.7
#  9 e      E     EE      [2011,2016)  2013  14.9  71.0
# 10 e      E     EE      [2016,2021)  2018  19.6  70.4

If the grouping columns are simply county and period, and other categorical variables are unique in each group, you could keep them by just leaving the first values with first() while doing summarise().

df %>%
  group_by(county,
           period = cut(year, seq(2011, 2021, by = 5), right = FALSE)) %>%
  summarise(across(!where(is.numeric), first),
            across( where(is.numeric), mean)) %>%
  ungroup()

Compute Average values in panel data R

You can use summarize or mutate:

Using summarize will fully aggregate your data to provide only the grouping variable (ID number for each firm) and the mean.

df %>% 
  group_by(BvD_ID_Number) %>% 
  summarize(TotalAsset_Average = round(mean(TotalAsset),0))

This gives us:

  BvD_ID_Number TotalAsset_Average
  <chr>                      <dbl>
1 FR810911719                1697 
2 GBFC024701               124191 
3 GBFC032748                 6318.
4 GBFC32536                 72912 
5 GBFC32699                 92878.

Using mutate

df %>% 
  group_by(BvD_ID_Number) %>% 
  mutate(TotalAsset_Average = round(mean(TotalAsset),0))

This gives us:

# A tibble: 10 x 4
# Groups:   BvD_ID_Number [5]
   BvD_ID_Number  Year TotalAsset TotalAsset_Average
   <chr>         <dbl>      <dbl>              <dbl>
 1 FR810911719    2017       2220               1697
 2 FR810911719    2016       1174               1697
 3 GBFC024701     2018     124726             124191
 4 GBFC024701     2017     126010             124191
 5 GBFC024701     2016     121837             124191
 6 GBFC32536      2017      72912              72912
 7 GBFC32699      2016     111298              92878
 8 GBFC32699      2015      74457              92878
 9 GBFC032748     2017       6579               6318
10 GBFC032748     2016       6056               6318

Data:

structure(list(BvD_ID_Number = c("FR810911719", "FR810911719", 
"GBFC024701", "GBFC024701", "GBFC024701", "GBFC32536", "GBFC32699", 
"GBFC32699", "GBFC032748", "GBFC032748"), Year = c(2017, 2016, 
2018, 2017, 2016, 2017, 2016, 2015, 2017, 2016), TotalAsset = c(2220, 
1174, 124726, 126010, 121837, 72912, 111298, 74457, 6579, 6056
)), row.names = c(NA, -10L), class = c("tbl_df", "tbl", "data.frame"
))

Taking a 3 year average across in a panel data set with NAs

Here's how you can do that with package dplyr. Basically, you first create a "year group" using mutate. I used ifelse but it you have more groups, you should consider looking at case_when although nested ifelse will work. Then, we summarise by country and Year_group.

df1 <- read.table(text="Country Year Var1 Var2
                  USA 2010 5 3
                  USA 2011 6 5
                  USA 2012 NA 8
                  USA 2013 4 NA
                  USA 2014 NA 6
                  USA 2015 6 9
                  CHN 2010 NA 5
                  CHN 2011 7 NA
                  CHN 2012 6 NA
                  CHN 2013 4 4
                  CHN 2014 NA 6
                  CHN 2015 NA 8
                  EGY 2010 3 NA
                  EGY 2011 3 5
                  EGY 2012 3 6
                  EGY 2013 NA 8
                  EGY 2014 NA NA
                  EGY 2015 NA 2",header=TRUE, stringsAsFactors=FALSE)
library(dplyr)
df1%>%
  group_by(Country)%>%
  mutate(Year_group=ifelse(Year<2013,"2010-2012","2013-2016"))%>%
  group_by(Country,Year_group)%>%
  summarise(Mean_var1=mean(Var1,na.rm=TRUE),Mean_var2=mean(Var2,na.rm=TRUE)

  Country Year_group Mean_var1 Mean_var2
    <chr>      <chr>     <dbl>     <dbl>
1     CHN  2010-2012       6.5  5.000000
2     CHN  2013-2016       4.0  6.000000
3     EGY  2010-2012       3.0  5.500000
4     EGY  2013-2016       NaN  5.000000
5     USA  2010-2012       5.5  5.333333
6     USA  2013-2016       5.0  7.500000

R: Formula for moving average for panel data

It is slightly easier if you are willing to use packages but since the question asks for base R, using only that we have the following. ave applies roll to new_value by code_ID and roll is implemented by taking row means of embed or using filter or a combination of cumsum and diff

roll <- function(x, n = 7) c(rep(NA, n-1), rowMeans(embed(x, n)))
dat2 <- transform(dat, mean7 = ave(new_value, code_ID, FUN = roll))

or use one of these alternatives for roll:

roll2 <- function(x, n = 7) stats::filter(x, rep(1, n) / n, sides = 1)

roll3 <- function(x, n = 7) c(rep(NA, n-1), diff(cumsum(c(0, x)), n)/n)

roll4 <- function(x, n = 7) c(rep(NA, n-1), apply(embed(x, n), 1, mean))

R: Calculating 5 Year Averages in Panel Data