Dplyr - Summary Table for Multiple Variables

dplyr - summary table for multiple variables

Use dplyr in combination with tidyr to reshape the end result.

library(dplyr)
library(tidyr)

# Work on a tibble copy of the built-in mtcars data set.
# as_tibble() replaces the deprecated tbl_df().
df <- as_tibble(mtcars)

# One-row summary holding seven statistics for each selected variable.
# summarise(across()) replaces the deprecated summarise_each(funs()) pair.
# Each output column is named <variable>_<statistic> (e.g. mpg_min), which is
# the naming the later reshape step splits on.
df.sum <- df %>%
  select(mpg, cyl, vs, am, gear, carb) %>% # select variables to summarise
  summarise(across(
    everything(),
    list(
      min = min,
      q25 = ~ quantile(.x, 0.25),
      median = median,
      q75 = ~ quantile(.x, 0.75),
      max = max,
      mean = mean,
      sd = sd
    )
  ))

# the result is a wide data frame
> dim(df.sum)
[1]  1 42

# reshape it using tidyr functions

# Reshape the one-row wide summary into one row per variable and one column
# per statistic. Column names have the form <variable>_<statistic>; the
# ".value" sentinel tells pivot_longer() to turn the part after "_" into
# output columns, replacing the superseded gather()/separate()/spread() chain.
df.stats.tidy <- df.sum %>%
  pivot_longer(
    everything(),
    names_to = c("var", ".value"),
    names_sep = "_"
  ) %>%
  arrange(var) %>% # alphabetical by variable, as spread() used to return
  select(var, min, q25, median, q75, max, mean, sd) # reorder columns

> print(df.stats.tidy)

   var  min    q25 median  q75  max     mean        sd
1   am  0.0  0.000    0.0  1.0  1.0  0.40625 0.4989909
2 carb  1.0  2.000    2.0  4.0  8.0  2.81250 1.6152000
3  cyl  4.0  4.000    6.0  8.0  8.0  6.18750 1.7859216
4 gear  3.0  3.000    4.0  4.0  5.0  3.68750 0.7378041
5  mpg 10.4 15.425   19.2 22.8 33.9 20.09062 6.0269481
6   vs  0.0  0.000    0.0  1.0  1.0  0.43750 0.5040161

Summary statistics for multiple variables with statistics as rows and variables as columns?

Here is a way using purrr to iterate over a list of functions. This is effectively what you were doing with bind_rows(), but in less code.

library(dplyr)
library(purrr)

# Named list of summary functions. lst() names each element after the function
# itself, and those names end up in the "statistic" column below.
funs <- lst(min, median, mean, max, sd)

# Apply every statistic to all numeric columns of starwars (one single-row
# tibble per statistic), then stack the rows with the list names as an id
# column. na.rm = TRUE is supplied through an explicit wrapper because passing
# extra arguments via across(...) is deprecated; map() + list_rbind() replaces
# the superseded map_dfr().
funs %>%
  map(function(stat) {
    summarize(
      starwars,
      across(where(is.numeric), function(col) stat(col, na.rm = TRUE))
    )
  }) %>%
  list_rbind(names_to = "statistic")

# # A tibble: 5 x 4
#   statistic height   mass birth_year
#   <chr>      <dbl>  <dbl>      <dbl>
# 1 min         66     15          8  
# 2 median     180     79         52  
# 3 mean       174.    97.3       87.6
# 4 max        264   1358        896  
# 5 sd          34.8  169.       155.

Dplyr: Production of a Summary Descriptive Statistics Table (Standard error and Coefficient of Variation) for Multiple Variables

The summarise() function now lets you summarize multiple variables directly, using across(). It looks like you want all the numeric variables, but you could also specify them directly (c(Low.Freq, High.Freq, Peak.Freq, Delta.Freq, Delta.Time, Peak.Time, Center.Freq, Start.Freq, End.Freq)). You also needed a leading `~` for the functions that refer to the variable with the `.` placeholder, e.g. `~sd(.)/sqrt(n())`.

library(dplyr)
library(tidyr)

# Descriptive statistics for every numeric column of New_Acoustic_Parameters,
# returned with one row per variable and one column per statistic.
Summary_Statistics <- New_Acoustic_Parameters %>%
  summarise(
    across(
      where(is.numeric),
      .fns = list(
        Median = median,
        Mean = mean,
        # NOTE(review): despite its label, this entry is the column total
        # (sum), not a count of observations - confirm which one is intended.
        n = sum,
        SD = sd,
        # Standard error: sd over the square root of the current group size.
        SE = ~ sd(.x) / sqrt(n()),
        Min = min,
        Max = max,
        q25 = ~ quantile(.x, 0.25),
        q75 = ~ quantile(.x, 0.75),
        # NOTE(review): cv() is not defined in this snippet; presumably a
        # coefficient-of-variation helper loaded elsewhere - confirm.
        CV = cv
      )
    )
  ) %>%
  # across() names its output <variable>_<statistic>; split on "_" and spread
  # the statistic part into columns via the ".value" sentinel.
  pivot_longer(
    cols = everything(),
    names_to = c("variable", ".value"),
    names_sep = "_"
  )

# A tibble: 9 × 11
  variable    Median       Mean           n          SD          SE   Min       Max   q25       q75    CV
  <chr>        <dbl>      <dbl>       <dbl>       <dbl>       <dbl> <dbl>     <dbl> <dbl>     <dbl> <dbl>
1 ID            75.5       75.5       11325        43.4        3.55     1       150  38.2      113.  57.5
2 Low.Freq   30645   47718421.   7157763188 160229651.  13082696.       0 936779338 392.   5065917. 336. 
3 High.Freq   6020.  33588147.   5038222034 126884782.  10360099.       0 825466852  78.5   941394. 378. 
4 Peak.Freq  45487   74707306.  11206095904 202504621.  16534433.       0 999242982 436.  32466176. 271. 
5 Delta.Freq 20268.  31612255.   4741838252 113350682.   9255044.       0 754038591  93.2  2282342. 359. 
6 Delta.Time 16852.  64582719.   9687407814 208416077.  17017101.       0 946706344  70.5  4181862. 323. 
7 Peak.Time  35342   64781815.   9717272204 190695860.  15570252.       1 964147297 790.   6424504. 294. 
8 Start.Freq 39416.  54517987.   8177697991 173895386.  14198499.       0 940000382  77.2  2694535  319. 
9 End.Freq   71317   41475068.   6221260243 132873661.  10849089.       1 856943893 430.   7667247. 320.

R - create summary table of means and counts by group for multiple columns

We may group by 'group' and summarise across the numeric columns to get the mean and the count of non-NA values (`sum(!is.na(.x))`).

library(dplyr)
# For each level of `group`, compute the mean (ignoring NAs) and the number of
# non-missing values of every numeric column. Output columns are named
# <column>_mean and <column>_count.
df %>%
  group_by(group) %>%
  summarise(
    across(
      where(is.numeric),
      list(
        mean = function(col) mean(col, na.rm = TRUE),
        count = function(col) sum(!is.na(col))
      )
    )
  )

Multiple variable summary with dplyr

After the initial summarize, you have one entry for each article per year. You then wish to know what the contribution of each article was to each year's total, so you need to group_by again using just the year, and finally mutate to get the proportion for each article.

library(dplyr)

sample_data %>%
  # Total value of each article within each date.
  group_by(article, date) %>%
  summarise(weight = sum(value), .groups = "keep") %>%
  # Regroup by date only, so sum(weight) below is the total for that date.
  group_by(date) %>%
  # Share of the date's total contributed by each article; the result is
  # still grouped by date.
  mutate(prop = weight / sum(weight))
#> # A tibble: 29 x 4
#> # Groups:   date [7]
#>    article date  weight  prop
#>    <chr>   <chr>  <dbl> <dbl>
#>  1 A       2015   55572 0.661
#>  2 A       2016   33948 0.876
#>  3 A       2017   11716 0.632
#>  4 A       2019    8668 0.491
#>  5 A       2020    5386 0.799
#>  6 A       2021    2577 0.628
#>  7 B       2015   16119 0.192
#>  8 B       2018    1414 0.622
#>  9 B       2019    8137 0.461
#> 10 B       2020    1174 0.174
#> # ... with 19 more rows

Created on 2022-02-19 by the reprex package (v2.0.1)

Dplyr - Summary Table for Multiple Variables