Dplyr Summarize with Subtotals

dplyr summarize with subtotals

data.table It's very clunky, but this is one way:

library(data.table)
DT <- data.table(mtcars)
rbind(
  DT[,.(mean(disp)),          by=.(cyl,carb)],
  DT[,.(mean(disp), carb=NA), by=.(cyl) ],
  DT[,.(mean(disp), cyl=NA),  by=.(carb)]
)[order(cyl,carb)]

This gives

    cyl carb       V1
 1:   4    1  91.3800
 2:   4    2 116.6000
 3:   4   NA 105.1364
 4:   6    1 241.5000
 5:   6    4 163.8000
 6:   6    6 145.0000
 7:   6   NA 183.3143
 8:   8    2 345.5000
 9:   8    3 275.8000
10:   8    4 405.5000
11:   8    8 301.0000
12:   8   NA 353.1000
13:  NA    1 134.2714
14:  NA    2 208.1600
15:  NA    3 275.8000
16:  NA    4 308.8200
17:  NA    6 145.0000
18:  NA    8 301.0000

I'd rather see results in something like an R table, but don't know of any functions for that.

dplyr @akrun found this analogous code

bind_rows(
  mtcars %>% 
    group_by(cyl, carb) %>% 
    summarise(Mean= mean(disp)), 
  mtcars %>% 
    group_by(cyl) %>% 
    summarise(carb=NA, Mean=mean(disp)), 
  mtcars %>% 
    group_by(carb) %>% 
    summarise(cyl=NA, Mean=mean(disp))
) %>% arrange(cyl, carb)

We could wrap the repeat operations in a function

library(lazyeval)
f1 <- function(df, grp, Var, func){
  FUN <- match.fun(func)
   df %>% 
     group_by_(.dots=grp) %>%
     summarise_(interp(~FUN(v), v=as.name(Var)))
  }

 m1 <- f1(mtcars, c('carb', 'cyl'), 'disp', 'mean')
 m2 <- f1(mtcars, 'carb', 'disp', 'mean')
 m3 <- f1(mtcars, 'cyl', 'disp', 'mean')

 bind_rows(list(m1, m2, m3)) %>%
              arrange(cyl, carb) %>%
              rename(Mean=`FUN(disp)`)
   carb cyl     Mean
1     1   4  91.3800
2     2   4 116.6000
3    NA   4 105.1364
4     1   6 241.5000
5     4   6 163.8000
6     6   6 145.0000
7    NA   6 183.3143
8     2   8 345.5000
9     3   8 275.8000
10    4   8 405.5000
11    8   8 301.0000
12   NA   8 353.1000
13    1  NA 134.2714
14    2  NA 208.1600
15    3  NA 275.8000
16    4  NA 308.8200
17    6  NA 145.0000
18    8  NA 301.0000

Either option can be made a little less ugly with data.table's rbindlist with fill:

rbindlist(list(
  mtcars %>% group_by(cyl) %>% summarise(mean(disp)),
  mtcars %>% group_by(carb) %>% summarise(mean(disp)),
  mtcars %>% group_by(cyl,carb) %>% summarise(mean(disp))
),fill=TRUE) %>% arrange(cyl,carb)

rbindlist(list(
  DT[,mean(disp),by=.(cyl,carb)],
  DT[,mean(disp),by=.(cyl)],
  DT[,mean(disp),by=.(carb)]
),fill=TRUE)[order(cyl,carb)]

Calculate subtotals with dplyr and tidyr

I don't know if this is the best (compact and readable) but it works ;)

data %>%
  spread(sport, medals) %>%
  mutate(Total = rowSums(.[2:4])) %>%
  rbind(., data.frame(country="Total", t(colSums(.[2:5]))))

  country curling crosscountry downhill Total
1  Sweden       0            2        0     2
2  Norway       1            1        0     2
3 Denmark       2            2        1     5
4 Finland       3            0        2     5
5   Total       6            5        3    14

subtotals by group R

Using DF in the Note at the end try this one-liner. The same code works if there are a different number of columns. Also try it without the as.data.frame for wide format. No packages are used.

as.data.frame(addmargins(xtabs(~., DF)))

giving:

  Var1 Var2 Freq
1    a    b    1
2    b    b    1
3  Sum    b    2
4    a    c    1
5    b    c    0
6  Sum    c    1
7    a  Sum    2
8    b  Sum    1
9  Sum  Sum    3

Note

DF in reproducible form is:

DF <- structure(list(Var1 = structure(c(1L, 1L, 2L), .Label = c("a", 
"b"), class = "factor"), Var2 = structure(c(1L, 2L, 1L), .Label = c("b", 
"c"), class = "factor")), class = "data.frame", row.names = c(NA, 
-3L))

Add missing subtotals to each group using dplyr

This isn't a join, it's just binding new rows on:

x %>% group_by(id) %>%
    summarize(
      value = sum(value[key == 'total']) - sum(value[key %in% c('a', 'b')]),
      key = 'n'
    ) %>%
    bind_rows(x) %>%
    select(id, key, value) %>%  # back to original column order
    arrange(id, key)            # and a start a row order
# # A tibble: 9 × 3
#      id   key value
#   <dbl> <chr> <dbl>
# 1     1     a     1
# 2     1     b     2
# 3     1     n     7
# 4     1 total    10
# 5     2     a     4
# 6     2     b     3
# 7     2     n     5
# 8     2 total    12
# 9     2     x     1

Add margin row totals in dplyr chain

With adorn_totals() from the janitor package:

library(janitor)
mtcars %>%
  tabyl(cyl, gear) %>%
  adorn_totals("row") 

   cyl  3  4 5
     4  1  8 2
     6  2  4 1
     8 12  0 2
 Total 15 12 5

To get from there to the "long" form in your post, add tidyr::gather() to the pipeline:

mtcars %>%
  tabyl(cyl, gear) %>%
  adorn_totals("row") %>%
  tidyr::gather(gear, n, 2:ncol(.), convert = TRUE)

     cyl gear  n
1      4    3  1
2      6    3  2
3      8    3 12
4  Total    3 15
5      4    4  8
6      6    4  4
7      8    4  0
8  Total    4 12
9      4    5  2
10     6    5  1
11     8    5  2
12 Total    5  5

Self-promotion alert, I authored this package - adding this answer b/c it's a genuinely efficient solution here.

R: display subtotals in crosstables

ftable(addmargins(table(df[c('language', 'sex', 'smoker')])))
                smoker  no yes Sum
language sex                      
Eng      female          9  10  19
         male           10   9  19
         Sum            19  19  38
Ger      female          7   5  12
         male            9   8  17
         Sum            16  13  29
Spa      female         11   8  19
         male            9   5  14
         Sum            20  13  33
Sum      female         27  23  50
         male           28  22  50
         Sum            55  45 100

ftable(addmargins(table(df[c('language', 'sex', 'smoker')]), 2))
                smoker no yes
language sex                 
Eng      female         9  10
         male          10   9
         Sum           19  19
Ger      female         7   5
         male           9   8
         Sum           16  13
Spa      female        11   8
         male           9   5
         Sum           20  13

subtotal with ddply in R

You can replicate the data 4 times:
- including sex and group
- including sex
- including group
- not including any column

The columns that are not included become "all"

require(plyr)
dfx <- data.frame(
  group = c(rep('A', 8), rep('B', 15), rep('C', 6)),
  sex = sample(c("M", "F"), size = 29, replace = TRUE),
  age = runif(n = 29, min = 18, max = 54)
)

# replicate the data not taking account of one or more attributed
dfAll <- dfx
dfAll$group <- "all"
dfAll$sex <- "all"
dfGroup <- dfx
dfGroup$group <- "all_group"
dfSex <- dfx
dfSex$group <- "all_sex"
dfToGroup <- rbind(dfx, dfGroup, dfSex, dfAll)

# Note the use of the '.' function to allow
# group and sex to be used without quoting
ddply(dfToGroup, .(group, sex), summarize,
      mean = round(mean(age), 2),
      sd = round(sd(age), 2))

Calculating subtotals in R

OK. Assuming your data are in a data frame named foo:

> head(foo)
             date mcode      mname ycode yname   yissue bsent breturn tsent
417572 2010/07/28 45740 ENDPOINT A  5772  XMAG 20100800     7       0     7
417573 2010/07/31 45740 ENDPOINT A  5772  XMAG 20100800     0       0     0
417574 2010/08/04 45740 ENDPOINT A  5772  XMAG 20100800     0       0     0
417575 2010/08/14 45740 ENDPOINT A  5772  XMAG 20100800     0       0     0
417576 2010/08/26 45740 ENDPOINT A  5772  XMAG 20100800     0       4     0
417577 2010/07/28 45741 ENDPOINT L  5772  XMAG 20100800     2       0     2
       treturn csales
417572       0      0
417573       0      1
417574       0      1
417575       0      1
417576       0      0
417577       0      0

Then this will do the aggregation of the numeric columns in your data:

> aggregate(cbind(bsent, breturn, tsent, treturn, csales) ~ yname, data = foo, 
+           FUN = sum)
  yname bsent breturn tsent treturn csales
1  XMAG    14       8    14       0      6
2  YMAG    11       6    11       6      5

That was using the snippet of data you included in your Q. I used the formula interface to aggregate(), which is a bit nicer in this instance because you don't need all the foo$ bits on the variable names you wish the aggregate. If you have missing data (NA)in your full data set, then you'll need add an extra argument na.rm = TRUE which will get passed to sum(), like so:

> aggregate(cbind(bsent, breturn, tsent, treturn, csales) ~ yname, data = foo, 
+           FUN = sum, na.rm = TRUE)

dplyr summarize across ttest

If we are using tidy

library(dplyr)
library(broom)
library(tidyr)
mtcars %>% 
   group_by(am) %>% 
   summarise(across(
    .cols = mpg,
      ~ list(tidy(t.test(.[vs == 0], .[vs == 1])) %>%
             select(p.value, conf.low, conf.high))
    )) %>% 
   unnest(mpg)

-output

# A tibble: 2 x 4
     am  p.value conf.low conf.high
  <dbl>    <dbl>    <dbl>     <dbl>
1     0 0.000395    -8.33     -3.05
2     1 0.00459    -14.0      -3.27

In the OP's code, we need the lambda function inside the list

mtcars  %>%
  group_by(am) %>%
  summarise(across(
    .cols = mpg,
    .fns =  list( 
      p.value = ~ t.test(.[vs == 0], .[vs == 1])$p.value,
      conf.low = ~ t.test(.[vs == 0], .[vs == 1])$conf.int[1],
      conf.high =~ t.test(.[vs == 0], .[vs == 1])$conf.int[2]
    )
  ))

-output

# A tibble: 2 x 4
     am mpg_p.value mpg_conf.low mpg_conf.high
  <dbl>       <dbl>        <dbl>         <dbl>
1     0    0.000395        -8.33         -3.05
2     1    0.00459        -14.0          -3.27

Dplyr Summarize with Subtotals