Dplyr Summarize with Subtotals

dplyr summarize with subtotals

data.table: It's very clunky, but this is one way:

library(data.table)
# Stack three summaries of mean(disp): per (cyl, carb) pair, per cyl, and per
# carb. A subtotal row carries NA in the grouping column it aggregates over,
# and ordering by (cyl, carb) places each subtotal after its detail rows.
DT <- data.table(mtcars)
rbind(
  DT[, list(V1 = mean(disp)), by = c("cyl", "carb")],
  DT[, list(V1 = mean(disp), carb = NA), by = "cyl"],
  DT[, list(V1 = mean(disp), cyl = NA), by = "carb"]
)[order(cyl, carb)]

This gives

    cyl carb       V1
1: 4 1 91.3800
2: 4 2 116.6000
3: 4 NA 105.1364
4: 6 1 241.5000
5: 6 4 163.8000
6: 6 6 145.0000
7: 6 NA 183.3143
8: 8 2 345.5000
9: 8 3 275.8000
10: 8 4 405.5000
11: 8 8 301.0000
12: 8 NA 353.1000
13: NA 1 134.2714
14: NA 2 208.1600
15: NA 3 275.8000
16: NA 4 308.8200
17: NA 6 145.0000
18: NA 8 301.0000

I'd rather see results in something like an R table, but don't know of any functions for that.


dplyr: @akrun found this analogous code:

# Stack the per-(cyl, carb) means with the per-cyl and per-carb subtotals.
# The grouping column that a subtotal aggregates over is set to NA, so rows
# containing NA are the subtotal rows.
#
# `.groups = "drop"` (dplyr >= 1.0.0) makes every summary ungrouped, so the
# combined result does not carry a leftover `cyl` grouping and summarise()
# prints no regrouping message. NA_real_ keeps the placeholder column the same
# type (double) as the real cyl/carb columns.
bind_rows(
  mtcars %>%
    group_by(cyl, carb) %>%
    summarise(Mean = mean(disp), .groups = "drop"),
  mtcars %>%
    group_by(cyl) %>%
    summarise(carb = NA_real_, Mean = mean(disp), .groups = "drop"),
  mtcars %>%
    group_by(carb) %>%
    summarise(cyl = NA_real_, Mean = mean(disp), .groups = "drop")
) %>%
  arrange(cyl, carb)

We could wrap the repeated operations in a function:

library(lazyeval)
#' Summarise one column of a data frame by one or more grouping columns
#'
#' Uses current tidy-evaluation tools instead of the deprecated
#' `group_by_()` / `summarise_()` verbs and `lazyeval::interp()`.
#'
#' @param df A data frame.
#' @param grp Character vector naming the grouping column(s).
#' @param Var Name (string) of the column to summarise.
#' @param func A function, or the name of one, applied to `Var` per group.
#' @return A data frame with the grouping columns plus one summary column
#'   named `FUN(<Var>)` (e.g. `FUN(disp)`), the same name the lazyeval-based
#'   version produced, so existing `rename()` calls keep working.
f1 <- function(df, grp, Var, func) {
  FUN <- match.fun(func)
  df %>%
    group_by(across(all_of(grp))) %>%
    summarise(
      # Name injection builds the legacy column name from the string in `Var`.
      "FUN({Var})" := FUN(.data[[Var]]),
      # Same grouping result as summarise_(): only the last level is dropped.
      .groups = "drop_last"
    )
}

# Mean displacement per (carb, cyl) pair, per carb, and per cyl.
m1 <- f1(df = mtcars, grp = c("carb", "cyl"), Var = "disp", func = "mean")
m2 <- f1(df = mtcars, grp = "carb", Var = "disp", func = "mean")
m3 <- f1(df = mtcars, grp = "cyl", Var = "disp", func = "mean")

# Stack the three summaries, sort so subtotals (NA rows) follow their detail
# rows, and give the summary column a readable name.
list(m1, m2, m3) %>%
  bind_rows() %>%
  arrange(cyl, carb) %>%
  rename(Mean = `FUN(disp)`)
carb cyl Mean
1 1 4 91.3800
2 2 4 116.6000
3 NA 4 105.1364
4 1 6 241.5000
5 4 6 163.8000
6 6 6 145.0000
7 NA 6 183.3143
8 2 8 345.5000
9 3 8 275.8000
10 4 8 405.5000
11 8 8 301.0000
12 NA 8 353.1000
13 1 NA 134.2714
14 2 NA 208.1600
15 3 NA 275.8000
16 4 NA 308.8200
17 6 NA 145.0000
18 8 NA 301.0000

Either option can be made a little less ugly with data.table's rbindlist with fill:

# dplyr summaries stacked with data.table::rbindlist(); fill = TRUE pads the
# grouping column missing from each subtotal with NA.
rbindlist(
  list(
    mtcars %>%
      group_by(cyl) %>%
      summarise(mean(disp)),
    mtcars %>%
      group_by(carb) %>%
      summarise(mean(disp)),
    mtcars %>%
      group_by(cyl, carb) %>%
      summarise(mean(disp))
  ),
  fill = TRUE
) %>%
  arrange(cyl, carb)

# Pure data.table version: the three summaries are stacked with fill = TRUE so
# the grouping column absent from a subtotal becomes NA.
rbindlist(
  list(
    DT[, mean(disp), by = c("cyl", "carb")],
    DT[, mean(disp), by = "cyl"],
    DT[, mean(disp), by = "carb"]
  ),
  fill = TRUE
)[order(cyl, carb)]

Calculate subtotals with dplyr and tidyr

I don't know if this is the best (compact and readable) but it works ;)

# Reshape to one row per country and one column per sport, then append a
# row-total column and a column-total row.
# NOTE(review): `data` is not defined in this snippet; presumably it has the
# columns country, sport and medals -- confirm against the question.
data %>%
spread(sport, medals) %>%
# Columns 2:4 are addressed by position; this assumes exactly three sport
# columns follow `country` after spread().
mutate(Total = rowSums(.[2:4])) %>%
# Columns 2:5 are the sport columns plus Total; their column sums become the
# final "Total" row.
rbind(., data.frame(country="Total", t(colSums(.[2:5]))))

country curling crosscountry downhill Total
1 Sweden 0 2 0 2
2 Norway 1 1 0 2
3 Denmark 2 2 1 5
4 Finland 3 0 2 5
5 Total 6 5 3 14

subtotals by group R

Using DF in the Note at the end try this one-liner. The same code works if there are a different number of columns. Also try it without the as.data.frame for wide format. No packages are used.

# Cross-tabulate all columns of DF, add "Sum" margins on every dimension, and
# return the result in long (one row per cell) form.
as.data.frame(
  addmargins(
    xtabs(~ ., data = DF)
  )
)

giving:

  Var1 Var2 Freq
1 a b 1
2 b b 1
3 Sum b 2
4 a c 1
5 b c 0
6 Sum c 1
7 a Sum 2
8 b Sum 1
9 Sum Sum 3

Note

DF in reproducible form is:

# Three observations of two factors, written with factor() instead of the raw
# dput() structure; levels are given explicitly to match the original.
DF <- data.frame(
  Var1 = factor(c("a", "a", "b"), levels = c("a", "b")),
  Var2 = factor(c("b", "c", "b"), levels = c("b", "c"))
)

Add missing subtotals to each group using dplyr

This isn't a join, it's just binding new rows on:

# For each id, derive the missing "n" row as total - (a + b), then append the
# original rows of x.
group_by(x, id) %>%
  summarize(
    value = sum(value[key == "total"]) - sum(value[key %in% c("a", "b")]),
    key = "n"
  ) %>%
  bind_rows(x) %>%
  select(id, key, value) %>% # restore the original column order
  arrange(id, key) # sort rows by id, then key
# # A tibble: 9 × 3
# id key value
# <dbl> <chr> <dbl>
# 1 1 a 1
# 2 1 b 2
# 3 1 n 7
# 4 1 total 10
# 5 2 a 4
# 6 2 b 3
# 7 2 n 5
# 8 2 total 12
# 9 2 x 1

Add margin row totals in dplyr chain

With adorn_totals() from the janitor package:

library(janitor)
# Count cars by cyl (rows) and gear (columns), then append a "Total" row.
mtcars %>%
  tabyl(cyl, gear) %>%
  adorn_totals(where = "row")

cyl 3 4 5
4 1 8 2
6 2 4 1
8 12 0 2
Total 15 12 5

To get from there to the "long" form in your post, add tidyr::gather() to the pipeline:

# Same table with a "Total" row, reshaped to long form: every column after
# the first is stacked into gear/n pairs, and convert = TRUE turns the gear
# labels back into numbers.
mtcars %>%
  tabyl(cyl, gear) %>%
  adorn_totals(where = "row") %>%
  tidyr::gather(key = gear, value = n, 2:ncol(.), convert = TRUE)

cyl gear n
1 4 3 1
2 6 3 2
3 8 3 12
4 Total 3 15
5 4 4 8
6 6 4 4
7 8 4 0
8 Total 4 12
9 4 5 2
10 6 5 1
11 8 5 2
12 Total 5 5

Self-promotion alert, I authored this package - adding this answer b/c it's a genuinely efficient solution here.

R: display subtotals in crosstables

# Three-way contingency table with "Sum" margins on every dimension, printed
# flat.
ftable(
  addmargins(
    table(df[c("language", "sex", "smoker")])
  )
)
smoker no yes Sum
language sex
Eng female 9 10 19
male 10 9 19
Sum 19 19 38
Ger female 7 5 12
male 9 8 17
Sum 16 13 29
Spa female 11 8 19
male 9 5 14
Sum 20 13 33
Sum female 27 23 50
male 28 22 50
Sum 55 45 100

# Same table, but with a "Sum" margin on the second dimension (sex) only.
ftable(
  addmargins(
    table(df[c("language", "sex", "smoker")]),
    margin = 2
  )
)
smoker no yes
language sex
Eng female 9 10
male 10 9
Sum 19 19
Ger female 7 5
male 9 8
Sum 16 13
Spa female 11 8
male 9 5
Sum 20 13

subtotal with ddply in R

You can replicate the data 4 times:
- including sex and group
- including sex
- including group
- not including any column

The columns that are not included become "all"

require(plyr)
# Example data: 29 people in three groups with a random sex and age.
dfx <- data.frame(
  group = c(rep("A", 8), rep("B", 15), rep("C", 6)),
  sex = sample(c("M", "F"), size = 29, replace = TRUE),
  age = runif(n = 29, min = 18, max = 54)
)

# Replicate the data once per subtotal level. A column that a subtotal
# aggregates over is overwritten with a single label, so a later grouping by
# (group, sex) yields the detail rows and every subtotal in one pass.
dfAll <- dfx # grand total: both group and sex are collapsed
dfAll$group <- "all"
dfAll$sex <- "all"
dfGroup <- dfx # per-sex subtotal: group is collapsed, sex is kept
dfGroup$group <- "all_group"
dfSex <- dfx # per-group subtotal: sex is collapsed, group is kept
# Fix: this used to overwrite `group` again, which made dfSex a duplicate of
# dfGroup and left no subtotal per group across both sexes.
dfSex$sex <- "all_sex"
dfToGroup <- rbind(dfx, dfGroup, dfSex, dfAll)

# Mean and standard deviation of age for every (group, sex) combination.
# plyr's `.()` quotes the grouping variables so they can be written bare.
ddply(
  .data = dfToGroup,
  .variables = .(group, sex),
  .fun = summarize,
  mean = round(mean(age), 2),
  sd = round(sd(age), 2)
)

Calculating subtotals in R

OK. Assuming your data are in a data frame named foo:

> head(foo)
date mcode mname ycode yname yissue bsent breturn tsent
417572 2010/07/28 45740 ENDPOINT A 5772 XMAG 20100800 7 0 7
417573 2010/07/31 45740 ENDPOINT A 5772 XMAG 20100800 0 0 0
417574 2010/08/04 45740 ENDPOINT A 5772 XMAG 20100800 0 0 0
417575 2010/08/14 45740 ENDPOINT A 5772 XMAG 20100800 0 0 0
417576 2010/08/26 45740 ENDPOINT A 5772 XMAG 20100800 0 4 0
417577 2010/07/28 45741 ENDPOINT L 5772 XMAG 20100800 2 0 2
treturn csales
417572 0 0
417573 0 1
417574 0 1
417575 0 1
417576 0 0
417577 0 0

Then this will do the aggregation of the numeric columns in your data:

> aggregate(cbind(bsent, breturn, tsent, treturn, csales) ~ yname, data = foo, 
+ FUN = sum)
yname bsent breturn tsent treturn csales
1 XMAG 14 8 14 0 6
2 YMAG 11 6 11 6 5

That was using the snippet of data you included in your Q. I used the formula interface to aggregate(), which is a bit nicer in this instance because you don't need all the foo$ bits on the variable names you wish to aggregate. If you have missing data (NA) in your full data set, then you'll need to add an extra argument, na.rm = TRUE, which will get passed to sum(), like so:

> aggregate(cbind(bsent, breturn, tsent, treturn, csales) ~ yname, data = foo, 
+ FUN = sum, na.rm = TRUE)

dplyr summarize across ttest

If we are using tidy() from the broom package:

library(dplyr)
library(broom)
library(tidyr)
# Per `am` group, run a t-test of mpg between vs == 0 and vs == 1, keep the
# p-value and confidence bounds from broom::tidy(), and unnest them into
# regular columns. The one-row result is wrapped in list() so summarise()
# can store it in a single cell.
mtcars %>%
  group_by(am) %>%
  summarise(
    across(
      .cols = mpg,
      .fns = ~ list(
        tidy(t.test(.x[vs == 0], .x[vs == 1])) %>%
          select(p.value, conf.low, conf.high)
      )
    )
  ) %>%
  unnest(mpg)

-output

# A tibble: 2 x 4
am p.value conf.low conf.high
<dbl> <dbl> <dbl> <dbl>
1 0 0.000395 -8.33 -3.05
2 1 0.00459 -14.0 -3.27

In the OP's code, we need the lambda function inside the list

# Same t-test per `am` group, but each statistic is extracted by its own
# lambda; across() names the output columns mpg_<list name>.
mtcars %>%
  group_by(am) %>%
  summarise(
    across(
      .cols = mpg,
      .fns = list(
        p.value = ~ t.test(.x[vs == 0], .x[vs == 1])$p.value,
        conf.low = ~ t.test(.x[vs == 0], .x[vs == 1])$conf.int[1],
        conf.high = ~ t.test(.x[vs == 0], .x[vs == 1])$conf.int[2]
      )
    )
  )

-output

# A tibble: 2 x 4
am mpg_p.value mpg_conf.low mpg_conf.high
<dbl> <dbl> <dbl> <dbl>
1 0 0.000395 -8.33 -3.05
2 1 0.00459 -14.0 -3.27


Related Topics



Leave a reply



Submit