Dplyr: How to Use Group_By Inside a Function

dplyr: How to use group_by inside a function?

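For programming, group_by_ is the standard-evaluation counterpart to group_by: it takes the grouping columns as strings. (Note: group_by_ has itself been deprecated since dplyr 0.7.0; see Update 5 and Update 6 below for the current tidy-evaluation approaches.)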

library(dplyr)

# Count the rows of `x` within each group. The grouping columns are passed
# through `...` as character strings and forwarded unchanged to the
# standard-evaluation verb group_by_().
mytable <- function(x, ...) {
  grouped <- group_by_(x, ...)
  summarise(grouped, n = n())
}
mytable(iris, "Species")
# or iris %>% mytable("Species")

which gives:

     Species  n
1     setosa 50
2 versicolor 50
3  virginica 50

Update: At the time this was written, dplyr used %.%, which is what the code above originally used. %>% is now favored, so the code above has been changed to use it to keep this answer relevant.

Update 2: regroup is now deprecated; use group_by_ instead.

Update 3 group_by_(list(...)) now becomes group_by_(...) in new version of dplyr as per Roberto's comment.

Update 4 Added minor variation suggested in comments.

Update 5: With rlang/tidyeval it is now possible to do this:

library(rlang)
# Count rows per group, with the grouping columns given by name.
#
# x:   a data frame or tibble.
# ...: zero or more column names as character strings.
# Returns a tibble with one row per group and the group size in column `n`.
mytable <- function(x, ...) {
  # syms() takes a single vector/list of names, so collect everything passed
  # through `...` with c() first. Forwarding `...` directly fails with an
  # "unused argument" error as soon as two or more columns are supplied.
  group_ <- syms(c(...))
  x %>% 
    group_by(!!!group_) %>% 
    summarise(n = n())
}
mytable(iris, "Species")

or passing Species unevaluated, i.e. no quotes around it:

library(rlang)
# Count rows per group. The grouping columns are given as bare (unquoted)
# column names through `...`; they are captured as quosures and spliced
# into group_by().
mytable <- function(x, ...) {
  grouping <- enquos(...)
  grouped <- group_by(x, !!!grouping)
  summarise(grouped, n = n())
}
mytable(iris, Species)

Update 6: There is now a {{...}} notation that works if there is just one grouping variable:

# Count rows per level of a single grouping column. `group` is given as a
# bare column name and embraced with {{ }} so group_by() sees the column
# the caller named.
mytable <- function(x, group) {
  by_group <- group_by(x, {{ group }})
  summarise(by_group, n = n())
}
mytable(iris, Species)

Using dplyr group_by in a function

Here is one way to work with the new enquo from dplyr: enquo captures the unquoted argument expression as a quosure, which is then evaluated by unquoting it (UQ or !!) inside group_by, mutate, summarise, etc.

library(dplyr)
# Mean of Petal.Width within each level of a grouping column.
#
# df: data frame containing a Petal.Width column.
# x:  grouping column, given as a bare (unquoted) name.
testFunction <- function(df, x) {
  grouping <- enquo(x)
  grouped <- group_by(df, !!grouping)
  summarize(grouped, mean.Petal.Width = mean(Petal.Width))
}

testFunction(iris, Species)
# A tibble: 3 x 2
#     Species mean.Petal.Width
#      <fctr>            <dbl>
#1     setosa            0.246
#2 versicolor            1.326
#3  virginica            2.026

How to use dplyr::group_by in a function

You can use group_by_at with column indices, for example:

# Count rows per combination of the named columns.
#
# things: character vector of column names to group by. Names that are not
#         columns of `data` are ignored by the %in% lookup.
# data:   table to summarise. Defaults to the global `theTibble`, so existing
#         one-argument calls keep working; pass another table to reuse the
#         function without touching global state.
countString <- function(things, data = theTibble) {
  # Translate the column names into positions, which group_by_at() accepts.
  index <- which(colnames(data) %in% things)
  data %>% 
       group_by_at(index) %>% 
       count()
}

countString(c("animal", "sex"))

## A tibble: 4 x 3
## Groups:   animal, sex [4]
#  animal sex        nn
#  <chr>  <chr>   <int>
#1 cat    f           2
#2 dog    f           1
#3 dog    m           2
#4 fish   unknown     1

how to use group_by in a function in R

You can do:

library(dplyr)
# Per-group count, mean and standard deviation of one column.
#
# df:   data frame to summarise.
# var1: grouping column, given as a bare (unquoted) name.
# var2: column to summarise, given as a bare (unquoted) name.
group <- function(df, var1, var2) {
  by_quo <- enquo(var1)
  value_quo <- enquo(var2)
  grouped <- group_by(df, !!by_quo)
  summarise(
    grouped,
    n = n(),
    mean = mean(!!value_quo),
    sd = sd(!!value_quo)
  )
}

group(df = df, var1 = y, var2 = x)
### A tibble: 2 x 4
##  y         n    mean    sd
##  <fct> <int>   <dbl> <dbl>
##1 A        50 -0.133  0.866
##2 B        50  0.0770 0.976

For further reference check the link

Group_by inside a function

If we need 'x' to accept either column indices or column names (strings), wrap it in across(all_of(x)) inside group_by

library(dplyr) # version >= 1.0.0

# Frequency table for the column(s) selected by `x`, most frequent first.
#
# data: data frame to summarise.
# x:    column selection handed to all_of() - column names or positions.
# Returns up to six rows with the group column(s), the count `n` and the
# share of rows `pc` formatted as a percentage string.
f1 <- function(data, x) {
  counts <- summarise(
    group_by(data, across(all_of(x))),
    n = n(),
    .groups = "drop"
  )
  with_share <- mutate(counts, pc = scales::percent(n / sum(n)))
  head(arrange(with_share, desc(n)))
}

If we have an older version, use group_by_at(x)

-apply the function

out1 <- lapply(colnames(dat), function(x) f1(dat, x))

Or use index

out2 <- lapply(seq_along(dat), function(i) f1(dat, i))

identical(out1, out2)
#[1] TRUE

-output

out1[[1]]
# A tibble: 3 x 3
#    cyl     n pc   
#  <dbl> <int> <chr>
#1     8    14 43.8%
#2     4    11 34.4%
#3     6     7 21.9%

out2[[1]]
# A tibble: 3 x 3
#    cyl     n pc   
#  <dbl> <int> <chr>
#1     8    14 43.8%
#2     4    11 34.4%
#3     6     7 21.9%

using dplyr::group_by in a function within apply

You should apply using the colnames(dat) to get the correct groupings:

dat <- mtcars[c(2:4,11)]



# Frequency table of the global `dat` for one column, most frequent first.
#
# x: name of the grouping column as a string; it is turned into a symbol
#    and unquoted inside group_by().
# Returns up to six rows with the group column, the count `n` and the share
# of rows `pc` formatted as a percentage string.
grp <- function(x) {
  column <- as.name(x)
  tallies <- summarise(group_by(dat, !!column), n = n())
  shares <- mutate(tallies, pc = scales::percent(n / sum(n)))
  head(arrange(shares, desc(n)))
}


lapply(colnames(dat), grp)

Function calling variable names for group_by in dplyr - how do I vectorise this variable in the function?

@akrun's answer offers a working solution, but I think this is an ideal situation to wrap function parameters in vars(), passing the variables you want to group by as a quasi-quotation that dplyr can interpret without any explicit tidyeval code in the body of the function.

library(tidyverse)
#> -- Attaching packages ------------------------------------ tidyverse 1.2.1 --
#> v ggplot2 3.0.0     v purrr   0.2.5
#> v tibble  1.4.2     v dplyr   0.7.6
#> v tidyr   0.8.0     v stringr 1.3.1
#> v readr   1.1.1     v forcats 0.3.0
#> -- Conflicts --------------------------------------- tidyverse_conflicts() --
#> x dplyr::filter() masks stats::filter()
#> x dplyr::lag()    masks stats::lag()
# Create data frame for analysis
dat <- data.frame(
  Type1  = c(0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0),
  Type2  = c(1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3),
  Output = c(4, 2, 7, 5, 1, 1, 7, 8, 3, 2, 5, 4, 3, 6)
)
# using the dplyr::vars() quoting function has 3 main advantages: 
# 1. It makes functions neater
# Mean of `Output` in the global `dat`, grouped by the columns in `.vars`.
#
# .vars: a column selection built with vars(), e.g. vars(Type1, Type2).
mean_out <- function(.vars) {
  # group_by_at() is used rather than group_by() because it accepts the full
  # tidyselect vocabulary, not only basic column selections.
  grouped <- group_by_at(dat, .vars)
  summarise(grouped, mean = mean(Output))
}
# 2. It lets us harness the power of tidyselect
mean_out(vars(Type1))
#> # A tibble: 2 x 2
#>   Type1  mean
#>   <dbl> <dbl>
#> 1     0  3.83
#> 2     1  4.38
mean_out(vars(Type1, Type2))
#> # A tibble: 6 x 3
#> # Groups:   Type1 [?]
#>   Type1 Type2  mean
#>   <dbl> <dbl> <dbl>
#> 1     0     1  2.33
#> 2     0     2  5   
#> 3     0     3  6   
#> 4     1     1  4.33
#> 5     1     2  5   
#> 6     1     3  4
mean_out(vars(-Output))
#> # A tibble: 6 x 3
#> # Groups:   Type1 [?]
#>   Type1 Type2  mean
#>   <dbl> <dbl> <dbl>
#> 1     0     1  2.33
#> 2     0     2  5   
#> 3     0     3  6   
#> 4     1     1  4.33
#> 5     1     2  5   
#> 6     1     3  4
mean_out(vars(matches("Type")))
#> # A tibble: 6 x 3
#> # Groups:   Type1 [?]
#>   Type1 Type2  mean
#>   <dbl> <dbl> <dbl>
#> 1     0     1  2.33
#> 2     0     2  5   
#> 3     0     3  6   
#> 4     1     1  4.33
#> 5     1     2  5   
#> 6     1     3  4
# 3. It doesn't demand that we load rlang, since it's built into dplyr

Dplyr: How to Use Group_By Inside a Function