Run a Custom Function on a Data Frame in R, by Group

Run a custom function on a data frame in R, by group

Using dplyr

library(dplyr)
df %>% 
   group_by(tm) %>%
   do(data.frame(val=calc(.)))
#  tm      val
#1  1 1.665882
#2  2 1.504545
#3  3 1.838000

If we change the function slightly to include multiple arguments, this could also work with summarise

 calc1 <- function(d1, t1, h1, p1){
      (1.27*sum(d1) + 1.62*sum(t1) + 2.10*sum(h1) )/sum(p1) }
 df %>%
     group_by(tm) %>% 
     summarise(val=calc1(d, t, h, p))
 #  tm      val
 #1  1 1.665882
 #2  2 1.504545
 #3  3 1.838000

R: How to run a custom function after group_by using group_map?

As @André Oliveira mentions in the comments, it is recommended to use mutate for adding a column. However, it is possible to do so with group_modify after making some small changes to your function.

newState <- function(dt, groupvars){

  dt["new"]= dt[1,"state"]*3 
  dt

}

dt %>% 
  group_by(paitent_ID) %>% 
  arrange(year, month) %>% 
  group_modify(newState) %>% 
  ungroup

# # A tibble: 100 x 5
#    paitent_ID  year month state   new
#         <int> <int> <int> <int> <dbl>
#  1          1  2006     5     3     9
#  2          2  2012    12     3     9
#  3          3  2013    11     8    24
#  4          3  2014    10     1    24
#  5          3  2019     5     6    24
#  6          4  2006     7     5    15
#  7          4  2006     7     2    15
#  8          5  2003     8     8    24
#  9          7  2015    12     2     6
# 10          7  2017     8    10     6

And a more conventional approach

dt %>% 
  group_by(paitent_ID) %>% 
  arrange(year, month) %>% 
  mutate(new = state[1]*3)

R : group by with custom functions

As guided by eddi's point above, the basic problem is thinking that your custom function is being called inside a loop and that 'dt$column' will mysteriously give you the 'current value at the current row'. Instead it gives you the entire column (a vector). The function is passed the entire data table, not row-wise bits of data.

So, replacing the value in the return statement with something that represents a single value works. Example:

customfunc <- function(dt){
  q = unname(quantile(dt$column,0.25))
  n = nrow(dt[dt$column <= q])
  return(n/length(dt$someOtherColumn))
}

> df <- data.frame(Z=c("abc","abc","def","abc"), column=c(1,2,3,4), someOtherColumn=c(5,6,7,8))
> df
    Z column someOtherColumn
1 abc      1               5
2 abc      2               6
3 def      3               7
4 abc      4               8
> newdf <- setDT(df)[, customfunc(.SD), by=Z, .SDcols=c("column", "someOtherColumn")]
> newdf
     Z        V1
1: abc 0.3333333
2: def 1.0000000

Now the data is aggregated correctly.

R group by one column and apply custom function to another column

With dplyr, you can do:

fun <- function(x) {
 (x/n()) * 100
}

x %>%
 group_by(x) %>%
 mutate(Data = fun(Data))

   x      Data
   <fct> <dbl>
 1 A        20
 2 A        40
 3 A        60
 4 A        80
 5 A       100
 6 B        20
 7 B        40
 8 B        60
 9 B        80
10 B       100

Apply custom function to multiple groups in R

You almost had it. You just need to apply your function for each group. The purrr library makes this pretty easy.

Libraries:

library(purrr)
library(dplyr)

Main function:

Takes group name and full data set as arguments, then filters by that group. Then makes calculations and returns them as a dataframe.

width <- function(Group.Name, data){
  # limit to rows for that group
  df<-data %>% filter(Group == Group.Name)
  
  i.mins  <- which(diff(sign(diff(c(Inf, df$Value, Inf)))) == 2)
  
  i.mx <- which.max(df$Value)
  
  i <- sort(c(i.mins, i.mx))
  
  ix <- i[which(i == i.mx) + c(-1, 1)]
  
  # put results in dataframe
  df <- data.frame("Group" = Group.Name, "Value_1" = ix[1], "Value_2" = ix[2])
  
  # format Group Col
  df$Group <- as.character(df$Group)
  return(df)
}

Looping through groups with `purrr`

# unique group names we need to loop through for calcs
Group.Names <- unique(data$Group)

# take each name and use the width function
# then put results together in one datframe
Group.Names %>% map_df(~ width(Group.Name = .x, data = data))

Results:

   Group Value_1 Value_2
1 Group1      16      22
2 Group2       4      12
3 Group3       2      15

Note: the .x notation just tells map to put the Group.Names object as the first argument in our width function

custom function after grouping data.fame

I guess this is the dplyr equivalent to @jlhoward's data.table solution:

df %>%
  group_by(l) %>%
  mutate(e = v[d == "b"] - v[d == "a"])

Edit after comment by OP:

If you want to use a custom function, here's a possible way:

myfunc <- function(x) {
  with(x, v[d == "b"] - v[d == "a"])
}

test %>%
  group_by(l) %>%
  do(data.frame(. , e = myfunc(.))) %>%
  arrange(d, l)                   # <- just to get it back in the original order

Edit after comment by @hadley:

As hadley commented below, it would be better in this case to define the function as

f <- function(v, d) v[d == "b"] - v[d == "a"]

and then use the custom function f inside a mutate:

df %>%
  group_by(l) %>%
  mutate(e = f(v, d))

Thanks @hadley for the comment.

R apply() custom function to every row in data frame

Another approach is modifying your existing function such that it is vectorised.

    t.test2 <- function(m1,m2,s1,s2,n1,n2,m0=0,equal.variance=FALSE)
{
    if(!equal.variance) 
    {
        se <- sqrt( (s1^2/n1) + (s2^2/n2) )
        # welch-satterthwaite df
        df <- ( (s1^2/n1 + s2^2/n2)^2 )/( (s1^2/n1)^2/(n1-1) + (s2^2/n2)^2/(n2-1) )
    } else
    {
        # pooled standard deviation, scaled by the sample sizes
        se <- sqrt( (1/n1 + 1/n2) * ((n1-1)*s1^2 + (n2-1)*s2^2)/(n1+n2-2) ) 
        df <- n1+n2-2
    }      
    t <- (m1-m2-m0)/se 
    dat <- vapply(seq_len(length(m1)), 
                  function(x){c(m1[x]-m2[x], se[x], t[x], 2*pt(-abs(t[x]),df[x]))},
                  numeric(4))  #one tailed m2 > m1. Replace with "2*pt(-abs(t),df))" for two tailed. 
    dat <- t(dat)
    dat <- as.data.frame(dat)
    names(dat) <- c("Difference of means", "Std Error", "t", "p-value")
    return(dat) 
}

This approach allows you to pass in vectors for your various inputs and it will provide a data frame of equal length to your inputs. It uses the vapply function to return a vector of length 4 for each value provided.

Under this approach, you can simply go

t.test2(MPAmeans$reference_mean, MPAmeans$MPA_mean, MPAmeans$sd_reference, MPAmeans$sd_MPA, MPAmeans$n_reference, MPAmeans$n_MPA)

(or whatever you end up calling your variables)

apply a custom function on grouped dataframe n rows at a time

You could use slider::slide to create subgroups :

library(dplyr)
library(purrr)
library(slider)

N <- 10

Collage <- function(country,strain,subgroupnumber,data) {
  cat(paste('Processing:',country,'-',strain,'/',subgroupnumber),'\n')
  cat(paste(nrow(data) , ' files to read \n'))
  cat(paste(data$png_file,collapse=' ; '),'\n')
  cat('\n')
}

res <- df %>% group_by(country,strain) %>% 
  group_walk(~{
    group <- .y
    subgroups <- slider::slide(.x,.f=~.x,.step = N ,.after = N-1)
    # Remove empty elements
    subgroups <- subgroups[lengths(subgroups) != 0]
    
    # Run wished function on each subgroup
    subgroups %>% iwalk(~{
      Collage(group[1],group[2],.y,.x)
  })

})

Processing: UK - Covid_123 / 1 
10  files to read 
A_UK_1_lp21_pmn1__1.png ; A_UK_1_xno9_pmn1__1.png ; A_UK_2.14.3_lp21_pmn1__1.png ; A_UK_2.14.3_xno9_pmn1__1.png ; A_UK_2.2_lp21_zn78__1.png ; A_UK_2.2_xno9_zn78__1.png ; A_UK_2.3_lp21_pmn1__1.png ; A_UK_2.3_xno9_pmn1__1.png ; A_UK_2.4_lp21_yun7__1.png ; A_UK_2.8.1_lp21_pmn1__1.png 

Processing: UK - Covid_123 / 2 
9  files to read 
A_UK_2.8.1_xno9_pmn1__1.png ; A_UK_2.8.2_lp21_pmn1__1.png ; A_UK_2.8.2_xno9_pmn1__1.png ; B_UK_2.1_lp21_pmn1__1.png ; B_UK_2.1_xno9_pmn1__1.png ; B_UK_2.14.1_lp21_pmn1__1.png ; B_UK_2.14.1_xno9_pmn1__1.png ; B_UK_2.14.2_lp21_pmn1__1.png ; B_UK_2.14.2_xno9_pmn1__1.png 

Processing: UK - Covid_125 / 1 
10  files to read 
A_UK_2.14.3_lp21_pmn1__1.png ; A_UK_2.14.3_xno9_pmn1__1.png ; A_UK_2.2_lp21_zn78__1.png ; A_UK_2.2_xno9_zn78__1.png ; A_UK_2.3_lp21_pmn1__1.png ; A_UK_2.3_xno9_pmn1__1.png ; A_UK_2.4_lp21_yun7__1.png ; A_UK_2.8.1_lp21_pmn1__1.png ; A_UK_2.8.1_xno9_pmn1__1.png ; A_UK_2.8.2_lp21_pmn1__1.png 

Processing: UK - Covid_125 / 2 
5  files to read 
A_UK_2.8.2_xno9_pmn1__1.png ; B_UK_2.14.1_lp21_pmn1__1.png ; B_UK_2.14.1_xno9_pmn1__1.png ; B_UK_2.14.2_lp21_pmn1__1.png ; B_UK_2.14.2_xno9_pmn1__1.png 

Processing: UK - Covid_127 / 1 
10  files to read 
A_UK_2.2_lp21_zn78__1.png ; A_UK_2.2_xno9_zn78__1.png ; A_UK_2.3_lp21_pmn1__1.png ; A_UK_2.3_xno9_pmn1__1.png ; A_UK_2.4_lp21_yun7__1.png ; A_UK_2.9.1_lp21_yun7__1.png ; B_UK_2.12.1_lp21_yun7__1.png ; B_UK_2.12.2_lp21_yun7__1.png ; B_UK_2.7.1_lp21_pmn1__1.png ; B_UK_2.7.1_xno9_pmn1__1.png 

Processing: UK - Covid_127 / 2 
7  files to read 
B_UK_2.7.4_lp21_yun7__1.png ; B_UK_2.9.2_lp21_yun7__1.png ; A_UK_2.4_lp21_yun7__1.png ; A_UK_2.5.4_lp21_pmn1__1.png ; A_UK_2.5.4_xno9_pmn1__1.png ; A_UK_2.6.4_lp21_yun7__1.png ; B_UK_2.5.3_lp21_yun7__1.png 

Processing: UK - Covid_128 / 1 
1  files to read 
A_UK_2.4_lp21_yun7__1.png 
              1

Grouped application of function that return a data.frame (without a for loop)

One option is to use dplyr::group_split() and purrr::map_dfr().

How this works: group_split() will divide your data.frame df into a list of data.frames based on the grouping variables you supply (e.g., g). Next, map_dfr() can be used to apply a function to each element of that list. Because your custom function ff() returns a data.frame without your grouping variable g, you'll want to add that information back to ff() output - this can be accomplished with mutate() as in the example below:

library(dplyr)
library(purrr)

# set seed so that example is reproducible
set.seed(1)

# your example data and function
df <- data.frame(start=1:10,end=21:30,g=sample(LETTERS[1:2],10,replace=TRUE))

ff <- function(start,end,... ) {
  out <- data.frame(T1=c(start,rev(start)),T2=c(end,rev(end)))
  return(out)
}

# use group_split & map_dfr
df %>%
  # divide df into a list of data.frames based on supplied grouping variables
  group_split(g) %>%
  # for each element in the list, apply this function
  map_dfr(function(df.x) {
    with(df.x,
         # get the data.frame your function returns
         ff(start, end) %>%
           # add your grouping variables back-in (stripped by ff)
           mutate(g = g[1]))
  })

# a short-hand version of the above can be written as:
df %>%
  group_split(g) %>%
  map_dfr(~ff(.x$start, .x$end) %>% mutate(g = .x$g[1]))

Run a Custom Function on a Data Frame in R, by Group