Apply T-Test on Many Columns in a Dataframe Split by Factor

Apply t-test on many columns in a dataframe split by factor

Maybe this produces the result you are looking for:

# Example data: a grouping column followed by five numeric measurement columns.
df <- read.table(text="Group   var1    var2    var3    var4    var5
1           3   5   7   3   7
1           3   7   5   9   6
1           5   2   6   7   6
1           9   5   7   0   8
1           2   4   5   7   8
1           2   3   1   6   4
2           4   2   7   6   5
2           0   8   3   7   5
2           1   2   3   5   9
2           1   5   3   8   0
2           2   6   9   0   7
2           3   6   7   8   8
2           10  6   3   8   0", header = TRUE)

# Test one measurement column against df$Group and flatten the pieces of
# interest (group means, p-value, t statistic, confidence interval) into a
# single named numeric vector.
group_ttest <- function(x) {
  fit <- t.test(x ~ df$Group)
  unlist(fit[c("estimate", "p.value", "statistic", "conf.int")])
}

# sapply() yields one column per variable; transpose so each variable is a row.
t(sapply(df[-1], group_ttest))

The result:

     estimate.mean in group 1 estimate.mean in group 2   p.value statistic.t conf.int1 conf.int2
var1                 4.000000                 3.000000 0.5635410   0.5955919 -2.696975  4.696975
var2                 4.333333                 5.000000 0.5592911  -0.6022411 -3.104788  1.771454
var3                 5.166667                 5.000000 0.9028444   0.1249164 -2.770103  3.103436
var4                 5.333333                 6.000000 0.7067827  -0.3869530 -4.497927  3.164593
var5                 6.500000                 4.857143 0.3053172   1.0925986 -1.803808  5.089522

split a dataframe on a factor and apply a function

You can replace

 # Line to be replaced: strat.var holds a column name, so split() is given
 # that name itself as the grouping value rather than the column it refers to.
 sdat <- with(dat, split(dat, strat.var))

with

# Look the column up by the name stored in strat.var and split on its values.
sdat <- split(dat, dat[strat.var])

in the myFun.

The previous code did not split the data as intended; instead, you were getting the sum over the whole dataset, i.e.

# Mean number of breaks for each tension level (wool is ignored), then the
# total of those means.
tension_means <- tapply(warpbreaks$breaks, warpbreaks$tension, mean)
sum(tension_means)
#[1] 84.44444

Using the corrected myFun

# Call myFun on warpbreaks with wool as the stratification column, tension as
# the PSU column and breaks as the analysis variable; the commented lines
# below show the printed result.
myFun(warpbreaks, strat.var='wool', PSU='tension', var1='breaks')
#$N.h
#[1] 2

#$out
#  stratum ns              mns
#A       A  3 93.1111111111111
#B       B  3 75.7777777777778

You could also create a function using dplyr (you can fine-tune the below one)

library(lazyeval)
library(dplyr)
# Per-stratum sum of PSU means.
#
# Arguments:
#   dat        data frame to summarise.
#   strat.var  name (string) of the stratification column.
#   PSU        name (string) of the column identifying units within a stratum.
#   var1       name (string) of the numeric column to average.
#
# Returns a grouped tibble with one row per stratum and the columns
# ns (distinct PSU values in the stratum), N.h (distinct strata in dat),
# the stratum column, and mns (sum over PSUs of the PSU-level mean of var1).
#
# Uses the .data pronoun and across(all_of()) instead of lazyeval::interp()
# and the deprecated underscore verbs (mutate_, group_by_, select_).
myFun2 <- function(dat, strat.var, PSU, var1) {
  dat %>%
    # Number of strata, computed over the whole data set.
    mutate(N.h = n_distinct(.data[[strat.var]])) %>%
    group_by(across(all_of(strat.var))) %>%
    # Number of PSUs inside each stratum.
    mutate(ns = n_distinct(.data[[PSU]])) %>%
    group_by(across(all_of(PSU)), .add = TRUE) %>%
    # Mean of var1 inside each stratum x PSU cell.
    mutate(mns = mean(.data[[var1]])) %>%
    # Keep the PSU column so that distinct() leaves exactly one row per cell,
    # even when two PSUs of a stratum happen to share the same mean.
    select(all_of(c(strat.var, PSU, "ns", "N.h", "mns"))) %>%
    distinct() %>%
    # Group order fixes the column order of the result: ns, N.h, stratum.
    group_by(across(all_of(c("ns", "N.h", strat.var)))) %>%
    summarise(mns = sum(mns), .groups = "drop_last")
}

# Same example through myFun2: one row per wool level, mns matching the
# values shown for myFun.
# NOTE(review): the header lines below look like output from an older dplyr
# release; newer releases print a tibble header instead - confirm on re-run.
myFun2(warpbreaks, 'wool', 'tension', 'breaks')
#Source: local data frame [2 x 4]
#Groups: ns, N.h

#  ns N.h wool      mns
#1  3   2    A 93.11111
#2  3   2    B 75.77778

for loop to change columns with a specified unique length to factor in multiple dataframes

The instruction

for(i in dataframes)

extracts each element i from the list dataframes, so the loop changes a copy that is never assigned back to the original list. One way to correct the problem is

# Convert every column with fewer than 6 distinct values to a factor, in each
# data frame of the list, writing the modified copy back into the list.
for (i in seq_along(dataframes)) {
  x <- dataframes[[i]]
  # vapply() always returns a logical vector (sapply() may return a list,
  # e.g. for a data frame without columns), so the mask is a safe column index.
  cols.to.factor <- vapply(
    x,
    function(col) length(unique(col)) < 6,
    logical(1)
  )
  x[cols.to.factor] <- lapply(x[cols.to.factor], factor)
  # Assign back: x is only a copy of the list element.
  dataframes[[i]] <- x
}

An equivalent lapply based solution is

# Rebuild the list: in each data frame, columns with fewer than 6 distinct
# values become factors; all other columns are returned unchanged.
dataframes <- lapply(dataframes, \(x) {
  # vapply() always returns a logical vector (sapply() may return a list,
  # e.g. for a data frame without columns), so the mask is a safe column index.
  cols.to.factor <- vapply(
    x,
    function(col) length(unique(col)) < 6,
    logical(1)
  )
  x[cols.to.factor] <- lapply(x[cols.to.factor], factor)
  x
})

Split dataframe by groups while keeping certain factor levels in each part

Using lapply you can rbind one particular group that you wish to keep.

keep_val <- 'y'
# Rows of the group that must be present in every part. Explicit indexing
# (rather than subset()) keeps keep_val from being looked up as a column.
keep_group <- test_df[which(test_df$df_groups == keep_val), , drop = FALSE]

# Split by group and append the kept rows to every other part. isTRUE() makes
# an empty part (unused factor level, whose first value is NA) count as
# "not the kept group" instead of failing inside if().
lapply(split(test_df, test_df$df_groups), function(x) {
  if (isTRUE(x$df_groups[1] == keep_val)) x else rbind(x, keep_group)
})

You can also use purrr's imap similarly -

# imap() passes each part together with its group name, so the name alone
# decides whether the kept group still has to be appended.
purrr::imap(
  split(test_df, test_df$df_groups),
  \(part, group_name) {
    if (group_name == keep_val) part else rbind(part, keep_group)
  }
)

#$x
#   df_groups
#1          x
#4          x
#7          x
#8          x
#2          y
#5          y
#10         y

#$y
#   df_groups
#2          y
#5          y
#10         y

#$z
#   df_groups
#3          z
#6          z
#9          z
#2          y
#5          y
#10         y

Running multiple t.tests to compare pairs of column values in R

If we need to do the t.test on the corresponding '1s', '2s' and '3s' for 'A' and 'C', then split the dataset based on the substring of the column names that contains only the numbers, and then apply t.test

# Group columns 2:7 by the digits in their names, so that columns sharing a
# number end up together, then compare the two columns of each group with a
# two-sample t-test. Handing the two-column data frame straight to t.test()
# would instead pool both columns into a single one-sample test.
lapply(
  split.default(df[2:7], gsub("\\D+", "", names(df)[2:7])),
  function(pair) {
    # Each number is expected to match exactly two columns.
    stopifnot(ncol(pair) == 2L)
    t.test(pair[[1]], pair[[2]])
  }
)

Apply T-Test on Many Columns in a Dataframe Split by Factor