dplyr summarise multiple columns using t.test
After all the discussion with @aosmith and @Misha, here is one approach. As @aosmith wrote in the comments, you want to do the following.
# Two-sample t-test (t.test() defaults) for each column from disp to qsec,
# comparing the rows with vs == 0 against the rows with vs == 1; the result is
# a one-row data frame holding one p-value per column.
# across() replaces the deprecated summarise_each()/funs() pair and keeps the
# original column names instead of generating vars1..vars5.
mtcars %>%
  summarise(across(disp:qsec, ~ t.test(.x[vs == 0], .x[vs == 1])$p.value))
#           disp           hp       drat           wt         qsec
# 1 2.476526e-06 1.819806e-06 0.01285342 0.0007281397 3.522404e-06
vs is either 0 or 1 (the group). If you want to run a t-test between the two groups for a variable (e.g., disp), you need to subset the data as @aosmith suggested. Thank you for the contribution.
What I originally suggested works in a different situation, in which you simply compare two columns. Here is some sample data and code.
# Sample data: 30 rows with a constant country and year, ids 1..30, and three
# columns A, B and C of integers drawn with replacement from 1..1e5.
# No seed is set, so the values (and any p-values computed from them) differ
# on every run.
foo <- data.frame(country = "Iceland",
year = 2014,
id = 1:30,
A = sample.int(1e5, 30, replace = TRUE),
B = sample.int(1e5, 30, replace = TRUE),
C = sample.int(1e5, 30, replace = TRUE),
stringsAsFactors = FALSE)
If you want to run t-tests for the A–C and B–C combinations, the following would be one way.
# Paired t-test of A against C and of B against C; foo2 is a one-row data
# frame with one p-value per tested column.
# across() replaces the deprecated summarise_each()/funs() pair and names the
# results A and B directly, so no manual renaming is needed afterwards.
# `paired` is spelled out in full instead of relying on partial matching.
foo2 <- foo %>%
  summarise(across(A:B, ~ t.test(.x, C, paired = TRUE)$p.value))
# A B
#1 0.2937979 0.5316822
dplyr summarize across ttest
If we are using tidy() from the broom package:
library(dplyr)
library(broom)
library(tidyr)
# For each level of am, run a t-test on mpg between the vs == 0 and vs == 1
# rows, keep the p-value and confidence bounds from tidy(), store that one-row
# result in a list column, and finally expand it into regular columns.
mtcars %>%
  group_by(am) %>%
  summarise(
    across(
      .cols = mpg,
      .fns = ~ list(
        select(
          tidy(t.test(.x[vs == 0], .x[vs == 1])),
          p.value, conf.low, conf.high
        )
      )
    )
  ) %>%
  unnest(mpg)
-output
# A tibble: 2 x 4
am p.value conf.low conf.high
<dbl> <dbl> <dbl> <dbl>
1 0 0.000395 -8.33 -3.05
2 1 0.00459 -14.0 -3.27
In the OP's code, we need the lambda function inside the list
# Same comparison as a named list of lambdas: each entry reruns the t-test on
# mpg (vs == 0 rows against vs == 1 rows) and extracts one component, giving
# the columns mpg_p.value, mpg_conf.low and mpg_conf.high per level of am.
mtcars %>%
  group_by(am) %>%
  summarise(
    across(
      .cols = mpg,
      .fns = list(
        p.value = ~ t.test(.x[vs == 0], .x[vs == 1])[["p.value"]],
        conf.low = ~ t.test(.x[vs == 0], .x[vs == 1])[["conf.int"]][1],
        conf.high = ~ t.test(.x[vs == 0], .x[vs == 1])[["conf.int"]][2]
      )
    )
  )
-output
# A tibble: 2 x 4
am mpg_p.value mpg_conf.low mpg_conf.high
<dbl> <dbl> <dbl> <dbl>
1 0 0.000395 -8.33 -3.05
2 1 0.00459 -14.0 -3.27
How to apply t.test() to multiple pairs of columns after mutate across
The output of t.test is a list, so we need to wrap it in list() to store it in a column created by mutate.
library(dplyr)
library(stringr)
# For every PreScore* column, run a t-test against the PostScore* column with
# the same suffix and store the complete htest object in a list column named
# <suffix>_TTest.
out <- df %>%
  mutate(
    across(
      starts_with("PreScore"),
      # cur_column() is the name of the PreScore column being processed; the
      # matching PostScore column is looked up by name.
      ~ list(
        t.test(.x, get(str_replace(cur_column(), "^PreScore", "PostScore")))
      ),
      .names = "{.col}_TTest"
    )
  ) %>%
  # rename_with() is the current replacement for the superseded rename_at().
  rename_with(~ str_remove(.x, "PreScore"), ends_with("TTest"))
-check the str
> str(out)
'data.frame': 3 obs. of 10 variables:
$ Subject : int 1 2 3
$ PreScoreTestA : int 30 15 20
$ PostScoreTestA: int 40 12 22
$ PreScoreTestB : int 6 9 11
$ PostScoreTestB: int 8 13 12
$ PreScoreTestC : int 12 7 9
$ PostScoreTestC: int 10 7 10
$ TestA_TTest :List of 3
..$ :List of 10
.. ..$ statistic : Named num -0.322
.. .. ..- attr(*, "names")= chr "t"
.. ..$ parameter : Named num 3.07
.. .. ..- attr(*, "names")= chr "df"
.. ..$ p.value : num 0.768
.. ..$ conf.int : num -32.2 26.2
.. .. ..- attr(*, "conf.level")= num 0.95
.. ..$ estimate : Named num 21.7 24.7
.. .. ..- attr(*, "names")= chr [1:2] "mean of x" "mean of y"
.. ..$ null.value : Named num 0
.. .. ..- attr(*, "names")= chr "difference in means"
.. ..$ stderr : num 9.3
.. ..$ alternative: chr "two.sided"
.. ..$ method : chr "Welch Two Sample t-test"
.. ..$ data.name : chr "PreScoreTestA and get(str_replace(cur_column(), \"^PreScore\", \"PostScore\"))"
.. ..- attr(*, "class")= chr "htest"
..$ :List of 10
...
If we need to extract only a particular list element, e.g. p.value:
# Add one <PreScore column>_TTest column per PreScore* column holding the
# p-value of the t-test against the matching PostScore* column. mutate()
# repeats that single p-value on every row.
df %>%
  mutate(
    across(
      .cols = starts_with("PreScore"),
      .fns = ~ t.test(
        .x,
        get(str_replace(cur_column(), "^PreScore", "PostScore"))
      )$p.value,
      .names = "{.col}_TTest"
    )
  )
Subject PreScoreTestA PostScoreTestA PreScoreTestB PostScoreTestB PreScoreTestC PostScoreTestC PreScoreTestA_TTest
1 1 30 40 6 8 12 10 0.767827
2 2 15 12 9 13 7 7 0.767827
3 3 20 22 11 12 9 10 0.767827
PreScoreTestB_TTest PreScoreTestC_TTest
1 0.330604 0.8604162
2 0.330604 0.8604162
3 0.330604 0.8604162
Note that by using mutate we are storing the same information in every row. Instead, we may use summarise, which returns a single row:
# Collapse to a single row: one p-value per PreScore* column, from the t-test
# against the PostScore* column with the same suffix.
df %>%
  summarise(
    across(
      .cols = starts_with("PreScore"),
      .fns = ~ t.test(
        .x,
        get(str_replace(cur_column(), "^PreScore", "PostScore"))
      )$p.value,
      .names = "{.col}_TTest"
    )
  )
PreScoreTestA_TTest PreScoreTestB_TTest PreScoreTestC_TTest
1 0.767827 0.330604 0.8604162
Summarise multiple columns using weighted t-test
We can use summarise with across:
library(dplyr)
# For every column from population to farmland, run a weighted t-test of the
# "Treated" rows against the "Control" rows, weighting each side by the
# matching entries of the weight column, and keep the third coefficient.
df %>%
  summarise(
    across(
      population:farmland,
      ~ weights::wtd.t.test(
        x = .x[cat == "Treated"],
        y = .x[cat == "Control"],
        weight = weight[cat == "Treated"],
        weighty = weight[cat == "Control"]
      )$coefficients[3]
    )
  )
Or using lapply/sapply
# Base R variant: apply the same weighted t-test to columns 2 to 4 of df and
# collect the third coefficient of each result.
sapply(df[2:4], function(column) {
  treated <- df$cat == "Treated"
  control <- df$cat == "Control"
  fit <- weights::wtd.t.test(
    x = column[treated],
    y = column[control],
    weight = df$weight[treated],
    weighty = df$weight[control]
  )
  fit$coefficients[3]
})
R: t.test multiple variables in dataframe with dplyr then summarise in table
I would use the dplyr package (together with tidyr for reshaping/nesting and purrr for mapping) for this analysis as follows:
# pivot_longer() and nest() come from tidyr and map_dbl() from purrr, so all
# three packages must be attached, not just dplyr.
library(dplyr)
library(tidyr)
library(purrr)

# For every AOI x KP combination: the p-value of the t-test of value by
# Stimuli, plus the mean value within Stimuli "A" and within Stimuli "B".
DF %>%
  pivot_longer(starts_with("KP"), names_to = "KP", values_to = "value") %>%
  group_by(AOI, KP) %>%
  nest() %>%
  # After nest() each row already is one AOI/KP combination; drop the
  # grouping so it does not leak into the result.
  ungroup() %>%
  mutate(
    pval = map_dbl(data, ~ t.test(value ~ Stimuli, data = .x)$p.value),
    mean_a = map_dbl(data, ~ mean(.x$value[.x$Stimuli == "A"])),
    mean_b = map_dbl(data, ~ mean(.x$value[.x$Stimuli == "B"]))
  ) %>%
  select(-data) %>%
  arrange(KP, AOI)
Using a t.test inside dplyr summarise after grouping
library(tidyverse)
library(magrittr)
# Within each cut, t-test the prices of colour "E" diamonds against those of
# colour "I" diamonds and keep the p-value (stored in the column price_avg).
diamonds %>%
  group_by(cut) %>%
  summarise(
    price_avg = t.test(
      x = price[color == "E"],
      y = price[color == "I"]
    )$p.value
  )
# # A tibble: 5 x 2
# cut price_avg
# <ord> <dbl>
# 1 Fair 3.90e- 3
# 2 Good 1.46e-12
# 3 Very Good 2.44e-39
# 4 Premium 7.27e-52
# 5 Ideal 7.63e-62
The problem with your solution is that `.` does not refer to the subset of your dataset for the current group, but to the whole dataset. Check by doing this:
# Demonstration: store whatever `.` refers to for each cut in a list column.
# Every entry turns out to be the full 53,940-row dataset, not the group.
diamonds %>%
  group_by(cut) %>%
  summarise(
    d = list(.)
  )
# # A tibble: 5 x 2
# cut d
# <ord> <list>
# 1 Fair <tibble [53,940 x 10]>
# 2 Good <tibble [53,940 x 10]>
# 3 Very Good <tibble [53,940 x 10]>
# 4 Premium <tibble [53,940 x 10]>
# 5 Ideal <tibble [53,940 x 10]>
An alternative solution would be this:
# Nest all columns except cut into a list column named data (one tibble per
# cut), then t-test the "E" prices against the "I" prices inside each tibble.
# The nested column is named explicitly: the unnamed form nest(-cut) is
# deprecated in current tidyr.
diamonds %>%
  nest(data = -cut) %>%
  mutate(
    price_avg = map_dbl(
      data,
      # %$% (magrittr) exposes the columns of the filtered tibble so that
      # `price` can be extracted as a plain vector.
      ~ t.test(
        .x %>% filter(color == "E") %$% price,
        .x %>% filter(color == "I") %$% price
      )$p.value
    )
  )
# # A tibble: 5 x 3
# cut data price_avg
# <ord> <list> <dbl>
# 1 Ideal <tibble [21,551 x 9]> 7.63e-62
# 2 Premium <tibble [13,791 x 9]> 7.27e-52
# 3 Good <tibble [4,906 x 9]> 1.46e-12
# 4 Very Good <tibble [12,082 x 9]> 2.44e-39
# 5 Fair <tibble [1,610 x 9]> 3.90e- 3
This works with filter because each time you pass it the appropriate subset of your data (i.e. the matching element of the data column).
T-tests across multiple columns or tidy the data
Yes, some pivoting is needed. Assuming you have no directional hypotheses and you want to do a pre-post assessment for each test, this might be what you are looking for:
# Example data: three subjects with a pre and a post score for each of the
# three tests (Pushup, Run, Jump). All columns are stored as doubles.
df <- data.frame(
  Subject = c(1, 2, 3),
  PrePushup = c(30, 15, 20),
  PostPushup = c(40, 12, 22),
  PreRun = c(6, 9, 11),
  PostRun = c(8, 13, 12),
  PreJump = c(12, 7, 9),
  PostJump = c(10, 7, 10)
)
# Paired pre/post t-test for each test, returned as a named list of htest
# objects.
# The formula interface of t.test() does not accept `paired` in recent R
# releases (an error since R 4.4.0), and pairing by row order in long data is
# fragile. The Pre and Post scores are therefore put side by side, one row per
# Subject and test, and passed as two explicit vectors.
# The differences are still Post - Pre, so the statistics are unchanged; only
# the printed `data:` label now reads `.x$Post and .x$Pre`.
df %>%
  pivot_longer(
    -Subject,
    names_to = c("time", "test"),
    values_to = "score",
    names_pattern = "(Pre|Post)(.*)"
  ) %>%
  pivot_wider(names_from = time, values_from = score) %>%
  group_by(test) %>%
  nest() %>%
  mutate(
    t_tests = map(data, ~ t.test(.x$Post, .x$Pre, paired = TRUE))
  ) %>%
  pull(t_tests) %>%
  purrr::set_names(c("Pushup", "Run", "Jump"))
$Pushup
Paired t-test
data: score by time
t = 0.79241, df = 2, p-value = 0.5112
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-13.28958 19.28958
sample estimates:
mean of the differences
3
$Run
Paired t-test
data: score by time
t = 2.6458, df = 2, p-value = 0.1181
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-1.461250 6.127916
sample estimates:
mean of the differences
2.333333
$Jump
Paired t-test
data: score by time
t = -0.37796, df = 2, p-value = 0.7418
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-4.127916 3.461250
sample estimates:
mean of the differences
-0.3333333
Related Topics
Set Number of Columns (Or Rows) in a Facetted Plot
Legends for Multiple Fills in Ggplot
Update a Column of Nas in One Data Table with the Value from a Column in Another Data Table
R Shiny, How to Make Datatable React to Checkboxes in Datatable
Dictionary() Is Not Supported Anymore in Tm Package. How to Emend Code
Show Element Values in Barplot
Filtering Rows in R Unexpectedly Removes Nas When Using Subset or Dplyr::Filter
Changing the Symbol in the Legend Key in Ggplot2
Deleting Specific Rows from a Data Frame
Calling a User-Defined R Function from C++ Using Rcpp