chi square test for each row in data frame
You can use apply
with "MARGIN = 1" to loop over the rows and then do the chisq.test
. Extract the values using $statistic
and $p.value
and cbind
it to the dataset.
# Run a chi-squared test on each row of `df` (apply() with MARGIN = 1) and
# collect the test statistic and p-value, one result row per input row.
row_tests <- t(apply(df, 1, function(x) {
  ch <- chisq.test(x)
  c(unname(ch$statistic), ch$p.value)
}))
# Append the two result columns to the original data.
df1 <- cbind(df, row_tests)
# Name the appended columns by their position after the columns of `df`
# instead of a fixed 3:4, so this also works when `df` does not have exactly
# two columns.
colnames(df1)[ncol(df) + seq_len(2)] <- c("x-squared", "p-value")
chisq.test for each row on four numbers and output in new data frame in R
You were close, just inspect one single test with str
which helps you to decide which elements to select.
# Columns holding the four counts that are tested together in each row.
count_cols <- c(
  "female_boxing", "female_cycling", "male_boxing", "male_cycling"
)
# For every row, run chisq.test() on the four counts and keep only the
# test statistic and the p-value.
apply(dat[, count_cols], 1, function(counts) {
  chisq.test(counts)[c("statistic", "p.value")]
})
The apply
gives you a list, the results are a little nicer using sapply
and looping over the rows.
# Columns holding the four counts that are tested together in each row.
count_cols <- c(
  "female_boxing", "female_cycling", "male_boxing", "male_cycling"
)
# Loop over row indices with sapply(); each call returns the statistic and
# p-value of one row's test, and t() puts the results one row per input row.
# seq_len() yields an empty index vector for a zero-row data frame, whereas
# seq() would yield c(1, 0).
chi <- t(sapply(seq_len(nrow(dat)), function(i) {
  chisq.test(dat[i, count_cols])[c("statistic", "p.value")]
}))
# Attach the test results to the original data.
cbind(dat, chi)
# ID1 ID2 female_boxing female_cycling male_boxing male_cycling statistic p.value
# 1 A zit 43 170 159 710 988.7209 5.033879e-214
# 2 B tag 37 134 165 744 1142.541 2.146278e-247
# 3 C hfs 32 96 170 784 1334.991 3.762222e-289
# 4 D prt 17 61 185 811 1518.015 0
# 5 E its 31 112 169 762 1245.218 1.133143e-269
# 6 F qrw 68 233 130 645 752.3941 9.129485e-163
Data:
# Example data: six rows identified by ID1/ID2, each with four integer count
# columns (female/male x boxing/cycling).
dat <- data.frame(
  ID1 = c("A", "B", "C", "D", "E", "F"),
  ID2 = c("zit", "tag", "hfs", "prt", "its", "qrw"),
  female_boxing = c(43L, 37L, 32L, 17L, 31L, 68L),
  female_cycling = c(170L, 134L, 96L, 61L, 112L, 233L),
  male_boxing = c(159L, 165L, 170L, 185L, 169L, 130L),
  male_cycling = c(710L, 744L, 784L, 811L, 762L, 645L),
  stringsAsFactors = FALSE
)
Chi-square tests for different groups in a R dataframe
base
# Example data: one count per trap (A-D) for each of three species.
df <- data.frame(
  species = factor(rep(paste0("species", 1:3), each = 4)),
  trap = rep(c("A", "B", "C", "D"), times = 3),
  count = c(6, 3, 7, 9, 5, 3, 6, 6, 5, 8, 1, 3)
)
df
#> species trap count
#> 1 species1 A 6
#> 2 species1 B 3
#> 3 species1 C 7
#> 4 species1 D 9
#> 5 species2 A 5
#> 6 species2 B 3
#> 7 species2 C 6
#> 8 species2 D 6
#> 9 species3 A 5
#> 10 species3 B 8
#> 11 species3 C 1
#> 12 species3 D 3
# Distinct species present in the data.
species <- unique(df$species)
# For each species, cross-tabulate `count` by `trap` using only the rows
# belonging to that species.
chi_species <- lapply(species, function(x) {
  xtabs(count ~ trap, data = df, subset = species == x)
})
# Label each table with its species so the results below are named.
names(chi_species) <- species
# Run one chi-squared test per species table.
lapply(chi_species, chisq.test)
#> $species1
#>
#> Chi-squared test for given probabilities
#>
#> data: X[[i]]
#> X-squared = 3, df = 3, p-value = 0.3916
#>
#>
#> $species2
#>
#> Chi-squared test for given probabilities
#>
#> data: X[[i]]
#> X-squared = 1.2, df = 3, p-value = 0.753
#>
#>
#> $species3
#>
#> Chi-squared test for given probabilities
#>
#> data: X[[i]]
#> X-squared = 6.2941, df = 3, p-value = 0.09815
Created on 2022-04-25 by the reprex package (v2.0.1)
tidyverse
library(dplyr)

# Total the counts per species/trap, then run one chi-squared test per
# species on those totals.
df %>%
  group_by(species, trap) %>%
  # Drop only the `trap` grouping so the next summarise() still runs once per
  # species; stating this explicitly also avoids the regrouping message.
  summarise(count = sum(count), .groups = "drop_last") %>%
  summarise(pvalue = chisq.test(count)$p.value)
# A tibble: 3 × 2
species pvalue
<fct> <dbl>
1 species1 0.392
2 species2 0.753
3 species3 0.0981
add results of chi square test to each row
To see what the error is trying to communicate, compare your data with the type of data chisq.test
is expecting:
# What was being passed to chisq.test(): the one-row selection is a list, so
# the resulting matrix is a list-matrix.
dput(matrix(main[1, 2:5, drop = TRUE], nrow = 2, ncol = 2, byrow = TRUE))
# structure(list(20, 10, 40, 80), .Dim = c(2L, 2L))
# For comparison: a matrix built from a plain integer vector.
dput(matrix(1:4, nrow = 2, ncol = 2, byrow = TRUE))
# structure(c(1L, 3L, 2L, 4L), .Dim = c(2L, 2L))
One remedy is to force your data into a numeric
vector:
# Coerce the one-row selection to a plain numeric vector before building the
# 2x2 matrix. The two unnamed 2s after `nrow=2` are matched positionally to
# matrix()'s `ncol` and `byrow` arguments, so the values are filled row by row.
res <- chisq.test(matrix(as.numeric(main[1,2:5]), nrow=2, 2,2))
# Print the test summary.
res
# Pearson's Chi-squared test with Yates' continuity correction
# data: matrix(as.numeric(main[1, 2:5]), nrow = 2, 2, 2)
# X-squared = 9.7656, df = 1, p-value = 0.001778
Now, if you want to add the results to each row, you first need to pick "which results". Namely, the results are actually prettied up a bit, with several tidbits internally:
# Drop the class attribute so str() lists the raw components of the result.
res_parts <- unclass(res)
str(res_parts)
# List of 9
# $ statistic: Named num 9.77
# ..- attr(*, "names")= chr "X-squared"
# $ parameter: Named int 1
# ..- attr(*, "names")= chr "df"
# $ p.value : num 0.00178
# $ method : chr "Pearson's Chi-squared test with Yates' continuity correction"
# $ data.name: chr "matrix(as.numeric(main[1, 2:5]), nrow = 2, 2, 2)"
# $ observed : num [1:2, 1:2] 20 10 40 80
# $ expected : num [1:2, 1:2] 12 18 48 72
# $ residuals: num [1:2, 1:2] 2.309 -1.886 -1.155 0.943
# $ stdres : num [1:2, 1:2] 3.33 -3.33 -3.33 3.33
If you wanted to include (e.g.) the test statistic as a number, you might do:
# Compute the chi-squared statistic of the 2x2 table held in columns 2:5 of
# every row of `main`. vapply() always returns a plain numeric vector (also
# for zero rows), and the matrix() arguments are named so the row-wise fill
# is explicit.
chisq.statistic <- vapply(seq_len(nrow(main)), function(row) {
  counts <- matrix(
    as.numeric(main[row, 2:5]),
    nrow = 2, ncol = 2, byrow = TRUE
  )
  # Drop the "X-squared" name so the new column is an unnamed numeric vector.
  unname(chisq.test(counts)$statistic)
}, numeric(1))
main$chisq.statistic <- chisq.statistic
main
# Genes Group1_Mut Group1_WT Group2_Mut Group2_WT chisq.statistic
# 1 GENE_A 20 40 10 80 9.76562500
# 2 GENE_B 10 50 30 60 4.29687500
# 3 GENE_C 5 55 10 80 0.07716049
Note that tools like dplyr
and data.table
may facilitate this. For example:
library(dplyr)
# Treat every row as its own group, build that row's 2x2 table from the four
# named count columns, and store the test statistic in a new column.
main %>%
  rowwise() %>%
  mutate(
    chisq.statistic = {
      counts <- matrix(
        c(Group1_Mut, Group1_WT, Group2_Mut, Group2_WT),
        nrow = 2
      )
      chisq.test(counts)$statistic
    }
  )
# Source: local data frame [3 x 6]
# Groups: <by row>
# # A tibble: 3 × 6
# Genes Group1_Mut Group1_WT Group2_Mut Group2_WT chisq.statistic
# <fctr> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 GENE_A 20 40 10 80 9.76562500
# 2 GENE_B 10 50 30 60 4.29687500
# 3 GENE_C 5 55 10 80 0.07716049
This example shows one thing you may wish to incorporate into whichever method you use: explicit naming of columns. That is, "2:5" could change depending on your input matrix.
With R, I would like to loop through each row and create corresponding chisquare results for each row
An option is apply
with MARGIN = 1
to loop over the rows. Within each row, it is a vector
, so we just need to wrap with matrix
to convert to a matrix
with specified dimensions,
apply the chisq.test
and get the output in a tibble format with tidy
library(broom)
library(dplyr)
# Reshape each row's values into a two-column matrix, test it, and tidy the
# result into a tibble.
row_results <- apply(df, 1, function(x) {
  tidy(chisq.test(matrix(x, ncol = 2)))
})
# Stack the per-row tibbles into a single tibble.
bind_rows(row_results)
Or this can be done in tidyverse
with pmap
library(purrr)
# pmap_dfr() passes each row's values as arguments; combine them into a
# two-column matrix, test it, and row-bind the tidied results.
pmap_dfr(df, function(...) {
  counts <- matrix(c(...), ncol = 2)
  tidy(chisq.test(counts))
})
-output
# A tibble: 4 x 4
# statistic p.value parameter method
# <dbl> <dbl> <int> <chr>
#1 3.17e- 1 0.574 1 Pearson's Chi-squared test with Yates' continuity correction
#2 1.66e- 2 0.898 1 Pearson's Chi-squared test with Yates' continuity correction
#3 6.70e-32 1.00 1 Pearson's Chi-squared test with Yates' continuity correction
#4 7.51e- 1 0.386 1 Pearson's Chi-squared test with Yates' continuity correction
R: How to perform a chi-square test on two groups & each row of a dataframe
First, this is a statistics question, not an R question really. You should try posting it on stats.stackexchange.com, where you are likely to get a much better answer.
Second, there are two types of chi-square test, one to assess whether a sample is from a given test distribution, and one to test for independence. I assume that you are interested in the first type.
If that is correct, then it looks like you are asking - using the first row of your df as an example - how likely is it, if allele A and allele B are present in equal amounts, that you could get a sample where allele A is present at 69% and allele B is present at 31%? If the likelihood (p) is very low, then we can assert with confidence 1-p that allele A and allele B were not present in equal amounts.
[NB: If this is not what you are asking, then I am misunderstanding your question - let me know in a comment and I'll delete the answer.]
In your case it is probably better to skip the vagaries of the chisq.test(...)
function in R and go directly to the definition of chi-square:
$$\chi^2 = \sum_i \frac{(O_i - E_i)^2}{E_i}$$
Where $O_i$ and $E_i$ are the $i$th observed and expected value, respectively. The way you have this set up, in each row there are only 2 observations, for allele A and allele B. So for row 1 we would write:
$$\chi^2 = \frac{(0.692 - 0.5)^2}{0.5} + \frac{(0.307 - 0.5)^2}{0.5} = 0.148$$
Since there are only two observations, there is only 1 degree of freedom. Chi-square tests with only 1 df are extremely unreliable, so I do not recommend this, but following through for the sake of the example, we can calculate the probability that chi-sq will be this large or larger as follows in R:
# Upper-tail probability of a chi-squared statistic of 0.148 with 1 degree of
# freedom. FALSE is spelled out because `F` is an ordinary variable that can
# be reassigned.
pchisq(0.148, df = 1, lower.tail = FALSE)
# [1] 0.700454
This means, assuming that allele A and allele B are present in equal amounts, there is still a 70% chance that you could obtain a sample at least as unbalanced as allele A present at 69% and allele B present at 31%. So we definitely cannot reject the null hypothesis (that allele A and B are present equally).
Running this test for all rows is straightforward:
df <- na.omit(df) # remove rows with missing values
# Short column names: observed and expected values for alleles A and B.
colnames(df) <- c("A.obs", "B.obs", "A.exp", "B.exp")
# Chi-squared statistic per row: sum of (observed - expected)^2 / expected
# over the two alleles.
df$chisq <- with(df, (A.obs - A.exp)^2 / A.exp + (B.obs - B.exp)^2 / B.exp)
# Upper-tail p-value with 1 degree of freedom. FALSE is spelled out because
# `F` is an ordinary variable that can be reassigned.
df$p.value <- pchisq(df$chisq, df = 1, lower.tail = FALSE)
df
# A.obs B.obs A.exp B.exp chisq p.value
# 1 0.6923077 0.3076923 0.5 0.5 0.147929 0.7005224
# 4 0.6250000 0.3750000 0.5 0.5 0.062500 0.8025873
# 5 0.6250000 0.3750000 0.5 0.5 0.062500 0.8025873
You can in fact use the chisq.test(...)
function to do this, although in your case I'm not sure it's an improvement:
# For each row, test the observed values (columns 1-2) against the expected
# probabilities (columns 3-4) and return the statistic with the p-value.
t(apply(df, 1, function(x) {
  fit <- chisq.test(x[1:2], p = x[3:4])
  c(fit$statistic, p.value = fit$p.value)
}))
# X-squared p.value
# 1 0.147929 0.7005224
# 4 0.062500 0.8025873
# 5 0.062500 0.8025873
Related Topics
R: "Make" Not Found When Installing a R-Package from Local Tar.Gz
Calculate Centroid Within/Inside a Spatialpolygon
Click on Cross Domain Iframe Element Using Rselenium
Programmatically Create Tab and Plot in Markdown
Map Array of Strings to an Array of Integers
Out of Order Text Labels on Stack Bar Plot (Ggplot)
R - Converting Posixct to Milliseconds
Creating a Prng Engine for <Random> in C++11 That Matches Prng Results in R
Removing Unicode Symbols from Column Names
Make Legend Invisible But Keep Figure Dimensions and Margins the Same
How to Find Correct Executable with Sys.Which on Windows
Find Closest Points (Lat/Lon) from One Data Set to a Second Data Set
How to Color Bar Plots When Using ..Prop.. in Ggplot
Rvest Not Recognizing CSS Selector
How to Create a Dropdown List in a Shiny Table Using Datatable When Editing the Table
Is There a Package or Technique Available for Calculating Large Factorials in R