A matrix version of cor.test()
corr.test
in the psych
package is designed to do this:
library("psych")
data(sat.act)
corr.test(sat.act)
As noted in the comments, to replicate the p-values from the base cor.test()
function over the entire matrix, then you need to turn off adjustment of the p-values for multiple comparisons (the default is to use Holm's method of adjustment):
corr.test(sat.act, adjust = "none")
[But be careful when interpreting those results!]
How to use cor.test in an apply function?
You were on the right track but with few mistakes. Check the following code, I believe it produces your desired output
b = apply(df2[, -1], 2, function(x) {
cor.test(df2[, 1], x, method = "pearson", use = "pairwise")
})
p.vals <- sapply(b, "[[", "p.value")
p.vals
country value
0 0
get output of cor.test as data frame
Try this
# Create dataframe where to save results
res <- data.frame(matrix(nrow = 0, ncol = 4))
colnames(res) <- c("var1", "var2", "correlation", "pvalue")
# Correlation in loop
for(i in colnames(matrix.dge.cpm.t[,3:5])) {
for(j in colnames(matrix.dge.cpm.t[,3:5])) {
a <- cor.test(matrix.dge.cpm.t[[i]], matrix.dge.cpm.t[[j]])
res <- rbind(res, data.frame(
"var1" = i,
"var2" = j,
"correlation" = a$estimate,
"pvalue" = a$p.value) )
}
}
# Remove rownames
rownames(res) <- NULL
Another option is to use the Hmisc package and code presented here.
# ++++++++++++++++++++++++++++
# flattenCorrMatrix
# ++++++++++++++++++++++++++++
# cormat : matrix of the correlation coefficients
# pmat : matrix of the correlation p-values
flattenCorrMatrix <- function(cormat, pmat) {
ut <- upper.tri(cormat)
data.frame(
row = rownames(cormat)[row(cormat)[ut]],
column = rownames(cormat)[col(cormat)[ut]],
cor =(cormat)[ut],
p = pmat[ut]
)
}
library(Hmisc)
res <- rcorr(as.matrix(matrix.dge.cpm.t[,1:5]))
flattenCorrMatrix(res$r, res$P)
EDIT 8th October 2020
To run the matrix against one variable
# Create dataframe where to save results
res <- data.frame(matrix(nrow = 0, ncol = 3))
colnames(res) <- c("var1", "correlation", "pvalue")
# Correlation in loop
for(i in colnames(matrix.dge.cpm.t[,3:5])) {
a <- cor.test(matrix.dge.cpm.t[[i]], samplesheet$cond1)
res <- rbind(res, data.frame(
"var1" = i,
"correlation" = a$estimate,
"pvalue" = a$p.value) )
}
Obtaining a matrix of p-values of a pearson correlation matrix
Here's a tidyverse
solution that creates all pairs of interest and then performs a cor.test
for each pair and extracts the correlation value and the corresponding p value.
# example data
FG_Smooth <- data.frame(FS_1 = c(0.43, 0.33, 3.47, 5.26, 1.09), FS2 = c(0.01, 0.02, 6.86, 3.27, 0.86), FS_3 = c(0.07, 0.36, 1.91, 5.61, 0.84), row.names = c("Group_3", "Thermo", "Embryophyta", "Flavo", "Cyclo"))
FMG_Smooth <- data.frame(GS_1 = c(1.13, 1.20, 0.52, 2.81, 0.70), GS_2 = c(1.18, 1.7, 0.42, 2.93, 0.78), GS_3 = c(1.17, 1.11, 0.60, 3.10, 0.87), row.names = c("Proline", "Trigonelline", "L-Lysine", "Nioctine", "Caffeate"))
library(tidyverse)
expand.grid(v1 = row.names(FG_Smooth), # create combinations of names
v2 = row.names(FMG_Smooth)) %>%
tbl_df() %>% # use for visualisation purpose
mutate(cor_test = map2(v1, v2, ~cor.test(unlist(FG_Smooth[.x,]), # perform the correlation test for each pair and store it
unlist(FMG_Smooth[.y,]))),
cor_value = map_dbl(cor_test, "estimate"), # get the correlation value from the test
cor_p_value = map_dbl(cor_test, "p.value")) # get the p value from the test
# # A tibble: 25 x 5
# v1 v2 cor_test cor_value cor_p_value
# <fct> <fct> <list> <dbl> <dbl>
# 1 Group_3 Proline <S3: htest> -0.998 0.0367
# 2 Thermo Proline <S3: htest> -0.592 0.596
# 3 Embryophyta Proline <S3: htest> 0.390 0.745
# 4 Flavo Proline <S3: htest> -0.544 0.634
# 5 Cyclo Proline <S3: htest> -0.966 0.167
# 6 Group_3 Trigonelline <S3: htest> -0.492 0.673
# 7 Thermo Trigonelline <S3: htest> -0.998 0.0396
# 8 Embryophyta Trigonelline <S3: htest> 0.985 0.109
# 9 Flavo Trigonelline <S3: htest> -1.000 0.00188
#10 Cyclo Trigonelline <S3: htest> -0.305 0.803
# # ... with 15 more rows
v1
and v2
are the row names of your datasets that will create the pairs for the correlation tests, cor_test
column has the correlation test object for each pair, cor_value
has the extracted correlation coefficient and cor_p_value
has the extracted p value.
If you save the above output as a data frame you can easily reshape. For example if you save it as d
you can get a 5x5 data frame of p values like this:
d %>%
select(v1, v2, cor_p_value) %>%
spread(v2, cor_p_value)
# # A tibble: 5 x 6
# v1 Caffeate `L-Lysine` Nioctine Proline Trigonelline
# <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 Cyclo 0.309 0.995 0.351 0.167 0.803
# 2 Embryophyta 0.779 0.0931 0.737 0.745 0.109
# 3 Flavo 0.890 0.204 0.848 0.634 0.00188
# 4 Group_3 0.439 0.875 0.481 0.0367 0.673
# 5 Thermo 0.928 0.242 0.886 0.596 0.0396
An alternative version using broom
package as well would be:
library(tidyverse)
library(broom)
expand.grid(v1 = row.names(FG_Smooth),
v2 = row.names(FMG_Smooth)) %>%
tbl_df() %>%
mutate(cor_test = map2(v1, v2, ~tidy(cor.test(unlist(FG_Smooth[.x,]),
unlist(FMG_Smooth[.y,]))))) %>%
unnest()
# # A tibble: 25 x 8
# v1 v2 estimate statistic p.value parameter method alternative
# <fct> <fct> <dbl> <dbl> <dbl> <int> <chr> <chr>
# 1 Group_3 Proline -0.998 -17.3 0.0367 1 Pearson's product-moment correlation two.sided
# 2 Thermo Proline -0.592 -0.735 0.596 1 Pearson's product-moment correlation two.sided
# 3 Embryophyta Proline 0.390 0.423 0.745 1 Pearson's product-moment correlation two.sided
# 4 Flavo Proline -0.544 -0.648 0.634 1 Pearson's product-moment correlation two.sided
# 5 Cyclo Proline -0.966 -3.73 0.167 1 Pearson's product-moment correlation two.sided
# 6 Group_3 Trigonelline -0.492 -0.565 0.673 1 Pearson's product-moment correlation two.sided
# 7 Thermo Trigonelline -0.998 -16.0 0.0396 1 Pearson's product-moment correlation two.sided
# 8 Embryophyta Trigonelline 0.985 5.78 0.109 1 Pearson's product-moment correlation two.sided
# 9 Flavo Trigonelline -1.000 -339. 0.00188 1 Pearson's product-moment correlation two.sided
#10 Cyclo Trigonelline -0.305 -0.320 0.803 1 Pearson's product-moment correlation two.sided
# # ... with 15 more rows
which gives you a tidy
format of the correlation test object. You need to use columns estimate
(correlation coefficient) and p.value
.
Calculating a correlation matrix with pspearman package with apply() function
Consider creating pair-wise combinations of genes from row.names
with combn
and then iterating through the list of pairs through a defined function. Be sure to return an NA
structure from if
logic to avoid NULL
in matrix output.
However, be forewarned that pair-wise permutations of 5,000 genes (choose(5000, 2)
) results very high at 12,497,500 elements! Hence, sapply
(a loop itself) may not be that different in performance than for
. Look into parallelizing the iteration.
gene_combns <- combn(row.names(data), 2, simplify = FALSE)
spear_func <- function(x) {
# EXTRACT ROWS BY ROW NAMES
row1 <- as.numeric(data[x[1],])
row2 <- as.numeric(data[x[2],])
# Check the number of complete spots.There are no NAs in this set.
complete = sum(!(is.na(x)) & !(is.na(y)))
if (complete >=2 ) {
pspearman::spearman.test(row1, row2)
} else {
c(statistic=NA, parameter=NA, p.value=NA, estimate=NA,
null.value=NA, alternative=NA, method=NA, data.name=NA)
}
}
pair.all2 <- sapply(gene_combns, spear_func)
Testing
Above has been tested with cor.test
(exactly same input args and output list as spearman.test but more accurate p-value
) using a small sample of dataset (50 obs, 20 vars):
set.seed(82418)
data <- data.frame(matrix(rbinom(10*100000, 50, .5), ncol=200))[1:50, 1:20]
rownames(data) <- paste0("gene", 1:50)
colnames(data) <- paste0("spot", 1:20)
gene_combns <- combn(row.names(data), 2, simplify = FALSE)
# [[1]]
# [1] "gene1" "gene2"
# [[2]]
# [1] "gene1" "gene3"
# [[3]]
# [1] "gene1" "gene4"
# [[4]]
# [1] "gene1" "gene5"
# [[5]]
# [1] "gene1" "gene6"
# [[6]]
# [1] "gene1" "gene7"
test <- sapply(gene_combns, spear_func) # SAME FUNC BUT WITH cor.test
test[,1:5]
# [,1] [,2]
# statistic 885.1386 1659.598
# parameter NULL NULL
# p.value 0.1494607 0.2921304
# estimate 0.3344823 -0.2478179
# null.value 0 0
# alternative "two.sided" "two.sided"
# method "Spearman's rank correlation rho" "Spearman's rank correlation rho"
# data.name "row1 and row2" "row1 and row2"
# [,3] [,4]
# statistic 1554.533 1212.988
# parameter NULL NULL
# p.value 0.4767667 0.7122505
# estimate -0.1688217 0.08797877
# null.value 0 0
# alternative "two.sided" "two.sided"
# method "Spearman's rank correlation rho" "Spearman's rank correlation rho"
# data.name "row1 and row2" "row1 and row2"
# [,5]
# statistic 1421.707
# parameter NULL
# p.value 0.7726922
# estimate -0.06895299
# null.value 0
# alternative "two.sided"
# method "Spearman's rank correlation rho"
# data.name "row1 and row2"
Related Topics
How to Add a Ggplot2 Subtitle with Different Size and Colour
Is There a Better Alternative Than String Manipulation to Programmatically Build Formulas
R Loop for Variable Names to Run Linear Regression Model
How to Calculate Combination and Permutation in R
Merge 2 Dataframes If Value Within Range
How to Create Grouped Barplot with R
R - Add Column That Counts Sequentially Within Groups But Repeats for Duplicates
Add Empty Columns to a Dataframe with Specified Names from a Vector
Convert a Numeric Month to a Month Abbreviation
Update Subset of Data.Table Based on Join
Comma Separator for Numbers in R