A Matrix Version of Cor.Test()

A matrix version of cor.test()

corr.test in the psych package is designed to do this:

library("psych")
data(sat.act)
corr.test(sat.act)

As noted in the comments, to replicate the p-values from the base cor.test() function over the entire matrix, then you need to turn off adjustment of the p-values for multiple comparisons (the default is to use Holm's method of adjustment):

 corr.test(sat.act, adjust = "none")

[But be careful when interpreting those results!]

How to use cor.test in an apply function?

You were on the right track but with few mistakes. Check the following code, I believe it produces your desired output

b = apply(df2[, -1], 2, function(x) {
cor.test(df2[, 1], x, method = "pearson", use = "pairwise")
})

p.vals <- sapply(b, "[[", "p.value")
p.vals

country value
0 0

get output of cor.test as data frame

Try this

# Create dataframe where to save results
res <- data.frame(matrix(nrow = 0, ncol = 4))
colnames(res) <- c("var1", "var2", "correlation", "pvalue")

# Correlation in loop
for(i in colnames(matrix.dge.cpm.t[,3:5])) {
for(j in colnames(matrix.dge.cpm.t[,3:5])) {
a <- cor.test(matrix.dge.cpm.t[[i]], matrix.dge.cpm.t[[j]])
res <- rbind(res, data.frame(
"var1" = i,
"var2" = j,
"correlation" = a$estimate,
"pvalue" = a$p.value) )
}
}

# Remove rownames
rownames(res) <- NULL

Another option is to use the Hmisc package and code presented here.

# ++++++++++++++++++++++++++++
# flattenCorrMatrix
# ++++++++++++++++++++++++++++
# cormat : matrix of the correlation coefficients
# pmat : matrix of the correlation p-values
flattenCorrMatrix <- function(cormat, pmat) {
ut <- upper.tri(cormat)
data.frame(
row = rownames(cormat)[row(cormat)[ut]],
column = rownames(cormat)[col(cormat)[ut]],
cor =(cormat)[ut],
p = pmat[ut]
)
}

library(Hmisc)
res <- rcorr(as.matrix(matrix.dge.cpm.t[,1:5]))
flattenCorrMatrix(res$r, res$P)

EDIT 8th October 2020

To run the matrix against one variable

# Create dataframe where to save results
res <- data.frame(matrix(nrow = 0, ncol = 3))
colnames(res) <- c("var1", "correlation", "pvalue")

# Correlation in loop
for(i in colnames(matrix.dge.cpm.t[,3:5])) {
a <- cor.test(matrix.dge.cpm.t[[i]], samplesheet$cond1)
res <- rbind(res, data.frame(
"var1" = i,
"correlation" = a$estimate,
"pvalue" = a$p.value) )
}

Obtaining a matrix of p-values of a pearson correlation matrix

Here's a tidyverse solution that creates all pairs of interest and then performs a cor.test for each pair and extracts the correlation value and the corresponding p value.

# example data
FG_Smooth <- data.frame(FS_1 = c(0.43, 0.33, 3.47, 5.26, 1.09), FS2 = c(0.01, 0.02, 6.86, 3.27, 0.86), FS_3 = c(0.07, 0.36, 1.91, 5.61, 0.84), row.names = c("Group_3", "Thermo", "Embryophyta", "Flavo", "Cyclo"))
FMG_Smooth <- data.frame(GS_1 = c(1.13, 1.20, 0.52, 2.81, 0.70), GS_2 = c(1.18, 1.7, 0.42, 2.93, 0.78), GS_3 = c(1.17, 1.11, 0.60, 3.10, 0.87), row.names = c("Proline", "Trigonelline", "L-Lysine", "Nioctine", "Caffeate"))

library(tidyverse)

expand.grid(v1 = row.names(FG_Smooth), # create combinations of names
v2 = row.names(FMG_Smooth)) %>%
tbl_df() %>% # use for visualisation purpose
mutate(cor_test = map2(v1, v2, ~cor.test(unlist(FG_Smooth[.x,]), # perform the correlation test for each pair and store it
unlist(FMG_Smooth[.y,]))),
cor_value = map_dbl(cor_test, "estimate"), # get the correlation value from the test
cor_p_value = map_dbl(cor_test, "p.value")) # get the p value from the test

# # A tibble: 25 x 5
# v1 v2 cor_test cor_value cor_p_value
# <fct> <fct> <list> <dbl> <dbl>
# 1 Group_3 Proline <S3: htest> -0.998 0.0367
# 2 Thermo Proline <S3: htest> -0.592 0.596
# 3 Embryophyta Proline <S3: htest> 0.390 0.745
# 4 Flavo Proline <S3: htest> -0.544 0.634
# 5 Cyclo Proline <S3: htest> -0.966 0.167
# 6 Group_3 Trigonelline <S3: htest> -0.492 0.673
# 7 Thermo Trigonelline <S3: htest> -0.998 0.0396
# 8 Embryophyta Trigonelline <S3: htest> 0.985 0.109
# 9 Flavo Trigonelline <S3: htest> -1.000 0.00188
#10 Cyclo Trigonelline <S3: htest> -0.305 0.803
# # ... with 15 more rows

v1 and v2 are the row names of your datasets that will create the pairs for the correlation tests, cor_test column has the correlation test object for each pair, cor_value has the extracted correlation coefficient and cor_p_value has the extracted p value.

If you save the above output as a data frame you can easily reshape. For example if you save it as d you can get a 5x5 data frame of p values like this:

d %>%
select(v1, v2, cor_p_value) %>%
spread(v2, cor_p_value)

# # A tibble: 5 x 6
# v1 Caffeate `L-Lysine` Nioctine Proline Trigonelline
# <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 Cyclo 0.309 0.995 0.351 0.167 0.803
# 2 Embryophyta 0.779 0.0931 0.737 0.745 0.109
# 3 Flavo 0.890 0.204 0.848 0.634 0.00188
# 4 Group_3 0.439 0.875 0.481 0.0367 0.673
# 5 Thermo 0.928 0.242 0.886 0.596 0.0396

An alternative version using broom package as well would be:

library(tidyverse)
library(broom)

expand.grid(v1 = row.names(FG_Smooth),
v2 = row.names(FMG_Smooth)) %>%
tbl_df() %>%
mutate(cor_test = map2(v1, v2, ~tidy(cor.test(unlist(FG_Smooth[.x,]),
unlist(FMG_Smooth[.y,]))))) %>%
unnest()

# # A tibble: 25 x 8
# v1 v2 estimate statistic p.value parameter method alternative
# <fct> <fct> <dbl> <dbl> <dbl> <int> <chr> <chr>
# 1 Group_3 Proline -0.998 -17.3 0.0367 1 Pearson's product-moment correlation two.sided
# 2 Thermo Proline -0.592 -0.735 0.596 1 Pearson's product-moment correlation two.sided
# 3 Embryophyta Proline 0.390 0.423 0.745 1 Pearson's product-moment correlation two.sided
# 4 Flavo Proline -0.544 -0.648 0.634 1 Pearson's product-moment correlation two.sided
# 5 Cyclo Proline -0.966 -3.73 0.167 1 Pearson's product-moment correlation two.sided
# 6 Group_3 Trigonelline -0.492 -0.565 0.673 1 Pearson's product-moment correlation two.sided
# 7 Thermo Trigonelline -0.998 -16.0 0.0396 1 Pearson's product-moment correlation two.sided
# 8 Embryophyta Trigonelline 0.985 5.78 0.109 1 Pearson's product-moment correlation two.sided
# 9 Flavo Trigonelline -1.000 -339. 0.00188 1 Pearson's product-moment correlation two.sided
#10 Cyclo Trigonelline -0.305 -0.320 0.803 1 Pearson's product-moment correlation two.sided
# # ... with 15 more rows

which gives you a tidy format of the correlation test object. You need to use columns estimate (correlation coefficient) and p.value.

Calculating a correlation matrix with pspearman package with apply() function

Consider creating pair-wise combinations of genes from row.names with combn and then iterating through the list of pairs through a defined function. Be sure to return an NA structure from if logic to avoid NULL in matrix output.

However, be forewarned that pair-wise permutations of 5,000 genes (choose(5000, 2)) results very high at 12,497,500 elements! Hence, sapply (a loop itself) may not be that different in performance than for. Look into parallelizing the iteration.

gene_combns <- combn(row.names(data), 2, simplify = FALSE)

spear_func <- function(x) {
# EXTRACT ROWS BY ROW NAMES
row1 <- as.numeric(data[x[1],])
row2 <- as.numeric(data[x[2],])

# Check the number of complete spots.There are no NAs in this set.
complete = sum(!(is.na(x)) & !(is.na(y)))

if (complete >=2 ) {
pspearman::spearman.test(row1, row2)
} else {
c(statistic=NA, parameter=NA, p.value=NA, estimate=NA,
null.value=NA, alternative=NA, method=NA, data.name=NA)
}
}

pair.all2 <- sapply(gene_combns, spear_func)

Testing

Above has been tested with cor.test (exactly same input args and output list as spearman.test but more accurate p-value) using a small sample of dataset (50 obs, 20 vars):

set.seed(82418)
data <- data.frame(matrix(rbinom(10*100000, 50, .5), ncol=200))[1:50, 1:20]
rownames(data) <- paste0("gene", 1:50)
colnames(data) <- paste0("spot", 1:20)

gene_combns <- combn(row.names(data), 2, simplify = FALSE)
# [[1]]
# [1] "gene1" "gene2"
# [[2]]
# [1] "gene1" "gene3"
# [[3]]
# [1] "gene1" "gene4"
# [[4]]
# [1] "gene1" "gene5"
# [[5]]
# [1] "gene1" "gene6"
# [[6]]
# [1] "gene1" "gene7"

test <- sapply(gene_combns, spear_func) # SAME FUNC BUT WITH cor.test
test[,1:5]

# [,1] [,2]
# statistic 885.1386 1659.598
# parameter NULL NULL
# p.value 0.1494607 0.2921304
# estimate 0.3344823 -0.2478179
# null.value 0 0
# alternative "two.sided" "two.sided"
# method "Spearman's rank correlation rho" "Spearman's rank correlation rho"
# data.name "row1 and row2" "row1 and row2"
# [,3] [,4]
# statistic 1554.533 1212.988
# parameter NULL NULL
# p.value 0.4767667 0.7122505
# estimate -0.1688217 0.08797877
# null.value 0 0
# alternative "two.sided" "two.sided"
# method "Spearman's rank correlation rho" "Spearman's rank correlation rho"
# data.name "row1 and row2" "row1 and row2"
# [,5]
# statistic 1421.707
# parameter NULL
# p.value 0.7726922
# estimate -0.06895299
# null.value 0
# alternative "two.sided"
# method "Spearman's rank correlation rho"
# data.name "row1 and row2"


Related Topics



Leave a reply



Submit