How to Compute Correlations Between All Columns in R and Detect Highly Correlated Variables

How to compute correlations between all columns in R and detect highly correlated variables

Updated for newer tidyverse packages.

I would try gathering a correlation matrix.

# install.packages(c('tibble', 'dplyr', 'tidyr'))
library(tibble)
library(dplyr)
library(tidyr)

# Toy data: three columns of 10 standard-normal draws each.
d <- data.frame(
  x1 = rnorm(10),
  x2 = rnorm(10),
  x3 = rnorm(10)
)

# Build the long-format correlation table one step at a time:
# correlation matrix -> data frame with the row names moved into `var1`
# -> one row per (var1, var2) pair with the correlation in `value`.
d2 <- cor(as.matrix(d))
d2 <- rownames_to_column(as.data.frame(d2), var = "var1")
d2 <- gather(d2, var2, value, -var1)

var1 var2 value
1 x1 x1 1.00000000
2 x1 x2 -0.05936703
3 x1 x3 -0.37479619
4 x2 x1 -0.05936703
5 x2 x2 1.00000000
6 x2 x3 0.43716004
7 x3 x1 -0.37479619
8 x3 x2 0.43716004
9 x3 x3 1.00000000

# Keep only pairs of *different* variables whose correlation is strong in
# either direction. A variable's correlation with itself is always 1, and a
# strongly negative correlation is just as "high" as a positive one, so the
# test is on abs(value) and self-pairs are excluded.
# .5 is an arbitrary number
filter(d2, var1 != var2, abs(value) > .5)

# remove duplicates
# d2 lists every unordered pair twice (x1-x2 and x2-x1). Build a key that is
# identical for both orientations and keep the first row seen for each key.
# pmin()/pmax() are base R, so this does not need map_chr(), which belongs to
# purrr and is not attached by the library() calls above; it also avoids
# splitting on spaces, which would break for names that contain a space.
d2 %>%
  mutate(var_order = paste(pmin(var1, var2), pmax(var1, var2))) %>%
  group_by(var_order) %>%
  filter(row_number() == 1) %>%
  ungroup() %>%
  select(-var_order)

var1 var2 value
1 x1 x1 1
2 x1 x2 -0.0594
3 x1 x3 -0.375
4 x2 x2 1
5 x2 x3 0.437
6 x3 x3 1

How can I find, for each of several variables, the variable it is most correlated with (excluding itself)? And how can the code be simplified by using loops or map functions?

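You can use the cor function to calculate the correlation of each variable with every other variable. Replace the diagonal with a value that can never be the row maximum (0 works as long as every variable has at least one positive correlation; -Inf is always safe) and use max.col to get the most highly correlated variable for each row.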

# Reproducible toy data: 10 columns of 1000 integers sampled from 10:100.
set.seed(123)
df <- data.frame(replicate(10, sample(10:100, 1000, replace = TRUE)))
cor_df <- cor(df)
# Exclude self-correlation. -Inf (rather than 0) guarantees the diagonal can
# never be the row maximum, even for a variable whose correlations with all
# other variables are negative.
diag(cor_df) <- -Inf
# Compute the winners once and reuse them; ties.method = "first" makes the
# result deterministic (the default breaks ties at random).
best_match <- max.col(cor_df, ties.method = "first")
best_match
#[1] 5 6 4 3 6 2 5 4 2 1

#Or creating a dataframe with column names
result <- data.frame(var = names(df), cor_var = names(df)[best_match])
result
# var cor_var
#1 X1 X5
#2 X2 X6
#3 X3 X4
#4 X4 X3
#5 X5 X6
#6 X6 X2
#7 X7 X5
#8 X8 X4
#9 X9 X2
#10 X10 X1

Extract pairs of variables with high correlation

which(..., arr.ind=TRUE) is the key.

Make up some data:

set.seed(101)
# 10 x 50 matrix of standard-normal draws; columns are labelled
# A1..Z1 followed by A2..X2.
X <- matrix(
  rnorm(500),
  nrow = 10,
  dimnames = list(NULL, paste0(LETTERS, rep(1:2, each = 26))[seq_len(50)])
)
# 50 x 50 correlation matrix between the columns of X.
cc <- cor(X)

range(cc[cc<1]) shows values from -0.82 to 0.87; I'll select values with abs(cc)>0.8; row(cc) < col(cc) will select only values from the upper triangle.

# (row, col) positions of the strong correlations, upper triangle only so each
# pair is reported once and the diagonal is skipped.
w <- which(upper.tri(cc) & abs(cc) > 0.8, arr.ind = TRUE)
## reconstruct names from positions
high_cor <- cbind(colnames(cc)[w[, "row"]], colnames(cc)[w[, "col"]])
high_cor
[,1] [,2]
[1,] "G1" "H1"
[2,] "F1" "N1"
[3,] "T1" "Z1"
[4,] "U1" "A2"
[5,] "Q1" "C2"
[6,] "M1" "O2"

R creating a comprehensive table of correlation between combinations of columns

Is this the kind of matrix you are expecting?

# Six-row example data frame written out as a structure() literal.
# TEAM_BATTING_HBP is NA in every row; TEAM_BASERUN_SB, TEAM_BASERUN_CS and
# TEAM_FIELDING_DP are NA in the first row only.
df <- structure(list(INDEX = 1:6, TARGET_WINS = c(39L, 70L, 86L, 70L, 
82L, 75L), TEAM_BATTING_H = c(1445L, 1339L, 1377L, 1387L, 1297L,
1279L), TEAM_BATTING_2B = c(194L, 219L, 232L, 209L, 186L, 200L
), TEAM_BATTING_3B = c(39L, 22L, 35L, 38L, 27L, 36L), TEAM_BATTING_HR = c(13L,
190L, 137L, 96L, 102L, 92L), TEAM_BATTING_BB = c(143L, 685L,
602L, 451L, 472L, 443L), TEAM_BATTING_SO = c(842L, 1075L, 917L,
922L, 920L, 973L), TEAM_BASERUN_SB = c(NA, 37L, 46L, 43L, 49L,
107L), TEAM_BASERUN_CS = c(NA, 28L, 27L, 30L, 39L, 59L), TEAM_BATTING_HBP = c(NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_
), TEAM_PITCHING_H = c(9364L, 1347L, 1377L, 1396L, 1297L, 1279L
), TEAM_PITCHING_HR = c(84L, 191L, 137L, 97L, 102L, 92L), TEAM_PITCHING_BB = c(927L,
689L, 602L, 454L, 472L, 443L), TEAM_PITCHING_SO = c(5456L, 1082L,
917L, 928L, 920L, 973L), TEAM_FIELDING_E = c(1011L, 193L, 175L,
164L, 138L, 123L), TEAM_FIELDING_DP = c(NA, 155L, 153L, 156L,
168L, 149L)), row.names = c(NA, 6L), class = "data.frame")

# install.packages("corrr")
library(corrr)
# Pearson correlations between all columns of df, stored in df1. In the
# printed output below there is one row per column (labelled in `term`), the
# diagonal is NA, and the all-NA column TEAM_BATTING_HBP yields an all-NA row.
df1 <- corrr::correlate(df, method = "pearson")

# 1. Output:
# A tibble: 17 x 18
term INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B TEAM_BATTING_HR TEAM_BATTING_BB
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 INDEX NA 0.642 -0.820 -0.291 0.0236 0.0826 0.205
2 TARG~ 0.642 NA -0.685 0.367 -0.373 0.673 0.788
3 TEAM~ -0.820 -0.685 NA 0.192 0.496 -0.449 -0.502
4 TEAM~ -0.291 0.367 0.192 NA -0.0789 0.640 0.653
5 TEAM~ 0.0236 -0.373 0.496 -0.0789 NA -0.752 -0.676
6 TEAM~ 0.0826 0.673 -0.449 0.640 -0.752 NA 0.984
7 TEAM~ 0.205 0.788 -0.502 0.653 -0.676 0.984 NA
8 TEAM~ 0.134 0.401 -0.560 0.377 -0.754 0.864 0.799
9 TEAM~ 0.790 -0.00267 -0.690 -0.356 0.413 -0.528 -0.541
10 TEAM~ 0.874 -0.0332 -0.834 -0.598 0.261 -0.578 -0.623
11 TEAM~ NA NA NA NA NA NA NA
12 TEAM~ -0.662 -0.923 0.733 -0.358 0.448 -0.771 -0.852
13 TEAM~ -0.352 0.308 -0.127 0.661 -0.767 0.891 0.809
14 TEAM~ -0.914 -0.793 0.736 0.0225 0.0863 -0.341 -0.464
15 TEAM~ -0.667 -0.930 0.719 -0.360 0.424 -0.757 -0.842
16 TEAM~ -0.707 -0.925 0.757 -0.314 0.418 -0.733 -0.820
17 TEAM~ 0.0666 0.265 -0.144 -0.583 -0.447 -0.123 -0.150

Find the pair of most correlated variables

Solution using corrr:

corrr is a package for exploring correlations in R. It focuses on
creating and working with data frames of correlations

library(corrr)
# Long table of pairwise correlations with the most correlated pair first.
# stretch() would otherwise return each pair in both orientations plus the NA
# self-pairs; remove.dups / na.rm keep one row per distinct pair. Sorting on
# abs(r) in descending order ranks by strength regardless of sign, so the
# first row is the most correlated pair.
matrix(rnorm(100), 5) %>%
  correlate() %>%
  stretch(na.rm = TRUE, remove.dups = TRUE) %>%
  arrange(desc(abs(r)))

Solution using reshape2 & data.table:

You can use reshape2::melt (imported with data.table) on the cor result and then order (sort) the rows according to the correlation values.

library(data.table)
corMatrix <- cor(matrix(rnorm(100), 5))
# Call reshape2::melt() explicitly: data.table's own melt() only has a method
# for data.tables and merely redirects (with a warning) when given a matrix.
# A matrix without dimnames melts to integer index columns Var1/Var2, so
# Var1 < Var2 keeps each pair once and drops the self-correlations (always 1).
# Ordering by -abs(value) puts the most correlated pair in the first row.
setDT(reshape2::melt(corMatrix))[Var1 < Var2][order(-abs(value))]

Find which 5 variables are most correlated with response

# Correlation of mpg with every column of mtcars, as a 1 x 11 matrix.
cors <- cor(x = mtcars$mpg, y = mtcars)
# Those correlations as a named vector, in increasing order.
sort(cors[1, ], na.last = TRUE)

Returns:

        wt        cyl       disp         hp       carb       qsec       gear         am         vs       drat        mpg 
-0.8676594 -0.8521620 -0.8475514 -0.7761684 -0.5509251 0.4186840 0.4802848 0.5998324 0.6640389 0.6811719 1.0000000

We can use

# Same named vector of correlations, largest first.
sort(cors[1, ], decreasing = TRUE, na.last = TRUE)

For the decreasing ordering...

Calculate correlation between two columns based on column names

You can create a function like this:

# Correlation between the two columns of a data frame whose names match `x`.
#
# x    : pattern (a regular expression, as interpreted by grepl()) that
#        selects the columns by name.
# data : data frame to search. Defaults to the global `test`, which is what
#        the original one-argument version always read.
#
# Returns a single numeric correlation. Stops with an error unless exactly two
# column names match: a single match used to fail with an obscure cor() error,
# and three or more silently returned the correlation of the first two only.
cor_f <- function(x, data = test) {
  matched <- names(data)[grepl(x, names(data))]
  if (length(matched) != 2L) {
    stop(
      "expected exactly 2 columns matching '", x, "', found ", length(matched),
      call. = FALSE
    )
  }
  cor(data[[matched[1]]], data[[matched[2]]])
}

# Example call. It reads a data frame named `test`, which is not defined in
# this snippet and must already exist.
cor_f('Obs1') #correlation between Obs1_grp1 and Obs1_grp2
#0.3159908

In case you need a loop, one way would be:

vars <- c("Obs1", "Obs2")
# One correlation per pattern, named by the pattern. vapply() pins the result
# to a numeric vector (sapply() would return a list for empty input) and fails
# loudly if cor_f() ever returns anything other than a single number.
vapply(vars, cor_f, numeric(1))

Identifying columns with high correlation in large dataset

To identify the most similar columns, try the following. It systematically compares the values from each column in dta1 with the columns in dta2. It returns a matrix.

# For every pair of columns, count the rows in which dta1 and dta2 hold the
# same value. Rows of the result are dta2 columns, columns are dta1 columns.
# na.rm = TRUE treats a missing value as "no match" instead of turning the
# whole count into NA; vapply() keeps the result an integer matrix.
vapply(
  dta1,
  function(x) vapply(dta2, function(y) sum(x == y, na.rm = TRUE), integer(1)),
  integer(length(dta2))
)
A B C
first 0 1 0
second 0 0 0
third 0 0 3

From here we can see that third and C have the most matches. Now you can join your two data.frames. To keep all rows and columns, you will want a full_join from the dplyr package.

library(dplyr)
# Keep every row from both tables, matching dta1$C against dta2$third.
dta1 %>% full_join(dta2, by = c(C = "third"))
A B C first second
1 1 23 001 NA NA
2 2 45 028 6 32
3 3 6 076 7 33
4 4 8 039 8 45
5 NA NA 008 5 58


Related Topics



Leave a reply



Submit