How to Compute Correlations Between All Columns in R and Detect Highly Correlated Variables

How to compute correlations between all columns in R and detect highly correlated variables

Updated for newer tidyverse packages.

I would try reshaping the correlation matrix into long format, with one row per pair of variables.

# install.packages(c('tibble', 'dplyr', 'tidyr'))
library(tibble)
library(dplyr)
library(tidyr)

# Toy data: three columns of 10 random normal draws each.
d <- data.frame(
  x1 = rnorm(10),
  x2 = rnorm(10),
  x3 = rnorm(10)
)

# Correlation matrix reshaped to long format: one row per (var1, var2) pair.
# pivot_longer() replaces the superseded gather(); the trailing as.data.frame()
# keeps the plain data-frame result that gather() used to return.
d2 <- d %>%
  as.matrix() %>%
  cor() %>%
  as.data.frame() %>%
  rownames_to_column(var = "var1") %>%
  pivot_longer(-var1, names_to = "var2", values_to = "value") %>%
  as.data.frame()

  var1 var2       value
1   x1   x1  1.00000000
2   x1   x2 -0.05936703
3   x1   x3 -0.37479619
4   x2   x1 -0.05936703
5   x2   x2  1.00000000
6   x2   x3  0.43716004
7   x3   x1 -0.37479619
8   x3   x2  0.43716004
9   x3   x3  1.00000000

# .5 is an arbitrary cut-off. Compare on abs() so that strong negative
# correlations are reported too, and skip the var1 == var2 rows, whose
# correlation is always 1.
filter(d2, var1 != var2, abs(value) > .5)

# remove duplicates: (x1, x2) and (x2, x1) describe the same pair, so build an
# order-independent key from the two names and keep the first row for each key.
# pmin()/pmax() work element-wise on character vectors, so no purrr is needed
# and variable names containing spaces are handled correctly.
d2 %>%
  mutate(
    var_lo = pmin(var1, var2),
    var_hi = pmax(var1, var2)
  ) %>%
  distinct(var_lo, var_hi, .keep_all = TRUE) %>%
  select(-var_lo, -var_hi) %>%
  as_tibble()

  var1  var2   value
1 x1    x1     1     
2 x1    x2    -0.0594
3 x1    x3    -0.375 
4 x2    x2     1     
5 x2    x3     0.437 
6 x3    x3     1

How do I find, for each of several variables, the variable it is most correlated with (excluding itself)? And how can the code be simplified using loops or map functions?

You can use the cor function to calculate the correlation of each variable with every other variable. Then overwrite the diagonal so that a variable cannot be matched with itself, and use max.col to get, for each row, the column of the most highly correlated variable.

set.seed(123)
df <- data.frame(replicate(10, sample(10:100, 1000, replace = TRUE)))
cor_df <- cor(df)
# Mask the diagonal with -Inf (rather than 0) so that a variable can never be
# picked as its own best match, even when all of its correlations are negative.
diag(cor_df) <- -Inf
# ties.method = "first" makes the result deterministic; the default breaks
# ties at random. Computed once and reused for the data frame below.
best_match <- max.col(cor_df, ties.method = "first")
best_match
#[1] 5 6 4 3 6 2 5 4 2 1

#Or creating a dataframe with column names
result <- data.frame(var = names(df), cor_var = names(df)[best_match])
result
#   var cor_var
#1   X1      X5
#2   X2      X6
#3   X3      X4
#4   X4      X3
#5   X5      X6
#6   X6      X2
#7   X7      X5
#8   X8      X4
#9   X9      X2
#10 X10      X1

Extract pairs of variables with high correlation

which(..., arr.ind=TRUE) is the key.

Make up some data:

set.seed(101)
# 10 rows by 50 columns of random normal draws; the columns are named
# A1..Z1 followed by A2..X2.
X <- matrix(
  rnorm(500),
  nrow = 10,
  dimnames = list(NULL, head(paste0(LETTERS, rep(1:2, each = 26)), 50))
)
# 50 x 50 correlation matrix of the columns of X
cc <- cor(X)

range(cc[cc<1]) shows values from -0.82 to 0.87; I'll select values with abs(cc)>0.8; row(cc) < col(cc) will select only values from the upper triangle.

# (row, col) positions of the strongly correlated pairs, upper triangle only
w <- which(abs(cc) > 0.8 & upper.tri(cc), arr.ind = TRUE)
## translate the positions back into variable names, one pair per row
high_cor <- array(colnames(cc)[w], dim = c(nrow(w), 2L))

high_cor
     [,1] [,2]
[1,] "G1" "H1"
[2,] "F1" "N1"
[3,] "T1" "Z1"
[4,] "U1" "A2"
[5,] "Q1" "C2"
[6,] "M1" "O2"

R creating a comprehensive table of correlation between combinations of columns

Is this the kind of matrix you are expecting?

# Example data: a 6-row data frame of integer team statistics, written out as a
# structure() literal. TEAM_BATTING_HBP is entirely NA, and TEAM_BASERUN_SB,
# TEAM_BASERUN_CS and TEAM_FIELDING_DP are NA in the first row.
df <- structure(list(INDEX = 1:6, TARGET_WINS = c(39L, 70L, 86L, 70L, 
82L, 75L), TEAM_BATTING_H = c(1445L, 1339L, 1377L, 1387L, 1297L, 
1279L), TEAM_BATTING_2B = c(194L, 219L, 232L, 209L, 186L, 200L
), TEAM_BATTING_3B = c(39L, 22L, 35L, 38L, 27L, 36L), TEAM_BATTING_HR = c(13L, 
190L, 137L, 96L, 102L, 92L), TEAM_BATTING_BB = c(143L, 685L, 
602L, 451L, 472L, 443L), TEAM_BATTING_SO = c(842L, 1075L, 917L, 
922L, 920L, 973L), TEAM_BASERUN_SB = c(NA, 37L, 46L, 43L, 49L, 
107L), TEAM_BASERUN_CS = c(NA, 28L, 27L, 30L, 39L, 59L), TEAM_BATTING_HBP = c(NA_integer_, 
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_
), TEAM_PITCHING_H = c(9364L, 1347L, 1377L, 1396L, 1297L, 1279L
), TEAM_PITCHING_HR = c(84L, 191L, 137L, 97L, 102L, 92L), TEAM_PITCHING_BB = c(927L, 
689L, 602L, 454L, 472L, 443L), TEAM_PITCHING_SO = c(5456L, 1082L, 
917L, 928L, 920L, 973L), TEAM_FIELDING_E = c(1011L, 193L, 175L, 
164L, 138L, 123L), TEAM_FIELDING_DP = c(NA, 155L, 153L, 156L, 
168L, 149L)), row.names = c(NA, 6L), class = "data.frame")

# install.packages("corrr")
library(corrr)
# Pearson correlation for every pair of columns in df
df1 <- correlate(df, method = "pearson")

# 1. Output:
# A tibble: 17 x 18
   term    INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B TEAM_BATTING_HR TEAM_BATTING_BB
   <chr>   <dbl>       <dbl>          <dbl>           <dbl>           <dbl>           <dbl>           <dbl>
 1 INDEX NA          0.642           -0.820         -0.291           0.0236          0.0826           0.205
 2 TARG~  0.642     NA               -0.685          0.367          -0.373           0.673            0.788
 3 TEAM~ -0.820     -0.685           NA              0.192           0.496          -0.449           -0.502
 4 TEAM~ -0.291      0.367            0.192         NA              -0.0789          0.640            0.653
 5 TEAM~  0.0236    -0.373            0.496         -0.0789         NA              -0.752           -0.676
 6 TEAM~  0.0826     0.673           -0.449          0.640          -0.752          NA                0.984
 7 TEAM~  0.205      0.788           -0.502          0.653          -0.676           0.984           NA    
 8 TEAM~  0.134      0.401           -0.560          0.377          -0.754           0.864            0.799
 9 TEAM~  0.790     -0.00267         -0.690         -0.356           0.413          -0.528           -0.541
10 TEAM~  0.874     -0.0332          -0.834         -0.598           0.261          -0.578           -0.623
11 TEAM~ NA         NA               NA             NA              NA              NA               NA    
12 TEAM~ -0.662     -0.923            0.733         -0.358           0.448          -0.771           -0.852
13 TEAM~ -0.352      0.308           -0.127          0.661          -0.767           0.891            0.809
14 TEAM~ -0.914     -0.793            0.736          0.0225          0.0863         -0.341           -0.464
15 TEAM~ -0.667     -0.930            0.719         -0.360           0.424          -0.757           -0.842
16 TEAM~ -0.707     -0.925            0.757         -0.314           0.418          -0.733           -0.820
17 TEAM~  0.0666     0.265           -0.144         -0.583          -0.447          -0.123           -0.150

Find the pair of most correlated variables

Solution using corrr:

corrr is a package for exploring correlations in R. It focuses on
creating and working with data frames of correlations.

library(corrr)
# Long format with one row per pair: drop the NA self-correlations and the
# mirrored (y, x) duplicates, then sort so that the strongest correlation
# (of either sign) comes first.
matrix(rnorm(100), 5) %>%
    correlate() %>% 
    stretch(na.rm = TRUE, remove.dups = TRUE) %>% 
    arrange(desc(abs(r)))

Solution using reshape2 & data.table:

You can melt the cor result into long format (reshape2::melt does this for a matrix) and then order (sort) the rows by their correlation values.

library(data.table)
corMatrix <- cor(matrix(rnorm(100), 5))
# Build the long table directly instead of calling melt() on a matrix:
# data.table's melt() only has a method for data.tables, and its fallback to
# reshape2 for other inputs is deprecated. Var1/Var2 are the row and column
# indices of each correlation value.
corDT <- data.table(
  Var1 = c(row(corMatrix)),
  Var2 = c(col(corMatrix)),
  value = c(corMatrix)
)
corDT[order(value)]

Find which 5 variables are most correlated with response

# Correlation of mpg with every column of mtcars, as a one-row matrix
cors <- cor(mtcars[["mpg"]], mtcars)
# Same values as a named vector, from most negative to most positive
cors[, order(as.vector(cors))]

Returns:

        wt        cyl       disp         hp       carb       qsec       gear         am         vs       drat        mpg 
-0.8676594 -0.8521620 -0.8475514 -0.7761684 -0.5509251  0.4186840  0.4802848  0.5998324  0.6640389  0.6811719  1.0000000

We can use

cors[, order(-cors[1, ])]

For the decreasing ordering...

Calculate correlation between two columns based on column names

You can create a function like this:

#' Correlation between the two columns whose names match a pattern
#'
#' @param x A regular expression matched against the column names of `data`
#'   (e.g. "Obs1" to select Obs1_grp1 and Obs1_grp2).
#' @param data A data frame. Defaults to the object named `test` in the
#'   calling environment chain, which is what this function originally read.
#' @return A single number: the correlation between the two matching columns.
#'   Signals an error unless exactly two columns match, instead of silently
#'   using the first two of several matches.
cor_f <- function(x, data = test) {
  cols <- grep(x, names(data), value = TRUE)
  if (length(cols) != 2L) {
    stop(
      "expected exactly two columns matching '", x, "', found ", length(cols),
      call. = FALSE
    )
  }
  cor(data[[cols[1L]]], data[[cols[2L]]])
}

# correlation between Obs1_grp1 and Obs1_grp2
cor_f("Obs1")
#0.3159908

In case you need a loop, one way would be:

vars <- c("Obs1", "Obs2")
# vapply() guarantees exactly one numeric value per prefix; the result is a
# numeric vector named after the elements of vars.
vapply(vars, cor_f, numeric(1))

Identifying columns with high correlation in large dataset

To identify the most similar columns, try the following. It systematically compares the values in each column of dta1 with those in each column of dta2 and counts the rows in which they are equal. It returns a matrix of these counts.

# For every pair of columns, count the rows in which the two values are equal.
# vapply() keeps the result an integer matrix (one row per dta2 column, one
# column per dta1 column) regardless of the input, unlike sapply().
vapply(
  dta1,
  function(x) vapply(dta2, function(y) sum(x == y), integer(1)),
  integer(length(dta2))
)
       A B C
first  0 1 0
second 0 0 0
third  0 0 3

From here we can see that third and C have the most matches. Now you can join your two data.frames. To keep all rows and columns, you will want a full_join from the dplyr package.

library(dplyr)
# Keep every row of both tables, matching column C of dta1 to column third of dta2
dta1 %>%
  full_join(dta2, by = c(C = "third"))
   A  B   C first second
1  1 23 001    NA     NA
2  2 45 028     6     32
3  3  6 076     7     33
4  4  8 039     8     45
5 NA NA 008     5     58

How to Compute Correlations Between All Columns in R and Detect Highly Correlated Variables