How to compute correlations between all columns in R and detect highly correlated variables
Updated for newer tidyverse packages.
I would try gathering a correlation matrix.
# install.packages(c('tibble', 'dplyr', 'tidyr'))
library(tibble)
library(dplyr)
library(tidyr)
# Toy data: three columns of 10 standard-normal draws each.
d <- data.frame(
  x1 = rnorm(10),
  x2 = rnorm(10),
  x3 = rnorm(10)
)

# Long-format correlation table: one row per (var1, var2) pair, with the
# correlation of the two columns in `value`.
d2 <- d %>%
  as.matrix() %>%
  cor() %>%
  as.data.frame() %>%
  # keep the row names of the correlation matrix as an ordinary column
  rownames_to_column(var = "var1") %>%
  # pivot_longer() replaces the superseded gather()
  pivot_longer(cols = -var1, names_to = "var2", values_to = "value") %>%
  # gather() returned a plain data.frame here; keep that class
  as.data.frame()
var1 var2 value
1 x1 x1 1.00000000
2 x1 x2 -0.05936703
3 x1 x3 -0.37479619
4 x2 x1 -0.05936703
5 x2 x2 1.00000000
6 x2 x3 0.43716004
7 x3 x1 -0.37479619
8 x3 x2 0.43716004
9 x3 x3 1.00000000
# .5 is an arbitrary cutoff. Compare absolute values so that strong negative
# correlations are detected too, and skip the rows where a variable is paired
# with itself (their correlation is always 1).
filter(d2, abs(value) > .5, var1 != var2)
# remove duplicates: (a, b) and (b, a) describe the same pair, so keep only the
# first row seen for each unordered pair
d2 %>%
  # the original pipeline went through group_by()/ungroup() and so returned a
  # tibble; convert explicitly to keep that result class
  as_tibble() %>%
  # pmin()/pmax() put the two names into a canonical order with base R only,
  # so purrr::map_chr() (purrr is not attached above) is not needed
  mutate(var_order = paste(pmin(var1, var2), pmax(var1, var2))) %>%
  filter(!duplicated(var_order)) %>%
  select(-var_order)
var1 var2 value
1 x1 x1 1
2 x1 x2 -0.0594
3 x1 x3 -0.375
4 x2 x2 1
5 x2 x3 0.437
6 x3 x3 1
How to compute the most correlated variable between multiple variables (except itself)? And how to simplify the code by using loops or map functions?
You can use the `cor` function to calculate the correlation of each variable with every other variable. Overwrite the diagonal (each variable's correlation with itself) so that a variable is not matched with itself, and use `max.col` to get the most highly correlated variable.
set.seed(123)
# 10 columns (X1..X10), each holding 1000 values drawn with replacement
# from 10:100
df <- data.frame(replicate(10, sample(10:100, 1000, replace = TRUE)))
cor_df <- cor(df)
# Mask the diagonal with -Inf rather than 0: with 0, a variable whose
# correlations with all other variables are negative would be reported as
# most correlated with itself.
diag(cor_df) <- -Inf
# ties.method = "first" keeps the answer deterministic (the default breaks
# ties at random). Use abs(cor_df) before masking if strong negative
# correlations should count as "most correlated" too.
max.col(cor_df, ties.method = "first")
#[1] 5 6 4 3 6 2 5 4 2 1
#Or creating a dataframe with column names
result <- data.frame(
  var = names(df),
  cor_var = names(df)[max.col(cor_df, ties.method = "first")]
)
result
# var cor_var
#1 X1 X5
#2 X2 X6
#3 X3 X4
#4 X4 X3
#5 X5 X6
#6 X6 X2
#7 X7 X5
#8 X8 X4
#9 X9 X2
#10 X10 X1
Extract pairs of variables with high correlation
`which(..., arr.ind=TRUE)` is the key.
Make up some data:
set.seed(101)
# 10 rows by 50 columns of standard-normal draws; the columns are labelled
# A1..Z1 followed by A2..X2
X <- matrix(
  rnorm(500),
  nrow = 10,
  dimnames = list(NULL, paste0(LETTERS, rep(1:2, each = 26))[seq_len(50)])
)
# 50 x 50 matrix of correlations between the columns of X
cc <- cor(X)
# smallest and largest correlation among the entries below 1, which leaves
# out the unit diagonal
range(cc[cc < 1])
shows values from -0.82 to 0.87; I'll select values with `abs(cc) > 0.8`; `row(cc) < col(cc)` will select only values from the upper triangle.
# (row, col) positions above the diagonal whose absolute correlation
# exceeds 0.8
w <- which(
  abs(cc) > 0.8 & row(cc) < col(cc),
  arr.ind = TRUE
)
## turn the row and column positions back into variable names, one pair per row
high_cor <- cbind(colnames(cc)[w[, 1]], colnames(cc)[w[, 2]])
high_cor
[,1] [,2]
[1,] "G1" "H1"
[2,] "F1" "N1"
[3,] "T1" "Z1"
[4,] "U1" "A2"
[5,] "Q1" "C2"
[6,] "M1" "O2"
R creating a comprehensive table of correlation between combinations of columns
Is this the kind of matrix you expect?
# Example data: a data.frame with 6 rows and 17 integer columns (INDEX,
# TARGET_WINS and the TEAM_* statistics). Several columns contain NA, and
# TEAM_BATTING_HBP is NA in every row.
df <- structure(list(INDEX = 1:6, TARGET_WINS = c(39L, 70L, 86L, 70L,
82L, 75L), TEAM_BATTING_H = c(1445L, 1339L, 1377L, 1387L, 1297L,
1279L), TEAM_BATTING_2B = c(194L, 219L, 232L, 209L, 186L, 200L
), TEAM_BATTING_3B = c(39L, 22L, 35L, 38L, 27L, 36L), TEAM_BATTING_HR = c(13L,
190L, 137L, 96L, 102L, 92L), TEAM_BATTING_BB = c(143L, 685L,
602L, 451L, 472L, 443L), TEAM_BATTING_SO = c(842L, 1075L, 917L,
922L, 920L, 973L), TEAM_BASERUN_SB = c(NA, 37L, 46L, 43L, 49L,
107L), TEAM_BASERUN_CS = c(NA, 28L, 27L, 30L, 39L, 59L), TEAM_BATTING_HBP = c(NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_
), TEAM_PITCHING_H = c(9364L, 1347L, 1377L, 1396L, 1297L, 1279L
), TEAM_PITCHING_HR = c(84L, 191L, 137L, 97L, 102L, 92L), TEAM_PITCHING_BB = c(927L,
689L, 602L, 454L, 472L, 443L), TEAM_PITCHING_SO = c(5456L, 1082L,
917L, 928L, 920L, 973L), TEAM_FIELDING_E = c(1011L, 193L, 175L,
164L, 138L, 123L), TEAM_FIELDING_DP = c(NA, 155L, 153L, 156L,
168L, 149L)), row.names = c(NA, 6L), class = "data.frame")
# install.packages("corrr")
library(corrr)
# Pearson correlations between all columns of df, stored in df1.
# NOTE(review): how NA values are handled depends on correlate()'s defaults;
# confirm against the corrr documentation.
df1 <- corrr::correlate(df, method = "pearson")
# 1. Output:
# A tibble: 17 x 18
term INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B TEAM_BATTING_HR TEAM_BATTING_BB
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 INDEX NA 0.642 -0.820 -0.291 0.0236 0.0826 0.205
2 TARG~ 0.642 NA -0.685 0.367 -0.373 0.673 0.788
3 TEAM~ -0.820 -0.685 NA 0.192 0.496 -0.449 -0.502
4 TEAM~ -0.291 0.367 0.192 NA -0.0789 0.640 0.653
5 TEAM~ 0.0236 -0.373 0.496 -0.0789 NA -0.752 -0.676
6 TEAM~ 0.0826 0.673 -0.449 0.640 -0.752 NA 0.984
7 TEAM~ 0.205 0.788 -0.502 0.653 -0.676 0.984 NA
8 TEAM~ 0.134 0.401 -0.560 0.377 -0.754 0.864 0.799
9 TEAM~ 0.790 -0.00267 -0.690 -0.356 0.413 -0.528 -0.541
10 TEAM~ 0.874 -0.0332 -0.834 -0.598 0.261 -0.578 -0.623
11 TEAM~ NA NA NA NA NA NA NA
12 TEAM~ -0.662 -0.923 0.733 -0.358 0.448 -0.771 -0.852
13 TEAM~ -0.352 0.308 -0.127 0.661 -0.767 0.891 0.809
14 TEAM~ -0.914 -0.793 0.736 0.0225 0.0863 -0.341 -0.464
15 TEAM~ -0.667 -0.930 0.719 -0.360 0.424 -0.757 -0.842
16 TEAM~ -0.707 -0.925 0.757 -0.314 0.418 -0.733 -0.820
17 TEAM~ 0.0666 0.265 -0.144 -0.583 -0.447 -0.123 -0.150
Find the pair of most correlated variables
Solution using corrr:
corrr is a package for exploring correlations in R. It focuses on
creating and working with data frames of correlations
library(corrr)
matrix(rnorm(100), 5) %>%
  correlate() %>%
  # one row per pair of variables: drop the NA self-correlations and the
  # mirrored (y, x) copy of every (x, y) pair
  stretch(na.rm = TRUE, remove.dups = TRUE) %>%
  # strongest relationship first, whatever its sign, so the most correlated
  # pair is the first row
  arrange(desc(abs(r)))
Solution using reshape2 & data.table:
You can apply `reshape2::melt` (reachable through `data.table`) to the `cor` result and order (sort) it according to the correlation values.
library(data.table)
# Correlation matrix of 20 random columns with 5 observations each
corMatrix <- cor(matrix(rnorm(100), 5))
# Call reshape2::melt() explicitly: data.table's own melt() only handles
# data.tables, and its redirection of matrices to reshape2 is deprecated.
# The melted table has one row per matrix cell (Var1, Var2, value); setDT()
# turns it into a data.table so it can be ordered by correlation.
setDT(reshape2::melt(corMatrix))[order(value)]
Find which 5 variables are most correlated with response
# Correlation of mpg with every column of mtcars, as a 1 x 11 matrix
cors <- cor(x = mtcars[["mpg"]], y = mtcars)
# The same correlations as a named vector, in increasing order
sort(cors[1, ])
Returns:
wt cyl disp hp carb qsec gear am vs drat mpg
-0.8676594 -0.8521620 -0.8475514 -0.7761684 -0.5509251 0.4186840 0.4802848 0.5998324 0.6640389 0.6811719 1.0000000
We can use
# named vector of the correlations, largest first
sort(cors[1, ], decreasing = TRUE)
to get the decreasing ordering.
Calculate correlation between two columns based on column names
You can create a function like this:
#' Correlation between the first two columns whose names match a pattern
#'
#' @param x Regular expression; it is matched against the column names with
#'   `grepl()`.
#' @param data Data frame to search. Defaults to the object named `test`,
#'   which is what the function used before this argument existed.
#' @return A single number: the correlation between the first and the second
#'   matching column.
cor_f <- function(x, data = test) {
  matched <- names(data)[grepl(x, names(data))]
  # With fewer than two matching columns there is nothing to correlate; fail
  # with a message that says why instead of an obscure error from cor().
  if (length(matched) < 2L) {
    stop(
      "pattern '", x, "' matched ", length(matched),
      " column(s); at least 2 are needed to compute a correlation",
      call. = FALSE
    )
  }
  # element [2, 1] of the correlation matrix: second matching column versus
  # the first one
  cor(data[, matched, drop = FALSE])[2, 1]
}
cor_f('Obs1') #correlation between Obs1_grp1 and Obs1_grp2
#0.3159908
In case you need a loop, one way would be:
vars <- c("Obs1", "Obs2")
# vapply() guarantees one number per pattern (named after the pattern) and
# fails loudly otherwise, whereas the return type of sapply() depends on
# its input
vapply(vars, cor_f, numeric(1))
Identifying columns with high correlation in large dataset
To identify the most similar columns, try the following. It systematically compares the values from each column in `dta1` with the columns in `dta2`. It returns a matrix.
# For every column of dta1 (one result column each) and every column of dta2
# (one result row each), count the positions where the two columns hold
# equal values.
sapply(dta1, function(left_col) {
  sapply(dta2, function(right_col) {
    sum(left_col == right_col)
  })
})
A B C
first 0 1 0
second 0 0 0
third 0 0 3
From here we can see that `third` and `C` have the most matches. Now you can join your two data.frames. To keep all rows and columns, you will want a `full_join` from the `dplyr` package.
library(dplyr)
# keep every row of both tables, matching dta1's column C against dta2's
# column third
dta1 %>%
  full_join(dta2, by = c("C" = "third"))
A B C first second
1 1 23 001 NA NA
2 2 45 028 6 32
3 3 6 076 7 33
4 4 8 039 8 45
5 NA NA 008 5 58
Related Topics
Developing Geographic Thematic Maps with R
Print String and Variable Contents on the Same Line in R
Plot Random Effects from Lmer (Lme4 Package) Using Qqmath or Dotplot: How to Make It Look Fancy
Get the Column Number in R Given the Column Name
Working with Dictionaries/Lists to Get List of Keys
How to Make a Matrix from a List of Vectors in R
Here We Go Again: Append an Element to a List in R
Reorder Rows Using Custom Order
What Is a Good Way to Read Line-By-Line in R
Can't Change Fonts in Ggplot/Geom_Text
Doing a Plyr Operation on Every Row of a Data Frame in R
How to Clean Up R Memory Without Restarting My Pc
Dplyr: Put Count Occurrences into New Variable