Pairwise Mahalanobis Distance Between Multiple Independent Variables and a Dependent Variable With Three Classes in R
Code to download the function pairwise.mahalanobis() from the HDMD package
#install.packages("biotools")
#install.packages("cluster")
library(biotools) #pairwise.mahalanobis function
library(cluster) #agnes function
group = matrix(Cluster_Dummy_2$Country) #what is being compared
group = t(group[,1]) #prepare for pairwise.mahalanobis function
variables = c(variables = c("Low.Freq", "High.Freq", "Peak.Freq", "Delta.Freq", "Delta.Time", "Peak.Time",
"Center.Freq", "Start.Freq","End.Freq")) #variables (what is being used for comparison)
variables = as.matrix(Cluster_Dummy_2[,variables]) #prepare for pairwise.mahalanobis function
variables
#function for calculating the Mahalanobis distance
#REF:https://cran.r-project.org/src/contrib/Archive/HDMD/
#HDMD Package download
pairwise.mahalanobis =
function(x, grouping=NULL, cov =NULL, inverted=FALSE, digits = 5, ...)
{
x <- if (is.vector(x)) #standardize input data as matrix
matrix(x, ncol = length(x))
else as.matrix(x)
if(!is.matrix(x))
stop("x could not be forced into a matrix")
if(length(grouping) ==0){ #no group assigned, uses first col
grouping = t(x[1])
x = x[2:dim(x)[2]]
cat("assigning grouping\n")
print(grouping)
}
n <- nrow(x)
p <- ncol(x)
if (n != length(grouping)){ #grouping and matrix do not correspond
cat(paste("n: ", n, "and groups: ", length(grouping), "\n"))
stop("nrow(x) and length(grouping) are different")
}
g <- as.factor(grouping)
g
lev <- lev1 <- levels(g) # Groups
counts <- as.vector(table(g)) # No. of elements in each group
if (any(counts == 0)) { # Remove grouping if not represented in data
empty <- lev[counts == 0]
warning(sprintf(ngettext(length(empty), "group %s is empty",
"groups %s are empty"), paste(empty, collapse = " ")),
domain = NA)
lev1 <- lev[counts > 0]
g <- factor(g, levels = lev1)
counts <- as.vector(table(g))
}
ng = length(lev1)
group.means <- tapply(x, list(rep(g, p), col(x)), mean) # g x p matrix of group means from x
if(missing(cov)){ #create covariance matrix
inverted = FALSE
cov = cor(x) #standardize into correlation mtx
}
else{ #check cov of correct dimension
if(dim(cov) != c(p,p))
stop("cov matrix not of dim = (p,p)\n")
}
Distance = matrix(nrow=ng, ncol=ng) #initialize distance matrix
dimnames(Distance) = list(names(group.means), names(group.means))
Means = round(group.means, digits)
Cov = round(cov, digits)
Distance = round(Distance, digits)
for(i in 1:ng){
Distance[i,]= mahalanobis(group.means, group.means[i,], cov, inverted)
}
result <- list(means = group.means, cov = cov, distance = Distance)
result
}
Calculate the pairwise Mahalanobis distance
#Calculate the Mahalanobis distance
mahala_sq = pairwise.mahalanobis(x=variables, grouping=group) #get squared mahalanobis distances (see mahala_sq$distance).
mahala_sq
names = rownames(mahala_sq$means) #capture labels
mahala = sqrt(mahala_sq$distance) #mahalanobis distance
rownames(mahala) = names #set rownames in the dissimilarity matrix
colnames(mahala) = names #set colnames in the dissimilarity matrix
mahala
#This is how I used the dissimilarity matrix to find clusters.
cluster = agnes(mahala,diss=TRUE,keep.diss=FALSE,method="complete") #hierarchical clustering
plot(cluster,which.plots=2) #plot dendrogram
Mahalanobis distance on R for more than 2 groups
Getting the distances is straigtforward if you organize your data in a 500 by 12 data.frame
or matrix
. To show you, first we create a data.frame
with some toy data:
set.seed(1) # To ensure reproducibility of the random numbers
df <- data.frame(sapply(LETTERS[1:12], function(x) rnorm(500)))
# Adding some outliers
df[1,1] <- 20
df[200,5] <- 60
head(df)
# A B C D E F G H
# 1 20.0000000 0.07730312 1.13496509 0.8500435 -0.88614959 -1.8054836 0.7391149 0.5205997
# 2 0.1836433 -0.29686864 1.11193185 -0.9253130 -1.92225490 -0.6780407 0.3866087 0.3775619
# 3 -0.8356286 -1.18324224 -0.87077763 0.8935812 1.61970074 -0.4733581 1.2963972 -0.6236588
# 4 1.5952808 0.01129269 0.21073159 -0.9410097 0.51926990 1.0274171 -0.8035584 -0.5726105
# 5 0.3295078 0.99160104 0.06939565 0.5389521 -0.05584993 -0.5973876 -1.6026257 0.3125012
# 6 -0.8204684 1.59396745 -1.66264885 -0.1819744 0.69641761 1.1598494 0.9332510 -0.7074278
# I J K L
# 1 -1.1346302 1.5579537 -1.5163733 -1.1378698
# 2 0.7645571 -0.7292970 0.6291412 -0.9518105
# 3 0.5707101 -1.5039509 -1.6781940 1.6192595
# 4 -1.3516939 -0.5667870 1.1797811 0.1678136
# 5 -2.0298855 -2.1044536 1.1176545 -0.9081778
# 6 0.5904787 0.5307319 -1.2377359 1.3417959
where you have the 12 species called A-L. Organized in this way, you simply run the following line:
dist.sq <- mahalanobis(x = df, center = colMeans(df), cov = cov(df))
Remember, the function returns the square of the distances!
plot(sqrt(dist.sq))
I hope this helps.
Calculating weigthed pairwise Mahalanobis distances
There is an error in your dist.maha
function. This is obvious straight away because some of the distances it computes are negative numbers -- so they cannot be the actual distances! Fortunately, this is quite easy to fix as you simply need to square your stdX
vector.
library("HDMD")
library("tidyverse")
# Convert a vector of pairwise distances from to a distance matrix
# (Simplified approach which doesn't use for-loops)
pairwise_dist_to_dist_matrix <- function(dist, n) {
stopifnot(length(dist) == n*(n-1)/2)
mat <- matrix(NA_real_, n, n)
diag(mat) <- 0
mat[lower.tri(mat)] <- dist
mat
}
dist.maha <- function (X) {
diff <- pair.diff(X) # pairwise difference of rows
V <- cov(X) # empirical covariance; positive definite
L <- t(chol(V)) # lower triangular factor
stdX <- t(forwardsolve(L, t(diff))) # solving the system of linear equations
dist <- stdX * stdX # Don't forget to square!
dist <- rowSums(dist) # And add up the differences in each dimension.
pairwise_dist_to_dist_matrix(dist, nrow(X))
}
# An alternative computation, for an additional check
dist.maha2 <- function(X) {
diff <- pair.diff(X)
V <- cov(X)
Vinv <- solve(V)
dist <- rowSums(diff %*% Vinv * diff)
pairwise_dist_to_dist_matrix(dist, nrow(X))
}
# Slightly more complex data matrix to check if
# functions work in higher dimensions
data <- matrix(c(100, 54, 56, 79, 12, 1, 2, 3, 4, 5), ncol = 2)
dist.maha(data)
#> [,1] [,2] [,3] [,4] [,5]
#> [1,] 0.000000 NA NA NA NA
#> [2,] 2.275210 0.0000000 NA NA NA
#> [3,] 1.974017 0.9742819 0.000000 NA NA
#> [4,] 4.759700 7.5842687 3.250906 0.000000 NA
#> [5,] 7.896067 3.6213875 1.974017 5.690146 0
dist.maha2(data)
#> [,1] [,2] [,3] [,4] [,5]
#> [1,] 0.000000 NA NA NA NA
#> [2,] 2.275210 0.0000000 NA NA NA
#> [3,] 1.974017 0.9742819 0.000000 NA NA
#> [4,] 4.759700 7.5842687 3.250906 0.000000 NA
#> [5,] 7.896067 3.6213875 1.974017 5.690146 0
It also seems that you don't use pairwise.mahalanobis
correctly. You have to compute and pass the covariance matrix (the cov
argument).
# This is the HDMD alternative
id = c(1,2,3,4,5)
# Incorrect:
# You have to specify the `cov` argument.
# Otherwise `pairwise.mahalanobis` doesn't compute it correctly
# as each sample is assumed to be in its own group.
pairwise.mahalanobis(data, grouping = id)$distance
#> [,1] [,2] [,3] [,4] [,5]
#> [1,] 0.000 4345.2805 3840.7349 759.689 15362.940
#> [2,] 4345.280 0.0000 16.7591 1487.209 3369.708
#> [3,] 3840.735 16.7591 0.0000 1194.197 3840.735
#> [4,] 759.689 1487.2090 1194.1967 0.000 9310.174
#> [5,] 15362.940 3369.7076 3840.7349 9310.174 0.000
# Correct:
# NOTE: Ignore the warning message; there seems to be a small bug in `pairwise.mahalanobis`.
pairwise.mahalanobis(data, grouping = id, cov = cov(data))$distance
#> Warning in if (dim(cov) != c(p, p)) stop("cov matrix not of dim = (p,p)
#> \n"): the condition has length > 1 and only the first element will be used
#> [,1] [,2] [,3] [,4] [,5]
#> [1,] 0.000000 2.2752099 1.9740168 4.759700 7.896067
#> [2,] 2.275210 0.0000000 0.9742819 7.584269 3.621388
#> [3,] 1.974017 0.9742819 0.0000000 3.250906 1.974017
#> [4,] 4.759700 7.5842687 3.2509059 0.000000 5.690146
#> [5,] 7.896067 3.6213875 1.9740168 5.690146 0.000000
Created on 2019-03-24 by the reprex package (v0.2.1)
mahalanobis distance in R between 2 goups
I felt like what you are trying to do must exist in some R
package. After a pretty thorough search, I found function D.sq
in package asbio
which looks very close. This function requires 2 matrices as input, so it doesn't work for your example. I also include a modified version that accepts a vector for the 2nd matrix.
# Original Function
D.sq <- function (g1, g2) {
dbar <- as.vector(colMeans(g1) - colMeans(g2))
S1 <- cov(g1)
S2 <- cov(g2)
n1 <- nrow(g1)
n2 <- nrow(g2)
V <- as.matrix((1/(n1 + n2 - 2)) * (((n1 - 1) * S1) + ((n2 -
1) * S2)))
D.sq <- t(dbar) %*% solve(V) %*% dbar
res <- list()
res$D.sq <- D.sq
res$V <- V
res
}
# Data
g1 <- matrix(c(90, 4, 70, 4, 27, 37, 82, 4, 17, 18, 41, 4), ncol = 3, byrow = TRUE)
g2 <- c(2, 27, 4)
# Function modified to accept a vector for g2 rather than a matrix
D.sq2 <- function (g1, g2) {
dbar <- as.vector(colMeans(g1) - g2)
S1 <- cov(g1)
S2 <- var(g2)
n1 <- nrow(g1)
n2 <- length(g2)
V <- as.matrix((1/(n1 + n2 - 2)) * (((n1 - 1) * S1) + ((n2 -
1) * S2)))
D.sq <- t(dbar) %*% solve(V) %*% dbar
res <- list()
res$D.sq <- D.sq
res$V <- V
res
}
However, this doesn't quite give the answer you expect: D.sq2(g1,g2)$D.sq
returns 2.2469.
Perhaps you can compare your original matlab
method with these details and figure out the source of the difference. A quick look suggests the difference is how the denominator in V
is computed. It may well also be an error on my part, but hopefully this gets you going.
Related Topics
To Find Whether a Column Exists in Data Frame or Not
How to Apply Function Over Each Matrix Element's Indices
R: Multiple Linear Regression Model and Prediction Model
Create Top-To-Bottom Fade/Gradient Geom_Density in Ggplot2
Figures Captions and Labels in Knitr
How to Get Rstudio to Automatically Compile R Markdown Vignettes
Effectively Debugging Shiny Apps
R: Replace All Values in a Dataframe Lower Than a Threshold with Na
Aggregation Using Ffdfdply Function in R
How to Calculate the Average of a Variable Between Two Date Ranges Using a Loop or Apply Function
The Difference Between Domc and Doparallel in R
How and When Should I Use On.Exit
Relationship Between R Markdown, Knitr, Pandoc, and Bookdown
Remove All Variables Except Functions
Rstudio Empty on Startup - No Windows, No Menus, No Rendering
How to Produce Time Series for Each Row of a Data Frame with an Unnamed First Column