Mahalanobis Distance in R

Pairwise Mahalanobis Distance Between Multiple Independent Variables and a Dependent Variable With Three Classes in R

Code to download the function pairwise.mahalanobis() from the HDMD package

#install.packages("biotools")
#install.packages("cluster")

library(biotools) #pairwise.mahalanobis function
library(cluster) #agnes function

group = matrix(Cluster_Dummy_2$Country) #what is being compared
group = t(group[,1]) #prepare for pairwise.mahalanobis function

variables = c(variables = c("Low.Freq", "High.Freq", "Peak.Freq", "Delta.Freq", "Delta.Time", "Peak.Time",
"Center.Freq", "Start.Freq","End.Freq")) #variables (what is being used for comparison)

variables = as.matrix(Cluster_Dummy_2[,variables]) #prepare for pairwise.mahalanobis function
variables

#function for calculating the Mahalanobis distance

#REF:https://cran.r-project.org/src/contrib/Archive/HDMD/

#HDMD Package download

pairwise.mahalanobis =
function(x, grouping=NULL, cov =NULL, inverted=FALSE, digits = 5, ...)
{
x <- if (is.vector(x)) #standardize input data as matrix
matrix(x, ncol = length(x))
else as.matrix(x)

if(!is.matrix(x))
stop("x could not be forced into a matrix")

if(length(grouping) ==0){ #no group assigned, uses first col
grouping = t(x[1])
x = x[2:dim(x)[2]]
cat("assigning grouping\n")
print(grouping)
}

n <- nrow(x)
p <- ncol(x)

if (n != length(grouping)){ #grouping and matrix do not correspond
cat(paste("n: ", n, "and groups: ", length(grouping), "\n"))
stop("nrow(x) and length(grouping) are different")
}
g <- as.factor(grouping)
g
lev <- lev1 <- levels(g) # Groups
counts <- as.vector(table(g)) # No. of elements in each group

if (any(counts == 0)) { # Remove grouping if not represented in data
empty <- lev[counts == 0]
warning(sprintf(ngettext(length(empty), "group %s is empty",
"groups %s are empty"), paste(empty, collapse = " ")),
domain = NA)
lev1 <- lev[counts > 0]
g <- factor(g, levels = lev1)
counts <- as.vector(table(g))
}

ng = length(lev1)

group.means <- tapply(x, list(rep(g, p), col(x)), mean) # g x p matrix of group means from x

if(missing(cov)){ #create covariance matrix
inverted = FALSE
cov = cor(x) #standardize into correlation mtx
}
else{ #check cov of correct dimension
if(dim(cov) != c(p,p))
stop("cov matrix not of dim = (p,p)\n")
}

Distance = matrix(nrow=ng, ncol=ng) #initialize distance matrix
dimnames(Distance) = list(names(group.means), names(group.means))

Means = round(group.means, digits)
Cov = round(cov, digits)
Distance = round(Distance, digits)

for(i in 1:ng){
Distance[i,]= mahalanobis(group.means, group.means[i,], cov, inverted)
}

result <- list(means = group.means, cov = cov, distance = Distance)
result
}

Calculate the pairwise Mahalanobis distance

#Calculate the Mahalanobis distance
mahala_sq = pairwise.mahalanobis(x=variables, grouping=group) #get squared mahalanobis distances (see mahala_sq$distance).
mahala_sq

names = rownames(mahala_sq$means) #capture labels

mahala = sqrt(mahala_sq$distance) #mahalanobis distance
rownames(mahala) = names #set rownames in the dissimilarity matrix
colnames(mahala) = names #set colnames in the dissimilarity matrix

mahala

#This is how I used the dissimilarity matrix to find clusters.
cluster = agnes(mahala,diss=TRUE,keep.diss=FALSE,method="complete") #hierarchical clustering
plot(cluster,which.plots=2) #plot dendrogram

Mahalanobis distance on R for more than 2 groups

Getting the distances is straigtforward if you organize your data in a 500 by 12 data.frame or matrix. To show you, first we create a data.frame with some toy data:

set.seed(1) # To ensure reproducibility of the random numbers
df <- data.frame(sapply(LETTERS[1:12], function(x) rnorm(500)))
# Adding some outliers
df[1,1] <- 20
df[200,5] <- 60
head(df)
# A B C D E F G H
# 1 20.0000000 0.07730312 1.13496509 0.8500435 -0.88614959 -1.8054836 0.7391149 0.5205997
# 2 0.1836433 -0.29686864 1.11193185 -0.9253130 -1.92225490 -0.6780407 0.3866087 0.3775619
# 3 -0.8356286 -1.18324224 -0.87077763 0.8935812 1.61970074 -0.4733581 1.2963972 -0.6236588
# 4 1.5952808 0.01129269 0.21073159 -0.9410097 0.51926990 1.0274171 -0.8035584 -0.5726105
# 5 0.3295078 0.99160104 0.06939565 0.5389521 -0.05584993 -0.5973876 -1.6026257 0.3125012
# 6 -0.8204684 1.59396745 -1.66264885 -0.1819744 0.69641761 1.1598494 0.9332510 -0.7074278
# I J K L
# 1 -1.1346302 1.5579537 -1.5163733 -1.1378698
# 2 0.7645571 -0.7292970 0.6291412 -0.9518105
# 3 0.5707101 -1.5039509 -1.6781940 1.6192595
# 4 -1.3516939 -0.5667870 1.1797811 0.1678136
# 5 -2.0298855 -2.1044536 1.1176545 -0.9081778
# 6 0.5904787 0.5307319 -1.2377359 1.3417959

where you have the 12 species called A-L. Organized in this way, you simply run the following line:

dist.sq <- mahalanobis(x = df, center = colMeans(df), cov = cov(df))

Remember, the function returns the square of the distances!

plot(sqrt(dist.sq))

I hope this helps.

Calculating weigthed pairwise Mahalanobis distances

There is an error in your dist.maha function. This is obvious straight away because some of the distances it computes are negative numbers -- so they cannot be the actual distances! Fortunately, this is quite easy to fix as you simply need to square your stdX vector.

library("HDMD")
library("tidyverse")

# Convert a vector of pairwise distances from to a distance matrix
# (Simplified approach which doesn't use for-loops)
pairwise_dist_to_dist_matrix <- function(dist, n) {
stopifnot(length(dist) == n*(n-1)/2)
mat <- matrix(NA_real_, n, n)
diag(mat) <- 0
mat[lower.tri(mat)] <- dist
mat
}

dist.maha <- function (X) {
diff <- pair.diff(X) # pairwise difference of rows
V <- cov(X) # empirical covariance; positive definite
L <- t(chol(V)) # lower triangular factor
stdX <- t(forwardsolve(L, t(diff))) # solving the system of linear equations
dist <- stdX * stdX # Don't forget to square!
dist <- rowSums(dist) # And add up the differences in each dimension.
pairwise_dist_to_dist_matrix(dist, nrow(X))
}

# An alternative computation, for an additional check
dist.maha2 <- function(X) {
diff <- pair.diff(X)
V <- cov(X)
Vinv <- solve(V)
dist <- rowSums(diff %*% Vinv * diff)
pairwise_dist_to_dist_matrix(dist, nrow(X))
}

# Slightly more complex data matrix to check if
# functions work in higher dimensions
data <- matrix(c(100, 54, 56, 79, 12, 1, 2, 3, 4, 5), ncol = 2)
dist.maha(data)
#> [,1] [,2] [,3] [,4] [,5]
#> [1,] 0.000000 NA NA NA NA
#> [2,] 2.275210 0.0000000 NA NA NA
#> [3,] 1.974017 0.9742819 0.000000 NA NA
#> [4,] 4.759700 7.5842687 3.250906 0.000000 NA
#> [5,] 7.896067 3.6213875 1.974017 5.690146 0
dist.maha2(data)
#> [,1] [,2] [,3] [,4] [,5]
#> [1,] 0.000000 NA NA NA NA
#> [2,] 2.275210 0.0000000 NA NA NA
#> [3,] 1.974017 0.9742819 0.000000 NA NA
#> [4,] 4.759700 7.5842687 3.250906 0.000000 NA
#> [5,] 7.896067 3.6213875 1.974017 5.690146 0

It also seems that you don't use pairwise.mahalanobis correctly. You have to compute and pass the covariance matrix (the cov argument).

# This is the HDMD alternative
id = c(1,2,3,4,5)

# Incorrect:

# You have to specify the `cov` argument.
# Otherwise `pairwise.mahalanobis` doesn't compute it correctly
# as each sample is assumed to be in its own group.

pairwise.mahalanobis(data, grouping = id)$distance
#> [,1] [,2] [,3] [,4] [,5]
#> [1,] 0.000 4345.2805 3840.7349 759.689 15362.940
#> [2,] 4345.280 0.0000 16.7591 1487.209 3369.708
#> [3,] 3840.735 16.7591 0.0000 1194.197 3840.735
#> [4,] 759.689 1487.2090 1194.1967 0.000 9310.174
#> [5,] 15362.940 3369.7076 3840.7349 9310.174 0.000

# Correct:

# NOTE: Ignore the warning message; there seems to be a small bug in `pairwise.mahalanobis`.
pairwise.mahalanobis(data, grouping = id, cov = cov(data))$distance
#> Warning in if (dim(cov) != c(p, p)) stop("cov matrix not of dim = (p,p)
#> \n"): the condition has length > 1 and only the first element will be used
#> [,1] [,2] [,3] [,4] [,5]
#> [1,] 0.000000 2.2752099 1.9740168 4.759700 7.896067
#> [2,] 2.275210 0.0000000 0.9742819 7.584269 3.621388
#> [3,] 1.974017 0.9742819 0.0000000 3.250906 1.974017
#> [4,] 4.759700 7.5842687 3.2509059 0.000000 5.690146
#> [5,] 7.896067 3.6213875 1.9740168 5.690146 0.000000

Created on 2019-03-24 by the reprex package (v0.2.1)

mahalanobis distance in R between 2 goups

I felt like what you are trying to do must exist in some R package. After a pretty thorough search, I found function D.sq in package asbio which looks very close. This function requires 2 matrices as input, so it doesn't work for your example. I also include a modified version that accepts a vector for the 2nd matrix.

# Original Function
D.sq <- function (g1, g2) {
dbar <- as.vector(colMeans(g1) - colMeans(g2))
S1 <- cov(g1)
S2 <- cov(g2)
n1 <- nrow(g1)
n2 <- nrow(g2)
V <- as.matrix((1/(n1 + n2 - 2)) * (((n1 - 1) * S1) + ((n2 -
1) * S2)))
D.sq <- t(dbar) %*% solve(V) %*% dbar
res <- list()
res$D.sq <- D.sq
res$V <- V
res
}

# Data
g1 <- matrix(c(90, 4, 70, 4, 27, 37, 82, 4, 17, 18, 41, 4), ncol = 3, byrow = TRUE)
g2 <- c(2, 27, 4)

# Function modified to accept a vector for g2 rather than a matrix
D.sq2 <- function (g1, g2) {
dbar <- as.vector(colMeans(g1) - g2)
S1 <- cov(g1)
S2 <- var(g2)
n1 <- nrow(g1)
n2 <- length(g2)
V <- as.matrix((1/(n1 + n2 - 2)) * (((n1 - 1) * S1) + ((n2 -
1) * S2)))
D.sq <- t(dbar) %*% solve(V) %*% dbar
res <- list()
res$D.sq <- D.sq
res$V <- V
res
}

However, this doesn't quite give the answer you expect: D.sq2(g1,g2)$D.sq returns 2.2469.

Perhaps you can compare your original matlab method with these details and figure out the source of the difference. A quick look suggests the difference is how the denominator in V is computed. It may well also be an error on my part, but hopefully this gets you going.



Related Topics



Leave a reply



Submit