Mahalanobis Distance in R

Pairwise Mahalanobis Distance Between Multiple Independent Variables and a Dependent Variable With Three Classes in R

Code to download the function pairwise.mahalanobis() from the HDMD package

#install.packages("biotools")
#install.packages("cluster")

library(biotools) #pairwise.mahalanobis function
library(cluster) #agnes function

group = matrix(Cluster_Dummy_2$Country) #what is being compared
group = t(group[,1]) #prepare for pairwise.mahalanobis function

variables = c(variables = c("Low.Freq", "High.Freq", "Peak.Freq", "Delta.Freq", "Delta.Time", "Peak.Time", 
                            "Center.Freq", "Start.Freq","End.Freq")) #variables (what is being used for comparison)

variables = as.matrix(Cluster_Dummy_2[,variables]) #prepare for pairwise.mahalanobis function
variables

#function for calculating the Mahalanobis distance

#REF:https://cran.r-project.org/src/contrib/Archive/HDMD/

#HDMD Package download

pairwise.mahalanobis = 
  function(x, grouping=NULL, cov =NULL, inverted=FALSE, digits = 5, ...) 
  {
    x <- if (is.vector(x))                                  #standardize input data as matrix
      matrix(x, ncol = length(x))
    else as.matrix(x)
    
    if(!is.matrix(x))
      stop("x could not be forced into a matrix")
    
    if(length(grouping) ==0){                               #no group assigned, uses first col
      grouping = t(x[1])
      x = x[2:dim(x)[2]]    
      cat("assigning grouping\n")
      print(grouping)
    }
    
    n <- nrow(x)
    p <- ncol(x)
    
    if (n != length(grouping)){                                 #grouping and matrix do not correspond
      cat(paste("n: ", n, "and groups: ", length(grouping), "\n"))
      stop("nrow(x) and length(grouping) are different")
    }
    g <- as.factor(grouping)
    g
    lev <- lev1 <- levels(g)                                # Groups
    counts <- as.vector(table(g))                           # No. of elements in each group
    
    if (any(counts == 0)) {                             # Remove grouping if not represented in data
      empty <- lev[counts == 0]
      warning(sprintf(ngettext(length(empty), "group %s is empty", 
                               "groups %s are empty"), paste(empty, collapse = " ")), 
              domain = NA)
      lev1 <- lev[counts > 0]
      g <- factor(g, levels = lev1)
      counts <- as.vector(table(g))
    }
    
    ng = length(lev1)
    
    group.means <- tapply(x, list(rep(g, p), col(x)), mean)     # g x p matrix of group means from x
    
    if(missing(cov)){                                           #create covariance matrix
      inverted = FALSE
      cov = cor(x)                                          #standardize into correlation mtx
    }
    else{                                                       #check cov of correct dimension
      if(dim(cov) != c(p,p))
        stop("cov matrix not of dim = (p,p)\n")
    }
    
    Distance = matrix(nrow=ng, ncol=ng)                                 #initialize distance matrix 
    dimnames(Distance) = list(names(group.means), names(group.means))
    
    Means = round(group.means, digits)
    Cov = round(cov, digits)
    Distance = round(Distance, digits)
    
    for(i in 1:ng){
      Distance[i,]= mahalanobis(group.means, group.means[i,], cov, inverted)
    }
    
    result <- list(means = group.means, cov = cov, distance = Distance)
    result
  }

Calculate the pairwise Mahalanobis distance

#Calculate the Mahalanobis distance
mahala_sq = pairwise.mahalanobis(x=variables, grouping=group) #get squared mahalanobis distances (see mahala_sq$distance).
mahala_sq

names = rownames(mahala_sq$means) #capture labels

mahala = sqrt(mahala_sq$distance) #mahalanobis distance
rownames(mahala) = names #set rownames in the dissimilarity matrix
colnames(mahala) = names #set colnames in the dissimilarity matrix

mahala 

#This is how I used the dissimilarity matrix to find clusters.
cluster = agnes(mahala,diss=TRUE,keep.diss=FALSE,method="complete") #hierarchical clustering
plot(cluster,which.plots=2) #plot dendrogram

Mahalanobis distance on R for more than 2 groups

Getting the distances is straigtforward if you organize your data in a 500 by 12 data.frame or matrix. To show you, first we create a data.frame with some toy data:

set.seed(1) # To ensure reproducibility of the random numbers
df <- data.frame(sapply(LETTERS[1:12], function(x) rnorm(500)))
# Adding some outliers
df[1,1] <- 20
df[200,5] <- 60
head(df)
#            A           B           C          D           E          F          G          H
# 1 20.0000000  0.07730312  1.13496509  0.8500435 -0.88614959 -1.8054836  0.7391149  0.5205997
# 2  0.1836433 -0.29686864  1.11193185 -0.9253130 -1.92225490 -0.6780407  0.3866087  0.3775619
# 3 -0.8356286 -1.18324224 -0.87077763  0.8935812  1.61970074 -0.4733581  1.2963972 -0.6236588
# 4  1.5952808  0.01129269  0.21073159 -0.9410097  0.51926990  1.0274171 -0.8035584 -0.5726105
# 5  0.3295078  0.99160104  0.06939565  0.5389521 -0.05584993 -0.5973876 -1.6026257  0.3125012
# 6 -0.8204684  1.59396745 -1.66264885 -0.1819744  0.69641761  1.1598494  0.9332510 -0.7074278
#            I          J          K          L
# 1 -1.1346302  1.5579537 -1.5163733 -1.1378698
# 2  0.7645571 -0.7292970  0.6291412 -0.9518105
# 3  0.5707101 -1.5039509 -1.6781940  1.6192595
# 4 -1.3516939 -0.5667870  1.1797811  0.1678136
# 5 -2.0298855 -2.1044536  1.1176545 -0.9081778
# 6  0.5904787  0.5307319 -1.2377359  1.3417959

where you have the 12 species called A-L. Organized in this way, you simply run the following line:

dist.sq <- mahalanobis(x = df, center = colMeans(df), cov = cov(df))

Remember, the function returns the square of the distances!

plot(sqrt(dist.sq))

I hope this helps.

Calculating weigthed pairwise Mahalanobis distances

There is an error in your dist.maha function. This is obvious straight away because some of the distances it computes are negative numbers -- so they cannot be the actual distances! Fortunately, this is quite easy to fix as you simply need to square your stdX vector.

library("HDMD")
library("tidyverse")

# Convert a vector of pairwise distances from to a distance matrix
# (Simplified approach which doesn't use for-loops)
pairwise_dist_to_dist_matrix <- function(dist, n) {
  stopifnot(length(dist) == n*(n-1)/2)
  mat <- matrix(NA_real_, n, n)
  diag(mat) <- 0
  mat[lower.tri(mat)] <- dist
  mat
}

dist.maha <- function (X) {
  diff <- pair.diff(X)                 # pairwise difference of rows
  V <- cov(X)                          # empirical covariance; positive definite
  L <- t(chol(V))                      # lower triangular factor
  stdX <- t(forwardsolve(L, t(diff)))  # solving the system of linear equations
  dist <- stdX * stdX                  # Don't forget to square!
  dist <- rowSums(dist)                # And add up the differences in each dimension.
  pairwise_dist_to_dist_matrix(dist, nrow(X))
}

# An alternative computation, for an additional check
dist.maha2 <- function(X) {
  diff <- pair.diff(X)
  V <- cov(X)
  Vinv <- solve(V)
  dist <- rowSums(diff %*% Vinv * diff)
  pairwise_dist_to_dist_matrix(dist, nrow(X))
}

# Slightly more complex data matrix to check if
# functions work in higher dimensions
data <- matrix(c(100, 54, 56, 79, 12, 1, 2, 3, 4, 5), ncol = 2)
dist.maha(data)
#>          [,1]      [,2]     [,3]     [,4] [,5]
#> [1,] 0.000000        NA       NA       NA   NA
#> [2,] 2.275210 0.0000000       NA       NA   NA
#> [3,] 1.974017 0.9742819 0.000000       NA   NA
#> [4,] 4.759700 7.5842687 3.250906 0.000000   NA
#> [5,] 7.896067 3.6213875 1.974017 5.690146    0
dist.maha2(data)
#>          [,1]      [,2]     [,3]     [,4] [,5]
#> [1,] 0.000000        NA       NA       NA   NA
#> [2,] 2.275210 0.0000000       NA       NA   NA
#> [3,] 1.974017 0.9742819 0.000000       NA   NA
#> [4,] 4.759700 7.5842687 3.250906 0.000000   NA
#> [5,] 7.896067 3.6213875 1.974017 5.690146    0

It also seems that you don't use pairwise.mahalanobis correctly. You have to compute and pass the covariance matrix (the cov argument).

# This is the HDMD alternative
id = c(1,2,3,4,5)

# Incorrect:

# You have to specify the `cov` argument.
# Otherwise `pairwise.mahalanobis` doesn't compute it correctly
# as each sample is assumed to be in its own group.

pairwise.mahalanobis(data, grouping = id)$distance
#>           [,1]      [,2]      [,3]     [,4]      [,5]
#> [1,]     0.000 4345.2805 3840.7349  759.689 15362.940
#> [2,]  4345.280    0.0000   16.7591 1487.209  3369.708
#> [3,]  3840.735   16.7591    0.0000 1194.197  3840.735
#> [4,]   759.689 1487.2090 1194.1967    0.000  9310.174
#> [5,] 15362.940 3369.7076 3840.7349 9310.174     0.000

# Correct:

# NOTE: Ignore the warning message; there seems to be a small bug in `pairwise.mahalanobis`.
pairwise.mahalanobis(data, grouping = id, cov = cov(data))$distance
#> Warning in if (dim(cov) != c(p, p)) stop("cov matrix not of dim = (p,p)
#> \n"): the condition has length > 1 and only the first element will be used
#>          [,1]      [,2]      [,3]     [,4]     [,5]
#> [1,] 0.000000 2.2752099 1.9740168 4.759700 7.896067
#> [2,] 2.275210 0.0000000 0.9742819 7.584269 3.621388
#> [3,] 1.974017 0.9742819 0.0000000 3.250906 1.974017
#> [4,] 4.759700 7.5842687 3.2509059 0.000000 5.690146
#> [5,] 7.896067 3.6213875 1.9740168 5.690146 0.000000

^{Created on 2019-03-24 by the reprex package (v0.2.1)}