Parallel Distance Matrix in R

Parallel distance Matrix in R

Here's the structure for one route you could go. It is not faster than just using the dist() function, instead taking many times longer. It does process in parallel, but even if the computation time were reduced to zero, the time to start up the function and export the variables to the cluster would probably be longer than just using dist()

library(parallel)

vec.array <- matrix(rnorm(2000 * 100), nrow = 2000, ncol = 100)

TaxiDistFun <- function(one.vec, whole.matrix) {
diff.matrix <- t(t(whole.matrix) - one.vec)
this.row <- apply(diff.matrix, 1, function(x) sum(abs(x)))
return(this.row)
}

cl <- makeCluster(detectCores())
clusterExport(cl, list("vec.array", "TaxiDistFun"))

system.time(dist.array <- parRapply(cl, vec.array,
function(x) TaxiDistFun(x, vec.array)))

stopCluster(cl)

dim(dist.array) <- c(2000, 2000)

Parallelise the calculation of a large, symmetric, distance matrix

Using the %:% operator to create nested foreach loops:

library(doParallel)
library(kmlShape)
df <- data.frame(replicate(168, sample(1:50, 20, rep=TRUE)))

# distance matrix
dist2.mat <- matrix(0, nrow(df), nrow(df))

cl<-makeCluster(detectCores()-1)
registerDoParallel(cl)
x<-double(nrow(df))
dist <-
foreach (i = 1:(nrow(df)), .combine='cbind', .packages="kmlShape") %:%
foreach (j = (i):nrow(df)) %dopar% {
distFrechet(1:168, df[i,], 1:168, df[j,])
#list(i=i,j=j)
}

stopCluster(cl)

dist

result.1 result.2 result.3 result.4 result.5 result.6 result.7 result.8 result.9 result.10 result.11 result.12
[1,] 0 0 0 0 0 0 0 0 0 0 0 0
[2,] 1480.152 1448.745 1358.875 1366.565 1397.582 1405.809 1328.405 1360.226 1331.069 1308.166 1503.361 1435.218
[3,] 1404.957 1407.091 1371.23 1411.818 1459.328 1356.241 1242.108 1303.164 1350.508 1447.664 1378.07 1441.903
[4,] 1407.35 1419.361 1371.301 1409.521 1488.415 1409.456 1321.429 1343.909 1312.71 1307.351 1503.293 1462.787
[5,] 1466.282 1272.466 1388.564 1345.999 1393.626 1352.437 1343.351 1411.275 1334.988 1421.106 1431.027 1377.316
[6,] 1378.757 1427.55 1382.05 1382.904 1431.42 1389.155 1281.994 1391.232 1320.862 1386.65 1469.759 1410.739
[7,] 1394.628 1446.476 1226.788 1324.619 1378.869 1404.817 1426.891 1378.571 1393.961 1244.002 1408.215 1369.311
[8,] 1369.582 1348.065 1410.982 1320.011 1479.091 1402.211 1405.336 1465.377 1431.344 1296.284 1491.27 1337.368
[9,] 1377.445 1409.933 1385.837 1327.244 1378.182 1408.215 1432.204 1427.572 1383.904 1413.975 1420.113 1408.19
[10,] 1339.582 1452.823 1448.128 1390.824 1372.587 1409.99 1386.355 1378.127 1402.049 1317.684 1295.564 0
[11,] 1403.069 1362.809 1462.443 1362.996 1476.378 1397.757 1341.169 1485.1 1348.429 1275.999 0 1435.218
[12,] 1429.874 1363.44 1374.134 1337.237 1529.611 1340.024 1426.915 1367.23 1358.045 0 1503.361 1441.903
[13,] 1473.359 1423.106 1410.257 1399.169 1438.352 1363.531 1346.741 1435.122 0 1308.166 1378.07 1462.787
[14,] 1513.195 1499.721 1448.198 1372.167 1531.107 1368.213 1315.645 0 1331.069 1447.664 1503.293 1377.316
[15,] 1469.946 1466.107 1442.905 1426.129 1379.798 1440.91 0 1360.226 1350.508 1307.351 1431.027 1410.739
[16,] 1330.253 1455.198 1384.926 1287.825 1481.265 0 1328.405 1303.164 1312.71 1421.106 1469.759 1369.311
[17,] 1347.726 1404.506 1410.232 1385.049 0 1405.809 1242.108 1343.909 1334.988 1386.65 1408.215 1337.368
[18,] 1444.1 1309.435 1495.531 0 1397.582 1356.241 1321.429 1411.275 1320.862 1244.002 1491.27 1408.19
[19,] 1446.756 1320.464 0 1366.565 1459.328 1409.456 1343.351 1391.232 1393.961 1296.284 1420.113 0
[20,] 1432.192 0 1358.875 1411.818 1488.415 1352.437 1281.994 1378.571 1431.344 1413.975 1295.564 1435.218
result.13 result.14 result.15 result.16 result.17 result.18 result.19 result.20
[1,] 0 0 0 0 0 0 0 0
[2,] 1357.679 1413.495 1434.734 1417.133 1417.112 1412.082 1328.449 0
[3,] 1493.827 1336.9 1335.423 1412.955 1322.08 1491.122 0 0
[4,] 1369.99 1342.195 1395.864 1367.652 1409.937 0 1328.449 0
[5,] 1383.691 1467.201 1316.58 1318.347 0 1412.082 0 0
[6,] 1566.664 1324.274 1424.907 0 1417.112 1491.122 1328.449 0
[7,] 1393.022 1461.161 0 1417.133 1322.08 0 0 0
[8,] 1387.437 0 1434.734 1412.955 1409.937 1412.082 1328.449 0
[9,] 0 1413.495 1335.423 1367.652 0 1491.122 0 0
[10,] 1357.679 1336.9 1395.864 1318.347 1417.112 0 1328.449 0
[11,] 1493.827 1342.195 1316.58 0 1322.08 1412.082 0 0
[12,] 1369.99 1467.201 1424.907 1417.133 1409.937 1491.122 1328.449 0
[13,] 1383.691 1324.274 0 1412.955 0 0 0 0
[14,] 1566.664 1461.161 1434.734 1367.652 1417.112 1412.082 1328.449 0
[15,] 1393.022 0 1335.423 1318.347 1322.08 1491.122 0 0
[16,] 1387.437 1413.495 1395.864 0 1409.937 0 1328.449 0
[17,] 0 1336.9 1316.58 1417.133 0 1412.082 0 0
[18,] 1357.679 1342.195 1424.907 1412.955 1417.112 1491.122 1328.449 0
[19,] 1493.827 1467.201 0 1367.652 1322.08 0 0 0
[20,] 1369.99 1324.274 1434.734 1318.347 1409.937 1412.082 1328.449 0

I left the distances calculations where i = j to facilitate visualization/understanding of the result.

This now just needs to be reshaped correctly:

do.call(cbind,lapply(1:ncol(dist),function(col) c(rep(0,col-1),dist[1:(nrow(dist)-col+1),col])))

[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14]
0 0 0 0 0 0 0 0 0 0 0 0 0 0
1480.152 0 0 0 0 0 0 0 0 0 0 0 0 0
1404.957 1448.745 0 0 0 0 0 0 0 0 0 0 0 0
1407.35 1407.091 1358.875 0 0 0 0 0 0 0 0 0 0 0
1466.282 1419.361 1371.23 1366.565 0 0 0 0 0 0 0 0 0 0
1378.757 1272.466 1371.301 1411.818 1397.582 0 0 0 0 0 0 0 0 0
1394.628 1427.55 1388.564 1409.521 1459.328 1405.809 0 0 0 0 0 0 0 0
1369.582 1446.476 1382.05 1345.999 1488.415 1356.241 1328.405 0 0 0 0 0 0 0
1377.445 1348.065 1226.788 1382.904 1393.626 1409.456 1242.108 1360.226 0 0 0 0 0 0
1339.582 1409.933 1410.982 1324.619 1431.42 1352.437 1321.429 1303.164 1331.069 0 0 0 0 0
1403.069 1452.823 1385.837 1320.011 1378.869 1389.155 1343.351 1343.909 1350.508 1308.166 0 0 0 0
1429.874 1362.809 1448.128 1327.244 1479.091 1404.817 1281.994 1411.275 1312.71 1447.664 1503.361 0 0 0
1473.359 1363.44 1462.443 1390.824 1378.182 1402.211 1426.891 1391.232 1334.988 1307.351 1378.07 1435.218 0 0
1513.195 1423.106 1374.134 1362.996 1372.587 1408.215 1405.336 1378.571 1320.862 1421.106 1503.293 1441.903 1357.679 0
1469.946 1499.721 1410.257 1337.237 1476.378 1409.99 1432.204 1465.377 1393.961 1386.65 1431.027 1462.787 1493.827 1413.495
1330.253 1466.107 1448.198 1399.169 1529.611 1397.757 1386.355 1427.572 1431.344 1244.002 1469.759 1377.316 1369.99 1336.9
1347.726 1455.198 1442.905 1372.167 1438.352 1340.024 1341.169 1378.127 1383.904 1296.284 1408.215 1410.739 1383.691 1342.195
1444.1 1404.506 1384.926 1426.129 1531.107 1363.531 1426.915 1485.1 1402.049 1413.975 1491.27 1369.311 1566.664 1467.201
1446.756 1309.435 1410.232 1287.825 1379.798 1368.213 1346.741 1367.23 1348.429 1317.684 1420.113 1337.368 1393.022 1324.274
result.20 1432.192 1320.464 1495.531 1385.049 1481.265 1440.91 1315.645 1435.122 1358.045 1275.999 1295.564 1408.19 1387.437 1461.161
[,15] [,16] [,17] [,18] [,19] [,20]
0 0 0 0 0 0
0 0 0 0 0 0
0 0 0 0 0 0
0 0 0 0 0 0
0 0 0 0 0 0
0 0 0 0 0 0
0 0 0 0 0 0
0 0 0 0 0 0
0 0 0 0 0 0
0 0 0 0 0 0
0 0 0 0 0 0
0 0 0 0 0 0
0 0 0 0 0 0
0 0 0 0 0 0
0 0 0 0 0 0
1434.734 0 0 0 0 0
1335.423 1417.133 0 0 0 0
1395.864 1412.955 1417.112 0 0 0
1316.58 1367.652 1322.08 1412.082 0 0
result.20 1424.907 1318.347 1409.937 1491.122 1328.449 0

Create a distance matrix in R using parallelization

As mentioned you can use dist function. Here an example of how to use the result of dist to create a heatmap.

nn <- paste0('row',1:5)
x <- matrix(rnorm(25), nrow = 5,dimnames=list(nn))
distObj <- dist(x)
cols <- c("#D33F6A", "#D95260", "#DE6355", "#E27449",
"#E6833D", "#E89331", "#E9A229", "#EAB12A", "#E9C037",
"#E7CE4C", "#E4DC68", "#E2E6BD")
## mandatory coercion
distObj <- as.matrix(distObj)
## hetamap
image(distObj[order(nn), order(nn)], col = cols,
xaxt = "n", yaxt = "n")
## axes labels
axis(1, at = seq(0, 1, length.out = dim(distObj)[1]), labels = nn,
las = 2)
axis(2, at = seq(0, 1, length.out = dim(distObj)[1]), labels = nn,
las = 2)

Sample Image

Fast parallel bipartite distance calculation in R

The wordspace dist.matrix() function supports parallelized calculation of bipartite distances.

Benchmarking wordspace against parallelDist

matrix1 <- abs(matrix(rnorm(1000),100,100000))
matrix2 <- abs(matrix(rnorm(1000),100,100000))

library(rbenchmark)
library(parallelDist)
library(wordspace)

bipartiteDist_parallelDist <- function(matrix1,matrix2){
matrix12 <- rbind(matrix1,matrix2)
d <- parallelDist(matrix12, method = "euclidean")
d <- as.matrix(d)[(1:nrow(matrix1)),((nrow(matrix1)+1):(nrow(matrix1)*2))]
d
}

bipartiteDist_wordspace <- function(matrix1,matrix2){
wordspace.openmp(threads = wordspace.openmp()$max)
dist.matrix(matrix1,matrix2, byrow = TRUE, method = "euclidean", convert = FALSE)
}

benchmark("parallelDist" = {
bd1 <- bipartiteDist_parallelDist(matrix1,matrix2)
},
"wordspace" = {
bd2 <- bipartiteDist_wordspace(matrix1,matrix2)
},
replications = 1,
columns = c("test", "replications", "elapsed",
"relative", "user.self", "sys.self"))

plot(bd1,bd2) # yes, both methods give near-identical results

Benchmarking results:

          test replications elapsed relative user.self sys.self
1 parallelDist 1 2.120 12.184 126.145 0.523
2 wordspace 1 0.174 1.000 3.749 0.252

I used 80 threads.

A framework for further speed gains

The author of wordspace admits to emphasizing low memory load over speed, thus additional speed gains are possible (source).

For instance, here is a general framework for Euclidean distance:

bipartiteDist3 <- function(matrix1,matrix2){
m1tm2 <- tcrossprod(matrix1,matrix2)
sq1 <- rowSums(matrix1^2)
sq2 <- rowSums(matrix2^2)
out0 <- outer(sq1, sq2, "+") - 2 * m1tm2
sqrt(out0)
}

I am very interested in a parallelized solution optimized for sparse matrices. To my knowledge, wordspace does not optimize for sparsity. For example, there are parallelizable sparse matrix implementations of tcrossprod, rowSums, and outer function equivalents.

How to parallalise partitioning around medoids (PAM) using a precalculated distance matrix

OK, I’ve managed to successfully pass a precalculated distance matrix to the function tsclust with help from @Alexis and code from the dtwclust github page here. My solution is below for anyone else who’s interested.


Stage 4: Calculate PAM clusters (parallelised)

# Define vector number of clusters as integers
ks = c( 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L,10L,11L,12L,13L,14L,15L,16L,17L,18L,19L,20L,
21L,22L,23L,24L,25L,26L,27L,28L,29L,30L,31L,32L,33L,34L,35L,36L,37L,38L,39L,40L)

# Create empty list to be populated with clusters
clusters = list()

# For loop which calculates partitions around medoids for k=2:40
for (i in 2:40) {
clusters[[i]] = tsclust(inc_traj, k = ks[i], distance = "dtw_basic", centroid = "pam",
control = partitional_control(distmat = distmat), seed = 3247, trace = TRUE)
cat("\r",paste0(i," of 40 clusters calculated."))
}

I also compared the processing time of both the tsclust method outlined in Stage 4 above to the pam method outlined in Stage 3 of the original question for k=2:4. I did this on my full dataset of 34,591 trajectories and found the tsclust approach was substantially faster, so will be using that moving forward. I’ve reported the processing times in the below table in case others are interested, though it’s probably noting I have access to a machine with 28 CPU cores, so time differences may be less dramatic on regular desktops.

_______________________________
| Method | k | Time |
| tsclust | 2 | 18.584 seconds |
| tsclust | 3 | 25.746 seconds |
| tsclust | 4 | 15.37 seconds |
| pam | 2 | 6.195 minutes |
| pam | 3 | 6.231 minutes |
| pam | 4 | 9.658 minutes |
_______________________________

Similarity / distance between many pairs of matrices

You could take each pair of groups, concatenate them, and then just calculate the dissimilarity matrix within that group. Obviously this means you're comparing a group to itself to an extent, but it may still work for your use case, and with daisy it is reasonably quick for your size of data.

library(cluster)

n <- 30
groups <- vector("list", 30)

# dummy data
set.seed(123)
for(i in 1:30) {
groups[[i]] = data.frame(x = rnorm(1000,ceiling(runif(1, -10, 10)),ceiling(runif(1, 2, 4))),
y = rnorm(1000,ceiling(runif(1, -10, 10)),ceiling(runif(1, 2, 4))),
z = rbinom(1000,1,runif(1,0.1,0.9)))
}

m <- matrix(nrow = n, ncol = n)

# loop through each pair once
for(i in 1:n) {
for(j in 1:i) { #omit top right corner
if(i == j) {
m[i,j] <- NA #omit diagonal
} else {
# concatenate groups
dat <- rbind(df_list[[i]], df_list[[j]])

# compute all distances (between groups and within groups), return matrix
mm <- dat %>%
daisy(metric = "gower") %>%
as.matrix

# retain only distances between groups
mm <- mm[(nrow(df_list[[i]])+1):nrow(dat) , 1:nrow(df_list[[i]])]

# write mean distance to global comparison matrix
m[i,j] <- mean(mm)
}
}
}


Related Topics



Leave a reply



Submit