Faster Version of Combn


You could use combnPrim from the gRbase package:

source("http://bioconductor.org/biocLite.R")
biocLite("gRbase") # will install dependent packages automatically.
system.time({
d.1 <- as.data.table(t(combn(d$id, 2)))
})
# user system elapsed
# 27.322 0.585 27.674

system.time({
d.2 <- as.data.table(t(combnPrim(d$id,2)))
})
# user system elapsed
# 2.317 0.110 2.425

identical(d.1[order(V1, V2),], d.2[order(V1,V2),])
#[1] TRUE
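
The timings above assume a data frame d with an id column large enough to make combn struggle; a minimal sketch of such input (the size is an assumption, not from the original answer):

library(data.table) # needed for as.data.table() in the timings above
d <- data.frame(id = seq_len(5000)) # choose(5000, 2) = 12,497,500 pairs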

Increase speed for operations using a combination of combn and outer functions

This should be lightning fast for this problem and not use excessive memory.

library(data.table)
setDT(df)
numvec <- max(df[, V1]) # V1 is assumed to hold integer group ids 1..numvec
## dl[[i]]: the sorted V2 values belonging to group i
dl <- lapply(1:numvec, function(i) df[V1 == i, sort(V2)])
## for every pair of groups (x, y), sum the findInterval() positions of
## group y's values within group x's sorted values
dmat <- CJ(x = 1:numvec, y = 1:numvec)[, .(z = sum(findInterval(dl[[y]], dl[[x]]))), .(x, y)]
## reshape the long (x, y, z) result into a numvec x numvec matrix
mat <- as.matrix(dcast(dmat, x ~ y, value.var = 'z')[, -'x'])
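
The snippet assumes df has two integer columns, V1 (a group id running from 1 to numvec) and V2 (the values); a minimal sketch of such input (sizes and ranges are assumptions):

set.seed(42)
df <- data.frame(V1 = sample(1:10, 500, replace = TRUE),
                 V2 = sample(1:1000, 500, replace = TRUE))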

R combinations, looking for a faster and more efficient way (package, code, parallel CPU) than base R

As @inscaven points out, the real time crunch comes from paste. If we simply had to generate all 17 choose 10 combinations 15000 times, that wouldn't take that long with the advent of a couple of highly optimized packages in R, arrangements and RcppAlgos (I am the author):

set.seed(101)
testMat <- matrix(sample(1000, 15000 * 17, TRUE), nrow = 15000)

library(arrangements)
system.time(lapply(1:15000, function(x) {
temp <- combinations(x = testMat[x, ], k = 10)
x
}))
user system elapsed
6.879 2.133 9.014

library(RcppAlgos)
system.time(lapply(1:15000, function(x) {
temp <- comboGeneral(testMat[x, ], 10)
x
}))
user system elapsed
5.770 2.178 7.953

Compared to combn from base R:

system.time(lapply(1:15000, function(x) {
temp <- combn(testMat[x, ], 10)
x
}))
user system elapsed
261.163 1.093 262.608

If we must combine our results into a matrix of characters, there isn't much more we can do in base R. Even if we use either of the optimized libraries mentioned above, we are still left looping over all rows and pasting the results, which is slow.

system.time(t1 <- lapply(1:50, function(x) {
combn(testMat[x, ], 10, paste0, collapse = "")
}))
user system elapsed
6.847 0.070 6.933

## from package arrangements
system.time(t2 <- lapply(1:50, function(x) {
apply(combinations(x = testMat[x, ], k = 10), 1, paste0, collapse = "")
}))
user system elapsed
6.318 0.032 6.353

This isn't really a win. We need a new approach.

Enter Rcpp

#include <Rcpp.h>
#include <vector>
#include <numeric>
using namespace Rcpp;

//[[Rcpp::export]]
CharacterVector pasteCombos(int n, int r, CharacterVector v, int numRows) {

    int r1 = r - 1, r2 = r - 2;
    int numIter, count = 0;
    CharacterVector comboVec = Rcpp::no_init_vector(numRows);

    // z holds the current combination as indices 0, 1, ..., r - 1
    std::vector<int> z(r);
    std::iota(z.begin(), z.end(), 0);

    while (count < numRows) {
        numIter = n - z[r1];
        if ((numIter + count) > numRows)
            numIter = numRows - count;

        // paste the r chosen elements together on the fly via +=
        for (int i = 0; i < numIter; ++i, ++count, ++z[r1])
            for (int k = 0; k < r; ++k)
                comboVec[count] += v[z[k]];

        // advance to the next lexicographic combination
        for (int i = r2; i >= 0; i--) {
            if (z[i] != (n - r + i)) {
                ++z[i];
                for (int k = (i + 1); k < r; ++k)
                    z[k] = z[k - 1] + 1;

                break;
            }
        }
    }

    return comboVec;
}

This function simply generates all combinations of v choose r and pastes the results on the fly via +=. It builds a vector directly, without ever having to deal with the rows of a matrix. Let's see if we have any improvements.
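
To call pasteCombos from R, the C++ snippet has to be compiled first; a minimal sketch, assuming it is saved to a file named pasteCombos.cpp (the file name is an assumption):

library(Rcpp)
sourceCpp("pasteCombos.cpp") # compiles the snippet and exports pasteCombos() into the session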

numCombs <- choose(17, 10)
charMat <- matrix(as.character(testMat), nrow = 15000)

funOP <- function(z, r) {
  apply(X = combn(seq_len(ncol(z)), r), MARGIN = 2,
        FUN = function(jj) apply(z[, jj], 1, paste, collapse = ""))
}

system.time(t1 <- funOP(testMat[1:100, ], 10))
user system elapsed
22.221 0.110 22.330

system.time(t2 <- lapply(1:100, function(x) {
pasteCombos(17, 10, charMat[x,], numCombs)
}))
user system elapsed
7.890 0.085 7.975

Nearly 3 times faster... not bad, but we can do better.

Enter parallel

library(parallel)
system.time(t3 <- mclapply(1:100, function(x) {
pasteCombos(17, 10, charMat[x,], numCombs)
}, mc.cores = 8)) ## you will have to adjust this on your machine; I'm running macOS with 8 cores
user system elapsed
1.430 0.454 1.912

Now we are talking!!! Nearly 12 times faster!!

Here is a sanity check:

all.equal(t1, do.call(rbind, t2))
# [1] TRUE
all.equal(t1, do.call(rbind, t3))
# [1] TRUE

In total, if we assume we can complete 100 rows in 2 seconds, we can complete our task in a respectable 2 * 150 = 300 seconds = 5 minutes.
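
One caveat on the parallel step: mclapply relies on forking, which is unavailable on Windows (there mc.cores must be 1). A PSOCK cluster with parLapply is the usual workaround; a minimal sketch, assuming the Rcpp code above is saved as pasteCombos.cpp so each worker can compile it (the worker count is an assumption):

library(parallel)
cl <- makeCluster(4) # adjust to your core count
clusterEvalQ(cl, Rcpp::sourceCpp("pasteCombos.cpp")) # compiled code must exist on every worker
clusterExport(cl, c("charMat", "numCombs"))
t4 <- parLapply(cl, 1:100, function(x) pasteCombos(17, 10, charMat[x, ], numCombs))
stopCluster(cl)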

Performance: combn on large data.table

As Roland suggested in his comment, using combn just to calculate the combinations of labels and then performing the joins directly on the data.table is orders of magnitude faster:

## all unordered pairs of labels, one pair per row
corrs <- as.data.frame(do.call(rbind, combn(labs, m = 2, simplify = FALSE)), stringsAsFactors = FALSE)
names(corrs) <- c("a", "b")
setDT(corrs)

## join the series onto the first label of each pair
setkey(DT, label)
setkey(corrs, a)
corrs <- corrs[DT, nomatch = 0, allow.cartesian = TRUE]

## join the series onto the second label, matching on time t
setkey(corrs, b, t)
setkey(DT, label, t)
corrs <- corrs[DT, nomatch = 0]

## keep pairs with enough overlapping time points, then correlate
corrs[, overlap := .N >= minOverlap, by = list(a, b)]
corrs <- corrs[(overlap)]
corrs <- corrs[, list(start = min(t), end = max(t), R = cor(I, I.1)), by = list(a, b)]
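
For reference, the join-based snippet assumes a data.table DT with columns label, t (time index) and I (the measured value), plus labs (the set of labels) and minOverlap; a minimal sketch of such inputs (all sizes are assumptions):

library(data.table)
set.seed(1)
DT <- data.table(label = rep(sprintf("s%02d", 1:20), each = 50),
                 t     = rep(1:50, times = 20),
                 I     = rnorm(1000))
labs <- unique(DT$label)
minOverlap <- 30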

Optimizing calculation of combinations into list - large data set

Here is an answer that is over 25x faster than the OP's solution on large test cases. It doesn't rely on paste, but rather we take advantage of properties of numbers and vectorized operations. We also use comboGeneral from the RcppAlgos package (I am the author) which is much faster than combn and combnPrim from the linked answer for generating combinations of a vector. First we show the efficiency gains of comboGeneral over the other functions:

## library(gRbase)
library(RcppAlgos)
library(microbenchmark)

microbenchmark(gRbase::combnPrim(300, 2), combn(300, 2),
               comboGeneral(300, 2), unit = "relative")
Unit: relative
expr min lq mean median uq max neval
gRbase::combnPrim(300, 2) 5.145654 5.192439 4.83561 7.167839 4.320497 3.98992 100
combn(300, 2) 204.866624 192.559119 143.75540 174.079339 102.733367 539.12325 100
comboGeneral(300, 2) 1.000000 1.000000 1.00000 1.000000 1.000000 1.00000 100

Now, we write a function that creates reproducible random data to pass to our test functions:

makeTestSet <- function(vectorSize, elementSize, mySeed = 42, withRep = FALSE) {
    set.seed(mySeed)
    sapply(1:vectorSize, function(x) {
        paste(sample(10^6, s1 <- sample(2:elementSize, 1), replace = withRep), collapse = " ")
    })
}

makeTestSet(5, 3)
[1] "937076 286140 830446" "519096 736588 134667" "705065 457742 719111"
[4] "255429 462293 940013" "117488 474997 560332"

That looks good. Now, let's see if setting fixed = TRUE gets us any gains (as suggested above by @MichaelChirico):

bigVec <- makeTestSet(10, 100000)

microbenchmark(standard = strsplit(bigVec, " "),
               withFixed = strsplit(bigVec, " ", fixed = TRUE),
               times = 15, unit = "relative")
Unit: relative
expr min lq mean median uq max neval
standard 4.447413 4.296662 4.133797 4.339537 4.084019 3.415639 15
withFixed 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 15

@MichaelChirico was spot on. Putting it all together we get:

combPairFast <- function(testVec) {
    lapply(strsplit(testVec, " ", fixed = TRUE), function(x) {
        combs <- RcppAlgos::comboGeneral(as.numeric(x), 2)
        ## shift the first number left by the digit count of the second, then add:
        ## numerically the same as pasting the two numbers together as strings
        unique(combs[, 1] * 10^(as.integer(log10(combs[, 2])) + 1L) + combs[, 2])
    })
}

## test.vector defined above by OP
combPairFast(test.vector)
[[1]]
[1] 335261344015 335261537633 344015537633

[[2]]
[1] 22404132858

[[3]]
[1] 254654355860 254654488288 355860488288

[[4]]
[1] 219943373817

[[5]]
[1] 331839404477

## OP original code
combPairOP <- function(testVec) {
    lapply(strsplit(testVec, " "), function(x) {
        unique(apply(combn(x, 2), 2, function(y) paste0(y, collapse = "")))
    })
}

As stated in the comments by the OP, the maximum number is less than a million (600000 to be exact). That means that when we multiply one of the numbers by at most 10^6 and add it to another 6-digit number (equivalent to simply concatenating the two strings of digits), we are guaranteed to stay within the numerical precision of base R (i.e. 2^53 - 1). This is good, because arithmetic on numbers is much more efficient than operations on strings.
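
A tiny numeric illustration of that digit-shifting trick, using the two values behind the first result shown above:

x <- 335261; y <- 344015
x * 10^(as.integer(log10(y)) + 1L) + y # 335261344015
as.numeric(paste0(x, y)) # the same value, built via string concatenation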

All that is left is to benchmark:

test.vector <- makeTestSet(100, 50)

microbenchmark(combPairOP(test.vector),
               combPairFast(test.vector),
               times = 20, unit = "relative")
Unit: relative
expr min lq mean median uq max neval
combPairOP(test.vector) 22.33991 22.4264 21.67291 22.11017 21.729 25.23342 20
combPairFast(test.vector) 1.00000 1.0000 1.00000 1.00000 1.000 1.00000 20

And on larger vectors:

bigTest.vector <- makeTestSet(1000, 100, mySeed = 22, withRep = TRUE)

## Duplicate values exist
any(sapply(strsplit(bigTest.vector, " ", fixed = TRUE), function(x) {
any(duplicated(x))
}))
[1] TRUE

system.time(t1 <- combPairFast(bigTest.vector))
user system elapsed
0.303 0.011 0.314

system.time(t2 <- combPairOP(bigTest.vector))
user system elapsed
8.820 0.081 8.902 ### 8.902 / 0.314 ~= 28x faster

## results are the same
all.equal(t1, lapply(t2, as.numeric))
[1] TRUE

How can I generate all the possible combinations of a vector

Try

combn(v1, 2, FUN=function(x) paste(rev(x), collapse="-"))
#[1] "B-A" "C-A" "D-A" "E-A" "C-B" "D-B" "E-B" "D-C" "E-C" "E-D"

If you want them in the default order

combn(v1, 2, FUN=paste, collapse="-")
#[1] "A-B" "A-C" "A-D" "A-E" "B-C" "B-D" "B-E" "C-D" "C-E" "D-E"

Update

For a faster option, you can use combnPrim from gRbase (see the gRbase answer at the top of this page).

library(gRbase)
apply(combnPrim(v1,2), 2, FUN=paste, collapse='-')
#[1] "A-B" "A-C" "B-C" "A-D" "B-D" "C-D" "A-E" "B-E" "C-E" "D-E"

data

v1 <- LETTERS[1:5]

