Faster version of combn
You could use combnPrim
from gRbase
source("http://bioconductor.org/biocLite.R")
biocLite("gRbase") # will install dependent packages automatically.
system.time({
d.1 <- as.data.table(t(combn(d$id, 2)))
})
# user system elapsed
# 27.322 0.585 27.674
system.time({
d.2 <- as.data.table(t(combnPrim(d$id,2)))
})
# user system elapsed
# 2.317 0.110 2.425
identical(d.1[order(V1, V2),], d.2[order(V1,V2),])
#[1] TRUE
Increase speed for operations using a combination of combn and outer functions
This should be lightning fast for this problem and not use excessive memory.
library(data.table)
setDT(df)
numvec <- max(df[,V1])
dl <- lapply(1:numvec, function(i) df[V1 == i, sort(V2)])
dmat <- CJ(x=1:numvec, y=1:numvec)[, .(z = sum(findInterval(dl[[y]],dl[[x]]))), .(x,y)]
mat <- as.matrix(dcast(dmat, x~y, value.var = 'z')[, -'x'])
R combinations, looking for faster and more efficient way(package,code,parallel cpu) than basic R
As @inscaven points out, the real time crunch comes from paste
. If we simply had to generate all 17 choose 10 combinations 15000 times, that wouldn't take that long with the advent of a couple of highly optimized packages in R
, arrangements
and RcppAlgos
(I am the author):
set.seed(101)
testMat <- matrix(sample(1000, 15000 * 17, TRUE), nrow = 15000)
library(arrangements)
system.time(lapply(1:15000, function(x) {
temp <- combinations(x = testMat[x, ], k = 10)
x
}))
user system elapsed
6.879 2.133 9.014
library(RcppAlgos)
system.time(lapply(1:15000, function(x) {
temp <- comboGeneral(testMat[x, ], 10)
x
}))
user system elapsed
5.770 2.178 7.953
Compared to combn
loaded in base R
:
system.time(lapply(1:15000, function(x) {
temp <- combn(testMat[x, ], 10)
x
}))
user system elapsed
261.163 1.093 262.608
If we must combine our results into a matrix of characters, there isn't much more in base R
that we can do. Even if we use either of the optimized libraries mentioned above, we are still left looping over all rows and pasting the results which is slow.
system.time(t1 <- lapply(1:50, function(x) {
combn(testMat[x, ], 10, paste0, collapse = "")
}))
user system elapsed
6.847 0.070 6.933
## from package arrangements
system.time(t2 <- lapply(1:50, function(x) {
apply(combinations(x = testMat[x, ], k = 10), 1, paste0, collapse = "")
}))
user system elapsed
6.318 0.032 6.353
This isn't really a win. We need a new approach.
Enter Rcpp
//[[Rcpp::export]]
CharacterVector pasteCombos(int n, int r, CharacterVector v, int numRows) {
int r1 = r - 1, r2 = r - 2;
int numIter, count = 0;
CharacterVector comboVec = Rcpp::no_init_vector(numRows);
std::vector<int> z(r);
std::iota(z.begin(), z.end(), 0);
while (count < numRows) {
numIter = n - z[r1];
if ((numIter + count) > numRows)
numIter = numRows - count;
for (int i = 0; i < numIter; ++i, ++count, ++z[r1])
for (int k = 0; k < r; ++k)
comboVec[count] += v[z[k]];
for (int i = r2; i >= 0; i--) {
if (z[i] != (n - r + i)) {
++z[i];
for (int k = (i + 1); k < r; ++k)
z[k] = z[k - 1] + 1;
break;
}
}
}
return comboVec;
}
This function simply generates all combinations of v
choose r
and paste the results on the fly via +=
. This generates a vector without the necessity of dealing with rows of a matrix. Let's see if we have any improvements.
numCombs <- choose(17, 10)
charMat <- matrix(as.character(testMat), nrow = 15000)
funOP <- function(z, r) {
apply(X = combn(seq_len(ncol(z)), r), MAR = 2,FUN = function(jj) {apply(z[, jj], 1, paste, collapse="") })
}
system.time(t1 <- funOP(testMat[1:100, ], 10))
user system elapsed
22.221 0.110 22.330
system.time(t2 <- lapply(1:100, function(x) {
pasteCombos(17, 10, charMat[x,], numCombs)
}))
user system elapsed
7.890 0.085 7.975
Nearly 3 times faster... not bad, but we can do better.
Enter parallel
library(parallel)
system.time(t3 <- mclapply(1:100, function(x) {
pasteCombos(17, 10, charMat[x,], numCombs)
}, mc.cores = 8)) ## you will have to adjust this on your computer.. I'm running MacOS with 8 cores
user system elapsed
1.430 0.454 1.912
Now we are talking!!! Nearly 12 times faster!!
Here is a sanity check:
all.equal(t1, do.call(rbind, t2))
# [1] TRUE
all.equal(t1, do.call(rbind, t3))
# [1] TRUE
In total, if we assume we can complete 100 rows in 2 seconds, we can complete our task in a respectable 2 * 150 = 300 seconds = 5 minutes
.
Performance: combn on large data.table
As Roland suggested in his comment, using combn
just to calculate the combinations of labels and then perform directly joins on the data.table
, is magnitudes faster:
corrs <- as.data.frame(do.call( rbind, combn(labs, m=2, simplify = FALSE) ), stringsAsFactors=FALSE)
names(corrs) <- c("a", "b")
setDT(corrs)
setkey(DT, label)
setkey( corrs, a )
corrs <- corrs[ DT, nomatch = 0, allow.cartesian = TRUE]
setkey(corrs, b, t)
setkey(DT, label, t)
corrs <- corrs[ DT, nomatch = 0 ]
corrs[ , overlap := .N >= minOverlap , by = list(a,b) ]
corrs <- corrs[ (overlap) ]
corrs <- corrs[ ,list( start = min(t), end = max(t), R = cor(I,I.1) ), by = list(a,b) ]
Optimizing calculation of combinations into list - large data set
Here is an answer that is over 25x
faster than the OP's solution on large test cases. It doesn't rely on paste
, but rather we take advantage of properties of numbers and vectorized operations. We also use comboGeneral
from the RcppAlgos
package (I am the author) which is much faster than combn
and combnPrim
from the linked answer for generating combinations of a vector. First we show the efficiency gains of comboGeneral
over the other functions:
## library(gRbase)
library(RcppAlgos)
library(microbenchmark)
microbenchmark(gRbase::combnPrim(300, 2), combn(300, 2),
comboGeneral(300, 2), unit = "relative")
Unit: relative
expr min lq mean median uq max neval
gRbase::combnPrim(300, 2) 5.145654 5.192439 4.83561 7.167839 4.320497 3.98992 100
combn(300, 2) 204.866624 192.559119 143.75540 174.079339 102.733367 539.12325 100
comboGeneral(300, 2) 1.000000 1.000000 1.00000 1.000000 1.000000 1.00000 100
Now, we create a function to create some random reproducible data that will be passed to our test functions:
makeTestSet <- function(vectorSize, elementSize, mySeed = 42, withRep = FALSE) {
set.seed(mySeed)
sapply(1:vectorSize, function(x) {
paste(sample(10^6, s1 <- sample(2:elementSize, 1), replace = withRep), collapse = " ")
})
}
makeTestSet(5, 3)
[1] "937076 286140 830446" "519096 736588 134667" "705065 457742 719111"
[4] "255429 462293 940013" "117488 474997 560332"
That looks good. Now, lets see if setting fixed = TRUE
gets us any gains (as suggested above by @MichaelChirico):
bigVec <- makeTestSet(10, 100000)
microbenchmark(standard = strsplit(bigVec, " "),
withFixed = strsplit(bigVec, " ", fixed = TRUE),
times = 15, unit = "relative")
Unit: relative
expr min lq mean median uq max neval
standard 4.447413 4.296662 4.133797 4.339537 4.084019 3.415639 15
withFixed 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 15
@MichaelChirico was spot on. Putting it all together we get:
combPairFast <- function(testVec) {
lapply(strsplit(testVec, " ", fixed = TRUE), function(x) {
combs <- RcppAlgos::comboGeneral(as.numeric(x), 2)
unique(combs[,1] * (10)^(as.integer(log10(combs[,2])) + 1L) + combs[,2])
})
}
## test.vector defined above by OP
combPairFast(test.vector)
[[1]]
[1] 335261344015 335261537633 344015537633
[[2]]
[1] 22404132858
[[3]]
[1] 254654355860 254654488288 355860488288
[[4]]
[1] 219943373817
[[5]]
[1] 331839404477
## OP original code
combPairOP <- function(testVec) {
lapply(strsplit(testVec, " "), function(x) unique(apply(combn(x, 2), 2, function(y) paste0(y, collapse = ""))))
}
As stated in the comments by the OP, the maximum number is less than a million (600000 to be exact), which means that after we multiply one of the numbers by at most 10^6 and add it to another 6 digit number (equivalent to simply concatenating two strings of numbers), we are guaranteed to be within the numerical precision of base R (i.e. 2^53 - 1
). This is good because arithmetic operations on numerical numbers is much more efficient than strings operations.
All that is left is to benchmark:
test.vector <- makeTestSet(100, 50)
microbenchmark(combPairOP(test.vector),
combPairFast(test.vector),
times = 20, unit = "relative")
Unit: relative
expr min lq mean median uq max neval
combPairOP(test.vector) 22.33991 22.4264 21.67291 22.11017 21.729 25.23342 20
combPairFast(test.vector) 1.00000 1.0000 1.00000 1.00000 1.000 1.00000 20
And on larger vectors:
bigTest.vector <- makeTestSet(1000, 100, mySeed = 22, withRep = TRUE)
## Duplicate values exist
any(sapply(strsplit(bigTest.vector, " ", fixed = TRUE), function(x) {
any(duplicated(x))
}))
[1] TRUE
system.time(t1 <- combPairFast(bigTest.vector))
user system elapsed
0.303 0.011 0.314
system.time(t2 <- combPairOP(bigTest.vector))
user system elapsed
8.820 0.081 8.902 ### 8.902 / 0.314 ~= 28x faster
## results are the same
all.equal(t1, lapply(t2, as.numeric))
[1] TRUE
How can I generate all the possible combinations of a vector
Try
combn(v1, 2, FUN=function(x) paste(rev(x), collapse="-"))
#[1] "B-A" "C-A" "D-A" "E-A" "C-B" "D-B" "E-B" "D-C" "E-C" "E-D"
If you want in the default order
combn(v1, 2, FUN=paste, collapse="-")
#[1] "A-B" "A-C" "A-D" "A-E" "B-C" "B-D" "B-E" "C-D" "C-E" "D-E"
Update
For a faster option, you can use combnPrim
from grBase
. Check here
library(grBase)
apply(combnPrim(v1,2), 2, FUN=paste, collapse='-')
#[1] "A-B" "A-C" "B-C" "A-D" "B-D" "C-D" "A-E" "B-E" "C-E" "D-E"
data
v1 <- LETTERS[1:5]
Related Topics
Pasting Two Vectors With Combinations of All Vectors' Elements
How to Put Labels Over Geom_Bar in R With Ggplot2
How to Extract Plot Axes' Ranges For a Ggplot2 Object
Replace Missing Values With Column Mean
Simplest Way to Do Grouped Barplot
Import Text File as Single Character String
Using Unicode 'Dingbat-Like' Glyphs in R Graphics, Across Devices & Platforms, Especially Pdf
Rotating X Axis Labels in R For Barplot
How to Use Grep()/Gsub() to Find Exact Match
Why Is the Parallel Package Slower Than Just Using Apply
Split Date-Time Column into Date and Time Variables
Ggplot2 Two-Line Label With Expression
What Are Replacement Functions in R