Use Outer Instead of Expand.Grid

Use outer instead of expand.grid

Using rep.int:

expand.grid.alt <- function(seq1,seq2) {
  cbind(rep.int(seq1, length(seq2)),
        c(t(matrix(rep.int(seq2, length(seq1)), nrow=length(seq2)))))
}

expand.grid.alt(seq_len(nrow(dat)), seq_len(ncol(dat)))

In my computer is like 6 times faster than expand.grid.

Non-redundant version of expand.grid

How about using outer? But this particular function concatenates them into one character string.

outer( c("aa", "ab", "cc"), c("aa", "ab", "cc") , "paste" )
#     [,1]    [,2]    [,3]   
#[1,] "aa aa" "aa ab" "aa cc"
#[2,] "ab aa" "ab ab" "ab cc"
#[3,] "cc aa" "cc ab" "cc cc"

You can also use combn on the unique elements of the two vectors if you don't want the repeating elements (e.g. aa aa)

vals <- c( c("aa", "ab", "cc"), c("aa", "ab", "cc") )
vals <- unique( vals )
combn( vals , 2 )
#     [,1] [,2] [,3]
#[1,] "aa" "aa" "ab"
#[2,] "ab" "cc" "cc"

How to speed up `expand.grid()` in R?

You may try data.table::CJ function.

bench::mark(base = expand.grid(year, names),
            jc = expand.grid.jc(year, names),
            tidyr1 = tidyr::expand_grid(year, names), 
            tidyr2 = tidyr::crossing(year, names), 
            dt = data.table::CJ(year, names),
            check = FALSE, iterations = 10)

#  expression      min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result memory  time   gc   
#  <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list> <list>  <list> <lis>
#1 base       635.48ms 715.02ms     1.25      699MB    2.00     10    16      8.02s <NULL> <Rprof… <benc… <tib…
#2 jc            5.66s    5.76s     0.172     820MB    0.275    10    16     58.13s <NULL> <Rprof… <benc… <tib…
#3 tidyr1     195.03ms 268.97ms     4.01      308MB    2.00     10     5       2.5s <NULL> <Rprof… <benc… <tib…
#4 tidyr2     590.91ms 748.35ms     1.31      312MB    0.656    10     5      7.62s <NULL> <Rprof… <benc… <tib…
#5 dt          318.1ms 384.21ms     2.47      206MB    0.986    10     4      4.06s <NULL> <Rprof… <benc… <tib…

PS - Also included tidyr::crossing for comparison as it does the same thing.

R - List of combinations with outer() and expand.grid()

I'm not sure I understand correctly, but I hope this helps:

#function to `outer`
fun <- function(x, y) 
{ 
 a1 <- get(paste0("a", x))
 a2 <- get(paste0("a", y))
 res <- apply(expand.grid(a1, a2), 1, paste, collapse = "")
 res2 <- paste(res, collapse = ";")

 return(res2)
}

#`outer` a vectorized `fun`
m2 <- outer(primes, primes, Vectorize(fun))
#select `upper.tri`
unq2 <- m2[upper.tri(m2)]

#combine to a list
myls <- lapply(as.list(unq2), function(x) as.numeric(unlist(strsplit(x, ";"))))
names(myls) <- unq

myls
#$`6`
#[1] 11 11 12 12

#$`10`
#[1] 12 12 12 12

#$`15`
#[1] 12 22 12 22

#$`14`
#[1] 11 11 13 13
#...

R: expand grid of all possible combinations within groups and apply functions across all the pairs

Use exand.grid you get all possible combination of columns, split the data by time and apply fun for each row of tmp.

library(dplyr)
library(purrr)

tmp <- expand.grid(firm1 = names(data[-1]), firm2 = names(data[-1]))

fun <- function(x, y) sum(x, y)

result <- data %>%
  group_split(time) %>%
  map_df(~cbind(time = .x$time[1], tmp, 
                value = apply(tmp, 1, function(x) fun(.x[[x[1]]], .x[[x[2]]]))))

result

#   time firm1 firm2 value
#1     1     a     a     6
#2     1     b     a    10
#3     1     c     a     5
#4     1     a     b    10
#5     1     b     b    14
#6     1     c     b     9
#7     1     a     c     5
#8     1     b     c     9
#9     1     c     c     4
#10    2     a     a    14
#11    2     b     a    10
#12    2     c     a     9
#13    2     a     b    10
#14    2     b     b     6
#15    2     c     b     5
#16    2     a     c     9
#17    2     b     c     5
#18    2     c     c     4

You may also do this in base R -

result <- do.call(rbind, by(data, data$time, function(x) {
  cbind(time = x$time[1], tmp, 
        value = apply(tmp, 1, function(y) fun(x[[y[1]]], x[[y[2]]])))
}))

split and expand.grid by group on large data set

One possible solution which avoids repetitions of the same pair as well as different orders is using the data.table and combinat packages:

library(data.table)
setDT(df)[order(id), data.table(combinat::combn2(unique(id))), by = group]

     group        V1        V2
1: 2365686 209044052 209044061
2: 2365686 209044052 209044062
3: 2365686 209044061 209044062
4:  387969 209044061 209044062
5:  388978 209044061 209044062
6: 2278460 209044182 209044183

order(id) is used here just for convenience to better check the results but can be skipped in production code.

Replace `combn2()` by a non-equi join

There is another approach where the call to combn2() is replaced by a non-equi join:

mdf <- setDT(df)[order(id), unique(id), by = group]
mdf[mdf, on = .(group, V1 < V1), .(group, x.V1, i.V1), nomatch = 0L,
    allow.cartesian = TRUE]

     group        V1        V2
1: 2365686 209044052 209044061
2: 2365686 209044052 209044062
3: 2365686 209044061 209044062
4:  387969 209044061 209044062
5:  388978 209044061 209044062
6: 2278460 209044182 209044183

Note that the non-equi join requires the data to be ordered.

Benchmark

The second method seems to be much faster

# create benchmark data
nr <- 1.2e5L # number of rows
rg <- 8L # number of ids within each group
ng <- nr / rg # number of groups
set.seed(1L)
df2 <- data.table(
  id = sample.int(rg, nr, TRUE),
  group = sample.int(ng, nr, TRUE)
)

#benchmark code
microbenchmark::microbenchmark(
  combn2 = df2[order(group, id), data.table((combinat::combn2(unique(id)))), by = group],
  nej = {
    mdf <- df2[order(group, id), unique(id), by = group]
    mdf[mdf, on = .(group, V1 < V1), .(group, x.V1, i.V1), nomatch = 0L,
        allow.cartesian = TRUE]},
  times = 1L)

For 120000 rows and 14994 groups the timings are:

Unit: milliseconds
   expr        min         lq       mean     median         uq        max neval
 combn2 10259.1115 10259.1115 10259.1115 10259.1115 10259.1115 10259.1115     1
    nej   137.3228   137.3228   137.3228   137.3228   137.3228   137.3228     1

Caveat

As pointed out by the OP the number of id per group is crucial in terms of memory consumption and speed. The number of combinations is of O(n²), exactly n * (n-1) / 2 or choose(n, 2L) if n is the number of ids.

The size of the largest group can be found by

df2[, uniqueN(id), by = group][, max(V1)]

The total number of rows in the final result can be computed in advance by

df2[, uniqueN(id), by = group][, sum(choose(V1, 2L))]

Alternative to expand.grid for data.frames

Why not just something like df[rep(1:nrow(df),times = 3),] to extend the data frame, and then add the extra column just as you have above, with df$Time <- rep(1:lengthTime, each=nrRow)?

Expand Grid With Unknown Number of Vectors - R

Usually the best way to do the same operation on multiple objects is to put them in a list. Once you've got the data arranged in this way, do.call can be used, so you might end up with something like:

dfs <- list(df1, df2, df3, df4, df5) 
matrix <- do.call(expand.grid, dfs[1:n])

In the specific case of expand.grid, the functionality is built in, so that you can just do

matrix <- expand.grid(dfs[1:n])

Note that if df1... really are data.frames, your code and this code produce a warning and possibly unexpected results. To avoid the warning df1... should be vectors.

How to use expand.grid with conditions?

Here is another base R way. It uses a logical index to modify columns d and e, the rest of the code is like in the question. The tests below show it's the fastest alternative.

f1 <- function(a, b, c, d, e){
  X <- expand.grid(a, b, c, d, e)
  names(X) <- c("a","b","c","d","e")
  X$d <- ifelse(X$c == 0, X$d[1], X$d)
  X$e <- ifelse(X$c == 0, X$d[1], X$e)
  unique(X)
}

f2 <- function(a, b, c, d, e){
  X <- expand.grid(a, b, c, d, e)
  names(X) <- c("a","b","c","d","e")
  i <- X$c == 0
  X$d[i] <- X$d[1]
  X$e[i] <- X$e[1]
  unique(X)
}

library(tidyr)
library(dplyr)

f3 <- function(a, b, c, d, e){
  crossing(a, b, c, d, e) %>% 
    mutate_at(vars(d, e), ~ replace(., c == 0, first(.))) %>%
    distinct
}

a = 1:5
b = 1:5
c = 0:3
d = 1:5
e = 1:3

library(microbenchmark)

mb <- microbenchmark(
  op = f1(a,b,c,d,e),
  rui = f2(a,b,c,d,e),
  akrun = f3(a,b,c,d,e)
)

print(mb, unit = "relative", order = "median")
#Unit: relative
#  expr       min       lq     mean   median       uq      max neval cld
#   rui 1.0000000 1.000000 1.000000 1.000000 1.000000 1.000000   100  a 
#    op 0.8147996 1.035322 1.018649 1.026295 1.038269 1.096384   100  a 
# akrun 1.7580304 1.815582 1.836061 1.827887 1.872767 1.107545   100   b

How can I fast outer-join and filter two vectors (or lists), preferably in base R?

The main reason your solution is so slow is because of the line cp <- c( cp, list(c(d1, d2))). This is a very inefficient way to grow an object because it results in the object being copied with each c() call.

If you instead insert into a list you will see substantially better performance. We can make a couple of other small optimisations:

Checking whether FUN is missing outside of the main loop so that we just need to do it once.
Allocating a vector of the correct length up-front if FUN is missing, since we know the length of the output for this case.

outer_join <- function(x, y = x, FUN) {
  fmissing <- missing(FUN)
  if (fmissing) {
    cp <- vector("list", length(x) * length(y))
  } else {
    cp <- list()
  }
  i <- 1L
  for (d1 in x) {
    for (d2 in y) {
      if (fmissing || FUN(d1, d2)) {
        cp[[i]] <- c(d1, d2)
        i <- i + 1L
      }
    }
  }
  cp
}

microbenchmark::microbenchmark(
  `Ex. 1` = outer_join(seq(2^8)),
  `Ex. 2` = outer_join(seq(2^8), FUN = `==`),
  `Ex. 3` = outer_join(seq(2^8), FUN = function(a, b) (a - b) %% 7L == 0),
  times = 10,
  unit = "s"
)
#> Unit: seconds
#>   expr        min         lq       mean     median         uq        max neval
#>  Ex. 1 0.02300627 0.02473937 0.02787098 0.02566033 0.03057122 0.03753821    10
#>  Ex. 2 0.01391696 0.01527710 0.01785506 0.01735052 0.01916601 0.02490142    10
#>  Ex. 3 0.05839193 0.06460381 0.07189763 0.07218238 0.08215803 0.08275439    10

Also see this chapter from Hadley Wickham's Advanced R for a discussion of issues with growing objects, as well as Chapter 2 of R Inferno. My experience is that R is not as slow at loops as it is reputed to be, as long as you avoid growing objects inefficiently.

Use Outer Instead of Expand.Grid