Group Vector on Conditional Sum

Group vector on conditional sum

Here is my Rcpp-solution (close to Khashaa's solution but a bit shorter/stripped down), because you said speed was important, Rcppis probably the way to go:

# create the data
set.seed(1)
x <- sample(10, 20, replace = TRUE)
y <- c(1, 1, 1, 2, 2, 3, 4, 5 ,5, 5, 6, 6, 6, 7, 7, 8, 8, 9, 9, 10)

# create the Rcpp function
library(Rcpp)
cppFunction('
IntegerVector sotosGroup(NumericVector x, int cutoff) {
 IntegerVector groupVec (x.size());
 int group = 1;
 double runSum = 0;
 for (int i = 0; i < x.size(); i++) {
  runSum += x[i];
  if (runSum > cutoff) {
   group++;
   runSum = x[i];
  }
  groupVec[i] = group;
 }
 return groupVec;
}
')

# use the function as usual
y_cpp <- sotosGroup(x, 15)
sapply(split(x, y_cpp), sum)
#>  1  2  3  4  5  6  7  8  9 10 
#> 13 13  9 10 15 12 12 13 14  8

all.equal(y, y_cpp)
#> [1] TRUE

In case anyone needs to be convinced by the speed:

# Speed Benchmarks
library(data.table)
library(microbenchmark)
dt <- data.table(x)

frank <- function(DT, n = 15) {
 DT[, xc := cumsum(x)]
 b = DT[.(shift(xc, fill=0) + n + 1), on=.(xc), roll=-Inf, which=TRUE]
 z = 1; res = z
 while (!is.na(z)) 
  res <- c(res, z <- b[z])
 DT[, g := cumsum(.I %in% res)][]
}

microbenchmark(
 frank(dt),
 sotosGroup(x, 15),
 times = 100
)
#> Unit: microseconds
#>               expr      min       lq       mean    median       uq       max neval cld
#>          frank(dt) 1720.589 1831.320 2148.83096 1878.0725 1981.576 13728.830   100   b
#>  sotosGroup(x, 15)    2.595    3.962    6.47038    7.5035    8.290    11.579   100  a

R: Conditional summation of a numeric vector

Try this:

MakeNonNeg <- function(v) {
    size <- length(v)
    myOut <- as.numeric(v)
    if (size > 1L) {
        for (i in 1:(size-1L)) {
            if (myOut[i] >= 0) {next}
            myOut[i+1L] <- myOut[i]+myOut[i+1L]
            myOut[i] <- 0
        }
    }
    myOut
}

MakeNonNeg(inVector)
[1]  2  0  0 29  7

Below is a more exotic example:

set.seed(4242)

BigVec <- sample(-40000:100000, 100000, replace = TRUE)
gmp::sum.bigz(BigVec)
Big Integer ('bigz') :
    [1] 2997861106

t3 <- MakeNonNeg(BigVec)
gmp::sum.bigz(t3)
Big Integer ('bigz') :
    [1] 2997861106

BigVec[1:20]
[1]  98056   8680  -7814  53620  58390  90832  74970 -16392  52648  83779 -17229  38484 -36589  75156  71200  95968 -11599  57705
[19]  19209 -21596

t3[1:20]
[1] 98056  8680     0 45806 58390 90832 74970     0 36256 83779     0 21255     0 38567 71200 95968     0 46106 19209     0

Here is my system info:

sessionInfo()
R version 3.3.0 (2016-05-03)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 7 x64 (build 7601) Service Pack 1

Below are timings for both functions with JIT disabled.

microbenchmark(
    makeNonNeg = MakeNonNeg(BigVec),
    zeroElement = zeroElement(BigVec),
    times=10)
Unit: milliseconds
       expr      min       lq     mean   median       uq      max neval
 makeNonNeg 254.1255 255.8430 267.9527 258.6369 277.0222 303.6516    10
zeroElement 152.0358 164.7988 175.3191 166.4948 198.3855 209.8739    10

With JIT enabled, we obtain much different results for makeNonNeg. However, the results for zeroElement don't change that much (I'm thinking that since Reduce is the major part of the function and it is already bytecoded, there is not much room for improvement).

library(compiler)
enableJIT(3)
[1] 0

microbenchmark(
    makeNonNeg = MakeNonNeg(BigVec),
    zeroElement = zeroElement(BigVec),
    times=10)
Unit: milliseconds
       expr       min        lq      mean    median        uq       max neval
 makeNonNeg  11.20514  11.55366  12.76953  11.84655  12.20554  20.60036    10
zeroElement 144.15123 149.33591 163.66421 157.34711 176.20139 198.57268    10

So, with JIT disabled, zeroElement is about 50% faster and when JIT is enabled, MakeNonNeg is about 13x faster.

Creating a vector of conditional sums

Since you did not provide a reproducible example, I'll use the iris dataset.

aggregate(iris[,1:4], by=list(iris$Species), sum)
#     Group.1 Sepal.Length Sepal.Width Petal.Length Petal.Width
#1     setosa        250.3       171.4         73.1        12.3
#2 versicolor        296.8       138.5        213.0        66.3
#3  virginica        329.4       148.7        277.6       101.3

Create grouping based on cumulative sum and another group

This works! It uses a combination of purrr's accumulate (similar to cumsum but more versatile) and cumsum with appropriate use of group_by to get what you're looking for. I've added comments to indicate what each part is doing. I'll note that next_group2 is a bit of a misnomer--it's more of a not_next_group2, but hopefully the rest is clear.

library(tidyverse)

domain <- c(rep(1,5),rep(2,8))
value <- c(1,0,2,2.5,0.1,0.1,0.5,0,0.2,0.6,0,0,0.1)
df_raw <- data.frame(domain,value)

## Modified from: https://stackoverflow.com/questions/49076769/dplyr-r-cumulative-sum-with-reset
sum_reset_at = function(val_col, threshold, include.equals = TRUE) {
  if (include.equals) {
    purrr::accumulate({{val_col}}, ~if_else(.x>=threshold , .y, .x+.y))
  } else {
    purrr::accumulate({{val_col}}, ~if_else(.x>threshold , .y, .x+.y))
  }
}

df_raw %>% 
  group_by(domain) %>% 
  mutate(cumsum_val = sum_reset_at(value, 1)) %>% 
  mutate(next_group1 = ifelse(lag(cumsum_val) >= 1 | row_number() == 1, 1, 0)) %>% ## binary interpretation of whether there should be a new group
  ungroup %>% 
  mutate(group1 = cumsum(next_group1)) %>% ## generate new groups
  group_by(domain, group1) %>%
  mutate(next_group2 = ifelse(max(cumsum_val) < 1 & row_number() == 1, 1, 0)) %>% ## similar to above, but grouped by your new group1; we ask it only to transition at the first value of the group that doesn't reach 1
  ungroup %>% 
  mutate(group2 = cumsum(next_group1 - next_group2)) %>% ## cancel out the next_group1 binary if it meets the conditions of next_group2
  select(-starts_with("next_"))

And as specified, this produces:

# A tibble: 13 x 5
   domain value cumsum_val group1 group2
    <dbl> <dbl>      <dbl>  <dbl>  <dbl>
 1      1   1          1        1      1
 2      1   0          0        2      2
 3      1   2          2        2      2
 4      1   2.5        2.5      3      3
 5      1   0.1        0.1      4      3
 6      2   0.1        0.1      5      4
 7      2   0.5        0.6      5      4
 8      2   0          0.6      5      4
 9      2   0.2        0.8      5      4
10      2   0.6        1.4      5      4
11      2   0          0        6      4
12      2   0          0        6      4
13      2   0.1        0.1      6      4

Grouping elements based on their sum

We can do a double cumsum to create a grouping variable and split the vector ('x') based on that.

lst <- split(x, cumsum(cumsum(x)>10))

If we need individual objects in the global environment, list2env can be used (but not recommended)

list2env(setNames(lst, letters[seq_along(lst)]), envir = .GlobalEnv)

Total sum of a numeric vector conditional on values from a character vector

Test data, in a dataframe for convenience rather than two vectors:

> d = data.frame(x=runif(10),y=sample(LETTERS[1:3],10,TRUE))
> d
            x y
1  0.25927547 B
2  0.95012667 C
3  0.85133149 C
4  0.64658480 B
etc

Use tapply:

> tapply(d$x,d$y,sum)
       A        B        C 
1.547225 1.891884 2.666552

Convince yourself:

> sum(d$x[d$y=="A"])
[1] 1.547225
> sum(d$x[d$y=="B"])
[1] 1.891884
> sum(d$x[d$y=="C"])
[1] 2.666552

The aggregate function with a formula argument makes it easy to do with more than one column:

> d=data.frame(x=runif(10),y=sample(LETTERS[1:3],10,TRUE),z=sample(LETTERS[1:3],10,TRUE))
> d
           x y z
1  0.4166217 A C
2  0.5816940 B C
3  0.9915231 A C
4  0.7177323 B C
etc

We want to sum x within classes defined by y and z.

> aggregate(x~y+z,d,sum)
  y z         x
1 C A 1.6392171
2 B B 0.9389463
3 C B 0.3330299
4 A C 2.3748477
5 B C 1.2994263

Convince yourself:

> sum(d$x[d$y=="C" & d$z=="A"])
[1] 1.639217
> sum(d$x[d$y=="B" & d$z=="B"])
[1] 0.9389463

Note that C,A is a separate entry to A,C...

Group Vector on Conditional Sum