Use Rle to Group by Runs When Using Dplyr

Use rle to group by runs when using dplyr

One option seems to be the use of {} as in:

# Give every run of identical consecutive `x` values its own id (`yy`),
# then average `y` within each run.
dat %>%
  group_by(yy = {
    runs <- rle(x)
    rep(seq_along(runs$lengths), runs$lengths)
  }) %>%
  summarize(mean(y))
#Source: local data frame [4 x 2]
#
# yy mean(y)
# (int) (dbl)
#1 1 2.0
#2 2 4.5
#3 3 6.0
#4 4 7.0

It would be nice if future dplyr versions also had an equivalent of data.table's rleid function.


I noticed that this problem occurs when using a data.frame or tbl_df input, but not when using a tbl_dt or data.table input:

# Run-id grouping written with with(rle(x), ...); on a tbl_df input this
# stops with the error printed directly below.
dat %>% 
tbl_df %>%
group_by(yy = with(rle(x), rep(seq_along(lengths), lengths))) %>%
summarize(mean(y))
Error: cannot coerce type 'closure' to vector of type 'integer'

# The same with(rle(x), ...) grouping on a tbl_dt input; this one returns the
# per-run means printed below.
dat %>%
tbl_dt %>%
group_by(yy = with(rle(x), rep(seq_along(lengths), lengths))) %>%
summarize(mean(y))
Source: local data table [4 x 2]

yy mean(y)
(int) (dbl)
1 1 2.0
2 2 4.5
3 3 6.0
4 4 7.0

I reported this as an issue on dplyr's github page.

Using the `rle` function along with the `dplyr` `group_by` command to map a grouping variable

We could use rleid from data.table

library(data.table)
library(dplyr)
# Within each (b, c), collapse every run of `a` to one row (value + length),
# then pair each run with the run that follows it.
data %>%
  group_by(b, c, run_id = rleid(a)) %>%
  summarise(
    from = first(a),
    fromCount = n()
  ) %>%
  mutate(
    to = lead(from),
    toCount = lead(fromCount)
  ) %>%
  ungroup() %>%
  select(-run_id) %>%
  # The last run of a group has no successor, so its `to` is NA.
  filter(!is.na(to)) %>%
  arrange(c)
# A tibble: 6 x 6
# b c from fromCount to toCount
# <chr> <chr> <chr> <int> <chr> <int>
#1 experiment A01 a 1 b 3
#2 experiment A02 a 1 c 1
#3 experiment A02 c 1 a 1
#4 experiment A02 a 1 b 1
#5 control A03 d 3 e 1
#6 control A04 f 2 e 2

Or, using rle: after grouping by 'b' and 'c', summarise with rle to create a list column, then extract the 'values' and 'lengths' from that column within summarise. Create 'to' and 'toCount' as the lead of the 'from' and 'fromCount' columns, filter out the NA elements, and arrange the rows by the 'c' column.

# Store the rle() result of `a` per (b, c) in a list column, expand it into
# one row per run, then pair each run with the run that follows it.
data %>%
  group_by(b, c) %>%
  summarise(
    rl = list(rle(a)),
    from = rl[[1]]$values,
    fromCount = rl[[1]]$lengths
  ) %>%
  mutate(
    to = lead(from),
    toCount = lead(fromCount)
  ) %>%
  ungroup() %>%
  select(-rl) %>%
  # The last run of a group has no successor, so its `to` is NA.
  filter(!is.na(to)) %>%
  arrange(c)
# A tibble: 6 x 6
# b c from fromCount to toCount
# <chr> <chr> <chr> <int> <chr> <int>
#1 experiment A01 a 1 b 3
#2 experiment A02 a 1 c 1
#3 experiment A02 c 1 a 1
#4 experiment A02 a 1 b 1
#5 control A03 d 3 e 1
#6 control A04 f 2 e 2

We could also loop over the rle list column ('rl') with map, extract the components, and build a tibble that also holds the lead of the values and lengths. Then use unnest_wider to create the columns, unnest the list structure, filter out the NA elements, and arrange.

library(tidyr)
library(purrr)
# Keep one rle() object per (b, c) in the list column `rl`, turn each into a
# tibble of runs paired with their successor, then flatten the result.
data %>%
  group_by(b, c) %>%
  summarise(rl = list(rle(a))) %>%
  ungroup() %>%
  mutate(
    out = map(rl, function(run) {
      tibble(
        from = run$values,
        fromCount = run$lengths,
        to = lead(from),
        toCount = lead(fromCount)
      )
    })
  ) %>%
  unnest_wider(c(out)) %>%
  unnest(from:toCount) %>%
  # The last run of a group has no successor, so its `to` is NA.
  filter(!is.na(to)) %>%
  arrange(c) %>%
  select(-rl)

Replace consecutive repeat values based on different run lengths in R

What you could do:

# Relabel short gaps that sit between two letter runs as "<left>-<right>".
#
# x: character vector. Values found in LETTERS count as known; anything else
#    (for example the string "NA") is treated as a gap.
#
# Returns a character vector of the same length as x. A gap run is relabelled
# only when it has a letter run on both sides and the distance between those
# two letters in the alphabet is at least the length of the gap. Every other
# element is returned unchanged.
myfun <- function(x) {
  runs <- rle(x)
  pos <- match(runs$values, LETTERS)
  gaps <- which(is.na(pos))
  # A gap at the very start or end has only one neighbour. Indexing pos[0]
  # would silently drop an element and misalign the comparison, so skip them.
  gaps <- gaps[gaps > 1L & gaps < length(pos)]
  fits <- pos[gaps + 1L] - pos[gaps - 1L] >= runs$lengths[gaps]
  # which() discards NA comparisons (a neighbour that is itself a gap), so no
  # NA ever reaches the subscript of the assignment below.
  fill <- gaps[which(fits)]
  runs$values[fill] <- paste(
    runs$values[fill - 1L],
    runs$values[fill + 1L],
    sep = "-"
  )
  inverse.rle(runs)
}


# Run myfun on var1 separately for each id and write the result back to var1.
transform(dat, var1 = ave(var1, id, FUN = myfun))

id var1
1 1 A
2 1 A-B
3 1 B
4 1 A
5 1 NA
6 1 NA
7 1 B
8 2 A
9 2 NA
10 2 NA
11 2 NA
12 2 C
13 2 A
14 2 A-B
15 2 B
16 3 A
17 3 A-D
18 3 A-D
19 3 D
20 3 A
21 3 NA
22 3 NA
23 3 B

Create counter for runs of TRUE among FALSE and NA, by group

Another data.table approach:

library(data.table)
# Convert to a data.table in place.
setDT(dt)
# cr: id of every run of `criterium` over the whole table.
dt[, cr := rleid(criterium)]
# goal: on the rows where criterium is TRUE only, renumber those runs
# starting from 1 within each group; other rows are left without a value.
dt[(criterium), goal := rleid(cr), by = .(group)]

Summarize consecutive failures with dplyr and rle

We group by the 'customerId' and use do to perform the rle on 'isFailure' column. Extract the lengths that are 'TRUE' for values (lengths[values]), and create the 'Max' column with an if/else condition to return 0 for those that didn't have any 1 value.

 # Per customer: lengths of the runs where isFailure == 1, reduced to the
 # longest one (0 when the customer has no such run).
 df %>%
   group_by(customerId) %>%
   do({
     fail_runs <- with(rle(.$isFailure == 1), lengths[values])
     longest <- if (length(fail_runs) == 0) 0 else max(fail_runs)
     data.frame(customerId = .$customerId, Max = longest)
   }) %>%
   # do() returned one row per input row; keep a single row per customer.
   slice(1L)
# customerId Max
#1 1 0
#2 2 1
#3 3 2

Use dplyr to summarize but preserve date of group row

You could either include Date in summarise

library(dplyr)

# Number each run of identical consecutive `Flare` values (`yy`), then
# summarise every run, carrying its latest Date along.
df %>%
  group_by(yy = {
    runs <- rle(Flare)
    rep(seq_along(runs$lengths), runs$lengths)
  }) %>%
  summarize(
    Painmed_UseCum = sum(Painmed_Use),
    FlareLength = n(),
    Date = max(Date)
  )

# Groups: yy, Flare [5]
# Date Flare Painmed_Use yy
# <date> <int> <int> <int>
#1 2015-12-06 0 1 1
#2 2015-12-10 1 0 2
#3 2015-12-12 0 0 3
#4 2015-12-15 1 1 4
#5 2015-12-16 0 0 5

Or, if there are more columns to preserve, a better approach is to use mutate and select the last row in each group.

# Number each run of identical consecutive `Flare` values (`yy`), attach the
# per-run totals to every row, then keep only the last row of each run so
# all other columns are preserved.
df %>%
  group_by(yy = {
    runs <- rle(Flare)
    rep(seq_along(runs$lengths), runs$lengths)
  }) %>%
  mutate(
    Painmed_UseCum = sum(Painmed_Use),
    FlareLength = n()
  ) %>%
  slice(n())

To create groups, we can replace rle with rleid from data.table which would be simpler.

# Pipeline step only: data.table::rleid() is used here in place of the
# rle()/rep() expression to build the run id `yy`.
group_by(yy = data.table::rleid(Flare))


Related Topics



Leave a reply



Submit