Generate Sequence Within Group in R

generate sequence within group in R

Use ave with seq_along:

> mydf$C <- with(mydf, ave(A, A, B, FUN = seq_along))
> mydf
A B C
1 1 1 1
2 1 2 1
3 1 2 2
4 1 3 1
5 1 3 2
6 1 3 3
7 1 4 1
8 1 4 2

If your data are already ordered (as they are in this case), you can also use sequence with rle (mydf$C <- sequence(rle(do.call(paste, mydf))$lengths)), but you don't have that limitation with ave.

If you're a data.table fan, you can make use of .N as follows:

library(data.table)
DT <- data.table(mydf)
DT[, C := sequence(.N), by = c("A", "B")]
DT
# A B C
# 1: 1 1 1
# 2: 1 2 1
# 3: 1 2 2
# 4: 1 3 1
# 5: 1 3 2
# 6: 1 3 3
# 7: 1 4 1
# 8: 1 4 2

Generate sequence within sub group in data.table

We group by 'id1', sort the 'val' and then create 'grp' as row_number()

input %>%
group_by(id1) %>%
mutate(val = sort(val), grp= row_number())

Or another option is to arrange

input %>%
arrange(id1, val) %>%
group_by(id1) %>%
mutate(grp = row_number())

Or using data.table

library(data.table)
setDT(input)[, c("grp", "val") := .(seq_len(.N), sort(val)), by = id1]
input
# id1 val grp
#1: 1 1 1
#2: 1 2 2
#3: 1 3 3
#4: 1 4 4
#5: 2 3 1
#6: 2 4 2
#7: 2 5 3

If we need to sort as well, use setorder based on the 'id1' and 'val' to order in place, then create the 'grp' as the rowid of 'id1'

input <- data.frame("id1"=c(1,1,1,1,2,2,2),val=c(2,3,4,1,4,3,5), 
achar=c('a','a','b','b','d','c','e'))
setorder(setDT(input), id1, val)[, grp := rowid(id1)][]
# id1 val achar grp
#1: 1 1 b 1
#2: 1 2 a 2
#3: 1 3 a 3
#4: 1 4 b 4
#5: 2 3 c 1
#6: 2 4 d 2
#7: 2 5 e 3

Create a sequential number within each group

A simple solution with Base R:

df$seq <- ave(sapply(df$gap, identical, "gap"), df$id, FUN = cumsum)
df
#> id date lc lon lat gap_days gap seq
#> 1 20162.03 2003-10-19 14:33:00 Tagging -39.370 -18.480 NA <NA> 0
#> 2 20162.03 2003-10-21 12:19:00 1 -38.517 -18.253 1.90694444 gap 1
#> 3 20162.03 2003-10-21 13:33:00 1 -38.464 -18.302 0.05138889 no 1
#> 4 20162.03 2003-10-21 16:38:00 A -38.461 -18.425 0.12847222 no 1
#> 5 20162.03 2003-10-21 18:50:00 A -38.322 -18.512 0.09166667 no 1
#> 6 20162.03 2003-10-23 10:33:00 B -38.674 -19.824 1.65486111 gap 2
#> 7 20162.03 2003-10-23 17:52:00 B -38.957 -19.511 0.30486111 no 2
#> 8 20162.03 2003-11-02 08:14:00 B -42.084 -24.071 9.59861111 gap 3
#> 9 20162.03 2003-11-02 09:36:00 A -41.999 -24.114 0.05694444 no 3
#> 10 20687.03 2003-10-27 17:02:00 Tagging -39.320 -18.460 NA <NA> 0
#> 11 20687.03 2003-10-27 19:44:00 2 -39.306 -18.454 0.11250000 no 0
#> 12 20687.03 2003-10-27 21:05:00 1 -39.301 -18.458 0.05625000 no 0

And then split it:

split(df, list(df$id, df$seq), drop = TRUE)
#> $`20162.03.0`
#> id date lc lon lat gap_days gap seq
#> 1 20162.03 2003-10-19 14:33:00 Tagging -39.37 -18.48 NA <NA> 0
#>
#> $`20687.03.0`
#> id date lc lon lat gap_days gap seq
#> 10 20687.03 2003-10-27 17:02:00 Tagging -39.320 -18.460 NA <NA> 0
#> 11 20687.03 2003-10-27 19:44:00 2 -39.306 -18.454 0.11250 no 0
#> 12 20687.03 2003-10-27 21:05:00 1 -39.301 -18.458 0.05625 no 0
#>
#> $`20162.03.1`
#> id date lc lon lat gap_days gap seq
#> 2 20162.03 2003-10-21 12:19:00 1 -38.517 -18.253 1.90694444 gap 1
#> 3 20162.03 2003-10-21 13:33:00 1 -38.464 -18.302 0.05138889 no 1
#> 4 20162.03 2003-10-21 16:38:00 A -38.461 -18.425 0.12847222 no 1
#> 5 20162.03 2003-10-21 18:50:00 A -38.322 -18.512 0.09166667 no 1
#>
#> $`20162.03.2`
#> id date lc lon lat gap_days gap seq
#> 6 20162.03 2003-10-23 10:33:00 B -38.674 -19.824 1.6548611 gap 2
#> 7 20162.03 2003-10-23 17:52:00 B -38.957 -19.511 0.3048611 no 2
#>
#> $`20162.03.3`
#> id date lc lon lat gap_days gap seq
#> 8 20162.03 2003-11-02 08:14:00 B -42.084 -24.071 9.59861111 gap 3
#> 9 20162.03 2003-11-02 09:36:00 A -41.999 -24.114 0.05694444 no 3

Code sequence by group in R: recurring values within group

In data.table, we have rleid function which makes it simple here.

library(data.table)

setDT(df)[, seq1 := seq_len(.N), .(ID, rleid(loc))]

df
# ID yr loc seq seq1
# 1: 1 1990 A 1 1
# 2: 1 1991 A 2 2
# 3: 1 1992 B 1 1
# 4: 1 1993 B 2 2
# 5: 1 1994 B 3 3
# 6: 2 1990 B 1 1
# 7: 2 1991 B 2 2
# 8: 2 1992 A 1 1
# 9: 2 1993 B 1 1
#10: 2 1994 B 2 2
#11: 3 1990 C 1 1
#12: 3 1991 C 2 2
#13: 3 1992 C 3 3
#14: 3 1993 B 1 1
#15: 3 1994 C 1 1

We can use rleid in dplyr and base R approaches to get expected output.

library(dplyr)
df %>%
group_by(ID, grp = data.table::rleid(loc)) %>%
mutate(seq1 = row_number())

Or in base R :

df$seq1 <- with(df, ave(yr, ID, data.table::rleid(loc), FUN = seq_along))

A concise option suggested by @chinsoon12 is to use rowid function.

setDT(df)[, seq2 := rowid(ID, rleid(loc))]

data

df <- structure(list(ID = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 
3L, 3L, 3L, 3L, 3L), yr = c(1990L, 1991L, 1992L, 1993L, 1994L,
1990L, 1991L, 1992L, 1993L, 1994L, 1990L, 1991L, 1992L, 1993L,
1994L), loc = structure(c(1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L,
2L, 3L, 3L, 3L, 2L, 3L), .Label = c("A", "B", "C"), class = "factor"),
seq = c(1L, 2L, 1L, 2L, 3L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 3L,
1L, 1L)), class = "data.frame", row.names = c(NA, -15L))

Numbering rows within groups in a data frame

Use ave, ddply, dplyr or data.table:

df$num <- ave(df$val, df$cat, FUN = seq_along)

or:

library(plyr)
ddply(df, .(cat), mutate, id = seq_along(val))

or:

library(dplyr)
df %>% group_by(cat) %>% mutate(id = row_number())

or (the most memory efficient, as it assigns by reference within DT):

library(data.table)
DT <- data.table(df)

DT[, id := seq_len(.N), by = cat]
DT[, id := rowid(cat)]

Create Group from Sequences

We can use rleid from data.table

library(data.table)
rleid(x)
#[1] 1 2 3 3 4 5 6

Or in base R with rle

with(rle(x), rep(seq_along(values), lengths))
#[1] 1 2 3 3 4 5 6

Or if we use the similar approach from OP

1 + cumsum(x != dplyr::lag(x, default = first(x)))

How to create a sequency by group from a specific string in R?

You can first create a column specifying the first occurrence of "UNP", then use cumsum() and lag() to calculate the Seq column.

library(dplyr)

df <- read.table(header = T, text = "
ColA Colb Seq
A HM 0
A RES 0
A UNP 0
A RES 1
A RES 2
A HM 3
B HM 0
B RES 0
B UNP 0
B RES 1
B UNP 2
C UNP 0") %>%
select(-Seq)

df %>%
group_by(ColA, Colb) %>%
mutate(seq_count = ifelse(first(Colb) == "UNP" & !duplicated(Colb), 1, 0)) %>%
group_by(ColA) %>%
mutate(Seq = lag(cumsum(cumsum(seq_count)), default = 0), .keep = "unused")
#> # A tibble: 12 × 3
#> # Groups: ColA [3]
#> ColA Colb Seq
#> <chr> <chr> <dbl>
#> 1 A HM 0
#> 2 A RES 0
#> 3 A UNP 0
#> 4 A RES 1
#> 5 A RES 2
#> 6 A HM 3
#> 7 B HM 0
#> 8 B RES 0
#> 9 B UNP 0
#> 10 B RES 1
#> 11 B UNP 2
#> 12 C UNP 0

Created on 2022-03-31 by the reprex package (v2.0.1)

Sequentially Number Group within Group

One way would be to join your original data to a summarized version of the same data where you assign group id's for the variable in question. Here's an example on a standard data set:

left_join(mtcars, mtcars %>% group_by(gear) %>% summarize(id = cur_group_id()))

Or, using a version of data like yours:

dft <- data.frame(
stringsAsFactors = FALSE,
pmid = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L),
item = c("Age", "Age", "BMI", "BMI", "Age", "Age", "BMI", "Duration")
)

left_join(
dft,
dft %>%
group_by(item) %>%
summarize(id = cur_group_id()))

Result

Joining, by = "item"
pmid item id
1 1 Age 1
2 1 Age 1
3 1 BMI 2
4 1 BMI 2
5 2 Age 1
6 2 Age 1
7 2 BMI 2
8 2 Duration 3

Please note, for future questions, it will make it easier for others to help you if you can provide some example data as code we can load, as opposed to a printout of what the data looks like. It's often easiest to use the magical dput function, by running dput(dft) or dput(head(dft, 50)) and pasting the output into the body of your question.

Create a sequence of unique observations by group with dplyr and create a difference in months column

Using zoo::as.yearmon(), however, I had to round because otherwise 2015-11-26 to 2015-12-26 is considered longer than one month. Perhaps someone can comment/edit/explain how to make that particular calculation more "intuitive".

library(dplyr)
library(zoo)

df %>%
group_by(User) %>%
mutate(Count = 1:n(),
Gap_In_Months = round(12 * as.numeric(as.yearmon(Date) - as.yearmon(lag(Date))), 1),
Gap = ifelse(Gap_In_Months <= 1 | is.na(Gap_In_Months), 0, Gap_In_Months))

# User Date Count Gap_In_Months Gap
# (fctr) (fctr) (int) (dbl) (dbl)
# 1 aaaa 2015-11-26 1 NA 0
# 2 aaaa 2015-12-26 2 1 0
# 3 aaaa 2016-01-26 3 1 0
# 4 bbbb 2014-10-15 1 NA 0
# 5 bbbb 2014-11-15 2 1 0
# 6 bbbb 2015-05-16 3 6 6

Perhaps you want to be more specific as to "what is a month"? 30 days? 31 days? 28 days?

If that's the case, we can utilize lubridate:

library(lubridate)

df %>%
group_by(User) %>%
mutate(Count = 1:n(),
Diff_Time = ymd(Date) - ymd(lag(Date)),
Gap = ifelse(Diff_Time <= ddays(31) | is.na(Diff_Time), 0, as.numeric(Diff_Time, units = "days")))

# User Date Count Diff_Time Gap
# (fctr) (fctr) (int) (dfft) (dbl)
# 1 aaaa 2015-11-26 1 NA days 0
# 2 aaaa 2015-12-26 2 30 days 0
# 3 aaaa 2016-01-26 3 31 days 0
# 4 bbbb 2014-10-15 1 NA days 0
# 5 bbbb 2014-11-15 2 31 days 0
# 6 bbbb 2015-05-16 3 182 days 182


Related Topics



Leave a reply



Submit