Generate Sequence Within Group in R

generate sequence within group in R

Use ave with seq_along:

> mydf$C <- with(mydf, ave(A, A, B, FUN = seq_along))
> mydf
  A B C
1 1 1 1
2 1 2 1
3 1 2 2
4 1 3 1
5 1 3 2
6 1 3 3
7 1 4 1
8 1 4 2

If your data are already ordered (as they are in this case), you can also use sequence with rle (mydf$C <- sequence(rle(do.call(paste, mydf))$lengths)), but you don't have that limitation with ave.

If you're a data.table fan, you can make use of .N as follows:

library(data.table)
DT <- data.table(mydf)
DT[, C := sequence(.N), by = c("A", "B")]
DT
#    A B C
# 1: 1 1 1
# 2: 1 2 1
# 3: 1 2 2
# 4: 1 3 1
# 5: 1 3 2
# 6: 1 3 3
# 7: 1 4 1
# 8: 1 4 2

Generate sequence within sub group in data.table

We group by 'id1', sort the 'val' and then create 'grp' as row_number()

input %>%
  group_by(id1) %>%
  mutate(val = sort(val), grp= row_number())

Or another option is to arrange

input %>%
   arrange(id1, val) %>%
   group_by(id1) %>%
   mutate(grp = row_number())

Or using data.table

library(data.table)
setDT(input)[, c("grp", "val") := .(seq_len(.N), sort(val)), by = id1]
input
#   id1 val grp
#1:   1   1   1
#2:   1   2   2
#3:   1   3   3
#4:   1   4   4
#5:   2   3   1
#6:   2   4   2
#7:   2   5   3

If we need to sort as well, use setorder based on the 'id1' and 'val' to order in place, then create the 'grp' as the rowid of 'id1'

input <- data.frame("id1"=c(1,1,1,1,2,2,2),val=c(2,3,4,1,4,3,5), 
        achar=c('a','a','b','b','d','c','e'))
setorder(setDT(input), id1, val)[, grp := rowid(id1)][]
#   id1 val achar grp
#1:   1   1     b   1
#2:   1   2     a   2
#3:   1   3     a   3
#4:   1   4     b   4
#5:   2   3     c   1
#6:   2   4     d   2
#7:   2   5     e   3

Create a sequential number within each group

A simple solution with Base R:

df$seq <- ave(sapply(df$gap, identical, "gap"), df$id, FUN = cumsum)
df
#>          id                date      lc     lon     lat   gap_days  gap seq
#> 1  20162.03 2003-10-19 14:33:00 Tagging -39.370 -18.480         NA <NA>   0
#> 2  20162.03 2003-10-21 12:19:00       1 -38.517 -18.253 1.90694444  gap   1
#> 3  20162.03 2003-10-21 13:33:00       1 -38.464 -18.302 0.05138889   no   1
#> 4  20162.03 2003-10-21 16:38:00       A -38.461 -18.425 0.12847222   no   1
#> 5  20162.03 2003-10-21 18:50:00       A -38.322 -18.512 0.09166667   no   1
#> 6  20162.03 2003-10-23 10:33:00       B -38.674 -19.824 1.65486111  gap   2
#> 7  20162.03 2003-10-23 17:52:00       B -38.957 -19.511 0.30486111   no   2
#> 8  20162.03 2003-11-02 08:14:00       B -42.084 -24.071 9.59861111  gap   3
#> 9  20162.03 2003-11-02 09:36:00       A -41.999 -24.114 0.05694444   no   3
#> 10 20687.03 2003-10-27 17:02:00 Tagging -39.320 -18.460         NA <NA>   0
#> 11 20687.03 2003-10-27 19:44:00       2 -39.306 -18.454 0.11250000   no   0
#> 12 20687.03 2003-10-27 21:05:00       1 -39.301 -18.458 0.05625000   no   0

And then split it:

split(df, list(df$id, df$seq), drop = TRUE)
#> $`20162.03.0`
#>         id                date      lc    lon    lat gap_days  gap seq
#> 1 20162.03 2003-10-19 14:33:00 Tagging -39.37 -18.48       NA <NA>   0
#> 
#> $`20687.03.0`
#>          id                date      lc     lon     lat gap_days  gap seq
#> 10 20687.03 2003-10-27 17:02:00 Tagging -39.320 -18.460       NA <NA>   0
#> 11 20687.03 2003-10-27 19:44:00       2 -39.306 -18.454  0.11250   no   0
#> 12 20687.03 2003-10-27 21:05:00       1 -39.301 -18.458  0.05625   no   0
#> 
#> $`20162.03.1`
#>         id                date lc     lon     lat   gap_days gap seq
#> 2 20162.03 2003-10-21 12:19:00  1 -38.517 -18.253 1.90694444 gap   1
#> 3 20162.03 2003-10-21 13:33:00  1 -38.464 -18.302 0.05138889  no   1
#> 4 20162.03 2003-10-21 16:38:00  A -38.461 -18.425 0.12847222  no   1
#> 5 20162.03 2003-10-21 18:50:00  A -38.322 -18.512 0.09166667  no   1
#> 
#> $`20162.03.2`
#>         id                date lc     lon     lat  gap_days gap seq
#> 6 20162.03 2003-10-23 10:33:00  B -38.674 -19.824 1.6548611 gap   2
#> 7 20162.03 2003-10-23 17:52:00  B -38.957 -19.511 0.3048611  no   2
#> 
#> $`20162.03.3`
#>         id                date lc     lon     lat   gap_days gap seq
#> 8 20162.03 2003-11-02 08:14:00  B -42.084 -24.071 9.59861111 gap   3
#> 9 20162.03 2003-11-02 09:36:00  A -41.999 -24.114 0.05694444  no   3

Code sequence by group in R: recurring values within group

In data.table, we have rleid function which makes it simple here.

library(data.table)

setDT(df)[, seq1 := seq_len(.N), .(ID, rleid(loc))]

df
#    ID   yr loc seq seq1
# 1:  1 1990   A   1    1
# 2:  1 1991   A   2    2
# 3:  1 1992   B   1    1
# 4:  1 1993   B   2    2
# 5:  1 1994   B   3    3
# 6:  2 1990   B   1    1
# 7:  2 1991   B   2    2
# 8:  2 1992   A   1    1
# 9:  2 1993   B   1    1
#10:  2 1994   B   2    2
#11:  3 1990   C   1    1
#12:  3 1991   C   2    2
#13:  3 1992   C   3    3
#14:  3 1993   B   1    1
#15:  3 1994   C   1    1

We can use rleid in dplyr and base R approaches to get expected output.

library(dplyr)
df %>%
 group_by(ID, grp = data.table::rleid(loc)) %>%
  mutate(seq1 = row_number())

Or in base R :

df$seq1 <- with(df, ave(yr, ID, data.table::rleid(loc), FUN = seq_along))

A concise option suggested by @chinsoon12 is to use rowid function.

setDT(df)[, seq2 := rowid(ID, rleid(loc))]

data

df <- structure(list(ID = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 
3L, 3L, 3L, 3L, 3L), yr = c(1990L, 1991L, 1992L, 1993L, 1994L, 
1990L, 1991L, 1992L, 1993L, 1994L, 1990L, 1991L, 1992L, 1993L, 
1994L), loc = structure(c(1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 
2L, 3L, 3L, 3L, 2L, 3L), .Label = c("A", "B", "C"), class = "factor"), 
seq = c(1L, 2L, 1L, 2L, 3L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 3L, 
1L, 1L)), class = "data.frame", row.names = c(NA, -15L))

Numbering rows within groups in a data frame

Use ave, ddply, dplyr or data.table:

df$num <- ave(df$val, df$cat, FUN = seq_along)

or:

library(plyr)
ddply(df, .(cat), mutate, id = seq_along(val))

or:

library(dplyr)
df %>% group_by(cat) %>% mutate(id = row_number())

or (the most memory efficient, as it assigns by reference within DT):

library(data.table)
DT <- data.table(df)

DT[, id := seq_len(.N), by = cat]
DT[, id := rowid(cat)]

Create Group from Sequences

We can use rleid from data.table

library(data.table)
rleid(x)
#[1] 1 2 3 3 4 5 6

Or in base R with rle

with(rle(x), rep(seq_along(values), lengths))
#[1] 1 2 3 3 4 5 6

Or if we use the similar approach from OP

1 + cumsum(x != dplyr::lag(x, default = first(x)))

How to create a sequency by group from a specific string in R?

You can first create a column specifying the first occurrence of "UNP", then use cumsum() and lag() to calculate the Seq column.

library(dplyr)

df <- read.table(header = T, text = "
ColA    Colb    Seq
A   HM  0
A   RES 0
A   UNP 0
A   RES 1
A   RES 2
A   HM  3
B   HM  0
B   RES 0
B   UNP 0
B   RES 1
B   UNP 2
C   UNP 0") %>% 
  select(-Seq)

df %>% 
  group_by(ColA, Colb) %>% 
  mutate(seq_count = ifelse(first(Colb) == "UNP" & !duplicated(Colb), 1, 0)) %>% 
  group_by(ColA) %>% 
  mutate(Seq = lag(cumsum(cumsum(seq_count)), default = 0), .keep = "unused")
#> # A tibble: 12 × 3
#> # Groups:   ColA [3]
#>    ColA  Colb    Seq
#>    <chr> <chr> <dbl>
#>  1 A     HM        0
#>  2 A     RES       0
#>  3 A     UNP       0
#>  4 A     RES       1
#>  5 A     RES       2
#>  6 A     HM        3
#>  7 B     HM        0
#>  8 B     RES       0
#>  9 B     UNP       0
#> 10 B     RES       1
#> 11 B     UNP       2
#> 12 C     UNP       0

^{Created on 2022-03-31 by the reprex package (v2.0.1)}

Sequentially Number Group within Group

One way would be to join your original data to a summarized version of the same data where you assign group id's for the variable in question. Here's an example on a standard data set:

left_join(mtcars, mtcars %>% group_by(gear) %>% summarize(id = cur_group_id()))

Or, using a version of data like yours:

dft <- data.frame(
  stringsAsFactors = FALSE,
              pmid = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L),
              item = c("Age", "Age", "BMI", "BMI", "Age", "Age", "BMI", "Duration")
)

left_join(
  dft,
  dft %>%
    group_by(item) %>%
    summarize(id = cur_group_id()))

Result

Joining, by = "item"
  pmid     item id
1    1      Age  1
2    1      Age  1
3    1      BMI  2
4    1      BMI  2
5    2      Age  1
6    2      Age  1
7    2      BMI  2
8    2 Duration  3

Please note, for future questions, it will make it easier for others to help you if you can provide some example data as code we can load, as opposed to a printout of what the data looks like. It's often easiest to use the magical dput function, by running dput(dft) or dput(head(dft, 50)) and pasting the output into the body of your question.

Create a sequence of unique observations by group with dplyr and create a difference in months column

Using zoo::as.yearmon(), however, I had to round because otherwise 2015-11-26 to 2015-12-26 is considered longer than one month. Perhaps someone can comment/edit/explain how to make that particular calculation more "intuitive".

library(dplyr)
library(zoo)

df %>%
  group_by(User) %>%
  mutate(Count = 1:n(),
         Gap_In_Months = round(12 * as.numeric(as.yearmon(Date) - as.yearmon(lag(Date))), 1),
         Gap = ifelse(Gap_In_Months <= 1 | is.na(Gap_In_Months), 0, Gap_In_Months))

#     User       Date Count Gap_In_Months   Gap
#   (fctr)     (fctr) (int)         (dbl) (dbl)
# 1   aaaa 2015-11-26     1            NA     0
# 2   aaaa 2015-12-26     2             1     0
# 3   aaaa 2016-01-26     3             1     0
# 4   bbbb 2014-10-15     1            NA     0
# 5   bbbb 2014-11-15     2             1     0
# 6   bbbb 2015-05-16     3             6     6

Perhaps you want to be more specific as to "what is a month"? 30 days? 31 days? 28 days?

If that's the case, we can utilize lubridate:

library(lubridate)

df %>%
  group_by(User) %>%
  mutate(Count = 1:n(),
         Diff_Time = ymd(Date) - ymd(lag(Date)),
         Gap = ifelse(Diff_Time <= ddays(31) | is.na(Diff_Time), 0, as.numeric(Diff_Time, units = "days")))

#     User       Date Count Diff_Time   Gap
#   (fctr)     (fctr) (int)    (dfft) (dbl)
# 1   aaaa 2015-11-26     1   NA days     0
# 2   aaaa 2015-12-26     2   30 days     0
# 3   aaaa 2016-01-26     3   31 days     0
# 4   bbbb 2014-10-15     1   NA days     0
# 5   bbbb 2014-11-15     2   31 days     0
# 6   bbbb 2015-05-16     3  182 days   182

Generate Sequence Within Group in R