Numbering Rows Within Groups in a Data Frame

Numbering rows within groups in a data frame

Use ave, ddply, dplyr or data.table:

df$num <- ave(df$val, df$cat, FUN = seq_along)

or:

library(plyr)
ddply(df, .(cat), mutate, id = seq_along(val))

or:

library(dplyr)
df %>% group_by(cat) %>% mutate(id = row_number())

or (the most memory efficient, as it assigns by reference within DT):

library(data.table)
DT <- data.table(df)

DT[, id := seq_len(.N), by = cat]
DT[, id := rowid(cat)]

Pandas number rows within group in increasing order

Use groupby/cumcount:

In [25]: df['C'] = df.groupby(['A','B']).cumcount()+1; df
Out[25]: 
   A  B  C
0  A  a  1
1  A  a  2
2  A  b  1
3  B  a  1
4  B  a  2
5  B  a  3

Numbering rows within groups in a data frame but without considering the NAs

You can tweak the suggestions that you have to get the desired output :

library(dplyr)

df %>%
  group_by(cat) %>%
  mutate(id = replace(row_number() - cumsum(is.na(val)), is.na(val), NA))

#   cat       val    id
#   <chr>   <dbl> <int>
# 1 aaa    0.0564     1
# 2 aaa    0.258      2
# 3 aaa   NA         NA
# 4 aaa    0.469      3
# 5 aaa    0.552      4
# 6 bbb   NA         NA
# 7 bbb    0.370      1
# 8 bbb    0.484      2
# 9 bbb    0.547      3
#10 bbb    0.812      4
#11 ccc    0.280      1
#12 ccc    0.398      2
#13 ccc    0.625      3
#14 ccc    0.763      4
#15 ccc    0.882      5

This can be also be written in data.table

library(data.table)
setDT(df)[, id:= replace(seq_len(.N) - cumsum(is.na(val)), is.na(val), NA), cat]

and base R :

df$id <- with(df, replace(ave(val, cat, FUN = function(x) 
                  seq_along(x) - cumsum(is.na(x))), is.na(val), NA))

The logic is to create an id variable which is current row number in the group minus the number of NA's that have occurred till that time.

For updated df2 to consider the NA only in first row we can use :

df2 %>%
  group_by(cat) %>%
  mutate(id = replace(row_number() - is.na(first(val)),1* is.na(first(val)), NA))

#   cat       val    id
#   <chr>   <dbl> <int>
# 1 aaa    0.0564     1
# 2 aaa    0.258      2
# 3 aaa   NA          3
# 4 aaa    0.469      4
# 5 aaa    0.552      5
# 6 bbb   NA         NA
# 7 bbb    0.370      1
# 8 bbb    0.484      2
# 9 bbb   NA          3
#10 bbb    0.812      4
#11 ccc    0.280      1
#12 ccc    0.398      2
#13 ccc    0.625      3
#14 ccc    0.763      4
#15 ccc    0.882      5

Numbering rows within groups with data gaps in R

You can add another value in group_by which would change the value when the difference of current date and previous date is greater than 1.

library(dplyr)

data %>%
  mutate(day = as.Date(day)) %>%
  group_by(group1, group2,
           date_gap = cumsum(day - lag(day, default = first(day)) > 1)) %>%
  mutate(id_day = row_number()) %>%
  ungroup %>%
  select(-date_gap)

#    day        group1 group2 id_day
#   <date>     <chr>  <chr>   <int>
# 1 2020-05-01 A      B           1
# 2 2020-05-02 A      B           2
# 3 2020-05-03 A      B           3
# 4 2020-05-04 A      B           4
# 5 2020-05-07 A      B           1
# 6 2020-05-08 A      B           2
# 7 2020-05-09 A      B           3
# 8 2020-06-05 C      D           1
# 9 2020-06-06 C      D           2
#10 2020-06-07 C      D           3
#11 2020-06-08 C      D           4
#12 2020-06-09 C      D           5
#13 2020-06-10 C      D           6
#14 2020-06-11 C      D           7

data

data <- structure(list(day = c("2020-05-01", "2020-05-02", "2020-05-03", 
"2020-05-04", "2020-05-07", "2020-05-08", "2020-05-09", "2020-06-05", 
"2020-06-06", "2020-06-07", "2020-06-08", "2020-06-09", "2020-06-10", 
"2020-06-11"), group1 = c("A", "A", "A", "A", "A", "A", "A", 
"C", "C", "C", "C", "C", "C", "C"), group2 = c("B", "B", "B", 
"B", "B", "B", "B", "D", "D", "D", "D", "D", "D", "D"), id_day = c(1L, 
2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L)), 
class = "data.frame", row.names = c(NA, -14L))

Numbering rows from 1 to n within groups in a data frame

Here's a classic split / apply / combine approach:

df <- unsplit(lapply(split(df, df$cat), function(x) 
              cbind(x, id = rep(1:3, length.out = nrow(x)))), df$cat)

#    cat        val id
# 1  aaa 0.05638315  1
# 2  aaa 0.25767250  2
# 3  aaa 0.30776611  3
# 4  aaa 0.46854928  1
# 5  aaa 0.55232243  2
# 6  bbb 0.17026205  1
# 7  bbb 0.37032054  2
# 8  bbb 0.48377074  3
# 9  bbb 0.54655860  1
# 10 bbb 0.81240262  2
# 11 ccc 0.28035384  1
# 12 ccc 0.39848790  2
# 13 ccc 0.62499648  3
# 14 ccc 0.76255108  1
# 15 ccc 0.88216552  2

And a dplyr alternative:

library(dplyr)
df %>% group_by(cat) %>% mutate(id = rep(1:3, length.out = n()))

And a data.table alternative, too:

library(data.table)
setDT(df)
df[, id := rep(1:3, length.out = .N), by = cat]

R: Create numbering within each group

A) The dplyr package offers group_indices() for adding unique group indentifiers:

library(dplyr)

df$number <- df %>% 
  group_indices(ID)
df

# A tibble: 10 × 3
   study    ID number
   <chr> <dbl>  <int>
 1 A         1      1
 2 B         1      1
 3 C         1      1
 4 A         5      2
 5 B         5      2
...

B) You can drop observations where the group size is less than 3 (i.e., "A", "B" and "C") with filter():

df %>% 
  group_by(ID) %>% 
  filter(n() == 3)

# A tibble: 6 × 3
# Groups:   ID [2]
  study    ID number
  <chr> <dbl>  <int>
1 A         1      1
2 B         1      1
3 C         1      1
4 A         7      3
5 B         7      3
6 C         7      3

How to store the row number by group

dt[, row := row.names(dt), by = c("id", "month")]
dt

   id month row
1:  1     1   1
2:  1     1   2
3:  1     2   1
4:  2     1   1
5:  2     1   2
6:  2     2   1