Numbering Rows Within Groups in a Data Frame

Numbering rows within groups in a data frame

Use ave, ddply, dplyr or data.table:

df$num <- ave(df$val, df$cat, FUN = seq_along)

or:

library(plyr)
ddply(df, .(cat), mutate, id = seq_along(val))

or:

library(dplyr)
df %>% group_by(cat) %>% mutate(id = row_number())

or (the most memory efficient, as it assigns by reference within DT):

library(data.table)
DT <- data.table(df)

DT[, id := seq_len(.N), by = cat]
DT[, id := rowid(cat)]

Pandas number rows within group in increasing order

Use groupby/cumcount:

In [25]: df['C'] = df.groupby(['A','B']).cumcount()+1; df
Out[25]:
A B C
0 A a 1
1 A a 2
2 A b 1
3 B a 1
4 B a 2
5 B a 3

Numbering rows within groups in a data frame but without considering the NAs

You can tweak the suggestions that you have to get the desired output :

library(dplyr)

df %>%
group_by(cat) %>%
mutate(id = replace(row_number() - cumsum(is.na(val)), is.na(val), NA))

# cat val id
# <chr> <dbl> <int>
# 1 aaa 0.0564 1
# 2 aaa 0.258 2
# 3 aaa NA NA
# 4 aaa 0.469 3
# 5 aaa 0.552 4
# 6 bbb NA NA
# 7 bbb 0.370 1
# 8 bbb 0.484 2
# 9 bbb 0.547 3
#10 bbb 0.812 4
#11 ccc 0.280 1
#12 ccc 0.398 2
#13 ccc 0.625 3
#14 ccc 0.763 4
#15 ccc 0.882 5

This can be also be written in data.table

library(data.table)
setDT(df)[, id:= replace(seq_len(.N) - cumsum(is.na(val)), is.na(val), NA), cat]

and base R :

df$id <- with(df, replace(ave(val, cat, FUN = function(x) 
seq_along(x) - cumsum(is.na(x))), is.na(val), NA))

The logic is to create an id variable which is current row number in the group minus the number of NA's that have occurred till that time.


For updated df2 to consider the NA only in first row we can use :

df2 %>%
group_by(cat) %>%
mutate(id = replace(row_number() - is.na(first(val)),1* is.na(first(val)), NA))

# cat val id
# <chr> <dbl> <int>
# 1 aaa 0.0564 1
# 2 aaa 0.258 2
# 3 aaa NA 3
# 4 aaa 0.469 4
# 5 aaa 0.552 5
# 6 bbb NA NA
# 7 bbb 0.370 1
# 8 bbb 0.484 2
# 9 bbb NA 3
#10 bbb 0.812 4
#11 ccc 0.280 1
#12 ccc 0.398 2
#13 ccc 0.625 3
#14 ccc 0.763 4
#15 ccc 0.882 5

Numbering rows within groups with data gaps in R

You can add another value in group_by which would change the value when the difference of current date and previous date is greater than 1.

library(dplyr)

data %>%
mutate(day = as.Date(day)) %>%
group_by(group1, group2,
date_gap = cumsum(day - lag(day, default = first(day)) > 1)) %>%
mutate(id_day = row_number()) %>%
ungroup %>%
select(-date_gap)

# day group1 group2 id_day
# <date> <chr> <chr> <int>
# 1 2020-05-01 A B 1
# 2 2020-05-02 A B 2
# 3 2020-05-03 A B 3
# 4 2020-05-04 A B 4
# 5 2020-05-07 A B 1
# 6 2020-05-08 A B 2
# 7 2020-05-09 A B 3
# 8 2020-06-05 C D 1
# 9 2020-06-06 C D 2
#10 2020-06-07 C D 3
#11 2020-06-08 C D 4
#12 2020-06-09 C D 5
#13 2020-06-10 C D 6
#14 2020-06-11 C D 7

data

data <- structure(list(day = c("2020-05-01", "2020-05-02", "2020-05-03", 
"2020-05-04", "2020-05-07", "2020-05-08", "2020-05-09", "2020-06-05",
"2020-06-06", "2020-06-07", "2020-06-08", "2020-06-09", "2020-06-10",
"2020-06-11"), group1 = c("A", "A", "A", "A", "A", "A", "A",
"C", "C", "C", "C", "C", "C", "C"), group2 = c("B", "B", "B",
"B", "B", "B", "B", "D", "D", "D", "D", "D", "D", "D"), id_day = c(1L,
2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L)),
class = "data.frame", row.names = c(NA, -14L))

Numbering rows from 1 to n within groups in a data frame

Here's a classic split / apply / combine approach:

df <- unsplit(lapply(split(df, df$cat), function(x) 
cbind(x, id = rep(1:3, length.out = nrow(x)))), df$cat)

# cat val id
# 1 aaa 0.05638315 1
# 2 aaa 0.25767250 2
# 3 aaa 0.30776611 3
# 4 aaa 0.46854928 1
# 5 aaa 0.55232243 2
# 6 bbb 0.17026205 1
# 7 bbb 0.37032054 2
# 8 bbb 0.48377074 3
# 9 bbb 0.54655860 1
# 10 bbb 0.81240262 2
# 11 ccc 0.28035384 1
# 12 ccc 0.39848790 2
# 13 ccc 0.62499648 3
# 14 ccc 0.76255108 1
# 15 ccc 0.88216552 2

And a dplyr alternative:

library(dplyr)
df %>% group_by(cat) %>% mutate(id = rep(1:3, length.out = n()))

And a data.table alternative, too:

library(data.table)
setDT(df)
df[, id := rep(1:3, length.out = .N), by = cat]

R: Create numbering within each group

A) The dplyr package offers group_indices() for adding unique group indentifiers:

library(dplyr)

df$number <- df %>%
group_indices(ID)
df

# A tibble: 10 × 3
study ID number
<chr> <dbl> <int>
1 A 1 1
2 B 1 1
3 C 1 1
4 A 5 2
5 B 5 2
...

B) You can drop observations where the group size is less than 3 (i.e., "A", "B" and "C") with filter():

df %>% 
group_by(ID) %>%
filter(n() == 3)

# A tibble: 6 × 3
# Groups: ID [2]
study ID number
<chr> <dbl> <int>
1 A 1 1
2 B 1 1
3 C 1 1
4 A 7 3
5 B 7 3
6 C 7 3

How to store the row number by group

dt[, row := row.names(dt), by = c("id", "month")]
dt

id month row
1: 1 1 1
2: 1 1 2
3: 1 2 1
4: 2 1 1
5: 2 1 2
6: 2 2 1


Related Topics



Leave a reply



Submit