Adding a Counter Column for a Set of Similar Rows in R

Adding a counter column for a set of similar rows in R

I really like the simple syntax of data.table for this (not to mention speed)...

# Load package; library() stops with an error if data.table is not
# installed, whereas require() only returns FALSE and lets later lines fail.
library(data.table)
# Turn data.frame into a data.table
dt <- data.table(df)

# Get running count by ID and T. seq_len(.N) yields 1..N, the same values as
# 1:.N for any non-empty group, but never counts backwards when .N is 0.
dt[, Index := seq_len(.N), by = c("ID", "T")]
# ID T Index
#1: A 1 1
#2: A 1 2
#3: A 2 1
#4: A 2 2
#5: B 1 1
#6: B 1 2
#7: B 1 3
#8: B 1 4

.N is an integer equal to the number of rows in each group. The groups are defined by the column names in the by argument, so 1:.N gives a vector as long as the group.

Because data.table inherits from data.frame, any function that takes a data.frame as input will also accept a data.table, and you can easily convert back if you wish ( df <- data.frame( dt ) ).

Add counter column for each group

We can use frank

library(data.table)
# Dense rank of `date` inside each `id`: equal dates share a rank and the
# ranks stay consecutive. The new column is added to df1 by reference.
df1[, Sequence := frank(date, ties.method = "dense"), by = "id"]
df1
# id date Sequence
#1: 02 2020-08-27 1
#2: 02 2020-09-07 2
#3: 04 2020-08-27 1
#4: 07 2020-08-27 1
#5: 07 2020-08-27 1
#6: 19 2020-08-28 1
#7: 19 2020-09-07 2
#8: 19 2020-09-07 2

R: Create a counter column that goes up every second row in a group

Use group_by and row_number. This can be used irrespective of the size of the dataframe

library(tidyverse)
# Number the rows inside each (group, numbering) pair, so the counter only
# advances when the same `numbering` value recurs within a `group`.
df %>%
  group_by(group, numbering) %>%
  mutate(counter = str_c(group, row_number(), sep = "_")) %>%
  # drop the grouping so later verbs operate on the whole table again
  ungroup()
group numbering counter
<chr> <chr> <chr>
1 g1 x g1_1
2 g1 y g1_1
3 g2 x g2_1
4 g2 y g2_1
5 g2 x g2_2
6 g2 y g2_2
7 g3 x g3_1
8 g3 y g3_1

in base R:

# ave() splits `group` by every (group, numbering) combination and applies
# FUN to each piece; FUN appends the position within the piece to the label.
transform(
  df,
  counter = ave(
    group, group, numbering,
    FUN = function(label) paste(label, seq_along(label), sep = "_")
  )
)
group numbering counter
1 g1 x g1_1
2 g1 y g1_1
3 g2 x g2_1
4 g2 y g2_1
5 g2 x g2_2
6 g2 y g2_2
7 g3 x g3_1
8 g3 y g3_1

Aggregate and count identical rows in r

In SQL terms, you can count rows grouping by all columns and join the result with the initial data.frame.

I recommend using the data.table package.

df <- data.frame(
  a = c(1, 1, 2, 3, 4, 4, 4),
  b = c("a", "a", "b", "b", "e", "e", "f")
)

library(data.table)

# convert df to data.table
df <- as.data.table(df)

# count how often each distinct row occurs by grouping on every column;
# the count is returned in a column named N
clmns <- names(df)
row_multiplicity <- df[, .N, by = clmns]

# join the counts back onto the original rows, matching on all data columns
new_df <- merge(df, row_multiplicity, by = clmns)

Add a column that count number of rows until the first 1, by group in R

# Example data: one grouping column followed by three 0/1 indicator columns.
df <- data.frame(
  Group = rep(c(1, 2), times = c(4, 2)),
  var1 = c(1, 0, 0, 1, 1, 1),
  var2 = c(0, 0, 1, 1, 0, 0),
  var3 = c(0, 1, 0, 0, 0, 1)
)

This works for any number of variables as long as the structure is the same as in the example (i.e. Group + many variables that are 0 or 1)

# Reshape to long form (one row per original row and variable) so that every
# indicator column goes through the same grouped computation.
df %>%
  mutate(rownr = row_number()) %>%
  pivot_longer(-c(Group, rownr)) %>%
  group_by(Group, name) %>%
  mutate(
    # number of rows seen before the first 1, plus one; the group maximum is
    # therefore the position of the first 1
    out = cumsum(value != 1 & (cumsum(value) < 1)) + 1,
    # a maximum larger than the group size means no 1 occurred: report 0
    out = ifelse(max(out) > n(), 0, max(out))
  ) %>%
  # release the grouping before reshaping so the result is a plain tibble
  ungroup() %>%
  pivot_wider(names_from = name, values_from = c(value, out)) %>%
  select(-rownr)

Returns:

  Group value_var1 value_var2 value_var3 out_var1 out_var2 out_var3
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 0 0 1 3 2
2 1 0 0 1 1 3 2
3 1 0 1 0 1 3 2
4 1 1 1 0 1 3 2
5 2 1 0 0 1 0 2
6 2 1 0 1 1 0 2

add counter column by arranging two variables (dplyr)

Within dplyr you need to arrange() by ID and VAR and then group_by() just ID.

Then you use mutate() to add a new column, counting from 1 to n() (where n() is a dplyr function for 'number of rows')

# Seed kept from the original example; nothing in this snippet draws random
# numbers.
set.seed(123)
dt %>%
  arrange(ID, VAR) %>%
  group_by(ID) %>%
  # row_number() counts 1..n() within each ID and, unlike 1:n(), does not
  # yield c(1, 0) for a group with no rows
  mutate(COUNTER = row_number()) %>%
  ungroup()

# # A tibble: 12 × 3
# ID VAR COUNTER
# <fctr> <int> <int>
# 1 a 29 1
# 2 a 41 2
# 3 a 79 3
# 4 a 86 4
# 5 b 29 1
# 6 b 41 2
# 7 b 79 3
# 8 b 86 4
# 9 c 29 1
# 10 c 41 2
# 11 c 79 3
# 12 c 86 4

A comment on ungrouping

I do this to remove all the 'grouping' attributes associated with a grouped_df. In this example the result is the same, but those grouped attributes may bite you further down the line.

# Grouped variant: the result keeps its grouped_df class and grouping
# attributes. row_number() replaces 1:n(), which misbehaves when n() is 0.
dt_grouped <- dt %>%
  arrange(ID, VAR) %>%
  group_by(ID) %>%
  mutate(COUNTER = row_number())

# Same pipeline followed by ungroup(), which strips the grouping metadata.
dt_ungrouped <- dt %>%
  arrange(ID, VAR) %>%
  group_by(ID) %>%
  mutate(COUNTER = row_number()) %>%
  ungroup()

str(dt_grouped)
# Classes ‘grouped_df’, ‘tbl_df’, ‘tbl’ and 'data.frame': 12 obs. of 3 variables:
# $ ID : Factor w/ 3 levels "a","b","c": 1 1 1 1 2 2 2 2 3 3 ...
# $ VAR : int 29 41 79 86 29 41 79 86 29 41 ...
# $ COUNTER: int 1 2 3 4 1 2 3 4 1 2 ...
# - attr(*, "vars")=List of 1
# ..$ : symbol ID
# - attr(*, "labels")='data.frame': 3 obs. of 1 variable:
# ..$ ID: Factor w/ 3 levels "a","b","c": 1 2 3
# ..- attr(*, "vars")=List of 1
# .. ..$ : symbol ID
# ..- attr(*, "drop")= logi TRUE
# - attr(*, "indices")=List of 3
# ..$ : int 0 1 2 3
# ..$ : int 4 5 6 7
# ..$ : int 8 9 10 11
# - attr(*, "drop")= logi TRUE
# - attr(*, "group_sizes")= int 4 4 4
# - attr(*, "biggest_group_size")= int 4

str(dt_ungrouped)
# Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 12 obs. of 3 variables:
# $ ID : Factor w/ 3 levels "a","b","c": 1 1 1 1 2 2 2 2 3 3 ...
# $ VAR : int 29 41 79 86 29 41 79 86 29 41 ...
# $ COUNTER: int 1 2 3 4 1 2 3 4 1 2 ...

Count number of rows per group and add result to original data frame

Using data.table:

library(data.table)
dt <- as.data.table(df)

# or coerce to data.table by reference:
# setDT(df)

# attach the size of each (name, type) group to every row, by reference
dt[, count := .N, by = c("name", "type")]

For an alternative that works with data.table versions before 1.8.2, see the edit history.


Using dplyr:

library(dplyr)
# Add the size of each (name, type) group as a column on every row.
df %>%
  group_by(name, type) %>%
  mutate(count = n()) %>%
  # return an ungrouped table so later verbs are not silently applied per group
  ungroup()

Or simply:

# The bare `name` and `type` are the grouping columns; the named argument
# name = "count" sets the output column, which add_count() otherwise calls `n`.
add_count(df, name, type, name = "count")

Using plyr:

# Grouping columns are given as a character vector: the `.()` helper belongs
# to plyr and is not found when plyr is only referenced through `plyr::`.
plyr::ddply(df, c("name", "type"), transform, count = length(num))

R: Repeating row of dataframe with respect to multiple count columns

Here is a tidyverse option. We can use uncount from tidyr to duplicate the rows according to the count in value (i.e., from the var columns) after pivoting to long format.

library(tidyverse)

# Stack the var* columns into long form, then repeat each row as many times
# as its count says.
df %>%
# column names go to `class`; the counts land in the default `value` column
pivot_longer(starts_with("var"), names_to = "class") %>%
# discard combinations with a zero count
filter(value != 0) %>%
# duplicate each remaining row `value` times
uncount(value) %>%
# keep only the digits of the original column name
mutate(class = str_extract(class, "\\d+"))

Output

  f1    f2    class
<chr> <chr> <chr>
1 a c 1
2 a c 3
3 a c 3
4 a c 3
5 b d 1
6 b d 2
7 b d 2

Another slight variation is to use expandRows from splitstackshape in conjunction with tidyverse.

library(splitstackshape)

# Same reshaping as the uncount() approach, with expandRows() doing the row
# repetition. The pipeline relies on tidyverse functions being attached
# already (pivot_longer, filter, mutate, str_extract).
df %>%
# column names go to `class`; the counts land in the default `value` column
pivot_longer(starts_with("var"), names_to = "class") %>%
# discard combinations with a zero count
filter(value != 0) %>%
# repeat each row according to the count stored in the "value" column
expandRows("value") %>%
# keep only the digits of the original column name
mutate(class = str_extract(class, "\\d+"))


Related Topics



Leave a reply



Submit