Adding a Counter Column for a Set of Similar Rows in R

Adding a counter column for a set of similar rows in R

I really like the simple syntax of data.table for this (not to mention speed)...

#  Load package (library() stops with an error if data.table is not installed,
#  whereas require() would only return FALSE and fail later)
library(data.table)
#  Turn data.frame into a data.table
dt <- as.data.table(df)

#  Get running count by ID and T
#  seq_len(.N) gives the same vector as 1:.N for a non-empty group and is
#  also well-behaved when .N is 0
dt[, Index := seq_len(.N), by = c("ID", "T")]
#   ID T Index
#1:  A 1     1
#2:  A 1     2
#3:  A 2     1
#4:  A 2     2
#5:  B 1     1
#6:  B 1     2
#7:  B 1     3
#8:  B 1     4

.N is an integer equal to the number of rows in each group. The groups are defined by the column names in the by argument, so 1:.N gives a vector as long as the group.

Because data.table inherits from data.frame, any function that takes a data.frame as input will also take a data.table as input, and you can easily convert back if you wish ( df <- data.frame( dt ) ).

Add counter column for each group

We can use frank

library(data.table)
# Within each id, rank the dates with the "dense" tie rule: equal dates share
# the same Sequence value. The column is added to df1 by reference.
df1[, Sequence := frank(date, ties.method = "dense"), by = id]
print(df1)
#   id       date Sequence
#1: 02 2020-08-27        1
#2: 02 2020-09-07        2
#3: 04 2020-08-27        1
#4: 07 2020-08-27        1
#5: 07 2020-08-27        1
#6: 19 2020-08-28        1
#7: 19 2020-09-07        2
#8: 19 2020-09-07        2

R: Create a counter column that goes up every second row in a group

Use group_by and row_number. This can be used irrespective of the size of the dataframe

library(tidyverse)
# Number the rows inside every (group, numbering) pair and glue that number
# onto the group label, e.g. "g2_1", "g2_2".
df %>%
  group_by(group, numbering) %>%
  mutate(counter = str_c(group, row_number(), sep = "_")) %>%
  ungroup()  # drop the grouping so later verbs are not silently applied per group
 group numbering counter
  <chr> <chr>     <chr>  
1 g1    x         g1_1   
2 g1    y         g1_1   
3 g2    x         g2_1   
4 g2    y         g2_1   
5 g2    x         g2_2   
6 g2    y         g2_2   
7 g3    x         g3_1   
8 g3    y         g3_1

in base R:

# ave() applies FUN to the `group` values of each (group, numbering) cell and
# puts the results back in the original row order; FUN appends the running
# position within the cell to the group label.
transform(
  df,
  counter = ave(
    group, group, numbering,
    FUN = function(lbl) paste(lbl, seq_along(lbl), sep = "_")
  )
)
  group numbering counter
1    g1         x    g1_1
2    g1         y    g1_1
3    g2         x    g2_1
4    g2         y    g2_1
5    g2         x    g2_2
6    g2         y    g2_2
7    g3         x    g3_1
8    g3         y    g3_1

Aggregate and count identical rows in r

In SQL terms, you can count rows grouping by all columns and join the result with the initial data.frame.

I recommend using data.table package.

df <- data.frame(
  a = c(1, 1, 2, 3, 4, 4, 4),
  b = c("a", "a", "b", "b", "e", "e", "f")
)

library(data.table)

# convert df to data.table
df <- as.data.table(df)

# aggregate df grouping by all columns; .N is the number of rows in each group,
# i.e. how many times that exact row occurs
clmns <- colnames(df)
row_multiplicity <- df[, .N, by = clmns]

# join/merge with initial data.frame, matching on every original column so
# each row picks up the count of its own duplicates
new_df <- merge(df, row_multiplicity, by = clmns)

Add a column that count number of rows until the first 1, by group in R

# Example data: a grouping column followed by three 0/1 indicator columns.
df <- data.frame(
  Group = c(1, 1, 1, 1, 2, 2),
  var1  = c(1, 0, 0, 1, 1, 1),
  var2  = c(0, 0, 1, 1, 0, 0),
  var3  = c(0, 1, 0, 0, 0, 1)
)

This works for any number of variables as long as the structure is the same as in the example (i.e. Group + many variables that are 0 or 1)

# For every Group/variable pair, compute the position (within the group) of
# the first row where the variable is 1; 0 means it is never 1 in that group.
df %>%
  # unique row id so pivot_wider() can rebuild the original rows
  mutate(rownr = row_number()) %>%
  pivot_longer(-c(Group, rownr)) %>%
  group_by(Group, name) %>%
  mutate(
    # number of rows seen before the first 1, plus one
    out = cumsum(value != 1 & (cumsum(value) < 1)) + 1,
    # a position beyond the group size means no 1 was found
    out = ifelse(max(out) > n(), 0, max(out))
  ) %>%
  pivot_wider(names_from = name, values_from = c(value, out)) %>%
  select(-rownr) %>%
  ungroup()  # do not leave the result grouped by Group

Returns:

  Group value_var1 value_var2 value_var3 out_var1 out_var2 out_var3
  <dbl>      <dbl>      <dbl>      <dbl>    <dbl>    <dbl>    <dbl>
1     1          1          0          0        1        3        2
2     1          0          0          1        1        3        2
3     1          0          1          0        1        3        2
4     1          1          1          0        1        3        2
5     2          1          0          0        1        0        2
6     2          1          0          1        1        0        2

add counter column by arranging two variables (dplyr)

Within dplyr you need to arrange() by ID and VAR and then group_by() just ID.

Then you use mutate() to add a new column, counting from 1 to n() (where n() is a dplyr function for 'number of rows')

# NOTE(review): nothing random is visible in this snippet; the seed presumably
# belongs to the code that generated `dt` -- confirm before removing.
set.seed(123)
dt %>%
  arrange(ID, VAR) %>%                  # order rows by ID, then VAR
  group_by(ID) %>%
  mutate(COUNTER = seq_len(n())) %>%  ## as per comment, can use row_number()
  ungroup()

# # A tibble: 12 × 3
#         ID   VAR COUNTER
#     <fctr> <int>   <int>
# 1       a    29       1
# 2       a    41       2
# 3       a    79       3
# 4       a    86       4
# 5       b    29       1
# 6       b    41       2
# 7       b    79       3
# 8       b    86       4
# 9       c    29       1
# 10      c    41       2
# 11      c    79       3
# 12      c    86       4

A comment on ungrouping

I do this to remove all the 'grouping' attributes associated with a grouped_df. In this example the result is the same, but those grouped attributes may bite you further down the line.

# Same counter as above, but the result is left grouped by ID
dt_grouped <- dt %>%
  arrange(ID, VAR) %>%
  group_by(ID) %>%
  mutate(COUNTER = seq_len(n()))

# Identical pipeline followed by ungroup(): same data, no grouping metadata
dt_ungrouped <- dt_grouped %>%
  ungroup()

# Show the internal structure of the grouped result, including its attributes
str(dt_grouped)
# Classes ‘grouped_df’, ‘tbl_df’, ‘tbl’ and 'data.frame':   12 obs. of  3 variables:
#   $ ID     : Factor w/ 3 levels "a","b","c": 1 1 1 1 2 2 2 2 3 3 ...
# $ VAR    : int  29 41 79 86 29 41 79 86 29 41 ...
# $ COUNTER: int  1 2 3 4 1 2 3 4 1 2 ...
# - attr(*, "vars")=List of 1
# ..$ : symbol ID
# - attr(*, "labels")='data.frame': 3 obs. of  1 variable:
#   ..$ ID: Factor w/ 3 levels "a","b","c": 1 2 3
# ..- attr(*, "vars")=List of 1
# .. ..$ : symbol ID
# ..- attr(*, "drop")= logi TRUE
# - attr(*, "indices")=List of 3
# ..$ : int  0 1 2 3
# ..$ : int  4 5 6 7
# ..$ : int  8 9 10 11
# - attr(*, "drop")= logi TRUE
# - attr(*, "group_sizes")= int  4 4 4
# - attr(*, "biggest_group_size")= int 4

# Show the internal structure of the ungrouped result for comparison
str(dt_ungrouped)
# Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 12 obs. of  3 variables:
#   $ ID     : Factor w/ 3 levels "a","b","c": 1 1 1 1 2 2 2 2 3 3 ...
# $ VAR    : int  29 41 79 86 29 41 79 86 29 41 ...
# $ COUNTER: int  1 2 3 4 1 2 3 4 1 2 ...

Count number of rows per group and add result to original data frame

Using data.table:

library(data.table)
# as.data.table() returns a new data.table and leaves df itself unchanged
dt <- as.data.table(df)

# or coerce to data.table by reference:
# setDT(df)

# .N is the number of rows in the current (name, type) group; := stores it in
# a new column `count` on every row of that group
dt[, count := .N, by = .(name, type)]

For pre-data.table 1.8.2 alternative, see edit history.

Using dplyr:

library(dplyr)
# n() is the size of the current (name, type) group; every row of the group
# receives that size in the new `count` column.
df %>%
  group_by(name, type) %>%
  mutate(count = n()) %>%
  ungroup()  # return a plain, ungrouped data frame

Or simply:

# The first `name` and `type` are the grouping columns; the named argument
# name = "count" sets the output column name (add_count() defaults to "n").
add_count(df, name, type, name = "count")

Using plyr:

# Grouping columns are given as a character vector: the .() helper is only
# available when plyr is attached, which plyr::ddply() alone does not do.
plyr::ddply(df, c("name", "type"), transform, count = length(num))

R: Repeating row of dataframe with respect to multiple count columns

Here is a tidyverse option. We can use uncount from tidyr to duplicate the rows according to the count in value (i.e., from the var columns) after pivoting to long format.

library(tidyverse)

# Stack the var* columns into (class, value) pairs, drop the zero counts,
# repeat each remaining row `value` times, and keep only the digits of the
# original column name as the class.
df %>%
  pivot_longer(cols = starts_with("var"), names_to = "class") %>%
  filter(value != 0) %>%
  uncount(weights = value) %>%
  mutate(class = str_extract(string = class, pattern = "\\d+"))

Output

  f1    f2    class
  <chr> <chr> <chr>
1 a     c     1    
2 a     c     3    
3 a     c     3    
4 a     c     3    
5 b     d     1    
6 b     d     2    
7 b     d     2

Another slight variation is to use expandRows from splitstackshape in conjunction with tidyverse.

library(splitstackshape)

# Same reshaping as the uncount() version, but the row repetition is done by
# expandRows(), which reads the repeat count from the "value" column.
df %>%
  pivot_longer(cols = starts_with("var"), names_to = "class") %>%
  filter(value != 0) %>%
  expandRows(count = "value") %>%
  mutate(class = str_extract(string = class, pattern = "\\d+"))

Adding a Counter Column for a Set of Similar Rows in R