R - Group by Variable and Then Assign a Unique Id

R - Group by variable and then assign a unique ID

dplyr has a group_indices function for creating unique group IDs

library(dplyr)

# Example data: personal_id is the identifier to be replaced by a group id.
data <- data.frame(
  personal_id = c("111-111-111", "999-999-999", "222-222-222", "111-111-111"),
  gender = c("M", "F", "M", "M"),
  temperature = c(99.6, 98.2, 97.8, 95.5)
)

# group_indices() returns one integer per row identifying that row's group.
# Passing the grouping columns directly (group_indices(data, personal_id)) is
# deprecated since dplyr 1.0.0, so group first and call it with no columns.
data$group_id <- data %>%
  group_by(personal_id) %>%
  group_indices()

# Drop the original identifier now that group_id stands in for it.
data <- data %>% select(-personal_id)

data
gender temperature group_id
1 M 99.6 1
2 F 98.2 3
3 M 97.8 2
4 M 95.5 1

Or within the same pipeline (https://github.com/tidyverse/dplyr/issues/2160):

# group_by() + cur_group_id() replaces the deprecated
# group_indices(., personal_id) form (dplyr >= 1.0.0); ungroup() afterwards so
# the result is a plain, ungrouped data frame as before.
data %>%
  group_by(personal_id) %>%
  mutate(group_id = cur_group_id()) %>%
  ungroup()

Assign unique id to consecutive rows within a grouping variable in dplyr

We can use gl

library(dplyr)

# Within each `group`, number the rows in consecutive pairs: 1, 1, 2, 2, 3, ...
# gl(n(), 2, n()) builds a factor whose levels are each repeated twice, cut to
# the group's row count; as.integer() turns the levels into plain ids.
df <- df %>%
  group_by(group) %>%
  mutate(id = as.integer(gl(n(), 2, n()))) %>%
  ungroup()

Group dataframe rows by creating a unique ID column based on the amount of time passed between entries and variable values

Here's a dplyr approach that calculates the gap and rolling avg gap within each Name/Item group, then flags large gaps, and assigns a new group for each large gap or change in Name or Item.

# For each Name/Item series, flag a row as the start of a new group when the
# gap to the previous row exceeds 180 or exceeds 3x the average gap so far;
# `group` is the running count of those flags over the whole table.
# NOTE(review): assumes Date is of class Date and rows are ordered by Date
# within each Name/Item series - confirm against the input.
df1 %>%
  group_by(Name, Item) %>%
  mutate(
    purch_num = row_number(),
    time_since_first = Date - first(Date),
    # The first row of a series gets an infinite gap, so it always opens a new
    # group. `origin` is spelled out because as.Date() on a number requires it
    # before R 4.3.
    gap = Date - lag(Date, default = as.Date(-Inf, origin = "1970-01-01")),
    # Average gap so far; NaN (0 / 0) on the first row of a series.
    avg_gap = time_since_first / (purch_num - 1),
    new_grp_flag = gap > 180 | gap > 3 * avg_gap
  ) %>%
  ungroup() %>%
  mutate(group = cumsum(new_grp_flag))

How to assign a unique ID number to each group of identical values in a column

How about

# Number each distinct `sample` value. factor() sorts its levels, so the ids
# follow the sorted order of the values.
df2 <- transform(df, id = as.numeric(factor(sample)))

?

I think this (cribbed from Add ID column by group) should be slightly more efficient, although perhaps a little harder to remember:

# match() against unique() numbers the values in order of first appearance.
df3 <- transform(df, id=match(sample, unique(sample)))
# NOTE(review): the ids agree with df2 only when the values first appear in
# sorted order, because factor() sorts its levels.
all.equal(df2,df3) ## TRUE

If you want to do this in tidyverse:

library(dplyr)

# cur_group_id() gives every row the integer id of its current group.
# ungroup() afterwards so later verbs are not silently applied per group.
df %>%
  group_by(sample) %>%
  mutate(id = cur_group_id()) %>%
  ungroup()

group rows (ID) and then assign a treatment for each group ID

You can use base R to do this with a merge:

set.seed(1)

# One random treatment per distinct ID. ID is a column of `data`, so it is
# read as data$ID; a bare `ID` only resolves if a vector of that name happens
# to exist in the workspace.
random_trt <- data.frame(
  ID = unique(data$ID),
  New_Treatment = sample(
    c("a", "b", "c"),
    size = length(unique(data$ID)),
    replace = TRUE
  )
)

# One-to-many merge: every row of `data` picks up the treatment of its ID.
merge(data, random_trt, by = "ID", all.x = TRUE)

ID Treatment New_Treatment
1 1 a a
2 1 a a
3 1 a a
4 2 b c
5 2 b c
6 2 b c
7 2 b c
8 2 b c
9 3 c a
10 3 c a
11 4 a b
12 4 a b
13 4 a b
14 4 a b
15 5 b a
16 6 c c
17 6 c c
18 6 c c
19 7 a c
20 7 a c

You use sample to randomly sample your treatment vector for each unique ID. Then you merge that as a one-to-many merge so that it repeats for each ID in data.


Using dplyr:

set.seed(1)

# Draw a single treatment per ID group; the length-1 draw is recycled to
# every row of that group.
data %>%
  dplyr::group_by(ID) %>%
  dplyr::mutate(
    New_Treatment = sample(c("a", "b", "c"), size = 1)
  )

ID Treatment New_Treatment
<dbl> <chr> <chr>
1 1 a a
2 1 a a
3 1 a a
4 2 b c
5 2 b c
6 2 b c
7 2 b c
8 2 b c
9 3 c a
10 3 c a
11 4 a b
12 4 a b
13 4 a b
14 4 a b
15 5 b a
16 6 c c
17 6 c c
18 6 c c
19 7 a c
20 7 a c

generate id within group

You can use data.table::rleid(), i.e.

library(dplyr)

# rleid() increments each time VarB differs from the previous row; computed
# per VarA group, so the count restarts at 1 in every group.
df %>%
  group_by(VarA) %>%
  mutate(id = data.table::rleid(VarB))

# A tibble: 6 x 3
# Groups: VarA [2]
# VarA VarB id
# <chr> <chr> <int>
#1 A aaaa 1
#2 A aaaa 1
#3 B bbbb 1
#4 B bbbb 1
#5 B bbbb 1
#6 B cccc 2

Assign unique ID to distinct values within Group with dplyr

Using data.table and sprintf:

library(data.table)

# Per Emp: <Emp>.<run number of Color, 2 digits>.<row number in run, 3 digits>.
# setDT() converts `dat` in place and := adds the ID column by reference.
setDT(dat)[
  ,
  ID := sprintf('%s.%02d.%03d', Emp, rleid(Color), rowid(rleid(Color))),
  by = Emp
]

you get:

> dat
Emp Color ID
1: A Red A.01.001
2: A Green A.02.001
3: A Green A.02.002
4: B Orange B.01.001
5: B Yellow B.02.001
6: C Brown C.01.001

How this works:

  • You convert dat to a data.table with setDT()
  • Group by Emp.
  • And create the ID variable with the sprintf function. With sprintf you can easily paste several vectors together according to a specified format.
  • The use of := means that the data.table is updated by reference.
  • %s indicates that a string is to be used in the first part (which is Emp). %02d & %03d indicate that a number needs to have two or three digits, with leading zero(s) when needed. The dots in between will be taken literally and thus included in the resulting string.

Addressing the comment of @jsta, if the values in the Color-column are not sequential you can use:

# Number the colours by first appearance within each Emp, build the ID from
# that number plus a per-colour row counter, then drop the helper column `r`.
setDT(dat)[
  , r := as.integer(factor(Color, levels = unique(Color))), by = Emp
][
  , ID := sprintf('%s.%02d.%03d', Emp, r, rowid(r)), by = Emp
][
  , r := NULL
]

This will also maintain the order in which the Color column is presented. Instead of as.integer(factor(Color, levels = unique(Color))) you can also use match(Color, unique(Color)) as shown by akrun.

Implementing the above on a bit larger dataset to illustrate:

# Stack the table on itself so every Emp/Color pair occurs in two separate
# stretches, then number colours with match() and build the ID as before.
dat2 <- rbindlist(list(dat, dat))
dat2[
  , r := match(Color, unique(Color)), by = Emp
][
  , ID := sprintf('%s.%02d.%03d', Emp, r, rowid(r)), by = Emp
]

gets you:

> dat2
Emp Color r ID
1: A Red 1 A.01.001
2: A Green 2 A.02.001
3: A Green 2 A.02.002
4: B Orange 1 B.01.001
5: B Yellow 2 B.02.001
6: C Brown 1 C.01.001
7: A Red 1 A.01.002
8: A Green 2 A.02.003
9: A Green 2 A.02.004
10: B Orange 1 B.01.002
11: B Yellow 2 B.02.002
12: C Brown 1 C.01.002

assign a unique ID number for every repeated value in a column R

It can be done using rowid

library(data.table)
library(dplyr)

# rowid() counts 1, 2, 3, ... within each distinct Name value.
weighted_df %>%
  mutate(ID = rowid(Name))

-output

#     Name        Room1       Room2        Room3 ID
#1 H001 0.579649851 0.84602529 0.620850211 1
#2 H001 0.579649851 0.84602529 0.620850211 2
#3 H001 0.579649851 0.84602529 0.620850211 3
#4 H001 0.579649851 0.84602529 0.620850211 4
#5 H001 0.579649851 0.84602529 0.620850211 5
#6 H001 0.579649851 0.84602529 0.620850211 6
#7 H001 0.579649851 0.84602529 0.620850211 7
#8 H001 0.579649851 0.84602529 0.620850211 8
#9 H001 0.579649851 0.84602529 0.620850211 9
#10 H001 0.579649851 0.84602529 0.620850211 10
#11 H001 0.579649851 0.84602529 0.620850211 11
#12 H001 0.579649851 0.84602529 0.620850211 12
#13 H001 0.579649851 0.84602529 0.620850211 13
#14 H001 0.579649851 0.84602529 0.620850211 14
#15 H001 0.579649851 0.84602529 0.620850211 15
#16 H001 0.579649851 0.84602529 0.620850211 16
#17 H001 0.579649851 0.84602529 0.620850211 17
#18 H002 1.457267473 -1.18612874 0.553957293 1
#19 H002 1.457267473 -1.18612874 0.553957293 2
# ...

R: Create numbering within each group

A) The dplyr package offers group_indices() for adding unique group identifiers:

library(dplyr)

# Group first, then call group_indices() with no columns: passing the columns
# directly (group_indices(df, ID)) is deprecated since dplyr 1.0.0.
df$number <- df %>%
  group_by(ID) %>%
  group_indices()
df

# A tibble: 10 × 3
study ID number
<chr> <dbl> <int>
1 A 1 1
2 B 1 1
3 C 1 1
4 A 5 2
5 B 5 2
...

B) You can keep only the groups that contain exactly 3 observations (i.e., "A", "B" and "C"), dropping all other groups, with filter():

# Keep only the ID groups that have exactly 3 rows; n() is the size of the
# current group.
df %>%
  group_by(ID) %>%
  filter(n() == 3)

# A tibble: 6 × 3
# Groups: ID [2]
study ID number
<chr> <dbl> <int>
1 A 1 1
2 B 1 1
3 C 1 1
4 A 7 3
5 B 7 3
6 C 7 3

R: add a dplyr group label as a number

I think in this case something as simple as :

# as.integer() on a factor returns its level codes, which serve as the group
# number here because `name` is a factor.
df %>%
  mutate(group_no = as.integer(name))

will work

# A tibble: 20 x 4
# Groups: id [2]
id name val group_no
<fct> <fct> <dbl> <int>
1 a N1 0.647 1
2 a N1 0.530 1
3 a N1 0.245 1
4 a N2 0.693 2
5 a N2 0.478 2
6 a N2 0.861 2
7 a N3 0.821 3
8 a N3 0.0995 3
9 a N3 0.662 3
10 b N1 0.553 1
11 b N1 0.0233 1
12 b N1 0.519 1
13 b N2 0.783 2
14 b N2 0.789 2
15 b N2 0.477 2
16 b N2 0.438 2
17 b N2 0.407 2
18 b N3 0.732 3
19 b N3 0.0707 3
20 b N3 0.316 3


Related Topics



Leave a reply



Submit