generate sequence within group in R
Use ave
with seq_along
:
> mydf$C <- with(mydf, ave(A, A, B, FUN = seq_along))
> mydf
A B C
1 1 1 1
2 1 2 1
3 1 2 2
4 1 3 1
5 1 3 2
6 1 3 3
7 1 4 1
8 1 4 2
If your data are already ordered (as they are in this case), you can also use sequence
with rle
(mydf$C <- sequence(rle(do.call(paste, mydf))$lengths)
), but you don't have that limitation with ave
.
If you're a data.table
fan, you can make use of .N
as follows:
library(data.table)
DT <- data.table(mydf)
DT[, C := sequence(.N), by = c("A", "B")]
DT
# A B C
# 1: 1 1 1
# 2: 1 2 1
# 3: 1 2 2
# 4: 1 3 1
# 5: 1 3 2
# 6: 1 3 3
# 7: 1 4 1
# 8: 1 4 2
Generate sequence within sub group in data.table
We group by 'id1', sort
the 'val' and then create 'grp' as row_number()
input %>%
group_by(id1) %>%
mutate(val = sort(val), grp= row_number())
Or another option is to arrange
input %>%
arrange(id1, val) %>%
group_by(id1) %>%
mutate(grp = row_number())
Or using data.table
library(data.table)
setDT(input)[, c("grp", "val") := .(seq_len(.N), sort(val)), by = id1]
input
# id1 val grp
#1: 1 1 1
#2: 1 2 2
#3: 1 3 3
#4: 1 4 4
#5: 2 3 1
#6: 2 4 2
#7: 2 5 3
If we need to sort as well, use setorder
based on the 'id1' and 'val' to order in place, then create the 'grp' as the rowid
of 'id1'
input <- data.frame("id1"=c(1,1,1,1,2,2,2),val=c(2,3,4,1,4,3,5),
achar=c('a','a','b','b','d','c','e'))
setorder(setDT(input), id1, val)[, grp := rowid(id1)][]
# id1 val achar grp
#1: 1 1 b 1
#2: 1 2 a 2
#3: 1 3 a 3
#4: 1 4 b 4
#5: 2 3 c 1
#6: 2 4 d 2
#7: 2 5 e 3
Create a sequential number within each group
A simple solution with Base R:
df$seq <- ave(sapply(df$gap, identical, "gap"), df$id, FUN = cumsum)
df
#> id date lc lon lat gap_days gap seq
#> 1 20162.03 2003-10-19 14:33:00 Tagging -39.370 -18.480 NA <NA> 0
#> 2 20162.03 2003-10-21 12:19:00 1 -38.517 -18.253 1.90694444 gap 1
#> 3 20162.03 2003-10-21 13:33:00 1 -38.464 -18.302 0.05138889 no 1
#> 4 20162.03 2003-10-21 16:38:00 A -38.461 -18.425 0.12847222 no 1
#> 5 20162.03 2003-10-21 18:50:00 A -38.322 -18.512 0.09166667 no 1
#> 6 20162.03 2003-10-23 10:33:00 B -38.674 -19.824 1.65486111 gap 2
#> 7 20162.03 2003-10-23 17:52:00 B -38.957 -19.511 0.30486111 no 2
#> 8 20162.03 2003-11-02 08:14:00 B -42.084 -24.071 9.59861111 gap 3
#> 9 20162.03 2003-11-02 09:36:00 A -41.999 -24.114 0.05694444 no 3
#> 10 20687.03 2003-10-27 17:02:00 Tagging -39.320 -18.460 NA <NA> 0
#> 11 20687.03 2003-10-27 19:44:00 2 -39.306 -18.454 0.11250000 no 0
#> 12 20687.03 2003-10-27 21:05:00 1 -39.301 -18.458 0.05625000 no 0
And then split it:
split(df, list(df$id, df$seq), drop = TRUE)
#> $`20162.03.0`
#> id date lc lon lat gap_days gap seq
#> 1 20162.03 2003-10-19 14:33:00 Tagging -39.37 -18.48 NA <NA> 0
#>
#> $`20687.03.0`
#> id date lc lon lat gap_days gap seq
#> 10 20687.03 2003-10-27 17:02:00 Tagging -39.320 -18.460 NA <NA> 0
#> 11 20687.03 2003-10-27 19:44:00 2 -39.306 -18.454 0.11250 no 0
#> 12 20687.03 2003-10-27 21:05:00 1 -39.301 -18.458 0.05625 no 0
#>
#> $`20162.03.1`
#> id date lc lon lat gap_days gap seq
#> 2 20162.03 2003-10-21 12:19:00 1 -38.517 -18.253 1.90694444 gap 1
#> 3 20162.03 2003-10-21 13:33:00 1 -38.464 -18.302 0.05138889 no 1
#> 4 20162.03 2003-10-21 16:38:00 A -38.461 -18.425 0.12847222 no 1
#> 5 20162.03 2003-10-21 18:50:00 A -38.322 -18.512 0.09166667 no 1
#>
#> $`20162.03.2`
#> id date lc lon lat gap_days gap seq
#> 6 20162.03 2003-10-23 10:33:00 B -38.674 -19.824 1.6548611 gap 2
#> 7 20162.03 2003-10-23 17:52:00 B -38.957 -19.511 0.3048611 no 2
#>
#> $`20162.03.3`
#> id date lc lon lat gap_days gap seq
#> 8 20162.03 2003-11-02 08:14:00 B -42.084 -24.071 9.59861111 gap 3
#> 9 20162.03 2003-11-02 09:36:00 A -41.999 -24.114 0.05694444 no 3
Code sequence by group in R: recurring values within group
In data.table
, we have rleid
function which makes it simple here.
library(data.table)
setDT(df)[, seq1 := seq_len(.N), .(ID, rleid(loc))]
df
# ID yr loc seq seq1
# 1: 1 1990 A 1 1
# 2: 1 1991 A 2 2
# 3: 1 1992 B 1 1
# 4: 1 1993 B 2 2
# 5: 1 1994 B 3 3
# 6: 2 1990 B 1 1
# 7: 2 1991 B 2 2
# 8: 2 1992 A 1 1
# 9: 2 1993 B 1 1
#10: 2 1994 B 2 2
#11: 3 1990 C 1 1
#12: 3 1991 C 2 2
#13: 3 1992 C 3 3
#14: 3 1993 B 1 1
#15: 3 1994 C 1 1
We can use rleid
in dplyr
and base R approaches to get expected output.
library(dplyr)
df %>%
group_by(ID, grp = data.table::rleid(loc)) %>%
mutate(seq1 = row_number())
Or in base R :
df$seq1 <- with(df, ave(yr, ID, data.table::rleid(loc), FUN = seq_along))
A concise option suggested by @chinsoon12 is to use rowid
function.
setDT(df)[, seq2 := rowid(ID, rleid(loc))]
data
df <- structure(list(ID = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
3L, 3L, 3L, 3L, 3L), yr = c(1990L, 1991L, 1992L, 1993L, 1994L,
1990L, 1991L, 1992L, 1993L, 1994L, 1990L, 1991L, 1992L, 1993L,
1994L), loc = structure(c(1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L,
2L, 3L, 3L, 3L, 2L, 3L), .Label = c("A", "B", "C"), class = "factor"),
seq = c(1L, 2L, 1L, 2L, 3L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 3L,
1L, 1L)), class = "data.frame", row.names = c(NA, -15L))
Numbering rows within groups in a data frame
Use ave
, ddply
, dplyr
or data.table
:
df$num <- ave(df$val, df$cat, FUN = seq_along)
or:
library(plyr)
ddply(df, .(cat), mutate, id = seq_along(val))
or:
library(dplyr)
df %>% group_by(cat) %>% mutate(id = row_number())
or (the most memory efficient, as it assigns by reference within DT
):
library(data.table)
DT <- data.table(df)
DT[, id := seq_len(.N), by = cat]
DT[, id := rowid(cat)]
Create Group from Sequences
We can use rleid
from data.table
library(data.table)
rleid(x)
#[1] 1 2 3 3 4 5 6
Or in base R
with rle
with(rle(x), rep(seq_along(values), lengths))
#[1] 1 2 3 3 4 5 6
Or if we use the similar approach from OP
1 + cumsum(x != dplyr::lag(x, default = first(x)))
How to create a sequency by group from a specific string in R?
You can first create a column specifying the first occurrence of "UNP", then use cumsum()
and lag()
to calculate the Seq
column.
library(dplyr)
df <- read.table(header = T, text = "
ColA Colb Seq
A HM 0
A RES 0
A UNP 0
A RES 1
A RES 2
A HM 3
B HM 0
B RES 0
B UNP 0
B RES 1
B UNP 2
C UNP 0") %>%
select(-Seq)
df %>%
group_by(ColA, Colb) %>%
mutate(seq_count = ifelse(first(Colb) == "UNP" & !duplicated(Colb), 1, 0)) %>%
group_by(ColA) %>%
mutate(Seq = lag(cumsum(cumsum(seq_count)), default = 0), .keep = "unused")
#> # A tibble: 12 × 3
#> # Groups: ColA [3]
#> ColA Colb Seq
#> <chr> <chr> <dbl>
#> 1 A HM 0
#> 2 A RES 0
#> 3 A UNP 0
#> 4 A RES 1
#> 5 A RES 2
#> 6 A HM 3
#> 7 B HM 0
#> 8 B RES 0
#> 9 B UNP 0
#> 10 B RES 1
#> 11 B UNP 2
#> 12 C UNP 0
Created on 2022-03-31 by the reprex package (v2.0.1)
Sequentially Number Group within Group
One way would be to join your original data to a summarized version of the same data where you assign group id's for the variable in question. Here's an example on a standard data set:
left_join(mtcars, mtcars %>% group_by(gear) %>% summarize(id = cur_group_id()))
Or, using a version of data like yours:
dft <- data.frame(
stringsAsFactors = FALSE,
pmid = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L),
item = c("Age", "Age", "BMI", "BMI", "Age", "Age", "BMI", "Duration")
)
left_join(
dft,
dft %>%
group_by(item) %>%
summarize(id = cur_group_id()))
Result
Joining, by = "item"
pmid item id
1 1 Age 1
2 1 Age 1
3 1 BMI 2
4 1 BMI 2
5 2 Age 1
6 2 Age 1
7 2 BMI 2
8 2 Duration 3
Please note, for future questions, it will make it easier for others to help you if you can provide some example data as code we can load, as opposed to a printout of what the data looks like. It's often easiest to use the magical dput
function, by running dput(dft)
or dput(head(dft, 50))
and pasting the output into the body of your question.
Create a sequence of unique observations by group with dplyr and create a difference in months column
Using zoo::as.yearmon()
, however, I had to round
because otherwise 2015-11-26
to 2015-12-26
is considered longer than one month. Perhaps someone can comment/edit/explain how to make that particular calculation more "intuitive".
library(dplyr)
library(zoo)
df %>%
group_by(User) %>%
mutate(Count = 1:n(),
Gap_In_Months = round(12 * as.numeric(as.yearmon(Date) - as.yearmon(lag(Date))), 1),
Gap = ifelse(Gap_In_Months <= 1 | is.na(Gap_In_Months), 0, Gap_In_Months))
# User Date Count Gap_In_Months Gap
# (fctr) (fctr) (int) (dbl) (dbl)
# 1 aaaa 2015-11-26 1 NA 0
# 2 aaaa 2015-12-26 2 1 0
# 3 aaaa 2016-01-26 3 1 0
# 4 bbbb 2014-10-15 1 NA 0
# 5 bbbb 2014-11-15 2 1 0
# 6 bbbb 2015-05-16 3 6 6
Perhaps you want to be more specific as to "what is a month"? 30 days? 31 days? 28 days?
If that's the case, we can utilize lubridate
:
library(lubridate)
df %>%
group_by(User) %>%
mutate(Count = 1:n(),
Diff_Time = ymd(Date) - ymd(lag(Date)),
Gap = ifelse(Diff_Time <= ddays(31) | is.na(Diff_Time), 0, as.numeric(Diff_Time, units = "days")))
# User Date Count Diff_Time Gap
# (fctr) (fctr) (int) (dfft) (dbl)
# 1 aaaa 2015-11-26 1 NA days 0
# 2 aaaa 2015-12-26 2 30 days 0
# 3 aaaa 2016-01-26 3 31 days 0
# 4 bbbb 2014-10-15 1 NA days 0
# 5 bbbb 2014-11-15 2 31 days 0
# 6 bbbb 2015-05-16 3 182 days 182
Related Topics
Long/Bigint/Decimal Equivalent Datatype in R
How to Delete Multiple Values from a Vector
Standard Evaluation in Dplyr: Summarise a Variable Given as a Character String
Convert Data.Frame Column Format from Character to Factor
How to Order Data by Value Within Ggplot Facets
What Do Hjust and Vjust Do When Making a Plot Using Ggplot
Cumulatively Paste (Concatenate) Values Grouped by Another Variable
Dplyr Join on By=(A = B), Where a and B Are Variables Containing Strings
Dynamically Add Plots to Web Page Using Shiny
Shiny 4 Small Textinput Boxes Side-By-Side
How Does One Reorder Columns in a Data Frame
What Are the "Standard Unambiguous Date" Formats For String-To-Date Conversion in R
Limit Ggplot2 Axes Without Removing Data (Outside Limits): Zoom
Change Variable Name in For Loop Using R
Aggregate a Data Frame Based on Unordered Pairs of Columns
What Does %≫% Function Mean in R
Calculating Statistics on Subsets of Data
How to Fill Geom_Polygon With Different Colors Above and Below Y = 0 (Or Any Other Value)