create number of consecutive sequences
Here's a rather hacky solution using dplyr and tidyr:
# Label each run of consecutive 1s in `response` with a per-id counter
# (1, 2, 3, ...); rows where `response` is 0 get the label 0.
df <- df %>%
  group_by(id) %>%
  mutate(
    # Previous response within the id; the first row is treated as if it
    # followed a 0.
    lag_res = lag(response, default = 0),
    # 1 on the row where a run of 1s begins, 0 otherwise.
    first = as.numeric(lag_res == 0 & response == 1),
    # Run starts take the running count of starts, zeros take 0, and the
    # remaining rows (continuations of a run) stay NA until fill() below.
    want_group = case_when(
      first == 1 ~ cumsum(first),
      response == 0 ~ 0,
      TRUE ~ NA_real_
    )
  ) %>%
  # Carry each run's label over its continuation rows.
  fill(want_group) %>%
  # The helper columns are no longer needed.
  select(-c(lag_res, first)) %>%
  # Print all 26 rows while still grouped, then drop the grouping.
  print(n = 26) %>%
  ungroup()
# A tibble: 26 x 5
# Groups: id [2]
row date id response want_group
<int> <date> <chr> <dbl> <dbl>
1 1 2021-10-06 A 1 1
2 2 2021-10-07 A 0 0
3 3 2021-10-08 A 1 2
4 4 2021-10-09 A 0 0
5 5 2021-10-10 A 0 0
6 6 2021-10-11 A 1 3
7 7 2021-10-12 A 1 3
8 8 2021-10-13 A 1 3
9 9 2021-10-14 A 1 3
10 10 2021-10-15 A 0 0
11 11 2021-10-16 A 1 4
12 12 2021-10-17 A 0 0
13 13 2021-10-18 A 0 0
14 14 2021-10-06 B 0 0
15 15 2021-10-07 B 0 0
16 16 2021-10-08 B 0 0
17 17 2021-10-09 B 1 1
18 18 2021-10-10 B 1 1
19 19 2021-10-11 B 0 0
20 20 2021-10-12 B 0 0
21 21 2021-10-13 B 0 0
22 22 2021-10-14 B 0 0
23 23 2021-10-15 B 0 0
24 24 2021-10-16 B 1 2
25 25 2021-10-17 B 1 2
26 26 2021-10-18 B 1 2
And then, to get cs_res, you can do:
# Running total of `response` inside every (id, want_group) combination;
# the result is left grouped by id and want_group.
df %>%
  group_by(id, want_group) %>%
  mutate(cs_res = cumsum(response))
# A tibble: 26 x 6
# Groups: id, want_group [8]
row date id response want_group cs_res
<int> <date> <chr> <dbl> <dbl> <dbl>
1 1 2021-10-06 A 1 1 1
2 2 2021-10-07 A 0 0 0
3 3 2021-10-08 A 1 2 1
4 4 2021-10-09 A 0 0 0
5 5 2021-10-10 A 0 0 0
6 6 2021-10-11 A 1 3 1
7 7 2021-10-12 A 1 3 2
8 8 2021-10-13 A 1 3 3
9 9 2021-10-14 A 1 3 4
10 10 2021-10-15 A 0 0 0
Create a sequential number within each group
A simple solution with Base R:
# Number the segments within each id: the counter goes up by one on every row
# whose `gap` is "gap", so a flagged row and the rows after it share a number.
# `%in%` gives a plain logical vector (FALSE for NA), compares a factor column
# by its labels, and stays a logical vector when `df` has zero rows -- unlike
# sapply(..., identical, "gap"), which returns list() on empty input and is
# FALSE for every element of a factor.
df$seq <- ave(df$gap %in% "gap", df$id, FUN = cumsum)
df
#> id date lc lon lat gap_days gap seq
#> 1 20162.03 2003-10-19 14:33:00 Tagging -39.370 -18.480 NA <NA> 0
#> 2 20162.03 2003-10-21 12:19:00 1 -38.517 -18.253 1.90694444 gap 1
#> 3 20162.03 2003-10-21 13:33:00 1 -38.464 -18.302 0.05138889 no 1
#> 4 20162.03 2003-10-21 16:38:00 A -38.461 -18.425 0.12847222 no 1
#> 5 20162.03 2003-10-21 18:50:00 A -38.322 -18.512 0.09166667 no 1
#> 6 20162.03 2003-10-23 10:33:00 B -38.674 -19.824 1.65486111 gap 2
#> 7 20162.03 2003-10-23 17:52:00 B -38.957 -19.511 0.30486111 no 2
#> 8 20162.03 2003-11-02 08:14:00 B -42.084 -24.071 9.59861111 gap 3
#> 9 20162.03 2003-11-02 09:36:00 A -41.999 -24.114 0.05694444 no 3
#> 10 20687.03 2003-10-27 17:02:00 Tagging -39.320 -18.460 NA <NA> 0
#> 11 20687.03 2003-10-27 19:44:00 2 -39.306 -18.454 0.11250000 no 0
#> 12 20687.03 2003-10-27 21:05:00 1 -39.301 -18.458 0.05625000 no 0
And then split it:
# One data frame per (id, seq) pair; pairs that have no rows are dropped.
split(df, df[c("id", "seq")], drop = TRUE)
#> $`20162.03.0`
#> id date lc lon lat gap_days gap seq
#> 1 20162.03 2003-10-19 14:33:00 Tagging -39.37 -18.48 NA <NA> 0
#>
#> $`20687.03.0`
#> id date lc lon lat gap_days gap seq
#> 10 20687.03 2003-10-27 17:02:00 Tagging -39.320 -18.460 NA <NA> 0
#> 11 20687.03 2003-10-27 19:44:00 2 -39.306 -18.454 0.11250 no 0
#> 12 20687.03 2003-10-27 21:05:00 1 -39.301 -18.458 0.05625 no 0
#>
#> $`20162.03.1`
#> id date lc lon lat gap_days gap seq
#> 2 20162.03 2003-10-21 12:19:00 1 -38.517 -18.253 1.90694444 gap 1
#> 3 20162.03 2003-10-21 13:33:00 1 -38.464 -18.302 0.05138889 no 1
#> 4 20162.03 2003-10-21 16:38:00 A -38.461 -18.425 0.12847222 no 1
#> 5 20162.03 2003-10-21 18:50:00 A -38.322 -18.512 0.09166667 no 1
#>
#> $`20162.03.2`
#> id date lc lon lat gap_days gap seq
#> 6 20162.03 2003-10-23 10:33:00 B -38.674 -19.824 1.6548611 gap 2
#> 7 20162.03 2003-10-23 17:52:00 B -38.957 -19.511 0.3048611 no 2
#>
#> $`20162.03.3`
#> id date lc lon lat gap_days gap seq
#> 8 20162.03 2003-11-02 08:14:00 B -42.084 -24.071 9.59861111 gap 3
#> 9 20162.03 2003-11-02 09:36:00 A -41.999 -24.114 0.05694444 no 3
Group by sequential data in R
Here is one dplyr option:
library(dplyr)
df %>%
  group_by(gene_name) %>%
  # Flag rows whose gene_number is more than 2 above the previous row of the
  # same gene; the first row of a gene is compared against 0.
  mutate(grp = (gene_number - lag(gene_number, default = 0)) > 2) %>%
  # Turn the flags into a running group id and regroup by it (this replaces
  # the gene_name grouping).
  group_by(grp = cumsum(grp)) %>%
  # Discard groups that consist of a single row.
  filter(n() > 1) %>%
  ungroup()
# gene_name gene_number grp
# <chr> <int> <int>
#1 ENSMUSG00000000001 4732 1
#2 ENSMUSG00000000001 4733 1
#3 ENSMUSG00000000058 7603 2
#4 ENSMUSG00000000058 7604 2
#5 ENSMUSG00000000058 8246 3
#6 ENSMUSG00000000058 8248 3
For each gene_name, subtract the previous gene_number value from the current one and increment the group count if the difference is greater than 2. Drop any group that contains only a single row.
data
# Sample data: two rows for the first gene and five for the second, with
# integer gene numbers.
df <- data.frame(
  gene_name = rep(
    c("ENSMUSG00000000001", "ENSMUSG00000000058"),
    times = c(2L, 5L)
  ),
  gene_number = c(4732L, 4733L, 7603L, 7604L, 8246L, 8248L, 9001L),
  stringsAsFactors = FALSE
)
Group rows based on consecutive line numbers
Convert the numbers to numeric, calculate difference between consecutive numbers and increment the group count when the difference is greater than 1.
# Start a new group whenever the line number jumps by more than 1 from the
# previous row. as.character() is applied first so that a factor column is
# converted by its labels ("0011" -> 11) rather than by its integer level
# codes; for a character column it changes nothing.
transform(
  df,
  group = cumsum(c(TRUE, diff(as.numeric(as.character(line))) > 1))
)
# line group
#1 0001 1
#2 0002 1
#3 0003 1
#4 0011 2
#5 0012 2
#6 0234 3
#7 0235 3
#8 0236 3
If you want to use dplyr:
library(dplyr)
# Same grouping as a dplyr step: a new group starts whenever the line number
# jumps by more than 1. as.character() is applied first so that a factor
# column is converted by its labels rather than its integer level codes.
df %>%
  mutate(group = cumsum(c(TRUE, diff(as.numeric(as.character(line))) > 1)))
How to create a consecutive group number
Try Data$number <- as.numeric(as.factor(Data$site))
As a side note: the difference between my solution and @Chase's on the one hand, and @DWin's on the other, is the ordering of the numbers. Both as.factor and factor automatically sort the levels, whereas that doesn't happen in @DWin's solution:
# Three sites (1, 8, 4 -- deliberately not in sorted order), three rows each,
# with a random score per row.
Dat <- data.frame(
  site = rep(c(1, 8, 4), each = 3),
  score = runif(9)
)
# Numbering via factor(): follows the sorted site values (1 -> 1, 4 -> 2, 8 -> 3).
Dat[["number"]] <- as.numeric(factor(Dat[["site"]]))
# Numbering via match()/unique(): follows order of first appearance
# (1 -> 1, 8 -> 2, 4 -> 3).
Dat[["sitenum"]] <- match(Dat[["site"]], unique(Dat[["site"]]))
Gives
> Dat
site score number sitenum
1 1 0.7377561 1 1
2 1 0.3131139 1 1
3 1 0.7862290 1 1
4 8 0.4480387 3 2
5 8 0.3873210 3 2
6 8 0.8778102 3 2
7 4 0.6916340 2 3
8 4 0.3033787 2 3
9 4 0.6552808 2 3
Identify groups of n consecutive numbers in a data.table field in a group
A solution using the tidyverse.
library(tidyverse)
library(data.table)
DT2 <- DT %>%
  arrange(Student, Month) %>%
  group_by(Student) %>%
  mutate(
    # Three-value window starting at each row's Month:
    # Month, Month + 1, Month + 2.
    Seq = map(Month, function(m) seq.int(m, m + 2L)),
    # TRUE when every value of the window occurs among this Student's months.
    Flag = map_lgl(Seq, function(win) all(win %in% Month))
  ) %>%
  # Keep only the flagged rows, then drop the helper columns.
  filter(Flag) %>%
  select(-c(Seq, Flag)) %>%
  ungroup()
DT2
# # A tibble: 11 x 2
# Student Month
# <dbl> <dbl>
# 1 1 1
# 2 1 5
# 3 1 6
# 4 2 2
# 5 2 3
# 6 2 7
# 7 2 8
# 8 3 1
# 9 3 5
# 10 3 6
# 11 3 7
Related Topics
How to Match Fuzzy Match Strings from Two Datasets
Consistent Width For Geom_Bar in the Event of Missing Data
How to Count Runs in a Sequence
Yaml Current Date in Rmarkdown
Converting Decimal to Binary in R
Adding Minor Tick Marks to the X Axis in Ggplot2 (With No Labels)
How to Add a Row to a Data Frame in R
How to Delete Multiple Values from a Vector
Ggplot, Facet, Piechart: Placing Text in the Middle of Pie Chart Slices
Read All Files in a Folder and Apply a Function to Each Data Frame
How to Get Week Numbers from Dates
Dplyr Filter: Get Rows With Minimum of Variable, But Only the First If Multiple Minima