How to Create a Consecutive Group Number

How to create a consecutive group number

Try:

Data$number <- as.numeric(as.factor(Data$site))

On a side note: the difference between my solution and @Chase's on the one hand, and @DWin's on the other, is the ordering of the numbers. Both as.factor and factor automatically sort the levels, so the numbers follow the sorted site values, whereas @DWin's solution numbers the sites in order of first appearance:

# Three sites, deliberately out of numeric order, with three rows each.
Dat <- data.frame(site = rep(c(1, 8, 4), each = 3), score = runif(9))

Dat <- transform(
  Dat,
  # factor() sorts its levels, so the codes follow the sorted site values
  number = as.numeric(factor(site)),
  # match() against unique() numbers the sites in order of first appearance
  sitenum = match(site, unique(site))
)

Gives

> Dat
  site     score number sitenum
1    1 0.7377561      1       1
2    1 0.3131139      1       1
3    1 0.7862290      1       1
4    8 0.4480387      3       2
5    8 0.3873210      3       2
6    8 0.8778102      3       2
7    4 0.6916340      2       3
8    4 0.3033787      2       3
9    4 0.6552808      2       3

Group rows based on consecutive line numbers

Convert the line numbers to numeric, calculate the difference between consecutive values, and increment the group count whenever that difference is greater than 1.

# The first row always opens group 1; every later row opens a new group when
# `line`, read as a number, exceeds the previous row's value by more than 1.
transform(
  df,
  group = cumsum(c(TRUE, diff(as.numeric(line), lag = 1L) > 1))
)

#  line group
#1 0001     1
#2 0002     1
#3 0003     1
#4 0011     2
#5 0012     2
#6 0234     3
#7 0235     3
#8 0236     3

If you want to use dplyr :

library(dplyr)

# Flag the first row and every row whose numeric `line` jumps by more than 1,
# then take the running count of those flags as the group number.
mutate(df, group = cumsum(c(TRUE, diff(as.numeric(line)) > 1)))

Create a sequential number within each group

A simple solution with Base R:

# Flag the rows that open a new gap. `%in%` is vectorised, always returns a
# plain logical vector (also for zero rows or a factor column) and gives FALSE
# for NA, unlike element-wise sapply(..., identical, "gap").
# cumsum() within each id then yields 0 before the first gap, then 1, 2, ...
df$seq <- ave(df$gap %in% "gap", df$id, FUN = cumsum)
df
#>          id                date      lc     lon     lat   gap_days  gap seq
#> 1  20162.03 2003-10-19 14:33:00 Tagging -39.370 -18.480         NA <NA>   0
#> 2  20162.03 2003-10-21 12:19:00       1 -38.517 -18.253 1.90694444  gap   1
#> 3  20162.03 2003-10-21 13:33:00       1 -38.464 -18.302 0.05138889   no   1
#> 4  20162.03 2003-10-21 16:38:00       A -38.461 -18.425 0.12847222   no   1
#> 5  20162.03 2003-10-21 18:50:00       A -38.322 -18.512 0.09166667   no   1
#> 6  20162.03 2003-10-23 10:33:00       B -38.674 -19.824 1.65486111  gap   2
#> 7  20162.03 2003-10-23 17:52:00       B -38.957 -19.511 0.30486111   no   2
#> 8  20162.03 2003-11-02 08:14:00       B -42.084 -24.071 9.59861111  gap   3
#> 9  20162.03 2003-11-02 09:36:00       A -41.999 -24.114 0.05694444   no   3
#> 10 20687.03 2003-10-27 17:02:00 Tagging -39.320 -18.460         NA <NA>   0
#> 11 20687.03 2003-10-27 19:44:00       2 -39.306 -18.454 0.11250000   no   0
#> 12 20687.03 2003-10-27 21:05:00       1 -39.301 -18.458 0.05625000   no   0

And then split it:

# One data frame per observed id/seq combination; drop = TRUE discards the
# combinations that never occur.
split(df, df[c("id", "seq")], drop = TRUE)
#> $`20162.03.0`
#>         id                date      lc    lon    lat gap_days  gap seq
#> 1 20162.03 2003-10-19 14:33:00 Tagging -39.37 -18.48       NA <NA>   0
#> 
#> $`20687.03.0`
#>          id                date      lc     lon     lat gap_days  gap seq
#> 10 20687.03 2003-10-27 17:02:00 Tagging -39.320 -18.460       NA <NA>   0
#> 11 20687.03 2003-10-27 19:44:00       2 -39.306 -18.454  0.11250   no   0
#> 12 20687.03 2003-10-27 21:05:00       1 -39.301 -18.458  0.05625   no   0
#> 
#> $`20162.03.1`
#>         id                date lc     lon     lat   gap_days gap seq
#> 2 20162.03 2003-10-21 12:19:00  1 -38.517 -18.253 1.90694444 gap   1
#> 3 20162.03 2003-10-21 13:33:00  1 -38.464 -18.302 0.05138889  no   1
#> 4 20162.03 2003-10-21 16:38:00  A -38.461 -18.425 0.12847222  no   1
#> 5 20162.03 2003-10-21 18:50:00  A -38.322 -18.512 0.09166667  no   1
#> 
#> $`20162.03.2`
#>         id                date lc     lon     lat  gap_days gap seq
#> 6 20162.03 2003-10-23 10:33:00  B -38.674 -19.824 1.6548611 gap   2
#> 7 20162.03 2003-10-23 17:52:00  B -38.957 -19.511 0.3048611  no   2
#> 
#> $`20162.03.3`
#>         id                date lc     lon     lat   gap_days gap seq
#> 8 20162.03 2003-11-02 08:14:00  B -42.084 -24.071 9.59861111 gap   3
#> 9 20162.03 2003-11-02 09:36:00  A -41.999 -24.114 0.05694444  no   3

Group by sequential data in R

Here is one dplyr option -

library(dplyr)

df %>%
  group_by(gene_name) %>%
  # A run starts on the first row of each gene_name, or wherever gene_number
  # jumps by more than 2. Flagging the first row explicitly (instead of
  # lagging with default = 0) keeps this correct when a gene's first
  # gene_number is 2 or less.
  mutate(grp = row_number() == 1L | gene_number - lag(gene_number) > 2) %>%
  # Regroup by the running count of run starts, which replaces the gene_name
  # grouping and gives every run its own id.
  group_by(grp = cumsum(grp)) %>%
  # Keep only runs that contain at least two rows.
  filter(n() > 1) %>%
  ungroup()
  
#  gene_name          gene_number   grp
#  <chr>                    <int> <int>
#1 ENSMUSG00000000001        4732     1
#2 ENSMUSG00000000001        4733     1
#3 ENSMUSG00000000058        7603     2
#4 ENSMUSG00000000058        7604     2
#5 ENSMUSG00000000058        8246     3
#6 ENSMUSG00000000058        8248     3

For each gene_name, subtract the previous gene_number from the current one and start a new group whenever the difference is greater than 2. Groups that contain only a single row are then dropped.

data

# Example input: two genes with 2 and 5 rows respectively.
df <- data.frame(
  gene_name = rep(
    c("ENSMUSG00000000001", "ENSMUSG00000000058"),
    times = c(2L, 5L)
  ),
  gene_number = c(4732L, 4733L, 7603L, 7604L, 8246L, 8248L, 9001L),
  stringsAsFactors = FALSE
)

create number of consecutive sequences

Here's a quite hacky solution using dplyr and tidyr:

  # Number each run of consecutive 1s in `response` within every id; rows with
  # response == 0 get 0.
  df <- df %>%
    group_by(id) %>%
    mutate(
      prev_response = lag(response, default = 0),
      # 1 on the first row of a run of 1s, 0 elsewhere
      run_start = as.numeric(prev_response == 0 & response == 1),
      want_group = case_when(
        run_start == 1 ~ cumsum(run_start),
        response == 0 ~ 0,
        TRUE ~ NA_real_
      )
    ) %>%
    # carry each run's number down over the remaining rows of that run
    fill(want_group) %>%
    select(-prev_response, -run_start) %>%
    print(n = 26) %>%
    ungroup()

# A tibble: 26 x 5
# Groups:   id [2]
     row date       id    response want_group
   <int> <date>     <chr>    <dbl>      <dbl>
 1     1 2021-10-06 A            1          1
 2     2 2021-10-07 A            0          0
 3     3 2021-10-08 A            1          2
 4     4 2021-10-09 A            0          0
 5     5 2021-10-10 A            0          0
 6     6 2021-10-11 A            1          3
 7     7 2021-10-12 A            1          3
 8     8 2021-10-13 A            1          3
 9     9 2021-10-14 A            1          3
10    10 2021-10-15 A            0          0
11    11 2021-10-16 A            1          4
12    12 2021-10-17 A            0          0
13    13 2021-10-18 A            0          0
14    14 2021-10-06 B            0          0
15    15 2021-10-07 B            0          0
16    16 2021-10-08 B            0          0
17    17 2021-10-09 B            1          1
18    18 2021-10-10 B            1          1
19    19 2021-10-11 B            0          0
20    20 2021-10-12 B            0          0
21    21 2021-10-13 B            0          0
22    22 2021-10-14 B            0          0
23    23 2021-10-15 B            0          0
24    24 2021-10-16 B            1          2
25    25 2021-10-17 B            1          2
26    26 2021-10-18 B            1          2

And then, to get cs_res, you can do:

# Running total of `response` inside each (id, want_group) combination; the
# result stays grouped by id and want_group.
mutate(
  group_by(df, id, want_group),
  cs_res = cumsum(response)
)
# A tibble: 26 x 6
# Groups:   id, want_group [8]
     row date       id    response want_group cs_res
   <int> <date>     <chr>    <dbl>      <dbl>  <dbl>
 1     1 2021-10-06 A            1          1      1
 2     2 2021-10-07 A            0          0      0
 3     3 2021-10-08 A            1          2      1
 4     4 2021-10-09 A            0          0      0
 5     5 2021-10-10 A            0          0      0
 6     6 2021-10-11 A            1          3      1
 7     7 2021-10-12 A            1          3      2
 8     8 2021-10-13 A            1          3      3
 9     9 2021-10-14 A            1          3      4
10    10 2021-10-15 A            0          0      0

Consecutive group number in R

Assuming you mean a "group of SOG" is a set of consecutive non-zero SOG values, i.e. starts with a non-zero SOG value and ends with a non-zero SOG value (not necessarily the same value):

# Number each run of consecutive SOG > 0 values 1, 2, 3, ...; positions where
# SOG is not above zero get 0. local() keeps the helpers out of the workspace.
Trips <- local({
  moving <- SOG > 0
  # TRUE exactly where a run begins (a FALSE -> TRUE step, or TRUE at the start)
  trip_start <- diff(c(FALSE, moving)) == 1
  ifelse(moving, cumsum(trip_start), 0)
})
# [1] 1 1 0 0 0 2 2 2 0 0 3 3 0 0 0

How to Create a Consecutive Group Number