How to Count Sequences of Ones in a Logical Vector

How to count sequences of ones in a logical vector

with C++ through Rcpp

library(Rcpp)

# Compile a C++ helper with Rcpp and expose it to R as seqOfLogical().
#
# seqOfLogical(lv) takes a logical vector and returns a numeric vector of the
# same length. Element i holds the number of consecutive elements equal to 1
# (TRUE) ending at position i; any element that is not equal to 1 resets the
# running count to 0.
# NOTE(review): NA elements presumably also fail the `== 1` test and reset the
# count rather than propagating NA -- confirm this is the intended behavior.
cppFunction('NumericVector seqOfLogical(LogicalVector lv) {
  size_t n = lv.size();
  NumericVector res(n);
  int foundCounter = 0;
  for (size_t i = 0; i < n; i++) {
    if (lv[i] == 1) {
      foundCounter++;
    } else {
      foundCounter = 0;
    }
    res[i] = foundCounter;
  }
  return res;
}')

# NOTE(review): x is presumably the example logical vector from the original
# question; it is not defined at this point in the excerpt (the x built for
# the benchmark further down is a different, random vector).
seqOfLogical(x)

# [1] 0 0 1 2 3 0 1 2 0 0 0 1 2 3 4

Benchmarks

library(microbenchmark)

# Build a reproducible random logical vector of one million elements and time
# each implementation five times on it.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can
# be reassigned.
set.seed(1)
x <- sample(c(TRUE, FALSE), size = 1e6, replace = TRUE)

microbenchmark(
    symbolix = { symbolix(x) },
    thelatemail1 = { thelatemail1(x) },
    thelatemail2 = { thelatemail2(x) },
    wen = { wen(x) },
    maurits = { maurits(x) },
    #mhammer = { mhammer(x) },   ## this errors
    times = 5
)

# Unit: milliseconds
#         expr         min          lq       mean      median         uq       max neval
#     symbolix    2.760152    4.579596   34.60909    4.833333   22.31126  138.5611     5
# thelatemail1  154.050925  189.784368  235.16431  235.982093  262.33704  333.6671     5
# thelatemail2  138.876834  146.197278  158.66718  148.547708  179.80223  179.9119     5
#          wen  780.432786  898.505231 1091.39099 1093.702177 1279.33318 1404.9816     5
#      maurits 1002.267323 1043.590621 1136.35624 1086.967756 1271.38803 1277.5675     5

Functions used in the benchmark

# Benchmark wrapper: forwards x unchanged to seqOfLogical() and returns its
# result.
symbolix <- function(x) seqOfLogical(x)

# Number the positions inside each run of TRUE values of a logical vector.
#
# x: logical vector.
# Returns x with every TRUE replaced by its 1-based position within its run of
# consecutive TRUEs; FALSE positions are left as they are (and read as 0 once
# the assignment coerces the vector to integer).
thelatemail1 <- function(x) {
  r <- rle(x)
  # Use the full component names: `r$l` / `r$v` only worked through partial
  # matching of `$`, which is fragile and hard to read.
  x[x] <- sequence(r$lengths[r$values])
  x
}

# Replace each TRUE in x by its 1-based position within its run of
# consecutive TRUEs; other positions are left untouched.
thelatemail2 <- function(x) {
  runs <- rle(x)
  true_run_lengths <- runs[["lengths"]][runs[["values"]]]
  replace(x, x, sequence(true_run_lengths))
}

# Count positions within runs of TRUE, run by run.
#
# x: logical vector.
# Returns the concatenation, over the runs of x, of 1..length for runs whose
# value is TRUE and of `length` zeros for every other run.
maurits <- function(x) {
  # Run-length encode once; the original evaluated rle(x) twice.
  runs <- rle(x)
  per_run <- Map(
    function(l, v) if (isTRUE(v)) seq_len(l) else rep(0, l),
    runs$lengths,
    runs$values
  )
  unlist(per_run)
}

# Position of each element within its run of equal values, with positions
# where A is FALSE set to 0.
wen <- function(A) {
  # data.table::rleid() labels each run of consecutive equal values with an id.
  run_id <- data.table::rleid(A)
  # Within every run id, number the elements 1, 2, 3, ...
  pos_in_run <- ave(run_id, run_id, FUN = seq_along)
  pos_in_run[!A] <- 0
  pos_in_run
}

# Running count of consecutive ones, computed with an explicit loop.
#
# x: logical (or 0/1 numeric) vector.
# Returns a vector of the same length where each element equal to 1 is
# replaced by its 1-based position within its run of ones and every other
# element stays 0. Logical input yields integer counts.
mhammer <- function(x) {
  # Adding 0L converts logical input to integer counts up front while leaving
  # numeric input numeric.
  x_counts <- x + 0L
  # Start at the second element: the first one has no predecessor, and the
  # original's x_counts[i - 1] with i == 1 indexed position 0 (a zero-length
  # result), which made the assignment fail whenever x[1] was TRUE.
  for (i in seq_along(x)[-1L]) {
    if (x[i] == 1) {
      x_counts[i] <- x_counts[i - 1L] + 1L
    }
  }
  x_counts
}

Get a vector with the count of consecutive specific values in R

Edit: Did not see Roman's identical solution when posting.

We would like something like:

# Run-length encode the row-wise condition "not Movement and Booked" taken
# from df, then keep the lengths of only those runs where the condition is
# TRUE.
tmp <- rle(with(df, !Movement & Booked))
tmp$lengths[tmp$values]

Indexing by tmp$values keeps only the lengths of the runs where the condition is TRUE, i.e. the runs that match the pattern you specified.

Hope this helps!

Count NAs in vector except at the beginning

We can subset the vector from its first non-NA element onward, apply is.na to get a logical vector, and sum that vector to count the remaining NAs.

# Count the NAs in temp from its first non-NA element through the end.
# NOTE(review): if temp has no non-NA element, which(...)[1] is NA and the
# `:` range fails -- confirm inputs always contain at least one value.
sum(is.na(temp[which(!is.na(temp))[1]:length(temp)]))
#[1] 3

Another option is to subset using the cumulative sum of the !is.na logical vector, which stays at 0 until the first non-NA element, and then count the NAs as above.

# cumsum(!is.na(temp)) > 0 is FALSE until the first non-NA element, so the
# subset drops the leading NAs before the remaining ones are counted.
sum(is.na(temp[cumsum(!is.na(temp))>0]))
#[1] 3

How to count character in a sequence under a dataframe based on a vector of known characters

Here is a solution using Biostrings::letterFrequency:

library(Biostrings);
# For the sequence column of dat, count how often each letter listed in
# complete_aa occurs, store the letters and their counts as list columns,
# unnest them so each letter/count pair gets its own row, and drop the type
# and seq_len columns.
# NOTE(review): BString() receives the whole sequence column here, so this
# presumably relies on dat holding a single sequence -- confirm.
dat %>%
    mutate(
        aa = list(complete_aa),
        aa_count = list(letterFrequency(BString(sequence), letters = complete_aa))) %>%
    unnest() %>%
    select(-type, -seq_len);
## A tibble: 20 x 4
#   fasta_header             sequence                             aa    aa_count
#   <chr>                    <chr>                                <chr>    <int>
# 1 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… A            1
# 2 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… C            2
# 3 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… D            1
# 4 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… E            1
# 5 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… F            2
# 6 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… G            3
# 7 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… H            0
# 8 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… I            0
# 9 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… K            1
#10 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… L            5
#11 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… M            0
#12 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… N            0
#13 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… P            6
#14 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… Q            3
#15 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… R            4
#16 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… S            4
#17 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… T            3
#18 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… V            3
#19 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… W            2
#20 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… Y            1

Biostrings also offers functions to read and parse FASTA files directly; see ?readAAStringSet for protein sequences (older Biostrings releases documented these under ?read.DNAStringSet).

Update

For your second example, a solution is:

# Same idea for dat2, but lapply() computes the letter counts separately for
# each element of the sequence column before the list columns are unnested
# into long format.
dat2 %>%
    mutate(
        aa = list(complete_aa),
        aa_count = lapply(sequence, function(x) 
            letterFrequency(BString(x), letters = complete_aa))) %>%
    unnest()

This produces data in a long format. If need be, reshape from long to wide with spread.

#   fasta_header   sequence aa aa_count
#1         >seq1  MPSRGTRPE  A        0
#2         >seq1  MPSRGTRPE  C        0
#3         >seq1  MPSRGTRPE  D        0
#4         >seq1  MPSRGTRPE  E        1
#5         >seq1  MPSRGTRPE  F        0
#6         >seq1  MPSRGTRPE  G        1
#7         >seq1  MPSRGTRPE  H        0
#8         >seq1  MPSRGTRPE  I        0
#9         >seq1  MPSRGTRPE  K        0
#10        >seq1  MPSRGTRPE  L        0
#11        >seq1  MPSRGTRPE  M        1
#12        >seq1  MPSRGTRPE  N        0
#13        >seq1  MPSRGTRPE  P        2
#14        >seq1  MPSRGTRPE  Q        0
#15        >seq1  MPSRGTRPE  R        2
#16        >seq1  MPSRGTRPE  S        1
#17        >seq1  MPSRGTRPE  T        1
#18        >seq1  MPSRGTRPE  V        0
#19        >seq1  MPSRGTRPE  W        0
#20        >seq1  MPSRGTRPE  Y        0
#21        >seq2 VSSKYTFWNF  A        0
#22        >seq2 VSSKYTFWNF  C        0
#23        >seq2 VSSKYTFWNF  D        0
#24        >seq2 VSSKYTFWNF  E        0
#25        >seq2 VSSKYTFWNF  F        2
#26        >seq2 VSSKYTFWNF  G        0
#27        >seq2 VSSKYTFWNF  H        0
#28        >seq2 VSSKYTFWNF  I        0
#29        >seq2 VSSKYTFWNF  K        1
#30        >seq2 VSSKYTFWNF  L        0
#31        >seq2 VSSKYTFWNF  M        0
#32        >seq2 VSSKYTFWNF  N        1
#33        >seq2 VSSKYTFWNF  P        0
#34        >seq2 VSSKYTFWNF  Q        0
#35        >seq2 VSSKYTFWNF  R        0
#36        >seq2 VSSKYTFWNF  S        2
#37        >seq2 VSSKYTFWNF  T        1
#38        >seq2 VSSKYTFWNF  V        1
#39        >seq2 VSSKYTFWNF  W        1
#40        >seq2 VSSKYTFWNF  Y        1

Count events that occurred in sequence

Try:

library(data.table)

# Convert df to a data.table and add desirable_output: the running sum of
# event, computed within groups defined by city and by the run id of
# consecutive identical (city, event) pairs.
setDT(df)[, desirable_output := cumsum(event), by = .(city, rleid(city, event))]

How can I count runs in a sequence?

Use rle():

# Run-length encode the example vector, then report the lengths of the runs
# whose value is 0.
y <- rle(c(1, rep(0, 3), 1, rep(0, 5), 2, rep(0, 2)))
with(y, lengths[values == 0])

Efficient ways to check and count zero or one in a vector of logical variables

Assuming v is a logical vector

(1) ~all(v) or any(~v) is true only if there is at least one zero

(2) any(v) or ~all(~v) is true only if there is at least one one

(3) sum(~v) counts zeros (numel(v)-sum(v) is faster according to @gnovice)

(4) sum(v) counts ones

Sequentially index between a boolean vector in R

# Number the positions within each run of TF via sequence(lengths), then
# multiply by TF itself so positions where TF is FALSE become 0. The
# commented-out line is the earlier variant that expands the run values with
# rep() instead of reusing TF.
#with(rle(TF), sequence(lengths) * rep(values, lengths))
with(rle(TF), sequence(lengths) * TF) #Like Rich suggested in comments
# [1] 0 1 2 3 0 0 1 2 0 1 0 1 2 3 4 5 6 7 0

How to Count Sequences of Ones in a Logical Vector