How to Count Sequences of Ones in a Logical Vector

How to count sequences of ones in a logical vector

One approach is to write the loop in C++ through Rcpp:

library(Rcpp)

# Compile a C++ helper with cppFunction() and expose it to R as
# seqOfLogical(lv).
#
# The C++ body walks the logical vector once, keeping a running counter that
# is incremented for every element equal to 1 and reset to 0 for any other
# element; the counter value at each position is stored in the result.
# The result is a NumericVector with the same length as `lv`.
cppFunction('NumericVector seqOfLogical(LogicalVector lv) {
size_t n = lv.size();
NumericVector res(n);
int foundCounter = 0;
for (size_t i = 0; i < n; i++) {
if (lv[i] == 1) {
foundCounter++;
} else {
foundCounter = 0;
}
res[i] = foundCounter;
}
return res;
}')

# NOTE(review): `x` is not defined before this call in this snippet; the
# 15-value result printed below presumably belongs to an example vector from
# the original question -- confirm.
seqOfLogical(x)

# [1] 0 0 1 2 3 0 1 2 0 0 0 1 2 3 4

Benchmarks

library(microbenchmark)

# Fix the RNG seed so every benchmark run draws the same input, then build a
# one-million-element logical vector sampled with replacement.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can
# be reassigned, which would silently change the sampled values.
set.seed(1)
x <- sample(c(TRUE, FALSE), size = 1e6, replace = TRUE)

# Time every candidate implementation on the shared input `x`,
# repeating each expression five times.
microbenchmark(
  symbolix     = symbolix(x),
  thelatemail1 = thelatemail1(x),
  thelatemail2 = thelatemail2(x),
  wen          = wen(x),
  maurits      = maurits(x),
  #mhammer = { mhammer(x) }, ## this errors
  times = 5
)

# Unit: milliseconds
# expr min lq mean median uq max neval
# symbolix 2.760152 4.579596 34.60909 4.833333 22.31126 138.5611 5
# thelatemail1 154.050925 189.784368 235.16431 235.982093 262.33704 333.6671 5
# thelatemail2 138.876834 146.197278 158.66718 148.547708 179.80223 179.9119 5
# wen 780.432786 898.505231 1091.39099 1093.702177 1279.33318 1404.9816 5
# maurits 1002.267323 1043.590621 1136.35624 1086.967756 1271.38803 1277.5675 5

functions

# Benchmark entry for the Rcpp implementation: forwards `x` unchanged to
# seqOfLogical() and returns its result.
symbolix <- function(x) seqOfLogical(x)

# Running count of consecutive TRUE values, via run-length encoding.
#
# x: logical vector (assumed to contain no NA).
# Returns `x` with every TRUE replaced by its 1-based position inside its
# run of TRUEs; FALSE positions end up as 0 through coercion.
thelatemail1 <- function(x) {
  runs <- rle(x)
  # Use the full component names rather than `$` partial matching
  # (`r$l`, `r$v`), which is fragile and hard to read.
  x[x] <- sequence(runs$lengths[runs$values])
  x
}

# Running count of consecutive TRUE values (compact rle variant).
#
# x: logical vector.
# Returns `x` with each TRUE replaced by its position within its run of
# TRUEs; FALSE positions become 0 through coercion.
thelatemail2 <- function(x) {
  # Lengths of only those runs whose value is TRUE.
  true_run_lengths <- with(rle(x), lengths[values])
  x[x] <- sequence(true_run_lengths)
  x
}

# Running count of consecutive TRUE values, expanding each run separately.
#
# x: logical vector.
# Returns the concatenation, over all runs of `x`, of 1..len for runs whose
# value is TRUE and `len` zeros for every other run.
maurits <- function(x) {
  # Encode once; the original evaluated rle(x) twice.
  runs <- rle(x)
  expand_run <- function(len, val) {
    if (isTRUE(val)) seq_len(len) else rep(0, len)
  }
  unlist(Map(expand_run, runs$lengths, runs$values))
}

# Running count of consecutive TRUE values using data.table::rleid().
#
# A: logical vector.
# Returns, for every element, its position within its run, with positions
# where `A` is FALSE overwritten by 0.
wen <- function(A) {
  run_id <- data.table::rleid(A)
  # Number the elements inside each run 1, 2, 3, ...
  pos_in_run <- ave(run_id, run_id, FUN = seq_along)
  pos_in_run[!A] <- 0
  pos_in_run
}

# Running count of consecutive TRUE (or 1) values with an explicit loop.
#
# x: logical (or 0/1 numeric) vector without NA.
# Returns a numeric vector of the same length in which each element equal
# to 1 holds the length of the run of ones ending at that position, and
# every other element keeps its own value (0 for FALSE).
mhammer <- function(x) {
  # Adding 0L turns logicals into integers up front, so the result type does
  # not depend on whether any accumulation happens inside the loop.
  x_counts <- x + 0L
  # Start at the second element: the first one has no predecessor, and
  # indexing x_counts[0] yields a zero-length value that made the original
  # assignment fail whenever x[1] was TRUE.
  for (i in seq_along(x)[-1]) {
    if (x[i] == 1) {
      x_counts[i] <- x_counts[i] + x_counts[i - 1]
    }
  }
  x_counts
}

Get a vector with the count of consecutive specific values in R

Edit: Did not see Roman's identical solution when posting.

We would like something like:

# Run-length encode the row-wise condition "Movement is FALSE and Booked is
# TRUE", evaluated on the columns of df.
tmp <- with(df, rle(!Movement & Booked))
# Lengths of only those runs where the condition holds.
tmp[["lengths"]][tmp[["values"]]]

The indexing by tmp$values ensures you only get the rows corresponding to the pattern you've specified.

Hope this helps!

Count NAs in vector except at the beginning

We can subset the vector from the first non-NA element onward, apply is.na to get a logical vector, and take its sum:

# Count the NAs in `temp` from its first non-NA element onward.
# match() returns NA when `temp` is empty or entirely NA; in that case there
# is no first value to start from, so the count is 0 instead of the error
# that `NA:length(temp)` would raise.
local({
  first_obs <- match(FALSE, is.na(temp))
  if (is.na(first_obs)) 0L else sum(is.na(temp[first_obs:length(temp)]))
})
#[1] 3

Another option is to subset with the cumulative sum of the non-NA indicator, which is 0 only for the leading NAs, and then count the NAs as above:

# cumsum(!is.na(temp)) is 0 across the leading NAs and positive from the
# first non-NA element on, so this counts only the NAs after that point.
sum(is.na(temp) & cumsum(!is.na(temp)) > 0)
#[1] 3

How to count character in a sequence under a dataframe based on a vector of known characters

Here is a solution using Biostrings::letterFrequency:

library(Biostrings);
# Attach the letters in complete_aa and their counts in `sequence` as list
# columns, expand them to one row per letter, then drop the type and seq_len
# columns.
dat %>%
  mutate(
    aa = list(complete_aa),
    aa_count = list(letterFrequency(BString(sequence), letters = complete_aa))
  ) %>%
  unnest() %>%
  select(-type, -seq_len)
## A tibble: 20 x 4
# fasta_header sequence aa aa_count
# <chr> <chr> <chr> <int>
# 1 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… A 1
# 2 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… C 2
# 3 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… D 1
# 4 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… E 1
# 5 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… F 2
# 6 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… G 3
# 7 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… H 0
# 8 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… I 0
# 9 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… K 1
#10 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… L 5
#11 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… M 0
#12 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… N 0
#13 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… P 6
#14 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… Q 3
#15 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… R 4
#16 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… S 4
#17 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… T 3
#18 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… V 3
#19 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… W 2
#20 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… Y 1

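Biostrings also offers methods to read and parse fasta files directly, see ?readAAStringSet (or ?readDNAStringSet for nucleotide data; older Biostrings releases named these read.AAStringSet / read.DNAStringSet).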


Update

For your second example, a solution is:

# Same idea for several sequences: compute the letter counts separately for
# each element of `sequence`, then expand to one row per sequence and letter.
dat2 %>%
  mutate(
    aa = list(complete_aa),
    aa_count = lapply(
      sequence,
      function(s) letterFrequency(BString(s), letters = complete_aa)
    )
  ) %>%
  unnest()

This produces data in a long format. If need be, reshape from long to wide with spread.

#   fasta_header   sequence aa aa_count
#1 >seq1 MPSRGTRPE A 0
#2 >seq1 MPSRGTRPE C 0
#3 >seq1 MPSRGTRPE D 0
#4 >seq1 MPSRGTRPE E 1
#5 >seq1 MPSRGTRPE F 0
#6 >seq1 MPSRGTRPE G 1
#7 >seq1 MPSRGTRPE H 0
#8 >seq1 MPSRGTRPE I 0
#9 >seq1 MPSRGTRPE K 0
#10 >seq1 MPSRGTRPE L 0
#11 >seq1 MPSRGTRPE M 1
#12 >seq1 MPSRGTRPE N 0
#13 >seq1 MPSRGTRPE P 2
#14 >seq1 MPSRGTRPE Q 0
#15 >seq1 MPSRGTRPE R 2
#16 >seq1 MPSRGTRPE S 1
#17 >seq1 MPSRGTRPE T 1
#18 >seq1 MPSRGTRPE V 0
#19 >seq1 MPSRGTRPE W 0
#20 >seq1 MPSRGTRPE Y 0
#21 >seq2 VSSKYTFWNF A 0
#22 >seq2 VSSKYTFWNF C 0
#23 >seq2 VSSKYTFWNF D 0
#24 >seq2 VSSKYTFWNF E 0
#25 >seq2 VSSKYTFWNF F 2
#26 >seq2 VSSKYTFWNF G 0
#27 >seq2 VSSKYTFWNF H 0
#28 >seq2 VSSKYTFWNF I 0
#29 >seq2 VSSKYTFWNF K 1
#30 >seq2 VSSKYTFWNF L 0
#31 >seq2 VSSKYTFWNF M 0
#32 >seq2 VSSKYTFWNF N 1
#33 >seq2 VSSKYTFWNF P 0
#34 >seq2 VSSKYTFWNF Q 0
#35 >seq2 VSSKYTFWNF R 0
#36 >seq2 VSSKYTFWNF S 2
#37 >seq2 VSSKYTFWNF T 1
#38 >seq2 VSSKYTFWNF V 1
#39 >seq2 VSSKYTFWNF W 1
#40 >seq2 VSSKYTFWNF Y 1

Count events that occurred in sequence

Try:

library(data.table)

# Convert df to a data.table by reference.
setDT(df)
# Within each city, and within each run of identical (city, event) pairs,
# store the cumulative sum of `event` in a new column.
df[, desirable_output := cumsum(event), by = .(city, rleid(city, event))]

How can I count runs in a sequence?

Use rle():

# Run-length encode the example vector.
y <- rle(c(1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0))
# Lengths of the runs whose value is 0.
y[["lengths"]][y[["values"]] == 0]

Efficient ways to check and count zero or one in a vector of logical variables

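Assuming v is a logical vector (the expressions below use MATLAB syntax, where ~ is logical negation and numel gives the number of elements; in R, write ! and length instead)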

(1) ~all(v) or any(~v) is true only if there is at least one zero

(2) any(v) or ~all(~v) is true only if there is at least one one

(3) sum(~v) counts zeros (numel(v)-sum(v) is faster according to @gnovice)

(4) sum(v) counts ones

Sequentially index between a boolean vector in R

#with(rle(TF), sequence(lengths) * rep(values, lengths))
# Number the elements inside every run, then multiply by TF so positions
# where TF is FALSE become 0.
sequence(rle(TF)$lengths) * TF #Like Rich suggested in comments
# [1] 0 1 2 3 0 0 1 2 0 1 0 1 2 3 4 5 6 7 0


Related Topics



Leave a reply



Submit