How to count sequences of ones in a logical vector
with C++ through Rcpp
library(Rcpp)
# Compile, via cppFunction(), a C++ function seqOfLogical(lv) that walks the
# logical vector once and returns a numeric vector of the same length.
# Each element holds the running count of consecutive values equal to 1 that
# end at that position; the counter is reset to 0 at any other value.
cppFunction('NumericVector seqOfLogical(LogicalVector lv) {
size_t n = lv.size();
NumericVector res(n);
int foundCounter = 0;
for (size_t i = 0; i < n; i++) {
if (lv[i] == 1) {
foundCounter++;
} else {
foundCounter = 0;
}
res[i] = foundCounter;
}
return res;
}')
# Example call. x is not defined at this point in the listing; presumably it
# is a logical vector created beforehand (TODO confirm).
seqOfLogical(x)
# [1] 0 0 1 2 3 0 1 2 0 0 0 1 2 3 4
Benchmarks
library(microbenchmark)
# Benchmark input: one million random logical values, seeded so the sample
# is reproducible across runs.
set.seed(1)
# TRUE/FALSE are spelled out because T and F are ordinary variables that can
# be reassigned, which would silently change the sampled values.
x <- sample(c(TRUE, FALSE), size = 1e6, replace = TRUE)
# Time every candidate implementation on the same input x, five runs each.
microbenchmark(
  symbolix = symbolix(x),
  thelatemail1 = thelatemail1(x),
  thelatemail2 = thelatemail2(x),
  wen = wen(x),
  maurits = maurits(x),
  #mhammer = { mhammer(x) }, ## this errors
  times = 5
)
# Unit: milliseconds
# expr min lq mean median uq max neval
# symbolix 2.760152 4.579596 34.60909 4.833333 22.31126 138.5611 5
# thelatemail1 154.050925 189.784368 235.16431 235.982093 262.33704 333.6671 5
# thelatemail2 138.876834 146.197278 158.66718 148.547708 179.80223 179.9119 5
# wen 780.432786 898.505231 1091.39099 1093.702177 1279.33318 1404.9816 5
# maurits 1002.267323 1043.590621 1136.35624 1086.967756 1271.38803 1277.5675 5
Functions
# Benchmark entry for the Rcpp implementation: forwards x unchanged to
# seqOfLogical() and returns its result.
symbolix <- function(x) seqOfLogical(x)
# Running count of consecutive TRUE values in a logical vector.
#
# x: logical vector (NA values are not supported; logical NA subscripts are
#    not allowed in the assignment below).
# Returns an integer vector the same length as x: 0 where x is FALSE, and
# 1, 2, 3, ... along every run of TRUE values.
thelatemail1 <- function(x) {
  runs <- rle(x)
  # Start from an integer copy so the result is integer even when x contains
  # no TRUE at all (assigning into the logical vector itself would leave it
  # logical in that case).
  counts <- x
  storage.mode(counts) <- "integer"
  # Full component names: `$l` / `$v` only work through partial matching.
  counts[x] <- sequence(runs$lengths[runs$values])
  counts
}
# Running count of consecutive TRUE values in a logical vector (variant that
# reads the run-length encoding through with()).
#
# x: logical vector (NA values are not supported; logical NA subscripts are
#    not allowed in the assignment below).
# Returns an integer vector the same length as x: 0 where x is FALSE, and
# 1, 2, 3, ... along every run of TRUE values.
thelatemail2 <- function(x) {
  # Integer copy, so an input without any TRUE still yields integers rather
  # than the untouched logical vector.
  counts <- x
  storage.mode(counts) <- "integer"
  counts[x] <- sequence(with(rle(x), lengths[values]))
  counts
}
maurits <- function(x) {
unlist(Map(function(l, v) if (!isTRUE(v)) rep(0, l) else 1:l, rle(x)$lengths, rle(x)$values))
}
# Running count of consecutive TRUE values using run ids.
#
# A: logical vector.
# Returns, for each element, its position within its run, with the elements
# where A is FALSE set to 0.
wen <- function(A) {
  run_id <- data.table::rleid(A)
  # Number the elements 1, 2, ... inside every run.
  position_in_run <- ave(run_id, run_id, FUN = seq_along)
  position_in_run[!A] <- 0
  position_in_run
}
# Running count of consecutive TRUE values, computed with an explicit loop.
#
# x: logical (or 0/1) vector.
# Returns an integer vector the same length as x: 0 where x is not 1, and
# 1, 2, 3, ... along every run of ones.
mhammer <- function(x) {
  x_counts <- as.integer(x)
  # Start at the second element: the first has no predecessor, and indexing
  # x_counts[0] yields a zero-length value that made the assignment fail
  # whenever x[1] was TRUE.
  for (i in seq_along(x)[-1L]) {
    if (x[i] == 1) {
      x_counts[i] <- x_counts[i - 1L] + 1L
    }
  }
  x_counts
}
Get a vector with the count of consecutive specific values in R
Edit: Did not see Roman's identical solution when posting.
We would like something like:
# Run-length encode the row-wise condition "not Movement and Booked" over df
# (assumes Movement and Booked are logical columns of df -- TODO confirm).
tmp <- rle(with(df, !Movement & Booked))
# Keep only the lengths of the runs whose value is TRUE.
tmp$lengths[tmp$values]
The indexing by tmp$values
ensures you only get the rows corresponding to the pattern you've specified.
Hope this helps!
Count NAs in vector except at the beginning
We can subset the vector from the first non-NA element onward, apply is.na
to get a logical vector, and then take the sum of that vector
# Count the NAs that occur after the first non-NA element of temp.
# which(...)[1] is NA when temp has no non-NA element, and NA:length(temp)
# would then fail, so that case is answered with 0 directly.
first_value <- which(!is.na(temp))[1]
if (is.na(first_value)) 0L else sum(is.na(temp[first_value:length(temp)]))
#[1] 3
Another option is to subset using the cumsum of the logical vector
!is.na(temp), which is 0 only before the first non-NA element, and then proceed as above
# Count the positions that are NA and lie after at least one non-NA value.
sum(is.na(temp) & cumsum(!is.na(temp)) > 0)
#[1] 3
How to count character in a sequence under a dataframe based on a vector of known characters
Here is a solution using Biostrings::letterFrequency
:
library(Biostrings);
# Attach the amino-acid alphabet and the per-letter counts of the sequence as
# list columns, unnest them into one row per letter, and drop the type and
# seq_len columns.
dat %>%
  mutate(
    aa = list(complete_aa),
    aa_count = list(
      letterFrequency(BString(sequence), letters = complete_aa)
    )
  ) %>%
  unnest() %>%
  select(-type, -seq_len)
## A tibble: 20 x 4
# fasta_header sequence aa aa_count
# <chr> <chr> <chr> <int>
# 1 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… A 1
# 2 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… C 2
# 3 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… D 1
# 4 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… E 1
# 5 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… F 2
# 6 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… G 3
# 7 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… H 0
# 8 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… I 0
# 9 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… K 1
#10 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… L 5
#11 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… M 0
#12 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… N 0
#13 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… P 6
#14 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… Q 3
#15 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… R 4
#16 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… S 4
#17 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… T 3
#18 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… V 3
#19 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… W 2
#20 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… Y 1
Biostrings also offers functions to read and parse FASTA files directly; see ?readDNAStringSet (called read.DNAStringSet in older Biostrings releases).
Update
For your second example, a solution is:
# Same idea for several sequences: compute the letter counts separately for
# every element of the sequence column, then unnest to long format.
dat2 %>%
  mutate(
    aa = list(complete_aa),
    aa_count = lapply(
      sequence,
      function(aa_seq) letterFrequency(BString(aa_seq), letters = complete_aa)
    )
  ) %>%
  unnest()
This produces data in a long format. If need be, reshape from long to wide with spread
.
# fasta_header sequence aa aa_count
#1 >seq1 MPSRGTRPE A 0
#2 >seq1 MPSRGTRPE C 0
#3 >seq1 MPSRGTRPE D 0
#4 >seq1 MPSRGTRPE E 1
#5 >seq1 MPSRGTRPE F 0
#6 >seq1 MPSRGTRPE G 1
#7 >seq1 MPSRGTRPE H 0
#8 >seq1 MPSRGTRPE I 0
#9 >seq1 MPSRGTRPE K 0
#10 >seq1 MPSRGTRPE L 0
#11 >seq1 MPSRGTRPE M 1
#12 >seq1 MPSRGTRPE N 0
#13 >seq1 MPSRGTRPE P 2
#14 >seq1 MPSRGTRPE Q 0
#15 >seq1 MPSRGTRPE R 2
#16 >seq1 MPSRGTRPE S 1
#17 >seq1 MPSRGTRPE T 1
#18 >seq1 MPSRGTRPE V 0
#19 >seq1 MPSRGTRPE W 0
#20 >seq1 MPSRGTRPE Y 0
#21 >seq2 VSSKYTFWNF A 0
#22 >seq2 VSSKYTFWNF C 0
#23 >seq2 VSSKYTFWNF D 0
#24 >seq2 VSSKYTFWNF E 0
#25 >seq2 VSSKYTFWNF F 2
#26 >seq2 VSSKYTFWNF G 0
#27 >seq2 VSSKYTFWNF H 0
#28 >seq2 VSSKYTFWNF I 0
#29 >seq2 VSSKYTFWNF K 1
#30 >seq2 VSSKYTFWNF L 0
#31 >seq2 VSSKYTFWNF M 0
#32 >seq2 VSSKYTFWNF N 1
#33 >seq2 VSSKYTFWNF P 0
#34 >seq2 VSSKYTFWNF Q 0
#35 >seq2 VSSKYTFWNF R 0
#36 >seq2 VSSKYTFWNF S 2
#37 >seq2 VSSKYTFWNF T 1
#38 >seq2 VSSKYTFWNF V 1
#39 >seq2 VSSKYTFWNF W 1
#40 >seq2 VSSKYTFWNF Y 1
Count events that occurred in sequence
Try:
library(data.table)
# Convert df to a data.table by reference, then add the cumulative sum of
# event within each consecutive run of identical (city, event) pairs.
setDT(df)
df[, desirable_output := cumsum(event), by = .(city, rleid(city, event))]
How can I count runs in a sequence?
Use rle()
:
# Run-length encode the sequence, then report the lengths of the runs of 0.
y <- rle(c(1, rep(0, 3), 1, rep(0, 5), 2, rep(0, 2)))
with(y, lengths[values == 0])
Efficient ways to check and count zero or one in a vector of logical variables
Assuming v
is a logical vector
(1) ~all(v)
or any(~v)
is true only if there is at least one zero
(2) any(v)
or ~all(~v)
is true only if there is at least one one
(3) sum(~v)
counts zeros (numel(v)-sum(v)
is faster according to @gnovice)
(4) sum(v)
counts ones
Sequentially index between a boolean vector in R
# Earlier variant, kept for reference:
#with(rle(TF), sequence(lengths) * rep(values, lengths))
# Position within each run, multiplied by TF so FALSE positions become 0.
sequence(rle(TF)$lengths) * TF #Like Rich suggested in comments
# [1] 0 1 2 3 0 0 1 2 0 1 0 1 2 3 4 5 6 7 0
Related Topics
Legend Venn Diagram in Venneuler
How to Use "Cast" in Reshape Without Aggregation
Adding Shade to R Lineplot Denotes Standard Error
How to Get Covariance Matrix for Random Effects (Blups/Conditional Modes) from Lme4
Build Word Co-Occurence Edge List in R
Unzip Password Protected Zip Files in R
Chloropleth Map with Geojson and Ggplot2
How to Let R Use All the Cores of the Computer
Scientific Notation Issue in R
Shiny App File Upload: How to Save the Files Uploaded on a Shiny Gui to a Particular Destination
Combining Vector and Bitmap Graphics in a PDF
Understanding Lm and Environment
Change Background Colour of Knitr::Kable Headers
Predict.Svm Does Not Predict New Data
Numbered Code Chunks in Rmarkdown