R - test if a string vector contains any element of another list
You can get this using grepl.
# Search terms: an entry of lst_B passes if it contains at least one of them.
lst_A <- c("TET", "RNR")
lst_B <- c("RNR_B", "BC_TET")
# Join the terms with "|" so the regex matches any one of them.
Pattern <- paste(lst_A, collapse = "|")
grepl(Pattern, lst_B)
library(data.table)
# Pair every candidate string with its match flag for a tabular view.
DT_result <- data.table(lst_B = lst_B, result = grepl(Pattern, lst_B))
DT_result
lst_B result
1: RNR_B TRUE
2: BC_TET TRUE
Addition
To respond to a comment, here is an example with more strings to test. Some pass the test; others do not.
# A longer candidate list: some entries contain a search term, others do not.
lst_A <- c("TET", "RNR")
lst_B <- c("RNR_B", "BC_TET", "Fred", "RNR_A", "Zero", "ABC_TET")
# Alternation pattern: matches when any one of the search terms occurs.
Pattern <- paste(lst_A, collapse = "|")
DT_result <- data.table(lst_B = lst_B, result = grepl(Pattern, lst_B))
DT_result
lst_B result
1: RNR_B TRUE
2: BC_TET TRUE
3: Fred FALSE
4: RNR_A TRUE
5: Zero FALSE
6: ABC_TET TRUE
Test if a vector contains a given element
Both match() (returns the position of the first match) and %in% (returns a logical value) are designed for this.
v <- c("a", "b", "c", "e")
# Membership test: is "b" one of the elements of v?
"b" %in% v
## returns TRUE
# Position lookup: where does "b" first occur in v?
match("b", v)
## returns the first location of 'b', in this case: 2
Check if string contains anything other than items in vector [R]
One option is to split 'string_column' into one row per element with separate_rows, group by 'id', and check whether any element of 'string_column' is not %in% the concatenated vectors
library(dplyr)
library(tidyr)
df %>%
  separate_rows(string_column) %>%
  group_by(id) %>%
  # An id is unmatched as soon as one of its elements is missing from the
  # combined lookup vectors, i.e. when not every element is found.
  summarise(unmatched = !all(string_column %in% c(matchvector1, matchvector2)))
# A tibble: 3 x 2
# id unmatched
#* <dbl> <lgl>
#1 1 FALSE
#2 2 FALSE
#3 3 TRUE
or in base R
# Split each string on commas (plus any following whitespace), drop the
# elements found in either lookup vector, and flag rows that have leftovers.
# lapply() is used instead of sapply(): sapply() simplifies equal-length
# results into a vector or matrix, and lengths() would then count individual
# cells rather than one leftover set per row.
lengths(lapply(strsplit(df$string_column, ",\\s*"),
               setdiff, c(matchvector1, matchvector2))) > 0
#[1] FALSE FALSE TRUE
Finding if string vector contains any string from other vector
It seems that you are interested in the domain. In this case I'd suggest removing everything but the domain and top-level domain and simply using %in%, i.e.
# Strip everything up to and including the last "@" (".*" is greedy), then
# test what remains for membership in the publisher list.
sub(".*@", "", Mail) %in% InterestingPublishers
How can I determine whether a vector contains another vector respecting order in R?
You can collapse your vector into a regex pattern and use grepl
vec1 <- c("a", "b", "c")
vec2 <- c("a", "b", "c", "d", "e")
# The pattern is "a.*b.*c": the elements of vec1 in order, with anything
# (or nothing) allowed between them.
grepl(paste(vec1, collapse = ".*"), paste(vec2, collapse = ""))
# TRUE
# Reversed order: the elements are all present but never in sequence.
vec3 <- c("e", "d", "c", "b", "a")
grepl(paste(vec1, collapse = ".*"), paste(vec3, collapse = ""))
# FALSE
# Extra elements in between do not break the match.
vec4 <- c("a", "x", "b", "c", "y")
grepl(paste(vec1, collapse = ".*"), paste(vec4, collapse = ""))
# TRUE
EDIT: Based on G5W's comment, you can add a delimiter in case the elements are not single characters but may be longer strings. The delimiter separates the entries of your vector so that adjacent entries cannot run together.
vec5 <- c("a", "b", "c")
vec6 <- c("ab", "c")
vec7 <- c("ab", "e", "c", "d")
# The haystack becomes "ab-e-c-d"; "a-" never occurs in it, so vec5 fails.
grepl(paste(vec5, collapse = "-.*"), paste(vec7, collapse = "-"))
# FALSE
# "ab-" followed later by "c" does occur, so vec6 matches.
grepl(paste(vec6, collapse = "-.*"), paste(vec7, collapse = "-"))
# TRUE
How do I extract a word, that is contained in a group/list of words, from a string?
Paste the words into a single pattern, separated by "|", and use that pattern in str_extract
library(stringr)
# Collapse the candidate words into one alternation pattern; str_extract()
# then returns the first match per element of x, or NA when nothing matches.
str_extract(x, str_c(p, collapse = "|"))
[1] "Route" "Dice" NA
Way to check if each position in a string as part of a list of vectors is variable?
Here is one approach based on string splitting:
# Three example groups of eight 6-digit codes each.
l <- list(
  # Every code identical.
  rep(strrep("0", 6L), times = 8L),
  # Two complementary codes, alternating.
  rep(strrep(c("10", "01"), 3L), times = 4L),
  # First four codes "101010", last four "100100".
  rep(strrep(c("10", "100"), c(3L, 2L)), times = c(4L, 4L))
)
l
## [[1]]
## [1] "000000" "000000" "000000" "000000"
## [5] "000000" "000000" "000000" "000000"
##
## [[2]]
## [1] "101010" "010101" "101010" "010101"
## [5] "101010" "010101" "101010" "010101"
##
## [[3]]
## [1] "101010" "101010" "101010" "101010"
## [5] "100100" "100100" "100100" "100100"
# For each element of the list `l` (a character vector of codes), report at
# which character positions the codes are not all equal.
# Returns a length(l)-by-width logical matrix, where width is the number of
# characters in the first string; entry [i, j] is TRUE when some code in
# l[[i]] differs from the first code of l[[i]] at position j.
# NOTE(review): assumes all strings have the same number of characters and all
# vectors the same length -- neither is checked here.
f <- function(l) {
  width <- nchar(l[[1L]][1L])
  # Turn one character vector into a matrix of single characters with `width`
  # rows, so that each column holds the characters of one code.
  split_chars <- function(x) {
    chars <- unlist(strsplit(x, ""), FALSE, FALSE)
    matrix(chars, width)
  }
  per_element <- lapply(l, split_chars)
  # Stack the per-element matrices: rows are (element, position) pairs.
  stacked <- do.call(rbind, per_element)
  differs <- matrixStats::rowAnys(stacked != stacked[, 1L])
  # Rows were stacked element by element, so fill the result row-wise.
  matrix(differs, length(l), byrow = TRUE)
}
f(l)
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] FALSE FALSE FALSE FALSE FALSE FALSE
## [2,] TRUE TRUE TRUE TRUE TRUE TRUE
## [3,] FALSE FALSE TRUE TRUE TRUE FALSE
If your codes can be read as decimal numbers less than or equal to .Machine$integer.max, then you can optimize by replacing string splitting with integer arithmetic:
# Variant of the per-position variability check that peels decimal digits off
# integer codes instead of splitting strings.
# `l` is a list of character vectors of digit strings. The code takes the
# vector length from l[[1L]] and the digit count from l[[1L]][1L]
# (NOTE(review): the other elements are assumed to match -- not validated here).
# Returns a length(l)-by-N logical matrix; entry [i, j] is TRUE when some code
# in l[[i]] differs from the first code of l[[i]] at digit j (left to right).
g <- function(l) {
m <- length(l)
n <- length(l[[1L]])
N <- nchar(l[[1L]][1L])
# One row per list element, one column per code, parsed as integers.
X <- matrix(as.integer(unlist(l, FALSE, FALSE)), m, n, byrow = TRUE)
# Extracts the digit at place value `pow` and removes it from X, so calls must
# run from the highest power of ten down to 1.
g0 <- function(pow) {
Y <- X %/% pow
# `<<-` updates X in g's frame: the next call sees only the lower digits.
X <<- X - pow * Y
matrixStats::rowAnys(Y != Y[, 1L])
}
# Place values 10^(N-1), ..., 10, 1 as integers.
pow <- as.integer(10^((N - 1L):0))
# lapply() visits pow in order; each result fills one column of the output.
matrix(unlist(lapply(pow, g0), FALSE, FALSE), m, N)
}
g(l)
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] FALSE FALSE FALSE FALSE FALSE FALSE
## [2,] TRUE TRUE TRUE TRUE TRUE TRUE
## [3,] FALSE FALSE TRUE TRUE TRUE FALSE
If your codes are actually binary, then you can optimize slightly more and dispense with matrixStats:
# Per-position variability check for codes made of 0/1 digits, base R only.
# `l` is a list of character vectors of digit strings. The vector length is
# taken from l[[1L]] and the digit count from l[[1L]][1L]
# (NOTE(review): the other elements are assumed to match -- not validated here).
# Returns a length(l)-by-digit-count logical matrix; entry [i, j] is TRUE when
# the codes in l[[i]] do not all share the same digit at position j.
h <- function(l) {
  n_codes <- length(l[[1L]])
  n_groups <- length(l)
  n_digits <- nchar(l[[1L]][1L])
  # One column per list element, one row per code, parsed as integers.
  remaining <- matrix(as.integer(unlist(l, FALSE, FALSE)), n_codes, n_groups)
  # Place values from the leading digit down to the units digit.
  place_values <- as.integer(10^((n_digits - 1L):0))
  varies <- vector("list", length(place_values))
  for (k in seq_along(place_values)) {
    p <- place_values[k]
    digit <- remaining %/% p
    # Drop the digit just extracted so the next pass sees only lower digits.
    remaining <- remaining - p * digit
    # With 0/1 digits, a column sum of 0 or n_codes means all digits agree.
    varies[[k]] <- .colSums(digit, n_codes, n_groups) %% n_codes > 0L
  }
  # Each list entry becomes one column (one digit position) of the result.
  matrix(unlist(varies, FALSE, FALSE), n_groups, n_digits)
}
h(l)
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] FALSE FALSE FALSE FALSE FALSE FALSE
## [2,] TRUE TRUE TRUE TRUE TRUE TRUE
## [3,] FALSE FALSE TRUE TRUE TRUE FALSE
Here is a benchmark on a length-10000 list of length-8 character vectors of 6-digit binary codes.
# Recycle the example vectors into a 10000-element list, then time the three
# implementations on it.
ll <- rep_len(l, 10000L)
microbenchmark::microbenchmark(f(ll), g(ll), h(ll))
## Unit: milliseconds
## expr min lq mean median uq max neval
## f(ll) 41.583143 55.960510 66.201555 64.211679 73.542807 127.47810 100
## g(ll) 8.612173 8.856123 9.725214 8.946077 9.116391 46.66698 100
## h(ll) 7.622679 7.824789 8.717184 7.887519 7.987128 46.32225 100
Related Topics
Rstudio Suddenly Stopped Showing Plots in the Plot Pane
Convert Categorical Variables to Numeric in R
Join 3 Columns of Different Lengths in R
Collapse/Concatenate/Aggregate a Column to a Single Comma Separated String Within Each Group
Showing Data Values on Stacked Bar Chart in Ggplot2
Does Ifelse Really Calculate Both of Its Vectors Every Time? Is It Slow
Shading a Kernel Density Plot Between Two Points.
Split a Large Dataframe into a List of Data Frames Based on Common Value in Column
Stratified Random Sampling from Data Frame
How to Spread Repeated Measures of Multiple Variables into Wide Format
Adding Value from One Data.Frame to Another Data.Frame by Matching a Variable
Find Duplicated Elements With Dplyr
Replacing Na Values from Another Dataframe by Id
How to Convert a Factor to Integer\Numeric Without Loss of Information
Overlap Join With Start and End Positions
How to Sort a Character Vector Where Elements Contain Letters and Numbers
Increasing (Or Decreasing) the Memory Available to R Processes
Data.Table Objects Assigned With := from Within Function Not Printed