R - Test If a String Vector Contains Any Element of Another List

R - test if a string vector contains any element of another list

You can get this using grepl.

# Substrings to look for, and the strings to search in.
lst_A <- c("TET", "RNR")
lst_B <- c("RNR_B", "BC_TET")

# Join the substrings into a single alternation regex ("TET|RNR") so that one
# grepl() call tests every element of lst_B against all of them.
Pattern <- paste(lst_A, collapse = "|")
grepl(Pattern, lst_B)

# Same result, shown next to the strings that were tested.
library(data.table)
DT_result <- data.table(lst_B, result = grepl(Pattern, lst_B))
DT_result
lst_B result
1: RNR_B TRUE
2: BC_TET TRUE

Addition

To respond to a comment, here is an example with more strings to test. Some pass the test; others do not.

# Substrings to look for, and a longer set of strings to search in.
lst_A <- c("TET", "RNR")
lst_B <- c("RNR_B", "BC_TET", "Fred", "RNR_A", "Zero", "ABC_TET")

# One alternation regex covering every substring in lst_A.
Pattern <- paste(lst_A, collapse = "|")

# One row per tested string, with a flag saying whether any substring occurs.
DT_result <- data.table(lst_B, result = grepl(Pattern, lst_B))
DT_result
lst_B result
1: RNR_B TRUE
2: BC_TET TRUE
3: Fred FALSE
4: RNR_A TRUE
5: Zero FALSE
6: ABC_TET TRUE

Test if a vector contains a given element

Both match() (which returns the position of the first match) and %in% (which returns a logical value) are designed for this.

v <- c("a", "b", "c", "e")

# Membership test: %in% yields one logical per left-hand element.
"b" %in% v
## returns TRUE

# Position lookup: match() gives the index of the first occurrence.
match("b", v)
## returns the first location of 'b', in this case: 2

Check if string contains anything other than items in vector [R]

One option is to split 'string_column' into one row per element with separate_rows, group by 'id', and then check whether any of the resulting elements is not %in% the concatenation of the match vectors.

library(dplyr)
library(tidyr)
df %>%
  separate_rows(string_column) %>%
  group_by(id) %>%
  summarise(
    # TRUE when at least one piece of the split string lies outside the
    # combined set of allowed values.
    unmatched = !all(string_column %in% c(matchvector1, matchvector2))
  )
# A tibble: 3 x 2
# id unmatched
#* <dbl> <lgl>
#1 1 FALSE
#2 2 FALSE
#3 3 TRUE

or in base R

# For each row, split the string on commas and keep the pieces that are not in
# either match vector; a row is flagged when at least one piece is left over.
# lapply() is used instead of sapply() because sapply() simplifies its result
# to a matrix whenever every row has the same number (>= 2) of leftover pieces,
# and lengths() of that matrix is one value per cell rather than one per row.
lengths(lapply(
  strsplit(df$string_column, ",\\s*"),
  setdiff,
  c(matchvector1, matchvector2)
)) > 0
#[1] FALSE FALSE TRUE

Finding if string vector contains any string from other vector

It seems that you are interested in the domain. In this case I'd suggest removing everything except the domain and top-level domain, and then simply using %in%, i.e.

# Strip everything up to and including the last "@", then look the remaining
# domain up in the publisher vector. This is %in% written out as the match()
# call it is defined by.
match(sub(".*@", "", Mail), InterestingPublishers, nomatch = 0L) > 0L

How can I determine whether a vector contains another vector respecting order in R?

You can collapse your vector into a regex pattern and use grepl

vec1 <- c("a", "b", "c")
# "a.*b.*c": the elements of vec1 in order, with anything allowed in between.
ordered_pattern <- paste(vec1, collapse = ".*")

vec2 <- c("a", "b", "c", "d", "e")
grepl(ordered_pattern, paste(vec2, collapse = ""))
# TRUE

vec3 <- c("e", "d", "c", "b", "a")
grepl(ordered_pattern, paste(vec3, collapse = ""))
# FALSE

vec4 <- c("a", "x", "b", "c", "y")
grepl(ordered_pattern, paste(vec4, collapse = ""))
# TRUE

EDIT: Based on G5W's comment, you can add a delimiter in case each element is not a character but might be a short string. The delimiter will break up the entries of your vector

vec5 <- c("a", "b", "c")
vec6 <- c("ab", "c")
vec7 <- c("ab", "e", "c", "d")

# Test whether the elements of `needle` occur in `haystack` in the same order
# (gaps allowed), treating each element as a whole entry.
#
# Both vectors are wrapped in "-" on BOTH sides, so every entry is bounded by a
# delimiter on its left and its right. With a delimiter only after each entry,
# c("b", "c") would match inside "ab-e-c-d" and c("ab", "c") would match
# c("ab", "cd"). "(.*-)?" is the optional run of skipped entries between two
# consecutive needle elements.
#
# Entries are assumed not to contain "-" or regex metacharacters.
contains_in_order <- function(needle, haystack) {
  pattern <- paste0("-", paste(needle, collapse = "-(.*-)?"), "-")
  grepl(pattern, paste0("-", paste(haystack, collapse = "-"), "-"))
}

contains_in_order(vec5, vec7)
# FALSE
contains_in_order(vec6, vec7)
# TRUE

How do I extract a word, that is contained in a group/list of words, from a string?

Paste the words together into a single string separated by "|" and use that as the pattern in str_extract

library(stringr)
# Build one alternation pattern from the candidate words in `p` and pull the
# first match (or NA) out of each element of `x`.
str_extract(string = x, pattern = str_c(p, collapse = "|"))
[1] "Route" "Dice" NA

Way to check if each position in a string as part of a list of vectors is variable?

Here is one approach based on string splitting:

# Example input: three groups of eight 6-character codes.
l <- list(
  # all codes identical
  rep(strrep("0", times = 6L), times = 8L),
  # two codes alternating, different at every position
  rep(strrep(c("10", "01"), times = 3L), times = 4L),
  # four copies of one code followed by four copies of another
  rep(strrep(c("10", "100"), times = c(3L, 2L)), each = 4L)
)
l
## [[1]]
## [1] "000000" "000000" "000000" "000000"
## [5] "000000" "000000" "000000" "000000"
##
## [[2]]
## [1] "101010" "010101" "101010" "010101"
## [5] "101010" "010101" "101010" "010101"
##
## [[3]]
## [1] "101010" "101010" "101010" "101010"
## [5] "100100" "100100" "100100" "100100"

# For each character vector in `l`, report per character position whether the
# strings in that vector differ from the vector's first string.
#
# l: list of character vectors; the position count is taken from the first
#    string of the first vector.
# Returns a logical matrix with one row per element of `l` and one column per
# character position.
f <- function(l) {
  n_groups <- length(l)
  n_pos <- nchar(l[[1L]][1L])

  # One column per string, one row per character position.
  split_chars <- function(codes) {
    chars <- unlist(strsplit(codes, ""), recursive = FALSE, use.names = FALSE)
    matrix(chars, nrow = n_pos)
  }

  # Stack the per-group matrices: rows are (group, position) pairs.
  stacked <- do.call(rbind, lapply(l, split_chars))

  # Compare every string with the first one of its group, row by row.
  differs <- stacked != stacked[, 1L]
  matrix(matrixStats::rowAnys(differs), nrow = n_groups, byrow = TRUE)
}
f(l)
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] FALSE FALSE FALSE FALSE FALSE FALSE
## [2,] TRUE TRUE TRUE TRUE TRUE TRUE
## [3,] FALSE FALSE TRUE TRUE TRUE FALSE

If your codes can be read as decimal numbers less than or equal to .Machine$integer.max, then you can optimize by replacing string splitting with integer arithmetic:

# Integer-arithmetic variant: the codes are read as decimal integers and the
# digits are peeled off from the most significant one downwards.
#
# l: list of equal-length character vectors of decimal digit strings; the
#    digit count is taken from the first string of the first vector.
# Returns a logical matrix with one row per element of `l` and one column per
# digit position, TRUE where the digit differs from that of the group's first
# code.
g <- function(l) {
  n_groups <- length(l)
  n_codes <- length(l[[1L]])
  n_digits <- nchar(l[[1L]][1L])

  # One row per group, one column per code.
  remainder <- matrix(
    as.integer(unlist(l, recursive = FALSE, use.names = FALSE)),
    nrow = n_groups, ncol = n_codes, byrow = TRUE
  )
  powers <- as.integer(10^((n_digits - 1L):0))

  varies <- vector("list", length(powers))
  for (k in seq_along(powers)) {
    digit <- remainder %/% powers[k]
    # Drop the digit just extracted so the next power sees the lower digits.
    remainder <- remainder - powers[k] * digit
    varies[[k]] <- matrixStats::rowAnys(digit != digit[, 1L])
  }
  matrix(
    unlist(varies, recursive = FALSE, use.names = FALSE),
    nrow = n_groups, ncol = n_digits
  )
}
g(l)
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] FALSE FALSE FALSE FALSE FALSE FALSE
## [2,] TRUE TRUE TRUE TRUE TRUE TRUE
## [3,] FALSE FALSE TRUE TRUE TRUE FALSE

If your codes are actually binary, then you can optimize slightly more and dispense with matrixStats:

# Binary-code variant that needs base R only: with 0/1 digits, a position is
# constant within a group exactly when its digit sum is 0 or the group size.
#
# l: list of equal-length character vectors of 0/1 digit strings; the digit
#    count is taken from the first string of the first vector.
# Returns a logical matrix with one row per element of `l` and one column per
# digit position.
h <- function(l) {
  n_codes <- length(l[[1L]])
  n_groups <- length(l)
  n_digits <- nchar(l[[1L]][1L])

  # One column per group, one row per code.
  remainder <- matrix(
    as.integer(unlist(l, recursive = FALSE, use.names = FALSE)),
    nrow = n_codes, ncol = n_groups
  )
  powers <- as.integer(10^((n_digits - 1L):0))

  varies <- vector("list", length(powers))
  for (k in seq_along(powers)) {
    digit <- remainder %/% powers[k]
    # Drop the digit just extracted so the next power sees the lower digits.
    remainder <- remainder - powers[k] * digit
    # A column sum that is neither 0 nor n_codes means the digit is mixed.
    varies[[k]] <- .colSums(digit, n_codes, n_groups) %% n_codes > 0L
  }
  matrix(
    unlist(varies, recursive = FALSE, use.names = FALSE),
    nrow = n_groups, ncol = n_digits
  )
}
h(l)
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] FALSE FALSE FALSE FALSE FALSE FALSE
## [2,] TRUE TRUE TRUE TRUE TRUE TRUE
## [3,] FALSE FALSE TRUE TRUE TRUE FALSE

Here is a benchmark on a length-10000 list of length-8 character vectors of 6-digit binary codes.

# Recycle the three example groups into a list of 10000 groups and time the
# three implementations on it.
ll <- rep_len(l, 10000L)
microbenchmark::microbenchmark(
  f(ll),
  g(ll),
  h(ll)
)
## Unit: milliseconds
## expr min lq mean median uq max neval
## f(ll) 41.583143 55.960510 66.201555 64.211679 73.542807 127.47810 100
## g(ll) 8.612173 8.856123 9.725214 8.946077 9.116391 46.66698 100
## h(ll) 7.622679 7.824789 8.717184 7.887519 7.987128 46.32225 100


Related Topics



Leave a reply



Submit