Partial Animal String Matching in R

Partial animal string matching in R

There may be a more elegant solution than this, but you could use grep with | to specify alternative matches.

# Map each species label to the alternation of name fragments that implies it.
# The labels are applied in this order, so a later label overwrites an earlier
# one on any row whose name matches both patterns.
species_patterns <- c(
  feline = "cat|lion|tiger|panther",
  avian = "bird|eagle|sparrow",
  canine = "dog|yorkie"
)
for (species_label in names(species_patterns)) {
  hits <- grepl(species_patterns[[species_label]], d$name)
  d[hits, "species"] <- species_label
}

I've assumed you meant "avian", and left out "bulldog" since it contains "dog".

You might want to add ignore.case = TRUE to each grep call so that the matching is case-insensitive.

output:

#                 name label species
#1 brown cat 1 feline
#2 blue cat 2 feline
#3 big lion 3 feline
#4 tall tiger 4 feline
#5 black panther 5 feline
#6 short cat 6 feline
#7 red bird 7 avian
#8 short bird stuffed 8 avian
#9 big eagle 9 avian
#10 bad sparrow 10 avian
#11 dog fish 11 canine
#12 head dog 12 canine
#13 brown yorkie 13 canine
#14 lab short bulldog 14 canine

Partial string matching using patterns

The grep function supports regular expressions, and with regular expressions you can match almost anything.

# Sample sentences to search.
y <- c(
  "I am looking for a dog",
  "looking for a new dog",
  "a dog",
  "I am just looking"
)
# Return the matching strings themselves (value = TRUE) rather than their
# indices. TRUE is spelled out because T is an ordinary, reassignable variable.
grep("looking.*dog", y, value = TRUE)
# [1] "I am looking for a dog" "looking for a new dog"

Here the pattern looks for "looking", then any sequence of characters (possibly empty), then "dog", so it should do what you want.

partial matching in r- multiple matches

I'd do this through a cross join.

library(dplyr)
library(stringi)

# Lookup table: one row per name fragment and the category it implies.
# tibble() replaces the deprecated data_frame() constructor.
key <- tibble(
  partial = c(
    "cat", "lion", "tiger", "panther",
    "bird", "eagle", "sparrow",
    "dog", "yorkie", "bulldog"
  ),
  category = c(
    "feline", "feline", "feline", "feline",
    "avian", "avian", "avian",
    "canine", "canine", "canine"
  )
)

# Pair every row of `d` with every row of `key` (by = NULL requests the
# Cartesian product explicitly instead of relying on the two tables having no
# column names in common), then keep the pairs whose `name` contains the
# `partial` as a literal substring.
d %>%
  merge(key, by = NULL) %>%
  filter(stri_detect_fixed(name, partial))

Partial String Match with matching regular expression in new column - R

A base R option. Just because.

patt <- c("ll", "ood")
# Write each pattern into the `match` column of the rows whose `words` contain
# it. Iterating over the patterns directly (rather than 1:length(patt)) means
# an empty pattern vector simply skips the loop. A later pattern overwrites an
# earlier one on rows that match both.
for (p in patt) {
  try.dat[grep(p, try.dat$words), "match"] <- p
}
try.dat
# num words match
#1 1 hello ll
#2 2 goodbye ood
#3 3 tidings <NA>
#4 4 partly <NA>
#5 5 totally ll
#6 6 hello ll
#7 7 goodbye ood
#8 8 tidings <NA>
#9 9 partly <NA>
#10 10 totally ll

R: Add column to dataframe based on partial matching characters

Here is something:

# For every short ID, locate the first extended ID whose leading characters
# equal it, and copy that extended ID into df1 (NA when there is none).
# vapply() with integer(1) keeps the index vector type-stable; sapply() would
# return a list for zero-length input and break the subsetting.
df1$D_Extended <- df2$ID_Extended[
  vapply(
    df1$ID_short,
    function(x) match(x, substr(df2$ID_Extended, 1, nchar(x))),
    integer(1)
  )
]

df1
ID_short Value D_Extended
1 Boar 4 Boar_Ko_1999_test
2 Pig 5 PigGG
3 Duck 6 Duck_p15
4 Dog 7 Dog32
5 Cat 8 Cat_Ok
6 Horse 9 Horse_p12

Data:

# Short identifiers with an associated value; strings are kept as character.
df1 <- data.frame(
  stringsAsFactors = FALSE,
  ID_short = c(
    "Boar",
    "Pig",
    "Duck",
    "Dog",
    "Cat",
    "Horse"
  ),
  Value = 4:9
)

# Extended identifiers, deliberately listed in a different order from df1.
df2 <- data.frame(
  stringsAsFactors = FALSE,
  ID_Extended = c(
    "Duck_p15",
    "Dog32",
    "PigGG",
    "Horse_p12",
    "Cat_Ok",
    "Boar_Ko_1999_test"
  )
)

Partial string match two columns R

Here is a solution using dplyr for piping. The core component is using grepl for logical string matching of species in both species.descriptor and product.authorise.

library(dplyr)
# Work row by row so that each row's own species is the search pattern, and
# flag the rows where it occurs in both text columns.
dats %>%
  rowwise() %>%
  mutate(
    authorised = grepl(pattern = species, x = product.authorise) &
      grepl(pattern = species, x = species.descriptor)
  )

Source: local data frame [3 x 6]
Groups: <by row>

ID species species.descriptor product product.authorise authorised
(int) (fctr) (fctr) (dbl) (fctr) (lgl)
1 1 dog all animal dog 1 all animal dog cat rabbit TRUE
2 2 cat all animal cat 2 cat horse pig TRUE
3 3 rabbit rabbit exotic 3 dog cat FALSE

If you really like stringr you can use the str_detect function for more user friendly syntax.

library(stringr)
# Flag the rows whose species occurs in both the descriptor column and the
# authorised-product column.
dats %>%
  mutate(
    authorised = str_detect(string = species.descriptor, pattern = species) &
      str_detect(string = product.authorise, pattern = species)
  )

And if you don't like dplyr you can add the column directly

# Same check without dplyr: evaluate both detections against the columns of
# `dats` and store their conjunction as a new column.
dats$authorised <- with(dats, {
  in_descriptor <- str_detect(species.descriptor, species)
  in_product <- str_detect(product.authorise, species)
  in_descriptor & in_product
})

R - All or Partial String Matching?

Based on the data shown, all of them should be TRUE.

library(dplyr)
library(stringr)
# Build one alternation in which every space-separated word of every name is
# wrapped in word boundaries: surround each name with \b, turn each space into
# "\b|\b", then join the names with "|".
pat <- str_c("\\b", name, "\\b") %>%
  gsub(pattern = " ", replacement = "\\b|\\b", x = ., fixed = TRUE) %>%
  str_c(collapse = "|")

# Mark the tweets whose text matches the combined pattern.
mutate(tweets, ind = str_detect(text, pat))

-output

#                                                               text   ind
#1 Smith said he’s not counting on Monday being a makeup day. TRUE
#2 Williams says that Steve Austin will miss the rest of the week TRUE
#3 Weird times: Jeff Smith just got thrown out attempting to steal home TRUE
#4 Rest day for Austin today TRUE
#5 Jeff says he expects to bat leadoff TRUE
#6 Jeff TRUE
#7 No reference to either name FALSE

Create new column in dataframe based on partial string matching other column

Since you have only two conditions, you can use a nested ifelse:

#random data; it wasn't easy to copy-paste yours  
# Ten rows: a shuffled id plus a description made of a keyword surrounded by
# random letters.
DF <- data.frame(
  GL = sample(10),
  GLDESC = paste(
    sample(letters, 10),
    c(
      "gas", "payroll12", "GaSer", "asdf", "qweaa", "PayROll-12",
      "asdfg", "GAS--2", "fghfgh", "qweee"
    ),
    sample(letters, 10),
    sep = " "
  )
)

# Classify case-insensitively: "gas" takes precedence over "payroll", and
# anything else is "Other". TRUE is spelled out because T can be reassigned.
DF$KIND <- ifelse(
  grepl("gas", DF$GLDESC, ignore.case = TRUE),
  "Materials",
  ifelse(grepl("payroll", DF$GLDESC, ignore.case = TRUE), "Payroll", "Other")
)

DF
# GL GLDESC KIND
#1 8 e gas l Materials
#2 1 c payroll12 y Payroll
#3 10 m GaSer v Materials
#4 6 t asdf n Other
#5 2 w qweaa t Other
#6 4 r PayROll-12 q Payroll
#7 9 n asdfg a Other
#8 5 d GAS--2 w Materials
#9 7 s fghfgh e Other
#10 3 g qweee k Other

EDIT 10/3/2016 (..after receiving more attention than expected)

A possible solution for handling more patterns is to iterate over all of them and, whenever there is a match, progressively reduce the number of comparisons by dropping the elements that have already been matched:

#' Label each element of `x` by the first pattern it matches
#'
#' Patterns are tried in order. Once an element has matched a pattern it is
#' removed from further consideration, so earlier patterns take precedence and
#' each later pattern is tested against fewer elements.
#'
#' @param x Vector to search; passed to `grepl()`.
#' @param patterns Vector or list of patterns, tried in the order given.
#' @param replacements Labels to assign, one per pattern. Defaults to the
#'   patterns themselves. Must have the same length as `patterns`.
#' @param fill Value (coerced to character) given to elements that match no
#'   pattern.
#' @param ... Further arguments forwarded to `grepl()`, e.g.
#'   `ignore.case = TRUE`.
#' @return A vector the same length as `x` holding the assigned labels.
ff <- function(x, patterns, replacements = patterns, fill = NA, ...) {
  stopifnot(length(patterns) == length(replacements))

  ans <- rep_len(as.character(fill), length(x))
  # Positions in `x` that no pattern has matched yet.
  empty <- seq_along(x)

  for (i in seq_along(patterns)) {
    # Everything is already labelled: the remaining patterns cannot change
    # the result, so skip them.
    if (length(empty) == 0L) {
      break
    }
    greps <- grepl(patterns[[i]], x[empty], ...)
    ans[empty[greps]] <- replacements[[i]]
    empty <- empty[!greps]
  }

  ans
}

# Reproduce the nested-ifelse classification, case-insensitively.
ff(
  x = DF$GLDESC,
  patterns = c("gas", "payroll"),
  replacements = c("Materials", "Payroll"),
  fill = "Other",
  ignore.case = TRUE
)
# [1] "Materials" "Payroll" "Materials" "Other" "Other" "Payroll" "Other" "Materials" "Other" "Other"

# Pattern order matters: the first pattern that matches an element wins.
ff(
  x = c("pat1a pat2", "pat1a pat1b", "pat3", "pat4"),
  patterns = c("pat1a|pat1b", "pat2", "pat3"),
  replacements = c("1", "2", "3"),
  fill = "empty"
)
#[1] "1" "1" "3" "empty"

ff(
  x = c("pat1a pat2", "pat1a pat1b", "pat3", "pat4"),
  patterns = c("pat2", "pat1a|pat1b", "pat3"),
  replacements = c("2", "1", "3"),
  fill = "empty"
)
#[1] "2" "1" "3" "empty"


Related Topics



Leave a reply



Submit