Partial Animal String Matching in R

Partial animal string matching in R

There may be a more elegant solution than this, but you could use grep with | to specify alternative matches.

# Species label -> alternation of the substrings that identify it.
species_patterns <- c(
  feline = "cat|lion|tiger|panther",
  avian = "bird|eagle|sparrow",
  canine = "dog|yorkie"
)

# Labels are applied in the order listed, so a later label overwrites an
# earlier one when a name matches more than one pattern.
for (species_label in names(species_patterns)) {
  matched_rows <- grep(species_patterns[[species_label]], d$name)
  d[matched_rows, "species"] <- species_label
}

I've assumed you meant "avian", and left out "bulldog" since it contains "dog".

You might want to add ignore.case = TRUE to the grep.

output:

#                 name label species
#1           brown cat     1  feline
#2            blue cat     2  feline
#3            big lion     3  feline
#4          tall tiger     4  feline
#5       black panther     5  feline
#6           short cat     6  feline
#7            red bird     7   avian
#8  short bird stuffed     8   avian
#9           big eagle     9   avian
#10        bad sparrow    10   avian
#11           dog fish    11  canine
#12           head dog    12  canine
#13       brown yorkie    13  canine
#14  lab short bulldog    14  canine

Partial string matching using patterns

The grep function supports regular expressions, and with regular expressions you can match almost anything.

# Example sentences to search.
y <- c(
  "I am looking for a dog",
  "looking for a new dog",
  "a dog",
  "I am just looking"
)

# Keep the elements where "looking" is followed, anywhere later in the string,
# by "dog". value = TRUE returns the matching strings rather than their
# indices (TRUE is spelled out because T can be reassigned).
grep("looking.*dog", y, value = TRUE)
# [1] "I am looking for a dog" "looking for a new dog"

This pattern looks for "looking", then any run of characters (the `.*`), then "dog", so it should do what you want.

partial matching in r- multiple matches

I'd do this through a cross join.

library(dplyr)
library(stringi)

# Lookup table pairing each substring with the category it indicates.
# tibble() is used because data_frame() is deprecated.
key <- tibble(
  partial = c(
    "cat", "lion", "tiger", "panther",
    "bird", "eagle", "sparrow",
    "dog", "yorkie", "bulldog"
  ),
  category = c(
    "feline", "feline", "feline", "feline",
    "avian", "avian", "avian",
    "canine", "canine", "canine"
  )
)

# merge() without shared column names returns every (row, keyword) pairing
# (assumes d has no column called partial or category -- confirm). Keep the
# pairings where the name contains the keyword as a literal substring.
d %>%
  merge(key) %>%
  filter(stri_detect_fixed(name, partial))

Partial String Match with matching regular expression in new column - R

A base R option. Just because.

# Substrings to look for.
patt <- c("ll", "ood")

# seq_along() iterates zero times for an empty patt, whereas 1:length(patt)
# would run over c(1, 0).
for (i in seq_along(patt)) {
  # Record the pattern on every row whose word contains it; a later pattern
  # overwrites an earlier one if a word contains both.
  try.dat[grep(patt[i], try.dat$words), "match"] <- patt[i]
}
try.dat
#    num  words match
#1    1   hello    ll
#2    2 goodbye   ood
#3    3 tidings  <NA>
#4    4  partly  <NA>
#5    5 totally    ll
#6    6   hello    ll
#7    7 goodbye   ood
#8    8 tidings  <NA>
#9    9  partly  <NA>
#10  10 totally    ll

R: Add column to dataframe based on partial matching characters

Here is something:

# For each short ID, find the position of the first extended ID that starts
# with it (NA when none does), then use those positions to pull the extended
# IDs in df1's row order.
# vapply() always yields an integer index; sapply() would return a list for
# zero-length input, which is not a valid subscript.
df1$D_Extended <-
  df2$ID_Extended[vapply(
    df1$ID_short,
    function(x) match(x, substr(df2$ID_Extended, 1, nchar(x))),
    integer(1)
  )]

df1
  ID_short Value        D_Extended
1     Boar     4 Boar_Ko_1999_test
2      Pig     5             PigGG
3     Duck     6          Duck_p15
4      Dog     7             Dog32
5      Cat     8            Cat_Ok
6    Horse     9         Horse_p12

Data:

# Short identifiers with their associated values.
df1 <- data.frame(
  ID_short = c("Boar", "Pig", "Duck", "Dog", "Cat", "Horse"),
  Value = seq(4L, 9L),
  stringsAsFactors = FALSE
)

# Extended identifiers, listed in a different order from df1.
df2 <- data.frame(
  ID_Extended = c(
    "Duck_p15", "Dog32", "PigGG",
    "Horse_p12", "Cat_Ok", "Boar_Ko_1999_test"
  ),
  stringsAsFactors = FALSE
)

Partial string match two columns R

Here is a solution using dplyr for piping. The core component is using grepl for logical string matching of species in both species.descriptor and product.authorise.

library(dplyr)
# Evaluate one row at a time: grepl() uses only the first element of its
# pattern argument, so each call must see a single species value.
dats %>%
  rowwise() %>%
  mutate(
    authorised =
      grepl(species, species.descriptor) &
      grepl(species, product.authorise)
  )

Source: local data frame [3 x 6]
Groups: <by row>

     ID species species.descriptor product         product.authorise authorised
  (int)  (fctr)             (fctr)   (dbl)                    (fctr)      (lgl)
1     1     dog     all animal dog       1 all animal dog cat rabbit       TRUE
2     2     cat     all animal cat       2             cat horse pig       TRUE
3     3  rabbit      rabbit exotic       3                   dog cat      FALSE

If you really like stringr you can use the str_detect function for more user friendly syntax.

library(stringr)
# str_detect() is vectorised over both string and pattern, so each row's
# species is checked against that row's own text without rowwise().
dats %>%
  mutate(
    authorised =
      str_detect(species.descriptor, species) &
      str_detect(product.authorise, species)
  )

And if you don't like dplyr you can add the column directly

# Same check without dplyr: TRUE where the species appears in both the
# descriptor and the authorised-product text of the same row.
dats$authorised <-
  str_detect(dats[["species.descriptor"]], dats[["species"]]) &
  str_detect(dats[["product.authorise"]], dats[["species"]])

R - All or Partial String Matching?

Based on the data shown, all of them should be TRUE

library(dplyr)
library(stringr)
# Build a single alternation: wrap every name in \b ... \b, replace each
# space with the text \b|\b so the words of a multi-word name become
# separate alternatives, then join all names with "|".
pat <- str_c("\\b", name, "\\b") %>%
  gsub(pattern = " ", replacement = "\\b|\\b", fixed = TRUE) %>%
  str_c(collapse = "|")

# Flag the tweets whose text matches any alternative in pat.
tweets %>%
  mutate(ind = str_detect(text, pat))

-output

#                                                               text   ind
#1           Smith said he’s not counting on Monday being a makeup day.  TRUE
#2       Williams says that Steve Austin will miss the rest of the week  TRUE
#3 Weird times: Jeff Smith just got thrown out attempting to steal home  TRUE
#4                                            Rest day for Austin today  TRUE
#5                                  Jeff says he expects to bat leadoff  TRUE
#6                                                                 Jeff  TRUE
#7                                          No reference to either name FALSE

Create new column in dataframe based on partial string matching other column

Since you have only two conditions, you can use a nested ifelse:

# random data; it wasn't easy to copy-paste yours
DF <- data.frame(
  GL = sample(10),
  GLDESC = paste(
    sample(letters, 10),
    c(
      "gas", "payroll12", "GaSer", "asdf", "qweaa", "PayROll-12",
      "asdfg", "GAS--2", "fghfgh", "qweee"
    ),
    sample(letters, 10),
    sep = " "
  )
)

# "gas" is tested first, so a description matching both patterns becomes
# "Materials"; anything matching neither becomes "Other".
# TRUE is spelled out because T can be reassigned.
DF$KIND <- ifelse(
  grepl("gas", DF$GLDESC, ignore.case = TRUE), "Materials",
  ifelse(grepl("payroll", DF$GLDESC, ignore.case = TRUE), "Payroll", "Other")
)

DF
#   GL         GLDESC      KIND
#1   8        e gas l Materials
#2   1  c payroll12 y   Payroll
#3  10      m GaSer v Materials
#4   6       t asdf n     Other
#5   2      w qweaa t     Other
#6   4 r PayROll-12 q   Payroll
#7   9      n asdfg a     Other
#8   5     d GAS--2 w Materials
#9   7     s fghfgh e     Other
#10  3      g qweee k     Other

EDIT 10/3/2016 (..after receiving more attention than expected)

A possible solution to deal with more patterns could be to iterate over all patterns and, whenever there is a match, progressively reduce the number of comparisons:

#' Label strings by the first pattern they match
#'
#' Patterns are tried in order. Once an element of `x` matches a pattern it is
#' labelled and excluded from later comparisons, so earlier patterns take
#' priority and each pass searches fewer strings.
#'
#' @param x Vector of strings to classify.
#' @param patterns Patterns, passed one at a time to `grepl()`.
#' @param replacements Labels, one per pattern; defaults to the patterns.
#' @param fill Label for elements that match no pattern (coerced to
#'   character).
#' @param ... Further arguments forwarded to `grepl()`, e.g. `ignore.case`.
#' @return A character vector the same length as `x`.
ff <- function(x, patterns, replacements = patterns, fill = NA, ...) {
  stopifnot(length(patterns) == length(replacements))

  ans <- rep_len(as.character(fill), length(x))
  # Positions in x that have not matched any pattern yet.
  empty <- seq_along(x)

  for (i in seq_along(patterns)) {
    # Nothing left to classify, so the remaining patterns cannot change ans.
    if (length(empty) == 0L) {
      break
    }
    greps <- grepl(patterns[[i]], x[empty], ...)
    ans[empty[greps]] <- replacements[[i]]
    empty <- empty[!greps]
  }

  ans
}

# Two patterns with a catch-all label, matched case-insensitively.
ff(
  DF$GLDESC,
  patterns = c("gas", "payroll"),
  replacements = c("Materials", "Payroll"),
  fill = "Other",
  ignore.case = TRUE
)
# [1] "Materials" "Payroll"   "Materials" "Other"     "Other"     "Payroll"   "Other"     "Materials" "Other"     "Other"

# Pattern order decides ties: with "pat1a|pat1b" first, "pat1a pat2" gets "1".
ff(
  c("pat1a pat2", "pat1a pat1b", "pat3", "pat4"),
  patterns = c("pat1a|pat1b", "pat2", "pat3"),
  replacements = c("1", "2", "3"),
  fill = "empty"
)
#[1] "1"     "1"     "3"     "empty"

# With "pat2" first, the same string gets "2" instead.
ff(
  c("pat1a pat2", "pat1a pat1b", "pat3", "pat4"),
  patterns = c("pat2", "pat1a|pat1b", "pat3"),
  replacements = c("2", "1", "3"),
  fill = "empty"
)
#[1] "2"     "1"     "3"     "empty"

Partial Animal String Matching in R