Remove Certain Words in String from Column in Dataframe in R

Remove certain words in string from column in dataframe in R

We can use the 'tm' package:

library(tm)

# Read the stop-word list from disk (expected: one word per line)
stopwords <- readLines("stopwords.txt")

# Pull out the company column and strip every stop word from it
x <- df$company
x <- removeWords(x, stopwords)

# Store the cleaned values in a new column so they can be compared
# side by side with the original
df$company_new <- x

Remove specific words from dataframe column in R

This should work:

# Remove every entry of `word_to_remove` from `df$text`.
# The alternatives are wrapped in \\b( ... )\\b so that only whole words are
# deleted; without the word boundaries, removing "the" would also turn
# "other" into "or". Entries are still interpreted as regular expressions.
df$text <- gsub(
  paste0("\\b(", paste(word_to_remove, collapse = "|"), ")\\b"),
  "",
  df$text
)

r- how to remove a particular string from column values

As an example see this process:

# example data
# Two text columns, each value starting with a label that should be stripped
x <- c("Full Name A B", "Full Name F B")
y <- c("Playing role G G", "Playing role G M")

# Combine them into the example data frame (columns are named x and y)
dt <- data.frame(x, y)

# Show the starting point
dt

#   x             y
# 1 Full Name A B Playing role G G
# 2 Full Name F B Playing role G M

library(dplyr)

# Strip the leading labels from every column.
# across(everything(), ...) replaces the superseded mutate_all(); the regex
# lists each label (with its trailing space) as an alternative.
dt %>%
  mutate(
    across(
      everything(),
      ~ gsub("Full Name |Playing role |Batting style |Bowling style ", "", .x)
    )
  )

#   x   y
# 1 A B G G
# 2 F B G M

How to remove all words that are not in a specified list from a dataframe column in R

Here is another base R option:

# Keep only the words of each bio that appear in `termstokeep`.
# Each bio is split on whitespace and filtered with %in%, which preserves the
# original word order and any repeated words (intersect() would silently drop
# duplicates). vapply() guarantees a character vector even when `df` has no
# rows, where sapply() would return an empty list.
df$bio <- vapply(
  strsplit(df$bio, "\\s"),
  function(words) paste(words[words %in% termstokeep], collapse = " "),
  character(1)
)

Output

  account                          bio
1   38374 love much life proud liberal
2   45673                 can just get
3   94928  conserv christian mom proud
4   11204                        women
5   37465 former state coach now proud

Data (thanks @RuiBarradas!)

# Example data: five accounts, each with a free-text bio
df <- data.frame(
  account = c(38374L, 45673L, 94928L, 11204L, 37465L),
  bio = c(
    "i love candy as much as life itself proud liberal",
    "can all just get along",
    "conserv christian mom and proud pro trump veteran maga",
    "professor of women and gender studies at wesleyan university blacklivesmatter",
    "former ohio state football coach now a proud papa to seven grandchildren"
  ),
  stringsAsFactors = FALSE
)

# Whitelist of terms to keep in `bio`; every word not listed here is dropped.
# Many entries look like stemmed forms (e.g. "univers", "famili"), so they
# only match bios that contain exactly those tokens.
termstokeep <- c("love", "life", "follow", "live", "just", "like", "music", 
"regist", "trademark", "make", "fan", "one", "copyright", "lover", 
"thing", "world", "time", "god", "can", "get", "design", "peopl", 
"artist", "girl", "univers", "writer", "will", "student", "work", 
"busi", "good", "new", "know", "friend", "famili", "best", "day", 
"account", "market", "sport", "art", "game", "manag", "want", 
"book", "enthusiast", "person", "alway", "travel", "never", "free", 
"real", "help", "dream", "servic", "mom", "husband", "profession", 
"beauti", "offici", "wife", "now", "news", "social", "food", 
"come", "father", "heart", "educ", "develop", "need", "anim", 
"everyth", "proud", "tri", "year", "happi", "also", "media", 
"way", "man", "team", "produc", "look", "state", "take", "back", 
"support", "director", "home", "find", "call", "engin", "learn", 
"provid", "photograph", "great", "author", "video", "guy", "communiti", 
"coach", "name", "big", "passion", "see", "teacher", "school", 
"product", "sinc", "gamer", "enjoy", "keep", "player", "better", 
"let", "believ", "mother", "think", "mind", "dog", "futur", "give", 
"colleg", "say", "owner", "jesus", "fun", "got", "littl", "chang", 
"founder", "boy", "use", "first", "liberal", "write", "footbal", 
"kid", "fuck", "event", "polit", "consult", "care", "conserv", 
"much", "health", "technolog", "tech", "opinion", "stay", "everi", 
"right", "full", "former", "member", "special", "well", "young", 
"high", "creat", "snap", "entrepreneur", "movi", "feel", "view", 
"compani", "coffe", "cat", "citi", "human", "digit", "show", 
"singer", "sometim", "interest", "dad", "watch", "scienc", "creativ", 
"blogger", "base", "addict", "fit", "read", "bless", "fashion", 
"part", "noth", "run", "forev", "editor", "born", "hard", "die", 
"around", "onlin", "nerd", "class", "web", "musician", "made", 
"stuff", "leader", "ever", "inspir", "still", "christian", "place", 
"current", "public", "danc", "pleas", "geek", "talk", "film", 
"realli", "babi", "someth", "page", "rock", "lot", "women", "lead", 
"two")

How to retain a specific word in a string throughout the column and remove all the rest?

Here is a possibility

library(tidyverse)
# If a value contains "word1" anywhere, replace the whole value with "word1";
# values without it are left as they are.
data %>%
  mutate(
    column = str_replace(column, "^.*word1.*$", "word1")
  )
       column
1       word1
2       word1
3 word3 word2
4       word1
5       word3
6       word1

or with a capture group

# Same idea, but the kept word is captured by the group and written back
# through the backreference \\1 instead of being repeated as a literal.
data %>%
  mutate(
    column = str_replace(column, "^.*(word1).*$", "\\1")
  )

R gsub remove words in column y from words in column x

Normally gsub takes three arguments 1) pattern, 2) replacement and 3) vector to replace values.

The pattern must be a single string, and so must the replacement. The only argument that accepts multiple values is the vector being searched; that is the sense in which gsub is vectorized.

# Counter-example: the pattern argument must be a single string, so gsub()
# does not pair each element of df$x with the corresponding element of df$y
# (R uses only the first pattern element and warns about it).
gsub(df$x, "", df$y)  #doesn't work because 'df$x' isn't one string

The pattern argument is not vectorized, but we can use mapply to complete the task.

mapply and gsub (bffs)

# One pattern per text: x[i] is removed from y[i]
x <- c("a", "b", "c")
y <- c("asometext", "some, a b text", "c a text")

# A single replacement shared by every pattern/text pair
repl <- ""

# Call gsub() once per position, pairing the arguments element by element
mapply(gsub, pattern = x, replacement = repl, x = y)

#On the inside
# Conceptual expansion of the mapply() call: one gsub() per position.
# Illustrative only: `repl` has length 1, so repl[[2]] and repl[[3]] would be
# out of bounds if these lines were run as written; mapply() recycles `repl`
# to the length of the other arguments instead.
gsub(x[[1]], repl[[1]], y[[1]])
gsub(x[[2]], repl[[2]], y[[2]])
gsub(x[[3]], repl[[3]], y[[3]])

You may be asking: I only have one repl, so how do repl[[2]] and repl[[3]] work? mapply handles that for us by recycling 'repl' until its length matches that of the other arguments.

Matching an exact word from a column to a string in another column and removing the matching word from the string in the other column

You can use str_extract and gsub. I also added a scenario in row 4 where column 1 value doesn't match with column 2. You can find the final output in column 3.

library(stringr)
library(dplyr)

# Values to look for; the last one ("AB") deliberately matches nothing
col1 <- c(
  "STELLARN714WPUR",
  "STELLARN714WRED",
  "STELLARN814WRED",
  "AB"
)

# Comma-separated lists that are searched for the value in the same row
col2 <- c(
  "STELLARN594WPUR,STELLARN714WPUR,STELLARN814WPUR",
  "STELLARN594WRED,STELLARN814WRED,STELLARN714WRED",
  "STELLARN594WRED,STELLARN714WRED,STELLARN814WRED",
  "STELLARN594WPUR,STELLARN714WPUR,STELLARN814WPUR"
)

# Keep both columns as plain character vectors
df <- data.frame(
  column1 = col1,
  Column2 = col2,
  stringsAsFactors = FALSE
)
df
          column1                                         Column2
1 STELLARN714WPUR STELLARN594WPUR,STELLARN714WPUR,STELLARN814WPUR
2 STELLARN714WRED STELLARN594WRED,STELLARN814WRED,STELLARN714WRED
3 STELLARN814WRED STELLARN594WRED,STELLARN714WRED,STELLARN814WRED
4              AB STELLARN594WPUR,STELLARN714WPUR,STELLARN814WPUR

# For each row, find the column1 value inside the comma-separated Column2 and
# build Column3 as Column2 with that value (and one adjoining comma) removed.
# Rows without a match keep Column2 unchanged.
#
# The pattern has three alternatives so the value is removed wherever it sits:
#   "VAL,"   - value followed by a comma (first or middle position)
#   ",VAL"   - value preceded by a comma (last position)
#   "^VAL$"  - value is the only element
# \\s* tolerates optional spaces after a comma. The earlier pattern used
# ", VAL" with a mandatory space, so a match in the last position of a list
# without spaces was never removed.
df %>%
  mutate(
    match_val = str_extract(Column2, column1),
    Column3 = ifelse(
      is.na(match_val),
      Column2,
      str_replace(
        Column2,
        paste0(match_val, ",\\s*|,\\s*", match_val, "|^", match_val, "$"),
        ""
      )
    )
  )

#           column1                                         Column2       match_val
# 1 STELLARN714WPUR STELLARN594WPUR,STELLARN714WPUR,STELLARN814WPUR STELLARN714WPUR
# 2 STELLARN714WRED STELLARN594WRED,STELLARN814WRED,STELLARN714WRED STELLARN714WRED
# 3 STELLARN814WRED STELLARN594WRED,STELLARN714WRED,STELLARN814WRED STELLARN814WRED
# 4              AB STELLARN594WPUR,STELLARN714WPUR,STELLARN814WPUR            <NA>
#                                           Column3
# 1                 STELLARN594WPUR,STELLARN814WPUR
# 2                 STELLARN594WRED,STELLARN814WRED
# 3                 STELLARN594WRED,STELLARN714WRED
# 4 STELLARN594WPUR,STELLARN714WPUR,STELLARN814WPUR

Remove Certain Words in String from Column in Dataframe in R