Simple Comparing of Two Texts in R

Identify differences in text paragraphs with R

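We concatenate both strings into one named vector and split each at the whitespace that follows a period to create a list of sentences ('lst'). Unlisting 'lst' and keeping the unique elements gives 'un1', the pool of all sentences. Using setdiff, we then get, for each text, the elements of 'un1' that the text does not contain.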

# Split each text into sentences at the whitespace that follows a period.
lst <- strsplit(
  x = c(a = a, b = b),
  split = "(?<=[.])\\s",
  perl = TRUE
)
# Pool the sentences of both texts and drop the duplicates.
un1 <- unique(unlist(lst))
# For each text, keep the pooled sentences that the text itself lacks.
lapply(lst, function(sentences) setdiff(un1, sentences))

Comparing two text files using R

That HTML widget package won't give you back output but it's based on a javascript library that was based on a python module.

We'll use the Python version but we won't use the reticulate package b/c I'm not about to show how to iterate over Python structures in R, so we'll take the pointer from the Python page about the script being at Tools/scripts/diff.py and grab it from github to avoid trying to find it on your system. This does mean python needs to be installed. Python 3 to be precise (since that's a fragile, fragmented ecosystem).

# Fetch the diff script and store it in a temporary .py file.
tf <- tempfile(fileext = ".py")
# NOTE(review): on.exit() registers cleanup for the enclosing function call;
# confirm it behaves as intended when this runs as top-level code.
on.exit(unlink(tf), add = TRUE)
writeLines(
  text = readLines(
    con = "https://raw.githubusercontent.com/python/cpython/master/Tools/scripts/diff.py"
  ),
  con = tf
)

Now, we'll find the python3 binary on your system and the pip3 binary on your system:

# Look up the installer and the interpreter on the search path.
pip <- Sys.which(names = "pip3")
python <- Sys.which(names = "python3")

And make sure a really critical module is installed that should always be installed but python is so daft it isn't:

# Ask pip to install "datetime" in case it is missing.
# NOTE(review): Python ships a standard-library `datetime` module, so this
# step may be unnecessary -- confirm before relying on it.
system2(pip, c("install", "datetime"))

Now run the diff on two made up files of mine:

# Run the downloaded script on both files; stdout = TRUE returns the
# script's output as a character vector, one element per line.
res <- system2(
  command = python,
  args = c(tf, path.expand(c("~/Data/so.txt", "~/Data/so1.txt"))),
  stdout = TRUE
)

And see the output you now need to parse:

# Show the captured output of the diff script.
res
##  [1] "*** /Users/bob/Data/so.txt\t2018-10-15T06:38:07.169832-04:00" 
##  [2] "--- /Users/bob/Data/so1.txt\t2018-10-18T08:50:51.745551-04:00"
##  [3] "***************"                                              
##  [4] "*** 6,29 ****"                                                
##  [5] "  QX = X-ray|NRW"                                             
##  [6] "  UI = Q000000981"                                            
##  [7] "  "                                                           
##  [8] "- *NEWRECORD"                                                 
##  [9] "- RECTYPE = Q"                                                
## [10] "- SH = analogs & derivatives"                                 
## [11] "- QE = ANALOGS"                                               
## [12] "- QA = AA"                                                    
## [13] "- QT = 1"                                                     
## [14] "- "                                                           
## [15] "- *NEWRECORD"                                                 
## [16] "- RECTYPE = Q"                                                
## [17] "- SH = abnormalities"                                         
## [18] "- QE = ABNORM"                                                
## [19] "- QX = agenesis|NRW"                                          
## [20] "- QX = anomalies|EQV"                                         
## [21] "- QX = aplasia|NRW"                                           
## [22] "- QX = atresia|NRW"                                           
## [23] "- QX = birth defects|NRW"                                     
## [24] "- QX = congenital defects|NRW"                                
## [25] "- QX = defects|NRW"                                           
## [26] "- QX = deformities|NRW"                                       
## [27] "- QX = hypoplasia|NRW"                                        
## [28] "- UI = Q000002"                                               
## [29] "--- 6,8 ----"

Having done all that ^^, you could also just use tools::Rdiff():

# Compare the two files with the diff utility bundled in the tools package;
# Log = TRUE returns the status and the diff lines instead of only printing.
res <- tools::Rdiff(
  from = "~/Data/so.txt",
  to = "~/Data/so1.txt",
  Log = TRUE
)
res
## $status
## [1] 1
## 
## $out
##  [1] "files differ in number of lines" "9,29d8"                         
##  [3] "< *NEWRECORD"                    "< RECTYPE = Q"                  
##  [5] "< SH = analogs & derivatives"    "< QE = ANALOGS"                 
##  [7] "< QA = AA"                       "< QT = 1"                       
##  [9] "< "                              "< *NEWRECORD"                   
## [11] "< RECTYPE = Q"                   "< SH = abnormalities"           
## [13] "< QE = ABNORM"                   "< QX = agenesis|NRW"            
## [15] "< QX = anomalies|EQV"            "< QX = aplasia|NRW"             
## [17] "< QX = atresia|NRW"              "< QX = birth defects|NRW"       
## [19] "< QX = congenital defects|NRW"   "< QX = defects|NRW"             
## [21] "< QX = deformities|NRW"          "< QX = hypoplasia|NRW"          
## [23] "< UI = Q000002"

but I wanted to show the twisty path first :-)

How to compare two strings in R?

It's pretty simple actually.

s1 <- "string"
s2 <- "String"

# Exact comparison: letter case matters here
s1 == s2

# Fold both sides to lower case first, so letter case is ignored
tolower(s1) == tolower(s2)

The output in the first case is FALSE and in the second case it is TRUE. You can use toupper() as well.

how to compare two strings in R

One possibility would be to use str_split (from the stringr package) and then map2 from the purrr package.

First I create some pseudo data:

# Two pairs of sentences; x[i] is compared against y[i] word by word.
x <- c(
  "sentence number one",
  "another one is here"
)
y <- c(
  "setence number two",
  "aner one are hre"
)

Then I transform it:

# Split every sentence into its words (one character vector per sentence).
# str_split() lives in stringr, which this snippet never attaches, so it is
# called through its namespace to keep the example runnable on its own.
x2 <- stringr::str_split(x, " ")
y2 <- stringr::str_split(y, " ")

library(purrr)
# Walk the word lists pairwise: "" where the words agree, "left:right"
# where they differ.
map2(
  x2,
  y2,
  function(left, right) {
    ifelse(left == right, "", paste(left, right, sep = ":"))
  }
)

[[1]]
[1] "sentence:setence" ""                 "one:two"         

[[2]]
[1] "another:aner" ""             "is:are"       "here:hre"

r compare text in two columns by row

I don't understand how you want to organize the output in X3 and X4, but maybe this helps:

# Pool every word of each column into one de-duplicated vocabulary.
# Written with plain nested calls and a namespaced str_split() so the snippet
# does not depend on %>% or stringr having been attached beforehand.
words_x1 <- unique(
  stringr::str_split(paste(df$X1, collapse = " "), " ")[[1]]
)
words_x2 <- unique(
  stringr::str_split(paste(df$X2, collapse = " "), " ")[[1]]
)

# Words that occur in only one column: X1-only words first, then X2-only.
c(setdiff(words_x1, words_x2), setdiff(words_x2, words_x1))

I think what you want to achieve is something like this (note that I am using a tibble, as it does not seem to work with a data.frame):

library(dplyr)
library(purrr)

# Example input: each row pairs a sentence (X1) with a variant of it (X2).
df <- tibble(
  X1 = c(
    "the fox ate grapes",
    "the cat ate"
  ),
  X2 = c(
    "the fox ate watermelon",
    "the cat ate backwards"
  )
)
# Return the words that appear in only one of two space-separated strings:
# first the words found only in `x1`, then the words found only in `x2`.
# Only the first element of each argument is used.
myfunction <- function(x1, x2) {
  words_first <- strsplit(x1, " ")[[1]]
  words_second <- strsplit(x2, " ")[[1]]
  only_first <- words_first[is.na(match(words_first, words_second))]
  only_second <- words_second[is.na(match(words_second, words_first))]
  c(only_first, only_second)
}

# Apply myfunction to each row's pair of strings; returns one vector per row.
map2(.x = df$X1, .y = df$X2, .f = myfunction)

Comparing strings and extracting differences at the phrase-level in R

Allow me to handle the process of making the course list unique while I'm at it, since it makes the solution cleaner.

Load stringr to take out any unnecessary whitespace, and then split the string of courses into a vector of courses...

library(stringr)
# Tidy the whitespace of each course string, then cut it at the commas so
# every row holds a character vector of courses.
df$pre <- str_split(str_squish(df$pre_drop_courses), pattern = ",")
df$post <- str_split(str_squish(df$post_drop_courses), pattern = ",")

Use Map to get the unique values of courses for pre and post

# Apply unique() to every row's course vector to drop repeated courses.
df$pre <- Map(unique, df$pre)
df$post <- Map(unique, df$post)

Use Map to find the set differences. One direction gives the adds, the other gives the drops.

# dropped: courses present in pre but not in post (row by row).
# added:   courses present in post but not in pre (row by row).
df$dropped <- Map(setdiff, df$pre, df$post)
df$added <- Map(setdiff, df$post, df$pre)

# Print the data frame, now including the pre, post, dropped and added columns.
df
#>    TECH_ID  YRTR               pre_drop_courses             post_drop_courses
#> 1 00000108 20173  BUSN 2100,BUSN 2400,ACCT 2254 BUSN 1102,BUSN 1102,BUSN 1102
#> 2 00000108 20173 BUSN 1102,BUSN 1102,BUSN 1102  BUSN 2100,BUSN 2400,ACCT 2254
#> 3 00000270 20183  BIOL 2041,BIOL 2041,BIOL 2041 BIOL 2042,BIOL 2042,BIOL 2042
#>   unchanged                             pre                            post
#> 1      TRUE BUSN 2100, BUSN 2400, ACCT 2254                       BUSN 1102
#> 2     FALSE                       BUSN 1102 BUSN 2100, BUSN 2400, ACCT 2254
#> 3      TRUE                       BIOL 2041                       BIOL 2042
#>                           dropped                           added
#> 1 BUSN 2100, BUSN 2400, ACCT 2254                       BUSN 1102
#> 2                       BUSN 1102 BUSN 2100, BUSN 2400, ACCT 2254
#> 3                       BIOL 2041                       BIOL 2042

Your original sample data

# Sample data: comma-separated course lists before and after the drop.
# Note the trailing space in the second `pre_drop_courses` value.
df <- structure(
  list(
    TECH_ID = c("00000108", "00000108", "00000270"),
    YRTR = c("20173", "20173", "20183"),
    pre_drop_courses = c(
      "BUSN 2100,BUSN 2400,ACCT 2254",
      "BUSN 1102,BUSN 1102,BUSN 1102 ",
      "BIOL 2041,BIOL 2041,BIOL 2041"
    ),
    post_drop_courses = c(
      "BUSN 1102,BUSN 1102,BUSN 1102",
      "BUSN 2100,BUSN 2400,ACCT 2254",
      "BIOL 2042,BIOL 2042,BIOL 2042"
    ),
    unchanged = c(TRUE, FALSE, TRUE)
  ),
  row.names = c(NA, 3L),
  class = "data.frame"
)