Identify differences in text paragraphs with R
We concatenate both strings and split them at the whitespace after each "." to create a list of sentences ('lst'). We then take the unique elements obtained by unlisting 'lst' ('un1') and, using setdiff, get for each string the elements of 'un1' that it does not contain.
# Split each text into sentences at the whitespace that follows a full stop;
# the lookbehind keeps the "." attached to the sentence it ends.
lst <- strsplit(c(a = a, b = b), split = "(?<=[.])\\s", perl = TRUE)
# Pool the sentences of both texts, keeping each distinct sentence once.
un1 <- unique(unlist(lst))
# For each text, list the pooled sentences that it does not contain.
lapply(lst, function(sentences) setdiff(un1, sentences))
Comparing two text files using R
That HTML widget package won't give you back output but it's based on a javascript library that was based on a python module.
We'll use the Python version, but we won't use the reticulate package because I'm not about to show how to iterate over Python structures in R; instead we'll take the pointer from the Python page about the script being at Tools/scripts/diff.py
and grab it from github to avoid trying to find it on your system. This does mean python needs to be installed. Python 3 to be precise (since that's a fragile, fragmented ecosystem).
# Download CPython's command-line diff script into a temporary .py file.
tf <- tempfile(fileext = ".py")
# NOTE(review): on.exit() is documented for use inside functions; confirm the
# cleanup actually fires when this is run as top-level script code.
on.exit(unlink(tf), add = TRUE)
# NOTE(review): the URL pins a branch name and path in the CPython repository;
# verify that it still resolves before relying on it.
writeLines(
  text = readLines(con = "https://raw.githubusercontent.com/python/cpython/master/Tools/scripts/diff.py"),
  con = tf
)
Now, we'll find the python3 binary and the pip3 binary on your system:
# Look up the Python 3 interpreter and its package installer on the PATH.
# Each result is a length-one character vector named after the program.
python <- Sys.which(names = "python3")
pip <- Sys.which(names = "pip3")
And make sure a really critical module is installed that should always be installed but python is so daft it isn't:
# Install the "datetime" package with pip in case it is not present.
# NOTE(review): Python's own `datetime` module ships with the standard
# library; confirm this install is actually required.
system2(pip, args = c("install", "datetime"))
Now run the diff on two made up files of mine:
# Run the downloaded diff script on the two files and capture what it prints
# to standard output (stdout = TRUE makes system2() return the output lines).
res <- system2(
  command = python,
  args = c(tf, path.expand("~/Data/so.txt"), path.expand("~/Data/so1.txt")),
  stdout = TRUE
)
And see the output you now need to parse:
res
## [1] "*** /Users/bob/Data/so.txt\t2018-10-15T06:38:07.169832-04:00"
## [2] "--- /Users/bob/Data/so1.txt\t2018-10-18T08:50:51.745551-04:00"
## [3] "***************"
## [4] "*** 6,29 ****"
## [5] " QX = X-ray|NRW"
## [6] " UI = Q000000981"
## [7] " "
## [8] "- *NEWRECORD"
## [9] "- RECTYPE = Q"
## [10] "- SH = analogs & derivatives"
## [11] "- QE = ANALOGS"
## [12] "- QA = AA"
## [13] "- QT = 1"
## [14] "- "
## [15] "- *NEWRECORD"
## [16] "- RECTYPE = Q"
## [17] "- SH = abnormalities"
## [18] "- QE = ABNORM"
## [19] "- QX = agenesis|NRW"
## [20] "- QX = anomalies|EQV"
## [21] "- QX = aplasia|NRW"
## [22] "- QX = atresia|NRW"
## [23] "- QX = birth defects|NRW"
## [24] "- QX = congenital defects|NRW"
## [25] "- QX = defects|NRW"
## [26] "- QX = deformities|NRW"
## [27] "- QX = hypoplasia|NRW"
## [28] "- UI = Q000002"
## [29] "--- 6,8 ----"
Having done all that ^^, you could also just use tools::Rdiff():
# Compare the two files with R's built-in diff; Log = TRUE asks for the
# differences to be returned (status plus output lines) as well as reported.
res <- tools::Rdiff(from = "~/Data/so.txt", to = "~/Data/so1.txt", Log = TRUE)
res
## $status
## [1] 1
##
## $out
## [1] "files differ in number of lines" "9,29d8"
## [3] "< *NEWRECORD" "< RECTYPE = Q"
## [5] "< SH = analogs & derivatives" "< QE = ANALOGS"
## [7] "< QA = AA" "< QT = 1"
## [9] "< " "< *NEWRECORD"
## [11] "< RECTYPE = Q" "< SH = abnormalities"
## [13] "< QE = ABNORM" "< QX = agenesis|NRW"
## [15] "< QX = anomalies|EQV" "< QX = aplasia|NRW"
## [17] "< QX = atresia|NRW" "< QX = birth defects|NRW"
## [19] "< QX = congenital defects|NRW" "< QX = defects|NRW"
## [21] "< QX = deformities|NRW" "< QX = hypoplasia|NRW"
## [23] "< UI = Q000002"
but I wanted to show the twisty path first :-)
How to compare two strings in R?
It's pretty simple actually.
# Two strings that differ only in the case of their first letter.
s1 <- "string"
s2 <- "String"
# Case-sensitive check: `==` compares the strings exactly, so "s" and "S" differ
s1 == s2
# Case-insensitive check: lower-case both sides before comparing
tolower(s1) == tolower(s2)
The output in the first case is FALSE and in the second case it is TRUE. You can use toupper() as well.
how to compare two strings in R
One possibility would be to use str_split from the stringr package and then map2 from the purrr package.
First I create some pseudo data:
# Two parallel character vectors: element i of `x` is compared with element i
# of `y`. The misspellings in `y` are the differences to be detected.
x <- c("sentence number one", "another one is here")
y <- c("setence number two", "aner one are hre")
Then I transform it:
# str_split() lives in stringr and map2() in purrr, so load both up front;
# without stringr the first line below fails with "could not find function".
library(stringr)
library(purrr)

# Break every sentence into its words: one character vector per sentence.
x2 <- str_split(x, " ")
y2 <- str_split(y, " ")

# Walk the two lists in parallel. Within each pair of sentences, compare the
# words position by position: equal words become "", differing words are
# shown as "old:new".
# NOTE(review): `==` recycles when the two sentences have different word
# counts; confirm paired inputs always contain the same number of words.
map2(x2, y2, ~ifelse(.x == .y, "", paste(.x, .y, sep = ":")))
[[1]]
[1] "sentence:setence" "" "one:two"
[[2]]
[1] "another:aner" "" "is:are" "here:hre"
r compare text in two columns by row
I don't understand how you want to organize the output in X3 and X4, but maybe this helps:
# Distinct words used anywhere in each column (all rows pooled together).
words_x1 <- unique(str_split(paste(df$X1, collapse = " "), " ")[[1]])
words_x2 <- unique(str_split(paste(df$X2, collapse = " "), " ")[[1]])
# Words that occur in only one of the two columns: X1-only first, then X2-only.
c(setdiff(words_x1, words_x2), setdiff(words_x2, words_x1))
I think what you want to achieve is something like this (note that I am using a tibble, as it does not seem to work with a data.frame).
# Packages used by this example: tibble() is called below and map2() is used
# on the resulting columns.
library(dplyr)
library(purrr)
# Example data: two text columns that are compared row by row.
df <- tibble(
X1 = c("the fox ate grapes", "the cat ate"),
X2 = c("the fox ate watermelon", "the cat ate backwards")
)
# Return the words that appear in only one of two space-separated strings.
#
# x1, x2: character strings whose words are separated by a single space
#         (only the first element of each is used).
# Returns a character vector: the words of `x1` that are absent from `x2`,
# followed by the words of `x2` that are absent from `x1`. Order of
# appearance is kept and repeated words are not collapsed.
myfunction <- function(x1, x2) {
  split_words <- function(s) strsplit(s, " ")[[1]]
  # Keep the entries of `words` that have no match in `other`.
  missing_from <- function(words, other) {
    words[match(words, other, nomatch = 0L) == 0L]
  }
  left <- split_words(x1)
  right <- split_words(x2)
  c(missing_from(left, right), missing_from(right, left))
}
# Apply myfunction() to each row's pair of strings; the result is a list with
# one character vector of differing words per row.
map2(.x = df$X1, .y = df$X2, .f = myfunction)
Comparing strings and extracting differences at the phrase-level in R
Allow me to handle the process of making the course list unique while I'm at it, since it makes the solution cleaner.
Load stringr to take out any unnecessary whitespace, and then split the string of courses into a vector of courses...
library(stringr)
# Normalise whitespace, then split each row's course string on commas.
# The pattern also consumes any spaces around a comma, so "A, B" and "A,B"
# yield the same course names and compare equal in later set operations.
df$pre <- stringr::str_split(stringr::str_squish(df$pre_drop_courses), "\\s*,\\s*")
df$post <- stringr::str_split(stringr::str_squish(df$post_drop_courses), "\\s*,\\s*")
Use Map to get the unique values of courses for pre and post
# De-duplicate the courses within each row; Map() applies unique() to every
# element of the list column separately.
df$pre <- Map(unique, df$pre)
df$post <- Map(unique, df$post)
Use Map to find the set differences. One direction gives the adds, the other gives the drops
# Row by row: courses present in `pre` but not in `post`.
df$dropped <- Map(setdiff, df$pre, df$post)
# Row by row: courses present in `post` but not in `pre`.
df$added <- Map(setdiff, df$post, df$pre)
df
#> TECH_ID YRTR pre_drop_courses post_drop_courses
#> 1 00000108 20173 BUSN 2100,BUSN 2400,ACCT 2254 BUSN 1102,BUSN 1102,BUSN 1102
#> 2 00000108 20173 BUSN 1102,BUSN 1102,BUSN 1102 BUSN 2100,BUSN 2400,ACCT 2254
#> 3 00000270 20183 BIOL 2041,BIOL 2041,BIOL 2041 BIOL 2042,BIOL 2042,BIOL 2042
#> unchanged pre post
#> 1 TRUE BUSN 2100, BUSN 2400, ACCT 2254 BUSN 1102
#> 2 FALSE BUSN 1102 BUSN 2100, BUSN 2400, ACCT 2254
#> 3 TRUE BIOL 2041 BIOL 2042
#> dropped added
#> 1 BUSN 2100, BUSN 2400, ACCT 2254 BUSN 1102
#> 2 BUSN 1102 BUSN 2100, BUSN 2400, ACCT 2254
#> 3 BIOL 2041 BIOL 2042
Your original sample data
# Sample data: one row per record, with comma-separated course lists before
# and after the change plus an `unchanged` flag. Note that the second
# pre_drop_courses value ends with a trailing space.
df <-
structure(list(TECH_ID = c("00000108", "00000108", "00000270"),
YRTR = c("20173", "20173", "20183"),
pre_drop_courses = c("BUSN 2100,BUSN 2400,ACCT 2254",
"BUSN 1102,BUSN 1102,BUSN 1102 ",
"BIOL 2041,BIOL 2041,BIOL 2041"),
post_drop_courses = c("BUSN 1102,BUSN 1102,BUSN 1102",
"BUSN 2100,BUSN 2400,ACCT 2254",
"BIOL 2042,BIOL 2042,BIOL 2042"
),
unchanged = c(TRUE, FALSE, TRUE)), row.names = c(NA, 3L), class = "data.frame")
Related Topics
Extract Consecutive Pairs of Elements from a Vector and Place in a Matrix
Arrow() in Ggplot2 No Longer Supported
Find and Replace Missing Values with Row Mean
Adding Shade to R Lineplot Denotes Standard Error
Check Whether All Elements of a List Are in Equal in R
Nls Troubles: Missing Value or an Infinity Produced When Evaluating the Model
How to Define the Version of a Package in R Install.Packages
How to Merge Multiple Data.Frames and Sum and Average Columns at the Same Time in R
Fitting Logarithmic Curve in R
Gradient Breaks in a Ggplot Stat_Bin2D Plot
Remove Multiple Patterns from Text Vector R
Rmarkdown::Render() in a Loop - Cannot Allocate Vector of Size
How to Let R Use All the Cores of the Computer
How to Reverse the Order of a Dataframe in R
How to Put a Complicated Equation into a R Formula