Find Duplicate Values in R

Find duplicate values in R

You could use table(), i.e.

n_occur <- data.frame(table(vocabulary$id))

gives you a data frame with a list of ids and the number of times they occurred.

n_occur[n_occur$Freq > 1,]

tells you which ids occurred more than once.

vocabulary[vocabulary$id %in% n_occur$Var1[n_occur$Freq > 1],]

returns the records with more than one occurrence.
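
To make this concrete, here is a small self-contained run of the same three steps; the vocabulary data itself is made up for illustration:

# Hypothetical vocabulary data, for illustration only
vocabulary <- data.frame(id   = c(1, 2, 2, 3, 3, 3),
                         word = c("apple", "pear", "plum", "fig", "kiwi", "date"))

n_occur <- data.frame(table(vocabulary$id))           # one row per id with its Freq
n_occur[n_occur$Freq > 1, ]                           # ids 2 and 3 occur more than once
vocabulary[vocabulary$id %in% n_occur$Var1[n_occur$Freq > 1], ]  # their records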

Find duplicated elements with dplyr

I guess you could use filter for this purpose:

mtcars %>%
  group_by(carb) %>%
  filter(n() > 1)

Small example (note that I added summarize() to show that the filtered data no longer contains the 'carb' values that occur only once, i.e. 6 and 8. I used 'carb' instead of 'cyl' because 'carb' has values that occur exactly once, whereas every 'cyl' value occurs more than once):

mtcars %>% group_by(carb) %>% summarize(n = n())
#Source: local data frame [6 x 2]
#
#  carb  n
#1    1  7
#2    2 10
#3    3  3
#4    4 10
#5    6  1
#6    8  1

mtcars %>% group_by(carb) %>% filter(n() > 1) %>% summarize(n = n())
#Source: local data frame [4 x 2]
#
#  carb  n
#1    1  7
#2    2 10
#3    3  3
#4    4 10
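
For what it's worth, the same rows can be kept without an explicit group_by() by counting first and then filtering on the count; a sketch using dplyr's add_count() (the helper column n is dropped again at the end):

library(dplyr)

# Attach the per-carb count, keep rows whose carb value occurs more than
# once, then drop the helper column.
mtcars %>%
  add_count(carb) %>%
  filter(n > 1) %>%
  select(-n)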

How to find duplicate records within the same time period in R

Not sure what your expected output is. Here is one way, which gives a unique ID to each group of "duplicates".

library(dplyr)

df %>%
  tidyr::unite(DateTime, Date, Time, sep = " ") %>%
  mutate(DateTime = lubridate::ymd_hms(DateTime)) %>%
  group_by(Observer, FocalID) %>%
  mutate(grp = floor(difftime(DateTime, first(DateTime), units = 'hour'))) %>%
  group_by(grp, .add = TRUE) %>%
  mutate(ID = cur_group_id()) %>%
  ungroup() %>%
  select(-grp)

# A tibble: 4 x 5
#      N DateTime            Observer FocalID    ID
#  <int> <dttm>              <chr>    <chr>   <int>
#1     1 2018-05-20 07:05:00 VR       JK          2
#2     2 2018-05-20 07:50:00 VR       JK          2
#3     3 2018-05-21 07:50:00 JD       CJD         1
#4     4 2018-05-21 08:25:00 JD       CJD         1

All rows that share the same ID can be considered part of one group.
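
If you only want the records that actually have a duplicate within the same Observer/FocalID/time window, one possible follow-up is to count the rows per ID and keep the groups with more than one; a sketch using the df from the data section below (the helper column name n_in_group is made up here):

library(dplyr)

# Rebuild the result from the pipeline above, then keep only IDs that are
# shared by more than one row, i.e. the duplicated records.
df %>%
  tidyr::unite(DateTime, Date, Time, sep = " ") %>%
  mutate(DateTime = lubridate::ymd_hms(DateTime)) %>%
  group_by(Observer, FocalID) %>%
  mutate(grp = floor(difftime(DateTime, first(DateTime), units = 'hour'))) %>%
  group_by(grp, .add = TRUE) %>%
  mutate(ID = cur_group_id()) %>%
  ungroup() %>%
  select(-grp) %>%
  add_count(ID, name = "n_in_group") %>%   # n_in_group is a made-up helper name
  filter(n_in_group > 1)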

data

df <- structure(list(N = 1:4, Date = c(20180520L, 20180520L, 20180521L, 
20180521L), Time = c("07:05:00", "07:50:00", "07:50:00", "08:25:00"
), Observer = c("VR", "VR", "JD", "JD"), FocalID = c("JK", "JK",
"CJD", "CJD")), class = "data.frame", row.names = c(NA, -4L))

How to get a map of repeated values in R?

In base R you could do something like:

t(do.call(cbind, tapply(df$position, df$names, function(x) if (length(x) > 1) combn(x, 2))))
     [,1] [,2]
[1,]    1    5
[2,]    1    7
[3,]    5    7
[4,]    2    4
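
The data frame df is not shown in the answer; a hypothetical input with names and position columns that reproduces the matrix above would be:

# Hypothetical data: names "a" and "b" are repeated, "c" is not
df <- data.frame(names    = c("a", "a", "a", "b", "b", "c"),
                 position = c(1, 5, 7, 2, 4, 6))

# For every name occurring more than once, list all pairs of its positions
t(do.call(cbind, tapply(df$position, df$names, function(x) if (length(x) > 1) combn(x, 2))))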

Unique case of finding duplicate values flexibly across columns in R

tidyverse

df <- data.frame(animal_1 = c("cat", "dog", "mouse", "squirrel"),
                 predation_type = c("eats", "eats", "eaten by", "eats"),
                 animal_2 = c("mouse", "squirrel", "cat", "nuts"))
library(tidyverse)

df %>%
  rowwise() %>%
  mutate(duplicates = str_c(sort(c_across(c(1, 3))), collapse = "")) %>%
  group_by(duplicates) %>%
  mutate(duplicates = n() > 1) %>%
  ungroup()
#> # A tibble: 4 x 4
#>   animal_1 predation_type animal_2 duplicates
#>   <chr>    <chr>          <chr>    <lgl>
#> 1 cat      eats           mouse    TRUE
#> 2 dog      eats           squirrel FALSE
#> 3 mouse    eaten by       cat      TRUE
#> 4 squirrel eats           nuts     FALSE

Created on 2022-01-17 by the reprex package (v2.0.1)

removing duplicates


library(tidyverse)
df %>%
  filter(!duplicated(map2(animal_1, animal_2, ~str_c(sort(c(.x, .y)), collapse = ""))))
#>   animal_1 predation_type animal_2
#> 1      cat           eats    mouse
#> 2      dog           eats squirrel
#> 3 squirrel           eats     nuts

Created on 2022-01-17 by the reprex package (v2.0.1)
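
The trick in both snippets is the same: sorting the two animal names before pasting them together produces an order-independent key, so "cat"/"mouse" and "mouse"/"cat" collide. A base R sketch of the same idea, using the df defined above:

# Build an order-independent key by sorting each animal pair before pasting
key <- mapply(function(a, b) paste(sort(c(a, b)), collapse = " "),
              df$animal_1, df$animal_2)

df$duplicates <- ave(seq_along(key), key, FUN = length) > 1  # flag repeated pairs
df[!duplicated(key), ]                                       # keep one row per pair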

R - Finding duplicates in list entries

You can unlist first:

unlisted <- unlist(examplelist)
unlisted[duplicated(unlisted)]
#    b1      c1      c2
# "red" "black" "green"

unlisted[!duplicated(unlisted)]
#     a1    a2       a3      b2      b3      c3
# "blue" "red" "yellow" "black" "green" "brown"

If you only want the vector (without the names), use unname:

unlisted <- unname(unlist(examplelist))
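
data

examplelist is not shown in the question; a list consistent with the names and values printed above would be:

# Reconstructed input, inferred from the printed output
examplelist <- list(a = c("blue", "red", "yellow"),
                    b = c("red", "black", "green"),
                    c = c("black", "green", "brown"))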

If/Then statement in R based on duplicate values

library(tidyverse)
df %>%
  rowid_to_column('rn') %>%
  left_join(pivot_longer(., -c(EntryName, rn)) %>%
              group_by(rn, EntryName) %>%
              count(value) %>%
              filter(n >= 5))

  rn EntryName Team1 Team2 Team3 Team4 Team5 Team6 Team7 Team8 value  n
1  1         a   MIN    SF    SF    SF   ATL   TOR    SF    SF    SF  5
2  2         b   MIN    SF    SF    SF    SF    SF   DET   MIA    SF  5
3  3         c   MIN   CWS    SF   MIA   ATL   MIA   TOR    SF  <NA> NA
4  4         d    SF    SF    SF    SF   TOR   TOR    SF   MIN    SF  5
5  5         e   MIN   TOR   ATL   ATL    SF   CIN   DET    TB  <NA> NA

Look at the value column (second to last): it holds the team that appears in at least five of the Team columns for that row, and is NA when no team repeats that often.
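
If the goal is an explicit if/then result rather than the raw join columns, a final mutate() can turn them into a flag; a sketch (the column name five_or_more is made up for illustration):

library(tidyverse)

# df is the same EntryName/Team1..Team8 data frame used in the answer above
df %>%
  rowid_to_column('rn') %>%
  left_join(pivot_longer(., -c(EntryName, rn)) %>%
              group_by(rn, EntryName) %>%
              count(value) %>%
              filter(n >= 5)) %>%
  mutate(five_or_more = !is.na(n))   # TRUE when one team fills at least five slots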

How to find duplicate dates within a row in R, and then replace associated values with the mean?

This involves changing the structure of the table to long format, averaging the duplicates, and then reshaping back into the desired wide table:

library(tidyr)
library(dplyr)
df.1 <- data.frame(ID, Gest1, Sys1, Dia1, Gest2, Sys2, Dia2, Gest3, Sys3, Dia3, Gest4, Sys4, Dia4)

# convert data to long format
longdf <- df.1 %>%
  pivot_longer(!ID, names_to = c(".value", "time"),
               names_pattern = "(\\D+)(\\d)", values_to = "count")

# average duplicate rows
temp <- longdf %>%
  group_by(ID, Gest) %>%
  summarize(Sys = mean(Sys), Dia = mean(Dia)) %>%
  mutate(time = row_number())

# convert back to wide format
answer <- temp %>%
  pivot_wider(ID, names_from = time, values_from = c("Gest", "Sys", "Dia"),
              names_glue = "{.value}{time}")
# resort the columns
answer <- answer[, names(df.1)]
answer
# A tibble: 3 × 13
# Groups:   ID [3]
     ID Gest1  Sys1  Dia1 Gest2  Sys2  Dia2 Gest3  Sys3  Dia3 Gest4  Sys4  Dia4
  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1    27    27   120    90    29   122    89    32   123    90    33   124    94
2    46    28  126.  83.5    29   122    88    30   123    89    NA    NA    NA
3    72    29   124    92    30   119  84.5    32   128    80    NA    NA    NA
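
The step that makes this work is names_to = c(".value", "time") together with names_pattern = "(\\D+)(\\d)": the first capture group ("Gest", "Sys", "Dia") becomes an output column via .value and the second (the visit number) becomes time. A tiny made-up example shows the reshaping on its own:

library(dplyr)
library(tidyr)

# Hypothetical two-visit data, just to illustrate how names_pattern splits
# column names like "Gest1" into a measure ("Gest") and a time index ("1")
toy <- data.frame(ID = 1:2,
                  Gest1 = c(27, 28), Sys1 = c(120, 126),
                  Gest2 = c(29, 29), Sys2 = c(122, 122))

toy %>%
  pivot_longer(!ID, names_to = c(".value", "time"),
               names_pattern = "(\\D+)(\\d)")
# one row per ID and time, with columns ID, time, Gest, Sys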

