R, Conditionally Remove Duplicate Rows

Conditionally removing duplicates

Using data.table framework: Transform your set to data.table

# Attach data.table. library() stops with an error if the package is not
# installed, whereas require() would only return FALSE and let the setDT()
# call below fail with a less helpful message.
library(data.table)
# Convert `data` to a data.table by reference (no copy is made).
setDT(data)

Build the list of IDs for which rows can be deleted (IDs that have at least one row with a non-zero size):

# Distinct SampleIDs that own at least one row with a non-zero size.
dropable_ids <- data[size != 0, unique(SampleID)]

Finally, keep the rows that are either not in the droppable list or have a non-zero value:

# Remove only the zero-size rows whose SampleID also has a non-zero row;
# every other row is kept.
data <- data[!(size == 0 & SampleID %in% dropable_ids)]

Please note that not(a and b) is equivalent to (not a) or (not b); the negated "and" form is used here because, according to this answer, the data.table framework doesn't handle "or" well.

Hope it helps

remove duplicate row based on conditional matching in another column

I think the following solution will help you:

library(dplyr)

# For each (county, mid) pair: keep the row when it is the only one in its
# group; when the pair occurs more than once, keep only the rows whose kpi
# is "B". n() is evaluated per group, so no temporary helper column is
# needed (and an existing column called `duplicate` is left untouched).
# The result is still grouped by county and mid.
df %>%
  group_by(county, mid) %>%
  filter(n() == 1 | kpi == "B")

# A tibble: 71 x 3
# Groups:   county, mid [71]
   county mid   kpi  
   <chr>  <chr> <chr>
 1 Athens 1.1   A    
 2 Athens 1.2   A    
 3 Athens 1.3   A    
 4 Athens 1.4   A    
 5 Athens 1.5   A    
 6 Athens 1.6   A    
 7 Athens 2.1.1 A    
 8 Athens 2.1.2 A    
 9 Athens 2.1.3 A    
10 Athens 2.1.4 A    
# ... with 61 more rows

In R, conditionally remove duplicate rows within ID, Date, and Event

One option is to use unique

# Drop every row of d that exactly repeats an earlier row (all columns are
# compared); the first occurrence is the one that is kept.
unique(d)
#>   ID event       date
#> 1  a   G12 2011-01-01
#> 3  a   O99 2011-12-23
#> 5  b    B4 2011-01-01
#> 6  b    B2 2011-07-12

Using data.table

library(data.table)

# Build a data.table from d.
dt <- data.table(d)

# Lay out event and date per ID (ID becomes the leading column), then drop
# rows that repeat an earlier ID/event/date combination.
unique(dt[, list(event, date), by = "ID"])

Created on 2021-11-23 by the reprex package (v2.0.1)

R, Remove duplicate rows conditional on value of variable

I think any of these should work. If you provide copy/pasteable sample data, I'll test and make sure.

# arrange, group_by and slice: keep the highest-ERRaw row per student/district.
# Sorting on ERRaw explicitly (instead of calling top_n(1), which ranks by the
# last column of the data when no `wt` is supplied) guarantees the intended
# column decides which row wins. slice(1) returns exactly one row per group
# even when scores are tied, matching the base R alternative.
new_data <- data_with_dups %>%
  arrange(desc(ERRaw)) %>%
  group_by(StudentID, District) %>%
  slice(1) %>%
  ungroup()

# Base R alternative: order the rows from highest to lowest ERRaw, then keep
# only the first row encountered for each StudentID/District pair.
new_data <- data_with_dups[order(data_with_dups$ERRaw, decreasing = TRUE), ]
new_data <- new_data[!duplicated(new_data[, c("StudentID", "District")]), ]

Match and Remove Rows Based on Condition R

We can slice the rows by checking the highest value in 'Income' grouped by 'ID'

library(dplyr)
# For every ID keep the single row holding that ID's largest Income.
# which.max() returns the position of the first maximum, so a tied maximum
# resolves to the earliest row of the group. The result stays grouped by ID.
df1 %>%
  group_by(ID) %>%
  slice(which.max(Income))

Or using data.table

library(data.table)
# setDT() turns df1 into a data.table by reference; then, per ID, .SD (the
# rows of the current group) is subset to the row with the largest Income.
setDT(df1)[, .SD[which.max(Income)], by = ID]

Or with base R

# ave() broadcasts each ID's maximum Income back onto its rows; keeping the
# rows where Income equals that value selects the per-ID maximum (every tied
# row is retained).
df1[df1$Income == ave(df1$Income, df1$ID, FUN = max), ]
#     ID Income
#1   1  98765
#4   2   5498
#5   5     23
#6   6     98
#8   7  67871
#9   9 983754
#13 10   4744
#14 11   6853

data

# Example input: 14 rows of integer ID / Income pairs, several IDs repeated.
df1 <- data.frame(
  ID = c(1L, 2L, 2L, 2L, 5L, 6L, 7L, 7L, 9L, 10L, 10L, 10L, 10L, 11L),
  Income = c(
    98765L, 3456L, 67L, 5498L, 23L, 98L, 5645L,
    67871L, 983754L, 982L, 2374L, 875L, 4744L, 6853L
  )
)

R, conditionally remove duplicated row - based on closest to a particular value

You need to think about the case where two rows have the same absolute age difference. If that happens, which one do you want to keep? The solution here does not consider that.

library(data.table)
library(dplyr)

# Example data: one row per measurement; P0003 has three measurements
dat <- fread("ID SEX HEIGHT AGE_MONTHS
             P0001 1 120.5 87
             P0002 0 129.6 84
             P0003 1 150.8 103
             P0003 1 139.5 99
             P0003 1 126.2 97
             P0004 0 168.4 101
             P0005 0 117.1 82")

# Age (in months) that the retained measurement should be closest to
target_age <- 96

# Sort each ID's rows by their absolute distance from the target age, then
# keep the first (closest) row of every ID. Sorting on the expression
# directly avoids creating and dropping a helper column.
dat2 <- dat %>%
  arrange(ID, abs(AGE_MONTHS - target_age)) %>%
  group_by(ID) %>%
  slice(1)

Update

If you want to specify a different target age for each ID, you can first create a data frame storing that information, then use left_join to merge it into the data.

# Lookup table giving the target age for each ID.
# tibble() replaces the deprecated data_frame() constructor.
taget_age_df <- tibble(
  ID = c("P0001", "P0002", "P0003", "P0004", "P0005"),
  Target_Age = c(86, 88, 96, 100, 82)
)

# Attach each ID's own target age, sort every ID's rows by their absolute
# distance from that target, keep the closest row per ID, and finally drop
# the joined lookup column so the result has the same columns as `dat`.
dat2 <- dat %>%
  left_join(taget_age_df, by = "ID") %>%
  arrange(ID, abs(AGE_MONTHS - Target_Age)) %>%
  group_by(ID) %>%
  slice(1) %>%
  select(-Target_Age)

Remove duplicate rows conditionally within group_by in dplyr

After grouping by 'id' and 'date', build the logical vector that marks where 'code' is 'a', and keep the rows that are either not duplicated in that vector or whose 'code' is not 'a'.

# Within each id/date group, always keep the rows whose code is not "a";
# among the "a" rows, keep only the first occurrence.
x %>%
  group_by(id, date) %>%
  filter(code != "a" | !duplicated(code == "a"))
# A tibble: 5 x 3
# Groups:   id, date [3]
#     id date       code 
#  <dbl> <date>     <fct>
#1     1 2016-04-24 a    
#2     1 2016-04-24 b    
#3     1 2016-04-24 b    
#4     2 2016-04-24 a    
#5     3 2016-04-28 a