Dplyr - Filter by Group Size

dplyr - filter by group size

Here's another dplyr approach you can try:

semi_join(dat, count(dat, cat) %>% filter(n == 5), by = "cat")
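To see what that does on a small, self-contained example (the toy data below is made up purely for illustration):

library(dplyr)

# Toy data: categories "a" and "b" have 5 rows each, "c" has only 2
toy <- data.frame(
  x   = rnorm(12),
  cat = rep(c("a", "b", "c"), times = c(5, 5, 2))
)

# count() tallies rows per cat, filter(n == 5) keeps the full-sized groups,
# and semi_join() returns the rows of toy whose cat appears in that result
semi_join(toy, count(toy, cat) %>% filter(n == 5), by = "cat")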

--

Here's another approach, based on the OP's original code with a little modification:

library(dplyr)

n <- 1e5
x <- rnorm(n)
# Category sizes range from 1 to 5
cat <- rep(seq_len(n/3), sample(1:5, n/3, replace = TRUE))[1:n]

dat <- data.frame(x = x, cat = cat)

# Second copy of the data, so each solution gets its own input
dat2 <- data.frame(x = x, cat = cat)

sol_floo0 <- function(dat){
  dat <- group_by(dat, cat)
  # Row-to-group index (relies on the rows being ordered by cat)
  all_ind <- rep(seq_len(n_groups(dat)), group_size(dat))
  # Indices of the groups that have exactly 5 rows
  take_only <- which(group_size(dat) == 5L)
  dat[all_ind %in% take_only, ]
}

sol_floo0_v2 <- function(dat){
  # Logical index: TRUE for every row that belongs to a group of size 5
  g <- group_by(dat, cat) %>% group_size()
  ind <- rep(g == 5, g)
  dat[ind, ]
}



microbenchmark::microbenchmark(times = 10,
  sol_floo0(dat),
  sol_floo0_v2(dat2))
#Unit: milliseconds
# expr min lq mean median uq max neval cld
# sol_floo0(dat) 43.72903 44.89957 45.71121 45.10773 46.59019 48.64595 10 b
# sol_floo0_v2(dat2) 29.83724 30.56719 32.92777 31.97169 34.10451 38.31037 10 a
all.equal(sol_floo0(dat), sol_floo0_v2(dat2))
#[1] TRUE
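
For reference, the same rows can also be selected with a plain grouped filter; this wasn't part of the benchmark above, just a sketch of the most direct phrasing:

dat %>%
  group_by(cat) %>%
  filter(n() == 5) %>%   # n() is the size of the current group
  ungroup()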

dplyr: How to filter groups by subgroup criteria

You could add group_by and filter to the code:

library(dplyr)
library(tidyr)

# OP's code
d1 <- dadmom %>%
  gather(key, value, named:incm) %>%
  separate(key, c("variable", "type"), -2) %>%
  spread(variable, value, convert = TRUE)

d1 %>%
  group_by(famid) %>%
  filter(all(sum(type == 'm' & inc > 15000) == sum(type == 'm')))

# famid type inc name
# 1 2 d 22000 Art
# 2 2 m 18000 Amy
# 3 3 d 25000 Paul
# 4 3 m 50000 Pat

NOTE: The condition compares the number of 'm' rows with inc > 15000 to the total number of 'm' rows in each famid, so the above will also work when there are multiple 'm's per famid (a bit more general).

For the normal case of a single 'd'/'m' pair per famid:

d1 %>%
  group_by(famid) %>%
  filter(any(inc > 15000 & type == 'm'))
# famid type inc name
#1 2 d 22000 Art
#2 2 m 18000 Amy
#3 3 d 25000 Paul
#4 3 m 50000 Pat

Also, if you wish to use data.table, melt from the devel version (v1.9.5) can take multiple value columns; the devel version can be installed from the data.table GitHub repository.

library(data.table)
melt(setDT(dadmom), measure.vars = list(c(2, 4), c(3, 5)),
     variable.name = 'type', value.name = c('name', 'inc'))[,
  type := c('d', 'm')[type]][, .SD[any(type == 'm' & inc > 15000)], famid]
# famid type name inc
#1: 2 d Art 22000
#2: 2 m Amy 18000
#3: 3 d Paul 25000
#4: 3 m Pat 50000

Filter by subgroup criteria (specify the occurrence of a value per group) using dplyr

We could group by 'group', count the occurrences of item == 1 in each group, and filter for groups with at least n occurrences.

library(dplyr)
n <- 1

df %>%
  group_by(group) %>%
  filter(sum(item == 1) >= n)

Or, using the same logic with base R ave (the per-group sum of item == 1 is recycled to the full length of the data):

df[with(df, ave(item == 1, group, FUN = sum) >= n), ]

And, for completeness, one with data.table:

library(data.table)
setDT(df)[, if(sum(item == 1) >= n) .SD, by = group]

conditional filtering based on grouped data in R using dplyr

Here's another method that selects directly using math rather than %in% (an equivalent %in% version is sketched after the output):

data %>% filter(col * sign((group < 3) - 0.5) > 0)
#> # A tibble: 76 x 3
#> group year col
#> <int> <int> <dbl>
#> 1 2 1985 2.20
#> 2 3 1986 -0.205
#> 3 4 1991 -2.10
#> 4 3 1994 -0.113
#> 5 2 1997 1.90
#> 6 1 2000 1.37
#> 7 3 2002 -0.805
#> 8 4 2003 -0.535
#> 9 1 2004 0.792
#> 10 3 2006 -1.28
#> # ... with 66 more rows
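
To unpack the math: (group < 3) - 0.5 equals +0.5 for groups 1 and 2 and -0.5 otherwise, so sign() yields +1 or -1, and the filter keeps positive col values for groups 1-2 and negative ones for groups 3 and up. A more verbose but equivalent %in% version (assuming groups run from 1 to 4 as in the output above) would be:

library(dplyr)

# Same selection spelled out with %in% instead of the sign() trick
data %>%
  filter((group %in% c(1, 2) & col > 0) |
         (!group %in% c(1, 2) & col < 0))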

Filter either or with dplyr

One dplyr option could be:

dat1 %>%
  group_by(team) %>%
  filter(if (any(name == "LT")) name == "LT" else name == "CH")

time team name
<dbl> <fct> <fct>
1 1 A LT
2 2 A LT
3 1 B CH
4 2 B CH
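
The question's dat1 isn't shown here; a minimal made-up version that reproduces the printed result (team A has 'LT' rows, team B only 'CH' and something else) could look like this:

library(dplyr)

# Hypothetical reconstruction of dat1, only to make the example runnable
dat1 <- tibble(
  time = c(1, 2, 1, 1, 2, 2),
  team = factor(c("A", "A", "A", "B", "B", "B")),
  name = factor(c("LT", "LT", "CH", "CH", "CH", "SG"))
)

dat1 %>%
  group_by(team) %>%
  filter(if (any(name == "LT")) name == "LT" else name == "CH")

Within each team, the condition keeps the 'LT' rows when at least one exists and falls back to the 'CH' rows otherwise.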

group data by year and filter for all year

In order for others to answer or help, it would be very helpful if you could provide sample data. Based on your description, I can only guess what your data looks like. You can try this:

library(dplyr)

# simulate a data set
set.seed(1234)
tb <- tibble(year = sample(2006:2018, 50, replace = TRUE),
             trade = runif(50, 50000, 1000000)) %>%
  mutate(log_trade = log(trade))

# compute the 25% quantile of log_trade per year
tb2 <- tb %>%
  group_by(year) %>%
  summarise(q25 = quantile(log_trade, 0.25))

# join back and filter
tb %>%
  left_join(tb2, by = "year") %>%
  filter(log_trade > q25)
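
As a side note, the join-back step can be avoided by evaluating the quantile inside a grouped filter; a one-step sketch that should give the same rows:

# Equivalent one-step version: quantile() is evaluated per year inside filter()
tb %>%
  group_by(year) %>%
  filter(log_trade > quantile(log_trade, 0.25)) %>%
  ungroup()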

Group by, take count and filter out entries corresponding to count greater than 1

We can group by 'date' and 'number', then use filter to keep only those groups where the number of rows (n()) is equal to 1.

library(dplyr)
data %>%
  group_by(date, number) %>%
  filter(n() == 1)
# date number value
# <chr> <int> <int>
#1 2016-05-07 3 9
#2 2016-05-08 4 10
#3 2016-05-09 5 11

Just to provide some alternatives, first with data.table:

library(data.table)
setDT(data)[, if(.N == 1) .SD , .(date, number)]

Or with base R

data[with(data, ave(number, number, date, FUN = length) ==1),]

group by and filter data management using dplyr

Try:

d %>%
  group_by(c) %>%
  filter(any(b == 1))

Which gives:

#Source: local data frame [6 x 3]
#Groups: c
#
# a b c
#1 1 1 1
#2 2 2 1
#3 3 2 1
#4 4 1 2
#5 5 2 2
#6 6 2 2

R dplyr: how to remove smaller groups?

You can use n() to get the number of rows per group and filter on it. Take a look at ?n(); the last example there shows exactly this use of n(): filtering based on the size of groups.

df %>% group_by(group) %>% filter(n() >= 3)

# Source: local data frame [6 x 3]
# Groups: group [2]

# ID group value
# <int> <int> <int>
# 1 3 2 0
# 2 4 2 5
# 3 5 2 3
# 4 8 4 3
# 5 9 4 7
# 6 10 4 5
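
A closely related option, if you would rather see the group sizes as a column first, is add_count() (available in newer dplyr versions); a quick sketch with the same df:

library(dplyr)

# add_count() appends a column n holding the size of each group,
# which can be filtered on and then dropped again
df %>%
  add_count(group) %>%
  filter(n >= 3) %>%
  select(-n)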

