Quantiles by Factor Levels in R

Quantiles by factor levels in R

I think your issue is that you don't really want to aggregate; instead, use ave (or data.table or plyr):

# Base R: ave() applies FUN within each level of strata and returns a vector
# as long as var1, so qq lines up row for row with dat.
# NOTE(review): qfun is defined outside this snippet — presumably it maps a
# numeric vector to per-value quantile labels; confirm.
qdat <- transform(dat, qq = ave(var1, strata, FUN = qfun))

# plyr alternative: split dat by strata, add qq inside each piece, recombine.
library(plyr)

qdat <- ddply(dat, "strata", mutate, qq = qfun(var1))

# data.table alternative (my preference): add qq by reference, grouped by
# strata, without copying dat.
library(data.table)

# `:=` is only defined inside a data.table's `[`; setDT() converts a plain
# data.frame in place and leaves an existing data.table unchanged.
setDT(dat)
dat[, qq := qfun(var1), by = strata]

Aggregate usually implies returning an object that is smaller than the original. (In this case you were getting a data.frame where x was a list with one element for each stratum.)

Subset dataframe based on levels of a factor and create new variable of quantiles conditional on variable within subset

I updated the code to use case_when to make it more intuitive. You should be able to see each of the cases where the quant is classified and the corresponding values. I then use tidyr separate to make it into 2 columns.

library(dplyr)
library(tidyr)
# Reproducible toy data: 20 species recorded in 1998 and 16 in 1999, each with
# a Poisson-distributed count.
set.seed(567)
year <- factor(rep(c("1998", "1999"), times = c(20, 16)))
lepsp <- c(letters[1:20], "a", "b", "c", letters[8:20])
freq <- rpois(n = 36, lambda = 12)
df <- data.frame(year = year, lepsp = lepsp, freq = freq)

# Rank species within each year by abundance: rank 1 is the highest count and
# tied counts share a rank with no gaps. The result stays grouped by year.
df <- df %>%
  group_by(year) %>%
  mutate(rank = dense_rank(desc(freq)))

# Rebuild df with two extra columns, quant and abucat, typed in by hand with
# one value per row (36 each).
# NOTE(review): these look like the target classification that the quantile
# pipeline further down is meant to reproduce — verify against the question.
df<-data.frame(df, quant= c(75,50,25,50,50,25,75,50,25,75,75,100,50,100,100,50,25,25,75,25,75,50,50,75,75,25,25,50,50,50,25,75,75,25,75,50), 
               abucat= c("c", "r", "r","r","r", "r","c","r","r", "c", "c", "c", "r","c", "c","r" , "r", "r", "c", "r", "c","r","r","c","c","r",
                         "r","r","r","r","r","c","c","r","c","r"))

# Classify every row by where its freq falls among the quartiles of its year.
# quantile() returns the 0/25/50/75/100% cut points; they are stored once per
# group in a list column so each row can read them after rowwise().
df %>%
  group_by(year) %>%
  mutate(qtile = list(quantile(freq))) %>%
  rowwise() %>%
  mutate(
    # case_when() uses the first branch that matches, so the upper bound alone
    # identifies each bin. The label packs "quant,abucat" for splitting below.
    q = case_when(
      freq <= qtile[[2]] ~ "25,r",  # at or below the 25% cut point
      freq <= qtile[[3]] ~ "50,r",  # above 25%, at or below the median
      freq <= qtile[[4]] ~ "75,c",  # above the median, at or below 75%
      freq > qtile[[4]] ~ "100,c"   # above the 75% cut point
    )
  ) %>%
  separate(q, into = c("quant", "abucat")) %>%
  select(-qtile)
#  Source: local data frame [36 x 6]
#  Groups: <by row>
#  
#  # A tibble: 36 x 6
#     year  lepsp  freq  rank quant abucat
#     <fct> <fct> <int> <int> <chr> <chr> 
#   1 1998  a        14     3 75    c     
#   2 1998  b        13     4 50    r     
#   3 1998  c         9     7 25    r     
#   4 1998  d        12     5 50    r     
#   5 1998  e        12     5 50    r     
#   6 1998  f         9     7 25    r     
#   7 1998  g        15     2 75    c     
#   8 1998  h        12     5 50    r     
#   9 1998  i        10     6 25    r     
#  10 1998  j        15     2 75    c     
#  # ... with 26 more rows

Create a factor variable using the quantiles

This seems to work

# Quartile cut points of x; the repeated 5 makes two of them coincide.
x <- c(2, 5, 5, 8, 10)
qnt <- quantile(x, probs = seq(0, 1, by = 0.25))

# cut() rejects duplicated breaks, so collapse them first. include.lowest
# closes the first interval so the minimum value is not turned into NA.
cut(x, breaks = unique(qnt), include.lowest = TRUE)
# [1] [2,5]  [2,5]  [2,5]  (5,8]  (8,10]
# Levels: [2,5] (5,8] (8,10]

Alternative answer. If you still want four bins, even when your data do not justify it, there is a way!

# 101 draws from only three distinct values, so quartile cut points collide.
set.seed(1024)
x <- sample(1:3, size = 101, replace = TRUE)

# Break ties at random so every observation gets a distinct rank, then
# integer-divide the ranks by a fixed width; for these 101 values that yields
# bin labels 0 to 3.
binx <- rank(x, ties.method = "random") %/% (ceiling(length(x) / 4) + 1)

And here you can see the effects.

# Smallest and largest x inside each bin.
binx_ranges <- by(x, INDICES = binx, FUN = range)
# binx: 0
# [1] 1 1
# ------------------------------------------------------------ 
# binx: 1
# [1] 1 2
# ------------------------------------------------------------ 
# binx: 2
# [1] 2 3
# ------------------------------------------------------------ 
# binx: 3
# [1] 3 3

# Cross-tabulate bin label against value: rows are bins, columns are the
# distinct values of x, so a value spread over two rows was split by the
# random tie-breaking.
table(binx,x)
#     x
# binx  1  2  3
#    0 26  0  0
#    1  8 19  0
#    2  0 13 14
#    3  0  0 21

In R, Quantile based on flag/ factor value

This is a great time to use the ave function:

# Per-group 75th and 90th percentiles of X, repeated on every row of the
# group; ave() keeps the result aligned with the rows of dat.
dat$top_q <- with(dat, ave(X, A, FUN = function(v) quantile(v, probs = 0.75)))
dat$top_d <- with(dat, ave(X, A, FUN = function(v) quantile(v, probs = 0.90)))

   A          X    top_q     top_d
1  a  1.7150650 1.346828 1.5677700
2  b  0.4609162 0.390532 0.4308438
3  a -1.2650612 1.346828 1.5677700
4  b -0.6868529 0.390532 0.4308438
5  b -0.4456620 0.390532 0.4308438
6  a  1.2240818 1.346828 1.5677700
7  b  0.3598138 0.390532 0.4308438
8  b  0.4007715 0.390532 0.4308438
9  b  0.1106827 0.390532 0.4308438
10 a -0.5558411 1.346828 1.5677700

data

# Example data: a two-level grouping column A and a standard-normal X.
set.seed(123)
dat <- data.frame(
  A = sample(c("a", "b"), size = 10, replace = TRUE),
  X = rnorm(n = 10)
)

   A          X
1  a  1.7150650
2  b  0.4609162
3  a -1.2650612
4  b -0.6868529
5  b -0.4456620
6  a  1.2240818
7  b  0.3598138
8  b  0.4007715
9  b  0.1106827
10 a -0.5558411

Counting number of values greater than a percentile value across multiple factor levels

with dplyr you can do this:

library(dplyr)
# For each factor level, report the 75th percentile of values and how many
# observations lie strictly above it. n_sup reuses the Percentile_75 column
# created earlier in the same summarise() call.
df %>%
  group_by(factor) %>%
  summarise(
    Percentile_75 = quantile(values, probs = 0.75),
    n_sup = sum(values > Percentile_75)
  )

# # A tibble: 2 x 3
#       factor Percentile_75 n_sup
#        <chr>         <dbl> <int>
#   1 Factor.A           8.5     4
#   2 Factor.B           8.5     4

Quantiles by Factor Levels in R