Quantiles by Factor Levels in R

Quantiles by factor levels in R

I think your issue is that you don't really want to aggregate; what you want is ave (or data.table or plyr).

# Base R: ave() applies qfun to var1 within each level of strata and returns a
# vector aligned with the original rows, so the result can be added as column qq.
# NOTE(review): dat and qfun are not defined in this snippet -- confirm upstream.
qdat <- transform(dat, qq = ave(var1, strata, FUN = qfun))

# Using plyr: split dat by strata and add qq = qfun(var1) within each piece.
library(plyr)

qdat <- ddply(dat, .(strata), mutate, qq = qfun(var1))

# Using data.table (my preference): := adds column qq, computed per strata group.
# NOTE(review): assumes data.table is loaded and dat is already a data.table;
# neither is shown in this snippet -- confirm.

dat[, qq := qfun(var1), by = strata]

Aggregate usually implies returning an object that is smaller than the original. (In this case you were getting a data.frame where x was a list of 1 element for each strata.)

Subset dataframe based on levels of a factor and create new variable of quantiles conditional on variable within subset

I updated the code to use case_when to make it more intuitive. You should be able to see each of the cases where the quant is classified and the corresponding values. I then use tidyr separate to make it into 2 columns.

library(dplyr)
library(tidyr)
# Reproducible toy data: 20 species (a-t) recorded for 1998 and 16 for 1999,
# each with a Poisson(12) frequency count.
set.seed(567)
year <- factor(rep(c("1998", "1999"), times = c(20, 16)))
lepsp <- c(letters[1:20], "a", "b", "c", letters[8:20])
freq <- rpois(n = 36, lambda = 12)
df <- data.frame(year, lepsp, freq)

# Rank species within each year by descending frequency. dense_rank() gives
# tied frequencies the same rank and leaves no gaps between ranks.
# ungroup() drops the year grouping afterwards so it cannot silently affect
# later operations on df.
df <- df %>%
  group_by(year) %>%
  mutate(rank = dense_rank(-freq)) %>%
  ungroup()

# Attach two hand-entered columns, 36 values each (one per row of df):
# quant (25/50/75/100) and abucat ("r" alongside 25/50, "c" alongside 75/100).
# NOTE(review): presumably the expected classification taken from the question;
# the pipeline that follows derives columns of the same names from freq -- confirm.
df<-data.frame(df, quant= c(75,50,25,50,50,25,75,50,25,75,75,100,50,100,100,50,25,25,75,25,75,50,50,75,75,25,25,50,50,50,25,75,75,25,75,50),
abucat= c("c", "r", "r","r","r", "r","c","r","r", "c", "c", "c", "r","c", "c","r" , "r", "r", "c", "r", "c","r","r","c","c","r",
"r","r","r","r","r","c","c","r","c","r"))

# Classify every row's freq against the quartiles of freq within its year,
# then split the "<quant>,<abucat>" label into two columns.
# The result is printed, not assigned.
df %>%
group_by(year) %>%
# quantile() with default probs returns the 0/25/50/75/100% values; wrapping it
# in list() stores that whole vector in a list-column for each row of the year.
mutate(qtile = list(quantile(freq))) %>%
# Evaluate the next mutate one row at a time so qtile[2], qtile[3], qtile[4]
# index that row's quantile vector (25%, 50%, 75%).
# NOTE(review): relies on rowwise handling of list-columns -- confirm for the
# dplyr version in use.
rowwise() %>%
# Intervals are right-closed: <= 25% | (25%, 50%] | (50%, 75%] | > 75%.
mutate(q = case_when(freq <= qtile[2] ~ "25,r",
freq > qtile[2] & freq <=qtile[3] ~"50,r",
freq > qtile[3] & freq <=qtile[4] ~"75,c",
freq > qtile[4] ~ "100,c")) %>%
# separate() splits q at the comma into quant and abucat.
separate(q, c("quant","abucat")) %>%
# Drop the helper list-column.
select(-qtile)
# Source: local data frame [36 x 6]
# Groups: <by row>
#
# # A tibble: 36 x 6
# year lepsp freq rank quant abucat
# <fct> <fct> <int> <int> <chr> <chr>
# 1 1998 a 14 3 75 c
# 2 1998 b 13 4 50 r
# 3 1998 c 9 7 25 r
# 4 1998 d 12 5 50 r
# 5 1998 e 12 5 50 r
# 6 1998 f 9 7 25 r
# 7 1998 g 15 2 75 c
# 8 1998 h 12 5 50 r
# 9 1998 i 10 6 25 r
# 10 1998 j 15 2 75 c
# # ... with 26 more rows

Create a factor variable using the quantiles

This seems to work

# Quartile break points can repeat when the data have ties (here 25% and 50%
# are both 5); unique() removes the duplicates so cut() accepts the breaks.
x <- c(2, 5, 5, 8, 10)
qnt <- quantile(x, probs = seq(0, 1, by = 0.25))

cut(x, breaks = unique(qnt), include.lowest = TRUE)
# [1] [2,5] [2,5] [2,5] (5,8] (8,10]
# Levels: [2,5] (5,8] (8,10]

Alternative answer. If you still want four bins, even when your data do not justify it, there is a way!

# Force four bins even though x has only three distinct values: ties are
# broken at random, then the ranks are integer-divided by a bin width slightly
# larger than a quarter of the sample size.
set.seed(1024)
x <- sample(seq_len(3), size = 101, replace = TRUE)

binx <- rank(x, ties.method = "random") %/% (1 + ceiling(length(x) / 4))

And here you can see the effects.

# Range (min, max) of x within each bin of binx.
binx_ranges <- by(x,binx,range)
# binx: 0
# [1] 1 1
# ------------------------------------------------------------
# binx: 1
# [1] 1 2
# ------------------------------------------------------------
# binx: 2
# [1] 2 3
# ------------------------------------------------------------
# binx: 3
# [1] 3 3

# Cross-tabulate bin membership (rows) against the values of x (columns).
table(binx,x)
# x
# binx 1 2 3
# 0 26 0 0
# 1 8 19 0
# 2 0 13 14
# 3 0 0 21

In R, Quantile based on flag/ factor value

This is a great time to use the ave function:

# For every row, store the 75th (top_q) and 90th (top_d) percentile of X
# computed within that row's level of A; ave() repeats the per-group value
# across all rows of the group.
dat$top_q <- with(dat, ave(X, A, FUN = function(v) quantile(v, probs = 0.75)))
dat$top_d <- with(dat, ave(X, A, FUN = function(v) quantile(v, probs = 0.9)))

A X top_q top_d
1 a 1.7150650 1.346828 1.5677700
2 b 0.4609162 0.390532 0.4308438
3 a -1.2650612 1.346828 1.5677700
4 b -0.6868529 0.390532 0.4308438
5 b -0.4456620 0.390532 0.4308438
6 a 1.2240818 1.346828 1.5677700
7 b 0.3598138 0.390532 0.4308438
8 b 0.4007715 0.390532 0.4308438
9 b 0.1106827 0.390532 0.4308438
10 a -0.5558411 1.346828 1.5677700

data

# Example data: a two-level grouping column A ("a"/"b") drawn with replacement
# and a standard-normal X, seeded so the draw is reproducible.
set.seed(123)
dat <- data.frame(
  A = sample(letters[1:2], size = 10, replace = TRUE),
  X = rnorm(10)
)

A X
1 a 1.7150650
2 b 0.4609162
3 a -1.2650612
4 b -0.6868529
5 b -0.4456620
6 a 1.2240818
7 b 0.3598138
8 b 0.4007715
9 b 0.1106827
10 a -0.5558411

Counting number of values greater than a percentile value across multiple factor levels

with dplyr you can do this:

library(dplyr)
# Per level of factor: the 75th percentile of values, and how many values lie
# strictly above it. n_sup refers to the Percentile_75 column created just
# before it in the same summarize() call.
df %>%
  group_by(factor) %>%
  summarize(
    Percentile_75 = quantile(values, probs = 0.75),
    n_sup = sum(values > Percentile_75)
  )

# # A tibble: 2 x 3
# factor Percentile_75 n_sup
# <chr> <dbl> <int>
# 1 Factor.A 8.5 4
# 2 Factor.B 8.5 4


Related Topics



Leave a reply



Submit