Create a Ranking Variable with Dplyr

Create a ranking variable with dplyr?

It sounds like you're looking for dense_rank() from "dplyr", applied in descending order (the reverse of how rank() normally orders values) so that the highest score receives rank 1.

Try this:

# Highest score gets rank 1; tied scores share a rank and no rank is skipped.
mutate(df, rank = dense_rank(desc(score)))
# name score rank
# 1 A 10 1
# 2 B 10 1
# 3 C 9 2
# 4 D 8 3

Rank variable by group (dplyr)

The following produces the desired result as was specified.

library(dplyr)

# Order rows by sepal length inside each species, then number them per species.
# row_number() breaks ties by order of appearance, like ties.method = "first".
by_species <- iris %>%
  arrange(Species, Sepal.Length) %>%
  group_by(Species) %>%
  mutate(rank = row_number(Sepal.Length))

# Keep the three lowest-ranked rows of every species.
filter(by_species, rank <= 3)
##Source: local data frame [9 x 6]
##Groups: Species [3]
##
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species rank
## (dbl) (dbl) (dbl) (dbl) (fctr) (int)
##1 4.3 3.0 1.1 0.1 setosa 1
##2 4.4 2.9 1.4 0.2 setosa 2
##3 4.4 3.0 1.3 0.2 setosa 3
##4 4.9 2.4 3.3 1.0 versicolor 1
##5 5.0 2.0 3.5 1.0 versicolor 2
##6 5.0 2.3 3.3 1.0 versicolor 3
##7 4.9 2.5 4.5 1.7 virginica 1
##8 5.6 2.8 4.9 2.0 virginica 2
##9 5.7 2.5 5.0 2.0 virginica 3

# Same rows by position: the data is already sorted within each species.
slice_head(by_species, n = 3)
##Source: local data frame [9 x 6]
##Groups: Species [3]
##
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species rank
## (dbl) (dbl) (dbl) (dbl) (fctr) (int)
##1 4.3 3.0 1.1 0.1 setosa 1
##2 4.4 2.9 1.4 0.2 setosa 2
##3 4.4 3.0 1.3 0.2 setosa 3
##4 4.9 2.4 3.3 1.0 versicolor 1
##5 5.0 2.0 3.5 1.0 versicolor 2
##6 5.0 2.3 3.3 1.0 versicolor 3
##7 4.9 2.5 4.5 1.7 virginica 1
##8 5.6 2.8 4.9 2.0 virginica 2
##9 5.7 2.5 5.0 2.0 virginica 3

R: how to create a ranking variable for each subject excluding NA values

Here is one option. After grouping by 'Subject', replace the non-NA elements of 'FixationDuration' with their row_number() ordered by the 'Time' values of those non-NA rows; rows where 'FixationDuration' is NA stay NA.

library(dplyr)
# Number the non-missing fixations of each subject in Time order; rows whose
# FixationDuration is NA keep NA.
df1 %>%
  group_by(Subject) %>%
  mutate(
    OrdinalFixationNumber = {
      has_fixation <- !is.na(FixationDuration)
      replace(FixationDuration, has_fixation, row_number(Time[has_fixation]))
    }
  )
# A tibble: 13 x 4
# Groups: Subject [3]
# Subject FixationDuration Time OrdinalFixationNumber
# <int> <dbl> <dbl> <dbl>
# 1 1 NA 1 NA
# 2 1 0.33 2 1
# 3 1 NA 3 NA
# 4 1 0.15 4.22 2
# 5 1 3.2 5.93 3
# 6 2 6.88 1 1
# 7 2 9.23 3 2
# 8 2 0.77 3.01 3
# 9 2 1.88 4.91 4
#10 15 6.22 1 1
#11 15 NA 1.56 NA
#12 15 NA 1.76 NA
#13 15 0.24 2.39 2

In data.table, this can be done with

library(data.table)
# Rank the non-missing fixations of each subject by Time, assigning by
# reference; rows with NA FixationDuration are skipped and therefore stay NA.
# frank(..., ties.method = "first") returns every row's rank. The previous
# seq_len(.N)[order(Time)] returned the sort permutation instead, which equals
# the rank only when Time is already sorted within the subject.
setDT(df1)[!is.na(FixationDuration),
           OrdinalFixationNumber := frank(Time, ties.method = "first"),
           by = Subject]
df1
# Subject FixationDuration Time OrdinalFixationNumber
# 1: 1 NA 1.00 NA
# 2: 1 0.33 2.00 1
# 3: 1 NA 3.00 NA
# 4: 1 0.15 4.22 2
# 5: 1 3.20 5.93 3
# 6: 2 6.88 1.00 1
# 7: 2 9.23 3.00 2
# 8: 2 0.77 3.01 3
# 9: 2 1.88 4.91 4
#10: 15 6.22 1.00 1
#11: 15 NA 1.56 NA
#12: 15 NA 1.76 NA
#13: 15 0.24 2.39 2

data

# Example data: 13 fixation records for subjects 1, 2 and 15.
df1 <- data.frame(
  Subject = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 15L, 15L, 15L, 15L),
  FixationDuration = c(
    NA, 0.33, NA, 0.15, 3.2, 6.88, 9.23, 0.77, 1.88, 6.22, NA, NA, 0.24
  ),
  Time = c(1, 2, 3, 4.22, 5.93, 1, 3, 3.01, 4.91, 1, 1.56, 1.76, 2.39)
)

Create descending ranks for a set of columns using dplyr

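We need to apply desc to every selected column. The original answer did this by wrapping desc inside funs(); funs() is deprecated in current dplyr, where across() (or a lambda) plays the same role.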

# Sort by Murder, Assault, UrbanPop and Rape, each in descending order.
# across() replaces the superseded arrange_at() and the deprecated funs().
out1 <- USArrests %>%
  tibble::rownames_to_column() %>%
  arrange(across(Murder:Rape, desc))

Checking with applying desc on each column

# Reference result: name every sort column explicitly, all descending.
out2 <- tibble::rownames_to_column(USArrests) %>%
  arrange(desc(Murder), desc(Assault), desc(UrbanPop), desc(Rape))
identical(out1, out2)
#[1] TRUE

Based on the above, we can make changes in the rank_f

# Replace each of Murder..Rape with its min_rank(), in place.
# across() is the current replacement for the superseded mutate_at().
out3 <- out2 %>%
  mutate(across(Murder:Rape, min_rank))
# Sort `ds` by the columns in `cols` (each wrapped in desc), then apply the
# functions in `fs` to those same columns.
#   ds   - data frame to process.
#   cols - column selections, spliced with !!! into vars(); the call in this
#          file passes quos(...).
#   fs   - functions, spliced with !!! into funs(); the call in this file
#          passes quos(min_rank).
# NOTE(review): funs() is deprecated in current dplyr; confirm against the
# installed version before reusing this helper.
rank_f <- function(ds, cols, fs){
ds %>%
arrange_at(vars(!!!cols), funs(desc))%>%
mutate_at(vars(!!!cols), funs(!!!fs))
}
# Run the same data through rank_f() and compare with the manual result.
out4 <- rank_f(
  tibble::rownames_to_column(USArrests),
  cols = quos((Murder:Rape)),
  fs = quos(min_rank)
)

identical(out3, out4)
#[1] TRUE

Update

Based on the comments from OP, we don't need to do any arrange, we can directly apply min_rank by converting the column values to negative

# Descending min_rank for Murder..Rape: negating the values makes the largest
# value rank 1, so no prior arrange() is needed.
# The across() lambda replaces the deprecated funs(min_rank(-.)).
USArrests %>%
  tibble::rownames_to_column() %>%
  mutate(across(Murder:Rape, ~ min_rank(-.x)))

Get rank for every column using dplyr

We may use across to loop over the numeric columns, compute the rank of each, and name the new columns by adding a suffix via .names

library(dplyr)
# Add a "<column>_rank" column holding rank() of every numeric column.
out <- mtcars %>%
  mutate(
    across(
      where(is.numeric),
      function(v) rank(v),
      .names = "{.col}_rank"
    )
  )

-output

> head(out, 2)
mpg cyl disp hp drat wt qsec vs am gear carb mpg_rank cyl_rank disp_rank hp_rank drat_rank wt_rank qsec_rank vs_rank
Mazda RX4 21 6 160 110 3.9 2.620 16.46 0 1 4 4 19.5 15 13.5 13 21.5 9 6.0 9.5
Mazda RX4 Wag 21 6 160 110 3.9 2.875 17.02 0 1 4 4 19.5 15 13.5 13 21.5 12 10.5 9.5
am_rank gear_rank carb_rank
Mazda RX4 26 21.5 25.5
Mazda RX4 Wag 26 21.5 25.5

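By default, tied values receive the average of the ranks they span (ties.method = "average"), which is why fractional ranks such as 19.5 appear in the output above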

rank(x, na.last = TRUE,
ties.method = c("average", "first", "last", "random", "max", "min"))

So, it may be better to specify ties.method or may use dense_rank

# Same as before, but with dense_rank(): ties share a rank and the ranks stay
# consecutive integers.
out <- mtcars %>%
  mutate(
    across(
      where(is.numeric),
      function(v) dense_rank(v),
      .names = "{.col}_rank"
    )
  )

-output

> head(out, 2)
mpg cyl disp hp drat wt qsec vs am gear carb mpg_rank cyl_rank disp_rank hp_rank drat_rank wt_rank qsec_rank vs_rank
Mazda RX4 21 6 160 110 3.9 2.620 16.46 0 1 4 4 16 2 13 11 16 9 6 1
Mazda RX4 Wag 21 6 160 110 3.9 2.875 17.02 0 1 4 4 16 2 13 11 16 12 10 1
am_rank gear_rank carb_rank
Mazda RX4 2 2 4
Mazda RX4 Wag 2 2 4

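Regarding the OP's function: it uses df as the input dataset, but df is not an argument of the function, and when no such object exists R finds the function df() (the F-distribution density from the stats package) instead. Also, rank = names every new column "rank" instead of deriving the name from the column being ranked. The function could be modified as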

cols <- colnames(mtcars)

# Return a one-column data frame named "<col>_rank" that holds rank() of the
# column `col` of `data`.
#   data - data frame containing the column.
#   col  - name of the column to rank, as a single string.
get_rank <- function(data, col) {
  rank_name <- stringr::str_c(col, "_rank")
  transmute(data, !!rank_name := rank(.data[[col]]))
}

# One rank column per variable, appended to the original columns.
bind_cols(mtcars, lapply(cols, get_rank, data = mtcars))

How to rank numeric data by rows in a dataframe in r?

We may use pmap to loop over the rows (which is usually faster than rowwise) and apply dense_rank to the negated values, so that the largest value in each row gets rank 1

library(purrr)
library(dplyr)
# Rank the values of every row in descending order, keeping the column names.
df %>%
  pmap_dfr(function(...) {
    row_values <- c(...)
    setNames(dense_rank(-row_values), names(row_values))
  })

-output

# A tibble: 5 x 3
a b c
<int> <int> <int>
1 1 2 NA
2 1 2 NA
3 2 1 NA
4 1 NA NA
5 1 2 1

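Or a faster option may be using dapply from collapse. Note that, unlike the pmap version above, this ranks in ascending order (the smallest value in each row gets rank 1) because the values are not negated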

library(collapse)
library(data.table)
# Apply frank() across each row (MARGIN = 1); ties share a dense rank and
# NA values stay NA.
dapply(
  df,
  FUN = frank,
  MARGIN = 1,
  na.last = "keep",
  ties.method = "dense"
)
a b c
1 2 1 NA
2 2 1 NA
3 1 2 NA
4 1 NA NA
5 2 1 2

Apply a rank across groups

You could try

library(dplyr)

# Step 1: give every row of a group that group's Value from its latest YEAR.
# Step 2: rank the groups on that value, largest first, without gaps.
data %>%
  group_by(Grp) %>%
  mutate(Rank = Value[which.max(YEAR)]) %>%
  ungroup() %>%
  mutate(Rank = dense_rank(desc(Rank)))

# YEAR Grp Value Rank
# 1 2020 A 25 3
# 2 2019 A 24 3
# 3 2020 B 35 2
# 4 2019 B 34 2
# 5 2020 C 45 1
# 6 2019 C 44 1


Related Topics



Leave a reply



Submit