How to Find the Largest N Elements in a List in R

How to find the largest N elements in a list in R?

# Positions of the N largest values of R, largest first. seq_len(N) keeps the
# result empty when N is 0, whereas 1:N would count down to c(1, 0).
order(R, decreasing = TRUE)[seq_len(N)]

Find the top n largest values from a dataframe (or matrix) in r

Use unlist() to flatten the data frame into a single vector, sort it, and take the largest values from the end. For the top 2 values, for example:

# Flatten every column of df into one unnamed vector, sort it in ascending
# order, and keep the last two entries, i.e. the two largest values.
tail(sort(unlist(df, use.names = FALSE)), 2)
#[1] 9.581705 9.591726

If the data is a matrix (here obtained with as.matrix()), unlist() is not required:

# Sort all values of the matrix in ascending order and keep the last two,
# i.e. the two largest.
tail(sort(as.matrix(df)), 2)

data

# Reproducible sample data: three columns of 10 uniform random draws each,
# generated in the order a, b, c so the random stream is consumed identically.
set.seed(1233)
df <- data.frame(
  a = runif(10, min = 0, max = 10),
  b = runif(10, min = 1, max = 10),
  c = runif(10, min = 0, max = 12)
)

How can I get top n values with its index in R?

We can use sort with index.return=TRUE to return the value with the index in a list. Then we can subset the list based on the first 3 unique elements in the 'x'.

# Sort the distances in decreasing order. index.return=TRUE makes sort()
# return a list holding the sorted values ($x) and their original positions ($ix).
lst <- sort(df1$distance, index.return=TRUE, decreasing=TRUE)
# Subset both $x and $ix with the same mask: entries whose value is one of the
# first three distinct values of the sorted vector (so ties are all kept).
lapply(lst, `[`, lst$x %in% head(unique(lst$x),3))
#$x
#[1] 5 5 4 4 3

#$ix
#[1] 6 7 2 5 4

R - fastest way to get the indices of the max n elements in a vector

I want to add a hybrid approach using sort's partial argument and which:

# Indices of the n largest elements of x, found with a partial sort.
#
# x: a vector that sort() can partially sort (e.g. numeric or integer).
# n: how many of the largest elements are wanted (default 30).
#
# Returns integer positions in x in increasing index order (not ranked by
# value). Elements equal to the cut-off value are excluded, so fewer than n
# positions come back when ties straddle the cut-off.
whichpart <- function(x, n = 30) {
  nx <- length(x)
  # Nothing requested.
  if (n <= 0) {
    return(integer(0))
  }
  # sort(partial = p) needs p >= 1, so p = nx - n cannot be used once n covers
  # the whole vector; in that case every (non-missing) position qualifies.
  if (n >= nx) {
    return(which(!is.na(x)))
  }
  p <- nx - n
  # Only position p is guaranteed to be in its sorted place: the cut-off value.
  xp <- sort(x, partial = p)[p]
  which(x > xp)
}

Some benchmarking:

library("microbenchmark")
library("data.table")
library("compiler")

set.seed(123)
x <- rnorm(1e6)
y <- sample.int(1e6)


# Hybrid approach: partial sort to find the cut-off value, then which().
#
# x: a vector that sort() can partially sort (e.g. numeric or integer).
# n: number of largest elements wanted (default 30).
#
# Returns the integer positions of the elements strictly greater than the
# (length(x) - n)-th smallest value, in increasing index order. Values tied
# with that cut-off are not returned.
whichpart <- function(x, n = 30) {
  nx <- length(x)
  if (n <= 0) {
    return(integer(0))
  }
  # A partial index of 0 or less is rejected by sort(); when n spans the whole
  # vector, all non-missing positions are the answer.
  if (n >= nx) {
    return(which(!is.na(x)))
  }
  p <- nx - n
  xp <- sort(x, partial = p)[p]
  which(x > xp)
}

cpwhichpart <- cmpfun(whichpart)

# using quicksort
# Indices of the n largest elements of x via a full quicksort.
#
# x: numeric vector.
# n: number of indices to return (default 30).
#
# Returns the original positions of the sorted values, largest first.
# seq_len(n) is used instead of 1:n so that n = 0 yields an empty result
# rather than the first index.
quicksort <- function(x, n = 30) {
  sorted <- sort(x, method = "quick", decreasing = TRUE, index.return = TRUE)
  sorted$ix[seq_len(n)]
}

cpquicksort <- cmpfun(quicksort)

# @Mariam
# Indices of the elements of x that are at least as large as its n-th largest
# value, found by fully sorting x in decreasing order.
#
# x: vector to search.
# n: rank of the cut-off value (default 30). The original hard-coded 30 here,
#    silently ignoring the argument.
#
# Returns positions in increasing index order; ties with the cut-off are
# included, so more than n positions can be returned.
whichsort <- function(x, n = 30) {
  cutoff <- sort(x, decreasing = TRUE)[n]
  which(x >= cutoff, arr.ind = TRUE)
}

cpwhichsort <- cmpfun(whichsort)

# @Ferdinand.kraft
# Indices of the n largest elements of x by repeated which.max().
#
# x: numeric vector (modified only as a local copy).
# n: number of indices to return (default 30).
#
# Returns a numeric vector of positions, largest value first.
top <- function(x, n = 30) {
  # Preallocate instead of growing the result one element at a time.
  result <- numeric(n)
  # seq_len(n) makes n = 0 a no-op; 1:n would iterate over c(1, 0).
  for (i in seq_len(n)) {
    j <- which.max(x)
    result[i] <- j
    # Knock the current maximum out so the next pass finds the runner-up.
    x[j] <- -Inf
  }
  result
}

cptop <- cmpfun(top)

# @Tony Breyal
# Indices of the n largest elements of x using a keyed data.table.
#
# x: vector to search.
# n: number of indices to return (default 30).
#
# Returns the original positions of the n largest values, largest first.
dtable <- function(x, n = 30) {
  # seq_along() rather than seq.int(x): for a length-1 x, seq.int(x) would
  # expand to 1:x instead of the single index 1.
  dt <- data.table(x = x, x.index = seq_along(x))
  # setkey() sorts ascending, so the largest values end up in the last rows.
  setkey(dt, "x")
  rev(tail(dt$x.index, n))
}

cpdtable <- cmpfun(dtable)

# @Roland
# Byte-compiled single pass over x that keeps the n largest values seen so far.
#
# x: numeric vector.
# n: number of values to keep (default 30).
#
# Returns the n largest values of x (values, not indices) in ascending order.
roland <- cmpfun(function(x, n=30) {
  # Running buffer of the best n values, always kept sorted ascending so that
  # its first slot holds the smallest retained value.
  best <- rep(-Inf, n)
  for (idx in seq_along(x)) {
    if (!(x[idx] > best[1])) {
      next
    }
    # Evict the current smallest and restore ascending order.
    best[1] <- x[idx]
    best <- sort(best)
  }
  best
})

## rnorm
microbenchmark(whichpart(x), cpwhichpart(x),
quicksort(x), cpquicksort(x),
whichsort(x), cpwhichsort(x),
top(x), cptop(x),
dtable(x), cpdtable(x),
roland(x), times=10)

# Unit: milliseconds
# expr min lq median uq max neval
# whichpart(x) 45.63544 46.05638 47.09077 49.68452 51.42065 10
# cpwhichpart(x) 45.65996 45.77212 47.02808 48.07482 82.20458 10
# quicksort(x) 100.90936 103.00783 105.17506 109.31784 139.83518 10
# cpquicksort(x) 100.53958 102.78017 107.64470 138.96630 142.52882 10
# whichsort(x) 148.86010 151.04350 155.80871 159.47063 184.56697 10
# cpwhichsort(x) 149.05578 150.21183 151.36918 166.58342 173.87567 10
# top(x) 146.10757 182.42089 184.53050 191.37293 193.62272 10
# cptop(x) 155.14354 179.14847 184.52323 196.80644 220.21222 10
# dtable(x) 1041.32457 1042.54904 1049.26096 1065.40606 1080.89969 10
# cpdtable(x) 1042.08247 1043.54915 1051.76366 1084.14360 1310.26485 10
# roland(x) 251.42885 261.47608 273.20838 295.09733 323.96257 10

## integer
microbenchmark(whichpart(y), cpwhichpart(y),
quicksort(y), cpquicksort(y),
whichsort(y), cpwhichsort(y),
top(y), cptop(y),
dtable(y), cpdtable(y),
roland(y), times=10)

# Unit: milliseconds
# expr min lq median uq max neval
# whichpart(y) 11.60703 11.76857 12.03704 12.52871 47.88526 10
# cpwhichpart(y) 11.62885 11.75006 12.53724 13.88563 46.93677 10
# quicksort(y) 88.14924 89.47630 92.42414 103.53439 137.44335 10
# cpquicksort(y) 88.11544 89.15334 92.63420 94.42244 133.78006 10
# whichsort(y) 122.34675 123.13634 124.91990 127.79134 131.43400 10
# cpwhichsort(y) 121.85618 122.91653 125.45211 127.14112 158.61535 10
# top(y) 163.06669 181.19004 211.11557 224.19237 239.63139 10
# cptop(y) 163.37903 173.55113 209.46770 218.59685 226.81545 10
# dtable(y) 499.50807 505.45513 514.55338 537.84129 604.86454 10
# cpdtable(y) 491.70016 498.62664 525.05342 527.14666 580.19429 10
# roland(y) 235.44664 237.52200 242.87925 268.34080 287.71196 10


identical(sort(quicksort(x)), whichpart(x))
# [1] TRUE

Edit: test @flodel's suggestion

# @flodel
# Indices of the elements of x that are at least as large as its n-th largest
# value, using a partial sort of the negated vector.
#
# x: numeric vector.
# n: rank of the cut-off value (default 30).
#
# Returns positions in increasing index order; ties with the cut-off are kept.
whichpartrev <- function(x, n=30) {
  # The n-th smallest of -x is the negated n-th largest of x.
  negated <- -x
  threshold <- -sort(negated, partial = n)[n]
  which(x >= threshold)
}

microbenchmark(whichpart(x), whichpartrev(x), times=100)

# Unit: milliseconds
# expr min lq median uq max neval
# whichpart(x) 45.44940 46.15011 46.51321 48.67986 80.63286 100
# whichpartrev(x) 28.84482 31.30661 32.87695 62.37843 67.84757 100

microbenchmark(whichpart(y), whichpartrev(y), times=100)

# Unit: milliseconds
# expr min lq median uq max neval
# whichpart(y) 11.56135 12.26539 13.05729 13.75199 43.78484 100
# whichpartrev(y) 16.00612 16.73690 17.71687 19.04153 49.02842 100

N Largest elements of the list

Here's another approach:

% n_largest(List, Count, Largest)
% Largest is the final Count elements of List after sorting it in ascending
% order with duplicates kept (msort/2).
n_largest(List, Count, Largest) :-
    msort(List, Sorted),
    length(Largest, Count),        % Largest: a list of Count fresh variables
    append(_, Largest, Sorted).    % ...unified with the tail of Sorted

This will count duplicates. So:

| ?- n_largest([1,4,2,6,3,4,6], 3, L).

L = [4,6,6]

yes

If you want to choose unique elements, then you can use sort:

% n_largest(List, Count, Largest)
% Largest is the final Count elements of List after sorting it in ascending
% order with duplicates removed (sort/2), i.e. the Count largest distinct values.
n_largest(List, Count, Largest) :-
    sort(List, Distinct),
    length(Largest, Count),         % Largest: a list of Count fresh variables
    append(_, Largest, Distinct).   % ...unified with the tail of Distinct

| ?- n_largest([1,4,2,6,3,4,6], 3, L).

L = [3,4,6]

yes

Note that these predicates provide the solutions in increasing order, not necessarily in the order that they occur in the original list. That was not stated as a requirement, however.

How to return 5 topmost values from vector in R?

> a <- c(1:100)
> tail(sort(a),5)
[1] 96 97 98 99 100

Make a table showing the 10 largest values of a variable in R?

This should do it...

# Order the rows by Score, highest first.
data <- data[with(data, order(-Score)), ]

# Keep at most the first 10 rows. head() does not pad with NA rows when the
# table has fewer than 10 rows, unlike data[1:10, ].
data <- head(data, 10)

Fastest way to find second (third...) highest/lowest value in vector or column

Rfast has a function called nth_element that does exactly what you ask.

Furthermore, the methods discussed above that are based on partial sorting don't support finding the k smallest values.

Update (28/FEB/21): the kit package offers a faster implementation (topn); see https://stackoverflow.com/a/66367996/4729755 and https://stackoverflow.com/a/53146559/4729755

Disclaimer: An issue appears to occur when dealing with integers, which can be bypassed by using as.numeric (e.g. Rfast::nth(as.numeric(1:10), 2)); it will be addressed in the next update of Rfast.

# 5th largest element of x (TRUE spelled out: T is an ordinary, reassignable variable).
Rfast::nth(x, 5, descending = TRUE)

Will return the 5th largest element of x, while

# 5th smallest element of x (FALSE spelled out: F is an ordinary, reassignable variable).
Rfast::nth(x, 5, descending = FALSE)

Will return the 5th smallest element of x

Benchmarks below against most popular answers.

For 10 thousand numbers:

N = 10000
x = rnorm(N)

# Value of the N-th largest element of x, located with a partial sort.
#
# x: vector that sort() can partially sort.
# N: rank from the top (default 2, i.e. the second largest value).
#
# If N exceeds length(x) a warning is raised and N is clamped to length(x),
# so the smallest element is returned.
maxN <- function(x, N=2){
  n_obs <- length(x)
  if (N > n_obs) {
    warning('N greater than length(x). Setting N=length(x)')
    N <- n_obs
  }
  # In ascending order the N-th largest value sits at this position.
  pos <- n_obs - N + 1
  sort(x, partial = pos)[pos]
}

# Time three ways of getting the 5th largest value of x. The labels match the
# rows of the printed results (Rfast, maxN, order).
microbenchmark::microbenchmark(
  Rfast = Rfast::nth(x, 5, descending = TRUE),
  maxN = maxN(x, 5),
  order = x[order(x, decreasing = TRUE)[5]])

Unit: microseconds
expr min lq mean median uq max neval
Rfast 160.364 179.607 202.8024 194.575 210.1830 351.517 100
maxN 396.419 423.360 559.2707 446.452 487.0775 4949.452 100
order 1288.466 1343.417 1746.7627 1433.221 1500.7865 13768.148 100

For 1 million numbers:

N = 1e6
x = rnorm(N)

# Same comparison on the 1e6-element vector: 5th largest value of x.
microbenchmark::microbenchmark(
  Rfast = Rfast::nth(x, 5, descending = TRUE),
  maxN = maxN(x, 5),
  order = x[order(x, decreasing = TRUE)[5]])

Unit: milliseconds
expr min lq mean median uq max neval
Rfast 89.7722 93.63674 114.9893 104.6325 120.5767 204.8839 100
maxN 150.2822 207.03922 235.3037 241.7604 259.7476 336.7051 100
order 930.8924 968.54785 1005.5487 991.7995 1031.0290 1164.9129 100


Related Topics



Leave a reply



Submit