How to Get Top N Companies from a Data Frame in Decreasing Order

How to get top n companies from a data frame in decreasing order

head and tail are really useful functions!

head(sort(Forbes2000$profits,decreasing=TRUE), n = 50)

If you want the first 50 rows of the data.frame, then you can use the arrange function from plyr to sort the data.frame and then use head

library(plyr)

head(arrange(Forbes2000,desc(profits)), n = 50)

Notice that I wrapped profits in a call to desc which means it will sort in decreasing order.

To work without plyr

# Base R alternative: order the rows by profits (largest first) and keep the
# first 50. TRUE is spelled out because T is an ordinary, reassignable variable.
head(Forbes2000[order(Forbes2000$profits, decreasing = TRUE), ], n = 50)

Sorting columns and selecting top n rows in each group pandas dataframe

There are 2 solutions:

1.sort_values and aggregate head:

df1 = df.sort_values('score',ascending = False).groupby('pidx').head(2)
print (df1)

mainid pidx pidy score
8 2 x w 12
4 1 a e 8
2 1 c a 7
10 2 y x 6
1 1 a c 5
7 2 z y 5
6 2 y z 3
3 1 c b 2
5 2 x y 1

2.set_index and aggregate nlargest:

df = df.set_index(['mainid','pidy']).groupby('pidx')['score'].nlargest(2).reset_index() 
print (df)
pidx mainid pidy score
0 a 1 e 8
1 a 1 c 5
2 c 1 a 7
3 c 1 b 2
4 x 2 w 12
5 x 2 y 1
6 y 2 x 6
7 y 2 z 3
8 z 2 y 5

Timings:

np.random.seed(123)
N = 1000000

L1 = list('abcdefghijklmnopqrstu')
L2 = list('efghijklmnopqrstuvwxyz')
df = pd.DataFrame({'mainid':np.random.randint(1000, size=N),
'pidx': np.random.randint(10000, size=N),
'pidy': np.random.choice(L2, N),
'score':np.random.randint(1000, size=N)})
#print (df)

def epat(df):
    """Collect, for every 'pidx' group, the first two rows after sorting by 'score'.

    This is the per-group loop implementation that is timed in the benchmark
    further down; the loop-and-concat structure is kept so that comparison
    still describes this function.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'pidx' (grouping key) and 'score' (sort key).

    Returns
    -------
    pandas.DataFrame
        At most two rows per 'pidx' group, with the original index labels and
        the same columns as ``df``.
    """
    grouped = df.groupby('pidx')
    new_df = pd.DataFrame([], columns=df.columns)
    for _, group in grouped:
        # NOTE(review): ascending=True keeps the two LOWEST scores per group,
        # while the other approaches on this page sort descending -- confirm
        # which one is intended before reusing this function.
        first_two = group.sort_values('score', ascending=True)[:2]
        # `axis` must be passed by keyword: it is keyword-only in pandas >= 2.0.
        new_df = pd.concat([new_df, first_two], axis=0)
    return new_df

print (epat(df))

In [133]: %timeit (df.sort_values('score',ascending = False).groupby('pidx').head(2))
1 loop, best of 3: 309 ms per loop

In [134]: %timeit (df.set_index(['mainid','pidy']).groupby('pidx')['score'].nlargest(2).reset_index())
1 loop, best of 3: 7.11 s per loop

In [147]: %timeit (epat(df))
1 loop, best of 3: 22 s per loop

Top n rows for every column in R dataframe

In your for loop you may want to combine the top n rows with the related rid- and column names in a data.frame.

csv <- read.csv("input.csv", row.names=1)

# Print, for every column of csv, the n rows with the largest values
# (largest first) together with the column name (cid) and row name (rid).
n <- 3
# seq_len() gives an empty sequence for a zero-column data frame,
# whereas 1:ncol(csv) would iterate over 1 and 0.
for (k in seq_len(ncol(csv))) {
  # head() stops at nrow(csv), so asking for more rows than exist does not
  # introduce NA indices the way [1:n] does.
  o <- head(order(-csv[, k]), n)
  print(data.frame(
    cid = names(csv)[k],
    rid = rownames(csv)[o],
    v = csv[o, k]
  ))
}
# cid rid v
# 1 cid1 rid2 1.0
# 2 cid1 rid6 0.9
# 3 cid1 rid8 0.6
# cid rid v
# 1 cid2 rid9 0.9
# 2 cid2 rid7 0.8
# 3 cid2 rid3 0.5
# cid rid v
# 1 cid3 rid7 0.9
# 2 cid3 rid4 0.8
# 3 cid3 rid5 0.7

Alternatively you may use lapply, which results in a list.

n <- 3
lapply(seq(csv), function(x)
data.frame(cid=names(csv)[x], rid=rownames(csv), v=csv[, x])[order(-csv[, x]), ][1:n, ])
# [[1]]
# cid rid v
# 2 cid1 rid2 1.0
# 6 cid1 rid6 0.9
# 8 cid1 rid8 0.6
#
# [[2]]
# cid rid v
# 9 cid2 rid9 0.9
# 7 cid2 rid7 0.8
# 3 cid2 rid3 0.5
#
# [[3]]
# cid rid v
# 7 cid3 rid7 0.9
# 4 cid3 rid4 0.8
# 5 cid3 rid5 0.7

Edit

To subset at a threshold instead of the order, do

th <- .5
for (k in 1:ncol(csv)) {
rows <- csv[, k] >= th
print(data.frame(cid=names(csv)[k], rid=rownames(csv)[rows], v=csv[rows, k]))
}
# cid rid v
# 1 cid1 rid2 1.0
# 2 cid1 rid6 0.9
# 3 cid1 rid8 0.6
# cid rid v
# 1 cid2 rid3 0.5
# 2 cid2 rid7 0.8
# 3 cid2 rid8 0.5
# 4 cid2 rid9 0.9
# cid rid v
# 1 cid3 rid2 0.5
# 2 cid3 rid4 0.8
# 3 cid3 rid5 0.7
# 4 cid3 rid7 0.9
# 5 cid3 rid8 0.7

or, using lapply

th <- .5
lapply(seq(csv), function(x) {
ss <- csv[[x]] >= th
data.frame(cid=names(csv)[x], rid=rownames(csv), v=csv[, x])[ss, ]
})
# [[1]]
# cid rid v
# 2 cid1 rid2 1.0
# 6 cid1 rid6 0.9
# 8 cid1 rid8 0.6
#
# [[2]]
# cid rid v
# 3 cid2 rid3 0.5
# 7 cid2 rid7 0.8
# 8 cid2 rid8 0.5
# 9 cid2 rid9 0.9
#
# [[3]]
# cid rid v
# 2 cid3 rid2 0.5
# 4 cid3 rid4 0.8
# 5 cid3 rid5 0.7
# 7 cid3 rid7 0.9
# 8 cid3 rid8 0.7

Edit 2

And here comes the ordered version.

th <- .5
lapply(seq(csv), function(x) {
xo <- csv[order(-csv[, x]), x, F]
o <- xo[xo >= th,,F]
cbind(cid=colnames(o), rid=rownames(o), v=unname(o))
})
# [[1]]
# cid rid v
# rid2 cid1 rid2 1.0
# rid6 cid1 rid6 0.9
# rid8 cid1 rid8 0.6
#
# [[2]]
# cid rid v
# rid9 cid2 rid9 0.9
# rid7 cid2 rid7 0.8
# rid3 cid2 rid3 0.5
# rid8 cid2 rid8 0.5
#
# [[3]]
# cid rid v
# rid7 cid3 rid7 0.9
# rid4 cid3 rid4 0.8
# rid5 cid3 rid5 0.7
# rid8 cid3 rid8 0.7
# rid2 cid3 rid2 0.5

or

for (x in 1:ncol(csv)) {
xo <- csv[order(-csv[, x]), x, F]
o <- xo[xo >= th,,F]
print(cbind(cid=colnames(o), rid=rownames(o), v=unname(o)))
}
# cid rid v
# rid2 cid1 rid2 1.0
# rid6 cid1 rid6 0.9
# rid8 cid1 rid8 0.6
# cid rid v
# rid9 cid2 rid9 0.9
# rid7 cid2 rid7 0.8
# rid3 cid2 rid3 0.5
# rid8 cid2 rid8 0.5
# cid rid v
# rid7 cid3 rid7 0.9
# rid4 cid3 rid4 0.8
# rid5 cid3 rid5 0.7
# rid8 cid3 rid8 0.7
# rid2 cid3 rid2 0.5

Data:

csv <- structure(list(cid1 = c(0.1, 1, 0.2, 0.3, 0.2, 0.9, 0.4, 0.6, 
0.3), cid2 = c(0.4, 0.1, 0.5, 0.4, 0.3, 0.2, 0.8, 0.5, 0.9),
cid3 = c(0.3, 0.5, 0.1, 0.8, 0.7, 0.1, 0.9, 0.7, 0.4)), class = "data.frame", row.names = c("rid1",
"rid2", "rid3", "rid4", "rid5", "rid6", "rid7", "rid8", "rid9"
))

how to find top N descending values in group in dplyr

We can try with count/arrange/slice

df1 %>% 
count(Service, Codes) %>%
arrange(desc(n)) %>%
group_by(Service) %>%
slice(seq_len(3))
# A tibble: 6 x 3
# Groups: Service [2]
# Service Codes n
# <chr> <chr> <int>
#1 ABS DR 4
#2 ABS RT 2
#3 ABS TY 1
#4 DEF DR 4
#5 DEF RT 2
#6 DEF SE 2

In the OP's code, we need to arrange by 'Service' too. As @Marius said in the comments, top_n will return more than n rows if there are ties. One option is to do a second grouping by 'Service' and slice (as shown above); alternatively, after the grouping, we can filter

df1 %>% 
group_by(Service,Codes) %>%
summarise(Count = n()) %>%
top_n(n=3,wt = Count) %>%
arrange(Service, desc(Count)) %>%
group_by(Service) %>%
filter(row_number() <=3)

Pandas: Get top n columns based on a row values

Use sorting per row and select first 3 values:

df1 = df.sort_values(0, axis=1, ascending=False).iloc[:, :3]
print (df1)
b d c
0 10 5 3

Solution with Series.nlargest:

df1 = df.iloc[0].nlargest(3).to_frame().T
print (df1)
b d c
0 10 5 3

R dataframe - Top n values in row with column names

You could pivot to long, group by the corresponding original row, use slice_max to get the top values, then pivot back to wide and bind that output to the original table.

library(dplyr, warn.conflicts = FALSE)
library(tidyr)

# For every row of iris: reshape the measurements to long form, keep the two
# largest values together with the names of the columns they came from,
# spread them back out to high_1/high_2 and col_1/col_2, and bind the result
# to the original table.
iris %>%
  group_by(rn = row_number()) %>%
  pivot_longer(-c(Species, rn), 'col', values_to = 'high') %>%
  # Rank by the measurement (high). Ranking by col would compare the
  # column-name strings alphabetically instead of the values.
  slice_max(high, n = 2) %>%
  mutate(nm = row_number()) %>%
  pivot_wider(values_from = c(high, col),
              names_from = nm) %>%
  ungroup() %>%
  select(-c(Species, rn)) %>%
  bind_cols(iris)
#> # A tibble: 150 × 9
#> high_1 high_2 col_1 col_2 Sepal.Length Sepal.Width Petal.Length Petal.Width
#> <dbl> <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 5.1 3.5 Sepal.… Sepa… 5.1 3.5 1.4 0.2
#> 2 4.9 3 Sepal.… Sepa… 4.9 3 1.4 0.2
#> 3 4.7 3.2 Sepal.… Sepa… 4.7 3.2 1.3 0.2
#> 4 4.6 3.1 Sepal.… Sepa… 4.6 3.1 1.5 0.2
#> 5 5 3.6 Sepal.… Sepa… 5 3.6 1.4 0.2
#> 6 5.4 3.9 Sepal.… Sepa… 5.4 3.9 1.7 0.4
#> 7 4.6 3.4 Sepal.… Sepa… 4.6 3.4 1.4 0.3
#> 8 5 3.4 Sepal.… Sepa… 5 3.4 1.5 0.2
#> 9 4.4 2.9 Sepal.… Sepa… 4.4 2.9 1.4 0.2
#> 10 4.9 3.1 Sepal.… Sepa… 4.9 3.1 1.5 0.1
#> # … with 140 more rows, and 1 more variable: Species <fct>

Created on 2022-02-16 by the reprex package (v2.0.1)

Edited to remove the unnecessary rename and mutate, thanks to tip from @Onyambu!

Extract top N count from data frame with column showing count while maintaining data frame structure

A solution using dplyr. The idea is to arrange the data frame by count in descending order, keep the leading rows up to the first one where the cumulative count exceeds 600 (the first three rows here), and then set the count in the last kept row to 600 minus the sum of the counts in the rows before it. df2 is the final output.

library(dplyr)

# Keep the largest counts until their running total passes 600, then trim the
# last kept row so that the counts sum to exactly 600.
df2 <- df %>%
  arrange(desc(count)) %>%
  # Rows up to and including the first one whose cumulative count exceeds 600.
  # Assumes the total of count is above 600; otherwise which(...)[1] is NA.
  slice(1:which(cumsum(count) > 600)[1]) %>%
  mutate(count = ifelse(row_number() == n(),
                        # head(count, -1) is "all rows but the last"; unlike
                        # count[1:(n() - 1)] it is empty when only one row is kept.
                        600 - sum(head(count, -1)),
                        count))
df2
# # A tibble: 3 x 3
# count seq other
# <dbl> <fct> <fct>
# 1 324 SDOIHHFOEKN G
# 2 213 SDIUFONBSD T
# 3 63.0 DSLIHFEIHDFS U

How to find top n% of records in a column of a dataframe using R

For the top 5%:

n <- 5
data[data$V2 > quantile(data$V2,prob=1-n/100),]

Get the positions of a certain column in a data frame in ascending order in R

You can use which to get the positions of the rows where p$par == 0, and then use order to rearrange those positions so that they follow the ascending order of p$par1.

which(p$par == 0)[order(p$par1[p$par == 0])]
# [1] 1 2 5 8 3


Related Topics



Leave a reply



Submit