Fastest Way to Sort Each Row of a Large Matrix in R

Fastest way to sort each row of a large matrix in R

Well, I'm not aware of that many ways to sort faster in R, and the problem is that you're only sorting 300 values, but many times. Still, you can eek some extra performance out of sort by directly calling sort.int and using method='quick':

set.seed(1)
a <- matrix(runif(9e+07),ncol=300)

# Your original code
system.time(sorted <- t(apply(a,1,sort))) # 31 secs

# sort.int with method='quick'
system.time(sorted2 <- t(apply(a,1,sort.int, method='quick'))) # 27 secs

# using a for-loop is slightly faster than apply (and avoids transpose):
system.time({sorted3 <- a; for(i in seq_len(nrow(a))) sorted3[i,] <- sort.int(a[i,], method='quick') }) # 26 secs

But a better way should be to use the parallel package to sort parts of the matrix in parallel. However, the overhead of transferring data seems to be too big, and on my machine it starts swapping since I "only" have 8 GB memory:

library(parallel)
cl <- makeCluster(4)
system.time(sorted4 <- t(parApply(cl,a,1,sort.int, method='quick'))) # Forever...
stopCluster(cl)

Fastest way to sort and desort rows of a matrix [r]

Row-wise sorting seems to be straightforward. To get the original order back (un-sort) we need the row-wise ranks rather than their order. Thereafter, what works for column sorting in @Josh O'Brien's answer we can adapt for rows.

Base R solution:

rr <- t(apply(m, 1, rank))  ## get initial RANKS by row
sm <- t(apply(m, 1, sort))  ## sort m

##  DOING STUFF HERE  ##

sm[] <- sm[cbind(as.vector(row(rr)), as.vector(rr))]  ## un-sort
all(m == sm)  ## check
# [1] TRUE

Seems to work.

In your linked answer, the rowSort function of the Rfast package stands out well in terms of performance, which may cover the sorting issue. Moreover there's also a rowRanks function that will cover our ranking issue. So we can avoid apply.

Let's try it out.

m[1:3, ]
#           [,1]      [,2]      [,3]        [,4]
# [1,] 0.9148060 0.5142118 0.3334272 0.719355838
# [2,] 0.9370754 0.3902035 0.3467482 0.007884739
# [3,] 0.2861395 0.9057381 0.3984854 0.375489965

library(Rfast)
rr <- rowRanks(m)  ## get initial RANKS by row
sm <- rowSort(m)   ## sort m
sm[1:3, ]  # check
#            [,1]      [,2]      [,3]      [,4]
# [1,] 0.36106962 0.4112159 0.6262453 0.6311956
# [2,] 0.01405302 0.2171577 0.5459867 0.6836634
# [3,] 0.07196981 0.2165673 0.5739766 0.6737271

##  DOING STUFF HERE  ##

sm[] <- sm[cbind(as.vector(row(rr)), as.vector(rr))]  ## un-sort
all(sm == m)  ## check
# [1] TRUE

Dito.

Benchmark

m.test <- matrix(runif(4e6), ncol = 4)
dim(m.test)
# [1] 1000000       4

# Unit: milliseconds
#     expr        min       lq       mean     median         uq        max neval cld
#    Rfast   897.6286   910.91   956.6259   924.1914   986.1246   1048.058     3 a  
#    baseR 87931.2824 88004.73 95659.8671 88078.1737 99524.1594 110970.145     3   c
#  forloop 58927.7784 59434.54 60317.3903 59941.2930 61012.1963  62083.100     3  b

Not so bad!!

Data/Code:

set.seed(42)

m <- matrix(runif(100), nrow = 25, ncol = 4)

## benchmark
m.test <- matrix(runif(4e6), ncol = 4)

microbenchmark::microbenchmark(
  Rfast={
    rr <- rowRanks(m.test)
    sm <- rowSort(m.test)
    sm[] <- sm[cbind(as.vector(row(rr)), as.vector(rr))]},
  baseR={
    rr <- t(apply(m.test, 1, rank))
    sm <- t(apply(m.test, 1, sort))
    sm[] <- sm[cbind(as.vector(row(rr)), as.vector(rr))]
  },
  forloop={
    om <- t(apply(m.test, 1, order, decreasing = T))
    sm <- m.test
    for (i in seq_len(nrow(m.test))) {
      sm[i, ] <- sm[i, om[i, ]]
    }
    for (i in seq_len(nrow(m.test))) {
      sm[i, ] <- sm[i, order(om[i, ])]
    }
  }, times=3L
)

Performance in R: What is the fastest way to sort the elements of a row in a matrix?

You could use package data.table:

set.seed(1)
mm <- matrix(rnorm(1000000*40,0,10),ncol=40) 
library(data.table)
system.time({
  d <- as.data.table(mm)
  d[, row := .I]
  d <- melt(d, id.vars = "row") #wide to long format
  setkey(d, row, value) #sort
  d[, variable := paste0("V", ncol(mm):1)] #decreasing order

  #back to wide format and coerce to matrix
  msorted <- as.matrix(dcast(d, row ~ variable)[, row := NULL]) 
})
#user  system elapsed 
#4.96    0.59    5.62

If you could keep it as a long-format data.table (i.e., skip the last step), it would take about 2 seconds on my machine.

For comparison, timings of @qjgods' answer on my machine:

#user  system elapsed 
#3.71    2.08    8.81

Note that using apply (or parallel versions of it) transposes the matrix.

Row wise Sorting in R

If we need only to sort by rows, use apply with MARGIN=1 and assign the output back to the original columns after transposing the output.

df1[-1] <- t(apply(df1[-1], 1, 
        FUN=function(x) sort(x, decreasing=TRUE)))

df1
#   Name English Math French
# 1 John      86   78     56
# 2  Sam      97   86     79
# 3 Viru      93   44     34

NOTE: But we may need to change the column names as sorting by row gives the new sorted values.

Another option will be use apply separately to get the column names and the values, with Map we get the corresponding columns, cbind with the first column to have the output.

nMat <- `dim<-`(names(df1)[-1][t(apply(df1[-1], 1,  
        order, decreasing=TRUE))], dim(df1[-1]))
vMat <- t(apply(df1[-1], 1,  sort, decreasing=TRUE))
cbind(df1[1], data.frame(Map(cbind, as.data.frame(nMat,
   stringsAsFactors=FALSE), as.data.frame(vMat))))
#  Name    V1.1 V1.2   V2.1 V2.2    V3.1 V3.2
#1 John  French   86   Math   78 English   56
#2  Sam    Math   97 French   86 English   79
#3 Viru English   93   Math   44  French   34

Or another option is data.table. We melt the 'wide' format to 'long' format, grouped by 'Name', we order the 'value' in decreasing order in 'i', get the Subset of Data.table (.SD), create a new column ('N'), grouped by 'Name' and use dcast to convert from 'long' to 'wide'.

library(data.table)
dcast(melt(setDT(df1), id.var='Name')[order(-value), 
  .SD, Name][, N:=paste0("Col", 1:.N) , .(Name)],
    Name~N, value.var=c("variable", "value"))
#   Name variable_Col1 variable_Col2 variable_Col3 value_Col1 value_Col2 value_Col3
#1: John        French          Math       English         86         78         56
#2:  Sam          Math        French       English         97         86         79
#3: Viru       English          Math        French         93         44         34

EDIT:
The above data.table solution will not work in case you have 10 or more columns with values, because then col10 will preceed col2 in the ordering, even though higher values will be stored in col2. To resolve this issue, you can use just number for the names of your new columns as in:

dcast(melt(setDT(df1), id.var='Name')[order(-value), 
      .SD, Name][, N:=1:.N , .(Name)],
        Name~N, value.var=c("variable", "value"))

Sorting each row of a data frame

You could use the plain apply function with MARGIN = 1 to apply over rows and then transpose the result.

t(apply(df, 1, sort))

do.call and order to sort each row to descending order of a matrix?

Two alternatives:

# Jaap
do.call(rbind, lapply(split(a, row(a)), sort, decreasing = TRUE))

# adaption of lmo's solution in the comments
for(i in 1:nrow(a)) a[i,] <- a[i,][order(a[i,], decreasing = TRUE)]

gives:

   [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
1     6    6    5    4    4    2    2    1    1     1
2     5    4    4    4    4    3    3    3    3     1
3     4    4    4    4    3    3    3    3    3     2
4     7    5    4    4    4    3    2    2    1     0
5     5    4    4    3    3    2    2    1    1     0
6     5    4    3    3    3    2    2    2    1     1
7     6    4    4    4    3    3    2    2    2     1
8     5    5    4    4    3    3    3    2    1     1
9     5    5    4    3    3    2    2    2    2     2
10    6    5    3    3    3    2    2    2    1     1

A benchmark with:

library(microbenchmark)
microbenchmark(dc.lapply.sort = do.call(rbind, lapply(split(a, row(a)), sort, decreasing = TRUE)),
               t.apply.sort = t(apply(a, 1, sort, decreasing = TRUE)),
               for.order = for(i in 1:nrow(a)) a[i,] <- a[i,][order(a[i,], decreasing = TRUE)],
               for.sort = for(i in 1:nrow(a)) a[i,] <- sort(a[i,], decreasing = TRUE),
               for.sort.list = for(x in seq_len(nrow(a))) a[x,] <- a[x,][sort.list(a[x,], decreasing = TRUE, method="radix")])

gives:

Unit: microseconds
           expr     min       lq      mean   median       uq      max neval cld
 dc.lapply.sort 189.811 206.5890 222.52223 217.8070 228.0905  332.034   100   c
   t.apply.sort 185.474 200.4515 212.59608 210.4930 220.0025  286.288   100  bc
      for.order  82.631  91.1860  98.66552  97.8475 102.9680  176.666   100 a  
       for.sort 167.939 187.5025 192.90728 192.1195 198.8690  256.494   100  b 
  for.sort.list 187.617 206.4475 230.82960 215.7060 221.6115 1541.343   100   c

It should be noted however that benchmarks are only meaningful on larger datasets, so:

set.seed(123)
a <- matrix(rbinom(10e5, 10, 0.3), ncol = 10)

microbenchmark(dc.lapply.sort = do.call(rbind, lapply(split(a, row(a)), sort, decreasing = TRUE)),
               t.apply.sort = t(apply(a, 1, sort, decreasing = TRUE)),
               for.order = for(i in 1:nrow(a)) a[i,] <- a[i,][order(a[i,], decreasing = TRUE)],
               for.sort = for(i in 1:nrow(a)) a[i,] <- sort(a[i,], decreasing = TRUE),
               for.sort.list = for(x in seq_len(nrow(a))) a[x,] <- a[x,][sort.list(a[x,], decreasing = TRUE, method="radix")],
               times = 10)

gives:

Unit: seconds
           expr      min       lq     mean   median       uq      max neval  cld
 dc.lapply.sort 6.790179 6.924036 7.036330 7.013996 7.121343 7.351729    10    d
   t.apply.sort 5.032052 5.057022 5.151560 5.081459 5.177159 5.538416    10   c 
      for.order 1.368351 1.463285 1.514652 1.471467 1.583873 1.736544    10 a   
       for.sort 5.028314 5.102993 5.317597 5.154104 5.348614 6.123278    10   c 
  for.sort.list 2.417857 2.464817 2.573294 2.519408 2.726118 2.815964    10  b

Conclusion: the for-loop in combination with order is still the fastest solution.

Using the order2 and sort2 functions of the grr-package can give a further improvement in speed. Comparing them with the fastest solution from above:

set.seed(123)
a <- matrix(rbinom(10e5, 10, 0.3), ncol = 10)

microbenchmark(for.order = for(i in 1:nrow(a)) a[i,] <- a[i,][order(a[i,], decreasing = TRUE)],
               for.order2 = for(i in 1:nrow(a)) a[i,] <- a[i,][rev(grr::order2(a[i,]))],
               for.sort2 = for(i in 1:nrow(a)) a[i,] <- rev(grr::sort2(a[i,])),
               times = 10)

giving:

Unit: milliseconds
       expr       min        lq      mean    median        uq      max neval cld
  for.order 1243.8140 1263.4423 1316.4662 1305.1823 1378.5836 1404.251    10   c
 for.order2  956.1536  962.8226 1110.1778 1090.9984 1233.4241 1368.416    10  b 
  for.sort2  830.1887  843.6765  920.5668  847.1601  972.8703 1144.135    10 a

Sort rows in matrix, based on relative order of columns within each row

Use max.col and order:

mat[ order(max.col(mat, "first")), ]

#     [,1] [,2] [,3] [,4] [,5] [,6]
#[1,]    7    1    2    3    2    4
#[2,]    3    5    4    4    3    5
#[3,]    1    2    3    4    5    6

Where mat was:

mat <- structure(c(1L, 3L, 7L, 2L, 5L, 1L, 3L, 4L, 2L, 4L, 4L, 3L, 5L, 
3L, 2L, 6L, 5L, 4L), .Dim = c(3L, 6L))

It works because it calculates:

\1. Column position of the maximum value in each row:

max.col(mat, "first")
#[1] 6 2 1

\2. The order of rows based on these maximum values:

order(max.col(mat, "first"))
#[1] 3 2 1

Sorting Large R Matrix by Column with Vector

Not sure if I got the question correctly, but you might try:

test[order(match(test[,2],x)),]      
#     [,1] [,2]
# [1,] "D"  "3" 
# [2,] "E"  "4" 
# [3,] "E"  "5" 
# [4,] "F"  "6" 
# [5,] "F"  "6" 
# [6,] "F"  "6" 
# [7,] "A"  "1" 
# [8,] "B"  "2" 
# [9,] "C"  "2"

Fastest Way to Sort Each Row of a Large Matrix in R