R Sum Every K Columns in Matrix

R Sum every k columns in matrix

If the dimensions are equal for the sub matrices, you could change the dimensions to an array and then do the rowSums

 m1 <- as.matrix(temp1)
 n <- 4
 dim(m1) <- c(nrow(m1), ncol(m1)/n, n)
 res <- matrix(rowSums(apply(m1, 2, I)), ncol=n)
 identical(res[,1],rowSums(temp1[,1:4]))
 #[1] TRUE

Or if the dimensions are unequal

  t(sapply(seq(1,ncol(temp2), by=4), function(i) {
                  indx <- i:(i+3)
               rowSums(temp2[indx[indx <= ncol(temp2)]])}))

data

set.seed(24)
temp1 <- as.data.frame(matrix(sample(1:20, 16*4, replace=TRUE), ncol=16))

set.seed(35)
temp2 <- as.data.frame(matrix(sample(1:20, 17*4, replace=TRUE), ncol=17))

Efficient way to sum every k columns in each row of large sparse matrix

Using a dgCMatrix as input, this is one possible solution that is very fast:

new_combine <- function(mat,k){
  #Convert dgCMatrix to dgTMatrix
  x.T <- as(mat, "dgTMatrix") 
  #Map column indices to new set of indices 
  #based on partitioning every k columns
  x.T@j <- as.integer(x.T@j %/% k)
  #Correct dimensions of new matrix
  x.T@Dim <- as.integer(c(nrow(x.T),floor(ncol(mat)/k)))
  #Convert back to dgCMatrix
  y <- as(x.T,"dgCMatrix")
  y
}

microbenchmark::microbenchmark(sapply(starts, function(x) Matrix::rowSums(mat2[, x:(x+k-1)])),
                               new_combine(mat2,k),
                               times=5L)

Unit: milliseconds
                                                                    expr
 sapply(starts, function(x) Matrix::rowSums(mat2[, x:(x + k -      1)]))
                                                            new_combine(mat2, k)
         min          lq       mean     median         uq
 1808.872676 1864.783181 1925.17118 1935.98946 1990.28866
    8.471521    9.396441   10.99871   10.04459   10.96175
        max neval cld
 2025.92192     5   b
   16.11923     5  a

comp <- sapply(starts, function(x) Matrix::rowSums(mat2[, x:(x+k-1)]))
comp2 <- new_combine(mat2,k)

> all.equal(comp2,as(comp,"dgCMatrix"))
[1] TRUE

Sum Every N Values in Matrix

To sum consecutive sets of n elements from each row, you just need to write a function that does the summing and apply it to each row:

n <- 3
t(apply(y, 1, function(x) tapply(x, ceiling(seq_along(x)/n), sum)))
#       1  2  3
# [1,] 12 39 66
# [2,] 15 42 69
# [3,] 18 45 72

R matrix summing columns vector

You can use the rowSums function to sum a subset of the columns of a matrix (in your case, those corresponding to a particular value in your vector). To iterate over all possible values of your vector, you could use sapply:

# Reproducible dataset
set.seed(144)
mat1=matrix(runif(10*length(vec)),nrow=10)

sapply(unique(vec), function(x) rowSums(mat1[,vec == x,drop=F]))
#            [,1]      [,2]        [,3]
#  [1,] 0.8908481 1.1987764 0.200360078
#  [2,] 0.9143586 0.4320678 0.617083644
#  [3,] 1.8743282 0.8998081 0.463207436
#  [4,] 1.2169977 1.9502429 0.116956239
#  [5,] 0.7510266 0.6792186 0.249493016
#  [6,] 1.5971054 0.8156898 0.860322422
#  [7,] 0.7507476 0.7435681 0.976815212
#  [8,] 1.7472541 0.5949144 0.169615928
#  [9,] 1.5338936 0.7695170 0.859721852
# [10,] 1.3822168 1.3014881 0.007783816

The drop=F argument makes sure your subsets of mat1 remain matrices even if you select a single column.

R Sum every n rows across n columns

Using data.table:

library(data.table)
setDT(mydata)
output = mydata[Strategy != "Demand", 
             .(Year.1.output = sum (Year.1), 
               Year.2.output = sum (Year.2), 
               Year.3.output = sum (Year.3), 
               Year.4.output = sum (Year.4)),
             by = Geotype]

#    Geotype Year.1.output Year.2.output Year.3.output Year.4.output
# 1:       1            27            27            27            27
# 2:       2            69            69            69            69
# 3:       3           111           111           111           111

We can simplify this to deal more easily with many year columns by

setDT(mydata)[Strategy != "Demand", 
             lapply(.SD, sum), 
             by=Geotype, 
             .SDcols=grep("Year", names(mydata))]

sum columns with different combinations in R?

Counting concurrent 1s in column pairs, we can use matrix muliplication:

xs = grep("X", names(df), value = T)
ys = grep("Y", names(df), value = T)

xm = as.matrix(df[xs])
ym = as.matrix(df[ys])
t(ym) %*% (xm)
#      X_0 X_1 X_3 X_6 X_12
# Y_0    1   2   1   0    0
# Y_1    0   2   1   0    0
# Y_3    0   0   1   0    0
# Y_6    0   0   0   0    0
# Y_12   0   0   1   0    0

Counting all 1s in column pairs:

xs = grep("X", names(df), value = T)
ys = grep("Y", names(df), value = T)

sums = colSums(df)

t(outer(setNames(xs, xs), setNames(ys, ys), FUN = function(x, y) sums[x] + sums[y]))
#      X_0 X_1 X_3 X_6 X_12
# Y_0   11  12  11  10   10
# Y_1    8   9   8   7    7
# Y_3    7   8   7   6    6
# Y_6    4   5   4   3    3
# Y_12   4   5   4   3    3

Using this data:

df = read.table(text = 'X_0 X_1 X_3 X_6 X_12 Y_0 Y_1 Y_3 Y_6 Y_12 
0 1 0 0 0 1 1 0 0 0 
0 0 0 0 0 1 1 1 0 1 
0 1 0 0 0 1 1 0 0 0 
1 0 0 0 0 1 0 0 0 0 
0 0 0 0 0 1 0 0 0 0 
0 0 0 0 0 1 1 1 0 0 
0 0 0 0 0 1 1 1 1 0 
0 0 0 0 0 1 1 1 1 0 
0 0 0 0 0 1 0 1 1 1 
0 0 1 0 0 1 1 1 0 1 ', header = T)

Matrix: Summing columns and rows conditional on variable name

Here is another base R way :

matched_sum <- function(dfr){
    matched_col <- function(col_id) {
        col_pattern <- gsub("[0-9]", "", colnames(dfr[col_id]))
        dfr[grepl(col_pattern, rownames(x)),col_id] <- NA
        return(dfr[col_id])
    }
    new_col <- lapply(1:ncol(dfr), matched_col)
    new_dfr <- do.call(cbind.data.frame, new_col)
    colSums(new_dfr, na.rm = TRUE)
}

# Your data frame. You can use as.data.frame(x) in case x is not a data frame 
x
     AUS1 AUS2 AUS3 AUT1 AUT2 AUT3
AUS1    1    7   13   19   25   31
AUS2    2    8   14   20   26   32
AUS3    3    9   15   21   27   33
AUT1    4   10   16   22   28   34
AUT2    5   11   17   23   29   35
AUT3    6   12   18   24   30   36

# Apply the function to x
matched_sum(x)

AUS1 AUS2 AUS3 AUT1 AUT2 AUT3 
  15   33   51   60   78   96

What the function does

col_pattern <- gsub("[0-9]", "", colnames(dfr[col_id])) finds a pattern in each column name. The pattern is any string other than numbers. For example : the pattern in "AUS1" is "AUS".
dfr[grepl(col_pattern, rownames(x)),col_id] <- NA assigns NA to any row in the column that has pattern found in the 1st step. For example, the first column after this step will become:

    AUS1
AUS1   NA
AUS2   NA
AUS3   NA
AUT1    4
AUT2    5
AUT3    6

lapply(1:ncol(dfr), matched_col) apply the 1st step and the 2nd step to each column in the data frame.
do.call(cbind.data.frame, new_col) binds all columns (that already has NA in the selected rows) to a data frame. For example, if the input is x that you provides, after this step it will become:

     AUS1 AUS2 AUS3 AUT1 AUT2 AUT3
AUS1   NA   NA   NA   19   25   31
AUS2   NA   NA   NA   20   26   32
AUS3   NA   NA   NA   21   27   33
AUT1    4   10   16   NA   NA   NA
AUT2    5   11   17   NA   NA   NA
AUT3    6   12   18   NA   NA   NA

colSums(new_dfr, na.rm = TRUE) sums all non-NA values in each column in the data frame created in the 4th step.

In case you want to keep the matrix structure for you data, you can use this:

matched_sum_mat <- function(mat){
    matched_col <- function(col_id) {
        col_pattern <- gsub("[0-9]", "", dimnames(mat)[[2]][col_id])
        mat[grepl(col_pattern, dimnames(mat)[[1]]),col_id] <- NA
        return(mat[,col_id])
    }
    new_col <- lapply(1:ncol(mat), matched_col)
    new_mat <- do.call(cbind, new_col)
    colnames(new_mat) <- colnames(mat)
    colSums(new_mat, na.rm = TRUE)
}

# Apply to x as a matrix

matched_sum_mat(x)

AUS1 AUS2 AUS3 AUT1 AUT2 AUT3 
  15   33   51   60   78   96

Updates

In case you want an exact match between a column name and a row name, such as between "AUS1" in the column names and "AUS1" (instead of "AUS") in the row names, one of several ways to get it is as follows:

# Option 1
matched_name_location <- lapply(
   colnames(x), 
   function(a_col_name) rownames(x) %in% a_col_name) |> 
   unlist() |> 
   which()
x[matched_name_location] <- NA

# The result
     AUS1 AUS2 AUS3 AUT1 AUT2 AUT3
AUS1   NA    7   13   19   25   31
AUS2    2   NA   14   20   26   32
AUS3    3    9   NA   21   27   33
AUT1    4   10   16   NA   28   34
AUT2    5   11   17   23   NA   35
AUT3    6   12   18   24   30   NA

Another option is to use == instead of %in% :

# Option 2
matched_name_location <- lapply(
   colnames(x), 
   function(a_col_name) rownames(x) == a_col_name) |> 
   unlist() |> 
   which()
x[matched_name_location] <- NA

%in% gives the same result as == does in this case because a_col_name is a single name. If multiple names are used, the order of the names is ignored in %in%, but not in ==. For example:

y <- c("AUS1", "AUS2" ,"AUS3", "AUT1", "AUT2", "AUT3")
y %in% c("AUS2","AUS1")
#[1]  TRUE  TRUE FALSE FALSE FALSE FALSE

y == c("AUS2","AUS1")
#[1] FALSE FALSE FALSE FALSE FALSE FALSE

Another option is to use grepl.

# Option 3
matched_name_location <- lapply(
   colnames(x), 
   function(a_col_name) grepl(a_col_name, rownames(x))) |> 
   unlist() |> 
   which()
x[matched_name_location] <- NA

The last one is used to find the pattern within a string. So, for example, it grepl("AUS1", "AUS10") returns TRUE, whereas each of "AUS1" %in% "AUS10" and "AUS1" == "AUS10" returns FALSE.

R Sum Every K Columns in Matrix