R Sum Every K Columns in Matrix

R Sum every k columns in matrix

If the dimensions are equal for the sub matrices, you could change the dimensions to an array and then do the rowSums

 m1 <- as.matrix(temp1)
n <- 4
dim(m1) <- c(nrow(m1), ncol(m1)/n, n)
res <- matrix(rowSums(apply(m1, 2, I)), ncol=n)
identical(res[,1],rowSums(temp1[,1:4]))
#[1] TRUE

Or if the dimensions are unequal

  t(sapply(seq(1,ncol(temp2), by=4), function(i) {
indx <- i:(i+3)
rowSums(temp2[indx[indx <= ncol(temp2)]])}))

data

set.seed(24)
temp1 <- as.data.frame(matrix(sample(1:20, 16*4, replace=TRUE), ncol=16))

set.seed(35)
temp2 <- as.data.frame(matrix(sample(1:20, 17*4, replace=TRUE), ncol=17))

Efficient way to sum every k columns in each row of large sparse matrix

Using a dgCMatrix as input, this is one possible solution that is very fast:

new_combine <- function(mat,k){
#Convert dgCMatrix to dgTMatrix
x.T <- as(mat, "dgTMatrix")
#Map column indices to new set of indices
#based on partitioning every k columns
x.T@j <- as.integer(x.T@j %/% k)
#Correct dimensions of new matrix
x.T@Dim <- as.integer(c(nrow(x.T),floor(ncol(mat)/k)))
#Convert back to dgCMatrix
y <- as(x.T,"dgCMatrix")
y
}

microbenchmark::microbenchmark(sapply(starts, function(x) Matrix::rowSums(mat2[, x:(x+k-1)])),
new_combine(mat2,k),
times=5L)

Unit: milliseconds
expr
sapply(starts, function(x) Matrix::rowSums(mat2[, x:(x + k - 1)]))
new_combine(mat2, k)
min lq mean median uq
1808.872676 1864.783181 1925.17118 1935.98946 1990.28866
8.471521 9.396441 10.99871 10.04459 10.96175
max neval cld
2025.92192 5 b
16.11923 5 a

comp <- sapply(starts, function(x) Matrix::rowSums(mat2[, x:(x+k-1)]))
comp2 <- new_combine(mat2,k)

> all.equal(comp2,as(comp,"dgCMatrix"))
[1] TRUE

Sum Every N Values in Matrix

To sum consecutive sets of n elements from each row, you just need to write a function that does the summing and apply it to each row:

n <- 3
t(apply(y, 1, function(x) tapply(x, ceiling(seq_along(x)/n), sum)))
# 1 2 3
# [1,] 12 39 66
# [2,] 15 42 69
# [3,] 18 45 72

R matrix summing columns vector

You can use the rowSums function to sum a subset of the columns of a matrix (in your case, those corresponding to a particular value in your vector). To iterate over all possible values of your vector, you could use sapply:

# Reproducible dataset
set.seed(144)
mat1=matrix(runif(10*length(vec)),nrow=10)

sapply(unique(vec), function(x) rowSums(mat1[,vec == x,drop=F]))
# [,1] [,2] [,3]
# [1,] 0.8908481 1.1987764 0.200360078
# [2,] 0.9143586 0.4320678 0.617083644
# [3,] 1.8743282 0.8998081 0.463207436
# [4,] 1.2169977 1.9502429 0.116956239
# [5,] 0.7510266 0.6792186 0.249493016
# [6,] 1.5971054 0.8156898 0.860322422
# [7,] 0.7507476 0.7435681 0.976815212
# [8,] 1.7472541 0.5949144 0.169615928
# [9,] 1.5338936 0.7695170 0.859721852
# [10,] 1.3822168 1.3014881 0.007783816

The drop=F argument makes sure your subsets of mat1 remain matrices even if you select a single column.

R Sum every n rows across n columns

Using data.table:

library(data.table)
setDT(mydata)
output = mydata[Strategy != "Demand",
.(Year.1.output = sum (Year.1),
Year.2.output = sum (Year.2),
Year.3.output = sum (Year.3),
Year.4.output = sum (Year.4)),
by = Geotype]

# Geotype Year.1.output Year.2.output Year.3.output Year.4.output
# 1: 1 27 27 27 27
# 2: 2 69 69 69 69
# 3: 3 111 111 111 111

We can simplify this to deal more easily with many year columns by

setDT(mydata)[Strategy != "Demand", 
lapply(.SD, sum),
by=Geotype,
.SDcols=grep("Year", names(mydata))]

sum columns with different combinations in R?

Counting concurrent 1s in column pairs, we can use matrix muliplication:

xs = grep("X", names(df), value = T)
ys = grep("Y", names(df), value = T)

xm = as.matrix(df[xs])
ym = as.matrix(df[ys])
t(ym) %*% (xm)
# X_0 X_1 X_3 X_6 X_12
# Y_0 1 2 1 0 0
# Y_1 0 2 1 0 0
# Y_3 0 0 1 0 0
# Y_6 0 0 0 0 0
# Y_12 0 0 1 0 0

Counting all 1s in column pairs:

xs = grep("X", names(df), value = T)
ys = grep("Y", names(df), value = T)

sums = colSums(df)

t(outer(setNames(xs, xs), setNames(ys, ys), FUN = function(x, y) sums[x] + sums[y]))
# X_0 X_1 X_3 X_6 X_12
# Y_0 11 12 11 10 10
# Y_1 8 9 8 7 7
# Y_3 7 8 7 6 6
# Y_6 4 5 4 3 3
# Y_12 4 5 4 3 3

Using this data:

df = read.table(text = 'X_0 X_1 X_3 X_6 X_12 Y_0 Y_1 Y_3 Y_6 Y_12 
0 1 0 0 0 1 1 0 0 0
0 0 0 0 0 1 1 1 0 1
0 1 0 0 0 1 1 0 0 0
1 0 0 0 0 1 0 0 0 0
0 0 0 0 0 1 0 0 0 0
0 0 0 0 0 1 1 1 0 0
0 0 0 0 0 1 1 1 1 0
0 0 0 0 0 1 1 1 1 0
0 0 0 0 0 1 0 1 1 1
0 0 1 0 0 1 1 1 0 1 ', header = T)

Matrix: Summing columns and rows conditional on variable name

Here is another base R way :

matched_sum <- function(dfr){
matched_col <- function(col_id) {
col_pattern <- gsub("[0-9]", "", colnames(dfr[col_id]))
dfr[grepl(col_pattern, rownames(x)),col_id] <- NA
return(dfr[col_id])
}
new_col <- lapply(1:ncol(dfr), matched_col)
new_dfr <- do.call(cbind.data.frame, new_col)
colSums(new_dfr, na.rm = TRUE)
}

# Your data frame. You can use as.data.frame(x) in case x is not a data frame
x
AUS1 AUS2 AUS3 AUT1 AUT2 AUT3
AUS1 1 7 13 19 25 31
AUS2 2 8 14 20 26 32
AUS3 3 9 15 21 27 33
AUT1 4 10 16 22 28 34
AUT2 5 11 17 23 29 35
AUT3 6 12 18 24 30 36

# Apply the function to x
matched_sum(x)

AUS1 AUS2 AUS3 AUT1 AUT2 AUT3
15 33 51 60 78 96

What the function does

  1. col_pattern <- gsub("[0-9]", "", colnames(dfr[col_id])) finds a pattern in each column name. The pattern is any string other than numbers. For example : the pattern in "AUS1" is "AUS".
  2. dfr[grepl(col_pattern, rownames(x)),col_id] <- NA assigns NA to any row in the column that has pattern found in the 1st step. For example, the first column after this step will become:
    AUS1
AUS1 NA
AUS2 NA
AUS3 NA
AUT1 4
AUT2 5
AUT3 6

  1. lapply(1:ncol(dfr), matched_col) apply the 1st step and the 2nd step to each column in the data frame.
  2. do.call(cbind.data.frame, new_col) binds all columns (that already has NA in the selected rows) to a data frame. For example, if the input is x that you provides, after this step it will become:
     AUS1 AUS2 AUS3 AUT1 AUT2 AUT3
AUS1 NA NA NA 19 25 31
AUS2 NA NA NA 20 26 32
AUS3 NA NA NA 21 27 33
AUT1 4 10 16 NA NA NA
AUT2 5 11 17 NA NA NA
AUT3 6 12 18 NA NA NA

  1. colSums(new_dfr, na.rm = TRUE) sums all non-NA values in each column in the data frame created in the 4th step.

In case you want to keep the matrix structure for you data, you can use this:

matched_sum_mat <- function(mat){
matched_col <- function(col_id) {
col_pattern <- gsub("[0-9]", "", dimnames(mat)[[2]][col_id])
mat[grepl(col_pattern, dimnames(mat)[[1]]),col_id] <- NA
return(mat[,col_id])
}
new_col <- lapply(1:ncol(mat), matched_col)
new_mat <- do.call(cbind, new_col)
colnames(new_mat) <- colnames(mat)
colSums(new_mat, na.rm = TRUE)
}

# Apply to x as a matrix

matched_sum_mat(x)

AUS1 AUS2 AUS3 AUT1 AUT2 AUT3
15 33 51 60 78 96

Updates

In case you want an exact match between a column name and a row name, such as between "AUS1" in the column names and "AUS1" (instead of "AUS") in the row names, one of several ways to get it is as follows:

# Option 1
matched_name_location <- lapply(
colnames(x),
function(a_col_name) rownames(x) %in% a_col_name) |>
unlist() |>
which()
x[matched_name_location] <- NA

# The result
AUS1 AUS2 AUS3 AUT1 AUT2 AUT3
AUS1 NA 7 13 19 25 31
AUS2 2 NA 14 20 26 32
AUS3 3 9 NA 21 27 33
AUT1 4 10 16 NA 28 34
AUT2 5 11 17 23 NA 35
AUT3 6 12 18 24 30 NA

Another option is to use == instead of %in% :

# Option 2
matched_name_location <- lapply(
colnames(x),
function(a_col_name) rownames(x) == a_col_name) |>
unlist() |>
which()
x[matched_name_location] <- NA

%in% gives the same result as == does in this case because a_col_name is a single name. If multiple names are used, the order of the names is ignored in %in%, but not in ==. For example:

y <- c("AUS1", "AUS2" ,"AUS3", "AUT1", "AUT2", "AUT3")
y %in% c("AUS2","AUS1")
#[1] TRUE TRUE FALSE FALSE FALSE FALSE

y == c("AUS2","AUS1")
#[1] FALSE FALSE FALSE FALSE FALSE FALSE

Another option is to use grepl.

# Option 3
matched_name_location <- lapply(
colnames(x),
function(a_col_name) grepl(a_col_name, rownames(x))) |>
unlist() |>
which()
x[matched_name_location] <- NA

The last one is used to find the pattern within a string. So, for example, it grepl("AUS1", "AUS10") returns TRUE, whereas each of "AUS1" %in% "AUS10" and "AUS1" == "AUS10" returns FALSE.



Related Topics



Leave a reply



Submit