R Sum every k columns in matrix
If the dimensions are equal for the sub matrices, you could change the dimensions to an array
and then do the rowSums
m1 <- as.matrix(temp1)
n <- 4
dim(m1) <- c(nrow(m1), ncol(m1)/n, n)
res <- matrix(rowSums(apply(m1, 2, I)), ncol=n)
identical(res[,1],rowSums(temp1[,1:4]))
#[1] TRUE
Or if the dimensions are unequal
t(sapply(seq(1,ncol(temp2), by=4), function(i) {
indx <- i:(i+3)
rowSums(temp2[indx[indx <= ncol(temp2)]])}))
data
set.seed(24)
temp1 <- as.data.frame(matrix(sample(1:20, 16*4, replace=TRUE), ncol=16))
set.seed(35)
temp2 <- as.data.frame(matrix(sample(1:20, 17*4, replace=TRUE), ncol=17))
Efficient way to sum every k columns in each row of large sparse matrix
Using a dgCMatrix
as input, this is one possible solution that is very fast:
new_combine <- function(mat,k){
#Convert dgCMatrix to dgTMatrix
x.T <- as(mat, "dgTMatrix")
#Map column indices to new set of indices
#based on partitioning every k columns
x.T@j <- as.integer(x.T@j %/% k)
#Correct dimensions of new matrix
x.T@Dim <- as.integer(c(nrow(x.T),floor(ncol(mat)/k)))
#Convert back to dgCMatrix
y <- as(x.T,"dgCMatrix")
y
}
microbenchmark::microbenchmark(sapply(starts, function(x) Matrix::rowSums(mat2[, x:(x+k-1)])),
new_combine(mat2,k),
times=5L)
Unit: milliseconds
expr
sapply(starts, function(x) Matrix::rowSums(mat2[, x:(x + k - 1)]))
new_combine(mat2, k)
min lq mean median uq
1808.872676 1864.783181 1925.17118 1935.98946 1990.28866
8.471521 9.396441 10.99871 10.04459 10.96175
max neval cld
2025.92192 5 b
16.11923 5 a
comp <- sapply(starts, function(x) Matrix::rowSums(mat2[, x:(x+k-1)]))
comp2 <- new_combine(mat2,k)
> all.equal(comp2,as(comp,"dgCMatrix"))
[1] TRUE
Sum Every N Values in Matrix
To sum consecutive sets of n
elements from each row, you just need to write a function that does the summing and apply it to each row:
n <- 3
t(apply(y, 1, function(x) tapply(x, ceiling(seq_along(x)/n), sum)))
# 1 2 3
# [1,] 12 39 66
# [2,] 15 42 69
# [3,] 18 45 72
R matrix summing columns vector
You can use the rowSums
function to sum a subset of the columns of a matrix (in your case, those corresponding to a particular value in your vector). To iterate over all possible values of your vector, you could use sapply
:
# Reproducible dataset
set.seed(144)
mat1=matrix(runif(10*length(vec)),nrow=10)
sapply(unique(vec), function(x) rowSums(mat1[,vec == x,drop=F]))
# [,1] [,2] [,3]
# [1,] 0.8908481 1.1987764 0.200360078
# [2,] 0.9143586 0.4320678 0.617083644
# [3,] 1.8743282 0.8998081 0.463207436
# [4,] 1.2169977 1.9502429 0.116956239
# [5,] 0.7510266 0.6792186 0.249493016
# [6,] 1.5971054 0.8156898 0.860322422
# [7,] 0.7507476 0.7435681 0.976815212
# [8,] 1.7472541 0.5949144 0.169615928
# [9,] 1.5338936 0.7695170 0.859721852
# [10,] 1.3822168 1.3014881 0.007783816
The drop=F
argument makes sure your subsets of mat1
remain matrices even if you select a single column.
R Sum every n rows across n columns
Using data.table:
library(data.table)
setDT(mydata)
output = mydata[Strategy != "Demand",
.(Year.1.output = sum (Year.1),
Year.2.output = sum (Year.2),
Year.3.output = sum (Year.3),
Year.4.output = sum (Year.4)),
by = Geotype]
# Geotype Year.1.output Year.2.output Year.3.output Year.4.output
# 1: 1 27 27 27 27
# 2: 2 69 69 69 69
# 3: 3 111 111 111 111
We can simplify this to deal more easily with many year columns by
setDT(mydata)[Strategy != "Demand",
lapply(.SD, sum),
by=Geotype,
.SDcols=grep("Year", names(mydata))]
sum columns with different combinations in R?
Counting concurrent 1s in column pairs, we can use matrix muliplication:
xs = grep("X", names(df), value = T)
ys = grep("Y", names(df), value = T)
xm = as.matrix(df[xs])
ym = as.matrix(df[ys])
t(ym) %*% (xm)
# X_0 X_1 X_3 X_6 X_12
# Y_0 1 2 1 0 0
# Y_1 0 2 1 0 0
# Y_3 0 0 1 0 0
# Y_6 0 0 0 0 0
# Y_12 0 0 1 0 0
Counting all 1s in column pairs:
xs = grep("X", names(df), value = T)
ys = grep("Y", names(df), value = T)
sums = colSums(df)
t(outer(setNames(xs, xs), setNames(ys, ys), FUN = function(x, y) sums[x] + sums[y]))
# X_0 X_1 X_3 X_6 X_12
# Y_0 11 12 11 10 10
# Y_1 8 9 8 7 7
# Y_3 7 8 7 6 6
# Y_6 4 5 4 3 3
# Y_12 4 5 4 3 3
Using this data:
df = read.table(text = 'X_0 X_1 X_3 X_6 X_12 Y_0 Y_1 Y_3 Y_6 Y_12
0 1 0 0 0 1 1 0 0 0
0 0 0 0 0 1 1 1 0 1
0 1 0 0 0 1 1 0 0 0
1 0 0 0 0 1 0 0 0 0
0 0 0 0 0 1 0 0 0 0
0 0 0 0 0 1 1 1 0 0
0 0 0 0 0 1 1 1 1 0
0 0 0 0 0 1 1 1 1 0
0 0 0 0 0 1 0 1 1 1
0 0 1 0 0 1 1 1 0 1 ', header = T)
Matrix: Summing columns and rows conditional on variable name
Here is another base R way :
matched_sum <- function(dfr){
matched_col <- function(col_id) {
col_pattern <- gsub("[0-9]", "", colnames(dfr[col_id]))
dfr[grepl(col_pattern, rownames(x)),col_id] <- NA
return(dfr[col_id])
}
new_col <- lapply(1:ncol(dfr), matched_col)
new_dfr <- do.call(cbind.data.frame, new_col)
colSums(new_dfr, na.rm = TRUE)
}
# Your data frame. You can use as.data.frame(x) in case x is not a data frame
x
AUS1 AUS2 AUS3 AUT1 AUT2 AUT3
AUS1 1 7 13 19 25 31
AUS2 2 8 14 20 26 32
AUS3 3 9 15 21 27 33
AUT1 4 10 16 22 28 34
AUT2 5 11 17 23 29 35
AUT3 6 12 18 24 30 36
# Apply the function to x
matched_sum(x)
AUS1 AUS2 AUS3 AUT1 AUT2 AUT3
15 33 51 60 78 96
What the function does
col_pattern <- gsub("[0-9]", "", colnames(dfr[col_id]))
finds a pattern in each column name. The pattern is any string other than numbers. For example : the pattern in "AUS1" is "AUS".dfr[grepl(col_pattern, rownames(x)),col_id] <- NA
assigns NA to any row in the column that has pattern found in the 1st step. For example, the first column after this step will become:
AUS1
AUS1 NA
AUS2 NA
AUS3 NA
AUT1 4
AUT2 5
AUT3 6
lapply(1:ncol(dfr), matched_col)
apply the 1st step and the 2nd step to each column in the data frame.do.call(cbind.data.frame, new_col)
binds all columns (that already has NA in the selected rows) to a data frame. For example, if the input isx
that you provides, after this step it will become:
AUS1 AUS2 AUS3 AUT1 AUT2 AUT3
AUS1 NA NA NA 19 25 31
AUS2 NA NA NA 20 26 32
AUS3 NA NA NA 21 27 33
AUT1 4 10 16 NA NA NA
AUT2 5 11 17 NA NA NA
AUT3 6 12 18 NA NA NA
colSums(new_dfr, na.rm = TRUE)
sums all non-NA values in each column in the data frame created in the 4th step.
In case you want to keep the matrix structure for you data, you can use this:
matched_sum_mat <- function(mat){
matched_col <- function(col_id) {
col_pattern <- gsub("[0-9]", "", dimnames(mat)[[2]][col_id])
mat[grepl(col_pattern, dimnames(mat)[[1]]),col_id] <- NA
return(mat[,col_id])
}
new_col <- lapply(1:ncol(mat), matched_col)
new_mat <- do.call(cbind, new_col)
colnames(new_mat) <- colnames(mat)
colSums(new_mat, na.rm = TRUE)
}
# Apply to x as a matrix
matched_sum_mat(x)
AUS1 AUS2 AUS3 AUT1 AUT2 AUT3
15 33 51 60 78 96
Updates
In case you want an exact match between a column name and a row name, such as between "AUS1" in the column names and "AUS1" (instead of "AUS") in the row names, one of several ways to get it is as follows:
# Option 1
matched_name_location <- lapply(
colnames(x),
function(a_col_name) rownames(x) %in% a_col_name) |>
unlist() |>
which()
x[matched_name_location] <- NA
# The result
AUS1 AUS2 AUS3 AUT1 AUT2 AUT3
AUS1 NA 7 13 19 25 31
AUS2 2 NA 14 20 26 32
AUS3 3 9 NA 21 27 33
AUT1 4 10 16 NA 28 34
AUT2 5 11 17 23 NA 35
AUT3 6 12 18 24 30 NA
Another option is to use ==
instead of %in%
:
# Option 2
matched_name_location <- lapply(
colnames(x),
function(a_col_name) rownames(x) == a_col_name) |>
unlist() |>
which()
x[matched_name_location] <- NA
%in%
gives the same result as ==
does in this case because a_col_name
is a single name. If multiple names are used, the order of the names is ignored in %in%
, but not in ==
. For example:
y <- c("AUS1", "AUS2" ,"AUS3", "AUT1", "AUT2", "AUT3")
y %in% c("AUS2","AUS1")
#[1] TRUE TRUE FALSE FALSE FALSE FALSE
y == c("AUS2","AUS1")
#[1] FALSE FALSE FALSE FALSE FALSE FALSE
Another option is to use grepl
.
# Option 3
matched_name_location <- lapply(
colnames(x),
function(a_col_name) grepl(a_col_name, rownames(x))) |>
unlist() |>
which()
x[matched_name_location] <- NA
The last one is used to find the pattern within a string. So, for example, it grepl("AUS1", "AUS10")
returns TRUE
, whereas each of "AUS1" %in% "AUS10"
and "AUS1" == "AUS10"
returns FALSE
.
Related Topics
Creating New Shape Palettes in Ggplot2 and Other R Graphics
Cast String Directly to Idatetime
Add Missing Rows to a Data Table
How to 'Unlist' a Column in a Data.Table
Replacing Negative Values in a Model (System of Odes) with Zero
Using Grep to Subset Rows from a Data.Table, Comparing Row Content
Row-Wise Sum of Values Grouped by Columns with Same Name
Highlight a Line in Ggplot with Multiple Lines
How to Modify Unexported Object in a Package
Can Sparklyr Be Used with Spark Deployed on Yarn-Managed Hadoop Cluster
Using Variable Value as Column Name in Data.Frame or Cbind
Difference Between Sort(), Rank(), and Order()
Max and Min Functions That Are Similar to Colmeans
The Rolling Regression in R Using Roll Apply
Read CSV with Two Headers into a Data.Frame