I have a large banded sparse matrix S with few non-zero elements in its rows and columns. And I have vector b, I want to calculate sum(S*tcrossprod(b)). But because the data is too large, the system can't calculate it, I need to multiply the non-zero parts in S, how can I do that? Thanks! Here is the example data:
S = matrix(rnorm(64),8,8)
bw = 3
pick = (row(S)<(col(S) - bw)) | (row(S)>(col(S) bw))
S[pick] = 0
S.r <- sample(1:8,40,replace = T)
S.c <- sample(1:8,40,replace = T)
for (i in 1:length(S.r)) {
S[S.r[i],S.c[i]] <-0
}
S <- as(S,"sparseMatrix")
rowlab = collab = NULL
for (ii in 1:S@Dim[2]) {
n <- S@i[(S@p[ii] 1):S@p[ii 1]]
rowlab <-c(rowlab, n)
collab <- c(collab, rep(ii,length(n)))
lab.b <- data.frame(rowlab = rowlab 1, collab = collab)
}
b = rnorm(8)
And I want to get
sum(S*tcrossprod(b))
For the real data, the dimension of S is c(20000,20000), and the dimension of b is c(20000,1).
CodePudding user response:
You are running into memory issues due to the dense matrix returned from tcrossprod(b). But as many of the terms of this are getting multiplied by zero we can avoid calculating some of the values (avoid generating the full cross-product). You can grab the indices of the non-zero elements of the sparse matrix S, use these to index the vector b and multiply. Below are a couple of ways to calculate S * tcrossprod(b):
library(Matrix)
# Several functions to calculate the matrix products:
f1 <- function(S, b) S*tcrossprod(b)
f2 <- function(S, b) (S * b) %*% Diagonal(x=b)
f3 <- function(S, b) {indices = summary(S); S@x = S@x * b[indices$i]*b[indices$j]; S}
f4 <- function(S, b) {dp = diff(S@p); j = rep(seq_along(dp),dp); S@x = S@x * b[S@i 1]*b[j]; S}
# Check equality of function results
all.equal(f1(S, b), f2(S, b))
# [1] TRUE
all.equal(f2(S, b), f3(S, b))
# [1] TRUE
all.equal(f1(S, b), f3(S, b))
# [1] TRUE
all.equal(f1(S, b), f4(S, b))
# [1] TRUE
all.equal(f2(S, b), f4(S, b))
# [1] TRUE
all.equal(f3(S, b), f4(S, b))
# [1] TRUE
Benchmark:
rbenchmark::benchmark(f1(S, b), f2(S, b), f3(S, b), f4(S, b), replications=1) # 1 rep as f1() is slow
# test replications elapsed relative user.self sys.self user.child sys.child
# 1 f1(S, b) 1 12.182 12182 7.972 4.932 0 0
# 2 f2(S, b) 1 0.003 3 0.003 0.000 0 0
# 3 f3(S, b) 1 0.002 2 0.002 0.000 0 0
# 4 f4(S, b) 1 0.001 1 0.001 0.000 0 0
# longer benchmark for the other functions, drop f1()
# test replications elapsed relative user.self sys.self user.child sys.child
# 1 f2(S, b) 100 0.363 2.556 0.305 0.042 0 0
# 2 f3(S, b) 100 0.279 1.965 0.255 0.024 0 0
# 3 f4(S, b) 100 0.142 1.000 0.142 0.000 0 0
As you just want the sum it is enough to do
dp = diff(S@p); j = rep(seq_along(dp),dp); sum(S@x * b[S@i 1]*b[j])
Data:
n = 1e4 # I dont have enough memeory on my laptop for 20k
set.seed(1)
b = rnorm(n)
S = matrix(rnorm(n*n), n, n)
bw = 3
pick = (row(S)<(col(S) - bw)) | (row(S)>(col(S) bw))
S[pick] = 0
S.r <- sample(1:n,40,replace = T)
S.c <- sample(1:n,40,replace = T)
for (i in 1:length(S.r)) {
S[S.r[i],S.c[i]] <-0
}
S <- as(S,"sparseMatrix")
