I have a super large data frame containing nearly 5 million rows. data then I have a char list containing around 2000 items, I need to do a pairwise(lets say one is A other is B) calculation on them, so at the end, I will have a 2000*2000 matrix containing values. The value I need is: (#id has A and B)/ min(#id has A, #id has B)
load("data.RData")
n = length(itemlist) # n=1831
a = matrix(0, n, n)
rownames(a) <- colnames(a) <- itemlist
aa = sapply(itemlist, function(x) grepl(x, data$Item))
for(i in 1:1830) {
for(j in (i 1):1831) {
a1 <- aa[,i]
a2 <- aa[,j]
a3 <- a1 & a2
a[i,j] <- sum(a3) / min(sum(a1), sum(a2))
}
print(i)
}
result <- a
This code works but it is super slow(take days). I was wondering if it can be much faster.
CodePudding user response:
Here is an approach using paralleldist with a custom C function.
library(parallelDist)
library(RcppArmadillo)
library(RcppXPtrUtils)
I am taking as input an integer matrix with values in 0,1
mat <- as.integer(rnorm(10*10) > 0) |>
matrix(nrow = 10)
mat
##> [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
##> [1,] 0 1 0 0 1 1 1 1 0 0
##> [2,] 0 0 0 0 0 1 1 0 0 0
##> [3,] 1 1 1 0 1 0 1 1 1 0
##> [4,] 0 1 1 0 1 0 1 1 1 0
##> [5,] 0 1 0 0 0 0 0 1 0 1
##> [6,] 1 0 0 0 0 0 1 1 1 0
##> [7,] 0 0 0 0 1 1 1 1 0 0
##> [8,] 0 1 0 1 1 1 1 0 1 1
##> [9,] 1 0 0 1 0 1 1 1 1 1
##>[10,] 0 1 1 0 1 0 0 0 0 1
Now let's create a custom C function using the armadillo C library.
customDist <- cppXPtr(
"double customDist(const arma::mat &A, const arma::mat &B) {
double a = arma::accu(A);
double b = arma::accu(B);
double s = arma::accu(A && B);
return s/(a > b ? b : a);
}", depends = c("RcppArmadillo"))
Calculation using parDist is performed using multiple threads defaulting to all the cpus.
dst <- mat |>
parallelDist::parDist(method = "custom",
func = customDist)
as.matrix(dst)
##> 1 2 3 4 5 6 7 8
##>1 0.0000000 1.0 0.8000000 0.8000000 0.6666667 0.5000000 1.0000000 0.8000000
##>2 1.0000000 0.0 0.5000000 0.5000000 0.0000000 0.5000000 1.0000000 1.0000000
##>3 0.8000000 0.5 0.0000000 1.0000000 0.6666667 1.0000000 0.7500000 0.5714286
##>4 0.8000000 0.5 1.0000000 0.0000000 0.6666667 0.7500000 0.7500000 0.6666667
##>5 0.6666667 0.0 0.6666667 0.6666667 0.0000000 0.3333333 0.3333333 0.6666667
##>6 0.5000000 0.5 1.0000000 0.7500000 0.3333333 0.0000000 0.5000000 0.5000000
##>7 1.0000000 1.0 0.7500000 0.7500000 0.3333333 0.5000000 0.0000000 0.7500000
##>8 0.8000000 1.0 0.5714286 0.6666667 0.6666667 0.5000000 0.7500000 0.0000000
##>9 0.6000000 1.0 0.5714286 0.5000000 0.6666667 1.0000000 0.7500000 0.7142857
##>10 0.5000000 0.0 0.7500000 0.7500000 0.6666667 0.0000000 0.2500000 0.7500000
##> 9 10
##>1 0.6000000 0.5000000
##>2 1.0000000 0.0000000
##>3 0.5714286 0.7500000
##>4 0.5000000 0.7500000
##>5 0.6666667 0.6666667
##>6 1.0000000 0.0000000
##>7 0.7500000 0.2500000
##>8 0.7142857 0.7500000
##>9 0.0000000 0.2500000
##>10 0.2500000 0.0000000
CodePudding user response:
Here's a simple approach using base R (not in parallel)
mat <- as.integer(rnorm(10*10) > 0) |>
matrix(nrow = 10)
##> [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
##> [1,] 0 1 0 0 1 1 1 1 0 0
##> [2,] 0 0 0 0 0 1 1 0 0 0
##> [3,] 1 1 1 0 1 0 1 1 1 0
##> [4,] 0 1 1 0 1 0 1 1 1 0
##> [5,] 0 1 0 0 0 0 0 1 0 1
##> [6,] 1 0 0 0 0 0 1 1 1 0
##> [7,] 0 0 0 0 1 1 1 1 0 0
##> [8,] 0 1 0 1 1 1 1 0 1 1
##> [9,] 1 0 0 1 0 1 1 1 1 1
##>[10,] 0 1 1 0 1 0 0 0 0 1
S <- mat %*% t(mat)
C <- apply(mat, 1, sum)
S/outer(C, C, "pmin")
##> [,1] [,2] [,3] [,4] [,5] [,6] [,7]
##> [1,] 1.0000000 1.0 0.8000000 0.8000000 0.6666667 0.5000000 1.0000000
##> [2,] 1.0000000 1.0 0.5000000 0.5000000 0.0000000 0.5000000 1.0000000
##> [3,] 0.8000000 0.5 1.0000000 1.0000000 0.6666667 1.0000000 0.7500000
##> [4,] 0.8000000 0.5 1.0000000 1.0000000 0.6666667 0.7500000 0.7500000
##> [5,] 0.6666667 0.0 0.6666667 0.6666667 1.0000000 0.3333333 0.3333333
##> [6,] 0.5000000 0.5 1.0000000 0.7500000 0.3333333 1.0000000 0.5000000
##> [7,] 1.0000000 1.0 0.7500000 0.7500000 0.3333333 0.5000000 1.0000000
##> [8,] 0.8000000 1.0 0.5714286 0.6666667 0.6666667 0.5000000 0.7500000
##> [9,] 0.6000000 1.0 0.5714286 0.5000000 0.6666667 1.0000000 0.7500000
##>[10,] 0.5000000 0.0 0.7500000 0.7500000 0.6666667 0.0000000 0.2500000
##> [,8] [,9] [,10]
##> [1,] 0.8000000 0.6000000 0.5000000
##> [2,] 1.0000000 1.0000000 0.0000000
##> [3,] 0.5714286 0.5714286 0.7500000
##> [4,] 0.6666667 0.5000000 0.7500000
##> [5,] 0.6666667 0.6666667 0.6666667
##> [6,] 0.5000000 1.0000000 0.0000000
##> [7,] 0.7500000 0.7500000 0.2500000
##> [8,] 1.0000000 0.7142857 0.7500000
##> [9,] 0.7142857 1.0000000 0.2500000
##>[10,] 0.7500000 0.2500000 1.0000000
