I originally asked this question in data.table forum, and received some good answers but I hope to find a less-than-a-sec solution (real data dims 3M x 303)
The link to my post in R community, and MWE in julia (using DataFrames.jl if I am using wrong packages please let me know)
julia> using DataFrames
julia> df = DataFrame(v1 = [1,2,3], v2 = [1,3,3], v3 = [missing,2,3])
3×3 DataFrame
Row │ v1 v2 v3
│ Int64 Int64 Int64?
─────┼───────────────────────
1 │ 1 1 missing
2 │ 2 3 2
3 │ 3 3 3
julia> desired = [true,false,true]
3-element Vector{Bool}:
1
0
1
CodePudding user response:
Structure matters. Here is a matrix approach in R, using the matrixStats package (source), which ships optimized matrix functions implemented in C.
x = sample.int(3, size = 303*3e6, replace = T)
m = matrix(x, ncol = 303, byrow = T)
bench::mark(
var = matrixStats::rowVars(m, na.rm = T) == 0
)
On my (certainly not high performance) machine this takes roughly 3.5 seconds for a 3 million row matrix.
CodePudding user response:
In Julia using a matrix for such operation, similarly to R, is also natural:
julia> using BenchmarkTools
julia> function helper(x)
nonempty = false
local fv
for v in x
if !ismissing(v)
if nonempty
v == fv || return false
else
fv = v
nonempty = true
end
end
end
return true
end
helper (generic function with 1 method)
julia> mat = rand([1, 2, missing], 3_000_000, 303);
julia> @benchmark helper.(eachrow($mat))
BenchmarkTools.Trial: 34 samples with 1 evaluation.
Range (min … max): 139.440 ms … 154.628 ms ┊ GC (min … max): 5.74% … 5.15%
Time (median): 147.890 ms ┊ GC (median): 5.27%
Time (mean ± σ): 147.876 ms ± 3.114 ms ┊ GC (mean ± σ): 5.23% ± 0.95%
▃ ▃ ▃ ▃ ██ ▃
▇▁▁▁▁▁▁▁▁▁▇▁▁▁▁▁▁▁█▁▁▁▁█▇▇▇█▁▁█▇▁██▇█▁▇▁▇▇▁▇▇▁▇▁▇▇▇▁▁▁▁▇▁▁▁▁▇ ▁
139 ms Histogram: frequency by time 155 ms <
Memory estimate: 114.80 MiB, allocs estimate: 6.
The operation can be also done in DataFrames.jl, here is an example how to do it:
julia> function helper2(x, i)
nonempty = false
local fv
for v in x
vv = v[i]
if !ismissing(vv)
if nonempty
vv == fv || return false
else
fv = vv
nonempty = true
end
end
end
return true
end
helper2 (generic function with 1 method)
julia> df = DataFrame(mat, :auto, copycols=false); # copycols to avoid copying as data is large
julia> @benchmark helper2.(Ref(identity.(eachcol($df))), 1:nrow($df))
BenchmarkTools.Trial: 46 samples with 1 evaluation.
Range (min … max): 105.265 ms … 123.345 ms ┊ GC (min … max): 0.00% … 0.00%
Time (median): 110.682 ms ┊ GC (median): 0.00%
Time (mean ± σ): 110.581 ms ± 2.692 ms ┊ GC (mean ± σ): 0.00% ± 0.00%
▄ ▂ ▄█▂
▄▁▁▄▄▁▁▆▄▁▁▁▆▆█▄█▆███▄▄▁█▄▁▁▁▁▁▁▁▁▁▁▁▄▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▄ ▁
105 ms Histogram: frequency by time 123 ms <
Memory estimate: 385.28 KiB, allocs estimate: 15.
(if anything in the code is not clear please let me know and I can explain)
CodePudding user response:
Do you have about 350MB of RAM available? If so, you can try this function
rowequal <- function(x) {
undetermined <- function(x, can_del) {
if (length(can_del) < 1L)
return(x)
x[-can_del]
}
if (ncol(x) < 1L)
return(logical())
out <- logical(nrow(x))
if (ncol(x) < 2L)
return(!out)
x1 <- x[[1L]]
need_compare <- undetermined(seq_len(nrow(x)), which(x1 != x[[2L]]))
x1[nas] <- x[[2L]][nas <- which(is.na(x1))]
if (ncol(x) > 2L) {
for (j in 3:ncol(x)) {
need_compare <- undetermined(
need_compare, which(x1[need_compare] != x[[j]][need_compare])
)
x1[nas] <- x[[j]][nas <- which(is.na(x1))]
if (length(need_compare) < 1L)
return(out)
}
}
`[<-`(out, need_compare, TRUE)
}
Below is the benchmark
> dim(d)
[1] 3000000 300
> bench::mark(f(d), rowequal(d), iterations = 10)
# A tibble: 2 x 13
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
<bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <lis> <lis>
1 f(d) 2.97s 2.98s 0.335 34.4MB 0 10 0 29.8s <lgl ~ <Rpro~ <ben~ <tib~
2 rowequal(d) 88.52ms 93.34ms 10.7 352.2MB 0 10 0 932.5ms <lgl ~ <Rpro~ <ben~ <tib~
, where f (I got this from this post) and d are
f <- function(x) {
v1 = do.call(pmin, c(x, na.rm = TRUE))
v2 = do.call(pmax, c(x, na.rm = TRUE))
v1 == v2
}
mkDT <- function(rows, cols, type = as.integer) {
data.table::setDT(
replicate(cols, sample(type(c(1:10, NA)), rows, replace = TRUE), simplify = FALSE)
)
}
d <- mkDT(3e6, 300)
An Rcpp version of the code. It could achieve its best performance (in terms of both memory usage and speed) if you can ensure that all the columns in your dataframe are of the numeric type. I guess this is the most efficient solution to your problem (in R).
rowequalcpp <- function(x) {
if (ncol(x) < 1L)
return(logical())
out <- logical(nrow(x))
if (ncol(x) < 2L)
return(!out)
mark_equal(out, x)
out
}
Rcpp::sourceCpp(code = '
#include <Rcpp.h>
// [[Rcpp::export]]
void mark_equal(Rcpp::LogicalVector& res, const Rcpp::DataFrame& df) {
Rcpp::NumericVector x1 = df[0];
int n = df.nrows();
int need_compare[n];
for (int i = 0; i < n; i)
need_compare[i] = i;
for (int j = 1; j < df.length(); j) {
Rcpp::NumericVector dfj = df[j];
for (int i = 0; i < n; i) {
if (Rcpp::NumericVector::is_na(x1[i]))
x1[i] = dfj[i];
if (!Rcpp::NumericVector::is_na(dfj[i]) && x1[i] != dfj[i]) {
int tmp = need_compare[i];
need_compare[i] = need_compare[n - 1];
need_compare[n-- - 1] = tmp;
}
}
if (n < 1)
return;
}
for (int i = 0; i < n; i)
res[need_compare[i]] = TRUE;
}
')
Benchmark (the type of your columns are crucial for the performance):
> d <- mkDT(3000000, 300, as.integer)
> bench::mark(rowequal(d), rowequalcpp(d), iterations = 10)
# A tibble: 2 x 13
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
<bch:expr> <bch:t> <bch:> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <lis> <lis>
1 rowequal(d) 86.7ms 101ms 9.84 329MB 2.46 8 2 812.8ms <lgl ~ <Rpro~ <ben~ <tib~
2 rowequalcpp(d) 169.4ms 176ms 5.61 607MB 0.623 9 1 1.6s <lgl ~ <Rpro~ <ben~ <tib~
> d <- mkDT(3000000, 300, as.numeric)
> bench::mark(rowequal(d), rowequalcpp(d), iterations = 10)
# A tibble: 2 x 13
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time
<bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <lis>
1 rowequal(d) 100.2ms 114.6ms 7.88 349.4MB 3.38 7 3 889ms <lgl [~ <Rprofm~ <ben~
2 rowequalcpp(d) 21.3ms 21.5ms 46.0 11.4MB 0 10 0 217ms <lgl [~ <Rprofm~ <ben~
# ... with 1 more variable: gc <list>
