Data.table instead of dplyr-CodePudding

I would like to change data.table instead of dplyr in dplyr::summarize(dplyr::across(ends_with("PV"), median),.groups = 'drop'). Also, I would like to know if you think the processing time is faster by data.table than by dplyr?

library(dplyr)
library(tidyr)
library(lubridate)

#database
df1 <- data.frame( Id = rep(1:5, length=100000),
                   date1 =  as.Date( "2021-12-01"),
                   date2= rep(seq( as.Date("2021-01-01"), length.out=50000, by=1), each = 2),
                   Category = rep(c("ABC", "EFG"), length.out = 100000),
                   Week = rep(c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday",
                                "Saturday", "Sunday"), length.out = 100000),
                   DR1 = sample( 200:250, 100000, repl=TRUE),  
                   setNames( replicate(365, { sample(0:100000, 100000)}, simplify=FALSE),
                             paste0("DRM", formatC(1:365, width = 2, format = "d", flag = "0"))))

subsetDRM<-  df1 %>% select(starts_with("DRM")) 

DR1_subsetDRM<-cbind (df1, setNames(df1$DR1 - subsetDRM, paste0(names(subsetDRM), "_PV"))) 

subset_PV<-select(DR1_subsetDRM,Id, date2,Week, Category, DR1, ends_with("PV")) 

result_median<-subset_PV %>%
  group_by(Id,Category,Week) %>%
  dplyr::summarize(dplyr::across(ends_with("PV"), median),.groups = 'drop')

CodePudding user response：

I think akrun's answer provides a great expression-for-expression translation. If you don't need the steps replicated, though, you can try this:

library(data.table)
dt1 <- as.data.table(df1)
cols <- grep("^DRM", colnames(dt1), value = TRUE)
dt1_results_median <- 
  dt1[, (paste0(cols, "_PV")) := DR1 - .SD, .SDcols = cols
    ][, lapply(.SD, median), by = .(Id, Category, Week), .SDcols = paste0(cols, "_PV") ]

Relative performance, showing a small improvement (33-41%) in both answers:

bench::mark(OP = {...}, akrun = {...}, r2evans = {...}, check = FALSE, iterations = 10)
# Warning: Some expressions had a GC in every iteration; so filtering is disabled.
# # A tibble: 3 x 13
#   expression      min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result memory     time   gc     
#   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list> <list>     <list> <list> 
# 1 OP            1.61s    1.72s     0.583    2.01GB     3.15    10    54      17.2s <NULL> <Rprofmem~ <benc~ <tibbl~
# 2 akrun         1.24s    1.29s     0.773    2.29GB     1.47    10    19      12.9s <NULL> <Rprofmem~ <benc~ <tibbl~
# 3 r2evans       1.19s    1.21s     0.823    1.88GB     1.65    10    20      12.2s <NULL> <Rprofmem~ <benc~ <tibbl~

I recognize that akrun's answer may be intended more as a teaching moment, translating from one dialect of R to another, so the speed difference is "moot". At this comparative run-time, I think it's more important to go with the code that is more readable and understandable, which makes it more maintainable and easier to troubleshoot if/when you need to change your methods.

CodePudding user response：

If we want a direct translation to data.table, perhaps below code would help

library(data.table)
# convert data.frame to data.table - setDT
setDT(df1)

# subset the columns of DRM to create subsetDRM
subsetDRM <- df1[, .SD, .SDcols = patterns("^DRM")]
# subtract the subsetDRM from df1
DR1_subsetDRM <- cbind(df1, setNames(df1$DR1 - 
               subsetDRM, paste0(names(subsetDRM), "_PV"))) 
pv_nm1 <- names(DR1_subsetDRM)[endsWith(names(DR1_subsetDRM), "PV")]
nm1 <- c("Id", "date2", "Week", "Category", "DR1", pv_nm1)
# create the subset_PV
subset_PV <- DR1_subsetDRM[, .SD, .SDcols = nm1]
# do a group by median
result_median <- subset_PV[, lapply(.SD, median),
         by =  .(Id, Category, Week), .SDcols = pv_nm1]