I would like to change data.table instead of dplyr in dplyr::summarize(dplyr::across(ends_with("PV"), median),.groups = 'drop'). Also, I would like to know if you think the processing time is faster by data.table than by dplyr?
library(dplyr)
library(tidyr)
library(lubridate)
#database
df1 <- data.frame( Id = rep(1:5, length=100000),
date1 = as.Date( "2021-12-01"),
date2= rep(seq( as.Date("2021-01-01"), length.out=50000, by=1), each = 2),
Category = rep(c("ABC", "EFG"), length.out = 100000),
Week = rep(c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday",
"Saturday", "Sunday"), length.out = 100000),
DR1 = sample( 200:250, 100000, repl=TRUE),
setNames( replicate(365, { sample(0:100000, 100000)}, simplify=FALSE),
paste0("DRM", formatC(1:365, width = 2, format = "d", flag = "0"))))
subsetDRM<- df1 %>% select(starts_with("DRM"))
DR1_subsetDRM<-cbind (df1, setNames(df1$DR1 - subsetDRM, paste0(names(subsetDRM), "_PV")))
subset_PV<-select(DR1_subsetDRM,Id, date2,Week, Category, DR1, ends_with("PV"))
result_median<-subset_PV %>%
group_by(Id,Category,Week) %>%
dplyr::summarize(dplyr::across(ends_with("PV"), median),.groups = 'drop')
CodePudding user response:
I think akrun's answer provides a great expression-for-expression translation. If you don't need the steps replicated, though, you can try this:
library(data.table)
dt1 <- as.data.table(df1)
cols <- grep("^DRM", colnames(dt1), value = TRUE)
dt1_results_median <-
dt1[, (paste0(cols, "_PV")) := DR1 - .SD, .SDcols = cols
][, lapply(.SD, median), by = .(Id, Category, Week), .SDcols = paste0(cols, "_PV") ]
Relative performance, showing a small improvement (33-41%) in both answers:
bench::mark(OP = {...}, akrun = {...}, r2evans = {...}, check = FALSE, iterations = 10)
# Warning: Some expressions had a GC in every iteration; so filtering is disabled.
# # A tibble: 3 x 13
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
# 1 OP 1.61s 1.72s 0.583 2.01GB 3.15 10 54 17.2s <NULL> <Rprofmem~ <benc~ <tibbl~
# 2 akrun 1.24s 1.29s 0.773 2.29GB 1.47 10 19 12.9s <NULL> <Rprofmem~ <benc~ <tibbl~
# 3 r2evans 1.19s 1.21s 0.823 1.88GB 1.65 10 20 12.2s <NULL> <Rprofmem~ <benc~ <tibbl~
I recognize that akrun's answer may be intended more as a teaching moment, translating from one dialect of R to another, so the speed difference is "moot". At this comparative run-time, I think it's more important to go with the code that is more readable and understandable, which makes it more maintainable and easier to troubleshoot if/when you need to change your methods.
CodePudding user response:
If we want a direct translation to data.table, perhaps below code would help
library(data.table)
# convert data.frame to data.table - setDT
setDT(df1)
# subset the columns of DRM to create subsetDRM
subsetDRM <- df1[, .SD, .SDcols = patterns("^DRM")]
# subtract the subsetDRM from df1
DR1_subsetDRM <- cbind(df1, setNames(df1$DR1 -
subsetDRM, paste0(names(subsetDRM), "_PV")))
pv_nm1 <- names(DR1_subsetDRM)[endsWith(names(DR1_subsetDRM), "PV")]
nm1 <- c("Id", "date2", "Week", "Category", "DR1", pv_nm1)
# create the subset_PV
subset_PV <- DR1_subsetDRM[, .SD, .SDcols = nm1]
# do a group by median
result_median <- subset_PV[, lapply(.SD, median),
by = .(Id, Category, Week), .SDcols = pv_nm1]
