R reshape() extremely slow-CodePudding

I need to perform a simple reshape of data from long to wide, and this needs to work in base R. For this use case, reshape() seems to be extraordinarily slow (despite assertions that it is very fast https://stackoverflow.com/a/12073077/3017280). This example is a reasonable approximation of my data. I know that in this example I do not need both Index columns, but I do in the real data. On my laptop 10,000 rows takes 3 seconds, and 40,000 rows takes over 200 seconds. The real data has over one million rows, so reshape() is obviously a non-starter. Can anyone shed any light on why it takes so long in this case? I worked around the problem using split / lapply / Reduce merge, which is clumsy but very much quicker.

n <- 5000
dfLong <- data.frame(Index1 = rep(sample(1E6:2E6, n), 4),
                  Index2 = rep(sample(3E6:4E6, n), 4),
                  Key = rep(1:4, each = n),
                  Date = sample(seq.Date(as.Date("2020-01-01"),
                                         as.Date("2021-12-31"), 
                                         by = "1 day"),
                                size = n * 4, replace = TRUE),
                  Score = sample(0:48, n * 4, replace = TRUE))
                                
system.time(dfWide <- reshape(data = dfLong,
          v.names = c("Date", "Score"),
          timevar = "Key",
          idvar = c("Index1", "Index2"),
          sep = "_Q",
          direction = "wide"))

CodePudding user response：

I don't know that I've ever made the claim that stats::reshape is the fastest.

For comparisons, stats::reshape is not as fast on my i9/64GB-ram system:

system.time(
dfWide <- reshape(data = dfLong,
          v.names = c("Date", "Score"),
          timevar = "Key",
          idvar = c("Index1", "Index2"),
          sep = "_Q",
          direction = "wide")
)
#    user  system elapsed 
#   19.63    0.03   19.73

But other reshaping functions do much better:

system.time(
  tidyrWide <- pivot_wider(
    dfLong, c("Index1", "Index2"),
    names_prefix = "Q", names_from = "Key",
    values_from = c("Date", "Score"))
)
#    user  system elapsed 
#    0.01    0.00    0.02 

nms <- names(dfWide)
tidyrWide <- subset(tidyrWide, select = nms) # column order
dfOrder <- do.call(order, dfWide)
tidyrOrder <- do.call(order, tidyrWide)
all.equal(dfWide[dfOrder,], as.data.frame(tidyrWide)[tidyrOrder,], check.attributes = FALSE)
# [1] TRUE

Similarly, data.table::dcast is equally fast:

dtLong <- as.data.table(dfLong)
system.time(
  dtWide <- data.table::dcast(
    Index1   Index2 ~ paste0("Q", Key),
    data = dtLong, value.var = c("Date", "Score"))
)
#    user  system elapsed 
#    0.00    0.01    0.02 

dtWide <- subset(dtWide, select = nms) # column order
dtOrder <- do.call(order, dtWide)
all.equal(dfWide[dfOrder,nms], as.data.frame(dtWide)[dtOrder,nms], check.attributes = FALSE)
# [1] TRUE

CodePudding user response：

If you look at what functions reshape calls with the profvis package, you can see that almost all of the total time spent is on this one line in the function. The interaction function is used as a pairing function to combine your two id columns into a single column.

data[, tempidname] <- interaction(data[, idvar], 
                drop = TRUE)

Rather than interaction, you could use plyr:::ninteraction. The only non-base dependency of this function is plyr:::id_var, which has no dependencies, meaning if you can't install packages you can just copy-paste this function definition pretty easily (adding a comment giving credit).

So, what we can do is create an environment with interaction equal to a copy-pasted version of plyr:::ninteraction and then run reshape in that environment so that the line above uses the faster function we've copied from plyr.

new_reshape <- function(...){
  # interaction = plyr:::ninteraction
  # id_var = plyr:::id_var
  interaction <- 
    function (.variables, drop = FALSE) 
    {
        lengths <- vapply(.variables, length, integer(1))
        .variables <- .variables[lengths != 0]
        if (length(.variables) == 0) {
            n <- nrow(.variables) %||% 0L
            return(structure(seq_len(n), n = n))
        }
        if (length(.variables) == 1) {
            return(id_var(.variables[[1]], drop = drop))
        }
        ids <- rev(lapply(.variables, id_var, drop = drop))
        p <- length(ids)
        ndistinct <- vapply(ids, attr, "n", FUN.VALUE = numeric(1), 
            USE.NAMES = FALSE)
        n <- prod(ndistinct)
        if (n > 2^31) {
            char_id <- do.call("paste", c(ids, sep = "\r"))
            res <- match(char_id, unique(char_id))
        }
        else {
            combs <- c(1, cumprod(ndistinct[-p]))
            mat <- do.call("cbind", ids)
            res <- c((mat - 1L) %*% combs   1L)
        }
        attr(res, "n") <- n
        if (drop) {
            id_var(res, drop = TRUE)
        }
        else {
            structure(as.integer(res), n = attr(res, "n"))
        }
    }  
  id_var <- 
  function (x, drop = FALSE) 
  {
      if (length(x) == 0) 
          return(structure(integer(), n = 0L))
      if (!is.null(attr(x, "n")) && !drop) 
          return(x)
      if (is.factor(x) && !drop) {
          x <- addNA(x, ifany = TRUE)
          id <- as.integer(x)
          n <- length(levels(x))
      }
      else {
          levels <- sort(unique(x), na.last = TRUE)
          id <- match(x, levels)
          n <- max(id)
      }
      structure(id, n = n)
  }
  environment(reshape) <- environment()
  reshape(...)
}

Now it's much faster

system.time(dfWide <- reshape(data = dfLong,
          v.names = c("Date", "Score"),
          timevar = "Key",
          idvar = c("Index1", "Index2"),
          sep = "_Q",
          direction = "wide"))
 #   user  system elapsed 
 # 35.292   0.538  36.236 

system.time(new_dfWide <- new_reshape(data = dfLong,
          v.names = c("Date", "Score"),
          timevar = "Key",
          idvar = c("Index1", "Index2"),
          sep = "_Q",
          direction = "wide"))

  #  user  system elapsed 
  # 0.015   0.000   0.015 

all.equal(new_dfWide, dfWide)
# [1] TRUE

CodePudding user response：

Consider an advanced modified version of @Moody_Mudskipper's matrix_spread, using base R. Since matrix will simplify complex types like Date, some adhoc changes will be required:

Function

matrix_spread <- function(df1, id, key, value, sep){
  unique_ids <-  unique(df1[[key]])
  mats <- lapply(df1[value], function(x) 
    matrix(x, ncol=length(unique_ids), byrow = FALSE)
  )
  df2 <- do.call(
    data.frame, list(unique(df1[id]), mats)
  )
  
  # RENAME COLS
  names(df2)[(length(id) 1):ncol(df2)] <- as.vector(
    sapply(value, function(x, y) paste0(x, sep, y), unique_ids)
  )
  # REORDER COLS
  df2 <- df2[c(id, as.vector(
    outer(c(value), unique_ids, function(x, y) paste0(x, sep, y))
  ))]
  
  return(df2)
}

Application

system.time(
  dfWide2 <- matrix_spread(
    df1 = dfLong, 
    id = c("Index1", "Index2"),
    key = "Key",
    value = c("Date", "Score"),
    sep = "_Q"
  )
)
#  user  system elapsed 
# 0.022   0.000   0.023 

# CONVERT INTEGERS TO DATES
dfWide2[grep("Date", names(dfWide2))] <- lapply(
  dfWide2[grep("Date", names(dfWide2))],
  as.Date,
  origin = "1970-01-01"
)

# REPLICATES OP'S reshape
identical(data.frame(dfWide), dfWide2)
# [1] TRUE