Converting for loop to function and applying to list of dataframes-CodePudding

I previously created a for loop to identify periods when water level decreased. This worked fine for smaller continuous time series data.

library(tidyverse)
library(lubridate)

level_data <- c(10:4, 20:9, 16:5, rep(0, 3))           
times_stamp <- seq(ymd_hms('2015-07-22 12:15:00'), ymd_hms('2015-07-22 20:30:00'), by = '15 mins')
precip_data <- c(rep(0, 10), 1:4, rep(0, 10), 1:5, rep(0, 5))

maxP.neg <- .1

# Create objects for holding the start and end dates. These lists should end up 
# the same length, so that each start date has a corresponding end date.

startDates <- c()                                     
endDates <- c()

recede <- 0                                           # This is a switch to keep track of whether a recession period is in progress

for (i in 2:length(level_data)) {                     # i.e. start at the second data point
 diffQ <- level_data[i] - level_data[i - 1]          # Calculate difference between current and previous timestamp
 
 if (diffQ < 0 &&                                    # If difference is negative (i.e. receding) AND
     recede == 0 &&                                  # a recession period has not already begun (recede == 0) AND
     precip_data <= maxP.neg) {                      # min. dry period criteria is met ...
   startDates <- append(startDates, times_stamp[i])  # Record the start time of the recession period
   recede <- 1                                       # Change recede to 1 to indicate a recession period has begun
   
 } else if (diffQ >= 0 &&                            # If the difference becomes positive 
            recede == 1) {                           # and a recession period was in progress...
   endDates <- append(endDates, times_stamp[i - 1])  # Record the previous timestamp as the end date of the recession     
   recede <- 0                                       # Set recede back to 0 to show the recession period has ended                             
   
 } else {                                            # Otherwise just continue to the next data point.
   next                                     
 }
}

However, I want to perform the same analysis over a larger dataset where there are time breaks. I thought to split the data into a list of dataframes and use lapply and a custom function.

This is what I have come up with but I am not getting the same output as the for loop method?

diff_Q <- diff(level_data)

date_time1 <-
  seq(ymd_hms('2015-07-22 12:15:00'),
      ymd_hms('2015-07-22 20:15:00'),
      by = '15 mins')
date_time2 <-
  seq(ymd_hms('2015-07-25 08:00:00'),
      ymd_hms('2015-07-25 16:00:00'),
      by = '15 mins')

cum_precip <- c(rep(0, 10), 1:4, rep(0, 10), 1:5, rep(0, 4))

df1 <-
  data.frame(date_time1, diff_Q, cum_precip) %>% rename(date_time = date_time1)
df2 <-
  data.frame(date_time2, diff_Q, cum_precip) %>%  rename(date_time = date_time2)

recede_ls <- list(df1, df2)

startDates2 <- c()
endDates2 <- c()

RA.function <- function(x) {
  recede <- 0
  
  if (diff_Q < 0 &&
      recede == 0 &&
      cum_precip <= maxP.neg) {
    startDates2 <- x$date_time
    recede <- 1
  } else if (diffQ >= 0 &&
             recede == 1) {
    endDates2 <- x$date_time[-1]
    recede <-
      0
    
  } else {
    next
  }
}

lapply(recede_ls, RA.function)

Thanks for your help!

CodePudding user response：

library(tidyverse)
library(lubridate)

level_data <- c(10:4, 20:9, 16:5, rep(0, 3))           
times_stamp <- seq(ymd_hms('2015-07-22 12:15:00'), ymd_hms('2015-07-22 20:30:00'), by = '15 mins')
precip_data <- c(rep(0, 10), 1:4, rep(0, 10), 1:5, rep(0, 5))

df <- data.frame(date_time = times_stamp,
                 level_data = level_data, 
                 cum_precip = precip_data)

dfs <- list(df, df)

RA.function <- function(x) {
  
  startDates2 <- c()
  endDates2 <- c()
  recede <- 0
  maxP.neg <- 0.1
  
  for(i in 2:nrow(df)){
    diffQ <- level_data[i] - level_data[i - 1]
    if (diffQ < 0 && recede == 0 && x[i,"cum_precip"] <= maxP.neg) {
      startDates2 <- c(startDates2, paste(x[i, "date_time"]))
      recede <- 1
    } else if (diffQ >= 0 && recede == 1) {
      endDates2 <- c(endDates2, paste(x[i-1,"date_time"]))
      recede <- 0
    }
  }
  return(cbind(startDates2, endDates2))
}

lapply(dfs, RA.function)
#> [[1]]
#>      startDates2           endDates2            
#> [1,] "2015-07-22 12:30:00" "2015-07-22 13:45:00"
#> [2,] "2015-07-22 14:15:00" "2015-07-22 16:45:00"
#> [3,] "2015-07-22 17:15:00" "2015-07-22 20:00:00"
#> 
#> [[2]]
#>      startDates2           endDates2            
#> [1,] "2015-07-22 12:30:00" "2015-07-22 13:45:00"
#> [2,] "2015-07-22 14:15:00" "2015-07-22 16:45:00"
#> [3,] "2015-07-22 17:15:00" "2015-07-22 20:00:00"
Created on 2022-01-06 by the reprex package (v2.0.1)

Is this what you are looking for? Let me know if you have any questions.

CodePudding user response：

Consider avoiding loops and use more set-based approaches such as merge with the data frame onto itself but one row back using a pseudo id based on nrow.

# BIND VECTORS INTO DATA FRAME
df <- data.frame(
    level_data = c(10:4, 20:9, 16:5, rep(0, 3))           
    times_stamp = seq(
        as.POSIXct('2015-07-22 12:15:00'), 
        as.POSIXct('2015-07-22 20:30:00'), 
        by = '15 mins'
    )
    precip_data = c(rep(0, 10), 1:4, rep(0, 10), 1:5, rep(0, 5))
)

# ADD id COLUMN FOR MERGING
df <- transform(df, id = 1:nrow(df))

# SELF-MERGE
df <- merge(
  df,
  transform(df, id = id - 1),
  by = "id",
  all.x = TRUE,
  suffix = c("_prev", "_curr")
)

# ASSIGN COLUMNS FOR LEVEL DIFFERENCES AND RECEDE GROUPS
df <- within(
  df, {
    diffQ <- level_data_curr - level_data_prev    # Calculate difference        
    recede_grp <- cumsum(diffQ >= 0)              # Create recede groupings
    recede_grp <- ifelse(diffQ >= 0, recede_grp-1, recede_grp)
  }
)

Then, extract needed values which can involve an apply method for groupwise aggregation, tapply. (Last values may be ignored given the zero-zero end cut-off comparison.)

startDates_new <- as.POSIXct(
  tapply(df$times_stamp_curr, df$recede_grp, min), 
  origin="1970-01-01"
)
endDates_new <- as.POSIXct(tapply(
  df$times_stamp_prev, df$recede_grp, max), 
  origin="1970-01-01"
)

startDates_new
#                         0                         1                         2                         3 
# "2015-07-22 12:30:00 CDT" "2015-07-22 14:15:00 CDT" "2015-07-22 17:15:00 CDT" "2015-07-22 20:30:00 CDT" 

endDates_new
#                         0                         1                         2                         3 
# "2015-07-22 13:45:00 CDT" "2015-07-22 16:45:00 CDT" "2015-07-22 20:00:00 CDT" "2015-07-22 20:15:00 CDT"