Select previous and next N rows based on the condition for current row-CodePudding

I construct the following panel data with keys id and time:

pdata <- tibble(
  id = rep(1:10, each = 5),
  time = rep(2016:2020, times = 10),
  value = c(c(1,1,1,0,0), c(1,1,0,0,0), c(0,0,1,0,0), c(0,0,0,0,0), c(1,0,0,0,1), c(0,1,1,1,0), c(0,1,1,1,1), c(1,1,1,1,1), c(1,0,1,1,1), c(1,1,0,1,1))
)
pdata
# A tibble: 50 × 3
      id  time value
   <int> <int> <dbl>
 1     1  2016     1
 2     1  2017     1
 3     1  2018     1
 4     1  2019     0
 5     1  2020     0
 6     2  2016     1
 7     2  2017     1
 8     2  2018     0
 9     2  2019     0
10     2  2020     0
# … with 40 more rows

Let's assume a shock happened in 2018. I wish to slice pairs of previous and next N rows by id that have the same value as the shock rows' value.

I take several examples for illustration. For id == 5, the dataset looks like:

pdata %>% filter(id == 5)
# A tibble: 5 × 3
     id  time value
  <int> <int> <dbl>
1     5  2016     1
2     5  2017     0
3     5  2018     0
4     5  2019     0
5     5  2020     1

The value in 2018 for id == 5 is 0, and I wish to keep the previous and next 1 row including the current row because all these observations have the same value that equals 0.

# A tibble: 3 × 3
     id  time value
  <int> <int> <dbl>
1     5  2017     0
2     5  2018     0
3     5  2019     0

For id == 8, I wish to get:

# A tibble: 5 × 3
     id  time value
  <int> <int> <dbl>
1     8  2016     1
2     8  2017     1
3     8  2018     1
4     8  2019     1
5     8  2020     1

For id == 1, I wish to get the empty dataset, since the pair of the observation in 2017 and the observation in 2019 does not have the same value.

The final dataset should be:

# A tibble: 19 × 3
      id  time value
   <int> <int> <dbl>
 1     4  2016     0
 2     4  2017     0
 3     4  2018     0
 4     4  2019     0
 5     4  2020     0
 6     5  2017     0
 7     5  2018     0
 8     5  2019     0
 9     6  2017     1
10     6  2018     1
11     6  2019     1
12     7  2017     1
13     7  2018     1
14     7  2019     1
15     8  2016     1
16     8  2017     1
17     8  2018     1
18     8  2019     1
19     8  2020     1

CodePudding user response：

A solution with data.table:

# load the package & convert data to a data.table
library(data.table)
setDT(pdata)

# define shock-year and number of previous/next rows
shock <- 2018
n <- 2

# filter
pdata[, .SD[value == value[time == shock] &
              between(time, shock - n, shock   n) & 
              value == rev(value)][.N > 1 & all(diff(time) == 1)]
      , by = id]

which gives:

    id time value
 1:  4 2016     0
 2:  4 2017     0
 3:  4 2018     0
 4:  4 2019     0
 5:  4 2020     0
 6:  5 2017     0
 7:  5 2018     0
 8:  5 2019     0
 9:  6 2017     1
10:  6 2018     1
11:  6 2019     1
12:  7 2017     1
13:  7 2018     1
14:  7 2019     1
15:  8 2016     1
16:  8 2017     1
17:  8 2018     1
18:  8 2019     1
19:  8 2020     1

Used data:

pdata <- data.frame(
  id = rep(1:10, each = 5),
  time = rep(2016:2020, times = 10),
  value = c(c(1,1,1,0,0), c(1,1,0,0,0), c(0,0,1,0,0), c(0,0,0,0,0), c(1,0,0,0,1), c(0,1,1,1,0), c(0,1,1,1,1), c(1,1,1,1,1), c(1,0,1,1,1), c(1,1,0,1,1))
)

CodePudding user response：

Within each 'id' (by = id), use rleid to create a grouping variable 'r' based on runs of equal values. Within each 'id' and run (by = .(id, r)), check if both previous and next year from the focal year (e.g. 2018) are present (sum(time %in% c(yr-1, yr 1)) == 2). If so, select the current group (.SD).

library(data.table)
setDT(pdata)
yr = 2018
pdata[ , r := rleid(value), by = id]
pdata[ , if(sum(time %in% c(yr-1, yr 1)) == 2) .SD, by = .(id, r)]

    id r time value
 1:  4 1 2016     0
 2:  4 1 2017     0
 3:  4 1 2018     0
 4:  4 1 2019     0
 5:  4 1 2020     0
 6:  5 2 2017     0
 7:  5 2 2018     0
 8:  5 2 2019     0
 9:  6 2 2017     1
10:  6 2 2018     1
11:  6 2 2019     1
12:  7 2 2017     1
13:  7 2 2018     1
14:  7 2 2019     1
15:  7 2 2020     1
16:  8 1 2016     1
17:  8 1 2017     1
18:  8 1 2018     1
19:  8 1 2019     1
20:  8 1 2020     1

CodePudding user response：

As far as I understood, here's a dplyr suggestion:

library(dplyr)

MyF <- function(id2, shock, nb_row) {
  values <- pdata %>%
    filter(id == id2) %>%
    pull(value)
  
  if (length(unique(values)) == 1) {
    pdata %>%
      filter(id == id2)
  } else {
    pdata %>%
      filter(id == id2) %>%
      filter(time >= shock - nb_row & time <= shock   nb_row) %>%
      filter(length(unique(value)) == 1)
  }
  
  
}

map_df(pdata %>%
         select(id) %>% 
         distinct() %>% 
         pull(),
       MyF,
       shock = 2018, nb_row = 1)

## Or map_df(1:8,MyF,shock = 2018, nb_row = 1)

Output:

# A tibble: 19 x 3
      id  time value
   <int> <int> <dbl>
 1     4  2016     0
 2     4  2017     0
 3     4  2018     0
 4     4  2019     0
 5     4  2020     0
 6     5  2017     0
 7     5  2018     0
 8     5  2019     0
 9     6  2017     1
10     6  2018     1
11     6  2019     1
12     7  2017     1
13     7  2018     1
14     7  2019     1
15     8  2016     1
16     8  2017     1
17     8  2018     1
18     8  2019     1
19     8  2020     1

CodePudding user response：

Here's another dplyr solution. We basically group by sequences of unique values for each id and then just filter around the maximum distance to the shock time that is duplicated.

pdata %>%
  group_by(id) %>%
  mutate(value_group = cumsum(value != lag(value, default = value[1]))) %>%
  group_by(id, value_group) %>%
  mutate(shock_diff = abs(time - 2018)) %>%
  filter(shock_diff <= max(shock_diff[duplicated(shock_diff)], -Inf))
#> # A tibble: 19 × 5
#> # Groups:   id, value_group [5]
#>       id  time value value_group shock_diff
#>    <int> <int> <dbl>       <int>      <dbl>
#>  1     4  2016     0           0          2
#>  2     4  2017     0           0          1
#>  3     4  2018     0           0          0
#>  4     4  2019     0           0          1
#>  5     4  2020     0           0          2
#>  6     5  2017     0           1          1
#>  7     5  2018     0           1          0
#>  8     5  2019     0           1          1
#>  9     6  2017     1           1          1
#> 10     6  2018     1           1          0
#> 11     6  2019     1           1          1
#> 12     7  2017     1           1          1
#> 13     7  2018     1           1          0
#> 14     7  2019     1           1          1
#> 15     8  2016     1           0          2
#> 16     8  2017     1           0          1
#> 17     8  2018     1           0          0
#> 18     8  2019     1           0          1
#> 19     8  2020     1           0          2

CodePudding user response：

A dplyr solution inspired by Jaap's great solution:

refTime = 2018
n = 2

library(dplyr)
pdata %>% 
  group_by(id) %>% 
  filter(value == value[time==refTime] & 
           between(time, refTime-n, refTime n) &
           value == rev(value)) %>% 
  filter(n() > 1 & all(diff(time) == 1))

Output:

# A tibble: 19 x 3
# Groups:   id [5]
      id  time value
   <int> <int> <dbl>
 1     4  2016     0
 2     4  2017     0
 3     4  2018     0
 4     4  2019     0
 5     4  2020     0
 6     5  2017     0
 7     5  2018     0
 8     5  2019     0
 9     6  2017     1
10     6  2018     1
11     6  2019     1
12     7  2017     1
13     7  2018     1
14     7  2019     1
15     8  2016     1
16     8  2017     1
17     8  2018     1
18     8  2019     1
19     8  2020     1

CodePudding user response：

One way to solve your problem using data.table:

library(data.table)

yrs=2017:2019
setDT(pdata)[, if(uniqueN(value)==1) .(time, value) 
               else if(uniqueN(value <- value[time %in% yrs])==1) .(time=yrs, value), 
             by=id]

#        id  time value
#  1:     4  2016     0
#  2:     4  2017     0
#  3:     4  2018     0
#  4:     4  2019     0
#  5:     4  2020     0
#  6:     5  2017     0
#  7:     5  2018     0
#  8:     5  2019     0
#  9:     6  2017     1
# 10:     6  2018     1
# 11:     6  2019     1
# 12:     7  2017     1
# 13:     7  2018     1
# 14:     7  2019     1
# 15:     8  2016     1
# 16:     8  2017     1
# 17:     8  2018     1
# 18:     8  2019     1
# 19:     8  2020     1