How do I select the most recent sample UNLESS there is a sample in a certain month?-CodePudding

I have a dataset with sites and dates of sampling. I am trying to keep only the most recent August sample at each site, but if there is no August sample I would want to simply keep the most recent sample (whether it is before or after August). The months range from June-September.

I'm not sure how I can ask r to decide which row to keep based on what is in the other rows.

I've tried option a, which is based on answers to questions on keeping the most recent sample:

att <- att %>%
  group_by(siteID) %>% 
  arrange(desc(SAMPLE_DATE)) %>% 
  slice(1:1)

but that doesn't necessarily keep the most recent August sample.

And option b which is a little messy but I think closer to what I want:

# get duplicates (multiple samples of same lake)
n_occur <- data.frame(table(att$siteID))
dupes <- att[att$siteID %in% n_occur$Var1[n_occur$Freq > 1],]

# keep most recent august sample
m_att <- att %>%
  mutate(keep=case_when(
    siteID %in% dupes$siteID & month(SAMPLE_DATE) != 9 & !is.na(`CHLOROPHYLL A_OW_TOTAL`) ~ "yes",
    !(siteID %in% dupes$siteID) ~ "yes")) %>%
  arrange(siteID,desc(SAMPLE_DATE)) %>%
  filter(keep=="yes")

m_occur <- data.frame(table(m_att$siteID))
m_att <- m_att[!duplicated(m_att$siteID),]

att <- m_att %>%
  distinct()

But this missed some sites and I ended up losing some sites in the filter.

For example, here's a chunk of my data:

SAMPLE_DATE         siteID
1   2020-07-22 LAK20_NY-10001
2         <NA> LAK20_NY-10002
3         <NA> LAK20_NY-10003
4         <NA> LAK20_NY-10004
5         <NA> LAK20_NY-10005
6         <NA> LAK20_NY-10006
7   2020-09-01 LAK20_NY-10007
8         <NA> LAK20_NY-10008
9         <NA> LAK20_NY-10009
10        <NA> LAK20_NY-10010
11  2020-07-07 LAK20_NY-10011
12  2020-07-09 LAK20_NY-10012
13  2020-08-03 LAK20_NY-10013
14  2020-09-01 LAK20_NY-10014
15        <NA> LAK20_NY-10015
16  2020-09-20 LAK20_NY-10016
17  2020-08-09 LAK20_NY-10016
18  2020-07-26 LAK20_NY-10016
19  2020-08-23 LAK20_NY-10016
20  2020-06-14 LAK20_NY-10016
21  2020-09-08 LAK20_NY-10016
22  2020-06-28 LAK20_NY-10016
23  2020-07-12 LAK20_NY-10016
24  2020-09-02 LAK20_NY-10016
25  2020-07-22 LAK20_NY-10017
26  2020-09-30 LAK20_NY-10018
27        <NA> LAK20_NY-10019
28  2020-07-08 LAK20_NY-10020
29  2020-08-04 LAK20_NY-10021
30        <NA> LAK20_NY-10022
31        <NA> LAK20_NY-10023
32  2020-07-07 LAK20_NY-10024
33  2021-09-21 LAK20_NY-10025
34  2021-07-27 LAK20_NY-10025
35        <NA> LAK20_NY-10026
36  2020-06-20 LAK20_NY-10027
37  2020-08-02 LAK20_NY-10027
38  2020-06-07 LAK20_NY-10027
39  2020-08-15 LAK20_NY-10027
40  2020-09-06 LAK20_NY-10027
41  2020-07-18 LAK20_NY-10027
42  2020-08-31 LAK20_NY-10027
43  2020-08-30 LAK20_NY-10027
44  2020-06-21 LAK20_NY-10027
45  2020-09-12 LAK20_NY-10027
46  2020-08-16 LAK20_NY-10027
47  2021-09-13 LAK20_NY-10027
48  2020-09-20 LAK20_NY-10027
49  2020-07-19 LAK20_NY-10027
50  2020-07-05 LAK20_NY-10027
51  2020-08-01 LAK20_NY-10027

Option a would keep the wrong sample for LAK20_NY-10016 (I would want to keep the sample on 2020-08-09, not 2020-09-30). Option b would work for that lake but for LAK20_NY-10025, option b wouldn't include either of the sampling events when I would want to keep the sample on 2021-09-21.

CodePudding user response：

Here's one solution where you annotate the month and whether a particular lake has a reading from August and then filter with a condition on this value.

library(tidyverse)
library(lubridate)

att <- structure(list(SAMPLE_DATE = c("2020-07-22", "<NA>", "<NA>", "<NA>", "<NA>", "<NA>", "2020-09-01", "<NA>", "<NA>", "<NA>", "2020-07-07", "2020-07-09", "2020-08-03", "2020-09-01", "<NA>", "2020-09-20", "2020-08-09", "2020-07-26", "2020-08-23", "2020-06-14", "2020-09-08", "2020-06-28", "2020-07-12", "2020-09-02", "2020-07-22", "2020-09-30", "<NA>", "2020-07-08", "2020-08-04", "<NA>", "<NA>", "2020-07-07", "2021-09-21", "2021-07-27", "<NA>", "2020-06-20", "2020-08-02", "2020-06-07", "2020-08-15", "2020-09-06", "2020-07-18", "2020-08-31", "2020-08-30", "2020-06-21", "2020-09-12", "2020-08-16", "2021-09-13", "2020-09-20", "2020-07-19", "2020-07-05", "2020-08-01"), siteID = c("LAK20_NY-10001", "LAK20_NY-10002", "LAK20_NY-10003", "LAK20_NY-10004", "LAK20_NY-10005", "LAK20_NY-10006", "LAK20_NY-10007", "LAK20_NY-10008", "LAK20_NY-10009", "LAK20_NY-10010", "LAK20_NY-10011", "LAK20_NY-10012", "LAK20_NY-10013", "LAK20_NY-10014", "LAK20_NY-10015", "LAK20_NY-10016", "LAK20_NY-10016", "LAK20_NY-10016", "LAK20_NY-10016", "LAK20_NY-10016", "LAK20_NY-10016", "LAK20_NY-10016", "LAK20_NY-10016", "LAK20_NY-10016", "LAK20_NY-10017", "LAK20_NY-10018", "LAK20_NY-10019", "LAK20_NY-10020", "LAK20_NY-10021", "LAK20_NY-10022", "LAK20_NY-10023", "LAK20_NY-10024", "LAK20_NY-10025", "LAK20_NY-10025", "LAK20_NY-10026", "LAK20_NY-10027", "LAK20_NY-10027", "LAK20_NY-10027", "LAK20_NY-10027", "LAK20_NY-10027", "LAK20_NY-10027", "LAK20_NY-10027", "LAK20_NY-10027", "LAK20_NY-10027", "LAK20_NY-10027", "LAK20_NY-10027", "LAK20_NY-10027", "LAK20_NY-10027", "LAK20_NY-10027", "LAK20_NY-10027", "LAK20_NY-10027")), row.names = c(NA, -51L), class = c("tbl_df", "tbl", "data.frame"))

att %>%
  mutate(SAMPLE_DATE = ymd(SAMPLE_DATE)) %>% # convert to date
  mutate(month = month(SAMPLE_DATE)) %>% # extract month from date
  group_by(siteID) %>%
  mutate(has_aug = 8 %in% month) %>% # annotate as having date in august
  filter(has_aug & month == 8 |
           !has_aug) %>% # filter only august dates from sites with or all from those without
  slice_max(n = 1,
            order_by = SAMPLE_DATE,
            with_ties = F) %>% # change with_ties to TRUE to discard NA values
  select(SAMPLE_DATE, siteID)

#> # A tibble: 27 x 2
#> # Groups:   siteID [27]
#>    SAMPLE_DATE siteID        
#>    <date>      <chr>         
#>  1 2020-07-22  LAK20_NY-10001
#>  2 NA          LAK20_NY-10002
#>  3 NA          LAK20_NY-10003
#>  4 NA          LAK20_NY-10004
#>  5 NA          LAK20_NY-10005
#>  6 NA          LAK20_NY-10006
#>  7 2020-09-01  LAK20_NY-10007
#>  8 NA          LAK20_NY-10008
#>  9 NA          LAK20_NY-10009
#> 10 NA          LAK20_NY-10010
#> # ... with 17 more rows

^{Created on 2022-02-03 by the reprex package (v2.0.1)}