I have a dataset with sites and dates of sampling. I am trying to keep only the most recent August sample at each site, but if there is no August sample I would want to simply keep the most recent sample (whether it is before or after August). The months range from June-September.
I'm not sure how I can ask r to decide which row to keep based on what is in the other rows.
I've tried option a, which is based on answers to questions on keeping the most recent sample:
att <- att %>%
group_by(siteID) %>%
arrange(desc(SAMPLE_DATE)) %>%
slice(1:1)
but that doesn't necessarily keep the most recent August sample.
And option b which is a little messy but I think closer to what I want:
# get duplicates (multiple samples of same lake)
n_occur <- data.frame(table(att$siteID))
dupes <- att[att$siteID %in% n_occur$Var1[n_occur$Freq > 1],]
# keep most recent august sample
m_att <- att %>%
mutate(keep=case_when(
siteID %in% dupes$siteID & month(SAMPLE_DATE) != 9 & !is.na(`CHLOROPHYLL A_OW_TOTAL`) ~ "yes",
!(siteID %in% dupes$siteID) ~ "yes")) %>%
arrange(siteID,desc(SAMPLE_DATE)) %>%
filter(keep=="yes")
m_occur <- data.frame(table(m_att$siteID))
m_att <- m_att[!duplicated(m_att$siteID),]
att <- m_att %>%
distinct()
But this missed some sites and I ended up losing some sites in the filter.
For example, here's a chunk of my data:
SAMPLE_DATE siteID
1 2020-07-22 LAK20_NY-10001
2 <NA> LAK20_NY-10002
3 <NA> LAK20_NY-10003
4 <NA> LAK20_NY-10004
5 <NA> LAK20_NY-10005
6 <NA> LAK20_NY-10006
7 2020-09-01 LAK20_NY-10007
8 <NA> LAK20_NY-10008
9 <NA> LAK20_NY-10009
10 <NA> LAK20_NY-10010
11 2020-07-07 LAK20_NY-10011
12 2020-07-09 LAK20_NY-10012
13 2020-08-03 LAK20_NY-10013
14 2020-09-01 LAK20_NY-10014
15 <NA> LAK20_NY-10015
16 2020-09-20 LAK20_NY-10016
17 2020-08-09 LAK20_NY-10016
18 2020-07-26 LAK20_NY-10016
19 2020-08-23 LAK20_NY-10016
20 2020-06-14 LAK20_NY-10016
21 2020-09-08 LAK20_NY-10016
22 2020-06-28 LAK20_NY-10016
23 2020-07-12 LAK20_NY-10016
24 2020-09-02 LAK20_NY-10016
25 2020-07-22 LAK20_NY-10017
26 2020-09-30 LAK20_NY-10018
27 <NA> LAK20_NY-10019
28 2020-07-08 LAK20_NY-10020
29 2020-08-04 LAK20_NY-10021
30 <NA> LAK20_NY-10022
31 <NA> LAK20_NY-10023
32 2020-07-07 LAK20_NY-10024
33 2021-09-21 LAK20_NY-10025
34 2021-07-27 LAK20_NY-10025
35 <NA> LAK20_NY-10026
36 2020-06-20 LAK20_NY-10027
37 2020-08-02 LAK20_NY-10027
38 2020-06-07 LAK20_NY-10027
39 2020-08-15 LAK20_NY-10027
40 2020-09-06 LAK20_NY-10027
41 2020-07-18 LAK20_NY-10027
42 2020-08-31 LAK20_NY-10027
43 2020-08-30 LAK20_NY-10027
44 2020-06-21 LAK20_NY-10027
45 2020-09-12 LAK20_NY-10027
46 2020-08-16 LAK20_NY-10027
47 2021-09-13 LAK20_NY-10027
48 2020-09-20 LAK20_NY-10027
49 2020-07-19 LAK20_NY-10027
50 2020-07-05 LAK20_NY-10027
51 2020-08-01 LAK20_NY-10027
Option a would keep the wrong sample for LAK20_NY-10016 (I would want to keep the sample on 2020-08-09, not 2020-09-30). Option b would work for that lake but for LAK20_NY-10025, option b wouldn't include either of the sampling events when I would want to keep the sample on 2021-09-21.
CodePudding user response:
Here's one solution where you annotate the month and whether a particular lake has a reading from August and then filter with a condition on this value.
library(tidyverse)
library(lubridate)
att <- structure(list(SAMPLE_DATE = c("2020-07-22", "<NA>", "<NA>", "<NA>", "<NA>", "<NA>", "2020-09-01", "<NA>", "<NA>", "<NA>", "2020-07-07", "2020-07-09", "2020-08-03", "2020-09-01", "<NA>", "2020-09-20", "2020-08-09", "2020-07-26", "2020-08-23", "2020-06-14", "2020-09-08", "2020-06-28", "2020-07-12", "2020-09-02", "2020-07-22", "2020-09-30", "<NA>", "2020-07-08", "2020-08-04", "<NA>", "<NA>", "2020-07-07", "2021-09-21", "2021-07-27", "<NA>", "2020-06-20", "2020-08-02", "2020-06-07", "2020-08-15", "2020-09-06", "2020-07-18", "2020-08-31", "2020-08-30", "2020-06-21", "2020-09-12", "2020-08-16", "2021-09-13", "2020-09-20", "2020-07-19", "2020-07-05", "2020-08-01"), siteID = c("LAK20_NY-10001", "LAK20_NY-10002", "LAK20_NY-10003", "LAK20_NY-10004", "LAK20_NY-10005", "LAK20_NY-10006", "LAK20_NY-10007", "LAK20_NY-10008", "LAK20_NY-10009", "LAK20_NY-10010", "LAK20_NY-10011", "LAK20_NY-10012", "LAK20_NY-10013", "LAK20_NY-10014", "LAK20_NY-10015", "LAK20_NY-10016", "LAK20_NY-10016", "LAK20_NY-10016", "LAK20_NY-10016", "LAK20_NY-10016", "LAK20_NY-10016", "LAK20_NY-10016", "LAK20_NY-10016", "LAK20_NY-10016", "LAK20_NY-10017", "LAK20_NY-10018", "LAK20_NY-10019", "LAK20_NY-10020", "LAK20_NY-10021", "LAK20_NY-10022", "LAK20_NY-10023", "LAK20_NY-10024", "LAK20_NY-10025", "LAK20_NY-10025", "LAK20_NY-10026", "LAK20_NY-10027", "LAK20_NY-10027", "LAK20_NY-10027", "LAK20_NY-10027", "LAK20_NY-10027", "LAK20_NY-10027", "LAK20_NY-10027", "LAK20_NY-10027", "LAK20_NY-10027", "LAK20_NY-10027", "LAK20_NY-10027", "LAK20_NY-10027", "LAK20_NY-10027", "LAK20_NY-10027", "LAK20_NY-10027", "LAK20_NY-10027")), row.names = c(NA, -51L), class = c("tbl_df", "tbl", "data.frame"))
att %>%
mutate(SAMPLE_DATE = ymd(SAMPLE_DATE)) %>% # convert to date
mutate(month = month(SAMPLE_DATE)) %>% # extract month from date
group_by(siteID) %>%
mutate(has_aug = 8 %in% month) %>% # annotate as having date in august
filter(has_aug & month == 8 |
!has_aug) %>% # filter only august dates from sites with or all from those without
slice_max(n = 1,
order_by = SAMPLE_DATE,
with_ties = F) %>% # change with_ties to TRUE to discard NA values
select(SAMPLE_DATE, siteID)
#> # A tibble: 27 x 2
#> # Groups: siteID [27]
#> SAMPLE_DATE siteID
#> <date> <chr>
#> 1 2020-07-22 LAK20_NY-10001
#> 2 NA LAK20_NY-10002
#> 3 NA LAK20_NY-10003
#> 4 NA LAK20_NY-10004
#> 5 NA LAK20_NY-10005
#> 6 NA LAK20_NY-10006
#> 7 2020-09-01 LAK20_NY-10007
#> 8 NA LAK20_NY-10008
#> 9 NA LAK20_NY-10009
#> 10 NA LAK20_NY-10010
#> # ... with 17 more rows
Created on 2022-02-03 by the reprex package (v2.0.1)
