I have the following dataset where I have replicates of the same ID entry (example: 20103 is repeated twice):
head(Data)
## X ID DOB sector meters Oct Res_FROM Res_TO Exp_FROM Exp_TO
## 1 1 20100 8/24/1979 H38 6400 W 8/15/1979 5/15/1991 8/24/1979 12/31/1988
## 2 2 20101 5/5/1980 B01 1600 NW 5/15/1980 4/15/1991 5/15/1980 12/31/1988
## 3 3 20102 3/17/1979 H04 1600 SW 6/15/1972 8/15/1979 3/17/1979 8/15/1979
## 4 4 20103 11/30/1981 B09 3200 NE 1/15/1982 1/15/1984 1/15/1982 1/15/1984
## 5 5 20103 11/30/1981 B37 8000 N 1/15/1984 4/15/1986 1/15/1984 4/15/1986
## 6 6 20104 9/1/1978 B09 3200 NE 1/15/1982 1/15/1984 1/15/1982 1/15/1984
## Exps_Grp Yr1952 Yr1953 Yr1954 Yr1955 Yr1956 Yr1957 Yr1958 Yr1959 Yr1960
## 1 NA NA NA NA NA NA NA NA NA NA
## 2 NA NA NA NA NA NA NA NA NA NA
## 3 NA NA NA NA NA NA NA NA NA NA
## 4 NA NA NA NA NA NA NA NA NA NA
## 5 NA NA NA NA NA NA NA NA NA NA
## 6 NA NA NA NA NA NA NA NA NA NA
## Yr1961 Yr1962 Yr1963 Yr1964 Yr1965 Yr1966 Yr1967 Yr1968 Yr1969 Yr1970 Yr1971
## 1 NA NA NA NA NA NA NA NA NA NA NA
## 2 NA NA NA NA NA NA NA NA NA NA NA
## 3 NA NA NA NA NA NA NA NA NA NA NA
## 4 NA NA NA NA NA NA NA NA NA NA NA
## 5 NA NA NA NA NA NA NA NA NA NA NA
## 6 NA NA NA NA NA NA NA NA NA NA NA
## Yr1972 Yr1973 Yr1974 Yr1975 Yr1976 Yr1977 Yr1978 Yr1979 Yr1980 Yr1981
## 1 NA NA NA NA NA NA NA 1.082616 0.7834218 0.7834218
## 2 NA NA NA NA NA NA NA NA 0.6825884 1.0937646
## 3 NA NA NA NA NA NA NA 4.673775 NA NA
## 4 NA NA NA NA NA NA NA NA NA NA
## 5 NA NA NA NA NA NA NA NA NA NA
## 6 NA NA NA NA NA NA NA NA NA NA
## Yr1982 Yr1983 Yr1984 Yr1985 Yr1986 Yr1987 Yr1988
## 1 0.7834218 0.7834218 0.7834218 0.7834218 0.7834218 0.7834218 0.1956091
## 2 1.0937646 1.0937646 1.0937646 1.0937646 1.0937646 1.0937646 0.2730972
## 3 NA NA NA NA NA NA NA
## 4 2.7934596 2.8975827 0.1041230 NA NA NA NA
## 5 NA NA 0.5662659 0.5890579 0.1416258 NA NA
## 6 2.7934596 2.8975827 0.1041230 NA NA NA NA
## Yrs_Exp arth_mean median cumulative age
## 1 9.3616438 0.7545599 0.7834218 7.545599 9
## 2 8.6356164 0.9568931 1.0937646 8.612038 8
## 3 0.4136986 4.6737751 4.6737751 4.673775 0
## 4 2.0000000 1.9317218 2.7934596 5.795165 2
## 5 2.2493151 0.4323165 0.5662659 1.296950 4
## 6 2.0000000 1.9317218 2.7934596 5.795165 5
My full data has over 14,000 rows. I want to eliminate the duplicate ID entries; however, when I do this elimination, I need to keep the row that has the highest age. For example, the two rows with ID = 20103, the first age is 2 and the other age is 4. I would only want to keep the row with the highest age, which would be 4. I've used the aggregate function before, but I don't know how to apply it to only eliminate duplicates and keep the row with the highest number for age. I have included a reproducible dataset below.
structure(list(X = 1:9, ID = c(20100L, 20101L, 20102L, 20103L,
20103L, 20104L, 20104L, 20105L, 20105L), DOB = c("8/24/1979",
"5/5/1980", "3/17/1979", "11/30/1981", "11/30/1981", "9/1/1978",
"9/1/1978", "12/3/1980", "12/3/1980"), sector = c("H38", "B01",
"H04", "B09", "B37", "B09", "B37", "B09", "B09"), meters = c(6400L,
1600L, 1600L, 3200L, 8000L, 3200L, 8000L, 3200L, 3200L), Oct = c("W",
"NW", "SW", "NE", "N", "NE", "N", "NE", "NE"), Res_FROM = c("8/15/1979",
"5/15/1980", "6/15/1972", "1/15/1982", "1/15/1984", "1/15/1982",
"1/15/1984", "12/15/1980", "8/15/1983"), Res_TO = c("5/15/1991",
"4/15/1991", "8/15/1979", "1/15/1984", "4/15/1986", "1/15/1984",
"4/15/1986", "8/15/1983", "3/15/1991"), Exp_FROM = c("8/24/1979",
"5/15/1980", "3/17/1979", "1/15/1982", "1/15/1984", "1/15/1982",
"1/15/1984", "12/15/1980", "8/15/1983"), Exp_TO = c("12/31/1988",
"12/31/1988", "8/15/1979", "1/15/1984", "4/15/1986", "1/15/1984",
"4/15/1986", "8/15/1983", "12/31/1988"), Exps_Grp = c(NA, NA,
NA, NA, NA, NA, NA, NA, NA), Yr1952 = c(NA, NA, NA, NA, NA, NA,
NA, NA, NA), Yr1953 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA),
Yr1954 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA), Yr1955 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA), Yr1956 = c(NA, NA, NA, NA,
NA, NA, NA, NA, NA), Yr1957 = c(NA, NA, NA, NA, NA, NA, NA,
NA, NA), Yr1958 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA),
Yr1959 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA), Yr1960 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA), Yr1961 = c(NA, NA, NA, NA,
NA, NA, NA, NA, NA), Yr1962 = c(NA, NA, NA, NA, NA, NA, NA,
NA, NA), Yr1963 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA),
Yr1964 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA), Yr1965 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA), Yr1966 = c(NA, NA, NA, NA,
NA, NA, NA, NA, NA), Yr1967 = c(NA, NA, NA, NA, NA, NA, NA,
NA, NA), Yr1968 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA),
Yr1969 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA), Yr1970 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA), Yr1971 = c(NA, NA, NA, NA,
NA, NA, NA, NA, NA), Yr1972 = c(NA, NA, NA, NA, NA, NA, NA,
NA, NA), Yr1973 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA),
Yr1974 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA), Yr1975 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA), Yr1976 = c(NA, NA, NA, NA,
NA, NA, NA, NA, NA), Yr1977 = c(NA, NA, NA, NA, NA, NA, NA,
NA, NA), Yr1978 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA),
Yr1979 = c(1.082616155, NA, 4.673775148, NA, NA, NA, NA,
NA, NA), Yr1980 = c(0.783421772, 0.682588398, NA, NA, NA,
NA, NA, 0.120085751, NA), Yr1981 = c(0.783421772, 1.093764595,
NA, NA, NA, NA, NA, 2.897582683, NA), Yr1982 = c(0.783421772,
1.093764595, NA, 2.793459642, NA, 2.793459642, NA, 2.897582683,
NA), Yr1983 = c(0.783421772, 1.093764595, NA, 2.897582683,
NA, 2.897582683, NA, 1.805844233, 1.09173845), Yr1984 = c(0.783421772,
1.093764595, NA, 0.104123041, 0.566265934, 0.104123041, 0.566265934,
NA, 2.897582683), Yr1985 = c(0.783421772, 1.093764595, NA,
NA, 0.589057923, NA, 0.589057923, NA, 2.897582683), Yr1986 = c(0.783421772,
1.093764595, NA, NA, 0.141625765, NA, 0.141625765, NA, 2.897582683
), Yr1987 = c(0.783421772, 1.093764595, NA, NA, NA, NA, NA,
NA, 2.897582683), Yr1988 = c(0.1956091, 0.27309722, NA, NA,
NA, NA, NA, NA, 0.723484539), Yrs_Exp = c(9.361643836, 8.635616438,
0.41369863, 2, 2.249315068, 2, 2.249315068, 2.665753425,
5.383561644), arth_mean = c(0.754559943, 0.956893087, 4.673775148,
1.931721789, 0.432316541, 1.931721789, 0.432316541, 1.930273838,
2.234258954), median = c(0.783421772, 1.093764595, 4.673775148,
2.793459642, 0.566265934, 2.793459642, 0.566265934, 2.351713458,
2.897582683), cumulative = c(7.545599433, 8.612037782, 4.673775148,
5.795165366, 1.296949622, 5.795165366, 1.296949622, 7.72109535,
13.40555372), age = c(9L, 8L, 0L, 2L, 4L, 5L, 7L, 2L, 8L)), class = "data.frame", row.names = c(NA,
-9L))
CodePudding user response:
Simply arrange by in decreasing order and then use distinct:
library(dplyr)
df %>%
select(ID, age) %>%
arrange(desc(age)) %>%
distinct(ID, .keep_all = TRUE)
#> ID age
#> 1 20100 9
#> 2 20101 8
#> 3 20105 8
#> 4 20104 7
#> 5 20103 4
#> 6 20102 0
Note that, I have only selected the two columns of interest ID and age just for concise output and to focus on the issue given here, using select(), which obviously you don't need to do.
CodePudding user response:
Option using aggregate:
aggregate(age~ID,df,function(x) max(x))
#> ID age
#> 1 20100 9
#> 2 20101 8
#> 3 20102 0
#> 4 20103 4
#> 5 20104 7
#> 6 20105 8
Created on 2022-07-10 by the reprex package (v2.0.1)
CodePudding user response:
Here is another dplyr way, grouping and filtering by condition has the same effect:
library(dplyr)
df %>%
select(ID, age) %>% # just to keep the main columns (remove this)
group_by(ID) %>%
filter(age==max(age)) %>%
ungroup()
ID age
<int> <int>
1 20100 9
2 20101 8
3 20102 0
4 20103 4
5 20104 7
6 20105 8
CodePudding user response:
In base R
df_new <- df[order(df$age, decreasing = T), ]
df_new <- df_new[!duplicated(df_new$ID), ]
output of ID and age columns
> df_new[ , c("ID","age")]
ID age
1 20100 9
2 20101 8
9 20105 8
7 20104 7
5 20103 4
3 20102 0
