I have the following DF
head(sample_data)
article value date
1 A 21920 2015
2 I 615 2017
3 B 1414 2018
4 D 102 2018
5 I 1096 2015
6 A 2577 2021
Full dataset
dput(sample_data)
structure(list(article = c("A", "I", "B", "D", "I", "A", "C",
"C", "D", "H", "B", "I", "A", "G", "E", "G", "D", "A", "D", "B",
"A", "C", "D", "F", "G", "D", "G", "C", "E", "E", "G", "G", "A",
"A", "E", "H", "B", "E", "E", "B", "B", "A", "H", "A", "B", "G",
"D", "C", "E", "A"), value = c(21920, 615, 1414, 102, 1096, 2577,
840, 311, 804, 695, 3863, 279, 7324, 299, 311, 133, 759, 5386,
5396, 11051, 14708, 856, 1749, 2212, 318, 3478, 415, 781, 227,
248, 122, 185, 1344, 15442, 248, 433, 5068, 38, 165, 369, 805,
18944, 264, 11716, 4274, 442, 2530, 827, 164, 18506), date = c("2015",
"2017", "2018", "2018", "2015", "2021", "2016", "2021", "2017",
"2021", "2019", "2015", "2019", "2016", "2015", "2019", "2018",
"2020", "2017", "2015", "2015", "2016", "2015", "2015", "2021",
"2015", "2019", "2016", "2016", "2015", "2019", "2020", "2019",
"2016", "2016", "2015", "2015", "2021", "2021", "2020", "2020",
"2015", "2016", "2017", "2019", "2016", "2015", "2016", "2019",
"2016")), row.names = c(NA, -50L), class = "data.frame")
I'm trying to use dplyr to get something along the lines of this:
sample_data %>%
group_by(article, date) %>%
summarise(weight = sum(value))
`summarise()` has grouped output by 'article'. You can override using the `.groups` argument.
# A tibble: 29 x 3
# Groups: article [9]
article date weight
<chr> <chr> <dbl>
1 A 2015 55572
2 A 2016 33948
3 A 2017 11716
4 A 2019 8668
5 A 2020 5386
6 A 2021 2577
7 B 2015 16119
8 B 2018 1414
9 B 2019 8137
10 B 2020 1174
# ... with 19 more rows
However, I want to add another column with a proportion of each article's weight of the total (sum of A:I) per year. The sum of all article proportions should then amount to 1 for each year.
I tried the code below. I suspect this occurs because I use "value" that results in all values being printed, hence all occurrences. How can I summarise this so it looks like the one above with the added column?
sample_data %>%
group_by(article, date) %>%
summarise(weight = sum(value), prop = value/weight)
`summarise()` has grouped output by 'article', 'date'. You can override using the `.groups` argument.
# A tibble: 50 x 4
# Groups: article, date [29]
article date weight prop
<chr> <chr> <dbl> <dbl>
1 A 2015 55572 0.394
2 A 2015 55572 0.265
3 A 2015 55572 0.341
4 A 2016 33948 0.455
5 A 2016 33948 0.545
6 A 2017 11716 1
7 A 2019 8668 0.845
8 A 2019 8668 0.155
9 A 2020 5386 1
10 A 2021 2577 1
# ... with 40 more rows
CodePudding user response:
After the initial summarize, you have one entry for each article per year. You then wish to know what the contribution of each article was to each year's total, so you need to group_by again using just the year, and finally mutate to get the proportion for each article.
library(dplyr)
sample_data %>%
group_by(article, date) %>%
summarise(weight = sum(value), .groups = "keep") %>%
group_by(date) %>%
mutate(prop = weight / sum(weight))
#> # A tibble: 29 x 4
#> # Groups: date [7]
#> article date weight prop
#> <chr> <chr> <dbl> <dbl>
#> 1 A 2015 55572 0.661
#> 2 A 2016 33948 0.876
#> 3 A 2017 11716 0.632
#> 4 A 2019 8668 0.491
#> 5 A 2020 5386 0.799
#> 6 A 2021 2577 0.628
#> 7 B 2015 16119 0.192
#> 8 B 2018 1414 0.622
#> 9 B 2019 8137 0.461
#> 10 B 2020 1174 0.174
#> # ... with 19 more rows
Created on 2022-02-19 by the reprex package (v2.0.1)
CodePudding user response:
An option is also to have do the group by sum within first summarise
library(dplyr)
library(tibble)
library(tidyr)
sample_data %>%
group_by(date) %>%
summarise(out = enframe(tapply(value, article, sum)/sum(value),
name = 'article', value = 'prop'), .groups = 'drop') %>%
unpack(out)
# A tibble: 29 × 3
date article prop
<chr> <chr> <dbl>
1 2015 A 0.661
2 2015 B 0.192
3 2015 D 0.0923
4 2015 E 0.00665
5 2015 F 0.0263
6 2015 H 0.00515
7 2015 I 0.0164
8 2016 A 0.876
9 2016 C 0.0853
10 2016 E 0.0123
# … with 19 more rows
