I am working with the R programming language.
I have the following dataset ("my_data"):
structure(list(idd = 1:50, group_1 = c("B", "B", "A", "B", "B",
"A", "A", "A", "B", "A", "A", "B", "B", "B", "A", "A", "A", "A",
"B", "B", "A", "B", "A", "B", "A", "B", "B", "A", "B", "B", "B",
"A", "B", "A", "B", "B", "A", "A", "A", "A", "A", "B", "B", "B",
"A", "B", "B", "B", "B", "B"), v1 = c(15.7296737049317, -4.33377704672207,
-0.551850185265, 2.66888122578048, 12.109072642513, 0.0107927293899017,
20.7785032320562, -1.98974382507874, 12.1663703518471, 11.4308702978893,
-0.657500910529805, 5.71376589298221, 3.43820523228653, 19.5939432685761,
25.5605263610222, -0.407964337882465, 19.3057240854025, 9.24554068987809,
-9.6719534905096, 2.44096357354807, 14.6114916050676, 11.4510663104787,
-14.4231132108142, 15.8031868545157, 16.5505199848675, 6.95491162740581,
2.92431767382703, 29.7157201447823, 9.10001319352251, 9.85982748068076,
-1.23456937110154, -3.44130123376206, -5.23155771062088, 5.78031789617826,
23.6092446408098, 27.5379484533487, 25.6836473435279, 22.9675556994775,
7.62403748556388, -2.24150135680706, 6.72187319859928, -14.1245027627225,
6.8620712655661, 26.5987870464572, 11.3095310060752, 20.9588868268958,
14.8934095694391, 2.21089704551347, 27.4355935292935, 9.21612714668934
), group_2 = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L)), row.names = c(NA, -50L), class = "data.frame")
head(my_data)
idd group_1 v1 group_2
1 1 B 15.72967370 1
2 2 B -4.33377705 2
3 3 A -0.55185019 3
4 4 B 2.66888123 4
5 5 B 12.10907264 5
6 6 A 0.01079273 6
7 7 A 20.77850323 7
8 8 A -1.98974383 8
9 9 B 12.16637035 9
10 10 A 11.43087030 10
11 11 A -0.65750091 1
12 12 B 5.71376589 2
For this dataset, I want to perform the following steps in "dplyr":
- For each grouping of 10 rows, find the sum of "v1" for group_1 = "A" and group_2 = "B"
- For each of these groupings, create a new variable ("v2") that is : "A" if sum(group_1 = A) > sum(group_1 = B), "B" if sum(group_1 = A) < sum(group_1 = B) or "0" if sum(group_1 = A) = sum(group_1 = B)
I know how to do this manually in R:
#STEP 1: since my_data has 50 rows, break my_data into 5 groups of 10 rows
rows_1 = my_data[1:10,]
rows_2 = my_data[11:20,]
rows_3 = my_data[21:30,]
rows_4 = my_data[31:40,]
rows_5 = my_data[41:50,]
# STEP 2: find out values of "v2"
library(dplyr)
dplyr_row_1 = data.frame(rows_1 %>% group_by(group_1) %>% summarize(sum = sum(v1)))
dplyr_row_1$v2 = ifelse(dplyr_row_1[1,2] > dplyr_row_1[2,2], "A", ifelse(dplyr_row_1[1,2] < dplyr_row_1[2,2], "B", 0))
dplyr_row_2 = data.frame(rows_2 %>% group_by(group_1) %>% summarize(sum = sum(v1)))
dplyr_row_2$v2 = ifelse(dplyr_row_2[1,2] > dplyr_row_2[2,2], "A", ifelse(dplyr_row_2[1,2] < dplyr_row_2[2,2], "B", 0))
dplyr_row_3 = data.frame(rows_3 %>% group_by(group_1) %>% summarize(sum = sum(v1)))
dplyr_row_3$v2 = ifelse(dplyr_row_3[1,2] > dplyr_row_3[2,2], "A", ifelse(dplyr_row_3[1,2] < dplyr_row_3[2,2], "B", 0))
dplyr_row_4 = data.frame(rows_4 %>% group_by(group_1) %>% summarize(sum = sum(v1)))
dplyr_row_4$v2 = ifelse(dplyr_row_4[1,2] > dplyr_row_4[2,2], "A", ifelse(dplyr_row_4[1,2] < dplyr_row_4[2,2], "B", 0))
dplyr_row_5 = data.frame(rows_5 %>% group_by(group_1) %>% summarize(sum = sum(v1)))
dplyr_row_5$v2 = ifelse(dplyr_row_5[1,2] > dplyr_row_5[2,2], "A", ifelse(dplyr_row_5[1,2] < dplyr_row_5[2,2], "B", 0))
# STEP 3: append "v2" to first 5 files:
rows_1$v2 = dplyr_row_1$v2
rows_2$v2 = dplyr_row_2$v2
rows_3$v2 = dplyr_row_3$v2
rows_4$v2 = dplyr_row_4$v2
rows_5$v2 = dplyr_row_5$v2
# STEP 4: create final file:
final_file = rbind(rows_1,rows_2, rows_3, rows_4, rows_5)
As a result, the final file looks something like this:
idd group_1 v1 group_2 v2
1 1 B 15.72967370 1 B
2 2 B -4.33377705 2 B
3 3 A -0.55185019 3 B
4 4 B 2.66888123 4 B
5 5 B 12.10907264 5 B
6 6 A 0.01079273 6 B
7 7 A 20.77850323 7 B
8 8 A -1.98974383 8 B
9 9 B 12.16637035 9 B
10 10 A 11.43087030 10 B
11 11 A -0.65750091 1 A
My Question: Can someone please show me how to perform Steps 1 to Step 4 in a single "dplyr" command?
Thanks!
CodePudding user response:
- First I'll create a
group_indexto group every 10 rows together. - Then
group_bythe relevant columns and calculate sum. - Remove the grouping layer of
group_1, since we need to compare the values inAandB. - If the unique length of sum is equal to "1", that means they are the same, then input "0" in column
v2. If they are not the same, output the maximum category stored ingroup_1. - Finally remove the
sumcolumn and sort byidd.
This method is able to solve problem with more than two groups in group_1.
The first 20 rows are shown here for example.
library(tidyverse)
df %>%
mutate(group_index = rep(1:(nrow(df)/10), each = 10)) %>%
group_by(group_index, group_1) %>%
mutate(sum = sum(v1)) %>%
group_by(group_index) %>%
mutate(v2 = ifelse(length(unique(sum)) == 1, 0, group_1[which.max(sum)])) %>%
ungroup() %>%
select(-c(sum, group_index))
# A tibble: 20 x 5
idd group_1 v1 group_2 v2
<int> <chr> <dbl> <int> <chr>
1 1 B 15.7 1 B
2 2 B -4.33 2 B
3 3 A -0.552 3 B
4 4 B 2.67 4 B
5 5 B 12.1 5 B
6 6 A 0.0108 6 B
7 7 A 20.8 7 B
8 8 A -1.99 8 B
9 9 B 12.2 9 B
10 10 A 11.4 10 B
11 11 A -0.658 1 A
12 12 B 5.71 2 A
13 13 B 3.44 3 A
14 14 B 19.6 4 A
15 15 A 25.6 5 A
16 16 A -0.408 6 A
17 17 A 19.3 7 A
18 18 A 9.25 8 A
19 19 B -9.67 9 A
20 20 B 2.44 10 A
CodePudding user response:
Here is alternative method.
library(tidyverse)
df %>%
mutate(group_index = rep(1:(nrow(df)/10), each = 10)) %>%
group_by(group_index) %>%
mutate(
v2 =
if_else(sum(v1[group_1 == 'A']) > sum(v1[group_1 == 'B']), 'A',
if_else(sum(v1[group_1 == 'A']) < sum(v1[group_1 == 'B']), 'B', '0'))
)
