When analyzing data using R, I need to compute row-wise summary statistics (mean, min, max, standard deviation) for columns that have similar names (e.g., the same prefix). While I can achieve it using package such as matrixStats, I wonder if there is a more elegant way to do it using dplyr. Attached below is the code for generating a sample dataset and my solution for computing the row-wise summary statistics. Thanks!
######SAMPEL CODE#####
library("tidyverse")
library("matrixStats")
# create a sample dataset
sample_data <- data.frame(matrix(nrow = 10, ncol = 6))
names(sample_data) <- c("ID", "score_1", "score_2", "score_3", "score_4", "score_5")
sample_data <- sample_data %>%
mutate(ID = seq(1:10),
score_1 = round(runif(10, 1, 5)),
score_2 = round(runif(10, 1, 5)),
score_3 = round(runif(10, 1, 5)),
score_4 = round(runif(10, 1, 5)),
score_5 = round(runif(10, 1, 5)))
score_columns <- grep("score_", names(sample_data))
sample_data <- sample_data %>%
mutate(mean_score = rowMeans(select(., starts_with("score_")), na.rm = TRUE),
max_score = rowMaxs(as.matrix(sample_data[,c(score_columns)]), na.rm = TRUE),
sd_score = rowSds(as.matrix(sample_data[,c(score_columns)]), na.rm = TRUE))
CodePudding user response:
You can calculate these statistics 'by row' using dplyr row-wise operations combined with the c_across function, e.g.
library(tidyverse)
library(matrixStats)
#>
#> Attaching package: 'matrixStats'
#> The following object is masked from 'package:dplyr':
#>
#> count
# create a sample dataset
sample_data <- data.frame(matrix(nrow = 10, ncol = 6))
names(sample_data) <- c("ID", "score_1", "score_2", "score_3", "score_4", "score_5")
sample_data <- sample_data %>%
mutate(ID = seq(1:10),
score_1 = round(runif(10, 1, 5)),
score_2 = round(runif(10, 1, 5)),
score_3 = round(runif(10, 1, 5)),
score_4 = round(runif(10, 1, 5)),
score_5 = round(runif(10, 1, 5)))
score_columns <- grep("score_", names(sample_data))
output <- sample_data %>%
mutate(mean_score = rowMeans(select(., starts_with("score_")), na.rm = TRUE),
max_score = rowMaxs(as.matrix(sample_data[,c(score_columns)]), na.rm = TRUE),
sd_score = rowSds(as.matrix(sample_data[,c(score_columns)]), na.rm = TRUE))
output
#> ID score_1 score_2 score_3 score_4 score_5 mean_score max_score sd_score
#> 1 1 1 5 4 3 2 3.0 5 1.5811388
#> 2 2 3 5 1 5 2 3.2 5 1.7888544
#> 3 3 2 5 3 2 4 3.2 5 1.3038405
#> 4 4 3 4 3 3 5 3.6 5 0.8944272
#> 5 5 2 3 2 2 3 2.4 3 0.5477226
#> 6 6 4 2 4 3 2 3.0 4 1.0000000
#> 7 7 2 2 1 3 1 1.8 3 0.8366600
#> 8 8 3 4 1 3 4 3.0 4 1.2247449
#> 9 9 3 2 3 4 2 2.8 4 0.8366600
#> 10 10 5 2 3 3 4 3.4 5 1.1401754
output2 <- sample_data %>%
rowwise() %>%
mutate(mean_score = mean(c_across(starts_with("score_"))),
max_score = max(c_across(!!score_columns)),
sd_score = sd(c_across(!!score_columns)))
output2
#> # A tibble: 10 × 9
#> # Rowwise:
#> ID score_1 score_2 score_3 score_4 score_5 mean_score max_score sd_score
#> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 1 5 4 3 2 3 5 1.58
#> 2 2 3 5 1 5 2 3.2 5 1.79
#> 3 3 2 5 3 2 4 3.2 5 1.30
#> 4 4 3 4 3 3 5 3.6 5 0.894
#> 5 5 2 3 2 2 3 2.4 3 0.548
#> 6 6 4 2 4 3 2 3 4 1
#> 7 7 2 2 1 3 1 1.8 3 0.837
#> 8 8 3 4 1 3 4 3 4 1.22
#> 9 9 3 2 3 4 2 2.8 4 0.837
#> 10 10 5 2 3 3 4 3.4 5 1.14
Created on 2022-01-31 by the reprex package (v2.0.1)
As this is a dplyr-specific solution, you can incorporate further data manipulations (e.g. filter() to select specific rows) where other solutions will fail or require awkward workarounds, e.g.
library(tidyverse)
library(matrixStats)
#>
#> Attaching package: 'matrixStats'
#> The following object is masked from 'package:dplyr':
#>
#> count
# create a sample dataset
sample_data <- data.frame(matrix(nrow = 10, ncol = 6))
names(sample_data) <- c("ID", "score_1", "score_2", "score_3", "score_4", "score_5")
sample_data <- sample_data %>%
mutate(ID = seq(1:10),
score_1 = round(runif(10, 1, 5)),
score_2 = round(runif(10, 1, 5)),
score_3 = round(runif(10, 1, 5)),
score_4 = round(runif(10, 1, 5)),
score_5 = round(runif(10, 1, 5)))
score_columns <- grep("score_", names(sample_data))
output <- sample_data %>%
filter(ID < 6) %>%
mutate(mean_score = rowMeans(select(., starts_with("score_")), na.rm = TRUE),
max_score = rowMaxs(as.matrix(sample_data[,c(score_columns)]), na.rm = TRUE),
sd_score = rowSds(as.matrix(sample_data[,c(score_columns)]), na.rm = TRUE))
#> Error: Problem with `mutate()` column `max_score`.
#> ℹ `max_score = rowMaxs(as.matrix(sample_data[, c(score_columns)]), na.rm = TRUE)`.
#> ℹ `max_score` must be size 5 or 1, not 10.
output2 <- sample_data %>%
filter(ID < 6) %>%
rowwise() %>%
mutate(mean_score = mean(c_across(starts_with("score_"))),
max_score = max(c_across(!!score_columns)),
sd_score = sd(c_across(!!score_columns)))
output2
#> # A tibble: 5 × 9
#> # Rowwise:
#> ID score_1 score_2 score_3 score_4 score_5 mean_score max_score sd_score
#> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 5 2 5 5 1 3.6 5 1.95
#> 2 2 1 2 4 1 2 2 4 1.22
#> 3 3 1 2 2 4 5 2.8 5 1.64
#> 4 4 5 4 3 3 5 4 5 1
#> 5 5 4 2 2 5 4 3.4 5 1.34
Created on 2022-01-31 by the reprex package (v2.0.1)
CodePudding user response:
A slightly different way I used to answer your question:
library("tidyverse")
library("matrixStats")
# create a sample dataset
sample_data <- data.frame(matrix(nrow = 10, ncol = 6))
names(sample_data) <- c("ID", "score_1", "score_2", "score_3", "score_4", "score_5")
sample_data <- sample_data %>%
mutate(ID = seq(1:10),
score_1 = round(runif(10, 1, 5)),
score_2 = round(runif(10, 1, 5)),
score_3 = round(runif(10, 1, 5)),
score_4 = round(runif(10, 1, 5)),
score_5 = round(runif(10, 1, 5)))
score_columns <- colnames(sample_data)[grep("score_", names(sample_data))]
sample_data<- sample_data %>%
rowwise(ID) %>%
mutate(mean_score = mean(c_across(score_columns[1]:length(score_columns))),
max_score = max(c_across(score_columns[1]:length(score_columns))),
sd_score = sd(c_across(score_columns[1]:length(score_columns))))
If you want an even easier way using base functions, try the following:
score_columns <- grep("score_", names(sample_data))
sample_data['mean_score'] <- apply(sample_data[,score_columns], 1, mean)
sample_data['max_score'] <- apply(sample_data[,score_columns], 1, max)
sample_data['sd_score'] <- apply(sample_data[,score_columns], 1, sd)
