Say I have a matrix like the following, with marker values per id, 10 events per id (in this example):
set.seed(123)
mymat <- matrix(rnorm(300), nrow=30)
rownames(mymat) <- paste0('id',rep(1:3,each=10))
colnames(mymat) <- letters[1:10]
> head(mymat)
a b c d e f g h i j
id1 -0.56047565 0.4264642 0.3796395 0.9935039 0.1176466 0.7877388 -1.0633261 0.1192452 -0.7886220 0.8450130
id1 -0.23017749 -0.2950715 -0.5023235 0.5483970 -0.9474746 0.7690422 1.2631852 0.2436874 -0.5021987 0.9625280
id1 1.55870831 0.8951257 -0.3332074 0.2387317 -0.4905574 0.3322026 -0.3496504 1.2324759 1.4960607 0.6843094
id1 0.07050839 0.8781335 -1.0185754 -0.6279061 -0.2560922 -1.0083766 -0.8655129 -0.5160638 -1.1373036 -1.3952743
id1 0.12928774 0.8215811 -1.0717912 1.3606524 1.8438620 -0.1194526 -0.2362796 -0.9925072 -0.1790516 0.8496430
id1 1.71506499 0.6886403 0.3035286 -0.6002596 -0.6519499 -0.2803953 -0.1971759 1.6756969 1.9023618 -0.4465572
And an associated data frame of cutoff values (a min and a max cutoff per id and marker), like this one:
cutoff_df <- data.frame(id=paste0('id',rep(1:3,each=10)), marker=rep(letters[1:10],3), min=runif(30, 0, 2), max=runif(30, 5, 7))
> head(cutoff_df)
id marker min max
1 id1 a 0.4744594 6.518271
2 id1 b 1.3729807 6.689669
3 id1 c 0.4516368 5.915843
4 id1 d 0.6369892 6.459263
5 id1 e 0.3479676 5.208157
6 id1 f 1.6028592 5.439966
What I want to do here, is calculate a frequency table, so that I record the percentage of events per id and marker that fall into the cutoffs for that id and marker.
This is my attempt using some ugly nested loops... Wondering if there is a nicer and cleaner way to do this, ideally with base functions or data.table or tidyr...
My ugly code:
freq_mat <- matrix(nrow=length(unique(rownames(mymat))))
rownames(freq_mat) <- unique(rownames(mymat))
for (mk in colnames(mymat)){
mk_freq <- NULL
for (id in unique(rownames(mymat))){
data <- mymat[rownames(mymat)==id,mk]
min <- cutoff_df$min[cutoff_df$id==id & cutoff_df$marker==mk]
max <- cutoff_df$max[cutoff_df$id==id & cutoff_df$marker==mk]
ins <- length(data[data>=min & data<=max])
freq <- ins/length(data)*100
mk_freq <- c(mk_freq, freq)
}
mk_freq <- as.data.frame(mk_freq)
names(mk_freq) <- mk
freq_mat <- cbind(freq_mat, mk_freq)
}
> freq_mat
freq_mat a b c d e f g h i j
id1 NA 20 0 20 40 10 0 30 10 20 30
id2 NA 10 30 30 0 20 10 10 0 0 70
id3 NA 0 0 0 0 30 10 30 10 30 60
CodePudding user response:
Something like this? Here, the sum of all cells is 100.
library(tidyverse)
set.seed(123)
mymat <- matrix(rnorm(300), nrow = 30)
rownames(mymat) <- paste0("id", rep(1:3, each = 10))
colnames(mymat) <- letters[1:10]
cutoff_df <- data.frame(
id = paste0("id", rep(1:3, each = 10)),
marker = rep(letters[1:10], 3), min = runif(30, 0, 2), max = runif(30, 5, 7)
)
mymat %>%
as_tibble(rownames = "id") %>%
pivot_longer(-id, names_to = "marker") %>%
left_join(cutoff_df) %>%
filter(value <= max & value >= min) %>%
count(id, marker) %>%
# group_by(marker) %>% # e.g. to make sum of 100 per marker
mutate(n = n / sum(n) * 100) %>%
pivot_wider(names_from = marker, values_from = n, values_fill = list(n = 0))
#> Joining, by = c("id", "marker")
#> # A tibble: 3 × 11
#> id a c d e g h i j b f
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 id1 3.77 3.77 7.55 1.89 5.66 1.89 3.77 5.66 0 0
#> 2 id2 1.89 5.66 0 3.77 1.89 0 0 13.2 5.66 1.89
#> 3 id3 0 0 0 5.66 5.66 1.89 5.66 11.3 0 1.89
Created on 2022-03-30 by the reprex package (v2.0.0)
CodePudding user response:
Here is a solution based on the purrr package. I'm not sure that it is cleaner but it is shorter.
library(purrr)
asplit(mymat,2) |>
imap(~{
with(filter(cutoff_df, marker == .y),
outer(.x, min, ">=") &
outer(.x, max, "<") &
outer(names(.x), id, "=="))
}) |>
map(rowSums) |>
map_dfr(~tapply(.x, names(.x), FUN = sum),
.id = "marker")
##> # A tibble: 10 × 4
##> marker id1 id2 id3
##> <chr> <dbl> <dbl> <dbl>
##> 1 a 2 1 0
##> 2 b 0 3 0
##> 3 c 2 3 0
##> 4 d 4 0 0
##> 5 e 1 2 3
##> 6 f 0 1 1
##> 7 g 3 1 3
##> 8 h 1 0 1
##> 9 i 2 0 3
##> 10 j 3 7 6
