R - calculate difference between all combinations of observations within groups-CodePudding

I want to compare the effect of different fertilizer doses on multiple crop cultivars at various locations. My dataset is similar to the one generated below:

locs <- rep(c("loc1","loc2","loc3"), length.out=180)
cults <- rep(c("cult1","cult2","cult3","cult4","cult5"), length.out=180)
doses <- rep(c("no_fert","40kg","50kg","60kg"), length.out=180)
set.seed(123); yld <- runif(3*length(unique(locs))*length(unique(cults))*length(unique(doses)), min=3, max=15)

dat <- data.frame(location=locs,
                  cultivar=cults,
                  fert_dose=doses,
                  yield=yld)

Note there are three repetitions of each fertilizer dosage (but there are more in my actual dataset).

The first thing I need to do is to calculate the average for the three repetitions of each location-cultivar-fertilizer combination.

I can do it - in a probably not so efficient way - like this:

d1 <- d2 <- d3 <- list()
for (i in 1:length(unique(locs))){
  for (j in 1:length(unique(cults))){
    for (k in 1:length(unique(doses))){
      d1[[k]] <- data.frame(location=locs[i],
                            cultivar=cults[j],
                            fert_dose=doses[k],
                            mean_yield=mean(dat[dat$location==locs[i]&dat$cultivar==cults[j]&dat$fert_dose==doses[k],]$yield))
    }
    d2[[j]] <- do.call(rbind,d1)
  }
  d3[[i]] <- do.call(rbind,d2)
}

(mean_dat <- do.call(rbind, d3))

Next, what I need to do is: for each location, find the yield difference among all combinations of cultivar and fertilizer doses.

For example, considering only loc1 and cult1, the expected result would be:

res <- "
location cultivar dose dose_mean other_cultivar other_dose other_mean diff
loc1 cult1 no_fert 9.402345 cult1 40kg 9.251377 0.150968
loc1 cult1 no_fert 9.402345 cult1 50kg 10.764692 -1.362347
loc1 cult1 no_fert 9.402345 cult1 60kg 10.119129 -0.716784

loc1 cult1 40kg 9.251377 cult1 no_fert 9.402345 -0.150968
loc1 cult1 40kg 9.251377 cult1 50kg 10.764692 -1.513315
loc1 cult1 40kg 9.251377 cult1 60kg 10.119129 -0.867752

loc1 cult1 50kg 10.764692 cult1 no_fert 9.402345 1.362347
loc1 cult1 50kg 10.764692 cult1 40kg 9.251377 1.513315
loc1 cult1 50kg 10.764692 cult1 60kg 10.119129 0.645563

loc1 cult1 60kg 10.119129 cult1 no_fert 9.402345 0.716784
loc1 cult1 60kg 10.119129 cult1 40kg 9.251377 0.867752
loc1 cult1 60kg 10.119129 cult1 50kg 10.764692 -0.645563
"
(res <- read.table(textConnection(res), sep=" ", header=T, stringsAsFactors=F))

In this table, I am repeating the yield values for each dose obtained in the previous step (mean_dat table) and calculating a simple the difference between them. The resulting table would continue this analysis, including the other cultivars in the other_cultivar column.

I reckon the expected table does not look very good, but it will be used to feed an interactive dashboard, and this is the format it requires, so I don't think I have much choice here.

Is there any programmatic way to achieve these two results in just one step?

CodePudding user response：

dplyr

library(dplyr)
dat %>%
  group_by(location, cultivar, dose = fert_dose) %>%
  summarize(dose_mean = mean(yield), .groups = "drop") %>%
  full_join(., ., by = "location", suffix = c("", "_other")) %>%
  filter(cultivar != cultivar_other | dose != dose_other) %>%
  mutate(diff = dose_mean - dose_mean_other)
# # A tibble: 1,140 x 8
#    location cultivar dose  dose_mean cultivar_other dose_other dose_mean_other     diff
#    <chr>    <chr>    <chr>     <dbl> <chr>          <chr>                <dbl>    <dbl>
#  1 loc1     cult1    40kg       9.25 cult1          50kg                 10.8  -1.51   
#  2 loc1     cult1    40kg       9.25 cult1          60kg                 10.1  -0.868  
#  3 loc1     cult1    40kg       9.25 cult1          no_fert               9.40 -0.151  
#  4 loc1     cult1    40kg       9.25 cult2          40kg                 10.1  -0.830  
#  5 loc1     cult1    40kg       9.25 cult2          50kg                  8.97  0.282  
#  6 loc1     cult1    40kg       9.25 cult2          60kg                  6.71  2.54   
#  7 loc1     cult1    40kg       9.25 cult2          no_fert              11.5  -2.20   
#  8 loc1     cult1    40kg       9.25 cult3          40kg                 11.9  -2.70   
#  9 loc1     cult1    40kg       9.25 cult3          50kg                  9.21  0.0421 
# 10 loc1     cult1    40kg       9.25 cult3          60kg                  9.26 -0.00416
# # ... with 1,130 more rows

Note that this is doing an outer join on cultivar and dose. We started with 180 rows and ended with 1140, this will grow geometrically.

data.table

library(data.table)
DT <- as.data.table(dat)[, .(dose_mean = mean(yield)), by = .(location, cultivar, dose = fert_dose)]
merge(DT, DT, by = "location", all = TRUE, suffix = c("", "_other"), allow.cartesian = TRUE
  )[(cultivar != cultivar_other | dose != dose_other),
  ][, diff := dose_mean - dose_mean_other][]
#       location cultivar    dose dose_mean cultivar_other dose_other dose_mean_other         diff
#         <char>   <char>  <char>     <num>         <char>     <char>           <num>        <num>
#    1:     loc1    cult1 no_fert  9.402345          cult4       60kg        8.508675  0.893670057
#    2:     loc1    cult1 no_fert  9.402345          cult2       50kg        8.969489  0.432856209
#    3:     loc1    cult1 no_fert  9.402345          cult5       40kg        9.345814  0.056530679
#    4:     loc1    cult1 no_fert  9.402345          cult3    no_fert       11.243009 -1.840663741
#    5:     loc1    cult1 no_fert  9.402345          cult1       60kg       10.119129 -0.716784445
#    6:     loc1    cult1 no_fert  9.402345          cult4       50kg        9.638162 -0.235817407
#    7:     loc1    cult1 no_fert  9.402345          cult2       40kg       10.081336 -0.678991009
#    8:     loc1    cult1 no_fert  9.402345          cult5    no_fert        9.405199 -0.002854273
#    9:     loc1    cult1 no_fert  9.402345          cult3       60kg        9.255537  0.146807576
#   10:     loc1    cult1 no_fert  9.402345          cult1       50kg       10.764692 -1.362347580
#   ---                                                                                           
# 1131:     loc3    cult5    60kg  8.442893          cult5       40kg        7.217206  1.225686617
# 1132:     loc3    cult5    60kg  8.442893          cult3    no_fert        8.688523 -0.245630492
# 1133:     loc3    cult5    60kg  8.442893          cult1       60kg        7.221926  1.220966527
# 1134:     loc3    cult5    60kg  8.442893          cult4       50kg        7.918912  0.523980425
# 1135:     loc3    cult5    60kg  8.442893          cult2       40kg        7.405098  1.037794838
# 1136:     loc3    cult5    60kg  8.442893          cult5    no_fert        6.963170  1.479722527
# 1137:     loc3    cult5    60kg  8.442893          cult3       60kg        8.183201  0.259691148
# 1138:     loc3    cult5    60kg  8.442893          cult1       50kg        9.444416 -1.001523464
# 1139:     loc3    cult5    60kg  8.442893          cult4       40kg       10.264777 -1.821884187
# 1140:     loc3    cult5    60kg  8.442893          cult2    no_fert        7.196217  1.246675164

Note that doing this is data.table works well but does not really reduce the memory footprint of in-place calculations or speed usually attributed to data.table-based solutions.

CodePudding user response：

With data.table, you can do the following join:

library(data.table)

locs <- rep(c("loc1","loc2","loc3"), length.out=180)
cults <- rep(c("cult1","cult2","cult3","cult4","cult5"), length.out=180)
doses <- rep(c("no_fert","40kg","50kg","60kg"), length.out=180)
set.seed(123); yld <- runif(3*length(unique(locs))*length(unique(cults))*length(unique(doses)), min=3, max=15)

dat <- data.frame(location=locs,
                  cultivar=cults,
                  fert_dose=doses,
                  yield=yld)


setDT(dat)

dat[dat, .(cultivar_1 = cultivar, 
           cultivar_2 = i.cultivar,
           fert_dose_1 = fert_dose,
           fert_dose_2 = i.fert_dose,
           yield_1 = yield,
           yield_2 = i.yield,
           diff = yield - i.yield), on = "location", by = .EACHI][
           !(cultivar_1 == cultivar_2 & fert_dose_1 == fert_dose_2)][
             order(location, cultivar_1,fert_dose_1, cultivar_2, fert_dose_2)]

#>        location cultivar_1 cultivar_2 fert_dose_1 fert_dose_2   yield_1
#>     1:     loc1      cult1      cult1        40kg        50kg  4.665673
#>     2:     loc1      cult1      cult1        40kg        50kg 13.684203
#>     3:     loc1      cult1      cult1        40kg        50kg  9.404255
#>     4:     loc1      cult1      cult1        40kg        50kg  4.665673
#>     5:     loc1      cult1      cult1        40kg        50kg 13.684203
#>    ---                                                                 
#> 10256:     loc3      cult5      cult5     no_fert        60kg  8.794829
#> 10257:     loc3      cult5      cult5     no_fert        60kg  7.265345
#> 10258:     loc3      cult5      cult5     no_fert        60kg  4.829337
#> 10259:     loc3      cult5      cult5     no_fert        60kg  8.794829
#> 10260:     loc3      cult5      cult5     no_fert        60kg  7.265345
#>          yield_2        diff
#>     1: 14.556291 -9.89061803
#>     2: 14.556291 -0.87208812
#>     3: 14.556291 -5.15203544
#>     4:  4.568348  0.09732446
#>     5:  4.568348  9.11585437
#>    ---                      
#> 10256:  7.854123  0.94070539
#> 10257:  7.854123 -0.58877881
#> 10258:  9.981001 -5.15166422
#> 10259:  9.981001 -1.18617243
#> 10260:  9.981001 -2.71565663

^{Created on 2022-10-26 with reprex v2.0.2}

CodePudding user response：

A tidy dplyr solution with adding all values for each location to a new column, then filtering to remove the few identical combinations:

library(tidyverse)

myfunc <- function(df) {
  df %>%
    add_column(other = list(.)) %>%
    unnest(other, names_sep = "_") %>%
    filter(!(cultivar == other_cultivar & fert_dose == other_fert_dose)) %>%
    mutate(diff = yield - other_yield)
}

datmeans <- dat %>%
  group_by(location, cultivar, fert_dose) %>%
  summarise(yield = mean(yield), .groups = "drop") %>%
  group_split(location) %>%
  map(myfunc) %>%
  bind_rows()

CodePudding user response：

A data.table solution that avoids a larger join and subsequent filtering. It will be fast and memory-efficient.

dat_mean <- setDT(dat)[,.(mean_yield = mean(yield)), location:fert_dose][, doseIdx := match(fert_dose, unique(fert_dose))]

rbindlist(
  lapply(
    parse(
      text = c(
        ".(location, doseIdx > doseIdx)",
        ".(location, doseIdx < doseIdx)"
      )
    ),
    function(e) {
      dat_mean[
        dat_mean,
        .(
          location,
          cultivar1 = cultivar,
          fert_dose1 = fert_dose,
          yield1 = mean_yield,
          cultivar2 = i.cultivar,
          fert_dose2 = i.fert_dose,
          yield2 = i.mean_yield,
          diff = mean_yield - i.mean_yield
        ),
        on = eval(e)
      ]
    }
  )
)
#>      location cultivar1 fert_dose1    yield1 cultivar2 fert_dose2   yield2        diff
#>   1:     loc1     cult4       60kg  8.508675     cult1    no_fert 9.402345 -0.89367006
#>   2:     loc1     cult2       50kg  8.969489     cult1    no_fert 9.402345 -0.43285621
#>   3:     loc1     cult5       40kg  9.345814     cult1    no_fert 9.402345 -0.05653068
#>   4:     loc1     cult1       60kg 10.119129     cult1    no_fert 9.402345  0.71678444
#>   5:     loc1     cult4       50kg  9.638162     cult1    no_fert 9.402345  0.23581741
#>  ---                                                                                  
#> 926:     loc3     cult2       40kg  7.405098     cult5       60kg 8.442893 -1.03779484
#> 927:     loc3     cult5    no_fert  6.963170     cult5       60kg 8.442893 -1.47972253
#> 928:     loc3     cult1       50kg  9.444416     cult5       60kg 8.442893  1.00152346
#> 929:     loc3     cult4       40kg 10.264777     cult5       60kg 8.442893  1.82188419
#> 930:     loc3     cult2    no_fert  7.196217     cult5       60kg 8.442893 -1.24667516