how to see if random paired sample is in dataframe (with conditions)-CodePudding

say I have a df like so:

# Example data: T1/T2 hold the two labels of each pair, score1/score2 the
# numeric scores attached to that pair (NA marks a missing score).
T1 <- c("a","b","c","d","e")
T2 <- c("f","g","h","i","j")
score1 <- c(NA,0.01,0.5,0.78,NA)
score2 <- c(1, 2, 3, NA, 6)
# Columns are named after the vectors: T1, T2, score1, score2.
df <- data.frame(T1, T2, score1, score2)

> df
      T1    T2  score1 score2
1     a     f     NA      1
2     b     g   0.01      2
3     c     h   0.50      3
4     d     i   0.78     NA
5     e     j     NA      6

If I want to randomly create new T1-T2 pairs, how can I check whether these new pairs are in the df, but only where the score1 column is not NA? In other words, I randomly sample, say, 2 values from each of T1 and T2:

# Draw 2 labels from each column (no seed is set here, so the draw differs
# between runs); the outer parentheses print the result of each assignment.
(l1 <- sample(df$T1, 2))
(l2 <- sample(df$T2, 2))

and get:

> l1
[1] "c" "d"
> l2
[1] "h" "g"

How would one go about getting the score2 of the c-h and d-g pairs from df, but only if score1 is not NA? My first instinct would be to create a new df2 without NAs in the score1 column:

# Keep only the rows of df whose score1 is present (original row names kept).
df2 <- df[!is.na(df$score1), ]

Then I can create a new df for the new pairs:

# Build df3 in a single call: assigning columns with `df3$X1 <- l1` fails with
# "object 'df3' not found" when df3 has not been created beforehand.
# X1/X2 hold the sampled pair, X3/X4 the same pair with the sides swapped.
df3 <- data.frame(X1 = l1, X2 = l2, X3 = l2, X4 = l1,
                  stringsAsFactors = FALSE)

#stack X3 with X1 and X4 with X2 (considering that T1-T2 pair is the same as T2-T1 pair)
df4 <- data.frame(T1 = c(df3[,"X1"], df3[,"X3"]),
                  T2 = c(df3[,"X2"], df3[,"X4"]))
> df4
  T1 T2
1  c  h
2  d  g
3  h  c
4  g  d

But I'm missing the last step: how to see whether the paired columns from df4 match the paired columns in df2. In the end, I want to get something like:

df
      T1    T2  score1 score2
1     c     h   0.50      3
2     d     g   NA       NA

CodePudding user response：

I think a merge/join operation makes sense here:

# Drop rows with a missing score1 first, so a sampled pair only picks up its
# scores when score1 is present. all.y = TRUE keeps every sampled pair; pairs
# without a qualifying row in df come back with NA scores.
res <- merge(df[!is.na(df$score1), ], data.frame(T1 = l1, T2 = l2),
             by = c("T1", "T2"), all.y = TRUE)

res
#   T1 T2 score1 score2
# 1  c  h    0.5      3
# 2  d  g     NA     NA

Data

# Reproducible copy of the question's data and of the sampled pairs.
df <- data.frame(
  T1 = c("a", "b", "c", "d", "e"),
  T2 = c("f", "g", "h", "i", "j"),
  score1 = c(NA, 0.01, 0.5, 0.78, NA),
  score2 = c(1, 2, 3, NA, 6),
  stringsAsFactors = FALSE
)
l1 <- c("c", "d")
l2 <- c("h", "g")

CodePudding user response：

Something like this?

# Fix the RNG seed so the two draws below are reproducible.
set.seed(2022)

# Draw 2 labels from each column; the outer parentheses print each result.
(l1 <- sample(df$T1, 2))
#> [1] "d" "c"
(l2 <- sample(df$T2, 2))
#> [1] "h" "i"
# For each (x1, x2) pair taken element-wise from l1 and l2: locate x1 in
# data$T1 and x2 in data$T2, then return score2 of x1's row minus score2 of
# x2's row. The result is NA when score1 is missing on either matched row.
mapply(\(x1, x2, data){
  i <- match(x1, data$T1)  # row of x1 in T1 (NA when x1 is absent)
  j <- match(x2, data$T2)  # row of x2 in T2 (NA when x2 is absent)
  # Check score1 on BOTH matched rows; checking row i twice would let a
  # missing score1 on the T2 side slip through.
  if(anyNA(c(data$score1[i], data$score1[j]))) {
    NA_real_
  } else {
    # na.rm = TRUE drops a missing score2, i.e. it contributes 0 to the result.
    sum(c(data$score2[i], -1*data$score2[j]), na.rm = TRUE)
  }
}, l1, l2, MoreArgs = list(data = df))
#>  d  c 
#> -3  3

^{Created on 2022-01-30 by the reprex package (v2.0.1)}