Closest value and data frame index of all data frame elements of a list-CodePudding

I have a list containing data frames:

# Three data frames sharing the columns C1, C2 and C3, stored in an unnamed list.
test <- list(
  data.frame(C1 = c(0.2, 0.4, 0.5), C2 = c(2, 3.5, 3.7), C3 = c(0.3, 4, 5)),
  data.frame(C1 = c(0.1, 0.3, 0.6), C2 = c(3.9, 4.3, 8), C3 = c(3, 5.2, 10)),
  data.frame(C1 = c(0.4, 0.55, 0.8), C2 = c(8.9, 10.3, 14), C3 = c(7, 8.4, 11))
)

I'd like to get, for each element of the vector "vector" (below), the row among all data frame rows inside this list whose value in a given column (C2 in this example) is closest to that element, as well as the list index (1, 2 or 3 in this example) where it was found.

# Target values; each one is to be matched against column C2 of the data frames.
vector <- c(3, 14.4, 7, 0)

The desired answer would be something like:

list.index    line.number.in.df    C1  C2 C3
     1              2              0.4 3.5 4 
     3              3              0.8 14 11
     2              3              0.6  8 10
     1              1              0.2  2 0.3

I managed to use lapply to solve about 10% of the problem for a single value, but I couldn't extend it to a vector of values. In addition, it returns the closest row from every data frame in the list (not just a single row among all data frames), and I could not get the corresponding list index either, i.e.

# Single target value; from every data frame keep the row whose C2 is nearest.
value <- 3
lapply(test, function(df) {
  nearest <- which.min(abs(value - df$C2))
  df[nearest, ]
})

Result I got:

[[1]]
  C1  C2 C3
2 0.4 3.5  4

[[2]]
  C1  C2 C3
1 0.1 3.9  3

[[3]]
  C1  C2 C3
1 0.4 8.9  7

Would anyone be so kind and patient to get me further on this?

Thanks in advance and Happy New Year.

CodePudding user response：

You could exploit the substrings of the names.

# Only column C2 is compared with the targets. Naming each C2 value by its row
# number makes unlist() label it "<list index>.<row number>".
c2 <- unlist(lapply(setNames(test, seq_along(test)),
                    \(d) setNames(d[["C2"]], seq_len(nrow(d)))))
# Name of the closest C2 value for every target in v (first one on ties).
w <- vapply(v, \(target) names(which.min(abs(c2 - target))), character(1))
# Split each name at the dot to recover the list index and the row number;
# this works for any number of list elements and any column-name length.
idx <- lapply(strsplit(w, ".", fixed = TRUE), as.integer)
res <- do.call(rbind, lapply(idx, \(i) {
  cbind(list = i[1], line = i[2], test[[i[1]]][i[2], ])
}))
rownames(res) <- NULL
res
#   list line  C1   C2   C3
# 1    1    2 0.4  3.5  4.0
# 2    3    3 0.8 14.0 11.0
# 3    2    3 0.6  8.0 10.0
# 4    1    1 0.2  2.0  0.3

Note: R >= 4.1 used.

Data:

# The same three data frames as in the question, built with data.frame().
test <- list(
  data.frame(C1 = c(0.2, 0.4, 0.5), C2 = c(2, 3.5, 3.7), C3 = c(0.3, 4, 5)),
  data.frame(C1 = c(0.1, 0.3, 0.6), C2 = c(3.9, 4.3, 8), C3 = c(3, 5.2, 10)),
  data.frame(C1 = c(0.4, 0.55, 0.8), C2 = c(8.9, 10.3, 14), C3 = c(7, 8.4, 11))
)

# Target values to look up.
v <- c(3, 14.4, 7, 0)

CodePudding user response：

I believe this is what you are looking for. Please note that the line.number.in.df is the mean of row_numbers_df per unique column in the data frames of the list test. As I mentioned above in the comments, it is not possible to have two different numeric values in the same position of a data.frame, unless it is a character string.

# This snippet reads `test` (list of data frames) and `vector` (target values)
# from the question and builds a summary data frame `out`, one row per list
# element.
#install.packages('birk')
library(birk) # required for which.closest()

# find which of the values across the columns C1:C3 in each element of test are closest
# to the values of vector and return the corresponding row numbers
# The nested sapply() calls are simplified by R; with the question's data this
# is expected to give one row per (column, list element) pair and one column
# per target value -- TODO confirm the shape before relying on it.
x <- sapply(1:length(vector), \(x) sapply(test, \(i) apply(i, 2, \(j) which.closest(j, vector[x]))))
# NOTE(review): which.max(table(i)) returns the position of the largest count
# within the table; the tabulated row number is only carried as the element's
# name. Confirm that the position, not the row number, is what is wanted here.
row_numbers_df <- apply(x, 1, \(i) which.max(table(i)))

# extract the values in each of the column C1:C3 corresponding to row_numbers_df
# do.call(cbind, test) puts all data frames side by side, so column i of that
# wide table is paired with entry i of row_numbers_df.
vals <- array(0, dim = length(row_numbers_df))
for (i in 1:length(row_numbers_df)) { vals[i] <- do.call(cbind, test)[row_numbers_df[i], i] }

# how many columns does each data.frame embedded in test have?
# NOTE(review): used below as a single number, so this presumably assumes every
# data frame has the same number of columns -- verify for other inputs.
unique_number_of_cols <- unique(sapply(test, ncol))

# store results in a data.frame
# r() rounds to one decimal place; it is applied to the per-row means below.
r <- \(x) round(x, 1)
out <- data.frame(
  seq_len(length(test)),
  r(rowMeans(matrix(row_numbers_df, ncol = unique_number_of_cols, byrow = TRUE))),
  matrix(vals, ncol = unique_number_of_cols, byrow = TRUE)
)
# The trailing column names are taken from the first data frame in test.
names(out) <- c('list.index', 'line.number.in.df', sapply(test, colnames)[, 1])

Result

> out
  list.index line.number.in.df  C1   C2   C3
1          1               2.3 0.5  3.5  4.0
2          2               2.3 0.6  4.3  5.2
3          3               3.0 0.8 14.0 11.0

Alternatively, if you really want to have each line.number.in.df per unique column, then you can easily store them as separate columns in out.

# Variant of the previous snippet: instead of averaging, the entries of
# row_numbers_df are kept as one column per original column name.
# Requires which.closest() from the birk package, plus `test` and `vector`.
x <- sapply(1:length(vector), \(x) sapply(test, \(i) apply(i, 2, \(j) which.closest(j, vector[x]))))
# NOTE(review): which.max(table(i)) returns the position of the largest count
# within the table; the tabulated row number is only carried as the element's
# name. Confirm that the position, not the row number, is what is wanted here.
row_numbers_df <- apply(x, 1, \(i) which.max(table(i)))
# Label every entry with the column name it belongs to (the names of each data
# frame, concatenated in list order).
names(row_numbers_df) <- do.call(c, lapply(test, names))

# Print the labelled vector for inspection.
row_numbers_df
# Look up, in the side-by-side table of all data frames, the value at the
# selected row for each column.
vals <- array(0, dim = length(row_numbers_df))
for (i in 1:length(row_numbers_df)) { vals[i] <- do.call(cbind, test)[row_numbers_df[i], i] }

# NOTE(review): used below as a single number, so this presumably assumes every
# data frame has the same number of columns -- verify for other inputs.
unique_number_of_cols <- unique(sapply(test, ncol))

out <- data.frame(
  seq_len(length(test)),
  # split() groups the entries by column name, giving one output column per name.
  split(row_numbers_df, names(row_numbers_df)),
  matrix(vals, ncol = unique_number_of_cols, byrow = TRUE)
)
# Column names are taken from the first data frame in test.
column_names <- sapply(test, colnames)[, 1]
names(out) <- c('list.index',
                paste0('line.number.in.df.', column_names),
                column_names)

Result

> out
  list.index line.number.in.df.C1 line.number.in.df.C2 line.number.in.df.C3  C1   C2   C3
1          1                    3                    2                    2 0.5  3.5  4.0
2          2                    3                    2                    2 0.6  4.3  5.2
3          3                    3                    3                    3 0.8 14.0 11.0