With the parent-child relationships data frame as below:

  parent_id child_id
1         1        2
2         2        3
3         3        4

The goal is to achieve the following, i.e. expanded version of previous data frame where all the descendants (children, grandchildren, etc.) are assigned to each parent (and incl. the parent/child itself):

   parent_id child_id
1          1        1
2          1        2
3          1        3
4          1        4
5          2        2
6          2        3
7          2        4
8          3        3
9          3        4
10         4        4

The question I have: What is the fastest possible way (or one of them) of achieving that in R?

I've already tried various methods - from a for loop, SQL recursion to using igraph (as described here). They are all rather slow, and some of them are also prone to crashing when dealing with a larger number of combinations.

Below are examples with sqldf and igraph, benchmarked on a slightly larger data frame than above.

library(sqldf)
library(purrr)
library(dplyr)
library(igraph)

df <- data.frame(parent_id = 1:1000L)
df$child_id <- df$parent_id   1L

# SQL recursion

sqlQuery <- 'with recursive
             dfDescendants (parent_id, child_id)
             as
             (select parent_id, child_id from df
             union all
             select d.parent_id, s.child_id from dfDescendants d
             join df s
             on d.child_id = s.parent_id)
             select distinct parent_id, parent_id as child_id from dfDescendants
             union
             select distinct child_id as parent_id, child_id from dfDescendants
             union
             select * from dfDescendants;'

sqldf(sqlQuery)

# igraph with purrr

df_g = graph_from_data_frame(df, directed = TRUE)

map(V(df_g), ~ names(subcomponent(df_g, .x, mode = "out"))) %>% 
  map_df(~ data.frame(child_id = .x), .id = "parent_id")

Benchmark (excl. query creation in sqldf and conversion to graph in igraph):

set.seed(23423)

microbenchmark::microbenchmark(
  sqldf = sqldf(sqlQuery),
  tidyigraph = map(V(df_g), ~ names(subcomponent(df_g, .x, mode = "out"))) %>% 
    map_df(~ data.frame(child_id = .x), .id = "parent_id"),
  times = 5
)

#    Unit: seconds
#           expr      min       lq     mean   median       uq      max neval
#          sqldf 7.815179 8.002836 8.113392 8.084038 8.315207 8.349701     5
#     tidyigraph 5.784239 5.806539 5.883241 5.889171 5.964906 5.971350     5

CodePudding user response：

We can use ego like below

g <- graph_from_data_frame(df)

setNames(
  rev(
    stack(
      Map(
        names,
        setNames(
          ego(g,
            order = vcount(g),
            mode = "out"
          ),
          names(V(g))
        )
      )
    )
  ),
  names(df)
)

which gives

   parent_id child_id
1          1        1
2          1        2
3          1        3
4          1        4
5          2        2
6          2        3
7          2        4
8          3        3
9          3        4
10         4        4

Benchmarking

set.seed(23423)

microbenchmark::microbenchmark(
  sqldf = sqldf(sqlQuery),
  tidyigraph = map(V(df_g), ~ names(subcomponent(df_g, .x, mode = "out"))) %>%
    map_df(~ data.frame(child_id = .x), .id = "parent_id"),
  ego = setNames(
    rev(
      stack(
        Map(
          names,
          setNames(
            ego(df_g,
              order = vcount(df_g),
              mode = "out"
            ),
            names(V(df_g))
          )
        )
      )
    ),
    names(df)
  ),
  times = 5
)

shows

Unit: milliseconds
       expr       min       lq      mean    median         uq        max neval
      sqldf 7156.2753 9072.155 9402.6904 9518.2796 10206.3683 11060.3738     5
 tidyigraph 2483.9943 2623.558 3136.7490 2689.8388  2879.5688  5006.7853     5
        ego  182.5941  219.151  307.2481  253.2171   325.8721   555.4064     5

CodePudding user response：

igraph is of course the right way to answer graph questions, but I think this has an easy iterative solution in R:

Create convenience variables for the original parent and child, as well as current child, ids

pid0 <- df$parent_id
cid0 <- cid <- df$child_id

Create a list of 'ancestors' (parents) and 'offspring', with one element of each list per 'generation'; start with parents being their own ancestors / children, and parents having the children in the original data frame

aid = list(pid0, pid0) # 'ancestor'
oid = list(pid0, cid0) # 'offspring' -- including parent and child
i = 3L                 # ith generation

Repeat while there is a 'generation' of child ids

while (!all(is.na(cid))) {

find the index of the child in the parent; only keep children who have known parents (in the sample data set the last parent n has child n 1, but n 1 is not known as a parent...)

    idx = match(cid, pid0)
    keep = !is.na(idx)

Update the current children to be the children of the parent

    cid = cid0[idx]

and record the ancestor / offspring relationship

    aid[[i]] = pid0[keep]
    oid[[i]] = cid[keep]

proceed to the next generation

    i = i   1L
}

then tidy up the results by placing them in a data.frame and in a convenient order

result <- data.frame(aid = unlist(aid), oid = unlist(oid))
o <- order(result$aid, result$oid)
result <- result[o,]

Here's a function capturing this

ancestor_offspring <- function(df) {
    pid0 <- df$parent_id
    cid0 <- cid <- df$child_id
    aid = list(pid0, pid0) # 'ancestor'
    oid = list(pid0, cid0) # 'offspring' -- including parent and child
    i = 3L                 # ith generation
    while (!all(is.na(cid))) {
        idx = match(cid, pid0)
        keep = !is.na(idx)
        cid = cid0[idx]
        aid[[i]] = pid0[keep]
        oid[[i]] = cid[keep]
        i = i   1L
    }
    result <- data.frame(aid = unlist(aid), oid = unlist(oid))
    o <- order(result$aid, result$oid)
    result[o,]
}

It performs quite well

df <- data.frame(parent_id = 1:1000L)
df$child_id <- df$parent_id   1L
df = df[sample(nrow(df)),]  # randomize, to check for silly mistakes

system.time(result <- ancestor_offspring(df))
##    user  system elapsed 
##   0.183   0.004   0.188

and seems at least superficially to be correct

head(result)
##      aid oid
## 146    1   1
## 1146   1   2
## 2145   1   3
## 3144   1   4
## 4142   1   5
## 5139   1   6
tail(result)
##       aid  oid
## 3748  998 1001
## 151   999  999
## 1151  999 1000
## 2150  999 1001
## 139  1000 1000
## 1139 1000 1001