With the parent-child relationships data frame as below:
parent_id child_id
1 1 2
2 2 3
3 3 4
The goal is to achieve the following, i.e. expanded version of previous data frame where all the descendants (children, grandchildren, etc.) are assigned to each parent (and incl. the parent/child itself):
parent_id child_id
1 1 1
2 1 2
3 1 3
4 1 4
5 2 2
6 2 3
7 2 4
8 3 3
9 3 4
10 4 4
The question I have: What is the fastest possible way (or one of them) of achieving that in R?
I've already tried various methods - from a for loop, SQL recursion to using igraph (as described here). They are all rather slow, and some of them are also prone to crashing when dealing with a larger number of combinations.
Below are examples with sqldf and igraph, benchmarked on a slightly larger data frame than above.
library(sqldf)
library(purrr)
library(dplyr)
library(igraph)
df <- data.frame(parent_id = 1:1000L)
df$child_id <- df$parent_id 1L
# SQL recursion
sqlQuery <- 'with recursive
dfDescendants (parent_id, child_id)
as
(select parent_id, child_id from df
union all
select d.parent_id, s.child_id from dfDescendants d
join df s
on d.child_id = s.parent_id)
select distinct parent_id, parent_id as child_id from dfDescendants
union
select distinct child_id as parent_id, child_id from dfDescendants
union
select * from dfDescendants;'
sqldf(sqlQuery)
# igraph with purrr
df_g = graph_from_data_frame(df, directed = TRUE)
map(V(df_g), ~ names(subcomponent(df_g, .x, mode = "out"))) %>%
map_df(~ data.frame(child_id = .x), .id = "parent_id")
Benchmark (excl. query creation in sqldf and conversion to graph in igraph):
set.seed(23423)
microbenchmark::microbenchmark(
sqldf = sqldf(sqlQuery),
tidyigraph = map(V(df_g), ~ names(subcomponent(df_g, .x, mode = "out"))) %>%
map_df(~ data.frame(child_id = .x), .id = "parent_id"),
times = 5
)
# Unit: seconds
# expr min lq mean median uq max neval
# sqldf 7.815179 8.002836 8.113392 8.084038 8.315207 8.349701 5
# tidyigraph 5.784239 5.806539 5.883241 5.889171 5.964906 5.971350 5
CodePudding user response:
We can use ego like below
g <- graph_from_data_frame(df)
setNames(
rev(
stack(
Map(
names,
setNames(
ego(g,
order = vcount(g),
mode = "out"
),
names(V(g))
)
)
)
),
names(df)
)
which gives
parent_id child_id
1 1 1
2 1 2
3 1 3
4 1 4
5 2 2
6 2 3
7 2 4
8 3 3
9 3 4
10 4 4
Benchmarking
set.seed(23423)
microbenchmark::microbenchmark(
sqldf = sqldf(sqlQuery),
tidyigraph = map(V(df_g), ~ names(subcomponent(df_g, .x, mode = "out"))) %>%
map_df(~ data.frame(child_id = .x), .id = "parent_id"),
ego = setNames(
rev(
stack(
Map(
names,
setNames(
ego(df_g,
order = vcount(df_g),
mode = "out"
),
names(V(df_g))
)
)
)
),
names(df)
),
times = 5
)
shows
Unit: milliseconds
expr min lq mean median uq max neval
sqldf 7156.2753 9072.155 9402.6904 9518.2796 10206.3683 11060.3738 5
tidyigraph 2483.9943 2623.558 3136.7490 2689.8388 2879.5688 5006.7853 5
ego 182.5941 219.151 307.2481 253.2171 325.8721 555.4064 5
CodePudding user response:
igraph is of course the right way to answer graph questions, but I think this has an easy iterative solution in R:
Create convenience variables for the original parent and child, as well as current child, ids
pid0 <- df$parent_id
cid0 <- cid <- df$child_id
Create a list of 'ancestors' (parents) and 'offspring', with one element of each list per 'generation'; start with parents being their own ancestors / children, and parents having the children in the original data frame
aid = list(pid0, pid0) # 'ancestor'
oid = list(pid0, cid0) # 'offspring' -- including parent and child
i = 3L # ith generation
Repeat while there is a 'generation' of child ids
while (!all(is.na(cid))) {
find the index of the child in the parent; only keep children who have known parents (in the sample data set the last parent n has child n 1, but n 1 is not known as a parent...)
idx = match(cid, pid0)
keep = !is.na(idx)
Update the current children to be the children of the parent
cid = cid0[idx]
and record the ancestor / offspring relationship
aid[[i]] = pid0[keep]
oid[[i]] = cid[keep]
proceed to the next generation
i = i 1L
}
then tidy up the results by placing them in a data.frame and in a convenient order
result <- data.frame(aid = unlist(aid), oid = unlist(oid))
o <- order(result$aid, result$oid)
result <- result[o,]
Here's a function capturing this
ancestor_offspring <- function(df) {
pid0 <- df$parent_id
cid0 <- cid <- df$child_id
aid = list(pid0, pid0) # 'ancestor'
oid = list(pid0, cid0) # 'offspring' -- including parent and child
i = 3L # ith generation
while (!all(is.na(cid))) {
idx = match(cid, pid0)
keep = !is.na(idx)
cid = cid0[idx]
aid[[i]] = pid0[keep]
oid[[i]] = cid[keep]
i = i 1L
}
result <- data.frame(aid = unlist(aid), oid = unlist(oid))
o <- order(result$aid, result$oid)
result[o,]
}
It performs quite well
df <- data.frame(parent_id = 1:1000L)
df$child_id <- df$parent_id 1L
df = df[sample(nrow(df)),] # randomize, to check for silly mistakes
system.time(result <- ancestor_offspring(df))
## user system elapsed
## 0.183 0.004 0.188
and seems at least superficially to be correct
head(result)
## aid oid
## 146 1 1
## 1146 1 2
## 2145 1 3
## 3144 1 4
## 4142 1 5
## 5139 1 6
tail(result)
## aid oid
## 3748 998 1001
## 151 999 999
## 1151 999 1000
## 2150 999 1001
## 139 1000 1000
## 1139 1000 1001
