Home > Blockchain >  Unnesting character string list and converting into wide data
Unnesting character string list and converting into wide data

Time:01-24

I often have nested lists containing characters string in a tibble. I would like to convert this to long data attempts have been to transform it into wide data using the tidyr unnest_wider function, but I am unable to produce my desired results.
Suppose you have a dataframe with a nested character list like the following:

#Producing a list with random character strings and embedding into dataframe.
set.seed(112)
df <- data.frame(id = 1:10)
df$alist <- replicate(10, list(sample(letters[1:5],round(runif(1, min = 1, max = 5)))))

Using tidyr::unnest_wider almost gets me there, but I would like for the strings to go into the column names.

df %>% unnest_wider(alist, names_sep = "_")

By using two for loops I´m able to achieve what I want:

df %>% select(alist) %>% unlist(alist) %>% unique() %>% sort() %>% as.list() -> let

df[unlist(let)] <- NA

for(index in let) {
  
for (i in df$id){
  ifelse(index %in% df$alist[[i]],
         T,
         F) -> df[i,index]
  }
}
id alist a b c d e
1 b, d, a TRUE TRUE FALSE TRUE FALSE
2 a, b TRUE TRUE FALSE FALSE FALSE
3 d, a, c, e TRUE FALSE TRUE TRUE TRUE
4 d, a, b, c TRUE TRUE TRUE TRUE FALSE
5 a, c TRUE FALSE TRUE FALSE FALSE
6 e FALSE FALSE FALSE FALSE TRUE
7 c FALSE FALSE TRUE FALSE FALSE
8 c, e, d FALSE FALSE TRUE TRUE TRUE
9 a, d, e, c, b TRUE TRUE TRUE TRUE TRUE
10 a TRUE FALSE FALSE FALSE FALSE

But frankly I am looking for a more elegant solution to the problem. I have the sense I am missing something obvious here...

CodePudding user response:

set.seed(112)
df <- data.frame(id = 1:10)
df$alist <-
  replicate(10, list(sample(letters[1:5], round(
    runif(1, min = 1, max = 5)
  ))))

library(tidyverse)

nm <- sort(unique(unlist(df$alist)))

bind_cols(df, map_df(
  .x = df$alist,
  .f = ~ purrr::set_names(x = nm %in% .x, nm = nm)
)) 
#>    id         alist     a     b     c     d     e
#> 1   1       b, d, a  TRUE  TRUE FALSE  TRUE FALSE
#> 2   2          a, b  TRUE  TRUE FALSE FALSE FALSE
#> 3   3    d, a, c, e  TRUE FALSE  TRUE  TRUE  TRUE
#> 4   4    d, a, b, c  TRUE  TRUE  TRUE  TRUE FALSE
#> 5   5          a, c  TRUE FALSE  TRUE FALSE FALSE
#> 6   6             e FALSE FALSE FALSE FALSE  TRUE
#> 7   7             c FALSE FALSE  TRUE FALSE FALSE
#> 8   8       c, e, d FALSE FALSE  TRUE  TRUE  TRUE
#> 9   9 a, d, e, c, b  TRUE  TRUE  TRUE  TRUE  TRUE
#> 10 10             a  TRUE FALSE FALSE FALSE FALSE

Created on 2022-01-23 by the reprex package (v2.0.1)

CodePudding user response:

This works:

library(tidyverse)

df %>% 
  as_tibble() %>% 
  unnest_longer(alist) %>%
  mutate(alist_2 = alist) %>% 
  complete(id, alist, fill = list(alist_2 = NA_character_)) %>% 
  mutate(present = if_else(is.na(alist_2), FALSE, TRUE)) %>% 
  select(-alist_2) %>% 
  pivot_wider(
    names_from = alist,
    values_from = present
  ) %>%
  bind_cols(df %>% select(alist))

output:

# A tibble: 10 x 7
      id a     b     c     d     e     alist    
   <int> <lgl> <lgl> <lgl> <lgl> <lgl> <list>   
 1     1 TRUE  TRUE  FALSE TRUE  FALSE <chr [3]>
 2     2 TRUE  TRUE  FALSE FALSE FALSE <chr [2]>
 3     3 TRUE  FALSE TRUE  TRUE  TRUE  <chr [4]>
 4     4 TRUE  TRUE  TRUE  TRUE  FALSE <chr [4]>
 5     5 TRUE  FALSE TRUE  FALSE FALSE <chr [2]>
 6     6 FALSE FALSE FALSE FALSE TRUE  <chr [1]>
 7     7 FALSE FALSE TRUE  FALSE FALSE <chr [1]>
 8     8 FALSE FALSE TRUE  TRUE  TRUE  <chr [3]>
 9     9 TRUE  TRUE  TRUE  TRUE  TRUE  <chr [5]>
10    10 TRUE  FALSE FALSE FALSE FALSE <chr [1]>
  •  Tags:  
  • Related