Extracting info from lists in R dataframes-CodePudding

I'm a beginner at R and I've been having some trouble with this specific data set.

I've downloaded the newsatlasbr package (https://github.com/voltdatalab/newsatlasbr), which was "created to give easier access to the datasets of the News Atlas project, which is an initiative that researches and maps news organizations in the Brazilian territory".

When I run organizations_state(uf = "All"), I have access to data from news vehicles from every single Brazilian state. The generated data frame (let's say, vehicles <- organizations_state(uf = "All")) comes with columns such as:

id, nome_veiculo, media_source_id, fonte, segment_id, segmento, city_id, municipio, codmun, state_id,uf, region_id, media_channels

The most critical one for what I need to do is media_channels, a list-type column whose elements are data frames with columns such as:

id, channel_id, link, media_id, created_at

Now, I need to extract the link with channel_id == 1 for each news vehicle (that is, its website — channel_id == 3 would be its Facebook page, and so on).

How is it possible for me to do it?

vehicles df media_channels list

EDIT: reproducible code example

dput(head(vehicles, 4))

    structure(list(id = c(14030L, 14068L, 9L, 10L), nome_veiculo = c("JURUÁ EM TEMPO", 
    "VOZ DO NORTE", "PAGINA 20", "A GAZETA"), media_source_id = c(5L, 
    5L, 5L, 5L), fonte = c("Atlas da Notícia", "Atlas da Notícia", 
    "Atlas da Notícia", "Atlas da Notícia"), segment_id = c(1L, 1L, 
    1L, 1L), segmento = c("impresso", "impresso", "impresso", "impresso"
    ), city_id = c(6L, 6L, 15L, 15L), municipio = c("Cruzeiro do Sul", 
    "Cruzeiro do Sul", "Rio Branco", "Rio Branco"), codmun = c("1200203", 
    "1200203", "1200401", "1200401"), state_id = c(1L, 1L, 1L, 1L
    ), uf = c("AC", "AC", "AC", "AC"), region_id = c(3L, 3L, 3L, 
    3L), regiao = c("Norte", "Norte", "Norte", "Norte"), address = c(NA_character_, 
    NA_character_, NA_character_, NA_character_), annotations = c("não tenho certeza do e-mail do veículo", 
    NA, NA, NA), email = c("[email protected]", "[email protected]", 
    "[email protected]", "[email protected]"), employees_range_id = c(1L, 
    NA, NA, NA), num_funcionarios = c("1 a 5 colaboradores", NA, 
    NA, NA), periodicity = c("daily", "daily", "another", "another"
    ), another_periodicity = c(NA_character_, NA_character_, NA_character_, 
    NA_character_), eh_jornal = c(1L, 1L, 1L, 1L), eh_site_pago = c(1L, 
    NA, NA, NA), data_fechamento = c(NA, "2018-01", NA, NA), ativo = c(1L, 
    0L, 1L, 1L), data_inclusao = c("2019-11-25 04:41:23", "2019-11-26 00:13:43", 
    "2019-10-22 00:33:45", "2019-10-22 00:33:45"), business_models = list(
        structure(list(id = 8L, name = "Publicidade (outros tipos)", 
            created_at = "2019-10-22 00:33:45", updated_at = "2019-10-22 00:33:45", 
            deleted_at = NA, pivot = structure(list(media_id = 14030L, 
                business_model_id = 8L), class = "data.frame", row.names = 1L)), class = "data.frame", row.names = 1L), 
        structure(list(id = 7:8, name = c("Prestação de serviços", 
        "Publicidade (outros tipos)"), created_at = c("2019-10-22 00:33:45", 
        "2019-10-22 00:33:45"), updated_at = c("2019-10-22 00:33:45", 
        "2019-10-22 00:33:45"), deleted_at = c(NA, NA), pivot = structure(list(
            media_id = c(14068L, 14068L), business_model_id = 7:8), class = "data.frame", row.names = 1:2)), class = "data.frame", row.names = 1:2), 
        structure(list(), .Names = character(0), row.names = integer(0), class = "data.frame"), 
        structure(list(id = c(2L, 2L), name = c("Conteúdo patrocinado", 
        "Conteúdo patrocinado"), created_at = c("2019-10-22 00:33:45", 
        "2019-10-22 00:33:45"), updated_at = c("2019-10-22 00:33:45", 
        "2019-10-22 00:33:45"), deleted_at = c(NA, NA), pivot = structure(list(
            media_id = c(10L, 10L), business_model_id = c(2L, 2L)), class = "data.frame", row.names = 1:2)), class = "data.frame", row.names = 1:2)), 
        features = list(structure(list(id = c(1L, 5L, 8L, 9L), name = c("Blogs", 
        "Opinião/colunistas", "Redes sociais", "Vídeos"), created_at = c("2019-10-22 00:33:45", 
        "2019-10-22 00:33:45", "2019-10-22 00:33:45", "2019-10-22 00:33:45"
        ), updated_at = c("2019-10-22 00:33:45", "2019-10-22 00:33:45", 
        "2019-10-22 00:33:45", "2019-10-22 00:33:45"), deleted_at = c(NA, 
        NA, NA, NA), pivot = structure(list(media_id = c(14030L, 
        14030L, 14030L, 14030L), feature_id = c(1L, 5L, 8L, 9L)), class = "data.frame", row.names = c(NA, 
        4L))), class = "data.frame", row.names = c(NA, 4L)), structure(list(
            id = c(1L, 8L, 9L), name = c("Blogs", "Redes sociais", 
            "Vídeos"), created_at = c("2019-10-22 00:33:45", "2019-10-22 00:33:45", 
            "2019-10-22 00:33:45"), updated_at = c("2019-10-22 00:33:45", 
            "2019-10-22 00:33:45", "2019-10-22 00:33:45"), deleted_at = c(NA, 
            NA, NA), pivot = structure(list(media_id = c(14068L, 
            14068L, 14068L), feature_id = c(1L, 8L, 9L)), class = "data.frame", row.names = c(NA, 
            3L))), class = "data.frame", row.names = c(NA, 3L)), 
            structure(list(), .Names = character(0), row.names = integer(0), class = "data.frame"), 
            structure(list(id = c(4L, 4L, 7L, 7L), name = c("Noticiário em tempo real", 
            "Noticiário em tempo real", "Possui alguma edição impressa", 
            "Possui alguma edição impressa"), created_at = c("2019-10-22 00:33:45", 
            "2019-10-22 00:33:45", "2019-10-22 00:33:45", "2019-10-22 00:33:45"
            ), updated_at = c("2019-10-22 00:33:45", "2019-10-22 00:33:45", 
            "2019-10-22 00:33:45", "2019-10-22 00:33:45"), deleted_at = c(NA, 
            NA, NA, NA), pivot = structure(list(media_id = c(10L, 
            10L, 10L, 10L), feature_id = c(4L, 4L, 7L, 7L)), class = "data.frame", row.names = c(NA, 
            4L))), class = "data.frame", row.names = c(NA, 4L))), 
        collaborators = list(structure(list(id = 3L, name = "Estudante", 
            created_at = "2019-10-22 00:33:45", updated_at = "2019-10-22 00:33:45", 
            deleted_at = NA, pivot = structure(list(media_id = 14030L, 
                collaborator_id = 3L), class = "data.frame", row.names = 1L)), class = "data.frame", row.names = 1L), 
            structure(list(id = 3L, name = "Estudante", created_at = "2019-10-22 00:33:45", 
                updated_at = "2019-10-22 00:33:45", deleted_at = NA, 
                pivot = structure(list(media_id = 14068L, collaborator_id = 3L), class = "data.frame", row.names = 1L)), class = "data.frame", row.names = 1L), 
            structure(list(), .Names = character(0), row.names = integer(0), class = "data.frame"), 
            structure(list(id = c(8L, 8L, 9L, 9L, 11L, 11L), name = c("Pesquisador do Atlas da Notícia", 
            "Pesquisador do Atlas da Notícia", "Projor", "Projor", 
            "Volt Data Lab", "Volt Data Lab"), created_at = c("2019-10-22 00:33:45", 
            "2019-10-22 00:33:45", "2019-10-22 00:33:45", "2019-10-22 00:33:45", 
            "2019-10-22 00:33:45", "2019-10-22 00:33:45"), updated_at = c("2019-10-22 00:33:45", 
            "2019-10-22 00:33:45", "2019-10-22 00:33:45", "2019-10-22 00:33:45", 
            "2019-10-22 00:33:45", "2019-10-22 00:33:45"), deleted_at = c(NA, 
            NA, NA, NA, NA, NA), pivot = structure(list(media_id = c(10L, 
            10L, 10L, 10L, 10L, 10L), collaborator_id = c(8L, 8L, 
            9L, 9L, 11L, 11L)), class = "data.frame", row.names = c(NA, 
            6L))), class = "data.frame", row.names = c(NA, 6L))), 
        media_channels = list(structure(list(id = 8190:8191, channel_id = c(1L, 
        3L), link = c("www. Juruaemtempo.com.br", "https://www.facebook.com/cruzeirodosulinfo/"
        ), media_id = c(14030L, 14030L), created_at = c("2019-12-04 17:51:46", 
        "2019-12-04 17:51:46"), updated_at = c("2019-12-04 17:51:46", 
        "2019-12-04 17:51:46"), deleted_at = c(NA, NA), channel = structure(list(
            id = c(1L, 3L), name = c("Site", "Facebook"), text_hint = c("Cole aqui a URL do veículo, se houver. É um item importante para nós, e nos ajudará na validação de sua sugestão - mas entendemos que alguns veículos nem site têm, por isso não é obrigatório. Agradecemos o esforço!", 
            "Se houver, favor colocar link direto para a conta - se souber apenas o nome (ex. @observatório), acrescente antes https://facebook.com/@nome da pagin"
            ), created_at = c("2019-10-22 00:33:45", "2019-10-22 00:33:45"
            ), updated_at = c("2019-10-22 00:33:45", "2019-10-22 00:33:45"
            ), deleted_at = c(NA, NA)), class = "data.frame", row.names = 1:2)), class = "data.frame", row.names = 1:2), 
            structure(list(id = 46403:46407, channel_id = c(1L, 3L, 
            9L, 5L, 2L), link = c("http://www.vozdonorte.com.br/", 
            "https://www.facebook.com/vozdonorte", "(68) 99932-3607", 
            "https://www.instagram.com/vozdonorte/", "https://twitter.com/VozdoNorte"
            ), media_id = c(14068L, 14068L, 14068L, 14068L, 14068L
            ), created_at = c("2021-02-15 14:21:40", "2021-02-15 14:21:41", 
            "2021-02-15 14:21:41", "2021-02-15 14:21:41", "2021-02-15 14:21:41"
            ), updated_at = c("2021-02-15 14:21:40", "2021-02-15 14:21:41", 
            "2021-02-15 14:21:41", "2021-02-15 14:21:41", "2021-02-15 14:21:41"
            ), deleted_at = c(NA, NA, NA, NA, NA), channel = structure(list(
                id = c(1L, 3L, 9L, 5L, 2L), name = c("Site", "Facebook", 
                "Whatsapp/Telefone", "Instagram", "Twitter"), text_hint = c("Cole aqui a URL do veículo, se houver. É um item importante para nós, e nos ajudará na validação de sua sugestão - mas entendemos que alguns veículos nem site têm, por isso não é obrigatório. Agradecemos o esforço!", 
                "Se houver, favor colocar link direto para a conta - se souber apenas o nome (ex. @observatório), acrescente antes https://facebook.com/@nome da pagin", 
                "Adicione o número de telefone, lembrando de colocar o DDD na frente", 
                "Adicione a URL do feed de Instagram", "Se houver, favor colocar link direto para a conta - se souber apenas o nome (ex. @observatório), acrescente antes https://twitter.com/@nome da pagina"
                ), created_at = c("2019-10-22 00:33:45", "2019-10-22 00:33:45", 
                "2020-08-20 22:43:52", "2020-05-09 16:58:26", "2019-10-22 00:33:45"
                ), updated_at = c("2019-10-22 00:33:45", "2019-10-22 00:33:45", 
                "2020-08-20 22:43:52", "2020-05-09 16:58:26", "2019-10-22 00:33:45"
                ), deleted_at = c(NA, NA, NA, NA, NA)), class = "data.frame", row.names = c(NA, 
            5L))), class = "data.frame", row.names = c(NA, 5L)), 
            structure(list(id = 46408:46412, channel_id = c(3L, 1L, 
            2L, 5L, 9L), link = c("https://www.facebook.com/pagina20net", 
            "https://www.pagina20.net/", "https://twitter.com/Pagina20online", 
            "https://www.instagram.com/pagina20net/", "(68) 3223-8051"
            ), media_id = c(9L, 9L, 9L, 9L, 9L), created_at = c("2021-02-15 14:34:46", 
            "2021-02-15 14:34:47", "2021-02-15 14:34:47", "2021-02-15 14:34:47", 
            "2021-02-15 14:34:47"), updated_at = c("2021-02-15 14:34:46", 
            "2021-02-15 14:34:47", "2021-02-15 14:34:47", "2021-02-15 14:34:47", 
            "2021-02-15 14:34:47"), deleted_at = c(NA, NA, NA, NA, 
            NA), channel = structure(list(id = c(3L, 1L, 2L, 5L, 
            9L), name = c("Facebook", "Site", "Twitter", "Instagram", 
            "Whatsapp/Telefone"), text_hint = c("Se houver, favor colocar link direto para a conta - se souber apenas o nome (ex. @observatório), acrescente antes https://facebook.com/@nome da pagin", 
            "Cole aqui a URL do veículo, se houver. É um item importante para nós, e nos ajudará na validação de sua sugestão - mas entendemos que alguns veículos nem site têm, por isso não é obrigatório. Agradecemos o esforço!", 
            "Se houver, favor colocar link direto para a conta - se souber apenas o nome (ex. @observatório), acrescente antes https://twitter.com/@nome da pagina", 
            "Adicione a URL do feed de Instagram", "Adicione o número de telefone, lembrando de colocar o DDD na frente"
            ), created_at = c("2019-10-22 00:33:45", "2019-10-22 00:33:45", 
            "2019-10-22 00:33:45", "2020-05-09 16:58:26", "2020-08-20 22:43:52"
            ), updated_at = c("2019-10-22 00:33:45", "2019-10-22 00:33:45", 
            "2019-10-22 00:33:45", "2020-05-09 16:58:26", "2020-08-20 22:43:52"
            ), deleted_at = c(NA, NA, NA, NA, NA)), class = "data.frame", row.names = c(NA, 
            5L))), class = "data.frame", row.names = c(NA, 5L)), 
            structure(list(id = 46413:46417, channel_id = c(1L, 2L, 
            3L, 9L, 5L), link = c("http://www.agazetadoacre.com/", 
            "https://twitter.com/agazetadoacre", "https://www.facebook.com/A-Gazeta-do-Acre-208579685881525/", 
            "(68) 3224-7776", "https://www.instagram.com/jornalagazetadoacre/"
            ), media_id = c(10L, 10L, 10L, 10L, 10L), created_at = c("2021-02-15 14:59:01", 
            "2021-02-15 14:59:01", "2021-02-15 14:59:02", "2021-02-15 14:59:02", 
            "2021-02-15 14:59:02"), updated_at = c("2021-02-15 14:59:01", 
            "2021-02-15 14:59:01", "2021-02-15 14:59:02", "2021-02-15 14:59:02", 
            "2021-02-15 14:59:02"), deleted_at = c(NA, NA, NA, NA, 
            NA), channel = structure(list(id = c(1L, 2L, 3L, 9L, 
            5L), name = c("Site", "Twitter", "Facebook", "Whatsapp/Telefone", 
            "Instagram"), text_hint = c("Cole aqui a URL do veículo, se houver. É um item importante para nós, e nos ajudará na validação de sua sugestão - mas entendemos que alguns veículos nem site têm, por isso não é obrigatório. Agradecemos o esforço!", 
            "Se houver, favor colocar link direto para a conta - se souber apenas o nome (ex. @observatório), acrescente antes https://twitter.com/@nome da pagina", 
            "Se houver, favor colocar link direto para a conta - se souber apenas o nome (ex. @observatório), acrescente antes https://facebook.com/@nome da pagin", 
            "Adicione o número de telefone, lembrando de colocar o DDD na frente", 
            "Adicione a URL do feed de Instagram"), created_at = c("2019-10-22 00:33:45", 
            "2019-10-22 00:33:45", "2019-10-22 00:33:45", "2020-08-20 22:43:52", 
            "2020-05-09 16:58:26"), updated_at = c("2019-10-22 00:33:45", 
            "2019-10-22 00:33:45", "2019-10-22 00:33:45", "2020-08-20 22:43:52", 
            "2020-05-09 16:58:26"), deleted_at = c(NA, NA, NA, NA, 
            NA)), class = "data.frame", row.names = c(NA, 5L))), class = "data.frame", row.names = c(NA, 
            5L)))), row.names = c(NA, 4L), class = "data.frame")

CodePudding user response：

A dplyr approach

We first need to extract the desired list-column with pull. Second, we can map the regular dplyr filter over its elements to keep only the rows with channel_id == 1, then select the link column. Finally, we bind the elements back together with bind_rows. If we would rather have a vector of links instead of a data.frame, we can replace bind_rows() with unlist() %>% unname().

library(dplyr)
library(purrr)

# Collect the website link (channel_id == 1) of every news vehicle.
# `vehicles` is the data frame from the question; its `media_channels`
# list-column holds one data frame of channels per vehicle.
vehicles %>%
    pull(media_channels) %>%            # list of per-vehicle data frames
    map(filter, channel_id == 1) %>%    # keep only the rows with channel_id 1
    map(select, link) %>%               # keep only the link column
    bind_rows()                         # stack the pieces into one data frame

                           link
1      www. Juruaemtempo.com.br
2 http://www.vozdonorte.com.br/
3     https://www.pagina20.net/
4 http://www.agazetadoacre.com/

CodePudding user response：

You can loop over the media_channels list

> names(vehicles)
 [1] "id"                  "nome_veiculo"        "media_source_id"     "fonte"               "segment_id"         
 [6] "segmento"            "city_id"             "municipio"           "codmun"              "state_id"           
[11] "uf"                  "region_id"           "regiao"              "address"             "annotations"        
[16] "email"               "employees_range_id"  "num_funcionarios"    "periodicity"         "another_periodicity"
[21] "eh_jornal"           "eh_site_pago"        "data_fechamento"     "ativo"               "data_inclusao"      
[26] "business_models"     "features"            "collaborators"       "media_channels"     
> vehicles$media_channels

.... (since each element of that list is a data frame with a regular column structure in its own right, you can use [.data.frame logic on it.) You can then extract the rows of each element that meet your criteria and finally select only the link column from those rows with this code:

# For each vehicle's media_channels data frame, keep the rows whose
# channel_id equals 1 and return their link values (one list element
# per vehicle).
lapply(vehicles$media_channels, function(channels) {
  is_site <- channels[["channel_id"]] == 1
  site_rows <- channels[is_site, ]
  site_rows$link
})
[[1]]
[1] "www. Juruaemtempo.com.br"

[[2]]
[1] "http://www.vozdonorte.com.br/"

[[3]]
[1] "https://www.pagina20.net/"

[[4]]
[1] "http://www.agazetadoacre.com/"