I'm a beginner at R and I've been having some trouble with this specific data set.
I've downloaded the newsatlasbr package (https://github.com/voltdatalab/newsatlasbr), which was "created to give easier access to the datasets of the News Atlas project, which is an initiative that researches and maps news organizations in the Brazilian territory".
When I run organizations_state(uf = "All"), I have access to data from news vehicles from every single Brazilian state. The generated dataframe (let's say, vehicles <- organizations_state(uf = "All")) comes with columns such as:
id, nome_veiculo, media_source_id, fonte, segment_id, segmento, city_id, municipio, codmun, state_id, uf, region_id, media_channels
The most critical one for what I need to do is media_channels, a list type column that comes with columns such as:
id, channel_id, link, media_id, created_at
Now, I need to extract the link with channel_id == 1 for each news vehicle (that is, its website — channel_id == 3 would be its Facebook Page, and so on).
How can I do this?
vehicles df media_channels list
EDIT: reproducible code example
# Emit the first four rows of `vehicles` as R code that recreates them.
dput(x = head(x = vehicles, n = 4L))
structure(list(id = c(14030L, 14068L, 9L, 10L), nome_veiculo = c("JURUÁ EM TEMPO",
"VOZ DO NORTE", "PAGINA 20", "A GAZETA"), media_source_id = c(5L,
5L, 5L, 5L), fonte = c("Atlas da Notícia", "Atlas da Notícia",
"Atlas da Notícia", "Atlas da Notícia"), segment_id = c(1L, 1L,
1L, 1L), segmento = c("impresso", "impresso", "impresso", "impresso"
), city_id = c(6L, 6L, 15L, 15L), municipio = c("Cruzeiro do Sul",
"Cruzeiro do Sul", "Rio Branco", "Rio Branco"), codmun = c("1200203",
"1200203", "1200401", "1200401"), state_id = c(1L, 1L, 1L, 1L
), uf = c("AC", "AC", "AC", "AC"), region_id = c(3L, 3L, 3L,
3L), regiao = c("Norte", "Norte", "Norte", "Norte"), address = c(NA_character_,
NA_character_, NA_character_, NA_character_), annotations = c("não tenho certeza do e-mail do veículo",
NA, NA, NA), email = c("[email protected]", "[email protected]",
"[email protected]", "[email protected]"), employees_range_id = c(1L,
NA, NA, NA), num_funcionarios = c("1 a 5 colaboradores", NA,
NA, NA), periodicity = c("daily", "daily", "another", "another"
), another_periodicity = c(NA_character_, NA_character_, NA_character_,
NA_character_), eh_jornal = c(1L, 1L, 1L, 1L), eh_site_pago = c(1L,
NA, NA, NA), data_fechamento = c(NA, "2018-01", NA, NA), ativo = c(1L,
0L, 1L, 1L), data_inclusao = c("2019-11-25 04:41:23", "2019-11-26 00:13:43",
"2019-10-22 00:33:45", "2019-10-22 00:33:45"), business_models = list(
structure(list(id = 8L, name = "Publicidade (outros tipos)",
created_at = "2019-10-22 00:33:45", updated_at = "2019-10-22 00:33:45",
deleted_at = NA, pivot = structure(list(media_id = 14030L,
business_model_id = 8L), class = "data.frame", row.names = 1L)), class = "data.frame", row.names = 1L),
structure(list(id = 7:8, name = c("Prestação de serviços",
"Publicidade (outros tipos)"), created_at = c("2019-10-22 00:33:45",
"2019-10-22 00:33:45"), updated_at = c("2019-10-22 00:33:45",
"2019-10-22 00:33:45"), deleted_at = c(NA, NA), pivot = structure(list(
media_id = c(14068L, 14068L), business_model_id = 7:8), class = "data.frame", row.names = 1:2)), class = "data.frame", row.names = 1:2),
structure(list(), .Names = character(0), row.names = integer(0), class = "data.frame"),
structure(list(id = c(2L, 2L), name = c("Conteúdo patrocinado",
"Conteúdo patrocinado"), created_at = c("2019-10-22 00:33:45",
"2019-10-22 00:33:45"), updated_at = c("2019-10-22 00:33:45",
"2019-10-22 00:33:45"), deleted_at = c(NA, NA), pivot = structure(list(
media_id = c(10L, 10L), business_model_id = c(2L, 2L)), class = "data.frame", row.names = 1:2)), class = "data.frame", row.names = 1:2)),
features = list(structure(list(id = c(1L, 5L, 8L, 9L), name = c("Blogs",
"Opinião/colunistas", "Redes sociais", "Vídeos"), created_at = c("2019-10-22 00:33:45",
"2019-10-22 00:33:45", "2019-10-22 00:33:45", "2019-10-22 00:33:45"
), updated_at = c("2019-10-22 00:33:45", "2019-10-22 00:33:45",
"2019-10-22 00:33:45", "2019-10-22 00:33:45"), deleted_at = c(NA,
NA, NA, NA), pivot = structure(list(media_id = c(14030L,
14030L, 14030L, 14030L), feature_id = c(1L, 5L, 8L, 9L)), class = "data.frame", row.names = c(NA,
4L))), class = "data.frame", row.names = c(NA, 4L)), structure(list(
id = c(1L, 8L, 9L), name = c("Blogs", "Redes sociais",
"Vídeos"), created_at = c("2019-10-22 00:33:45", "2019-10-22 00:33:45",
"2019-10-22 00:33:45"), updated_at = c("2019-10-22 00:33:45",
"2019-10-22 00:33:45", "2019-10-22 00:33:45"), deleted_at = c(NA,
NA, NA), pivot = structure(list(media_id = c(14068L,
14068L, 14068L), feature_id = c(1L, 8L, 9L)), class = "data.frame", row.names = c(NA,
3L))), class = "data.frame", row.names = c(NA, 3L)),
structure(list(), .Names = character(0), row.names = integer(0), class = "data.frame"),
structure(list(id = c(4L, 4L, 7L, 7L), name = c("Noticiário em tempo real",
"Noticiário em tempo real", "Possui alguma edição impressa",
"Possui alguma edição impressa"), created_at = c("2019-10-22 00:33:45",
"2019-10-22 00:33:45", "2019-10-22 00:33:45", "2019-10-22 00:33:45"
), updated_at = c("2019-10-22 00:33:45", "2019-10-22 00:33:45",
"2019-10-22 00:33:45", "2019-10-22 00:33:45"), deleted_at = c(NA,
NA, NA, NA), pivot = structure(list(media_id = c(10L,
10L, 10L, 10L), feature_id = c(4L, 4L, 7L, 7L)), class = "data.frame", row.names = c(NA,
4L))), class = "data.frame", row.names = c(NA, 4L))),
collaborators = list(structure(list(id = 3L, name = "Estudante",
created_at = "2019-10-22 00:33:45", updated_at = "2019-10-22 00:33:45",
deleted_at = NA, pivot = structure(list(media_id = 14030L,
collaborator_id = 3L), class = "data.frame", row.names = 1L)), class = "data.frame", row.names = 1L),
structure(list(id = 3L, name = "Estudante", created_at = "2019-10-22 00:33:45",
updated_at = "2019-10-22 00:33:45", deleted_at = NA,
pivot = structure(list(media_id = 14068L, collaborator_id = 3L), class = "data.frame", row.names = 1L)), class = "data.frame", row.names = 1L),
structure(list(), .Names = character(0), row.names = integer(0), class = "data.frame"),
structure(list(id = c(8L, 8L, 9L, 9L, 11L, 11L), name = c("Pesquisador do Atlas da Notícia",
"Pesquisador do Atlas da Notícia", "Projor", "Projor",
"Volt Data Lab", "Volt Data Lab"), created_at = c("2019-10-22 00:33:45",
"2019-10-22 00:33:45", "2019-10-22 00:33:45", "2019-10-22 00:33:45",
"2019-10-22 00:33:45", "2019-10-22 00:33:45"), updated_at = c("2019-10-22 00:33:45",
"2019-10-22 00:33:45", "2019-10-22 00:33:45", "2019-10-22 00:33:45",
"2019-10-22 00:33:45", "2019-10-22 00:33:45"), deleted_at = c(NA,
NA, NA, NA, NA, NA), pivot = structure(list(media_id = c(10L,
10L, 10L, 10L, 10L, 10L), collaborator_id = c(8L, 8L,
9L, 9L, 11L, 11L)), class = "data.frame", row.names = c(NA,
6L))), class = "data.frame", row.names = c(NA, 6L))),
media_channels = list(structure(list(id = 8190:8191, channel_id = c(1L,
3L), link = c("www. Juruaemtempo.com.br", "https://www.facebook.com/cruzeirodosulinfo/"
), media_id = c(14030L, 14030L), created_at = c("2019-12-04 17:51:46",
"2019-12-04 17:51:46"), updated_at = c("2019-12-04 17:51:46",
"2019-12-04 17:51:46"), deleted_at = c(NA, NA), channel = structure(list(
id = c(1L, 3L), name = c("Site", "Facebook"), text_hint = c("Cole aqui a URL do veículo, se houver. É um item importante para nós, e nos ajudará na validação de sua sugestão - mas entendemos que alguns veículos nem site têm, por isso não é obrigatório. Agradecemos o esforço!",
"Se houver, favor colocar link direto para a conta - se souber apenas o nome (ex. @observatório), acrescente antes https://facebook.com/@nome da pagin"
), created_at = c("2019-10-22 00:33:45", "2019-10-22 00:33:45"
), updated_at = c("2019-10-22 00:33:45", "2019-10-22 00:33:45"
), deleted_at = c(NA, NA)), class = "data.frame", row.names = 1:2)), class = "data.frame", row.names = 1:2),
structure(list(id = 46403:46407, channel_id = c(1L, 3L,
9L, 5L, 2L), link = c("http://www.vozdonorte.com.br/",
"https://www.facebook.com/vozdonorte", "(68) 99932-3607",
"https://www.instagram.com/vozdonorte/", "https://twitter.com/VozdoNorte"
), media_id = c(14068L, 14068L, 14068L, 14068L, 14068L
), created_at = c("2021-02-15 14:21:40", "2021-02-15 14:21:41",
"2021-02-15 14:21:41", "2021-02-15 14:21:41", "2021-02-15 14:21:41"
), updated_at = c("2021-02-15 14:21:40", "2021-02-15 14:21:41",
"2021-02-15 14:21:41", "2021-02-15 14:21:41", "2021-02-15 14:21:41"
), deleted_at = c(NA, NA, NA, NA, NA), channel = structure(list(
id = c(1L, 3L, 9L, 5L, 2L), name = c("Site", "Facebook",
"Whatsapp/Telefone", "Instagram", "Twitter"), text_hint = c("Cole aqui a URL do veículo, se houver. É um item importante para nós, e nos ajudará na validação de sua sugestão - mas entendemos que alguns veículos nem site têm, por isso não é obrigatório. Agradecemos o esforço!",
"Se houver, favor colocar link direto para a conta - se souber apenas o nome (ex. @observatório), acrescente antes https://facebook.com/@nome da pagin",
"Adicione o número de telefone, lembrando de colocar o DDD na frente",
"Adicione a URL do feed de Instagram", "Se houver, favor colocar link direto para a conta - se souber apenas o nome (ex. @observatório), acrescente antes https://twitter.com/@nome da pagina"
), created_at = c("2019-10-22 00:33:45", "2019-10-22 00:33:45",
"2020-08-20 22:43:52", "2020-05-09 16:58:26", "2019-10-22 00:33:45"
), updated_at = c("2019-10-22 00:33:45", "2019-10-22 00:33:45",
"2020-08-20 22:43:52", "2020-05-09 16:58:26", "2019-10-22 00:33:45"
), deleted_at = c(NA, NA, NA, NA, NA)), class = "data.frame", row.names = c(NA,
5L))), class = "data.frame", row.names = c(NA, 5L)),
structure(list(id = 46408:46412, channel_id = c(3L, 1L,
2L, 5L, 9L), link = c("https://www.facebook.com/pagina20net",
"https://www.pagina20.net/", "https://twitter.com/Pagina20online",
"https://www.instagram.com/pagina20net/", "(68) 3223-8051"
), media_id = c(9L, 9L, 9L, 9L, 9L), created_at = c("2021-02-15 14:34:46",
"2021-02-15 14:34:47", "2021-02-15 14:34:47", "2021-02-15 14:34:47",
"2021-02-15 14:34:47"), updated_at = c("2021-02-15 14:34:46",
"2021-02-15 14:34:47", "2021-02-15 14:34:47", "2021-02-15 14:34:47",
"2021-02-15 14:34:47"), deleted_at = c(NA, NA, NA, NA,
NA), channel = structure(list(id = c(3L, 1L, 2L, 5L,
9L), name = c("Facebook", "Site", "Twitter", "Instagram",
"Whatsapp/Telefone"), text_hint = c("Se houver, favor colocar link direto para a conta - se souber apenas o nome (ex. @observatório), acrescente antes https://facebook.com/@nome da pagin",
"Cole aqui a URL do veículo, se houver. É um item importante para nós, e nos ajudará na validação de sua sugestão - mas entendemos que alguns veículos nem site têm, por isso não é obrigatório. Agradecemos o esforço!",
"Se houver, favor colocar link direto para a conta - se souber apenas o nome (ex. @observatório), acrescente antes https://twitter.com/@nome da pagina",
"Adicione a URL do feed de Instagram", "Adicione o número de telefone, lembrando de colocar o DDD na frente"
), created_at = c("2019-10-22 00:33:45", "2019-10-22 00:33:45",
"2019-10-22 00:33:45", "2020-05-09 16:58:26", "2020-08-20 22:43:52"
), updated_at = c("2019-10-22 00:33:45", "2019-10-22 00:33:45",
"2019-10-22 00:33:45", "2020-05-09 16:58:26", "2020-08-20 22:43:52"
), deleted_at = c(NA, NA, NA, NA, NA)), class = "data.frame", row.names = c(NA,
5L))), class = "data.frame", row.names = c(NA, 5L)),
structure(list(id = 46413:46417, channel_id = c(1L, 2L,
3L, 9L, 5L), link = c("http://www.agazetadoacre.com/",
"https://twitter.com/agazetadoacre", "https://www.facebook.com/A-Gazeta-do-Acre-208579685881525/",
"(68) 3224-7776", "https://www.instagram.com/jornalagazetadoacre/"
), media_id = c(10L, 10L, 10L, 10L, 10L), created_at = c("2021-02-15 14:59:01",
"2021-02-15 14:59:01", "2021-02-15 14:59:02", "2021-02-15 14:59:02",
"2021-02-15 14:59:02"), updated_at = c("2021-02-15 14:59:01",
"2021-02-15 14:59:01", "2021-02-15 14:59:02", "2021-02-15 14:59:02",
"2021-02-15 14:59:02"), deleted_at = c(NA, NA, NA, NA,
NA), channel = structure(list(id = c(1L, 2L, 3L, 9L,
5L), name = c("Site", "Twitter", "Facebook", "Whatsapp/Telefone",
"Instagram"), text_hint = c("Cole aqui a URL do veículo, se houver. É um item importante para nós, e nos ajudará na validação de sua sugestão - mas entendemos que alguns veículos nem site têm, por isso não é obrigatório. Agradecemos o esforço!",
"Se houver, favor colocar link direto para a conta - se souber apenas o nome (ex. @observatório), acrescente antes https://twitter.com/@nome da pagina",
"Se houver, favor colocar link direto para a conta - se souber apenas o nome (ex. @observatório), acrescente antes https://facebook.com/@nome da pagin",
"Adicione o número de telefone, lembrando de colocar o DDD na frente",
"Adicione a URL do feed de Instagram"), created_at = c("2019-10-22 00:33:45",
"2019-10-22 00:33:45", "2019-10-22 00:33:45", "2020-08-20 22:43:52",
"2020-05-09 16:58:26"), updated_at = c("2019-10-22 00:33:45",
"2019-10-22 00:33:45", "2019-10-22 00:33:45", "2020-08-20 22:43:52",
"2020-05-09 16:58:26"), deleted_at = c(NA, NA, NA, NA,
NA)), class = "data.frame", row.names = c(NA, 5L))), class = "data.frame", row.names = c(NA,
5L)))), row.names = c(NA, 4L), class = "data.frame")
CodePudding user response:
A dplyr approach
First, we need to extract the desired list-column with pull.
Second, we can map the regular dplyr filter over it to keep only the rows with channel_id == 1, and then select the link column.
Finally, we bind the elements back together with bind_rows.
If we would rather have a vector of links instead of a data.frame, we can replace bind_rows with unlist %>% unname.
library(dplyr)
library(purrr)
# Start from `vehicles` (the question's data frame); a bare `df` would
# resolve to the stats::df function in a fresh session and fail in pull().
vehicles %>%
  # Extract the list-column: one data frame of channels per vehicle.
  pull(media_channels) %>%
  # Skip elements without a channel_id column (e.g. zero-column data frames,
  # as seen in the sibling list-columns); filter() would error on them.
  keep(function(channels) "channel_id" %in% names(channels)) %>%
  # Keep only the website rows (channel_id == 1) of each element.
  map(filter, channel_id == 1) %>%
  # Reduce each element to its link column.
  map(select, link) %>%
  # Stack the per-vehicle results into a single data frame.
  bind_rows()
link
1 www. Juruaemtempo.com.br
2 http://www.vozdonorte.com.br/
3 https://www.pagina20.net/
4 http://www.agazetadoacre.com/
CodePudding user response:
You can loop over the media_channels list
> names(vehicles)
[1] "id" "nome_veiculo" "media_source_id" "fonte" "segment_id"
[6] "segmento" "city_id" "municipio" "codmun" "state_id"
[11] "uf" "region_id" "regiao" "address" "annotations"
[16] "email" "employees_range_id" "num_funcionarios" "periodicity" "another_periodicity"
[21] "eh_jornal" "eh_site_pago" "data_fechamento" "ativo" "data_inclusao"
[26] "business_models" "features" "collaborators" "media_channels"
> vehicles$media_channels
.... (Since each element is itself a data frame with a regular column structure, you can use [.data.frame logic on it.) Extract the rows that meet your criteria and then select only the link column from them with this code:
# For each vehicle, return the link(s) of its website channel (channel_id 1).
lapply(vehicles$media_channels, function(item) {
  # %in% never yields NA, so a missing channel_id cannot select a spurious
  # all-NA row the way `== 1` would.
  is_site <- item[["channel_id"]] %in% 1L
  item[is_site, ]$link
})
[[1]]
[1] "www. Juruaemtempo.com.br"
[[2]]
[1] "http://www.vozdonorte.com.br/"
[[3]]
[1] "https://www.pagina20.net/"
[[4]]
[1] "http://www.agazetadoacre.com/"
