Run an R script for all files in a directory, and store the outputs in one common data frame - CodePudding

I have a script that works fine for one file. It takes the information from a json file, extracts a list and a sublist of it (A), and then another list B with the third element of list A. It creates a data frame with list B and compares it with a master file. Finally, it provides two numbers: the number of elements in the list B and the number of matching elements of that list when comparing with the master file.

However, I have 180 different json files in a folder and I need to run the script for all of them, and build a data frame with the results for each file. So the final result should be something like this (note that the last line's figures are correct, the first two are fictitious):

The code I have so far is the following:

library(rjson)
library(dplyr)
library(tidyverse)

        #load data from file
file <- "./raw_data/whf.json"
json_data <- fromJSON(file = file)
org_name <- json_data$id

        # extract lists and the sublist
usernames <- json_data$twitter   
following <- usernames$following 

        # create empty vector to populate
longitud = length(following)
names <- vector(length = longitud)

        # loop to populate the empty vector with third element of the sub-list
for(i in 1:longitud){
    names[i] <- following[[i]][3]
}

        # create a data frame and change column name
names_list <- data.frame(sapply(names, c))
colnames(names_list) <- "usernames"

        # create a data frame with the correct formatting ready to comparison
org_handles <- data.frame(paste("@", names_list$usernames, sep=""))
colnames(org_handles) <- "Twitter"

        # load master file and select the needed columns
psa_handles <- read_csv(file = "./raw_data/psa_handles.csv") %>%
    select(Name, AKA, Twitter)

        # merge data frames and present the results
org_list <- inner_join(psa_handles, org_handles)
length(org_list$Twitter)
length(usernames$following)

My first attempt is to include this code at the beginning:

# Visit every entry of the working directory by position and parse it as JSON.
files <- list.files()
for (idx in seq_along(files)) {
  f <- files[[idx]]
  json_data <- fromJSON(file = f)

  # the rest of the script for one file here
}

but I do not know how to write the code for the data frame, or even how to integrate both ideas: the working script and the loop over the file names. I took the idea from here.

The new code after Alvaro Morales' answer is the following

library(rjson)
library(dplyr)
library(tidyverse)

# List every entry of ./raw_data/ (bare names: no full.names, no pattern).
archivos <- list.files("./raw_data/")
# Per-file worker, called once per element of the global `archivos` by
# map_dfr() below; its parameter (one file name) shadows that global.
calculate_accounts <- function(archivos){

        # load data from file; the folder is prepended because list.files()
        # above returned names without their path
path <- paste("./raw_data/", archivos, sep = "")
json_data <- fromJSON(file = path)
org_name <- json_data$id

        # extract lists and the sublist
usernames <- json_data$twitter   
following <- usernames$following 

        # create empty vector to populate
longitud = length(following)
names <- vector(length = longitud)

        # loop to populate the empty vector with third element of the sub-list
        # NOTE(review): when longitud is 0, 1:longitud is c(1, 0), so
        # following[[1]] is evaluated on an empty list; this looks like the
        # source of the "subscript out of bounds" error. seq_along(following)
        # would skip the loop instead.
for(i in 1:longitud){
    names[i] <- following[[i]][3]
}

        # create a data frame and change column name
names_list <- data.frame(sapply(names, c))
colnames(names_list) <- "usernames"

        # create a data frame with the correct formatting ready to comparison
        # (an "@" is prepended to every value)
org_handles <- data.frame(paste("@", names_list$usernames, sep=""))
colnames(org_handles) <- "Twitter"

        # load master file and select the needed columns
        # NOTE(review): this runs on every call, so the master file is re-read
        # once per JSON file
psa_handles <- read_csv(file = "./psa_handles.csv") %>%
    select(Name, AKA, Twitter)

        # merge data frames and present the results; no `by` is given, so the
        # join uses the columns the two tables have in common
org_list <- inner_join(psa_handles, org_handles)

        # NOTE(review): the last expression is an assignment, so the function
        # returns only accounts_total_org (invisibly); org_name and
        # accounts_db_org are never returned. map_dfr() row-binds the results,
        # so each call presumably needs to return a data frame - TODO confirm.
accounts_db_org <- length(org_list$Twitter)
accounts_total_org <- length(usernames$following)
}

# Apply the worker to every file name and row-bind the results.
table_psa <- map_dfr(archivos, calculate_accounts)

However, there is now an error at the join step (right after the message Joining, by = "Twitter"): it says subscript out of bounds.

Links to 3 test files to put together in raw_data folder:

https://drive.google.com/file/d/1ilUHwLjgtZCzh0LneIJEhTryrGumDF1V/view?usp=sharing

https://drive.google.com/file/d/1KM3hRZ8DzgPMEsMFmwBdmMNHrPCttuaB/view?usp=sharing

https://drive.google.com/file/d/17cWXJ9ltGXZ6izkgJv0uyNwStrE95_OA/view?usp=sharing

Link to the master file to compare:

https://drive.google.com/file/d/11fOpYFFfHijhZl_CuWHKvkrI7edkpUNQ/view?usp=sharing

<<<<< UPDATE >>>>>>

I am trying to find the solution. I got the code to run and produce a valid output (a 180x3 data frame), but the columns that should be filled with the values of the objects accounts_db_org and accounts_total_org are showing NA. When checking the values stored in those objects, they are correct (for the last iteration). So the output is now in the right format, but with NA instead of numbers.

I am really close, but I am not able to make the code show the right numbers. My last attempt is:

library(rjson)
library(dplyr)
library(tidyverse)


# Full paths of the files in ./raw_data whose name matches "json"; the master
# file is read once, before the loop.
archivos <- list.files("./raw_data", pattern = "json", full.names = TRUE)
psa_handles <- read_csv(file = "./raw_data/psa_handles.csv", show_col_types = FALSE) %>%
  select(Name, AKA, Twitter)

# Result container: one row per file, three columns, initially all NA.
nr_archivos <- length(archivos)
psa_result <- matrix(nrow = nr_archivos, ncol = 3)

# loop for working with all files, one by one
for(f in 1:nr_archivos){

# load file
    json_data <- fromJSON(file = archivos[f])
    org_name <- json_data$id

# extract lists and the sublist
    usernames <- json_data$twitter
    following <- usernames$following

# empty vector
    longitud = length(following)
    names <- vector(length = longitud)

# loop to populate with the third element of each i item of the sublist
# NOTE(review): 1:longitud is c(1, 0) when a file has no followed accounts,
# so following[[1]] fails on such a file.
    for(i in 1:longitud){
        names[i] <- following[[i]][3]
    }

# convert the list into a data frame
    names_list <- data.frame(sapply(names, c))
    colnames(names_list) <- "usernames"

# applying some format prior to comparison (an "@" is prepended to each value)
    org_handles <- data.frame(paste("@", names_list$usernames, sep=""))
    colnames(org_handles) <- "Twitter"

# merge tables and calculate the results for each iteration
    org_list <- inner_join(psa_handles, org_handles)
    accounts_db_org <- length(org_list$Twitter)
    accounts_total_org <- length(usernames$following)

# populate the matrix row by row
# NOTE(review): psa_result[f] addresses a single cell (row f of column 1), not
# a row, so only the first of the three values is stored and columns 2 and 3
# keep their initial NA. psa_result[f, ] would address the whole row.
psa_result[f] <- c(org_name, accounts_db_org, accounts_total_org)
}

# create a data frame from the matrix and save the result
psa_result <- data.frame(psa_result)
write_csv(psa_result, file = "./outputs/cuentas_seguidas_en_psa.csv")

The subscript out of bounds error was caused by a json file with 0 records. That was fixed by deleting the file.

CodePudding user response：

You can do it with purrr::map or purrr::map_dfr.

Is this what you are looking for?

# Full paths of every file in ./raw_data whose name matches "json".
archivos <- list.files("./raw_data", pattern = "json", full.names = TRUE)

# The master file is read a single time, outside calculate_accounts, and
# reduced to the columns needed for the comparison.
psa_handles <- select(
  read_csv(file = "./raw_data/psa_handles.csv"),
  Name, AKA, Twitter
)

# For one JSON file, return the master-file rows whose handle the organisation
# follows, with the two counts repeated on every row.
calculate_accounts <- function(archivo) {
  parsed <- rjson::fromJSON(file = archivo)

  # "@"-prefixed handle of every followed account.
  followed <- map_chr(pluck(parsed, "twitter", "following"), "username")
  org_handles <- tibble(Twitter = str_c("@", followed))

  matched <- inner_join(psa_handles, org_handles)

  # Number of matches and total number of followed accounts.
  n_matched <- nrow(matched)
  n_followed <- nrow(org_handles)

  matched %>%
    mutate(accounts_db_org = n_matched,
           accounts_total_org = n_followed) %>%
    select(-Twitter)
}

table_psa <- map_dfr(archivos, calculate_accounts)

#output:
# A tibble: 53 x 4
   Name                                                                       AKA   accounts_db_org accounts_total_org
   <chr>                                                                      <chr>           <int>              <int>
 1 Association of American Medical Colleges                                   AAMC               20               2924
 2 American College of Cardiology                                             ACC                20               2924
 3 American Heart Association                                                 AHA                20               2924
 4 British Association of Dermatologists                                      BAD                20               2924
 5 Canadian Psoriasis Network                                                 CPN                20               2924
 6 Canadian Skin Patient Alliance                                             CSPA               20               2924
 7 European Academy of Dermatology and Venereology                            EADV               20               2924
 8 European Society for Dermatological Research                               ESDR               20               2924
 9 US Department of Health and Human Service                                  HHS                20               2924
10 International Alliance of Dermatology Patients Organisations (Global Skin) IADPO              20               2924
# ... with 43 more rows

CodePudding user response：

Unfortunately, the answer provided by Álvaro does not work as expected, since the output repeats the same number with different organisation names, making it really difficult to read. Actually, the number 20 is repeated 20 times, the number 11, 11 times, and so on. The information is there, but it is not accessible without further data treatment.

I was doing my own research in the meantime and arrived at the following code. I finally made it work, but the data format was "matrix" "array", which was really confusing. Fortunately, I wrote the last lines to transpose the data, unlist the array and convert it into a matrix, which can then be converted into a data frame and manipulated as usual.

Maybe my explanation is not very useful, and since I am a newbie, I am sure the code is far from being elegant and optimised. Anyway, please review the code below:

library(purrr)
library(rjson)
library(dplyr)
library(tidyverse)
# NOTE(review): hard-coded, machine-specific working directory; every relative
# path below depends on it. Prefer running the script from the project root.
setwd("~/documentos/varios/proyectos/programacion/R/psa_twitter")

# Build the list of files to process and read the master file of database
# accounts.
# `pattern` is a regular expression: the dot is escaped and the end anchored
# so that only names ending in ".json" are matched.
archivos <- list.files("./raw_data/json_files",
                       pattern = "\\.json$",
                       full.names = TRUE)
psa_handles <- read_csv(file = "./raw_data/psa_handles.csv") %>%
    select(Name, AKA, Twitter)

nr_archivos <- length(archivos)

# Summarise one organisation's JSON file as a one-row data frame.
#
# Args:
#   a: path to the organisation's JSON file.
#
# Returns:
#   A one-row data frame with columns Name (metadata$company), AKA (id),
#   Cuentas_db (followed accounts whose handle is also in `psa_handles`) and
#   Total (all followed accounts).
#
# Relies on the global `psa_handles` table, which must have a `Twitter` column.
calcula_cuentas <- function(a) {
    # Load the organisation's data from the JSON file.
    json_data <- fromJSON(file = a)
    org_aka <- json_data$id
    org_name <- json_data$metadata$company

    # List of followed accounts inside the "twitter" entry.
    following <- json_data$twitter$following

    # Third field of every followed account (presumably the user name;
    # confirm against the JSON schema). vapply() returns character(0) for an
    # empty list, whereas `for (i in 1:longitud)` iterated over c(1, 0) and
    # failed with "subscript out of bounds" on files with no records.
    usernames <- vapply(following,
                        function(cuenta) as.character(cuenta[[3]]),
                        character(1),
                        USE.NAMES = FALSE)

    # Prefix "@" so the values are Twitter handles rather than user names.
    # sprintf() keeps a zero-length input zero-length (paste0() gives "@").
    org_handles <- data.frame(Twitter = sprintf("@%s", usernames))

    # Keep only the rows common to both tables; the key is explicit so no
    # "Joining, by" message is printed for every file.
    org_list <- inner_join(psa_handles, org_handles, by = "Twitter")

    data.frame(Name = org_name,
               AKA = org_aka,
               Cuentas_db = nrow(org_list),
               Total = length(following))
}

# Run the calculation for every file and stack the one-row results.
# bind_rows() keeps each column's type (the two counts stay numeric), whereas
# sapply() + t() + matrix(unlist()) turned every value into character. The
# intermediate psa1 is no longer needed.
psa <- lapply(archivos, calcula_cuentas)
psa2 <- bind_rows(psa)
colnames(psa2) <- c("Name", "AKA", "tw_int_outbound", "tw_ext_outbound")

save(psa2, file = "rda/psa.rda")