How to add a dataframe to a dataframe for every iteration?-CodePudding

I have a folder which contains json files, every json file represents a drink. The files all look like below, they just have a different prize output. Below is the json file of coffee.json:

{
    "data": {
        "start_date": "2022-10-01",
        "end_date": "2022-10-04",
        "cur": "EUR",
        "prizes": {
            "2022-10-01": {
                "coffee": 0.1448939471560284
            },
            "2022-10-02": {
                "coffee": 0.14487923291390148
            },
            "2022-10-03": {
                "coffee": 0.1454857922753868
            }
        }
    }
}

Now I want to grab the first json file and create a dataframe of it, after that I want to grab the second json file and add only the column prizes to the first df, and grab the third json file and again add the prizes column etc.

This is what I tried:

base_path = r'C:\Users\Geo\Desktop\python-exer\json_files'

drinks_list = ['soda', 'water', 'coffee', 'coke', 'fanta']

for file in os.listdir(base_path):
    file_without_ext = file.split('.')[0]

    #check if file occurs in drinks_list
    if file_without_ext in drinks_list:
        with open(base_path    '/'   file,'r') as f:
            data = json.loads(f.read())
    
        df = pd.DataFrame(data['data']['prizes']).T.reset_index().rename(columns={'index': 'date'}).assign(cur=data['data']['cur'])
        print(df)

        if df.columns[1] != 'coffee':
            df.join(df[file_without_ext],  how='left', lsuffix='_left', rsuffix='_right')
        
        print(df)

This is my output:

         date    coffee cur
0  2022-10-01  0.06889  EUR
1  2022-10-02  0.06287  EUR
2  2022-10-03  0.07864  EUR
         date      soda cur
0  2022-10-01  0.94444  EUR
1  2022-10-02  0.86877  EUR
2  2022-10-03  0.68972  EUR
         date     water cur
0  2022-10-01  0.98751  EUR
1  2022-10-02  0.87982  EUR
2  2022-10-03  0.56677  EUR
         date     water cur
0  2022-10-01  0.98751  EUR
1  2022-10-02  0.87982  EUR
2  2022-10-03  0.56677  EUR

This is my desired output:

         date    coffee     soda        water      cur
0  2022-10-01  0.06889      0.94444     0.98751    EUR
1  2022-10-02  0.06287      0.86877     0.87982    EUR
2  2022-10-03  0.07864      0.68972     0.56677    EUR

CodePudding user response：

You can create DataFrames with MultiIndex by dates and currency, then append to list dfs and last join by concat:

base_path = r'C:\Users\Geo\Desktop\python-exer\json_files'

drinks_list = ['soda', 'water', 'coffee', 'coke', 'fanta']

dfs= []

for file in os.listdir(base_path):
    file_without_ext = file.split('.')[0]

    #check if file occurs in drinks_list
    if file_without_ext in drinks_list:
        with open(base_path    '/'   file,'r') as f:
            data = json.loads(f.read())
            df = pd.DataFrame(data['data']['prizes']).T.assign(cur = data['data']['cur']).set_index('cur', append=True)
            dfs.append(df)
            
final = pd.concat(dfs, axis=1).reset_index()

If there is always same currency, solution is simplier - last assign column currency:

base_path = r'C:\Users\Geo\Desktop\python-exer\json_files'
drinks_list = ['soda', 'water', 'coffee', 'coke', 'fanta']
dfs= []

for file in os.listdir(base_path):
    file_without_ext = file.split('.')[0]

    #check if file occurs in drinks_list
    if file_without_ext in drinks_list:
        with open(base_path    '/'   file,'r') as f:
            data = json.loads(f.read())
            df = pd.DataFrame(data['data']['prizes']).T
            
            cur = data['data']['cur']
            dfs.append(df)
            
final = pd.concat(dfs, axis=1).reset_index().assign(cur=cur)

Test first solution:

d1 = {
    "data": {
        "start_date": "2022-10-01",
        "end_date": "2022-10-04",
        "cur": "EUR",
        "prizes": {
            "2022-10-01": {
                "coffee": 0.1448939471560284
            },
            "2022-10-02": {
                "coffee": 0.14487923291390148
            },
            "2022-10-03": {
                "coffee": 0.1454857922753868
            }
        }
    }
}

d2 = {
    "data": {
        "start_date": "2022-10-01",
        "end_date": "2022-10-04",
        "cur": "EUR",
        "prizes": {
            "2022-10-01": {
                "water": 0.7448939471560284
            },
            "2022-10-02": {
                "water": 0.74487923291390148
            },
            "2022-10-03": {
                "water": 0.7454857922753868
            }
        }
    }
}

d3 = {
    "data": {
        "start_date": "2022-10-01",
        "end_date": "2022-10-04",
        "cur": "EUR",
        "prizes": {
            "2022-10-01": {
                "fanta": 0.88939471560284
            },
            "2022-10-02": {
                "fanta": 0.9923291390148
            },
            "2022-10-03": {
                "fanta": 0.957922753868
            }
        }
    }
}

dfs= []

for d in [d1, d2, d3]:
    data = d
    df = pd.DataFrame(data['data']['prizes']).T.assign(cur = data['data']['cur']).set_index('cur', append=True)
    dfs.append(df)
    
final = pd.concat(dfs, axis=1).rename_axis(('date','cur')).reset_index()
print (final)
         date  cur    coffee     water     fanta
0  2022-10-01  EUR  0.144894  0.744894  0.889395
1  2022-10-02  EUR  0.144879  0.744879  0.992329
2  2022-10-03  EUR  0.145486  0.745486  0.957923

And second solution:

dfs= []

for d in [d1, d2, d3]:
    data = d
    df = pd.DataFrame(data['data']['prizes']).T
    cur = data['data']['cur']
    dfs.append(df)
    
final = pd.concat(dfs, axis=1).rename_axis('date').reset_index().assign(cur=cur)
print (final)
         date    coffee     water     fanta  cur
0  2022-10-01  0.144894  0.744894  0.889395  EUR
1  2022-10-02  0.144879  0.744879  0.992329  EUR
2  2022-10-03  0.145486  0.745486  0.957923  EUR