I have a folder which contains json files, every json file represents a drink. The files all look like below, they just have a different prize output. Below is the json file of coffee.json:
{
"data": {
"start_date": "2022-10-01",
"end_date": "2022-10-04",
"cur": "EUR",
"prizes": {
"2022-10-01": {
"coffee": 0.1448939471560284
},
"2022-10-02": {
"coffee": 0.14487923291390148
},
"2022-10-03": {
"coffee": 0.1454857922753868
}
}
}
}
Now I want to grab the first json file and create a dataframe of it, after that I want to grab the second json file and add only the column prizes to the first df, and grab the third json file and again add the prizes column etc.
This is what I tried:
base_path = r'C:\Users\Geo\Desktop\python-exer\json_files'
drinks_list = ['soda', 'water', 'coffee', 'coke', 'fanta']
for file in os.listdir(base_path):
file_without_ext = file.split('.')[0]
#check if file occurs in drinks_list
if file_without_ext in drinks_list:
with open(base_path '/' file,'r') as f:
data = json.loads(f.read())
df = pd.DataFrame(data['data']['prizes']).T.reset_index().rename(columns={'index': 'date'}).assign(cur=data['data']['cur'])
print(df)
if df.columns[1] != 'coffee':
df.join(df[file_without_ext], how='left', lsuffix='_left', rsuffix='_right')
print(df)
This is my output:
date coffee cur
0 2022-10-01 0.06889 EUR
1 2022-10-02 0.06287 EUR
2 2022-10-03 0.07864 EUR
date soda cur
0 2022-10-01 0.94444 EUR
1 2022-10-02 0.86877 EUR
2 2022-10-03 0.68972 EUR
date water cur
0 2022-10-01 0.98751 EUR
1 2022-10-02 0.87982 EUR
2 2022-10-03 0.56677 EUR
date water cur
0 2022-10-01 0.98751 EUR
1 2022-10-02 0.87982 EUR
2 2022-10-03 0.56677 EUR
This is my desired output:
date coffee soda water cur
0 2022-10-01 0.06889 0.94444 0.98751 EUR
1 2022-10-02 0.06287 0.86877 0.87982 EUR
2 2022-10-03 0.07864 0.68972 0.56677 EUR
CodePudding user response:
You can create DataFrames with MultiIndex by dates and currency, then append to list dfs and last join by concat:
base_path = r'C:\Users\Geo\Desktop\python-exer\json_files'
drinks_list = ['soda', 'water', 'coffee', 'coke', 'fanta']
dfs= []
for file in os.listdir(base_path):
file_without_ext = file.split('.')[0]
#check if file occurs in drinks_list
if file_without_ext in drinks_list:
with open(base_path '/' file,'r') as f:
data = json.loads(f.read())
df = pd.DataFrame(data['data']['prizes']).T.assign(cur = data['data']['cur']).set_index('cur', append=True)
dfs.append(df)
final = pd.concat(dfs, axis=1).reset_index()
If there is always same currency, solution is simplier - last assign column currency:
base_path = r'C:\Users\Geo\Desktop\python-exer\json_files'
drinks_list = ['soda', 'water', 'coffee', 'coke', 'fanta']
dfs= []
for file in os.listdir(base_path):
file_without_ext = file.split('.')[0]
#check if file occurs in drinks_list
if file_without_ext in drinks_list:
with open(base_path '/' file,'r') as f:
data = json.loads(f.read())
df = pd.DataFrame(data['data']['prizes']).T
cur = data['data']['cur']
dfs.append(df)
final = pd.concat(dfs, axis=1).reset_index().assign(cur=cur)
Test first solution:
d1 = {
"data": {
"start_date": "2022-10-01",
"end_date": "2022-10-04",
"cur": "EUR",
"prizes": {
"2022-10-01": {
"coffee": 0.1448939471560284
},
"2022-10-02": {
"coffee": 0.14487923291390148
},
"2022-10-03": {
"coffee": 0.1454857922753868
}
}
}
}
d2 = {
"data": {
"start_date": "2022-10-01",
"end_date": "2022-10-04",
"cur": "EUR",
"prizes": {
"2022-10-01": {
"water": 0.7448939471560284
},
"2022-10-02": {
"water": 0.74487923291390148
},
"2022-10-03": {
"water": 0.7454857922753868
}
}
}
}
d3 = {
"data": {
"start_date": "2022-10-01",
"end_date": "2022-10-04",
"cur": "EUR",
"prizes": {
"2022-10-01": {
"fanta": 0.88939471560284
},
"2022-10-02": {
"fanta": 0.9923291390148
},
"2022-10-03": {
"fanta": 0.957922753868
}
}
}
}
dfs= []
for d in [d1, d2, d3]:
data = d
df = pd.DataFrame(data['data']['prizes']).T.assign(cur = data['data']['cur']).set_index('cur', append=True)
dfs.append(df)
final = pd.concat(dfs, axis=1).rename_axis(('date','cur')).reset_index()
print (final)
date cur coffee water fanta
0 2022-10-01 EUR 0.144894 0.744894 0.889395
1 2022-10-02 EUR 0.144879 0.744879 0.992329
2 2022-10-03 EUR 0.145486 0.745486 0.957923
And second solution:
dfs= []
for d in [d1, d2, d3]:
data = d
df = pd.DataFrame(data['data']['prizes']).T
cur = data['data']['cur']
dfs.append(df)
final = pd.concat(dfs, axis=1).rename_axis('date').reset_index().assign(cur=cur)
print (final)
date coffee water fanta cur
0 2022-10-01 0.144894 0.744894 0.889395 EUR
1 2022-10-02 0.144879 0.744879 0.992329 EUR
2 2022-10-03 0.145486 0.745486 0.957923 EUR
