# Metadata: https://github.com/chrisrui/pydata-book
import json
import pandas as pd
import matplotlib.pyplot as plt
# Load the JSON database into `db`. The context manager closes the file
# handle as soon as parsing is done (the original left it open).
# NOTE(review): UTF-8 is assumed because it is the standard JSON encoding --
# confirm for this particular file.
with open('/home/baba/database.json', encoding='utf-8') as db_file:
    db = json.load(db_file)
# Number of top-level entries in the database.
len(db)
# Bare expression: only displays something in an interactive session.
db
# Output (truncated): 'group': 'Composition'}, {'value': 4.74, 'units': 'g', 'description': 'Carbohydrate, by difference', 'group': 'Composition'}, {'value': 0.8, 'units': 'g', 'description': 'Ash', 'group': 'Other'}, {'value': 63.0, 'units': 'kcal', 'description': 'Energy', 'group': 'Energy'}, {'value': 87.67, 'units': 'g', 'description': 'Water', 'group': 'Composition'}, {'value': 264.0, 'units': 'kJ', 'description': 'Energy', 'group': 'Energy'}, {'value': 0.0, 'units': 'g',
# Inspect the first record: its keys, then its first nutrient entry.
first_record = db[0]
first_record.keys()
first_record['nutrients'][0]
# Turn that record's nutrient entries into a table and preview seven rows.
nutrients = pd.DataFrame(first_record['nutrients'])
nutrients.head(7)
# Per-record fields to pull out of every database entry.
info_keys = ['description', 'group', 'id', 'manufacturer']
info_db = pd.DataFrame(db, columns=info_keys)
# Preview the first five rows. The closing bracket used to be stranded after
# the pasted output below, which made the file unparseable.
info_db[:5]
# Pasted notebook output, kept for reference. Its columns are the nutrient
# entry keys (value/units/description/group), so it appears to be the output
# of `nutrients[:7]` rather than of `info_db[:5]` -- TODO confirm.
#   value units description group
# 0 25.18 g Protein Composition
# 1 29.20 g Total lipid (fat) Composition
# 2 3.06 g Carbohydrate, by difference Composition
# 3 3.28 g Ash Other
# 4 376.00 kcal Energy Energy
# 5 39.28 g Water Composition
# 6 1573.00 kJ Energy Energy
info_db.info()
# Ten most frequent values of the `group` column. Uses the Series method
# because the top-level `pd.value_counts` function is deprecated.
info_db['group'].value_counts()[:10]
# Build one long nutrient table: one DataFrame per database record, each
# tagged with that record's `id`, then concatenated.
nutrient_frames = []
for rec in db:
    fnuts = pd.DataFrame(rec['nutrients'])
    fnuts['id'] = rec['id']
    nutrient_frames.append(fnuts)
# `nutrients_list` is a DataFrame from here on, despite its name; the name
# is kept because the following code refers to it.
nutrients_list = pd.concat(nutrient_frames, ignore_index=True)
nutrients_list
# Count fully duplicated rows, then drop them.
nutrients_list.duplicated().sum()
nutrients_list = nutrients_list.drop_duplicates()
# Both tables have `description` and `group` columns; rename them on each
# side so the two sets stay distinguishable after the merge.
col1 = {'description': 'food', 'group': 'fgroup'}
info_db = info_db.rename(columns=col1, copy=False)
info_db.info()

co2 = {'description': 'nutrient', 'group': 'nutgroup'}
nutrients_list = nutrients_list.rename(columns=co2, copy=False)
nutrients_list

# Outer join on `id`, so rows without a match on the other side are kept.
ndata = nutrients_list.merge(info_db, on='id', how='outer')
ndata.info()
# Spot-check a single merged row by position.
ndata.iloc[30000]
# Open a new figure for the chart below.
fig = plt.figure()
# 0.5 quantile (median) of `value` for each (nutrient, fgroup) pair.
values_by_pair = ndata.groupby(['nutrient', 'fgroup'])['value']
result = values_by_pair.quantile(q=0.5)
# Horizontal bar chart of the 'Zinc, Zn' medians per fgroup, sorted ascending.
zinc_medians = result['Zinc, Zn'].sort_values()
zinc_medians.plot(kind='barh')
# Group the merged table by (nutgroup, nutrient).
# NOTE: the misspelled name `by_nutrietn` is kept on purpose, since code
# outside this section may refer to it.
by_nutrietn = ndata.groupby(['nutgroup', 'nutrient'])


def get_maximum(x):
    """Return the row of *x* whose ``value`` is largest."""
    return x.loc[x.value.idxmax()]


def get_minimum(x):
    """Return the row of *x* whose ``value`` is smallest."""
    return x.loc[x.value.idxmin()]


# For every (nutgroup, nutrient) pair keep the row with the largest value,
# reduced to the `value` and `food` columns.
max_foods = by_nutrietn.apply(get_maximum)[['value', 'food']]
# Shorten each food name to at most 50 characters. The original assigned the
# DataFrame slice `max_foods[:50]` to the column, which does not truncate the
# strings -- presumably this truncation is what was intended.
max_foods['food'] = max_foods['food'].str[:50]
max_foods.loc['Amino Acids']['food']