粮农组织的三个主要目标是:
- 消除饥饿、粮食不安全和营养不良
- 消除贫困,推动经济和社会进步
- 自然资源的可持续管理和利用,包括土地、水、空气、气候和遗传资源,以造福今世后代。
为支持这些目标,《章程》第1条要求粮农组织“收集、分析、解释和传播与营养、粮食和农业有关的信息”。因此,AQUASTAT 项目应运而生,其目的是通过收集、分析和传播与水资源、用水以及农业用水管理有关的信息,为实现粮农组织的目标作出贡献,重点关注非洲、亚洲、拉丁美洲和加勒比地区的国家。
联合国粮农组织(通过 AQUASTAT)提供以下方面的数据、元数据、报告、国家概况、流域概况、区域分析、地图、表格、空间数据、指南和其他在线工具:
- 水资源:内部、跨界、总
- 水的用途:按部门,按来源,废水
- 灌溉:地点、面积、类型、技术、作物
- 水坝:位置,高度,容量,表面积
- 与水有关的机构、政策和立法
Load the data
# Read the gzip-compressed CSV into a DataFrame.
data = pd.read_csv('aquastat.csv.gzip', compression='gzip')
# Preview the first rows.
data.head()
# (rows, columns) of the loaded table.
data.shape
(143280, 7)
# Summarise column dtypes and non-null counts.
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 143280 entries, 0 to 143279
Data columns (total 7 columns):
country 143280 non-null object
region 143280 non-null object
variable 143280 non-null object
variable_full 143280 non-null object
time_period 143280 non-null object
year_measured 96411 non-null float64
value 96411 non-null float64
dtypes: float64(2), object(5)
memory usage: 7.7+ MB
# List every short variable code next to its full description (attribute / unit).
data[['variable','variable_full']].drop_duplicates()
# total_area               total area of the country (1000 ha)
# arable_land              arable land area
# permanent_crop_area      permanent crops area
# cultivated_area          cultivated area
# percent_cultivated       share of total area that is cultivated
# total_pop                total population
# rural_pop                rural population
# urban_pop                urban population
# gdp                      gross domestic product
# gdp_per_capita           GDP per capita
# agg_to_gdp               agriculture, value added to GDP
# human_dev_index          human development index
# gender_inequal_index     gender inequality index
# percent_undernourished   prevalence of undernourishment
# avg_annual_rain_depth    long-term average annual precipitation
# national_rainfall_index  national rainfall index
- 某个时间段国家与variable内属性的值
def time_slice(df, time_period):
    """Return a country-by-variable table of values for one time period.

    Args:
        df: Long-format frame with ``time_period``, ``country``,
            ``variable`` and ``value`` columns.
        time_period: Label compared against ``df.time_period``.

    Returns:
        DataFrame indexed by country with one column per variable; the
        columns axis is named after ``time_period``.

    Raises:
        ValueError: From ``DataFrame.pivot`` when a (country, variable)
            pair occurs more than once within the period.
    """
    in_period = df[df.time_period == time_period]
    # Pivot the long rows into a wide country x variable table.
    wide = in_period.pivot(index='country', columns='variable', values='value')
    # Name the columns axis after the period so the slice is self-describing.
    wide.columns.name = time_period
    return wide
# `time_periods` is not defined anywhere earlier in the visible notebook, so
# derive it from the data (unique labels in order of first appearance).
# NOTE(review): if an earlier, unseen cell already defines it, confirm both agree.
time_periods = data.time_period.unique()
time_slice(data, time_periods[0]).head()
- 某个国家variable属性的值随时间变化
def country_slice(df, country):
    """Return a variable-by-time-period table of values for one country.

    Args:
        df: Long-format frame with ``country``, ``variable``,
            ``time_period`` and ``value`` columns.
        country: Name compared against ``df.country``.

    Returns:
        DataFrame indexed by variable with one column per time period; the
        index is named after ``country``.

    Raises:
        ValueError: From ``DataFrame.pivot`` when a (variable, time_period)
            pair occurs more than once for the country.
    """
    # Only keep rows for the country of interest.
    for_country = df[df.country == country]
    wide = for_country.pivot(index='variable', columns='time_period', values='value')
    # Name the index after the country so the table is self-describing.
    wide.index.name = country
    return wide
- variable中的某一属性的值各个国家随时间变化
def variable_slice(df, variable):
    """Return a country-by-time-period table of values for one variable.

    Args:
        df: Long-format frame with ``variable``, ``country``,
            ``time_period`` and ``value`` columns.
        variable: Variable code compared against ``df.variable``.

    Returns:
        DataFrame indexed by country with one column per time period.

    Raises:
        ValueError: From ``DataFrame.pivot`` when a (country, time_period)
            pair occurs more than once for the variable.
    """
    # Only keep rows for the requested variable.
    for_variable = df[df.variable == variable]
    # One row per country, one column per time period.
    return for_variable.pivot(index='country', columns='time_period', values='value')
- 某个国家 variable某一值随时间变化
def time_series(df, country, variable):
    """Return the yearly values of one variable for one country.

    Args:
        df: Long-format frame with ``country``, ``variable``,
            ``year_measured`` and ``value`` columns.
        country: Name compared against ``df.country``.
        variable: Variable code compared against ``df.variable``.

    Returns:
        Single-column DataFrame named ``variable``, indexed by the integer
        ``year_measured``. Rows lacking a year or a value are dropped.
    """
    # Only take data for the country/variable combination.
    selected = df[(df.country == country) & (df.variable == variable)]
    # Drop years with no data. Restricting to the two columns we use keeps a
    # valid measurement even when an unrelated column is missing.
    measured = selected.dropna(subset=['year_measured', 'value'])
    # Explicit copy so the dtype change below never writes through to `df`.
    series = measured[['year_measured', 'value']].copy()
    series['year_measured'] = series['year_measured'].astype(int)
    series = series.set_index('year_measured')
    series.columns = [variable]
    return series
- 提取单个区域的函数
def subregion(data, region):
    """Return the rows of ``data`` whose ``region`` column equals ``region``."""
    in_region = data.region == region
    return data[in_region]
缺失值
By variable
# Cross-section for 2013-2017: one row per country, one column per variable.
recent = time_slice(data, '2013-2017')
# Plot which cells are missing (presumably `msno` is the missingno package — confirm).
msno.matrix(recent, labels=True)
水资源总量
# Total exploitable water resources: missing values per country (rows) and time period (columns).
msno.matrix(variable_slice(data, 'exploitable_total'), inline=False, sort='descending');
plt.xlabel('Time period');
plt.ylabel('Country');
# Trailing newlines in the title push it above the plot's top labels.
plt.title('Missing total exploitable water resources data across countries and time periods \n \n \n \n');
只有一小部分国家报告了可利用的水资源总量,这些国家中只有极少数国家拥有最近一段时间的数据。
我们将删除该变量,因为这么少的数据点会导致很多问题。
In [10]:
# Remove every variable whose name contains 'exploitable' (`~` inverts the match mask).
data = data.loc[~data.variable.str.contains('exploitable'),:]
national_rainfall_index 全国降水指数(NRI)(毫米/年)
national_rainfall_index 全国降水指数(NRI)(毫米/年)
# National rainfall index: missing values per country (rows) and time period (columns).
msno.matrix(variable_slice(data, 'national_rainfall_index'),
inline=False, sort='descending');
plt.xlabel('Time period');
plt.ylabel('Country');
# Trailing newlines in the title push it above the plot's top labels.
plt.title('Missing national rainfall index data across countries and time periods \n \n \n \n');
全国降雨指数在2002以后不再报告。
# Keep every row except those of the national rainfall index variable.
data = data[data.variable != 'national_rainfall_index']
北美洲国家(以单个区域为例)
# Rows belonging to the 'North America' region.
north_america = subregion(data, 'North America')
# Completeness of each indicator: the 2013-2017 cross-section is sorted by
# nullity and transposed, so countries become columns and variables rows.
msno.matrix(msno.nullity_sort(time_slice(north_america, '2013-2017'), sort='descending').T, inline=False)
#plt.title('Fraction of fields complete by country for North America \n \n');
查看空值较多的Bahamas的属性随时间变化
# Bahamas table transposed so variables are columns, then filtered by column
# completeness. NOTE(review): presumably `filter='bottom', p=0.1` keeps the
# least-complete columns — confirm against the missingno documentation.
msno.nullity_filter(country_slice(data, 'Bahamas').T, filter='bottom', p=0.1)
随时间变化各属性的值的热力图
fig, ax = plt.subplots(figsize=(16, 16));
# Count non-null values per (time_period, variable), then reshape so that
# variables are rows and time periods are columns.
sns.heatmap(data.groupby(['time_period','variable']).value.count().unstack().T , ax=ax);
plt.xticks(rotation=45);
plt.xlabel('Time period');
plt.ylabel('Variable');
plt.title('Number of countries with data reported for each variable over time');
农粮数据成图
总人口直方图
# Histogram of total population across countries in the 2013-2017 cross-section.
fig, ax = plt.subplots(figsize=(12, 8))
ax.hist(recent.total_pop.values, bins=50);
ax.set_xlabel('Total population');
ax.set_ylabel('Number of countries');
ax.set_title('Distribution of population of countries 2013-2017');
def plot_hist(df, variable, bins=20, xlabel=None, by=None,
              ylabel=None, title=None, logx=False, ax=None):
    """Plot a histogram of ``df[variable]``, optionally on a log x-axis.

    Args:
        df: DataFrame holding the column to plot. It is not modified.
        variable: Name of the column to plot.
        bins: Number of bins (or explicit edges) passed to ``Axes.hist``.
        xlabel, ylabel, title: Optional axis labels and title.
        by: Accepted for signature compatibility; unused in this version.
        logx: If true, use log-spaced bins and a log-scaled x-axis. Data
            containing values <= 0 is shifted so its minimum becomes 1, and
            a warning reporting the shift is printed.
        ax: Axes to draw on; a new 12x8 figure is created when omitted.

    Returns:
        The Axes that was drawn on.
    """
    if ax is None:
        fig, ax = plt.subplots(figsize=(12, 8))
    # dropna() returns a new Series, so the caller's frame is never written to.
    values = df[variable].dropna()
    if logx:
        if not values.empty:
            lowest = values.min()
            if lowest <= 0:
                # Compute the shift before applying it so the warning reports
                # the amount actually added.
                shift = 1 - lowest
                values = values + shift
                print('Warning: data <=0 exists, data transformed by %0.2g before plotting' % shift)
            if isinstance(bins, (int, np.integer)):
                # N bins need N + 1 edges.
                bins = np.logspace(np.log10(values.min()),
                                   np.log10(values.max()), bins + 1)
        ax.set_xscale("log")
    ax.hist(values.values, bins=bins)
    if xlabel:
        ax.set_xlabel(xlabel)
    if ylabel:
        ax.set_ylabel(ylabel)
    if title:
        ax.set_title(title)
    return ax
- 对总人口取对数,使其分布更接近正态分布
# Same population histogram, drawn with log-spaced bins on a log x-axis.
plot_hist(recent, 'total_pop', bins=25, logx=True,
xlabel='Log of total population', ylabel='Number of countries',
title='Distribution of total population of countries 2013-2017');
- 总人口折线图
# One population line per North American country, coloured from a diverging
# palette. The plotting stays inside the `with` block so the temporary
# palette applies to the axes created by the first `plt.plot` call.
with sns.color_palette(sns.diverging_palette(220, 280, s=85, l=25, n=23)):
    # Country names ordered by their total population in 1958-1962.
    north_america = time_slice(subregion(data, 'North America'), '1958-1962').sort_values('total_pop').index.tolist()
    for country in north_america:
        plt.plot(time_series(data, country, 'total_pop'), label=country);
    plt.xlabel('Year');
    plt.ylabel('Population');
    plt.title('North American populations over time');
    plt.legend(loc=2,prop={'size':10});
除了能看出北美洲哪个国家人口最多之外,这张图几乎没有告诉我们其他信息。我们想了解的是每个国家的人口如何随时间变化,而且主要是相对于其自身的变化。
我们应该通过什么标准化?我们可以选择一个国家的最小、平均、中位数、最大值…或任何其他位置
让我们选择最小值,这样我们就能看到每个国家在起始人口上的增长。
增长倍率折线图
# Same line plot, with each country's population expressed as a percentage of
# that country's own minimum so growth is comparable across countries.
with sns.color_palette(sns.diverging_palette(220, 280, s=85, l=25, n=23)):
    for country in north_america:
        ts = time_series(data, country, 'total_pop')
        # 100 means "equal to the smallest population observed for this country".
        ts['norm_pop'] = ts.total_pop/ts.total_pop.min()*100
        plt.plot(ts['norm_pop'], label=country);
    plt.xlabel('Year');
    plt.ylabel('Percent increase in population');
    plt.title('Percent increase in population from 1960 in North American countries');
    plt.legend(loc=2,prop={'size':10});
# Country x time-period table of total population for North America.
north_america_pop = variable_slice(subregion(data, 'North America'), 'total_pop')
# Divide each row by that country's minimum and scale to percent.
north_america_norm_pop = north_america_pop.div(north_america_pop.min(axis=1), axis=0)*100
# Reorder rows to follow the country list stored in `north_america`.
north_america_norm_pop = north_america_norm_pop.loc[north_america]
总人口热力图
# Heatmap of the normalised populations: countries as rows, time periods as columns.
fig, ax = plt.subplots(figsize=(16, 12));
sns.heatmap(north_america_norm_pop, ax=ax, cmap=sns.light_palette((214, 90, 60), input="husl", as_cmap=True));
plt.xticks(rotation=45);
plt.xlabel('Time period');
plt.ylabel('Country, ordered by population in 1960 (<- greatest to least ->)');
plt.title('Percent increase in population from 1960');
相关性
# Reload the raw table from the shared data directory.
data = pd.read_csv('../../data/aquastat/aquastat.csv.gzip', compression='gzip')
# simplify regions (raises KeyError for a region missing from `simple_regions`)
data.region = data.region.apply(lambda x: simple_regions[x])
# remove exploitable fields and national rainfall index
data = data.loc[~data.variable.str.contains('exploitable'),:]
data = data.loc[~(data.variable=='national_rainfall_index')]
# Uncomment to print out variable names and explanations
# data[['variable','variable_full']].drop_duplicates()
# Subset for cross-sectional analysis
recent = time_slice(data, '2013-2017')
# Correlation of every variable with GDP per capita, excluding the two GDP columns.
recent_corr = recent.corr().loc['gdp_per_capita'].drop(['gdp','gdp_per_capita'])
人均GDP与季节变化性(seasonal variability)的散点图
# seasonal_variability: seasonal variability (WRI)
# recent.drop('gdp_bin', axis=1).astype(float).plot(x='seasonal_variability',y='gdp_per_capita', kind='scatter');
# Scatter of GDP per capita against seasonal variability for 2013-2017.
plt.scatter(recent.seasonal_variability, recent.gdp_per_capita)
plt.xlabel('Seasonal variability');
plt.ylabel('GDP per capita ($USD/person)');
各变量与人均GDP相关性的条形图
def conditional_bar(series, bar_colors=None, color_labels=None, figsize=(13,24),
                    xlabel=None, by=None, ylabel=None, title=None):
    """Draw a horizontal bar chart of ``series`` with per-bar colours.

    Args:
        series: pandas Series; values give bar lengths, index gives labels.
        bar_colors: A colour or sequence of colours for the bars. When
            omitted or empty, the first colour of the active cycle is used.
        color_labels: Optional mapping of colour -> legend label.
        figsize: Figure size passed to ``plt.subplots``.
        xlabel, ylabel, title: Optional axis labels and title.
        by: Accepted for signature compatibility; unused.

    Returns:
        The Figure. It is closed before returning (presumably so a notebook
        does not render it twice — confirm).
    """
    fig, ax = plt.subplots(figsize=figsize)
    # Explicit None/empty check: truthiness of an array or Series raises.
    if bar_colors is None or len(bar_colors) == 0:
        # `plt.rcParams` avoids needing a separate `mpl` import.
        bar_colors = plt.rcParams['axes.prop_cycle'].by_key()['color'][0]
    positions = list(range(len(series)))
    ax.barh(positions, series.values, color=bar_colors)
    ax.set_xlabel(xlabel if xlabel else '')
    ax.set_ylabel(ylabel if ylabel else '')
    ax.set_yticks(positions)
    ax.set_yticklabels(series.index.tolist())
    ax.set_title(title if title else '')
    ax.set_ylim([-1, len(series)])
    if color_labels:
        # Empty marker-only plots exist solely to create legend entries.
        for col, lab in color_labels.items():
            ax.plot([], linestyle='', marker='s', c=col, label=lab)
        handles, labels = ax.get_legend_handles_labels()
        n_labels = len(color_labels)
        # Keep only the entries added just above.
        ax.legend(handles[-n_labels:], labels[-n_labels:], loc='upper right')
    plt.close(fig)
    return fig
# Colour each bar by the sign of its correlation; bar length shows the magnitude.
bar_colors = ['#0055A7' if corr < 0 else '#2C3E4F' for corr in recent_corr.values]
color_labels = {'#0055A7': 'Negative correlation', '#2C3E4F': 'Positive correlation'}
conditional_bar(
    recent_corr.abs(),
    bar_colors=bar_colors,
    color_labels=color_labels,
    xlabel='|Correlation|',
    title='Magnitude of correlation with GDP per capita, 2013-2017',
)
可视化函数汇总
import pandas as pd
import folium
from matplotlib import pyplot as plt
import numpy as np
def time_slice(df, time_period):
    """Return a country-by-variable table of values for one time period.

    Args:
        df: Long-format frame with ``time_period``, ``country``,
            ``variable`` and ``value`` columns.
        time_period: Label compared against ``df.time_period``.

    Returns:
        DataFrame indexed by country with one column per variable; the
        columns axis is named after ``time_period``.

    Raises:
        ValueError: From ``DataFrame.pivot`` when a (country, variable)
            pair occurs more than once within the period.
    """
    # Only take data for the time period of interest.
    in_period = df[df.time_period == time_period]
    # Pivot the long rows into a wide country x variable table.
    wide = in_period.pivot(index='country', columns='variable', values='value')
    # Name the columns axis after the period so the slice is self-describing.
    wide.columns.name = time_period
    return wide
def country_slice(df, country):
    """Return a variable-by-time-period table of values for one country.

    Args:
        df: Long-format frame with ``country``, ``variable``,
            ``time_period`` and ``value`` columns.
        country: Name compared against ``df.country``.

    Returns:
        DataFrame indexed by variable with one column per time period; the
        index is named after ``country``.

    Raises:
        ValueError: From ``DataFrame.pivot`` when a (variable, time_period)
            pair occurs more than once for the country.
    """
    # Only take data for the country of interest.
    for_country = df[df.country == country]
    wide = for_country.pivot(index='variable', columns='time_period', values='value')
    # Name the index after the country so the table is self-describing.
    wide.index.name = country
    return wide
def time_series(df, country, variable):
    """Return the yearly values of one variable for one country.

    Args:
        df: Long-format frame with ``country``, ``variable``,
            ``year_measured`` and ``value`` columns.
        country: Name compared against ``df.country``.
        variable: Variable code compared against ``df.variable``.

    Returns:
        Single-column DataFrame named ``variable``, indexed by the integer
        ``year_measured``. Rows lacking a year or a value are dropped.
    """
    # Only take data for the country/variable combination.
    selected = df[(df.country == country) & (df.variable == variable)]
    # Drop years with no data. Restricting to the two columns we use keeps a
    # valid measurement even when an unrelated column is missing.
    measured = selected.dropna(subset=['year_measured', 'value'])
    # Explicit copy so the dtype change below never writes through to `df`.
    series = measured[['year_measured', 'value']].copy()
    series['year_measured'] = series['year_measured'].astype(int)
    series = series.set_index('year_measured')
    series.columns = [variable]
    return series
# Maps the raw region labels found in the data to a coarser continent-level
# name. Central America, the Antilles/Bahamas and Mexico are grouped under
# 'North America'.
simple_regions = {
'World | Asia': 'Asia',
'Americas | Central America and Caribbean | Central America': 'North America',
'Americas | Central America and Caribbean | Greater Antilles': 'North America',
'Americas | Central America and Caribbean | Lesser Antilles and Bahamas': 'North America',
'Americas | Northern America | Northern America': 'North America',
'Americas | Northern America | Mexico': 'North America',
'Americas | Southern America | Guyana': 'South America',
'Americas | Southern America | Andean': 'South America',
'Americas | Southern America | Brazil': 'South America',
'Americas | Southern America | Southern America': 'South America',
'World | Africa': 'Africa',
'World | Europe': 'Europe',
'World | Oceania': 'Oceania'
}
def subregion(data, region):
    """Return the rows of ``data`` whose ``region`` column equals ``region``."""
    in_region = data.region == region
    return data[in_region]
def variable_slice(df, variable):
    """Return a country-by-time-period table of values for one variable.

    Args:
        df: Long-format frame with ``variable``, ``country``,
            ``time_period`` and ``value`` columns.
        variable: Variable code compared against ``df.variable``.

    Returns:
        DataFrame indexed by country with one column per time period.

    Raises:
        ValueError: From ``DataFrame.pivot`` when a (country, time_period)
            pair occurs more than once for the variable.
    """
    for_variable = df[df.variable == variable]
    return for_variable.pivot(index='country', columns='time_period', values='value')
def plot_map(df, variable, time_period=None, log=False,
             legend_name=None, threshold_scale=None,
             geo=r'../../data/aquastat/world.json'):
    """Build a folium choropleth of ``variable`` by country.

    Args:
        df: Long-format data when ``time_period`` is given (it is pivoted
            with ``time_slice``); otherwise a frame whose index is reset
            and that must then contain ``country`` and ``variable`` columns.
        variable: Column to colour the map by.
        time_period: Optional period to slice before plotting.
        log: If true, plot the natural log of the variable.
        legend_name: Legend caption; defaults to ``variable``.
        threshold_scale: Passed through to the choropleth call.
        geo: Path to the GeoJSON file with country shapes.

    Returns:
        The ``folium.Map`` with the choropleth layer added.
    """
    if time_period:
        df = time_slice(df, time_period).reset_index()
    else:
        df = df.reset_index()
    if log:
        # `df` is a fresh frame from reset_index(), so the caller's data is untouched.
        df[variable] = df[variable].apply(np.log)
    # Named `world_map` so the builtin `map` is not shadowed.
    world_map = folium.Map(location=[34, -45], zoom_start=2,
                           width=1200, height=600)
    # NOTE(review): `Map.choropleth` with `geo_path`/`reset` looks like the
    # legacy folium API; newer releases expose `folium.Choropleth(geo_data=...)`.
    # Confirm against the installed folium version before changing.
    world_map.choropleth(geo_path=geo,
                         data=df,
                         columns=['country', variable],
                         key_on='feature.properties.name', reset=True,
                         fill_color='PuBuGn', fill_opacity=0.7, line_opacity=0.2,
                         legend_name=legend_name if legend_name else variable,
                         threshold_scale=threshold_scale)
    return world_map
def map_over_time(df, variable, time_periods, log=False,
                  threshold_scale=None, legend_name=None,
                  geo=r'../../data/aquastat/world.json'):
    """Show ``plot_map`` behind a slider that selects the time period.

    Args:
        df: Long-format data handed to ``plot_map``.
        variable: Column to colour the map by.
        time_periods: Non-empty sequence of period labels (numpy array or
            any indexable sequence); the first one is selected initially.
        log, threshold_scale, legend_name, geo: Forwarded unchanged to
            ``plot_map``.
    """
    # NOTE(review): `widgets` is not imported in the visible part of this
    # file — presumably ipywidgets; confirm.
    # Accept plain lists/tuples as well as numpy arrays.
    if hasattr(time_periods, 'tolist'):
        options = time_periods.tolist()
    else:
        options = list(time_periods)
    time_slider = widgets.SelectionSlider(options=options,
                                          value=time_periods[0],
                                          description='Time period:',
                                          disabled=False,
                                          button_style='')
    # Every argument except the slider is held constant across redraws.
    widgets.interact(plot_map, df=widgets.fixed(df),
                     variable=widgets.fixed(variable),
                     time_period=time_slider, log=widgets.fixed(log),
                     legend_name=widgets.fixed(legend_name),
                     threshold_scale=widgets.fixed(threshold_scale),
                     geo=widgets.fixed(geo))
def plot_hist(df, variable, bins=None, xlabel=None, by=None,
              ylabel=None, title=None, logx=False, ax=None):
    """Plot a histogram of ``df[variable]``, optionally grouped and/or log-scaled.

    Args:
        df: DataFrame holding the data. It is not modified.
        variable: Name of the column to plot.
        bins: Number of bins (or explicit edges); defaults to 20.
        xlabel, ylabel, title: Optional axis labels and title.
        by: Optional column name; one histogram is drawn per distinct value.
        logx: If true, use log-spaced bins and a log-scaled x-axis. Data
            containing values <= 0 is shifted so its minimum becomes 1, and
            a warning reporting the shift is printed.
        ax: Axes to draw on; a new 12x8 figure is created when omitted.

    Returns:
        The Axes that was drawn on.
    """
    if not bins:
        bins = 20
    if ax is None:
        fig, ax = plt.subplots(figsize=(12, 8))
    # Select non-missing rows into a new Series; the caller's frame is never written to.
    present = df[variable].notna()
    values = df.loc[present, variable]
    if logx:
        if not values.empty:
            lowest = values.min()
            if lowest <= 0:
                # Compute the shift before applying it so the warning reports
                # the amount actually added.
                shift = 1 - lowest
                values = values + shift
                print('Warning: data <=0 exists, data transformed by %0.2g before plotting' % shift)
            if isinstance(bins, (int, np.integer)):
                # N bins need N + 1 edges.
                bins = np.logspace(np.log10(values.min()),
                                   np.log10(values.max()), bins + 1)
        ax.set_xscale("log")
    if by:
        groups = df[by].unique()
        # Categorical columns are iterated in their declared category order.
        # `pd.Categorical` is the public name; `pd.core.categorical` is gone
        # from current pandas.
        if isinstance(groups, pd.Categorical):
            cats = groups.categories.tolist()
        else:
            cats = groups.tolist()
        # Group keys for the same rows as `values`, compared positionally.
        keys = df.loc[present, by]
        for cat in cats:
            ax.hist(values[(keys == cat).values], bins=bins)
    else:
        ax.hist(values.values, bins=bins)
    if xlabel:
        ax.set_xlabel(xlabel)
    if ylabel:
        ax.set_ylabel(ylabel)
    if title:
        ax.set_title(title)
    return ax
def conditional_bar(series, bar_colors=None, color_labels=None, figsize=(13,24),
                    xlabel=None, by=None, ylabel=None, title=None):
    """Draw a horizontal bar chart of ``series`` with per-bar colours.

    Args:
        series: pandas Series; values give bar lengths, index gives labels.
        bar_colors: A colour or sequence of colours for the bars. When
            omitted or empty, the first colour of the active cycle is used.
        color_labels: Optional mapping of colour -> legend label.
        figsize: Figure size passed to ``plt.subplots``.
        xlabel, ylabel, title: Optional axis labels and title.
        by: Accepted for signature compatibility; unused.

    Returns:
        The Figure. It is closed before returning (presumably so a notebook
        does not render it twice — confirm).
    """
    fig, ax = plt.subplots(figsize=figsize)
    # Explicit None/empty check: truthiness of an array or Series raises.
    if bar_colors is None or len(bar_colors) == 0:
        # `mpl` is not imported by this module; `plt.rcParams` is the same
        # settings object and needs no extra import.
        bar_colors = plt.rcParams['axes.prop_cycle'].by_key()['color'][0]
    positions = list(range(len(series)))
    ax.barh(positions, series.values, color=bar_colors)
    ax.set_xlabel(xlabel if xlabel else '')
    ax.set_ylabel(ylabel if ylabel else '')
    ax.set_yticks(positions)
    ax.set_yticklabels(series.index.tolist())
    ax.set_title(title if title else '')
    ax.set_ylim([-1, len(series)])
    if color_labels:
        # Empty marker-only plots exist solely to create legend entries.
        for col, lab in color_labels.items():
            ax.plot([], linestyle='', marker='s', c=col, label=lab)
        handles, labels = ax.get_legend_handles_labels()
        n_labels = len(color_labels)
        # Keep only the entries added just above.
        ax.legend(handles[-n_labels:], labels[-n_labels:], loc='upper right')
    plt.close(fig)
    return fig
def plot_scatter(df, x, y, xlabel=None, ylabel=None, title=None,
                 logx=False, logy=False, by=None, ax=None):
    """Scatter ``df[y]`` against ``df[x]``, optionally coloured by group.

    Args:
        df: DataFrame holding the data.
        x, y: Column names for the horizontal and vertical axes.
        xlabel, ylabel: Axis labels; default to the column names.
        title: Optional title.
        logx, logy: Use a log scale on the respective axis.
        by: Optional column (or groupby key); each group gets its own
            colour and a legend entry.
        ax: Axes to draw on; a new 12x10 figure is created when omitted.
    """
    if ax is None:
        fig, ax = plt.subplots(figsize=(12, 10))
    # `mpl` is not imported by this module; `plt.rcParams` is the same
    # settings object and needs no extra import.
    colors = plt.rcParams['axes.prop_cycle'].by_key()['color']
    if by:
        for j, (name, group) in enumerate(df.groupby(by)):
            # Wrap around the cycle so more groups than colours cannot raise IndexError.
            ax.scatter(group[x], group[y], color=colors[j % len(colors)], label=name)
        ax.legend()
    else:
        ax.scatter(df[x], df[y], color=colors[0])
    if logx:
        ax.set_xscale('log')
    if logy:
        ax.set_yscale('log')
    ax.set_xlabel(xlabel if xlabel else x)
    ax.set_ylabel(ylabel if ylabel else y)
    if title:
        ax.set_title(title)
def two_hist(df, variable, bins=50,
             ylabel='Number of countries', title=None):
    """Draw linear and log-x histograms of ``variable`` side by side.

    Args:
        df: DataFrame holding the data.
        variable: Column to plot.
        bins: Bin count forwarded to ``plot_hist`` for both panels.
        ylabel: Shared y-axis label.
        title: Title for both panels; when omitted the panels are titled
            ``variable`` and ``'Log of ' + variable``.

    Returns:
        The Figure holding both panels. It is closed before returning
        (presumably so a notebook does not render it twice — confirm).
    """
    fig, (linear_ax, log_ax) = plt.subplots(1, 2, figsize=(18, 8))
    plot_hist(df, variable, bins=bins,
              xlabel=variable, ylabel=ylabel,
              ax=linear_ax, title=title if title else variable)
    plot_hist(df, variable, bins=bins,
              xlabel='Log of ' + variable, ylabel=ylabel,
              logx=True, ax=log_ax,
              title=title if title else 'Log of ' + variable)
    # Close this figure specifically instead of whichever one is current.
    plt.close(fig)
    return fig
def hist_over_var(df, variables, bins=50, first_choice=None,
                  ylabel='Number of countries', title=None):
    """Show ``two_hist`` behind a dropdown that selects the variable.

    Args:
        df: DataFrame handed to ``two_hist``.
        variables: Non-empty sequence of column names (numpy array, pandas
            Index or any indexable sequence).
        bins, ylabel, title: Forwarded unchanged to ``two_hist``.
        first_choice: Initially selected variable; defaults to the first
            entry of ``variables``.
    """
    # NOTE(review): `widgets` is not imported in the visible part of this
    # file — presumably ipywidgets; confirm.
    if not first_choice:
        first_choice = variables[0]
    # Accept plain lists/tuples as well as numpy arrays and pandas indexes.
    if hasattr(variables, 'tolist'):
        options = variables.tolist()
    else:
        options = list(variables)
    variable_slider = widgets.Dropdown(options=options,
                                       value=first_choice,
                                       description='Variable:',
                                       disabled=False,
                                       button_style='')
    # Everything except the dropdown is held constant across redraws.
    widgets.interact(two_hist, df=widgets.fixed(df),
                     variable=variable_slider, ylabel=widgets.fixed(ylabel),
                     title=widgets.fixed(title), bins=widgets.fixed(bins))