探索性数据分析-粮农组织数据集

粮农组织的三个主要目标是:

  1. 消除饥饿、粮食不安全和营养不良
  2. 消除贫困促进经济社会进步
  3. 自然资源的可持续管理和利用,包括土地、水、空气、气候和遗传资源,以造福今世后代。

为支持这些目标,《宪法》第1条要求粮农组织“收集、分析、解释和传播与营养、粮食和农业有关的信息”。因此,水温自动调节器开始,其目的是通过收集有助于联合国粮农组织的目标,与水资源相关的信息传播分析,用水和农业用水管理,对国家重点在非洲,亚洲,美国,拉丁美洲,加勒比海。

联合国粮农组织提供数据,元数据,报告国家概况,河流域概况,分析区域,图,表空间,数据,指导方针,和其他的在线工具:

  • 水资源:内部、跨界、总
  • 水的用途:按部门,按来源,废水
  • 灌溉:地点、面积、类型、技术、作物
  • 水坝:位置,高度,容量,表面积
  • 与水有关的机构、政策和立法

Load the data

data = pd.read_csv('aquastat.csv.gzip', compression='gzip')
data.head()

在这里插入图片描述

data.shape
(143280, 7)
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 143280 entries, 0 to 143279
Data columns (total 7 columns):
country          143280 non-null object
region           143280 non-null object
variable         143280 non-null object
variable_full    143280 non-null object
time_period      143280 non-null object
year_measured    96411 non-null float64
value            96411 non-null float64
dtypes: float64(2), object(5)
memory usage: 7.7+ MB
data[['variable','variable_full']].drop_duplicates()#属性 单位
#total_area 国土面积(1000公顷)
#arable_land 可耕作面积
#permanent_crop_area 多年生作物面积
#cultivated_area 耕地面积
#percent_cultivated 耕地面积占比
#total_pop 总人口
#rural_pop 农村人口
#urban_pop 城市人口
#gdp 国内生产总值
#gdp_per_capita 人均国内生产总值
#agg_to_gdp 农业,增加国内生产总值
#human_dev_index 人类发展指数
#gender_inequal_index 性别不平等指数
#percent_undernourished 营养不良患病率
#avg_annual_rain_depth 长期平均年降水量
#national_rainfall_index 全国降雨指数 

在这里插入图片描述

  • 某个时间段国家与variable内属性的值
def time_slice(df, time_period):

    df = df[df.time_period==time_period] 

    # Pivot table 
    df = df.pivot(index='country', columns='variable', values='value')
    
    df.columns.name = time_period
    
    return df
    
time_slice(data, time_periods[0]).head()

在这里插入图片描述

  • 某个国家variable属性的值随时间变化
def country_slice(df, country):
    
    # Only take data for country of interest
    df = df[df.country==country] 

    # Pivot table 
    df = df.pivot(index='variable', columns='time_period', values='value')
    
    df.index.name = country
    return df

在这里插入图片描述

  • variable中的某一属性的值各个国家随时间变化
def variable_slice(df, variable):
    
    # Only data for that variable
    df = df[df.variable==variable]
    
    # Get variable for each country over the time periods 
    df = df.pivot(index='country', columns='time_period', values='value')
    return df

在这里插入图片描述

  • 某个国家 variable某一值随时间变化
def time_series(df, country, variable):
    
    # Only take data for country/variable combo 
    series = df[(df.country==country) & (df.variable==variable)]
    
    # Drop years with no data 
    series = series.dropna()[['year_measured', 'value']]
    
    # Change years to int and set as index 
    series.year_measured = series.year_measured.astype(int)
    series.set_index('year_measured', inplace=True)
    series.columns = [variable]
    return series

在这里插入图片描述

  • 提取单个区域的函数
def subregion(data, region):
    return data[data.region==region]

缺失值

By variable

recent = time_slice(data, '2013-2017')
msno.matrix(recent, labels=True)

在这里插入图片描述

水资源总量

#Total exploitable water resources 水资源总量
msno.matrix(variable_slice(data, 'exploitable_total'), inline=False, sort='descending');
plt.xlabel('Time period');
plt.ylabel('Country');
plt.title('Missing total exploitable water resources data across countries and time periods \n \n \n \n');

在这里插入图片描述
只有一小部分国家报告了可利用的水资源总量,这些国家中只有极少数国家拥有最近一段时间的数据。

我们将删除该变量,因为这么少的数据点会导致很多问题。
In [10]:

data = data.loc[~data.variable.str.contains('exploitable'),:]# ~删除

national_rainfall_index 全国降水指数(NRI)(毫米/年)


national_rainfall_index 全国降水指数(NRI)(毫米/)
msno.matrix(variable_slice(data, 'national_rainfall_index'), 
            inline=False, sort='descending');
plt.xlabel('Time period');
plt.ylabel('Country');
plt.title('Missing national rainfall index data across countries and time periods \n \n \n \n');

在这里插入图片描述
全国降雨指数在2002以后不再报告。

data = data.loc[~(data.variable=='national_rainfall_index')]

南美国家某个

north_america = subregion(data, 'North America')

#指数完整性
msno.matrix(msno.nullity_sort(time_slice(north_america, '2013-2017'), sort='descending').T, inline=False)
#plt.title('Fraction of fields complete by country for North America \n \n');

在这里插入图片描述

查看空值较多的Bahamas的属性随时间变化

msno.nullity_filter(country_slice(data, 'Bahamas').T, filter='bottom', p=0.1)

在这里插入图片描述

随时间变化各属性的值的热力图

fig, ax = plt.subplots(figsize=(16, 16));
sns.heatmap(data.groupby(['time_period','variable']).value.count().unstack().T , ax=ax);
plt.xticks(rotation=45);
plt.xlabel('Time period');
plt.ylabel('Variable');
plt.title('Number of countries with data reported for each variable over time');

在这里插入图片描述

农粮数据成图

总人口直方图

fig, ax = plt.subplots(figsize=(12, 8))
ax.hist(recent.total_pop.values, bins=50);
ax.set_xlabel('Total population');
ax.set_ylabel('Number of countries');
ax.set_title('Distribution of population of countries 2013-2017');

在这里插入图片描述

def plot_hist(df, variable, bins=20, xlabel=None, by=None,
              ylabel=None, title=None, logx=False, ax=None):

    if not ax:
        fig, ax = plt.subplots(figsize=(12,8))
    if logx:
        if df[variable].min() <=0:
            df[variable] = df[variable] - df[variable].min() + 1
            print('Warning: data <=0 exists, data transformed by %0.2g before plotting' % (- df[variable].min() + 1))
        
        bins = np.logspace(np.log10(df[variable].min()),
                           np.log10(df[variable].max()), bins)
        ax.set_xscale("log")

    ax.hist(df[variable].dropna().values, bins=bins);
    
    if xlabel:
        ax.set_xlabel(xlabel);
    if ylabel:
        ax.set_ylabel(ylabel);
    if title:
        ax.set_title(title);
    
    return ax
  • 使人口符合正太分布
plot_hist(recent, 'total_pop', bins=25, logx=True, 
          xlabel='Log of total population', ylabel='Number of countries',
          title='Distribution of total population of countries 2013-2017');

在这里插入图片描述

  • 总人口折线图
with sns.color_palette(sns.diverging_palette(220, 280, s=85, l=25, n=23)):
    north_america = time_slice(subregion(data, 'North America'), '1958-1962').sort_values('total_pop').index.tolist()
    for country in north_america:
        plt.plot(time_series(data, country, 'total_pop'), label=country);
        plt.xlabel('Year');
        plt.ylabel('Population');
        plt.title('North American populations over time');
    plt.legend(loc=2,prop={'size':10});

在这里插入图片描述

这除了北美洲是最大的国家之外,什么也没有告诉我们。我们想了解每个国家的人口是如何随着时间的推移而变化的,主要是参照自身的变化。
我们应该通过什么标准化?我们可以选择一个国家的最小、平均、中位数、最大值…或任何其他位置
让我们选择最小值,这样我们就能看到每个国家在起始人口上的增长。

增长倍率折线图

with sns.color_palette(sns.diverging_palette(220, 280, s=85, l=25, n=23)):
    for country in north_america:
        ts = time_series(data, country, 'total_pop')
        ts['norm_pop'] = ts.total_pop/ts.total_pop.min()*100
        plt.plot(ts['norm_pop'], label=country);
        plt.xlabel('Year');
        plt.ylabel('Percent increase in population');
        plt.title('Percent increase in population from 1960 in North American countries');
    plt.legend(loc=2,prop={'size':10});

在这里插入图片描述

north_america_pop = variable_slice(subregion(data, 'North America'), 'total_pop')
north_america_norm_pop = north_america_pop.div(north_america_pop.min(axis=1), axis=0)*100
north_america_norm_pop = north_america_norm_pop.loc[north_america]

总人口热力图

fig, ax = plt.subplots(figsize=(16, 12));
sns.heatmap(north_america_norm_pop, ax=ax, cmap=sns.light_palette((214, 90, 60), input="husl", as_cmap=True));
plt.xticks(rotation=45);
plt.xlabel('Time period');
plt.ylabel('Country, ordered by population in 1960 (<- greatest to least ->)');
plt.title('Percent increase in population from 1960');

在这里插入图片描述

相关性

data = pd.read_csv('../../data/aquastat/aquastat.csv.gzip', compression='gzip')

# simplify regions
data.region = data.region.apply(lambda x: simple_regions[x])

# remove exploitable fields and national rainfall index
data = data.loc[~data.variable.str.contains('exploitable'),:]
data = data.loc[~(data.variable=='national_rainfall_index')]

# Uncomment to print out variable names and explanations
# data[['variable','variable_full']].drop_duplicates()

# Subset for cross-sectional analysis
recent = time_slice(data, '2013-2017')
recent_corr = recent.corr().loc['gdp_per_capita'].drop(['gdp','gdp_per_capita'])

GDP随季节变化散点图

#seasonal_variability 季节变化(WRI)
# recent.drop('gdp_bin', axis=1).astype(float).plot(x='seasonal_variability',y='gdp_per_capita', kind='scatter');
plt.scatter(recent.seasonal_variability, recent.gdp_per_capita)
plt.xlabel('Seasonal variability');
plt.ylabel('GDP per capita ($USD/person)');

在这里插入图片描述

GDP相关性直方图

def conditional_bar(series, bar_colors=None, color_labels=None, figsize=(13,24),
                   xlabel=None, by=None, ylabel=None, title=None):
    fig, ax  = plt.subplots(figsize=figsize)
    if not bar_colors:
        bar_colors = mpl.rcParams['axes.prop_cycle'].by_key()['color'][0]
    plt.barh(range(len(series)),series.values, color=bar_colors)
    plt.xlabel('' if not xlabel else xlabel);
    plt.ylabel('' if not ylabel else ylabel)
    plt.yticks(range(len(series)), series.index.tolist())
    plt.title('' if not title else title);
    plt.ylim([-1,len(series)]);
    if color_labels:
        for col, lab in color_labels.items():
            plt.plot([], linestyle='',marker='s',c=col, label= lab);
        lines, labels = ax.get_legend_handles_labels();
        ax.legend(lines[-len(color_labels.keys()):], labels[-len(color_labels.keys()):], loc='upper right');
    plt.close()
    return fig
bar_colors = ['#0055A7' if x else '#2C3E4F' for x in list(recent_corr.values < 0)]
color_labels = {'#0055A7':'Negative correlation', '#2C3E4F':'Positive correlation'}

conditional_bar(recent_corr.apply(np.abs), bar_colors, color_labels,
               title='Magnitude of correlation with GDP per capita, 2013-2017',
               xlabel='|Correlation|')

在这里插入图片描述

可视化函数汇总

import pandas as pd
import folium
from matplotlib import pyplot as plt
import numpy as np

def time_slice(df, time_period):
    # Only take data for time period of interest
    df = df[df.time_period == time_period]

    # Pivot table
    df = df.pivot(index='country', columns='variable', values='value')

    df.columns.name = time_period

    return df

def country_slice(df, country):
    # Only take data for country of interest
    df = df[df.country == country]

    # Pivot table
    df = df.pivot(index='variable', columns='time_period', values='value')

    df.index.name = country
    return df

def time_series(df, country, variable):
    # Only take data for country/variable combo
    series = df[(df.country == country) & (df.variable == variable)]

    # Drop years with no data
    series = series.dropna()[['year_measured', 'value']]

    # Change years to int and set as index
    series.year_measured = series.year_measured.astype(int)
    series.set_index('year_measured', inplace=True)
    series.columns = [variable]
    return series

simple_regions = {
    'World | Asia': 'Asia',
    'Americas | Central America and Caribbean | Central America': 'North America',
    'Americas | Central America and Caribbean | Greater Antilles': 'North America',
    'Americas | Central America and Caribbean | Lesser Antilles and Bahamas': 'North America',
    'Americas | Northern America | Northern America': 'North America',
    'Americas | Northern America | Mexico': 'North America',
    'Americas | Southern America | Guyana': 'South America',
    'Americas | Southern America | Andean': 'South America',
    'Americas | Southern America | Brazil': 'South America',
    'Americas | Southern America | Southern America': 'South America',
    'World | Africa': 'Africa',
    'World | Europe': 'Europe',
    'World | Oceania': 'Oceania'
}

def subregion(data, region):
    return data[data.region == region]

def variable_slice(df, variable):
    df = df[df.variable==variable]
    df = df.pivot(index='country', columns='time_period', values='value')
    return df


def plot_map(df, variable, time_period=None, log=False,
             legend_name=None, threshold_scale=None,
             geo=r'../../data/aquastat/world.json'):

    if time_period:
        df = time_slice(df, time_period).reset_index()
    else:
        df = df.reset_index()

    if log:
        df[variable] = df[variable].apply(np.log)

    map = folium.Map(location=[34, -45], zoom_start=2,
                     width=1200, height=600)
    map.choropleth(geo_path=geo,
                   data=df,
                   columns=['country', variable],
                   key_on='feature.properties.name', reset=True,
                   fill_color='PuBuGn', fill_opacity=0.7, line_opacity=0.2,
                   legend_name=legend_name if legend_name else variable,
                   threshold_scale=threshold_scale)
    return map


def map_over_time(df, variable, time_periods, log=False,
                  threshold_scale=None, legend_name=None,
                  geo=r'../../data/aquastat/world.json'):

    time_slider = widgets.SelectionSlider(options=time_periods.tolist(),
                                          value=time_periods[0],
                                          description='Time period:',
                                          disabled=False,
                                          button_style='')
    widgets.interact(plot_map, df=widgets.fixed(df),
                     variable=widgets.fixed(variable),
                     time_period=time_slider, log=widgets.fixed(log),
                     legend_name=widgets.fixed(legend_name),
                     threshold_scale=widgets.fixed(threshold_scale),
                     geo=widgets.fixed(geo));


def plot_hist(df, variable, bins=None, xlabel=None, by=None,
              ylabel=None, title=None, logx=False, ax=None):
    if not bins:
        bins = 20

    if not ax:
        fig, ax = plt.subplots(figsize=(12, 8))
    if logx:
        if df[variable].min() <=0:
            df[variable] = df[variable] - df[variable].min() + 1
            print('Warning: data <=0 exists, data transformed by %0.2g before plotting' % (- df[variable].min() + 1))
        bins = np.logspace(np.log10(df[variable].min()),
                           np.log10(df[variable].max()), bins)
        ax.set_xscale("log")

    if by:
        if type(df[by].unique()) == pd.core.categorical.Categorical:
            cats = df[by].unique().categories.tolist()
        else:
            cats = df[by].unique().tolist()

        for cat in cats:
            to_plot = df[df[by] == cat][variable].dropna()
            ax.hist(to_plot, bins=bins);
    else:
        ax.hist(df[variable].dropna().values, bins=bins);

    if xlabel:
        ax.set_xlabel(xlabel);
    if ylabel:
        ax.set_ylabel(ylabel);
    if title:
        ax.set_title(title);

    return ax

def conditional_bar(series, bar_colors=None, color_labels=None, figsize=(13,24),
                   xlabel=None, by=None, ylabel=None, title=None):
    fig, ax  = plt.subplots(figsize=figsize)
    if not bar_colors:
        bar_colors = mpl.rcParams['axes.prop_cycle'].by_key()['color'][0]
    plt.barh(range(len(series)),series.values, color=bar_colors)
    plt.xlabel('' if not xlabel else xlabel);
    plt.ylabel('' if not ylabel else ylabel)
    plt.yticks(range(len(series)), series.index.tolist())
    plt.title('' if not title else title);
    plt.ylim([-1,len(series)]);
    if color_labels:
        for col, lab in color_labels.items():
            plt.plot([], linestyle='',marker='s',c=col, label= lab);
        lines, labels = ax.get_legend_handles_labels();
        ax.legend(lines[-len(color_labels.keys()):], labels[-len(color_labels.keys()):], loc='upper right');
    plt.close()
    return fig


def plot_scatter(df, x, y, xlabel=None, ylabel=None, title=None,
                 logx=False, logy=False, by=None, ax=None):
    if not ax:
        fig, ax = plt.subplots(figsize=(12, 10))

    colors = mpl.rcParams['axes.prop_cycle'].by_key()['color']
    if by:
        groups = df.groupby(by)
        for j, (name, group) in enumerate(groups):
            ax.scatter(group[x], group[y], color=colors[j], label=name)
        ax.legend()
    else:
        ax.scatter(df[x], df[y], color=colors[0])
    if logx:
        ax.set_xscale('log')
    if logy:
        ax.set_yscale('log')

    ax.set_xlabel(xlabel if xlabel else x);
    ax.set_ylabel(ylabel if ylabel else y);
    if title:
        ax.set_title(title);

def two_hist(df, variable, bins=50,
              ylabel='Number of countries', title=None):

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18,8))
    ax1 = plot_hist(df, variable, bins=bins,
                    xlabel=variable, ylabel=ylabel,
                    ax=ax1, title=variable if not title else title)
    ax2 = plot_hist(df, variable, bins=bins,
                    xlabel='Log of '+ variable, ylabel=ylabel,
                    logx=True, ax=ax2,
                    title='Log of '+ variable if not title else title)
    plt.close()
    return fig

def hist_over_var(df, variables, bins=50, first_choice=None,
                  ylabel='Number of countries', title=None):
    if not first_choice:
        first_choice = variables[0]
    variable_slider = widgets.Dropdown(options=variables.tolist(),
                                       value=first_choice,
                                       description='Variable:',
                                       disabled=False,
                                       button_style='')
    widgets.interact(two_hist, df=widgets.fixed(df),
                     variable=variable_slider, ylabel=widgets.fixed(ylabel),
                     title=widgets.fixed(title), bins=widgets.fixed(bins));
  • 0
    点赞
  • 6
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值