探索性数据分析-粮农组织数据集

最新推荐文章于 2024-05-04 17:03:57 发布

curd_boy

最新推荐文章于 2024-05-04 17:03:57 发布

阅读量1.6k

点赞数

分类专栏：机器学习

本文链接：https://blog.csdn.net/weixin_43746433/article/details/99563339

版权

机器学习专栏收录该内容

23 篇文章 10 订阅

订阅专栏

粮农组织的三个主要目标是:

消除饥饿、粮食不安全和营养不良
消除贫困促进经济社会进步
自然资源的可持续管理和利用，包括土地、水、空气、气候和遗传资源，以造福今世后代。

为支持这些目标，《宪法》第1条要求粮农组织“收集、分析、解释和传播与营养、粮食和农业有关的信息”。因此，粮农组织启动了 AQUASTAT（原文被机器翻译误译为“水温自动调节器”），其目的是通过收集、分析和传播与水资源、用水和农业用水管理有关的信息，为实现联合国粮农组织的目标作出贡献，重点关注非洲、亚洲、拉丁美洲和加勒比地区的国家。

联合国粮农组织提供以下方面的数据、元数据、报告、国家概况、流域概况、区域分析、地图、表格、空间数据、指南以及其他在线工具:

水资源：内部、跨界、总
水的用途：按部门，按来源，废水
灌溉：地点、面积、类型、技术、作物
水坝：位置，高度，容量，表面积
与水有关的机构、政策和立法

Load the data

# Read the gzip-compressed CSV into a DataFrame and preview its first rows.
data = pd.read_csv('aquastat.csv.gzip', compression='gzip')
data.head()

在这里插入图片描述

# (number of rows, number of columns) of the loaded table.
data.shape

(143280, 7)

# Per-column dtype and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 143280 entries, 0 to 143279
Data columns (total 7 columns):
country          143280 non-null object
region           143280 non-null object
variable         143280 non-null object
variable_full    143280 non-null object
time_period      143280 non-null object
year_measured    96411 non-null float64
value            96411 non-null float64
dtypes: float64(2), object(5)
memory usage: 7.7+ MB

# List each distinct (short name, full description) pair of variables.
data[['variable','variable_full']].drop_duplicates()  # variable name and its description/unit
# total_area: total area of the country (1000 ha)
# arable_land: arable land area
# permanent_crop_area: permanent crops area
# cultivated_area: cultivated area
# percent_cultivated: share of the total area that is cultivated
# total_pop: total population
# rural_pop: rural population
# urban_pop: urban population
# gdp: gross domestic product
# gdp_per_capita: GDP per capita
# agg_to_gdp: agriculture, value added to GDP
# human_dev_index: Human Development Index
# gender_inequal_index: Gender Inequality Index
# percent_undernourished: prevalence of undernourishment
# avg_annual_rain_depth: long-term average annual precipitation
# national_rainfall_index: National Rainfall Index

在这里插入图片描述

某个时间段国家与variable内属性的值

def time_slice(df, time_period):
    """Return a country-by-variable table of values for one time period.

    Reads the ``time_period``, ``country``, ``variable`` and ``value``
    columns of ``df``. The column axis of the result is named after
    ``time_period``.
    """
    # Keep only the rows recorded for the requested period.
    period_rows = df.loc[df['time_period'] == time_period]

    # Long -> wide: one row per country, one column per variable.
    wide = period_rows.pivot(index='country', columns='variable', values='value')

    # Label the column axis with the period the table describes.
    wide.columns.name = time_period
    return wide
    
# `time_periods` is not defined by any earlier snippet, so build it here:
# the distinct period labels, in order of first appearance in the data.
time_periods = data.time_period.unique()
# Preview the country-by-variable table for the first period.
time_slice(data, time_periods[0]).head()

在这里插入图片描述

某个国家variable属性的值随时间变化

def country_slice(df, country):
    """Return a variable-by-time-period table of values for one country.

    Reads the ``country``, ``variable``, ``time_period`` and ``value``
    columns of ``df``. The index of the result is named after ``country``.
    """
    # Restrict to the rows of the requested country.
    country_rows = df.loc[df['country'] == country]

    # Long -> wide: one row per variable, one column per time period.
    wide = country_rows.pivot(index='variable', columns='time_period', values='value')

    # Name the row axis after the country the table describes.
    wide.index.name = country
    return wide

在这里插入图片描述

variable中的某一属性的值各个国家随时间变化

def variable_slice(df, variable):
    """Return a country-by-time-period table of values for one variable.

    Reads the ``variable``, ``country``, ``time_period`` and ``value``
    columns of ``df``.
    """
    # Restrict to the rows of the requested variable.
    variable_rows = df.loc[df['variable'] == variable]

    # Long -> wide: one row per country, one column per time period.
    return variable_rows.pivot(index='country', columns='time_period', values='value')

在这里插入图片描述

某个国家 variable某一值随时间变化

def time_series(df, country, variable):
    """Return one country's measurements of one variable, indexed by year.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format table; the ``country``, ``variable``, ``year_measured``
        and ``value`` columns are read. ``df`` itself is not modified.
    country
        Value matched against ``df.country``.
    variable
        Value matched against ``df.variable``; also the name of the single
        output column.

    Returns
    -------
    pandas.DataFrame
        One column named ``variable``, indexed by ``year_measured`` cast
        to int.
    """
    # Only take data for country/variable combo, and only the two columns
    # that are actually returned.
    wanted = (df.country == country) & (df.variable == variable)
    series = df.loc[wanted, ['year_measured', 'value']]

    # Drop years with no data. Selecting the columns *before* dropna means a
    # missing value in an unrelated column no longer discards a measurement.
    series = series.dropna()

    # Change years to int and set as index (no in-place edits of a slice).
    series = series.astype({'year_measured': int}).set_index('year_measured')
    series.columns = [variable]
    return series

在这里插入图片描述

提取单个区域的函数

def subregion(data, region):
    """Return the rows of ``data`` whose ``region`` column equals ``region``."""
    in_region = data['region'] == region
    return data.loc[in_region]

缺失值

By variable

# Country-by-variable table for the 2013-2017 period; reused by later cells.
recent = time_slice(data, '2013-2017')
# Visualise which cells are missing (msno is presumably the missingno
# package -- its import is not shown in this listing).
msno.matrix(recent, labels=True)

在这里插入图片描述

可开发利用的水资源总量（exploitable_total）

# Total exploitable water resources: missing-data matrix over
# countries (rows) and time periods (columns).
msno.matrix(variable_slice(data, 'exploitable_total'), inline=False, sort='descending');
plt.xlabel('Time period');
plt.ylabel('Country');
# The trailing newlines in the title add vertical space below it.
plt.title('Missing total exploitable water resources data across countries and time periods \n \n \n \n');

在这里插入图片描述
只有一小部分国家报告了可利用的水资源总量，这些国家中只有极少数国家拥有最近一段时间的数据。

我们将删除该变量，因为这么少的数据点会导致很多问题。
In [10]:

# Keep only rows whose variable name does NOT contain 'exploitable'.
data = data.loc[~data.variable.str.contains('exploitable'),:]  # ~ inverts the mask, i.e. drops the matches

national_rainfall_index 全国降水指数（NRI）（毫米/年)


national_rainfall_index 全国降水指数（NRI）（毫米/年)
# Missing-data matrix for the national rainfall index over
# countries (rows) and time periods (columns).
msno.matrix(variable_slice(data, 'national_rainfall_index'), 
            inline=False, sort='descending');
plt.xlabel('Time period');
plt.ylabel('Country');
# The trailing newlines in the title add vertical space below it.
plt.title('Missing national rainfall index data across countries and time periods \n \n \n \n');

在这里插入图片描述
全国降雨指数在2002以后不再报告。

# Drop every row of the national_rainfall_index variable.
data = data.loc[~(data.variable=='national_rainfall_index')]

北美洲各国的数据完整性

# Rows of the dataset whose region is 'North America'.
north_america = subregion(data, 'North America')

# Data completeness: the 2013-2017 table is nullity-sorted and then
# transposed, so countries become the columns of the matrix.
msno.matrix(msno.nullity_sort(time_slice(north_america, '2013-2017'), sort='descending').T, inline=False)
#plt.title('Fraction of fields complete by country for North America \n \n');

在这里插入图片描述

查看空值较多的Bahamas的属性随时间变化

# Transpose the Bahamas table (time periods as rows, variables as columns)
# and filter its columns by completeness.
# NOTE(review): filter='bottom', p=0.1 presumably keeps the least-complete
# columns at a 10% threshold -- confirm against the missingno docs.
msno.nullity_filter(country_slice(data, 'Bahamas').T, filter='bottom', p=0.1)

在这里插入图片描述

随时间变化各属性的值的热力图

# Heatmap of how many non-null values exist for each variable (rows)
# in each time period (columns).
fig, ax = plt.subplots(figsize=(16, 16));
# count() ignores NaN, so each cell is the number of reported values.
sns.heatmap(data.groupby(['time_period','variable']).value.count().unstack().T , ax=ax);
plt.xticks(rotation=45);
plt.xlabel('Time period');
plt.ylabel('Variable');
plt.title('Number of countries with data reported for each variable over time');

在这里插入图片描述

农粮数据成图

总人口直方图

# Histogram of total population across countries for 2013-2017
# (uses the `recent` table built earlier), 50 linear bins.
fig, ax = plt.subplots(figsize=(12, 8))
ax.hist(recent.total_pop.values, bins=50);
ax.set_xlabel('Total population');
ax.set_ylabel('Number of countries');
ax.set_title('Distribution of population of countries 2013-2017');

在这里插入图片描述

def plot_hist(df, variable, bins=20, xlabel=None, by=None,
              ylabel=None, title=None, logx=False, ax=None):
    """Draw a histogram of ``df[variable]``, optionally on a log x-axis.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the data. It is never modified.
    variable : str
        Column to plot; NaN values are dropped before plotting.
    bins : int
        Number of bins (with ``logx`` it is the number of log-spaced edges
        passed to ``np.logspace``).
    xlabel, ylabel, title : str, optional
        Axis labels / title; each is set only when given.
    by : unused, kept for signature compatibility.
    logx : bool
        Use log-spaced bins and a log-scaled x-axis. If the data contain
        values <= 0, a shifted copy (minimum moved to 1) is plotted and a
        warning reporting the shift is printed.
    ax : matplotlib axes, optional
        Axes to draw on; a new 12x8 figure is created when omitted.

    Returns
    -------
    The axes that were drawn on.
    """
    if ax is None:
        fig, ax = plt.subplots(figsize=(12, 8))

    # Work on a local Series so the caller's DataFrame is left untouched.
    values = df[variable]

    if logx:
        lowest = values.min()
        if lowest <= 0:
            # Compute the shift BEFORE applying it, so the warning reports
            # the offset that was actually used.
            shift = 1 - lowest
            values = values + shift
            print('Warning: data <=0 exists, data transformed by %0.2g before plotting' % shift)

        bins = np.logspace(np.log10(values.min()),
                           np.log10(values.max()), bins)
        ax.set_xscale("log")

    ax.hist(values.dropna().values, bins=bins)

    if xlabel:
        ax.set_xlabel(xlabel)
    if ylabel:
        ax.set_ylabel(ylabel)
    if title:
        ax.set_title(title)

    return ax

对人口取对数，使其分布更接近正态分布

# Same population histogram, but with log-spaced bins and a log x-axis.
plot_hist(recent, 'total_pop', bins=25, logx=True, 
          xlabel='Log of total population', ylabel='Number of countries',
          title='Distribution of total population of countries 2013-2017');

在这里插入图片描述

总人口折线图

# One population line per North American country, drawn with a 23-colour
# diverging palette that is active only inside the `with` block.
with sns.color_palette(sns.diverging_palette(220, 280, s=85, l=25, n=23)):
    # north_america is bound here to a list of country names, sorted by
    # their total_pop value in the 1958-1962 period.
    north_america = time_slice(subregion(data, 'North America'), '1958-1962').sort_values('total_pop').index.tolist()
    for country in north_america:
        plt.plot(time_series(data, country, 'total_pop'), label=country);
        # Labels/title are re-set on every iteration; only the last call matters.
        plt.xlabel('Year');
        plt.ylabel('Population');
        plt.title('North American populations over time');
    plt.legend(loc=2,prop={'size':10});

在这里插入图片描述

这张图除了说明美国是北美洲人口最多的国家之外，几乎没有告诉我们别的信息。我们想了解每个国家的人口是如何随着时间的推移而变化的，主要是相对于其自身的变化。
我们应该用什么作为归一化的基准？可以选择每个国家人口的最小值、平均值、中位数、最大值……或任何其他位置统计量。
让我们选择最小值，这样我们就能看到每个国家在起始人口上的增长。

增长倍率折线图

# Same plot, but each country's series is expressed as a percentage of its
# own minimum population (minimum == 100).
with sns.color_palette(sns.diverging_palette(220, 280, s=85, l=25, n=23)):
    # north_america is expected to be the list of country names here.
    for country in north_america:
        ts = time_series(data, country, 'total_pop')
        # Normalise by the country's smallest recorded population.
        ts['norm_pop'] = ts.total_pop/ts.total_pop.min()*100
        plt.plot(ts['norm_pop'], label=country);
        # Labels/title are re-set on every iteration; only the last call matters.
        plt.xlabel('Year');
        plt.ylabel('Percent increase in population');
        plt.title('Percent increase in population from 1960 in North American countries');
    plt.legend(loc=2,prop={'size':10});

在这里插入图片描述

# Country-by-time-period table of total population for North America.
north_america_pop = variable_slice(subregion(data, 'North America'), 'total_pop')
# Divide every row by that row's minimum and scale to percent.
north_america_norm_pop = north_america_pop.div(north_america_pop.min(axis=1), axis=0)*100
# Reorder the rows to follow the country list held in north_america.
north_america_norm_pop = north_america_norm_pop.loc[north_america]

总人口热力图

# Heatmap of normalised population: countries as rows, time periods as
# columns, drawn with a light sequential palette.
fig, ax = plt.subplots(figsize=(16, 12));
sns.heatmap(north_america_norm_pop, ax=ax, cmap=sns.light_palette((214, 90, 60), input="husl", as_cmap=True));
plt.xticks(rotation=45);
plt.xlabel('Time period');
plt.ylabel('Country, ordered by population in 1960 (<- greatest to least ->)');
plt.title('Percent increase in population from 1960');

在这里插入图片描述

可视化函数汇总

import folium
import matplotlib as mpl
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

def time_slice(df, time_period):
    """Pivot the rows of one time period into a country-by-variable table.

    The ``time_period``, ``country``, ``variable`` and ``value`` columns of
    ``df`` are read; the result's column axis is named ``time_period``.
    """
    # Only take data for time period of interest
    matches_period = df['time_period'] == time_period

    # Pivot table: countries down the side, variables across the top
    pivoted = df.loc[matches_period].pivot(
        index='country', columns='variable', values='value'
    )

    pivoted.columns.name = time_period
    return pivoted

def country_slice(df, country):
    """Pivot the rows of one country into a variable-by-time-period table.

    The ``country``, ``variable``, ``time_period`` and ``value`` columns of
    ``df`` are read; the result's index is named ``country``.
    """
    # Only take data for country of interest
    matches_country = df['country'] == country

    # Pivot table: variables down the side, time periods across the top
    pivoted = df.loc[matches_country].pivot(
        index='variable', columns='time_period', values='value'
    )

    pivoted.index.name = country
    return pivoted

def time_series(df, country, variable):
    """Return one country's measurements of one variable, indexed by year.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format table; the ``country``, ``variable``, ``year_measured``
        and ``value`` columns are read. ``df`` itself is not modified.
    country
        Value matched against ``df.country``.
    variable
        Value matched against ``df.variable``; also the name of the single
        output column.

    Returns
    -------
    pandas.DataFrame
        One column named ``variable``, indexed by ``year_measured`` cast
        to int.
    """
    # Only take data for country/variable combo, and only the two columns
    # that are actually returned.
    wanted = (df.country == country) & (df.variable == variable)
    series = df.loc[wanted, ['year_measured', 'value']]

    # Drop years with no data. Selecting the columns *before* dropna means a
    # missing value in an unrelated column no longer discards a measurement.
    series = series.dropna()

    # Change years to int and set as index (no in-place edits of a slice).
    series = series.astype({'year_measured': int}).set_index('year_measured')
    series.columns = [variable]
    return series

# Lookup from detailed, '|'-separated region labels to a coarse region name.
# NOTE(review): presumably applied to the `region` column before calls such
# as subregion(data, 'North America') -- the mapping step is not shown in
# this listing; confirm against the data-loading code.
simple_regions = {
    'World | Asia': 'Asia',
    'Americas | Central America and Caribbean | Central America': 'North America',
    'Americas | Central America and Caribbean | Greater Antilles': 'North America',
    'Americas | Central America and Caribbean | Lesser Antilles and Bahamas': 'North America',
    'Americas | Northern America | Northern America': 'North America',
    'Americas | Northern America | Mexico': 'North America',
    'Americas | Southern America | Guyana': 'South America',
    'Americas | Southern America | Andean': 'South America',
    'Americas | Southern America | Brazil': 'South America',
    'Americas | Southern America | Southern America': 'South America',
    'World | Africa': 'Africa',
    'World | Europe': 'Europe',
    'World | Oceania': 'Oceania'
}

def subregion(data, region):
    """Select the rows of ``data`` whose ``region`` column equals ``region``."""
    region_mask = data['region'] == region
    return data[region_mask]

def variable_slice(df, variable):
    """Pivot the rows of one variable into a country-by-time-period table.

    The ``variable``, ``country``, ``time_period`` and ``value`` columns of
    ``df`` are read.
    """
    matches_variable = df['variable'] == variable
    return df.loc[matches_variable].pivot(
        index='country', columns='time_period', values='value'
    )


def plot_map(df, variable, time_period=None, log=False,
             legend_name=None, threshold_scale=None,
             geo=r'../../data/aquastat/world.json'):
    """Build a folium choropleth of ``variable`` by country.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format data when ``time_period`` is given (it is pivoted with
        ``time_slice``); otherwise a table whose index is reset to obtain
        the ``country`` column.
    variable : str
        Column to colour the map by.
    time_period : optional
        Period to slice ``df`` to before plotting.
    log : bool
        Apply ``np.log`` to the plotted column first.
    legend_name : str, optional
        Legend caption; defaults to ``variable``.
    threshold_scale : optional
        Passed through to the choropleth call.
    geo : str
        Path of the GeoJSON file; features are matched on
        ``feature.properties.name``.

    Returns
    -------
    The ``folium.Map`` with the choropleth layer added.
    """
    # reset_index() yields a fresh frame in both branches, so the log
    # transform below does not touch the caller's data.
    source = time_slice(df, time_period) if time_period else df
    table = source.reset_index()

    if log:
        table[variable] = table[variable].apply(np.log)

    caption = legend_name if legend_name else variable

    # NOTE(review): Map.choropleth(geo_path=..., reset=...) is the legacy
    # folium API -- confirm the installed folium version still provides it.
    world_map = folium.Map(location=[34, -45], zoom_start=2,
                           width=1200, height=600)
    world_map.choropleth(
        geo_path=geo,
        data=table,
        columns=['country', variable],
        key_on='feature.properties.name',
        reset=True,
        fill_color='PuBuGn',
        fill_opacity=0.7,
        line_opacity=0.2,
        legend_name=caption,
        threshold_scale=threshold_scale,
    )
    return world_map


def map_over_time(df, variable, time_periods, log=False,
                  threshold_scale=None, legend_name=None,
                  geo=r'../../data/aquastat/world.json'):
    """Show ``plot_map`` interactively with a slider over ``time_periods``.

    ``time_periods`` must support ``.tolist()`` and indexing; its first
    element is the slider's initial value. Every other argument is passed
    to ``plot_map`` as a fixed (non-interactive) value.
    """
    period_slider = widgets.SelectionSlider(
        options=time_periods.tolist(),
        value=time_periods[0],
        description='Time period:',
        disabled=False,
        button_style='',
    )

    # Only the time period is interactive; everything else is pinned.
    interact_kwargs = {
        'df': widgets.fixed(df),
        'variable': widgets.fixed(variable),
        'time_period': period_slider,
        'log': widgets.fixed(log),
        'legend_name': widgets.fixed(legend_name),
        'threshold_scale': widgets.fixed(threshold_scale),
        'geo': widgets.fixed(geo),
    }
    widgets.interact(plot_map, **interact_kwargs)


def plot_hist(df, variable, bins=None, xlabel=None, by=None,
              ylabel=None, title=None, logx=False, ax=None):
    """Draw a histogram of ``df[variable]``, optionally grouped and/or log-x.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the data. It is never modified.
    variable : str
        Column to plot; NaN values are dropped before plotting.
    bins : int or sequence, optional
        Number of bins (default 20). With ``logx`` it is the number of
        log-spaced edges passed to ``np.logspace``. A sequence of bin
        edges is passed through unchanged when ``logx`` is false.
    xlabel, ylabel, title : str, optional
        Axis labels / title; each is set only when given.
    by : str, optional
        Column to group by; one histogram is drawn per distinct value
        (per category for categorical columns) on the same axes.
    logx : bool
        Use log-spaced bins and a log-scaled x-axis. If the data contain
        values <= 0, a shifted copy (minimum moved to 1) is plotted and a
        warning reporting the shift is printed.
    ax : matplotlib axes, optional
        Axes to draw on; a new 12x8 figure is created when omitted.

    Returns
    -------
    The axes that were drawn on.
    """
    # None (or 0) selects the default bin count; `not bins` would raise on
    # an array of bin edges, so only scalars are truth-tested.
    if bins is None or (np.isscalar(bins) and not bins):
        bins = 20

    if ax is None:
        fig, ax = plt.subplots(figsize=(12, 8))

    # Work on a local Series so the caller's DataFrame is left untouched.
    values = df[variable]

    if logx:
        lowest = values.min()
        if lowest <= 0:
            # Compute the shift BEFORE applying it, so the warning reports
            # the offset that was actually used.
            shift = 1 - lowest
            values = values + shift
            print('Warning: data <=0 exists, data transformed by %0.2g before plotting' % shift)
        bins = np.logspace(np.log10(values.min()),
                           np.log10(values.max()), bins)
        ax.set_xscale("log")

    if by:
        groups = df[by].unique()
        # pd.Categorical is the public class; the old private module path
        # pd.core.categorical is not available in current pandas.
        if isinstance(groups, pd.Categorical):
            cats = groups.categories.tolist()
        else:
            cats = groups.tolist()

        for cat in cats:
            to_plot = values[df[by] == cat].dropna()
            ax.hist(to_plot, bins=bins)
    else:
        ax.hist(values.dropna().values, bins=bins)

    if xlabel:
        ax.set_xlabel(xlabel)
    if ylabel:
        ax.set_ylabel(ylabel)
    if title:
        ax.set_title(title)

    return ax

def conditional_bar(series, bar_colors=None, color_labels=None, figsize=(13,24),
                   xlabel=None, by=None, ylabel=None, title=None):
    """Draw a horizontal bar chart of ``series`` and return the figure.

    Parameters
    ----------
    series : pandas.Series
        Bar lengths come from ``series.values``; tick labels from its index.
    bar_colors : colour or sequence of colours, optional
        Passed to ``barh`` as ``color``. When omitted (or empty) the first
        colour of matplotlib's default property cycle is used.
    color_labels : dict, optional
        Mapping of colour -> legend label; when given, a legend with one
        square marker per entry is added in the upper right.
    figsize : tuple
        Figure size handed to ``plt.subplots``.
    xlabel, ylabel, title : str, optional
        Axis labels / title; empty when omitted.
    by : unused, kept for signature compatibility.

    Returns
    -------
    The matplotlib figure. It is closed before being returned, so it is not
    also rendered as the current pyplot figure.
    """
    fig, ax  = plt.subplots(figsize=figsize)

    # `not bar_colors` raises "truth value is ambiguous" for arrays/Series of
    # per-bar colours, so test for "missing or empty" explicitly instead.
    if bar_colors is None or len(bar_colors) == 0:
        bar_colors = mpl.rcParams['axes.prop_cycle'].by_key()['color'][0]

    plt.barh(range(len(series)),series.values, color=bar_colors)
    plt.xlabel('' if not xlabel else xlabel)
    plt.ylabel('' if not ylabel else ylabel)
    plt.yticks(range(len(series)), series.index.tolist())
    plt.title('' if not title else title)
    plt.ylim([-1,len(series)])

    if color_labels:
        # Empty plots act as legend proxies: one square marker per colour.
        for col, lab in color_labels.items():
            plt.plot([], linestyle='',marker='s',c=col, label= lab)
        lines, labels = ax.get_legend_handles_labels()
        # Keep only the proxy entries, which were added last.
        ax.legend(lines[-len(color_labels.keys()):], labels[-len(color_labels.keys()):], loc='upper right')
    plt.close()
    return fig


def plot_scatter(df, x, y, xlabel=None, ylabel=None, title=None,
                 logx=False, logy=False, by=None, ax=None):
    """Scatter ``df[y]`` against ``df[x]``, optionally coloured by group.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the data.
    x, y : str
        Columns for the horizontal / vertical axis; also the default labels.
    xlabel, ylabel, title : str, optional
        Axis labels / title overrides.
    logx, logy : bool
        Put the corresponding axis on a log scale.
    by : str, optional
        Column to group by; each group gets its own colour and legend entry.
    ax : matplotlib axes, optional
        Axes to draw on; a new 12x10 figure is created when omitted.
    """
    if ax is None:
        fig, ax = plt.subplots(figsize=(12, 10))

    colors = mpl.rcParams['axes.prop_cycle'].by_key()['color']
    if by:
        groups = df.groupby(by)
        for j, (name, group) in enumerate(groups):
            # Wrap around the colour cycle so more groups than colours does
            # not raise IndexError.
            ax.scatter(group[x], group[y],
                       color=colors[j % len(colors)], label=name)
        ax.legend()
    else:
        ax.scatter(df[x], df[y], color=colors[0])
    if logx:
        ax.set_xscale('log')
    if logy:
        ax.set_yscale('log')

    ax.set_xlabel(xlabel if xlabel else x)
    ax.set_ylabel(ylabel if ylabel else y)
    if title:
        ax.set_title(title)

def two_hist(df, variable, bins=50,
              ylabel='Number of countries', title=None):
    """Return a figure with linear and log-x histograms of ``variable``.

    The left panel is a plain histogram, the right one uses ``logx=True``.
    When ``title`` is given it is used for both panels; otherwise the
    panels are titled with the variable name and 'Log of <variable>'.
    The figure is closed before being returned.
    """
    fig, (linear_ax, log_ax) = plt.subplots(1, 2, figsize=(18,8))
    log_name = 'Log of '+ variable

    plot_hist(df, variable, bins=bins,
              xlabel=variable, ylabel=ylabel,
              ax=linear_ax, title=title if title else variable)
    plot_hist(df, variable, bins=bins,
              xlabel=log_name, ylabel=ylabel,
              logx=True, ax=log_ax,
              title=title if title else log_name)

    plt.close()
    return fig

def hist_over_var(df, variables, bins=50, first_choice=None,
                  ylabel='Number of countries', title=None):
    """Show ``two_hist`` interactively with a dropdown over ``variables``.

    ``variables`` must support ``.tolist()`` and indexing. The dropdown
    starts on ``first_choice`` when given, otherwise on ``variables[0]``.
    All other arguments are passed to ``two_hist`` as fixed values.
    """
    initial = first_choice if first_choice else variables[0]

    variable_picker = widgets.Dropdown(
        options=variables.tolist(),
        value=initial,
        description='Variable:',
        disabled=False,
        button_style='',
    )

    # Only the variable is interactive; everything else is pinned.
    interact_kwargs = {
        'df': widgets.fixed(df),
        'variable': variable_picker,
        'ylabel': widgets.fixed(ylabel),
        'title': widgets.fixed(title),
        'bins': widgets.fixed(bins),
    }
    widgets.interact(two_hist, **interact_kwargs)

curd_boy

关注

0
点赞
踩
6

收藏

觉得还不错? 一键收藏
1
评论
探索性数据分析-粮农组织数据集

粮农组织的三个主要目标是:消除饥饿、粮食不安全和营养不良消除贫困促进经济社会进步自然资源的可持续管理和利用，包括土地、水、空气、气候和遗传资源，以造福今世后代。为支持这些目标，《宪法》第1条要求粮农组织“收集、分析、解释和传播与营养、粮食和农业有关的信息”。因此，水温自动调节器开始，其目的是通过收集有助于联合国粮农组织的目标，与水资源相关的信息传播分析，用水和农业用水管理，对国家重点在...
复制链接

扫一扫