第一章：机器学习概览

最新推荐文章于 2022-04-14 20:41:28 发布

发呆少女

最新推荐文章于 2022-04-14 20:41:28 发布

阅读量233

点赞数

分类专栏：机器学习实战 文章标签：机器学习 python

本文链接：https://blog.csdn.net/qq_43632490/article/details/114335618

版权

机器学习实战专栏收录该内容

2 篇文章 4 订阅

订阅专栏

算法1：

import sys
import os
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from sklearn import linear_model


def prepare_country_stats(oecd_bli, gdp_per_capita):
    """Merge the life-satisfaction data with the GDP-per-capita data.

    Neither input frame is modified: renaming and re-indexing are done on
    copies, so the function can be called repeatedly with the same frames.

    Args:
        oecd_bli: Long-format frame with at least the columns ``INEQUALITY``,
            ``Country``, ``Indicator`` and ``Value``.
        gdp_per_capita: Frame with at least the columns ``Country`` and
            ``2015``; the latter is exposed as ``GDP per capita``.

    Returns:
        A frame indexed by country with the columns ``GDP per capita`` and
        ``Life satisfaction``, sorted by ascending GDP per capita, with the
        rows at positions 0, 1, 6, 8, 33, 34 and 35 left out (they are held
        back so later fits can be compared with and without them).
    """
    # Keep only the "TOT" rows, then turn every indicator into its own column.
    totals = oecd_bli[oecd_bli["INEQUALITY"] == "TOT"]
    indicators = totals.pivot(index="Country", columns="Indicator", values="Value")
    # rename/set_index without inplace=True return new frames, so the caller's
    # gdp_per_capita keeps its original columns and index.
    gdp = gdp_per_capita.rename(columns={"2015": "GDP per capita"}).set_index("Country")
    # Join on the country index and order by GDP.
    merged = pd.merge(left=indicators, right=gdp, left_index=True, right_index=True)
    merged = merged.sort_values(by="GDP per capita")
    # The hard-coded positions assume a merged frame of 36 countries.
    remove_indices = {0, 1, 6, 8, 33, 34, 35}
    # Build the kept positions in ascending order explicitly rather than
    # relying on set iteration order.
    keep_indices = [pos for pos in range(36) if pos not in remove_indices]
    return merged[["GDP per capita", "Life satisfaction"]].iloc[keep_indices]


# Directory holding the data files (joining with "" leaves a trailing separator,
# so file names can be appended directly)
datapath = os.path.join("data", "")
# Configure default font sizes for axis labels and tick labels
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)
# Load the data
oecd_bli = pd.read_csv(datapath + "oecd_bli_2015.csv", thousands=',')
gdp_per_capita = pd.read_csv(datapath + "gdp_per_capita.csv", thousands=',', delimiter='\t',
                             encoding='latin1', na_values="n/a")
# Merge the two data sets
country_stats = prepare_country_stats(oecd_bli, gdp_per_capita)
# np.c_ turns each column into a 2-D column array, as expected by the estimators below
X = np.c_[country_stats["GDP per capita"]]
y = np.c_[country_stats["Life satisfaction"]]
# Scatter plot of the data
country_stats.plot(kind='scatter', x="GDP per capita", y='Life satisfaction')
plt.show()

在这里插入图片描述

# Linear regression model
model = sklearn.linear_model.LinearRegression()

# Train the model
model.fit(X, y)

# Predict for new data
X_new = [[22587]]  # GDP per capita of the new country
print(model.predict(X_new))  # prediction for the new data; author's reported output: [[ 5.96242338]]

在这里插入图片描述

# Figures produced by the script are saved under this folder
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "fundamentals"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)  # image output directory
os.makedirs(IMAGES_PATH, exist_ok=True)  # create it if it does not exist yet


def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure into ``IMAGES_PATH``.

    Args:
        fig_id: File name of the figure, without extension.
        tight_layout: When true, ``plt.tight_layout()`` is called before
            saving.
        fig_extension: File extension; also passed to ``plt.savefig`` as the
            output format.
        resolution: Passed to ``plt.savefig`` as ``dpi``.

    Returns:
        None. The figure is written to
        ``<IMAGES_PATH>/<fig_id>.<fig_extension>``.
    """
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)


# Seed NumPy's random generator so every run gives the same results
# (the value 42 has no special meaning)
np.random.seed(42)

# Take a look at the life-satisfaction data set
oecd_bli = pd.read_csv(datapath + "oecd_bli_2015.csv", thousands=',')
oecd_bli = oecd_bli[oecd_bli["INEQUALITY"] == "TOT"]
oecd_bli = oecd_bli.pivot(index="Country", columns="Indicator", values="Value")
print(oecd_bli.head(2))  # show the first two rows

在这里插入图片描述

# Show only the life-satisfaction column; look at the first five rows
print(oecd_bli["Life satisfaction"].head())

在这里插入图片描述

# Take a look at the per-country GDP data set
gdp_per_capita = pd.read_csv(datapath + "gdp_per_capita.csv", thousands=',', delimiter='\t',
                             encoding='latin1', na_values="n/a")
gdp_per_capita.rename(columns={"2015": "GDP per capita"}, inplace=True)
gdp_per_capita.set_index("Country", inplace=True)
print(gdp_per_capita.head(2))

在这里插入图片描述

# Take a look at the merged data set
full_country_stats = pd.merge(left=oecd_bli, right=gdp_per_capita, left_index=True, right_index=True)
full_country_stats.sort_values(by="GDP per capita", inplace=True)  # sort by GDP per capita, ascending
print(full_country_stats.head())

在这里插入图片描述

# Look at the GDP per capita and life satisfaction of the United States
print(full_country_stats[["GDP per capita", 'Life satisfaction']].loc["United States"])

在这里插入图片描述

# Hold some rows out of the data set (author's guess: possibly meant as a test set,
# 36 * 0.2 is roughly 7). They are plotted again further down to show how the fit changes.
remove_indices = [0, 1, 6, 8, 33, 34, 35]
keep_indices = list(set(range(36)) - set(remove_indices))
sample_data = full_country_stats[["GDP per capita", 'Life satisfaction']].iloc[keep_indices]  # rows kept (author: "training set")
missing_data = full_country_stats[["GDP per capita", 'Life satisfaction']].iloc[remove_indices]  # rows held out (author: "test set")

# Highlight part of the data in the scatter plot
sample_data.plot(kind='scatter', x="GDP per capita", y='Life satisfaction', figsize=(5, 3))
plt.axis([0, 60000, 0, 10])  # axis limits: [xmin, xmax, ymin, ymax]

# country -> (x, y) position of that country's annotation text in the plot
position_text = {
    "Hungary": (5000, 1),
    "Korea": (18000, 1.7),
    "France": (29000, 2.4),
    "Australia": (40000, 3.0),
    "United States": (52000, 3.8),
}
for country, pos_text in position_text.items():
    pos_data_x, pos_data_y = sample_data.loc[country]
    country = "U.S." if country == "United States" else country

    # Draw the arrow from the label to the data point
    plt.annotate(country, xy=(pos_data_x, pos_data_y), xytext=pos_text,
                 arrowprops=dict(facecolor='black', width=0.5, shrink=0.1, headwidth=5))

    # Highlight the data point of that country (small red dot)
    plt.plot(pos_data_x, pos_data_y, "ro")
plt.xlabel("GDP per capita (USD)")
# save_fig('money_happy_scatterplot')  (the author chose not to save the figure here)
plt.show()

在这里插入图片描述

# Write the prepared rows (sample_data) to data/lifesat.csv
sample_data.to_csv(os.path.join("data", "lifesat.csv"))

在这里插入图片描述

# Show the rows that were highlighted in the plot
print(sample_data.loc[list(position_text.keys())])

在这里插入图片描述

# Draw straight lines for several hand-picked parameter values
sample_data.plot(kind='scatter', x="GDP per capita", y='Life satisfaction', figsize=(5, 3))
plt.xlabel("GDP per capita (USD)")
plt.axis([0, 60000, 0, 10])
X = np.linspace(0, 60000, 1000)

# First line (red)
plt.plot(X, 2 * X / 100000, "r")
plt.text(40000, 2.7, r"$\theta_0 = 0$", fontsize=14, color="r")
plt.text(40000, 1.8, r"$\theta_1 = 2 \times 10^{-5}$", fontsize=14, color="r")

# Second line (green)
plt.plot(X, 8 - 5 * X / 100000, "g")
plt.text(5000, 9.1, r"$\theta_0 = 8$", fontsize=14, color="g")
plt.text(5000, 8.2, r"$\theta_1 = -5 \times 10^{-5}$", fontsize=14, color="g")

# Third line (blue)
plt.plot(X, 4 + 5 * X / 100000, "b")
plt.text(5000, 3.5, r"$\theta_0 = 4$", fontsize=14, color="b")
plt.text(5000, 2.6, r"$\theta_1 = 5 \times 10^{-5}$", fontsize=14, color="b")
# save_fig('tweaking_model_params_plot')
plt.show()

在这里插入图片描述

# Fit a linear regression model on the kept rows
lin1 = linear_model.LinearRegression()
Xsample = np.c_[sample_data["GDP per capita"]]
ysample = np.c_[sample_data["Life satisfaction"]]
lin1.fit(Xsample, ysample)
t0, t1 = lin1.intercept_[0], lin1.coef_[0][0]
# lin1.intercept_[0] is the intercept
# lin1.coef_[0][0] is the slope
print(t0, t1)  # print the learned parameters

在这里插入图片描述

# Draw the fitted regression line using the learned parameters
sample_data.plot(kind='scatter', x="GDP per capita", y='Life satisfaction', figsize=(5, 3))
plt.xlabel("GDP per capita (USD)")
plt.axis([0, 60000, 0, 10])
X = np.linspace(0, 60000, 1000)
plt.plot(X, t0 + t1 * X, "b")
# The parameter values shown in the labels are hard-coded text, not read from t0/t1
plt.text(5000, 3.1, r"$\theta_0 = 4.85$", fontsize=14, color="b")
plt.text(5000, 2.2, r"$\theta_1 = 4.91 \times 10^{-5}$", fontsize=14, color="b")
# save_fig('best_fit_model_plot')
plt.show()

在这里插入图片描述

# Look up Cyprus in the GDP table and predict its life satisfaction with lin1
cyprus_gdp_per_capita = gdp_per_capita.loc["Cyprus"]["GDP per capita"]
print(cyprus_gdp_per_capita)  # print the GDP per capita of Cyprus
cyprus_predicted_life_satisfaction = lin1.predict([[cyprus_gdp_per_capita]])[0][0]
print(cyprus_predicted_life_satisfaction)  # print the predicted value

在这里插入图片描述

# Plot the point that was just predicted
sample_data.plot(kind='scatter', x="GDP per capita", y='Life satisfaction', figsize=(5, 3), s=1)
plt.xlabel("GDP per capita (USD)")
X = np.linspace(0, 60000, 1000)
plt.plot(X, t0 + t1 * X, "b")
plt.axis([0, 60000, 0, 10])
plt.text(5000, 7.5, r"$\theta_0 = 4.85$", fontsize=14, color="b")
plt.text(5000, 6.6, r"$\theta_1 = 4.91 \times 10^{-5}$", fontsize=14, color="b")
plt.plot([cyprus_gdp_per_capita, cyprus_gdp_per_capita], [0, cyprus_predicted_life_satisfaction], "r--")  # dashed vertical line up to the prediction
plt.text(25000, 5.0, r"Prediction = 5.96", fontsize=14, color="b")
plt.plot(cyprus_gdp_per_capita, cyprus_predicted_life_satisfaction, "ro")  # small red dot at the prediction
# save_fig('cyprus_prediction_plot')
plt.show()

# k-nearest neighbours, computed by hand
print(sample_data[7:10])  # the three countries whose GDP per capita is close to that of Cyprus
# Mean of three hard-coded values (presumably the life satisfaction of the rows printed above)
print((5.1 + 5.7 + 6.5) / 3)  # prediction obtained with the k-nearest-neighbours idea

# k-nearest-neighbours regression model (computed by the library)
import sklearn.neighbors
model = sklearn.neighbors.KNeighborsRegressor(n_neighbors=3)
X = np.c_[country_stats["GDP per capita"]]
y = np.c_[country_stats["Life satisfaction"]]
model.fit(X, y)
X_new = np.array([[22587.0]])
print(model.predict(X_new))

在这里插入图片描述

# Look at the rows that were held out earlier
print(missing_data)
# Draw the fitted line with those rows added back. The line changes noticeably,
# which shows that the earlier training data was not representative enough.
# country -> (x, y) position of that country's annotation text in the plot
position_text2 = {
    "Brazil": (1000, 9.0),
    "Mexico": (11000, 9.0),
    "Chile": (25000, 9.0),
    "Czech Republic": (35000, 9.0),
    "Norway": (60000, 3),
    "Switzerland": (72000, 3.0),
    "Luxembourg": (90000, 3.0),
}
sample_data.plot(kind='scatter', x="GDP per capita", y='Life satisfaction', figsize=(8, 3))
plt.axis([0, 110000, 0, 10])
# Highlight the rows that were held out (red squares)
for country, pos_text in position_text2.items():
    pos_data_x, pos_data_y = missing_data.loc[country]
    plt.annotate(country, xy=(pos_data_x, pos_data_y), xytext=pos_text,
                 arrowprops=dict(facecolor='black', width=0.5, shrink=0.1, headwidth=5))
    plt.plot(pos_data_x, pos_data_y, "rs")
# Draw the earlier fitted line (blue dotted line)
X = np.linspace(0, 110000, 1000)
plt.plot(X, t0 + t1 * X, "b:")
# Draw the line fitted with the held-out rows included (black line)
lin_reg_full = linear_model.LinearRegression()
Xfull = np.c_[full_country_stats["GDP per capita"]]
yfull = np.c_[full_country_stats["Life satisfaction"]]
lin_reg_full.fit(Xfull, yfull)
t0full, t1full = lin_reg_full.intercept_[0], lin_reg_full.coef_[0][0]
X = np.linspace(0, 110000, 1000)
plt.plot(X, t0full + t1full * X, "k")
plt.xlabel("GDP per capita (USD)")
# save_fig('representative_training_data_scatterplot')
plt.show()

在这里插入图片描述

# Overfitting example
full_country_stats.plot(kind='scatter', x="GDP per capita", y='Life satisfaction', figsize=(8, 3))
plt.axis([0, 110000, 0, 10])

from sklearn import preprocessing
from sklearn import pipeline

poly = preprocessing.PolynomialFeatures(degree=60, include_bias=False)  # expand the number of features with polynomial terms
scaler = preprocessing.StandardScaler()  # feature standardization: afterwards each feature has mean 0 and standard deviation 1
# If one feature has a much larger variance than the others, it dominates the
# learning algorithm and keeps the learner from using the other features as
# intended, which makes the model converge slowly or not at all. Such features
# therefore need to be standardized / normalized.
lin_reg2 = linear_model.LinearRegression()  # linear regression
pipeline_reg = pipeline.Pipeline([('poly', poly), ('scal', scaler), ('lin', lin_reg2)])  # pipeline: poly -> scaler -> regression
pipeline_reg.fit(Xfull, yfull)
# X[:, np.newaxis] reshapes the 1-D grid into a single-column 2-D array
curve = pipeline_reg.predict(X[:, np.newaxis])
# The bare string below is a no-op note illustrating what [:, np.newaxis] does
'''
print(np.arange(0, 10)[:, np.newaxis])
结果如下：
[[0]
 [1]
 [2]
 [3]
 [4]
 [5]
 [6]
 [7]
 [8]
 [9]]
'''
plt.plot(X, curve)
plt.xlabel("GDP per capita (USD)")
# save_fig('overfitting_model_plot')
plt.show()

在这里插入图片描述

# Compare the fits obtained in the different settings
plt.figure(figsize=(8, 3))
plt.xlabel("GDP per capita")
plt.ylabel('Life satisfaction')
plt.plot(list(sample_data["GDP per capita"]), list(sample_data["Life satisfaction"]), "bo")  # blue dots: rows that were kept
plt.plot(list(missing_data["GDP per capita"]), list(missing_data["Life satisfaction"]), "rs")  # red squares: rows that were held out
X = np.linspace(0, 110000, 1000)
plt.plot(X, t0full + t1full * X, "r--", label="Linear model on all data")  # red dashed line: fit on all data
plt.plot(X, t0 + t1 * X, "b:", label="Linear model on partial data")  # blue dotted line: fit on the partial data
ridge = linear_model.Ridge(alpha=10 ** 9.5)  # ridge regression (regularized linear regression)
Xsample = np.c_[sample_data["GDP per capita"]]
ysample = np.c_[sample_data["Life satisfaction"]]
ridge.fit(Xsample, ysample)
t0ridge, t1ridge = ridge.intercept_[0], ridge.coef_[0][0]
plt.plot(X, t0ridge + t1ridge * X, "b", label="Regularized linear model on partial data")  # solid blue line: regularized fit on the partial data
plt.legend(loc="lower right")
plt.axis([0, 110000, 0, 10])
plt.xlabel("GDP per capita (USD)")
# save_fig('ridge_model_plot')
plt.show()

在这里插入图片描述

发呆少女

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
2
评论
第一章：机器学习概览

算法1： import sys; import os; import matplotlib as mpl; import matplotlib.pyplot as plt; import numpy as np; import pandas as pd; import sklearn; from sklearn import linear_model; def prepare_country_stats(oecd_bli, gdp_per_capita): ''' 将两个数据集合并 Args: ……
复制链接

扫一扫