# Algorithm 1:
import sys
import os
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from sklearn import linear_model
def prepare_country_stats(oecd_bli, gdp_per_capita):
    """Merge the life-satisfaction and GDP-per-capita tables into one frame.

    Neither input frame is modified, so the function can be called more than
    once with the same arguments.

    Args:
        oecd_bli: Life-satisfaction data in long form; the columns
            "INEQUALITY", "Country", "Indicator" and "Value" are read.
        gdp_per_capita: GDP data; the columns "Country" and "2015" are read.

    Returns:
        A DataFrame indexed by country and sorted by ascending
        "GDP per capita", holding the columns "GDP per capita" and
        "Life satisfaction". Of the first 36 sorted rows, those at positions
        0, 1, 6, 8, 33, 34 and 35 are left out (they are held back so a later
        fit on all rows can be compared with a fit on this subset).

    Raises:
        IndexError: If the merged table has fewer than 36 rows, because the
            positional selection then points past the end of the table.
    """
    # Keep only the "TOT" rows, then turn each indicator into its own column.
    oecd_bli = oecd_bli[oecd_bli["INEQUALITY"] == "TOT"]
    oecd_bli = oecd_bli.pivot(index="Country", columns="Indicator", values="Value")
    # Rename / re-index into new frames instead of using inplace=True, so the
    # caller's DataFrame keeps its "Country" and "2015" columns.
    gdp_per_capita = gdp_per_capita.rename(columns={"2015": "GDP per capita"})
    gdp_per_capita = gdp_per_capita.set_index("Country")
    # Join the two tables on their country index.
    full_country_stats = pd.merge(left=oecd_bli, right=gdp_per_capita,
                                  left_index=True, right_index=True)
    # Order the rows by GDP so the positional indices below are meaningful.
    full_country_stats = full_country_stats.sort_values(by="GDP per capita")
    # Positions (in GDP order) of the rows to hold back.
    remove_indices = {0, 1, 6, 8, 33, 34, 35}
    # Build the keep-list in ascending order explicitly rather than relying on
    # the iteration order of a set.
    keep_indices = [i for i in range(36) if i not in remove_indices]
    return full_country_stats[["GDP per capita", 'Life satisfaction']].iloc[keep_indices]
# Directory holding the input CSV files. Joining with "" leaves a trailing
# separator, so file names can be appended by plain string concatenation below.
datapath = os.path.join("data", "")
# Default font sizes for axis labels and tick labels on the figures.
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)
# Load the two raw datasets.
oecd_bli = pd.read_csv(datapath + "oecd_bli_2015.csv", thousands=',')
gdp_per_capita = pd.read_csv(datapath + "gdp_per_capita.csv", thousands=',', delimiter='\t',
encoding='latin1', na_values="n/a")
# Merge the two datasets into one table.
country_stats = prepare_country_stats(oecd_bli, gdp_per_capita)
# np.c_ turns each 1-D column into a 2-D column array, which is what fit() is given below.
X = np.c_[country_stats["GDP per capita"]]
y = np.c_[country_stats["Life satisfaction"]]
# Scatter plot of the data.
country_stats.plot(kind='scatter', x="GDP per capita", y='Life satisfaction')
plt.show()
# Linear regression model.
model = sklearn.linear_model.LinearRegression()
# Train the model.
model.fit(X, y)
# Predict for a new data point.
X_new = [[22587]]  # GDP per capita of the new country
print(model.predict(X_new))  # prediction for the new point; the original author recorded [[ 5.96242338]]
# Location where figures produced by this script are saved.
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "fundamentals"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)  # directory the images are written to
os.makedirs(IMAGES_PATH, exist_ok=True)  # create the directory up front; no error if it already exists
def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Write the current pyplot figure to a file under IMAGES_PATH.

    Args:
        fig_id: Base name of the image file, without the extension.
        tight_layout: When true, ``plt.tight_layout()`` is called before saving.
        fig_extension: File extension; also passed to ``savefig`` as ``format``.
        resolution: Passed to ``savefig`` as ``dpi``.

    Returns:
        None. The image is saved as ``<IMAGES_PATH>/<fig_id>.<fig_extension>``.
    """
    file_name = fig_id + "." + fig_extension
    destination = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        # Adjust the layout first so the saved file reflects it.
        plt.tight_layout()
    plt.savefig(destination, format=fig_extension, dpi=resolution)
# Seed NumPy's random generator so results are the same on every run
# (the value 42 has no special meaning).
np.random.seed(42)
# Take a look at the life-satisfaction dataset.
oecd_bli = pd.read_csv(datapath + "oecd_bli_2015.csv", thousands=',')
oecd_bli = oecd_bli[oecd_bli["INEQUALITY"] == "TOT"]
oecd_bli = oecd_bli.pivot(index="Country", columns="Indicator", values="Value")
print(oecd_bli.head(2))  # show the first two rows
# Show only the "Life satisfaction" column; look at its first five rows.
print(oecd_bli["Life satisfaction"].head())
# Take a look at the GDP-per-capita dataset.
gdp_per_capita = pd.read_csv(datapath + "gdp_per_capita.csv", thousands=',', delimiter='\t',
encoding='latin1', na_values="n/a")
gdp_per_capita.rename(columns={"2015": "GDP per capita"}, inplace=True)
gdp_per_capita.set_index("Country", inplace=True)
print(gdp_per_capita.head(2))
# Take a look at the merged dataset.
full_country_stats = pd.merge(left=oecd_bli, right=gdp_per_capita, left_index=True, right_index=True)
full_country_stats.sort_values(by="GDP per capita", inplace=True)  # sort by GDP per capita, ascending
print(full_country_stats.head())
# GDP per capita and life satisfaction of the United States.
print(full_country_stats[["GDP per capita", 'Life satisfaction']].loc["United States"])
# Hold some rows out of the dataset (the original author's guess: they may
# serve as a test set, since 36 * 0.2 is roughly 7).
remove_indices = [0, 1, 6, 8, 33, 34, 35]
keep_indices = list(set(range(36)) - set(remove_indices))
sample_data = full_country_stats[["GDP per capita", 'Life satisfaction']].iloc[keep_indices]  # retained rows (training set)
missing_data = full_country_stats[["GDP per capita", 'Life satisfaction']].iloc[remove_indices]  # held-out rows (test set)
# Scatter plot of the retained rows with a few countries highlighted.
sample_data.plot(kind='scatter', x="GDP per capita", y='Life satisfaction', figsize=(5, 3))
plt.axis([0, 60000, 0, 10])  # axis limits: [xmin, xmax, ymin, ymax]
# country -> (x, y) of the lower-left corner of that country's text label
position_text = {
    "Hungary": (5000, 1),
    "Korea": (18000, 1.7),
    "France": (29000, 2.4),
    "Australia": (40000, 3.0),
    "United States": (52000, 3.8),
}
# The loop body is indented here; without the indentation the `for`
# statement has no body and the file does not parse.
for country, pos_text in position_text.items():
    pos_data_x, pos_data_y = sample_data.loc[country]
    # Use a shorter label for the United States.
    country = "U.S." if country == "United States" else country
    # Draw an arrow from the label to the data point.
    plt.annotate(country, xy=(pos_data_x, pos_data_y), xytext=pos_text,
                 arrowprops=dict(facecolor='black', width=0.5, shrink=0.1, headwidth=5))
    # Highlight the data point itself with a small red dot.
    plt.plot(pos_data_x, pos_data_y, "ro")
plt.xlabel("GDP per capita (USD)")
# save_fig('money_happy_scatterplot')  (the figure is deliberately not saved here)
plt.show()
# Store the retained rows as a CSV file in the data directory.
sample_data.to_csv(os.path.join("data", "lifesat.csv"))
# Show the rows of the highlighted countries.
print(sample_data.loc[list(position_text.keys())])
# Draw straight lines for several hand-picked parameter values on top of the data.
sample_data.plot(kind='scatter', x="GDP per capita", y='Life satisfaction', figsize=(5, 3))
plt.xlabel("GDP per capita (USD)")
plt.axis([0, 60000, 0, 10])
# X is rebound here to 1000 evenly spaced x values used only for drawing lines.
X = np.linspace(0, 60000, 1000)
# First line (red): intercept 0, slope 2e-5.
plt.plot(X, 2 * X / 100000, "r")
plt.text(40000, 2.7, r"$\theta_0 = 0$", fontsize=14, color="r")
plt.text(40000, 1.8, r"$\theta_1 = 2 \times 10^{-5}$", fontsize=14, color="r")
# Second line (green): intercept 8, slope -5e-5.
plt.plot(X, 8 - 5 * X / 100000, "g")
plt.text(5000, 9.1, r"$\theta_0 = 8$", fontsize=14, color="g")
plt.text(5000, 8.2, r"$\theta_1 = -5 \times 10^{-5}$", fontsize=14, color="g")
# Third line (blue): intercept 4, slope 5e-5.
plt.plot(X, 4 + 5 * X / 100000, "b")
plt.text(5000, 3.5, r"$\theta_0 = 4$", fontsize=14, color="b")
plt.text(5000, 2.6, r"$\theta_1 = 5 \times 10^{-5}$", fontsize=14, color="b")
# save_fig('tweaking_model_params_plot')
plt.show()
# Fit a linear regression model on the retained rows.
lin1 = linear_model.LinearRegression()
Xsample = np.c_[sample_data["GDP per capita"]]
ysample = np.c_[sample_data["Life satisfaction"]]
lin1.fit(Xsample, ysample)
t0, t1 = lin1.intercept_[0], lin1.coef_[0][0]
# lin1.intercept_[0] is the intercept (theta_0)
# lin1.coef_[0][0] is the slope (theta_1)
print(t0, t1)  # print the learned parameters
# Draw the fitted line using the learned parameters.
sample_data.plot(kind='scatter', x="GDP per capita", y='Life satisfaction', figsize=(5, 3))
plt.xlabel("GDP per capita (USD)")
plt.axis([0, 60000, 0, 10])
X = np.linspace(0, 60000, 1000)
plt.plot(X, t0 + t1 * X, "b")
# NOTE: the numbers in the two labels below are literal text; they are not
# formatted from t0 / t1, so they only match when the fit yields these values.
plt.text(5000, 3.1, r"$\theta_0 = 4.85$", fontsize=14, color="b")
plt.text(5000, 2.2, r"$\theta_1 = 4.91 \times 10^{-5}$", fontsize=14, color="b")
# save_fig('best_fit_model_plot')
plt.show()
# Look up Cyprus in the GDP table and predict its life satisfaction with lin1.
cyprus_gdp_per_capita = gdp_per_capita.loc["Cyprus"]["GDP per capita"]
print(cyprus_gdp_per_capita)  # GDP per capita of Cyprus
cyprus_predicted_life_satisfaction = lin1.predict([[cyprus_gdp_per_capita]])[0][0]
print(cyprus_predicted_life_satisfaction)  # the predicted value
# Plot the point that was just predicted.
sample_data.plot(kind='scatter', x="GDP per capita", y='Life satisfaction', figsize=(5, 3), s=1)
plt.xlabel("GDP per capita (USD)")
X = np.linspace(0, 60000, 1000)
plt.plot(X, t0 + t1 * X, "b")
plt.axis([0, 60000, 0, 10])
# NOTE: the theta values and the "5.96" below are literal label text, not computed here.
plt.text(5000, 7.5, r"$\theta_0 = 4.85$", fontsize=14, color="b")
plt.text(5000, 6.6, r"$\theta_1 = 4.91 \times 10^{-5}$", fontsize=14, color="b")
plt.plot([cyprus_gdp_per_capita, cyprus_gdp_per_capita], [0, cyprus_predicted_life_satisfaction], "r--")  # dashed vertical line from the x-axis up to the prediction
plt.text(25000, 5.0, r"Prediction = 5.96", fontsize=14, color="b")
plt.plot(cyprus_gdp_per_capita, cyprus_predicted_life_satisfaction, "ro")  # small red dot at the predicted point
# save_fig('cyprus_prediction_plot')
plt.show()
# k-nearest neighbours, worked out by hand.
print(sample_data[7:10])  # look at three countries whose GDP per capita is close to that of Cyprus
# Average of three hard-coded values; presumably the life-satisfaction
# figures of the three rows printed above (verify against the data).
print((5.1 + 5.7 + 6.5) / 3)  # the prediction obtained with the k-nearest-neighbours approach
# k-nearest-neighbours regression, computed by the library model.
import sklearn.neighbors
model = sklearn.neighbors.KNeighborsRegressor(n_neighbors=3)
# X and y are rebound to the training arrays built from country_stats.
X = np.c_[country_stats["GDP per capita"]]
y = np.c_[country_stats["Life satisfaction"]]
model.fit(X, y)
X_new = np.array([[22587.0]])
print(model.predict(X_new))
# Look at the rows that were held out earlier.
print(missing_data)
# Draw the fitted line again with those rows included. The line changes
# visibly, which shows the earlier training data was not representative enough.
# country -> (x, y) position of that country's text label
position_text2 = {
    "Brazil": (1000, 9.0),
    "Mexico": (11000, 9.0),
    "Chile": (25000, 9.0),
    "Czech Republic": (35000, 9.0),
    "Norway": (60000, 3),
    "Switzerland": (72000, 3.0),
    "Luxembourg": (90000, 3.0),
}
sample_data.plot(kind='scatter', x="GDP per capita", y='Life satisfaction', figsize=(8, 3))
plt.axis([0, 110000, 0, 10])
# Highlight the rows that were held out. The loop body is indented here;
# without the indentation the `for` statement has no body and the file does
# not parse.
for country, pos_text in position_text2.items():
    pos_data_x, pos_data_y = missing_data.loc[country]
    # Arrow from the label to the data point.
    plt.annotate(country, xy=(pos_data_x, pos_data_y), xytext=pos_text,
                 arrowprops=dict(facecolor='black', width=0.5, shrink=0.1, headwidth=5))
    # Red square on the held-out data point.
    plt.plot(pos_data_x, pos_data_y, "rs")
# Previous fit on the retained rows only (dotted blue line).
X = np.linspace(0, 110000, 1000)
plt.plot(X, t0 + t1 * X, "b:")
# Fit on all rows, including the held-out ones (black line).
lin_reg_full = linear_model.LinearRegression()
Xfull = np.c_[full_country_stats["GDP per capita"]]
yfull = np.c_[full_country_stats["Life satisfaction"]]
lin_reg_full.fit(Xfull, yfull)
t0full, t1full = lin_reg_full.intercept_[0], lin_reg_full.coef_[0][0]
X = np.linspace(0, 110000, 1000)
plt.plot(X, t0full + t1full * X, "k")
plt.xlabel("GDP per capita (USD)")
# save_fig('representative_training_data_scatterplot')
plt.show()
# Overfitting demonstration.
full_country_stats.plot(kind='scatter', x="GDP per capita", y='Life satisfaction', figsize=(8, 3))
plt.axis([0, 110000, 0, 10])
from sklearn import preprocessing
from sklearn import pipeline
poly = preprocessing.PolynomialFeatures(degree=60, include_bias=False)  # expand the number of features with polynomial terms
scaler = preprocessing.StandardScaler()  # feature standardization: rescales each feature to zero mean and unit standard deviation
# If one feature's variance is much larger than the others', it dominates the
# learning algorithm and keeps the learner from using the other features as
# intended; the model then converges slowly or not at all. Such features are
# therefore standardized / normalized first.
lin_reg2 = linear_model.LinearRegression()  # linear regression
pipeline_reg = pipeline.Pipeline([('poly', poly), ('scal', scaler), ('lin', lin_reg2)])  # pipeline: poly -> scal -> lin
pipeline_reg.fit(Xfull, yfull)
# X is the 1-D grid assigned earlier in the script; [:, np.newaxis] makes it a column.
curve = pipeline_reg.predict(X[:, np.newaxis])
# The string literal below is a no-op note: it shows what [:, np.newaxis]
# does to np.arange(0, 10), followed by the printed result.
'''
print(np.arange(0, 10)[:, np.newaxis])
结果如下:
[[0]
[1]
[2]
[3]
[4]
[5]
[6]
[7]
[8]
[9]]
'''
plt.plot(X, curve)
plt.xlabel("GDP per capita (USD)")
# save_fig('overfitting_model_plot')
plt.show()
# Compare the fits obtained under different conditions on one figure.
plt.figure(figsize=(8, 3))
plt.xlabel("GDP per capita")  # replaced by the xlabel call near the end of this section
plt.ylabel('Life satisfaction')
plt.plot(list(sample_data["GDP per capita"]), list(sample_data["Life satisfaction"]), "bo")  # blue dots: retained rows
plt.plot(list(missing_data["GDP per capita"]), list(missing_data["Life satisfaction"]), "rs")  # red squares: rows held out earlier
X = np.linspace(0, 110000, 1000)
plt.plot(X, t0full + t1full * X, "r--", label="Linear model on all data")  # dashed red line: fit on all rows
plt.plot(X, t0 + t1 * X, "b:", label="Linear model on partial data")  # dotted blue line: fit on the retained rows
ridge = linear_model.Ridge(alpha=10 ** 9.5)  # ridge regression (regularized linear regression)
Xsample = np.c_[sample_data["GDP per capita"]]
ysample = np.c_[sample_data["Life satisfaction"]]
ridge.fit(Xsample, ysample)
t0ridge, t1ridge = ridge.intercept_[0], ridge.coef_[0][0]
plt.plot(X, t0ridge + t1ridge * X, "b", label="Regularized linear model on partial data")  # solid blue line: regularized fit on the retained rows
plt.legend(loc="lower right")
plt.axis([0, 110000, 0, 10])
plt.xlabel("GDP per capita (USD)")
# save_fig('ridge_model_plot')
plt.show()