Coding along with "Hands-On Machine Learning with Scikit-Learn and TensorFlow" — Chapter 1

最新推荐文章于 2023-09-27 09:29:17 发布

George_Dong

最新推荐文章于 2023-09-27 09:29:17 发布

阅读量267

点赞数

分类专栏： MachineLearning

本文链接：https://blog.csdn.net/George_Dong/article/details/85005923

版权

MachineLearning 专栏收录该内容

6 篇文章 0 订阅

订阅专栏

预定义画图程序

#To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

#Common imports
import numpy as np
import os

#to make this notebook's output stable across runs
np.random.seed(42)

#To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

#Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "fundamentals"

def save_fig(fig_id, tight_layout=True):
    """Save the current matplotlib figure as a 300-dpi PNG.

    The file is written to
    ``<PROJECT_ROOT_DIR>/images/<CHAPTER_ID>/<fig_id>.png``. The target
    directory is created first if it does not exist yet.

    Args:
        fig_id: Base name of the image file, without the ".png" extension.
        tight_layout: If true, call ``plt.tight_layout()`` before saving.
    """
    image_dir = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
    # plt.savefig does not create missing directories. An explicit isdir
    # check (instead of os.makedirs(..., exist_ok=True)) keeps this usable
    # on Python 2, which this file still supports.
    if not os.path.isdir(image_dir):
        os.makedirs(image_dir)
    path = os.path.join(image_dir, fig_id + ".png")
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)

# Silence a useless warning raised from scipy modules (see SciPy issue #5998)
import warnings
warnings.filterwarnings("ignore", message="^internal gelsd", module="scipy")

整合数据

经合组织统计的GDP和国家幸福度调查数据

def prepare_country_stats(oecd_bli, gdp_per_capita):
    """Join life-satisfaction and GDP-per-capita data into one table.

    Neither input DataFrame is modified, so the function can be called
    repeatedly on the same frames.

    Args:
        oecd_bli: DataFrame with at least the columns "Country",
            "INEQUALITY", "Indicator" and "Values". Only rows whose
            "INEQUALITY" is "TOT" are used.
        gdp_per_capita: DataFrame with at least the columns "Country" and
            "2015"; the "2015" column is exposed as "GDP per capita".

    Returns:
        A DataFrame indexed by country with the columns "GDP per capita"
        and "Life satisfaction", sorted by ascending GDP per capita, with
        the rows at a fixed set of positions left out.

    Raises:
        IndexError: If the merged table has too few rows for the fixed
            positional selection (positions up to 32 are requested).
    """
    oecd_bli = oecd_bli[oecd_bli["INEQUALITY"] == "TOT"]
    # One row per country, one column per indicator.
    oecd_bli = oecd_bli.pivot(index="Country", columns="Indicator", values="Values")
    # Work on renamed/re-indexed copies instead of using inplace=True, so the
    # caller's DataFrame keeps its "Country" and "2015" columns.
    gdp_per_capita = gdp_per_capita.rename(columns={"2015": "GDP per capita"})
    gdp_per_capita = gdp_per_capita.set_index("Country")
    full_country_stats = pd.merge(left=oecd_bli, right=gdp_per_capita,
                                  left_index=True, right_index=True)
    full_country_stats = full_country_stats.sort_values(by="GDP per capita")
    # Positions (in GDP order) to leave out of the returned table.
    # NOTE(review): presumably countries deliberately held out — confirm.
    remove_indices = {0, 1, 6, 8, 33, 34, 35}
    # Build the kept positions in explicit ascending order rather than
    # relying on the iteration order of a set difference.
    keep_indices = [i for i in range(36) if i not in remove_indices]
    return full_country_stats[["GDP per capita", "Life satisfaction"]].iloc[keep_indices]

实现数据读取，并使用LinearRegression训练

# Directory holding the life-satisfaction CSV files; the trailing "" makes the
# path end with a separator so file names can be appended directly.
import os
datapath = os.path.join("datasets", "lifesat", "")

# Code example
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.linear_model

# Load the two raw tables
oecd_bli = pd.read_csv(datapath + "oecd_bli_2015.csv", thousands=',')
gdp_per_capita = pd.read_csv(
    datapath + "gdp_per_capita.csv",
    thousands=',',
    delimiter='\t',
    encoding='latin1',
    na_values='n/a',
)

# Merge them and pull out the feature and target as column vectors
country_stats = prepare_country_stats(oecd_bli, gdp_per_capita)
X = np.c_[country_stats["GDP per capita"]]
y = np.c_[country_stats["Life satisfaction"]]

# Scatter plot of life satisfaction against GDP per capita
country_stats.plot(kind='scatter', x="GDP per capita", y='Life satisfaction')
plt.show()

# Select a linear model and train it (fit returns the estimator itself)
model = sklearn.linear_model.LinearRegression().fit(X, y)

# Predict life satisfaction for Cyprus from its GDP per capita
X_new = [[22587]]
print(model.predict(X_new))  # outputs [[5.96242338]]