Python机器学习实践指南读书笔记（第一章）

最新推荐文章于 2023-04-06 14:44:48 发布

danleeli

最新推荐文章于 2023-04-06 14:44:48 发布

阅读量859

点赞数

文章标签：机器学习 python 数据挖掘

本文链接：https://blog.csdn.net/danleeli/article/details/122287859

版权

第 1 章 Python 机器学习的生态系统

获取数据
- 通过REST的API接口
检查数据
数据建模

获取数据

获取数据格式不重要，了解数据本身很重要。

通过REST的API接口

从链接中拉出数据。
我用书本的代码直接报错，超过了最大重试次数，未能建立新连接。多运行几次好了。
在这里插入图片描述

检查数据

检查数据的有效性，并将其转化为适合模型使用的形式。
在这里插入图片描述

Pandas

Pandas 是 Python 的数据分析库，以 Series 和 DataFrame 呈现数据。
从 pandas 1.0.0 版本开始，移除了 Series.ix 和 DataFrame.ix 方法，
可以使用 DataFrame 的 loc 方法或者 iloc 方法进行替换。
在这里插入图片描述

Python的Matplotlib库
import matplotlib.pyplot as plt
plt.style.use('ggplot')
# %matplotlib inline  (notebook magic, left disabled for script use)
import numpy as np
import os
import pandas as pd
import requests

# Local working directory for the downloaded copy of the data set.
PATH = r'C:\Users\think\Desktop\其他'
IRIS_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
# os.path.join supplies the path separator that `PATH + 'iris.data'` left
# out, so the file is stored inside PATH instead of next to it.
IRIS_FILE = os.path.join(PATH, 'iris.data')

# Fetch the raw CSV text. The timeout stops a stalled connection from
# hanging forever, and raise_for_status() keeps an HTTP error page from
# being saved and later parsed as data.
r = requests.get(IRIS_URL, timeout=30)
r.raise_for_status()
# print(r.text)

with open(IRIS_FILE, 'w', encoding='utf-8') as f:
    f.write(r.text)
os.chdir(PATH)
# Column labels are supplied via names= because they are not read from the file.
df = pd.read_csv(
    IRIS_FILE,
    names=['sepal length', 'sepal width', 'petal length', 'petal width', 'class'],
)
# print(df.head())

# Histogram of petal width on a single 6 x 4 inch figure.
fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(df['petal width'], color='black')
# Axis labels first, then the title slightly above the axes (y=1.01).
ax.set_xlabel('Width', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
ax.set_title('Iris Petal Width', fontsize=14, y=1.01)
plt.show()

# 2 x 2 grid of histograms, one per measurement column.
fig, ax = plt.subplots(2, 2, figsize=(6, 4))

# (axes, column, x-axis label, title) for each panel. Driving the panels
# from one table replaces four copy-pasted stanzas and fixes the
# 'Lenth' misspelling that appeared in the petal-length panel.
panels = [
    (ax[0][0], 'petal width', 'Width', 'Iris Petal Width'),
    (ax[0][1], 'petal length', 'Length', 'Iris Petal Length'),
    (ax[1][0], 'sepal width', 'Width', 'Iris Sepal Width'),
    (ax[1][1], 'sepal length', 'Length', 'Iris Sepal Length'),
]
for panel, column, x_label, title in panels:
    panel.hist(df[column], color='black')
    panel.set_ylabel('Count', fontsize=12)
    panel.set_xlabel(x_label, fontsize=12)
    panel.set_title(title, fontsize=14, y=1.01)

# Automatically adjust subplot spacing so labels and titles do not overlap.
plt.tight_layout()
plt.show()

# Scatter plot of petal width (x) against petal length (y).
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(df['petal width'], df['petal length'], color='green')
ax.set(xlabel='Petal Width', ylabel='Petal Length', title='Petal Scatterplot')
# Author's note: petal length is probably a useful feature for telling
# the classes apart.
plt.show()
# Stacked bar chart of the mean of every measurement column, per class.
fig, ax = plt.subplots(figsize=(6, 6))
bar_width = .8
labels = [x for x in df.columns if 'length' in x or 'width' in x]

# One groupby gives the per-class mean of each selected column.
class_means = df.groupby('class')[labels].mean()
ver_y = class_means.loc['Iris-versicolor'].to_numpy()
vir_y = class_means.loc['Iris-virginica'].to_numpy()
set_y = class_means.loc['Iris-setosa'].to_numpy()

x = np.arange(len(labels))
# `bottom` is the height at which a bar starts. Each layer must start at
# the summed height of all layers beneath it; the previous code started
# the virginica layer at set_y alone, so it overlapped the setosa layer.
ax.bar(x, vir_y, bar_width, bottom=ver_y + set_y, color='darkgrey',
       label='Virginica')
ax.bar(x, set_y, bar_width, bottom=ver_y, color='white', label='Setosa')
ax.bar(x, ver_y, bar_width, color='black', label='Versicolor')

# Bars are centred on x by default, so the ticks belong at x itself; the
# old `x + bar_width / 2` offset only suited edge-aligned bars.
ax.set_xticks(x)
ax.set_xticklabels(labels, rotation=-70, fontsize=12)
ax.set_title('Mean Feature Measurement By Class', y=1.01)
ax.legend()
# Without show() the figure is never displayed when run as a script.
plt.show()

在这里插入图片描述

Seaborn

专门为统计可视化而创建的库。
在这里插入图片描述

import matplotlib.pyplot as plt
plt.style.use('ggplot')
# %matplotlib inline  (notebook magic, left disabled for script use)
import numpy as np
import os
import pandas as pd
import requests

# Local working directory for the downloaded copy of the data set.
PATH = r'C:\Users\think\Desktop\其他'
IRIS_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
# os.path.join supplies the path separator that `PATH + 'iris.data'` left
# out, so the file is stored inside PATH instead of next to it.
IRIS_FILE = os.path.join(PATH, 'iris.data')

# Fetch the raw CSV text. The timeout stops a stalled connection from
# hanging forever, and raise_for_status() keeps an HTTP error page from
# being saved and later parsed as data.
r = requests.get(IRIS_URL, timeout=30)
r.raise_for_status()
# print(r.text)

with open(IRIS_FILE, 'w', encoding='utf-8') as f:
    f.write(r.text)
os.chdir(PATH)
# Column labels are supplied via names= because they are not read from the file.
df = pd.read_csv(
    IRIS_FILE,
    names=['sepal length', 'sepal width', 'petal length', 'petal width', 'class'],
)
# print(df.head())

import seaborn as sns

# Alternative overview, left disabled: a pair plot coloured by class.
# sns.pairplot(df, hue="class")
# plt.show()

# 2 x 2 grid of violin plots: one measurement column per panel, split by class.
fig, ax = plt.subplots(2, 2, figsize=(7, 7))
sns.set(style='white', palette='muted')

features = ['sepal length', 'sepal width', 'petal length', 'petal width']
# ax.flat walks the grid row by row, matching the order of `features`.
for panel, feature in zip(ax.flat, features):
    sns.violinplot(x=df['class'], y=df[feature], ax=panel)
    # setp() applies a property to every tick label of this panel at once.
    plt.setp(panel.get_xticklabels(), rotation=-90)

# One overall title for the figure rather than a title per panel.
fig.suptitle('Violin Plots', fontsize=16, y=1.03)
# The layout only needs to be computed once, after every panel is final;
# it was previously recomputed inside the loop on each iteration.
fig.tight_layout()
plt.show()

在这里插入图片描述
过滤、聚集，输入，转化。

处理和操作数据

pandas 的 Series.map()、
序列数据
在这里插入图片描述
Series.apply()、
DataFrame.apply()、
DataFrame.applymap()和 DataFrame.groupby()方法

import matplotlib.pyplot as plt
plt.style.use('ggplot')
# %matplotlib inline  (notebook magic, left disabled for script use)
import numpy as np
import os
import pandas as pd
import requests

# Local working directory for the downloaded copy of the data set.
PATH = r'C:\Users\think\Desktop\其他'
IRIS_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
# os.path.join supplies the path separator that `PATH + 'iris.data'` left
# out, so the file is stored inside PATH instead of next to it.
IRIS_FILE = os.path.join(PATH, 'iris.data')

# Fetch the raw CSV text. The timeout stops a stalled connection from
# hanging forever, and raise_for_status() keeps an HTTP error page from
# being saved and later parsed as data.
r = requests.get(IRIS_URL, timeout=30)
r.raise_for_status()
# print(r.text)

with open(IRIS_FILE, 'w', encoding='utf-8') as f:
    f.write(r.text)
os.chdir(PATH)
# Column labels are supplied via names= because they are not read from the file.
df = pd.read_csv(
    IRIS_FILE,
    names=['sepal length', 'sepal width', 'petal length', 'petal width', 'class'],
)
# print(df.head())
# Series.map with a dict: replace each full class name with a short code.
df['class'] = df['class'].map({
    'Iris-setosa': 'SET',
    'Iris-virginica': 'VIR',
    'Iris-versicolor': 'VER',
})
print(df)

# Series.apply: derive a 0/1 flag from a numeric column. Author's note:
# this kind of conversion is a very common feature-engineering step.
df['wide petal'] = df['petal width'].apply(lambda v: 1 if v >= 1.3 else 0)
print(df)

# DataFrame.apply with axis=1 calls the function once per row, so a new
# column can combine several existing ones.
df['petal area'] = df.apply(
    lambda row: row['petal length'] * row['petal width'], axis=1)
print(df)

# Element-wise transform of every cell. The isinstance check leaves the
# string 'class' column and the integer 'wide petal' column untouched and
# takes the log of float cells only.
# Newer pandas exposes this as DataFrame.map and deprecates applymap, so
# use whichever this pandas version provides.
_elementwise = df.map if hasattr(df, 'map') else df.applymap
log_df = _elementwise(lambda v: np.log(v) if isinstance(v, float) else v)
# The transform returns a new frame; previously its result was discarded
# and the unchanged df was printed instead. df itself is left unmodified.
print(log_df)

# Group the rows by a chosen category and summarise each group. The
# results are printed because a bare expression shows nothing when this
# runs as a script rather than in a notebook.
print(df.groupby('class').mean())

print(df.groupby('class').describe())

# For each distinct petal width, list the classes in which it occurs.
print(df.groupby('petal width')['class'].unique().to_frame())

# Custom aggregation functions. Keyword ("named") aggregation replaces the
# dict-of-renamers form that current pandas rejects, and the parentheses
# replace a line-continuation backslash that was followed by a space,
# which is a syntax error.
print(
    df.groupby('class')['petal width'].agg(
        delta=lambda x: x.max() - x.min(),
        max='max',
        min='min',
    )
)

数据建模

选择合适算法，训练模型。

Statsmodels

是用于探索数据、估计模型，并运行统计检验的 Python 包
下面是一个简单的线性回归模型，为 setosa 类中花萼长度和花萼宽度之间的关系进行建模。二者似乎有一个正向的线性关系。使用 statsmodels，在这个数据集上运行一个线性回归模型，来预估这种关系的强度。
在这里插入图片描述

简单回归模型的结果
由于这是一个线性回归，该模型的形式为 $Y = \beta_0 + \beta_1 X$，其中 $\beta_0$ 为截距，$\beta_1$ 为回归系数。在这里，最终公式是 $\text{Sepal Length} = 2.6447 + 0.6909 \times \text{Sepal Width}$。我们也可以看到，该模型的 $R^2$ 值是一个可以接受的 0.558，而 p 值（Prob）是非常显著的。

import matplotlib.pyplot as plt
plt.style.use('ggplot')
# %matplotlib inline  (notebook magic, left disabled for script use)
import numpy as np
import os
import pandas as pd
import requests

# Local working directory for the downloaded copy of the data set.
PATH = r'C:\Users\think\Desktop\其他'
IRIS_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
# os.path.join supplies the path separator that `PATH + 'iris.data'` left
# out, so the file is stored inside PATH instead of next to it.
IRIS_FILE = os.path.join(PATH, 'iris.data')

# Fetch the raw CSV text. The timeout stops a stalled connection from
# hanging forever, and raise_for_status() keeps an HTTP error page from
# being saved and later parsed as data.
r = requests.get(IRIS_URL, timeout=30)
r.raise_for_status()
# print(r.text)

with open(IRIS_FILE, 'w', encoding='utf-8') as f:
    f.write(r.text)
os.chdir(PATH)
# Column labels are supplied via names= because they are not read from the file.
df = pd.read_csv(
    IRIS_FILE,
    names=['sepal length', 'sepal width', 'petal length', 'petal width', 'class'],
)
# print(df.head())
# Scatter of sepal width (x) against sepal length (y) for the first 50
# rows -- presumably the setosa samples, going by the title; verify
# against the data file.
fig, ax = plt.subplots(figsize=(7, 7))
ax.scatter(df['sepal width'].iloc[:50], df['sepal length'].iloc[:50])
ax.set(xlabel='Sepal Width', ylabel='Sepal Length')
ax.set_title('Setosa Sepal Width vs. Sepal Length', fontsize=14, y=1.02)
plt.show()
import statsmodels.api as sm

# Ordinary least squares of sepal length on sepal width, using the first
# 50 rows (presumably the setosa samples -- verify against the data file).
x = df['sepal width'].iloc[:50]
y = df['sepal length'].iloc[:50]
# add_constant adds the intercept column to the regressor.
X = sm.add_constant(x)
results = sm.OLS(y, X).fit()
print(results.summary())

# Overlay the fitted values (the regression line) on the raw points.
fig, ax = plt.subplots(figsize=(7, 7))
ax.plot(x, results.fittedvalues, label='regression line')
ax.scatter(x, y, label='data point', color='r')
ax.set(xlabel='Sepal Width', ylabel='Sepal Length')
ax.set_title('Setosa Sepal Width vs. Sepal Length', fontsize=14, y=1.02)
ax.legend(loc='upper left')  # same position as numeric code 2
plt.show()

在这里插入图片描述

Python 机器学习包中的王者：

scikit-learn

它建立在 Python 科学栈的核心模块之上，也就是 NumPy、
SciPy、pandas 和 matplotlib。scikit-learn 覆盖的一些领域包括：分类、回归、聚类、降维、
模型选择和预处理。

import matplotlib.pyplot as plt
plt.style.use('ggplot')
# %matplotlib inline  (notebook magic, left disabled for script use)
import numpy as np
import os
import pandas as pd
import requests

# Local working directory for the downloaded copy of the data set.
PATH = r'C:\Users\think\Desktop\其他'
IRIS_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
# os.path.join supplies the path separator that `PATH + 'iris.data'` left
# out, so the file is stored inside PATH instead of next to it.
IRIS_FILE = os.path.join(PATH, 'iris.data')

# Fetch the raw CSV text. The timeout stops a stalled connection from
# hanging forever, and raise_for_status() keeps an HTTP error page from
# being saved and later parsed as data.
r = requests.get(IRIS_URL, timeout=30)
r.raise_for_status()
# print(r.text)

with open(IRIS_FILE, 'w', encoding='utf-8') as f:
    f.write(r.text)
os.chdir(PATH)
# Column labels are supplied via names= because they are not read from the file.
df = pd.read_csv(
    IRIS_FILE,
    names=['sepal length', 'sepal width', 'petal length', 'petal width', 'class'],
)
# print(df.head())
from sklearn.ensemble import RandomForestClassifier
# train_test_split is imported from model_selection; the older
# sklearn.cross_validation import is kept only as a disabled reference.
# from sklearn.cross_validation import train_test_split
from sklearn.model_selection import train_test_split

# Forest of 10 trees, each limited to a depth of 5.
clf = RandomForestClassifier(n_estimators=10, max_depth=5)

# First four columns are the features, the fifth column is the label.
X = df.iloc[:, :4]
y = df.iloc[:, 4]

# Hold out 30% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Put predictions beside the true labels and flag each match with 1.
rf = pd.DataFrame({'predicted': y_pred, 'actual': y_test.to_numpy()})
rf['correct'] = (rf['predicted'] == rf['actual']).astype('int64')
print(rf)

在这里插入图片描述
将数据分割为训练、测试和验证的集合。
通过对比实际值与预估值，来查看预测模型的表现。

import matplotlib.pyplot as plt
plt.style.use('ggplot')
# %matplotlib inline  (notebook magic, left disabled for script use)
import numpy as np
import os
import pandas as pd
import requests

# Local working directory for the downloaded copy of the data set.
PATH = r'C:\Users\think\Desktop\其他'
IRIS_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
# os.path.join supplies the path separator that `PATH + 'iris.data'` left
# out, so the file is stored inside PATH instead of next to it.
IRIS_FILE = os.path.join(PATH, 'iris.data')

# Fetch the raw CSV text. The timeout stops a stalled connection from
# hanging forever, and raise_for_status() keeps an HTTP error page from
# being saved and later parsed as data.
r = requests.get(IRIS_URL, timeout=30)
r.raise_for_status()
# print(r.text)

with open(IRIS_FILE, 'w', encoding='utf-8') as f:
    f.write(r.text)
os.chdir(PATH)
# Column labels are supplied via names= because they are not read from the file.
df = pd.read_csv(
    IRIS_FILE,
    names=['sepal length', 'sepal width', 'petal length', 'petal width', 'class'],
)
# print(df.head())



from sklearn.ensemble import RandomForestClassifier
# from sklearn.cross_validation import train_test_split  (old location)
from sklearn.model_selection import train_test_split

# Author's notes: this builds, trains and tests a classifier, which the
# author reports reaching about 95% accuracy on the Iris data. The first
# import is the random forest classifier; the second is the helper that
# splits the data into training and test groups. Splitting the data is a
# key step when building a machine-learning application, and
# train_test_split also shuffles the row order.

# A forest of 10 decision trees, each allowed at most 5 levels of depth.
clf = RandomForestClassifier(max_depth=5, n_estimators=10)

# X holds the first four columns (the sepal/petal length and width
# measurements); y holds the fifth column (the class label).
X = df.iloc[:, :4]
y = df.iloc[:, 4]

# Shuffle and split into four subsets: 30% of the rows go to
# X_test / y_test and the rest to X_train / y_train.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Put predictions beside the true labels and flag each match with 1.
rf = pd.DataFrame(list(zip(y_pred, y_test)), columns=['predicted', 'actual'])
rf['correct'] = (rf['predicted'] == rf['actual']).astype('int64')
print(rf)

# Fraction of correct predictions. This was previously computed and then
# discarded, so nothing was shown when run as a script.
accuracy = rf['correct'].sum() / rf['correct'].count()
print(accuracy)

# These were fused into one statement
# (`clf.feature_importances_f_names = ...`), which left f_names undefined
# and bound f_importances to the column names instead of the scores.
f_importances = clf.feature_importances_
f_names = df.columns[:4]

# Author's notes: feature_importances_ reports each feature's relative
# ability to split the tree nodes; a feature that separates the groups
# cleanly and consistently scores high, and the scores always sum to 1.
# The standard deviation below is taken across the per-tree importances
# of every tree in the forest.
f_std = np.std([tree.feature_importances_ for tree in clf.estimators_],
               axis=0)

# Sort (importance, name, std) triples by importance, largest first.
zz = zip(f_importances, f_names, f_std)
zzs = sorted(zz, key=lambda x: x[0], reverse=True)

imps = [x[0] for x in zzs]
labels = [x[1] for x in zzs]
errs = [x[2] for x in zzs]

# Bar per feature with the across-tree standard deviation as error bar.
positions = range(len(f_importances))
plt.bar(positions, imps, color="r", yerr=errs, align="center")
plt.xticks(positions, labels)
plt.show()
# Author's observation: petal length and width discriminate between the
# iris classes better than the sepal measurements.

# Switch the classifier to a support vector machine (SVM).
# Author's notes: the shape of the code is unchanged -- only the imports
# and the line that instantiates the classifier differ. The labels y get
# a small format change because, per the author, the SVM cannot interpret
# them the way the random forest classifier does.
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
# train_test_split is imported from sklearn.model_selection; the
# sklearn.cross_validation module used here before is no longer available.
from sklearn.model_selection import train_test_split

clf = OneVsRestClassifier(SVC(kernel='linear'))

# DataFrame.ix has been removed from pandas; iloc is the positional
# replacement. First four columns are features, the fifth is the label.
X = df.iloc[:, :4]
y = np.array(df.iloc[:, 4]).astype(str)

# Hold out 30% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Put predictions beside the true labels and flag each match with 1.
rf = pd.DataFrame(list(zip(y_pred, y_test)), columns=['predicted', 'actual'])
rf['correct'] = (rf['predicted'] == rf['actual']).astype('int64')
print(rf)

# Fraction of correct predictions. This was previously computed and then
# discarded, so nothing was shown when run as a script.
accuracy = rf['correct'].sum() / rf['correct'].count()
print(accuracy)

模型满意

部署

部署小到在本地机器上运行 cron 作业，大到在 Amazon EC2 实例上部署全面的实现。

danleeli

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
Python机器学习实践指南读书笔记（第一章）

第 1 章 Python 机器学习的生态系统获取数据通过REST的API接口功能快捷键合理的创建标题，有助于目录的生成如何改变文本的样式插入链接与图片如何插入一段漂亮的代码片生成一个适合你的列表创建一个表格设定内容居中、居左、居右SmartyPants创建一个自定义列表如何创建一个注脚注释也是必不可少的KaTeX数学公式新的甘特图功能，丰富你的文章UML 图表FLowchart流程图导出与导入导出导入获取数据获取数据格式不重要，了解数据本身很重要。通过REST的API接口从链接中拉出数据。我用书本
复制链接

扫一扫