ML机器学习算法笔记

l_aiya

已于 2022-08-10 15:02:31 修改

阅读量606

点赞数 1

文章标签：机器学习算法 python

于 2022-08-03 20:47:06 首次发布

本文链接：https://blog.csdn.net/l_aiya/article/details/126148628

版权

文章目录

5.2 数据预处理

5.2 数据预处理

5.2.1 缺失值处理

import numpy as np
import pandas as pd

# Example 1: read the table with missing values from Excel into a DataFrame.
data = pd.read_excel('missing.xlsx')

# Example 2: build a DataFrame by hand from a 2-D array that contains NaNs.
c = np.array([
    [1, 2, 3, 4],
    [4, 5, 6, np.nan],
    [5, 6, 7, 8],
    [9, 4, np.nan, 8],
])
C = pd.DataFrame(c)
# The array construction and the DataFrame wrapping can be done in one expression.
pd.DataFrame(np.array([[1, 2], [3, 5]]))

# Example 3: load a saved NumPy array. This rebinds `data`, so the Excel
# frame read above is no longer referenced afterwards.
data = np.load('data.npy')
data = data[:, 1:]  # drop the first column (the serial-number column)

# Mean imputation of missing values.
# SimpleImputer replaces the older sklearn.preprocessing.Imputer.
from sklearn.impute import SimpleImputer

# Fill each NaN with the mean of its column.
# Other available strategies: 'median', 'most_frequent'.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit(data)  # `data` may be either an ndarray or a DataFrame
# Fixed: this line had a stray leading space, which is an IndentationError
# at module level.
data = imp.transform(data)

5.2.2 数据规范化

# 1. Normalisation: mean/variance standardisation.
# Fixed: the import list used a full-width comma, which is a SyntaxError.
from sklearn.preprocessing import StandardScaler, MinMaxScaler

x = data
scaler = StandardScaler()  # use MinMaxScaler() instead for min-max scaling
scaler.fit(x)
x = scaler.transform(x)

5.2.3 主成分分析

# Correlation matrix
import pandas as pd

Data = pd.read_excel('农村居民人均可支配收入来源2016.xlsx')
X = Data.iloc[:, 1:]  # every column except the first
R = X.corr()          # pairwise correlation coefficients between the variables

# Standardise the data before PCA.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X = scaler.fit(X).transform(X)  # fit() returns the scaler itself, so the calls chain

# Step 1: import the principal component analysis class.
from sklearn.decomposition import PCA

# Step 2: create the PCA object; a float n_components asks for enough
# components to reach a cumulative contribution (explained variance) of 95%.
pca = PCA(n_components=0.95)

# Steps 3 and 4: fit on the data, then project it onto the extracted components.
Y = pca.fit(X).transform(X)

# Step 5: read the fitted attributes.
tzxl = pca.components_               # eigenvectors
tz = pca.explained_variance_         # eigenvalues
gxl = pca.explained_variance_ratio_  # share of variance per component (contribution ratio)

print(tzxl)
print(tz)
print(gxl)

# First four values of the first principal component, computed by hand:
# each is the sum of the element-wise product of one sample row of X with
# the first eigenvector.
Y00, Y01, Y02, Y03 = [sum(X[row, :] * tzxl[0, :]) for row in range(4)]

# Composite ranking
# Composite score = sum over every retained component of
# (component score x contribution ratio).
# The number of components is chosen by PCA(n_components=0.95), so weight
# all columns of Y instead of hard-coding exactly three: the hard-coded
# form fails with fewer than three components and ignores any beyond three.
F = Y @ gxl
print(F)

dq = list(Data['地区'].values)  # region names
# Series with the region as index and the composite score as value.
Rs = pd.Series(F, index=dq)
Rs = Rs.sort_values(ascending=False)  # highest composite score first

5.3 线性回归 (回归模型)

ndarray 和 DataFrame 两种数据类型均可作为算法模型的输入。

import pandas as pd

# Load the power-plant data once. The original snippet was pasted twice
# verbatim, re-reading the same file and printing everything again.
data = pd.read_excel('发电场数据.xlsx')
x = data.iloc[:, 0:4]       # first four columns, kept as a DataFrame
y = data.iloc[:, 4].values  # fifth column as an ndarray (.values replaces the removed df.as_matrix())
print(x)
print(type(x))
print(type(y))

# Linear regression
# 1) Import the linear regression class under the short alias LR.
from sklearn.linear_model import LinearRegression as LR

lr = LR()
lr.fit(x, y)

c_b = lr.intercept_     # constant term of the regression
c_x = lr.coef_          # regression coefficients for x (slope vector)
Slr = lr.score(x, y)    # coefficient of determination R^2

# Predict with the fitted linear regression model.
# (1) Use the predict() method of the lr object.
import numpy as np

x1 = np.array([28.4, 50.6, 1011.9, 80.54])  # one sample as a 1-D array
x1 = x1.reshape(1, 4)                       # predict() expects 2-D input: (samples, features)
# lr was fitted on a DataFrame, so give the sample the same column names.
# Passing a bare ndarray makes scikit-learn (>= 1.0) warn that X does not
# have valid feature names.
x1 = pd.DataFrame(x1, columns=x.columns)
R1 = lr.predict(x1)

# Report the results.
print('x 回归系数为：', c_x)
print('回归系数常数项为：', c_b)
print('判定系数为：', Slr)
print('样本预测值为：', R1)

5.4 逻辑回归（分类模型）

线性回归对应回归问题，逻辑回归对应分类问题

回归模型的 score 输出为拟合优度（判定系数 R²）

分类模型的score输出为准确率

import pandas as pd

data = pd.read_excel('credit.xlsx')

# Manual train/test split by row position.
# Columns 0-13 are used as inputs and column 14 as the label.
x, y = data.iloc[:600, :14], data.iloc[:600, 14]    # first 600 rows: training
x1, y1 = data.iloc[600:, :14], data.iloc[600:, 14]  # remaining rows: testing

from sklearn.linear_model import LogisticRegression as LR

lr = LR(max_iter=3000)
lr.fit(x, y)
r = lr.score(x, y)  # accuracy on the training data

# Accuracy on the test rows, computed two ways.
R = lr.predict(x1)  # predicted labels
Z = R - y1          # zero wherever the prediction equals the true label
Rs = len(Z[Z == 0]) / len(Z)  # manual accuracy: share of zero differences
RR = lr.score(x1, y1)         # the same accuracy straight from score()

print('预测结果为：', R)
print('预测准确率为：', Rs)

5.5 神经网络

5.5.3 Python 神经网络分类应用

import pandas as pd

data = pd.read_excel('credit.xlsx')

# Positional split: inputs are columns 0-13, the label is column 14.
x, y = data.iloc[:600, :14], data.iloc[:600, 14]    # first 600 rows train the model
x1, y1 = data.iloc[600:, :14], data.iloc[600:, 14]  # rows from 600 onward are for testing

from sklearn.neural_network import MLPClassifier  # multi-layer perceptron classifier

# (2) Create the neural-network classifier object clf.
# solver selects the optimisation algorithm: 'lbfgs', 'sgd' or 'adam'
# (the default is 'adam').
clf = MLPClassifier(
    solver='lbfgs',
    alpha=1e-5,
    hidden_layer_sizes=(5, 2),
    random_state=1,
    max_iter=3000,
)
clf.fit(x, y)

# (4) score() gives the prediction accuracy, here on the training data.
rv = clf.score(x, y)

R = clf.predict(x1)
Z = R - y1                    # zero wherever the prediction equals the true label
Rs = len(Z[Z == 0]) / len(Z)  # manual test accuracy
RR = clf.score(x1, y1)        # test accuracy straight from score()

print('预测结果为：', R)
print('预测准确率为：', Rs)

5.5.4 Python 神经网络回归应用

import pandas as pd

data = pd.read_excel('发电场数据.xlsx')
x, y = data.iloc[:, 0:4], data.iloc[:, 4]  # first four columns as inputs, fifth as target

from sklearn.neural_network import MLPRegressor  # multi-layer perceptron regressor

# solver selects the optimisation algorithm: 'lbfgs', 'sgd' or 'adam'
# (the default is 'adam').
clf = MLPRegressor(
    solver='lbfgs',
    alpha=1e-5,
    hidden_layer_sizes=8,
    random_state=1,
)
clf.fit(x, y)

# (4) score() gives the goodness of fit (coefficient of determination).
rv = clf.score(x, y)

import numpy as np

x1 = np.array([28.4, 50.6, 1011.9, 80.54])  # one sample as a 1-D array
x1 = x1.reshape(1, 4)                       # predict() expects 2-D input: (samples, features)
# clf was fitted on a DataFrame, so give the sample the same column names.
# Passing a bare ndarray makes scikit-learn (>= 1.0) warn that X does not
# have valid feature names.
x1 = pd.DataFrame(x1, columns=x.columns)
R = clf.predict(x1)
print('样本预测值为：', R)

5.6 支持向量机

核函数选择：

可以选择线性核linear、多项式核poly、高斯核rbf、sig核sigmoid，默认情况下选择高斯核。

import pandas as pd

data = pd.read_excel('car.xlsx')

# Positional train/test split: inputs are columns 0-5, the label is column 6.
x = data.iloc[:1690, :6]
y = data.iloc[:1690, 6]
# Fixed off-by-one: the test slice started at 1691, so row 1690 ended up in
# neither the training set nor the test set.
x1 = data.iloc[1690:, :6]
y1 = data.iloc[1690:, 6]

from sklearn import svm

# Kernel choices: 'linear', 'poly', 'rbf' (Gaussian) and 'sigmoid'.
# 'rbf' is the default.
clf = svm.SVC(kernel='rbf')
clf.fit(x, y)
rv = clf.score(x, y)  # accuracy on the training data

R = clf.predict(x1)
# NOTE(review): the subtraction presumes numerically encoded labels - confirm against car.xlsx.
Z = R - y1
Rs = len(Z[Z == 0]) / len(Z)  # share of test rows where prediction equals label

print('预测结果为：', R)
print('预测准确率为：', Rs)

5.7 K-均值聚类

Python K-均值聚类算法应用

# 1. Load the data and standardise it.
import pandas as pd
from sklearn.preprocessing import StandardScaler

data = pd.read_excel('农村居民人均可支配收入来源2016.xlsx')
X = data.iloc[:, 1:]  # every column except the first

scaler = StandardScaler()
X = scaler.fit(X).transform(X)  # fit() returns the scaler itself, so the calls chain

# 2. K-means clustering
from sklearn.cluster import KMeans

model = KMeans(n_clusters=4, random_state=0, max_iter=500)
model.fit(X)

# (4) labels_ holds the cluster label of each sample. The label numbers
# carry no meaning of their own; they only tell the clusters apart.
c = model.labels_

# Index the labels by region and sort so members of a cluster sit together.
Fs = pd.Series(c, index=data['地区']).sort_values(ascending=True)
Fs