python机器学习之回归预测：电池续航里程问题

本文链接：https://blog.csdn.net/weixin_45776027/article/details/107389452

最近在做电池续航测试，不同型号参数的动力电池可以跑40-70多公里，开个小电动出去测里程，累的要死，正好要到数据集，弄个模型预测一波，只用输入电池参数，就可以预测里程，舒服~ （实际样本太少，不足100个，预测效果与实际测量在上下3公里左右波动）
电池数据集获取及相关经验指导请加我球球877562830

华丽的分割线---------------------------------------------------------------------------
用了三种方法：

支持向量机回归
多元回归函数
随机森林回归

支持向量机回归预测代码如下：

# -*- coding:utf-8 -*-
# 电池里程性能预测（SVR线性回归）
# 是最强的冰哥
from pandas import DataFrame
import pandas as pd
from matplotlib import pyplot as plt
import matplotlib
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from sklearn import datasets
from sklearn.model_selection import train_test_split#分割训练集和测试集前自动打散数据
from sklearn import svm
from sklearn.preprocessing import Normalizer#归一化库函数
import numpy as np
from sklearn.svm import SVR
import joblib
import matplotlib.pyplot as plt# 引入MATLAB相似绘图库，make_classification生成三元分类模型数据
from sklearn.linear_model import LinearRegression#用于训练多元线性回归模型

#第一步 读取数据
dianchi = pd.read_excel("F:\数据处理\数据\电池测量里程数据\电池容量与续航里程.xls",encoding='utf-8',header =0)#完整数据

x = dianchi.iloc[:,1:5] #数据切片
y = dianchi.iloc[:,-1]
print("电池参数=\n",x)
print("里程=\n",y)

#第二步 留出法划分训练集和测试集
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=0,train_size = 0.6)
print("训练集数据=\n",x_train)
print("测试集数据=\n",x_test)


noise = 0
y_noise = noise +y_train #定义一个常数
print("常数项=\n",y_noise)

#第三步 标准化处理
StandardScaler().fit_transform(x_train)
StandardScaler().fit_transform(x_test)#将数据按其属性(按列进行)减去其均值，然后除以其方差,最后得到的结果是，对每个属性/每列来说所有数据都聚集在0附近，方差值为1。

#第四步 svr线性回归算法模型建立
svr_linear = SVR(kernel = "linear",C=1,gamma=100)#这里用的线性核函数，用高斯核函数有问题,C为误差项的惩罚系数，C越大泛化能力越弱


svr_linear.fit(x_train,y_noise)#训练集训练模型并测试

y_linear = svr_linear.predict(x_test)
print("测试结果=\n",y_linear)
print("训练集精度\n=",svr_linear.score(x_train,y_train))
print("测试集精度\n=",svr_linear.score(x_test,y_test))

#C 误差项的惩罚参数，一般取值为10的n次幂，如10的-5次幂，10的-4次幂。。。。10的0次幂，10，1000,1000，在python中可以使用pow（10，n） n=-5~inf
#C越大，相当于希望松弛变量接近0，即对误分类的惩罚增大，趋向于对训练集全分对的情况，这样会出现训练集测试时准确率很高，但泛化能力弱。
#C值小，对误分类的惩罚减小，容错能力增强，泛化能力较强。

#方法2：在训练集上训练多元线性回归模型
#regressor = LinearRegression()
#regressor.fit(x_train, y_train)

#y_pred = regressor.predict(x_test)
#print("测试结果=\n",y_pred)

joblib.dump(svr_linear, "电池里程预测_model_SVR.m")
#第7步 调用训练模型
clf = joblib.load("电池里程预测_model_SVR.m")

dianchi_new = pd.read_excel("F:数据处理\数据\电池测量里程数据\电池容量与续航里程_new.xls",encoding='utf-8',header =0)
test_x = dianchi_new.iloc[:,1:5]#取数据集的前4列为数据值
print("新数据\n=",test_x)
test_y = clf.predict(test_x)
print("预测的电池里程(公里）=\n",test_y)

调用模型，输入电池参数
看一下效果：
在这里插入图片描述

第二种方法：多元线性回归预测：

# -*- coding:utf-8 -*-
# 电池里程性能预测（多变量线性回归）
# 是最强的冰哥
from pandas import DataFrame
import pandas as pd
from matplotlib import pyplot as plt
import matplotlib
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from sklearn import datasets
from sklearn.model_selection import train_test_split#分割训练集和测试集前自动打散数据
from sklearn import svm
from sklearn.preprocessing import Normalizer#归一化库函数
import numpy as np
from sklearn.svm import SVR
import joblib
import matplotlib.pyplot as plt# 引入MATLAB相似绘图库，make_classification生成三元分类模型数据
from sklearn.linear_model import LinearRegression#用于训练多元线性回归模型

#第一步 读取数据
dianchi = pd.read_excel("F:\数据处理\数据\电池测量里程数据\电池容量与续航里程.xls",encoding='utf-8',header =0)#完整数据

x = dianchi.iloc[:,1:5] #数据切片
y = dianchi.iloc[:,-1]
print("电池参数=\n",x)
print("里程=\n",y)

#第二步 留出法划分训练集和测试集
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=0,train_size = 0.3)
print("训练集数据=\n",x_train)
print("测试集数据=\n",x_test)


#第三步 标准化处理
StandardScaler().fit_transform(x_train)
StandardScaler().fit_transform(x_test)#将数据按其属性(按列进行)减去其均值，然后除以其方差,最后得到的结果是，对每个属性/每列来说所有数据都聚集在0附近，方差值为1。

#第四步 在训练集上训练多元线性回归模型
regressor = LinearRegression()
regressor.fit(x_train, y_train)

y_pred = regressor.predict(x_test)
print("测试结果=\n",y_pred)

print("训练集精度\n=",regressor.score(x_train,y_train))
print("测试集精度\n=",regressor.score(x_test,y_test))




joblib.dump(regressor, "电池里程预测_model_多元线性回归.m")
#第5步 调用训练模型
clf = joblib.load("电池里程预测_model_多元线性回归.m")

dianchi_new = pd.read_excel("F:\数据处理\数据\电池测量里程数据\电池容量与续航里程_new.xls",encoding='utf-8',header =0)
test_x = dianchi_new.iloc[:,1:5]#取数据集的前4列为数据值
print("新数据\n=",test_x)
test_y = clf.predict(test_x)
print("预测的电池里程(公里）=\n",test_y)

在这里插入图片描述

第3种方法：随机森林多元拟合：

# -*- coding:utf-8 -*-
# 电池里程性能预测（RandomForest线性回归）
# 是最强的冰哥
from pandas import DataFrame
import pandas as pd
from matplotlib import pyplot as plt
import matplotlib
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from sklearn import datasets
from sklearn.model_selection import train_test_split#分割训练集和测试集前自动打散数据
from sklearn import svm
from sklearn.preprocessing import Normalizer#归一化库函数
import numpy as np
from sklearn.svm import SVR
import joblib
import matplotlib.pyplot as plt# 引入MATLAB相似绘图库，make_classification生成三元分类模型数据
from sklearn.linear_model import LinearRegression#用于训练多元线性回归模型
from sklearn.ensemble import RandomForestClassifier #随机森林模块
from sklearn.ensemble import RandomForestRegressor

#第一步 读取数据
dianchi = pd.read_excel("F:\数据处理\数据\电池测量里程数据\电池容量与续航里程.xls",encoding='utf-8',header =0)#完整数据

x = dianchi.iloc[:,1:5] #数据切片
y = dianchi.iloc[:,-1]
print("电池参数=\n",x)
print("里程=\n",y)

#第二步 留出法划分训练集和测试集
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=0,train_size = 0.6)
print("训练集数据=\n",x_train)
print("测试集数据=\n",x_test)

#定义
RandomForestClassifier(
            n_estimators=10,   #树的棵数，也是森林的决策数目
            criterion='gini',      #分类标准，衡量分裂质量的性能（函数）。标准是基于不纯度的"gini",和信息增益的"entropy"（熵）。
    #Gini 衡量的是从一个集合中随机选择一个元素，基于该集合中标签的概率分布为元素分配标签的错误率。即1减去所有分类正确的概率，得到的就是分类不正确的概率
            max_depth=None,  #最大深度，（决策）树的最大深度。如果值为None，那么会扩展节点，直到所有的叶子是纯净的，或者直到所有叶子包含少于min_sample_split的样本。
             min_samples_split=2,  #最少分裂几个子节点，分割内部节点所需要的最小样本数量：~如果为int，那么考虑min_samples_split作为最小的数字。
                                    # ~如果为float，那么min_samples_split是一个百分比，并且把ceil(min_samples_split*n_samples)是每一个分割最小的样本数量。
    #min_samples_leaf 需要在叶子结点上的最小样本数量：
             min_weight_fraction_leaf=0.0,#一个叶子节点所需要的权重总和（所有的输入样本）的最小加权分数。当sample_weight没有提供时，样本具有相同的权重
            max_leaf_nodes=None,#以最优的方法使用max_leaf_nodes来生长树。最好的节点被定义为不纯度上的相对减少。如果为None,那么不限制叶子节点的数量
             bootstrap=True,#建立决策树时，是否使用有放回抽样。
            n_jobs=1,    #指定并行使用的进程数，用于拟合和预测的并行运行的工作（作业）数量。如果值为-1，那么工作数量被设置为核的数量。
            random_state=None,#随机数生成器使用的种子;
            verbose=0,#控制决策树建立过程的冗余度
             warm_start=False,#当被设置为True时，重新使用之前呼叫的解决方案，用来给全体拟合和添加更多的估计器，反之，仅仅只是为了拟合一个全新的森林。
             class_weight=None,  #类别权重，样本不均衡时很重要
)

rf = RandomForestRegressor()


rf.fit(x_train, y_train)
re = rf.predict(x_test)
print("训练集精度\n=",rf.score(x_train,y_train))
print("测试集精度\n=",rf.score(x_test,y_test))
print("随机森林测试集预测里程=\n",re)

joblib.dump(rf, "电池里程预测_model_rf.m")
#第7步 调用训练模型
clf = joblib.load("电池里程预测_model_rf.m")

dianchi_new = pd.read_excel("F:\数据处理\数据\电池测量里程数据\电池容量与续航里程_new.xls",encoding='utf-8',header =0)
test_x = dianchi_new.iloc[:,1:5]#取数据集的前4列为数据值
print("新数据\n=",test_x)
test_y = clf.predict(test_x)
print("预测的电池里程(公里）=\n",test_y)