[python]机器学习之回归

参考资料:
scikit-learn代码实现SVM分类与SVR回归以及调参

# 导入库
import numpy as np  # numpy库
from sklearn.linear_model import BayesianRidge, LinearRegression, ElasticNet  # 批量导入要实现的回归算法
from sklearn.svm import SVR  # SVM中的回归算法
from sklearn.model_selection import cross_val_score  # 交叉检验
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import explained_variance_score, mean_absolute_error, mean_squared_error, r2_score  # 批量导入指标算法
from sklearn.svm import LinearSVR
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import pandas as pd  # 导入pandas
import matplotlib.pyplot as plt  # 导入图形展示库
import pylab as mpl
import warnings
import datetime
# Suppress all warnings for the rest of the run.
warnings.filterwarnings("ignore")
# Work around garbled Chinese characters in figures.
mpl.rcParams['font.sans-serif'] = ['SimHei']
# NOTE(review): pylab presumably exposes the same rcParams object as pyplot, in
# which case the next line replaces the SimHei setting above — confirm.
plt.rcParams['font.sans-serif'] = ['KaiTi']  # use the KaiTi Chinese typeface for figure text
plt.rcParams['axes.unicode_minus'] = False  # let negative numbers display correctly in figures


def readcsv(fileName,startData,endData,columns,target):
    """Load the rows of a CSV file whose timestamp falls inside a time window.

    The CSV column at position 1 is parsed as a ``%Y-%m-%d %H:%M:%S``
    timestamp and used to select rows with
    ``startData <= timestamp <= endData`` (both bounds inclusive).

    Args:
        fileName: Path of the comma-separated file to read.
        startData: Window start, formatted ``%Y-%m-%d %H:%M:%S``.
        endData: Window end, formatted ``%Y-%m-%d %H:%M:%S``.
        columns: List of positional column indices used as features. The
            first one must refer to a timestamp column: in the output it is
            replaced by the number of whole days elapsed since the earliest
            selected timestamp.
        target: Positional index of the column used as regression target.

    Returns:
        A tuple ``(X, y, timeList)``: ``X`` is a 2-D array with one column
        per entry of ``columns``, ``y`` is a 1-D array of target values and
        ``timeList`` holds the selected timestamps as formatted strings.
        ``(None, None, None)`` is returned when the file has no data rows.
    """
    data = pd.read_csv(fileName, sep=",")
    if len(data) == 0:
        print("数据为空!")
        return None, None, None

    time_format = '%Y-%m-%d %H:%M:%S'
    window_start = datetime.datetime.strptime(startData, time_format)
    window_end = datetime.datetime.strptime(endData, time_format)
    stamps = pd.to_datetime(data.iloc[:, 1], format=time_format)

    # Compare timestamps directly. Testing ``timedelta.days <= 0`` instead
    # would also accept rows up to 24 hours after ``endData``, because
    # ``.days`` only becomes 1 once a full day has elapsed.
    in_window = (stamps >= window_start) & (stamps <= window_end)
    rows = np.flatnonzero(in_window.to_numpy())
    timeList = stamps.iloc[rows].dt.strftime(time_format).tolist()

    columns = list(columns)
    y = data.iloc[rows, target].to_numpy()

    if len(rows):
        first = pd.to_datetime(data.iloc[rows, columns[0]], format=time_format)
        # Whole days since the earliest selected sample.
        elapsed_days = (first - first.min()).dt.days.to_numpy()
    else:
        elapsed_days = np.empty(0)
    remaining = data.iloc[rows, columns[1:]].to_numpy()
    X = np.column_stack((elapsed_days, remaining))

    return X, y, timeList

def StandardLinearSVR(epsilon):
    """Return an unfitted pipeline: StandardScaler followed by LinearSVR.

    Args:
        epsilon: Value forwarded to ``LinearSVR(epsilon=...)``.
    """
    scaling_step = ('std_scaler', StandardScaler())
    # The hyper-parameter C keeps its default value here; switching to SVR()
    # would additionally require choosing a kernel.
    regression_step = ('linearSVR', LinearSVR(epsilon=epsilon))
    return Pipeline([scaling_step, regression_step])

def polyRegression(X,y,X_test,degree):
    """Fit a polynomial regression on (X, y) and predict for X_test.

    The features are expanded with ``PolynomialFeatures(degree)`` and an
    ordinary ``LinearRegression`` is fitted on the expanded training data.

    Args:
        X: Training feature matrix.
        y: Training targets.
        X_test: Feature matrix to predict for; it must have the same number
            of columns as ``X``.
        degree: Degree passed to ``PolynomialFeatures``.

    Returns:
        The predictions reshaped to a single column, shape ``(-1, 1)``.
    """
    poly = PolynomialFeatures(degree=degree)
    X_poly = poly.fit_transform(X)
    lin2 = LinearRegression()
    lin2.fit(X_poly, y)
    # Reuse the expander fitted on the training data; the test data must only
    # be transformed, never used for fitting.
    polyPredict = lin2.predict(poly.transform(X_test))
    return polyPredict.reshape(-1,1)

def trainAndPredict(X,y,X_test,epsilon,degree):
    """Fit every regression model on (X, y) and predict X_test with each.

    Returns:
        A list of column vectors (shape ``(-1, 1)``), in this order: Bayesian
        ridge, ordinary linear regression, elastic net, SVR, standardized
        LinearSVR (built with ``epsilon``) and polynomial regression (built
        with ``degree``).
    """
    regressors = (
        BayesianRidge(),             # Bayesian ridge regression
        LinearRegression(),          # ordinary least squares
        ElasticNet(),                # elastic net regression
        SVR(),                       # support vector regression
        StandardLinearSVR(epsilon),  # scaler + linear support vector regression
    )
    # Each estimator is trained and immediately queried for the test rows.
    predictions = [
        regressor.fit(X, y).predict(X_test).reshape(-1,1)
        for regressor in regressors
    ]
    predictions.append(polyRegression(X,y,X_test,degree))
    return predictions


if __name__ == '__main__':
    # Walk-forward evaluation: for every measuring point and every year in
    # [base, end), train all models on that year and predict the next one.
    # The predictions of all years are concatenated, scored per model, and the
    # model with the highest R2 for the measuring point gets one "win".
    measurePointNames=["TP1","TP2","TP4","TP5","TP6","TP7","TP8","TP9","TP10","TP11","TP12","TP13"]
    # A figure is only drawn when exactly one measuring point is listed,
    # e.g. measurePointNames=["TP10"].
    columns = [1, 4, 5]
    # Same runtime value as before, written with explicit escapes so that the
    # literal does not depend on invalid escape sequences such as "\w".
    basePath='C:\\Users\\wzz\\Desktop\\HGT\\work\\'
    target = 3  # 2 = cumulative tangential displacement, 3 = cumulative radial displacement
    # Indices into model_names of the models to draw:
    # 0 Bayesian ridge, 1 linear regression, 2 elastic net, 3 SVR,
    # 4 linear SVR, 5 polynomial regression
    plotSelectList = [0,1,2]
    degree = 2
    base, end, trainInterval = 2006, 2021, 1
    epsilon = 0.2
    model_names = ['贝叶斯岭回归', '普通线性回归', '弹性网络回归', '支持向量机回归', '线性支持向量机回归','多项式回归']  # display name of every model
    Statistics=[0 for i in range(len(model_names))]  # number of measuring points each model wins
    for measurePointName in measurePointNames:
        fileName = basePath+measurePointName+'.csv'
        myTimeLists,timeLists= [],[]
        # Size the accumulator from `columns` instead of hard-coding 3.
        X_tests, y_tests = np.empty(shape=(0, len(columns))), np.empty(shape=(0, 1))
        pre_y_list=[np.empty(shape=(0, 1)) for i in range(len(model_names))]
        for i in range(end-base):
            trainYear=base+i
            testYear,testInterval=trainYear+1,1
            startData = str(trainYear) + "-01-01 00:00:00"
            endData = str(trainYear + trainInterval - 1) + "-12-31 23:59:59"
            testStartData = str(testYear) + "-01-01 00:00:00"
            testEndData = str(testYear + testInterval - 1) + "-12-31 23:59:59"
            # Two x-axis tick positions per test year.
            myTimeLists.append(str(testYear) + "-01-01")
            myTimeLists.append(str(testYear) + "-06-01")

            X, y, _ = readcsv(fileName, startData, endData, columns, target)
            # readcsv returns None for a file without data rows.
            if X is None or len(X)<10:
                print("训练数据集不足!")
                raise SystemExit(403)
            X_test, y_test, timeList = readcsv(fileName, testStartData, testEndData, columns, target)
            if X_test is None or len(X_test)==0:
                print("测试数据集为空!")
                raise SystemExit(404)

            pre_test_list=trainAndPredict( X, y, X_test, epsilon, degree)
            for j, pre_y in enumerate(pre_test_list):
                pre_y_list[j]=np.vstack((pre_y_list[j],pre_y))

            timeLists=timeLists+timeList
            X_tests=np.vstack((X_tests,X_test))
            y_tests = np.vstack((y_tests, y_test.reshape(-1,1)))

        # Each yearly chunk counts days from its own first sample; replace the
        # time column with day offsets on one common axis across all years.
        temp = datetime.datetime.strptime(timeLists[0], '%Y-%m-%d %H:%M:%S')
        timeListsTemp = [(datetime.datetime.strptime(stamp, '%Y-%m-%d %H:%M:%S') - temp).days
                         for stamp in timeLists]
        # Only column 0 holds time; writing to the whole array would overwrite
        # the remaining feature columns through broadcasting.
        X_tests[:, 0:1]=np.array(timeListsTemp).reshape((len(timeLists), 1)) - min(timeListsTemp)

        model_metrics_name = [explained_variance_score, mean_absolute_error, mean_squared_error, r2_score]  # metric functions, R2 last
        model_metrics_list = []  # one row of metric values per model
        maxR2,maxIndex=-9999999.0,-1
        for i, model_pre_y in enumerate(pre_y_list):
            tmp_list = [m(y_tests, model_pre_y) for m in model_metrics_name]
            # The last metric is R2; remember the best-scoring model.
            if tmp_list[-1]>maxR2:
                maxR2=tmp_list[-1]
                maxIndex=i
            model_metrics_list.append(tmp_list)
        Statistics[maxIndex]+=1
        df = pd.DataFrame(model_metrics_list, index=model_names, columns=['ev', 'mae', 'mse', 'r2'])  # metric table
        print (70 * '-')  # separator line
        print (measurePointName+' ALL regression metrics:')  # title
        print (df)  # metric table

        if len(measurePointNames)==1:
            index0 = np.argsort(X_tests[:, 0])  # order the samples by time
            # Dates used as tick positions; the strings serve as tick labels.
            myxs = [datetime.datetime.strptime(d, '%Y-%m-%d').date() for d in myTimeLists]
            newTimeList = [timeLists[index] for index in index0]
            xs = [datetime.datetime.strptime(d, '%Y-%m-%d %H:%M:%S').date() for d in newTimeList]
            plt.figure(figsize=(25, 9))  # figure size
            plt.xticks(myxs, myTimeLists, rotation=30, fontsize=10)
            # One colour per entry of model_names, so that any index in
            # plotSelectList is valid.
            color_list = ['b', 'r', 'g', 'magenta', 'c', 'k']
            plt.plot(xs, y_tests[index0], 'y.--', label='原始数据')
            for i, pre_y in enumerate(pre_y_list):
                if i in plotSelectList:
                    plt.plot(xs, pre_y[index0], color=color_list[i], label=model_names[i])  # prediction curve of one model
            plt.xlabel('时间')
            plt.title(measurePointName+'位移预测')
            plt.ylabel('位移')
            plt.legend()
            # Called after labels and legend exist so they are included when
            # the layout is fitted (avoids a cut-off bottom edge).
            plt.tight_layout()
            plt.show()
    print(70 * '-')  # separator line
    for i, name in enumerate(model_names):
        print(name,": ",Statistics[i])

结果:
模型 ev mae mse r2
贝叶斯岭回归 0.751232 2.897101 12.936836 0.740079
普通线性回归 0.778305 2.796944 11.487522 0.769198
弹性网络回归 0.777461 2.802954 11.542598 0.768091
支持向量机回归 0.109123 5.430854 44.341277 0.109115
线性支持向量机回归 0.375706 5.734810 56.660943 -0.138406
多项式回归 -0.226657 3.454529 62.488524 -0.255491
在这里插入图片描述

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值