线性回归算法Python实现

该代码实现了一个基类`Regression`,用于数据预处理、模型训练和评估。它包含了数据源的获取、数据清洗、特征选择等步骤。然后,`TWLinearRegression`作为基类的子类,具体实现了线性回归模型,包括模型初始化、训练、评估等功能。代码还涉及了缺失值处理、特征转换和标准化等预处理操作。
摘要由CSDN通过智能技术生成

基类实现

# _*_ coding: utf-8 _*_
# @Date : 2023/3/20 23:43
# @Author : Paul
# @File : regression.py
# @Description :

import pandas as pd
import io
import matplotlib.pyplot as plt
from core.utils.string_utils import StringUtils
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

from core.algo.base_algo import BaseAlgo
from core.data_source.meta_data_source.meta_data_source import MetaDataSource
from core.utils.data_souce_init_utils import DataSourceInitUtil
from core.utils.date_util import DateUtil


class Regression(BaseAlgo):
    """Base class for regression jobs.

    Loads the configured columns of a table from a data source, applies the
    preprocessing steps listed in ``param["preProcessMethodList"]``, optionally
    rescales the features and splits the result into train and test sets.
    """

    # Maps the "preProcessMethodValue" of a "fillna" step to the keyword
    # arguments handed to SimpleImputer (no arguments = its default strategy).
    _IMPUTER_KWARGS = {
        "mean": {},
        "median": {"strategy": "median"},
        "most_frequent": {"strategy": "most_frequent"},
        "constant_0": {"strategy": "constant", "fill_value": 0},
        "constant_1": {"strategy": "constant", "fill_value": 1},
    }

    def __init__(self,
                 app_name="clusters",
                 data_source_id=None,
                 table_name=None,
                 feature_cols=None,
                 class_col=None,
                 train_size=None,
                 param=None
                 ):
        """
        Initialise the job.

        :param app_name: application name, forwarded to the base class and used
            in the name of the prediction plot file
        :param data_source_id: id handed to DataSourceInitUtil.getDataBase
        :param table_name: table the modelling data is read from
        :param feature_cols: feature column names (list, or comma-separated string)
        :param class_col: target column names (list, or comma-separated string);
            only the first entry is used as the target
        :param train_size: fallback train ratio, used when ``param`` carries no
            "trainDataRatio"
        :param param: job parameter dict ("trainDataRatio",
            "preProcessMethodList", "standardization", ...)
        """
        super(Regression, self).__init__(app_name=app_name)
        # A missing param dict used to crash on the first key lookup.
        self.param = param if param is not None else {}
        # Job start time
        self.start_time = DateUtil.getCurrentDate()
        self.table_name = table_name
        # Normalise both column arguments to lists so they can be concatenated
        # and iterated by name (a raw string would iterate character by character).
        self.feature_cols = self._to_col_list(feature_cols)
        self.class_col = self._to_col_list(class_col)
        self.train_size = train_size
        self.all_col = self.feature_cols + self.class_col
        # Model instance, created by subclasses
        self.reg = None
        # Text output of DataFrame.info()
        self.info = None
        # Output of DataFrame.describe()
        self.describe = None
        # Path of the prediction plot
        self.regress_pred_image = self.image_path + "regress_pred_" + app_name + "_" + DateUtil.getCurrentDateSimple() + ".png"
        # Metadata store
        self.meta_data_source = MetaDataSource()
        # Data source holding the training table
        self.data_source = DataSourceInitUtil.getDataBase(self.meta_data_source,
                                                          data_source_id)
        self.labels = None
        # Prefer the ratio from param; fall back to the train_size argument.
        ratio = self.param.get("trainDataRatio")
        if ratio is None or ratio == "":
            ratio = train_size
        self.train_data_ratio = float(ratio) if ratio is not None else None

    @staticmethod
    def _to_col_list(cols):
        """Return ``cols`` as a list: None -> [], "a,b" -> ["a", "b"], else list(cols)."""
        if cols is None:
            return []
        if isinstance(cols, str):
            return [col.strip() for col in cols.split(",") if col.strip()]
        return list(cols)

    def _valid_pre_process_steps(self):
        """Return the preprocessing steps that name a non-blank feature."""
        steps = []
        for step in self.param.get("preProcessMethodList") or []:
            if step is None or step == "null":
                continue
            if StringUtils.isBlack(step.get("preProcessFeature")):
                continue
            steps.append(step)
        return steps

    def _apply_pre_process_step(self, data, step):
        """Apply one preprocessing step to ``data`` in place."""
        feature = step.get("preProcessFeature")
        method = step.get("preProcessMethod")
        method_value = step.get("preProcessMethodValue")

        # 1. Drop the column entirely
        if method == "deletena":
            data.drop(feature, inplace=True, axis=1)
        # 2. Fill missing values
        elif method == "fillna":
            imputer_kwargs = self._IMPUTER_KWARGS.get(method_value)
            # Unknown fill strategies are ignored.
            if imputer_kwargs is not None:
                imputer = SimpleImputer(**imputer_kwargs)
                data[feature] = imputer.fit_transform(data[feature].values.reshape(-1, 1))
        # 3. Encode a categorical column as the index of each value in the
        #    column's list of unique values
        elif method == "transClassFeature":
            unique_value = data[feature].unique().tolist()
            data[feature] = data[feature].apply(lambda x: unique_value.index(x))
        # 4. Cast the column type
        elif method == "transType":
            if method_value in ("int", "float"):
                data[feature] = data[feature].astype(method_value)

    def getModelData(self):
        """
        Load the modelling data and return the train and test sets.

        Side effects: fills ``self.info`` and ``self.describe``, rewrites
        ``param["preProcessMethodList"]`` with the steps actually applied, and
        removes dropped columns from ``self.feature_cols``.

        :return: ``[Xtrain, Ytrain], [Xtest, Ytest]``
        """
        # NOTE(review): column and table names are interpolated into the SQL
        # text as-is; they must come from a trusted source.
        data_query_sql = "select {} from {}".format(",".join(self.all_col),
                                                    self.table_name)
        rows = self.data_source.queryAll(data_query_sql)
        data = pd.DataFrame(data=rows, columns=self.all_col)

        # Capture the DataFrame.info() text through an in-memory buffer
        buf = io.StringIO()
        data.info(buf=buf)
        self.info = buf.getvalue()

        # Descriptive statistics
        self.describe = data.describe()

        # Keep only the usable preprocessing steps, then apply them in order
        steps = self._valid_pre_process_steps()
        self.param["preProcessMethodList"] = steps
        for step in steps:
            self._apply_pre_process_step(data, step)

        # A "deletena" step may have removed feature columns; keep the feature
        # list aligned with the columns that are actually fed to the model.
        self.feature_cols = [col for col in self.feature_cols if col in data.columns]

        # Boolean column masks keep Y a single-column DataFrame
        is_target = data.columns == self.class_col[0]
        X = data.iloc[:, ~is_target]
        Y = data.iloc[:, is_target]

        # Optional feature rescaling; any other value leaves X untouched
        standardization = self.param.get("standardization")
        if standardization == "MinMaxScaler":
            from sklearn.preprocessing import MinMaxScaler
            X = MinMaxScaler().fit_transform(X)
        elif standardization == "StandardScaler":
            from sklearn.preprocessing import StandardScaler
            X = StandardScaler().fit_transform(X)

        Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, train_size=self.train_data_ratio)
        return [Xtrain, Ytrain], [Xtest, Ytest]


if __name__ == '__main__':
    # Regression.__init__ reads param["trainDataRatio"] and getModelData reads
    # "preProcessMethodList" and "standardization", so the demo must pass a
    # param dict; without one the constructor fails before any data is loaded.
    # The single placeholder step names no feature — presumably skipped by
    # StringUtils.isBlack; TODO confirm.
    demo_param = {
        "trainDataRatio": "0.7",
        "preProcessMethodList": [{"preProcessMethod": None, "preProcessFeature": None}],
        "standardization": "",
    }
    regression = Regression(app_name="cluster_demo",
                            data_source_id=9,
                            table_name="titanic",
                            feature_cols=["Survived", "Pclass", "Sex", "Age", "Cabin"],
                            class_col=["Embarked"],
                            train_size=0.7,
                            param=demo_param)
    regression.getModelData()

线性回归代码实现

# _*_ coding: utf-8 _*_
# @Date : 2023/3/22 17:25
# @Author : Paul
# @File : linear_regression.py
# @Description :
import matplotlib.pyplot as plt
import numpy as np
import sys
import json

from core.utils.log_util import LogUtil
from core.beans.regress_result import RegressionResult
from core.utils.string_utils import StringUtils
from core.utils.date_util import DateUtil
from sklearn.model_selection import cross_val_score
from regressions.regression import Regression
from sklearn.linear_model import LinearRegression as LR


class TWLinearRegression(Regression):
    """Linear regression job built on scikit-learn's ``LinearRegression``.

    Hyper-parameters are read from ``param["algoParam"]``; evaluation computes
    the RMSE on the test set, plots real vs. predicted values and stores a
    ``RegressionResult`` through ``LogUtil.saveRegressionResult``.
    """

    # Spellings treated as boolean False when a flag arrives as a string.
    _FALSE_TOKENS = ("false", "0", "no", "n", "off")

    def __init__(self,
                 app_name="linear_regression",
                 data_source_id=None,
                 table_name=None,
                 feature_cols=None,
                 class_col=None,
                 train_size=None,
                 param=None
                 ):
        """
        Initialise the job; all arguments are forwarded to ``Regression``.

        :param app_name: application name
        :param data_source_id: id of the data source holding the table
        :param table_name: table the modelling data is read from
        :param feature_cols: feature column names
        :param class_col: target column names
        :param train_size: train ratio
        :param param: job parameter dict (must contain "algoParam" and "id")
        """
        super(TWLinearRegression, self).__init__(app_name=app_name,
                                                 data_source_id=data_source_id,
                                                 table_name=table_name,
                                                 feature_cols=feature_cols,
                                                 class_col=class_col,
                                                 train_size=train_size,
                                                 param=param)
        # NOTE(review): the original comment read "default: no model evaluation
        # needed", which contradicts the value True — confirm the intended
        # default against the base class that reads this flag.
        self.IS_MODEL_EVAL = True
        # Name stem for a prediction plot (not used elsewhere in this class)
        self.tree_pred_image = "descion_tree_pred_" + app_name + "_" + DateUtil.getCurrentDateSimple()
        # Maximum number of test rows drawn in the plot
        self.plot_rows_num = 200

    @staticmethod
    def _parse_bool(value, default):
        """Interpret a flag that may be a bool, a blank value or a string.

        ``bool("false")`` is True, so string flags are compared against known
        false spellings instead of being passed to ``bool`` directly.
        """
        if isinstance(value, bool):
            return value
        if value is None or StringUtils.isBlack(value):
            return default
        if isinstance(value, str):
            return value.strip().lower() not in TWLinearRegression._FALSE_TOKENS
        return bool(value)

    def initModel(self):
        """
        Build the ``LinearRegression`` estimator from ``param["algoParam"]``.

        Missing or blank entries fall back to fitIntercept=True,
        normalize=False, copyX=True and nJobs=None.
        """
        import inspect
        import warnings

        algoParam = self.param["algoParam"]
        fit_intercept = self._parse_bool(algoParam.get("fitIntercept"), True)
        normalize = self._parse_bool(algoParam.get("normalize"), False)
        copy_X = self._parse_bool(algoParam.get("copyX"), True)
        raw_n_jobs = algoParam.get("nJobs")
        n_jobs = None if raw_n_jobs is None or StringUtils.isBlack(raw_n_jobs) else int(raw_n_jobs)

        model_kwargs = {"fit_intercept": fit_intercept,
                        "copy_X": copy_X,
                        "n_jobs": n_jobs}
        # Newer scikit-learn releases no longer accept ``normalize``; pass it
        # only when the installed estimator still declares the parameter.
        if "normalize" in inspect.signature(LR).parameters:
            model_kwargs["normalize"] = normalize
        elif normalize:
            warnings.warn("LinearRegression in the installed scikit-learn does not "
                          "support 'normalize'; the option is ignored. "
                          "Use the 'standardization' setting instead.")

        self.reg = LR(**model_kwargs)

    def buildModel(self, train_data):
        """
        Fit the model.

        :param train_data: ``[Xtrain, Ytrain]``
        """
        Xtrain, Ytrain = train_data[0], train_data[1]
        self.reg = self.reg.fit(Xtrain, Ytrain)

    def evalModel(self, train_data, test_data):
        """
        Evaluate the fitted model on the test set and persist the result.

        :param train_data: ``[Xtrain, Ytrain]`` (not used here)
        :param test_data: ``[Xtest, Ytest]``
        """
        from sklearn.metrics import mean_squared_error as MSE

        Xtest = test_data[0]
        Ytest = test_data[1]
        Ytest_predict = self.reg.predict(Xtest)
        # Root mean squared error on the test set
        self.score_ = np.sqrt(MSE(Ytest, Ytest_predict))

        # coef_ is 2-D when the target is a single-column frame and 1-D when it
        # is a flat vector; atleast_2d lets row 0 serve both layouts.
        coefficients = np.atleast_2d(self.reg.coef_)[0]
        var_importance = {}
        for i, feature in enumerate(self.feature_cols):
            var_importance[feature] = coefficients[i]

        # Plot at most plot_rows_num rows of real vs. predicted values
        shown_rows = min(len(Xtest), self.plot_rows_num)
        x_axis = np.linspace(0.05, 1, shown_rows)
        plt.figure()
        plt.plot(x_axis, Ytest[:shown_rows], "green", label="Y-真实值")
        plt.plot(x_axis, Ytest_predict[:shown_rows], "red", label="Y-预测值")
        plt.legend()
        plt.savefig(self.regress_pred_image, dpi=300)
        plt.show()

        # Job end time and duration
        end_time = DateUtil.getCurrentDate()
        # NOTE(review): the variable says seconds but the helper is diffMin —
        # confirm the unit RegressionResult expects.
        cost_second = DateUtil.diffMin(self.start_time, end_time)

        # Persist the result through the metadata store
        algo_result = RegressionResult(self.param["id"],
                                       "linear_regression",
                                       self.param,
                                       self.app_name,
                                       self.info,
                                       self.describe,
                                       self.regress_pred_image,
                                       json.dumps(var_importance).replace("\"", "'"),
                                       self.score_,
                                       "sucess",
                                       self.start_time,
                                       end_time,
                                       cost_second)
        LogUtil.saveRegressionResult(self.meta_data_source, algo_result)


if __name__ == '__main__':
    # The job is configured by a single JSON document passed as argv[1].
    # Example payload:
    # {"algoParam":{"copyX":"true","fitIntercept":"true","nJobs":"","normalize":"true"},"appName":"kmeans_1","classCols":"housePrice","dataSourceId":"9","featureCols":"MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude","id":"1679540469249","preProcessMethodList":[{"preProcessMethod":"deletena"}],"tableName":"california_housing","trainDataRatio":"0.7"}
    if len(sys.argv) < 2:
        sys.exit("usage: linear_regression.py '<json parameters>'")
    param = json.loads(sys.argv[1])

    app_name = param["appName"]
    data_source_id = param["dataSourceId"]
    table_name = param["tableName"]
    train_size = float(param["trainDataRatio"])

    # featureCols may arrive as a comma-separated string; the regression class
    # concatenates it with the target list, so it must be a list of names.
    feature_cols = param["featureCols"]
    if isinstance(feature_cols, str):
        feature_cols = [col.strip() for col in feature_cols.split(",") if col.strip()]

    # classCols may be a list or a single column name
    class_cols = param["classCols"]
    class_cols_list = class_cols if isinstance(class_cols, list) else [class_cols]

    classifier = TWLinearRegression(app_name=app_name,
                                    data_source_id=data_source_id,
                                    table_name=table_name,
                                    feature_cols=feature_cols,
                                    class_col=class_cols_list,
                                    train_size=train_size,
                                    param=param)
    # "paramTrain" in the payload selects the paramTrain() path instead of
    # the regular execute() path.
    if "paramTrain" not in param:
        classifier.execute()
    else:
        classifier.paramTrain()

详细代码见gitee

twinkle_algo_plat: 晓烁算法平台算法端

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值