python读取mysql实现一元和多元的线性拟合

python读取mysql实现一元和多元的线性拟合

一元线性方程拟合

# -*- coding:utf-8 -*-
# __author__ = "LQ"
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from DataAnalysis.TeachingModel.dbc import dbcConnect

def main(examDf,colsName):

    # 绘制散点图,examDf.jt为X轴,examDf.hk为Y轴
    plt.scatter(examDf[colsName[0]], examDf[colsName[1]], color='darkgreen', label="Exam Data")

    # 添加图的标签(x轴,y轴)
    plt.xlabel(colsName[0])  # 设置X轴标签
    plt.ylabel(colsName[1])  # 设置Y轴标签
    plt.show()  # 显示图像

    rDf = examDf.corr()  # 查看数据间的相关系数
    print("相关系数:", rDf)

    # 拆分训练集和测试集(train_test_split是存在与sklearn中的函数)
    X_train, X_test, Y_train, Y_test = train_test_split(examDf[colsName[0]], examDf[colsName[1]], train_size=0.8)
    # train为训练数据,test为测试数据,examDf为源数据,train_size 规定了训练数据的占比

    print("自变量---源数据:", examDf[colsName[0]].shape, ";  训练集:", X_train.shape, ";  测试集:", X_test.shape)
    print("因变量---源数据:", examDf[colsName[1]].shape, ";  训练集:", Y_train.shape, ";  测试集:", Y_test.shape)

    # 调用线性规划包
    model = LinearRegression()

    # 在这里加一段
    X_train = X_train.values.reshape(-1, 1)
    X_test = X_test.values.reshape(-1, 1)

    # 线性回归训练
    model.fit(X_train, Y_train)  # 调用线性回归包

    a = model.intercept_  # 截距
    b = model.coef_  # 回归系数

    # 训练数据的预测值
    y_train_pred = model.predict(X_train)
    # 绘制最佳拟合线:标签用的是训练数据的预测值y_train_pred
    plt.plot(X_train, y_train_pred, color='blue', linewidth=2, label="best line")

    # 测试数据散点图
    plt.scatter(X_train, Y_train, color='darkgreen', label="train data")
    plt.scatter(X_test, Y_test, color='red', label="test data")

    # 添加图标标签
    plt.legend(loc=2)  # 图标位于左上角,即第2象限,类似的,1为右上角,3为左下角,4为右下角
    plt.xlabel(colsName[0])  # 添加 X 轴名称
    plt.ylabel(colsName[1])  # 添加 Y 轴名称
    plt.show()  # 显示图像

    print("拟合参数:截距", a, ",回归系数:", b)
    print("最佳拟合线: Y = ", round(a, 2), "+", round(b[0], 2), "* X")  # 显示线性方程,并限制参数的小数位为两位

if __name__ == '__main__':
    sql = '''SELECT
    	total_log_num,
    	getscore
    FROM
    	t_study_behavior_bak
    	'''
    df = dbcConnect.selectDf(sql)
    colsName=['total_log_num','getscore']
    main(df,colsName)

多元线性方程拟合

# -*- coding:utf-8 -*-
# __author__ = "LQ"
import pymysql
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pandas import DataFrame, Series
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from DataAnalysis.TeachingModel.dbc import dbcConnect
from sklearn import preprocessing

def main(d,yName):
      # 检验数据
      print(d.describe())  # 数据描述,会显示最值,平均数等信息,可以简单判断数据中是否有异常值
      print(d[d.isnull() == True].count())  # 检验缺失值,若输出为0,说明该列没有缺失值

      # 输出相关系数,判断是否值得做线性回归模型
      print(d.corr())  # 0-0.3弱相关;0.3-0.6中相关;0.6-1强相关;

      # 拆分训练集和测试集
      X_train, X_test, Y_train, Y_test = train_test_split(d.ix[:, :5], d.getscore, train_size=0.8)
      # new_examDf.ix[:,:2]取了数据中的前两列为自变量,此处与单变量的不同

      print("自变量---源数据:", d.ix[:, :5].shape, ";  训练集:", X_train.shape, ";  测试集:", X_test.shape)
      print("因变量---源数据:", d[yName].shape, ";  训练集:", Y_train.shape, ";  测试集:", Y_test.shape)

      # 调用线性规划包
      model = LinearRegression()

      model.fit(X_train, Y_train)  # 线性回归训练

      a = model.intercept_  # 截距
      b = model.coef_  # 回归系数
      print("拟合参数:截距", a, ",回归系数:", b)
      for b0 in b:
            print(b0)
      # 显示线性方程,并限制参数的小数位为两位
      print("最佳拟合线: Y = ", round(a, 2), "+", round(b[0], 2), "* X1", "+", round(b[1], 2), "* X2", "+", round(b[2], 2), "* X3", "+", round(b[3], 2), "* X4", "+", round(b[4], 2), "* X5")

      Y_pred = model.predict(X_test)  # 对测试集数据,用predict函数预测

      plt.plot(range(len(Y_pred)), Y_pred, 'red', linewidth=2.5, label="predict data")
      plt.plot(range(len(Y_test)), Y_test, 'green', label="test data")
      plt.legend(loc=2)
      plt.show()  # 显示预测值与测试值曲线

      # 均方差
      # 查看残差平方的均值(mean square error,MSE)
      print("Mean squared error: %.2f"
            % mean_squared_error(Y_test, Y_pred))

      # Explained variance score: 1 is perfect prediction
      #  R2 决定系数(拟合优度)
      # 模型越好:r2→1
      # 模型越差:r2→0
      print('Variance score: %.2f' % r2_score(Y_test, Y_pred))

      # Plot outputs
      plt.scatter(range(len(Y_pred)), Y_test, color='black')
      plt.plot(range(len(Y_test)), Y_pred, color='blue', linewidth=3)

      plt.xticks(())
      plt.yticks(())

      plt.show()
if __name__ == '__main__':
      sql = '''SELECT
	study_document_num,
	actual_brain_num,
	actual_discuss_num,
	actual_quiz_num,
	actual_homework_num,
	getscore
	FROM
		t_study_behavior_rate
      '''
      conn=dbcConnect.dbcconnect()
      cursor = conn.cursor()
      cursor.execute(sql)
      # 获取剩余结果所有数据
      results = cursor.fetchall()
      # 获取列名
      cols = [i[0] for i in cursor.description]
      # sql内表转换pandas的DF
      df = pd.DataFrame(np.array(results), columns=cols).astype(float)
      conn.commit()
      cursor.close()
      # 归一化处理
      min_max_scaler = preprocessing.MinMaxScaler()
      df_minMax = min_max_scaler.fit_transform(df)
      examDf = pd.DataFrame(df_minMax, columns=cols).astype(float)
      yName = 'getscore'
      main(examDf,yName)

自定义方法dbcConnect
在下面博客中
https://blog.csdn.net/qq_30868737/article/details/103995174

  • 1
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值