# Model evaluation
# Feature scaling
# Standard linear regression with scikit-learn
# pip install scikit-learn  (the bare 'sklearn' pip package is a deprecated stub)
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split #模型评估 留出法
from sklearn.linear_model import LinearRegression#线性回归
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score#评估 mse mae r2
# Load the advertising dataset: three ad-spend feature columns
# (TV, Radio, Newspaper) plus the Sales target column.
path = r'../datas/Advertising.csv'
# Read with pandas
data = pd.read_csv(path)  # 4 data columns
x = data[['TV', 'Radio', 'Newspaper']]  # TV / radio / newspaper ad spend as features
y = data['Sales']  # sales volume as the label
print(x)
print(y)
# Plot each feature against the target to eyeball the relationship
# (Newspaper shows little structure, so it is a weak feature).
plt.plot(data['TV'], y, 'ro', label='TV')
plt.plot(data['Radio'], y, 'g*', label='Radio')
plt.plot(data['Newspaper'], y, 'bv', label='Newspaper')  # fixed legend typo: was 'Newspaer'
plt.legend(loc='lower right')
plt.grid()
plt.show()
# Hold-out split (sklearn default is 75% train / 25% test); fixed seed
# so the split is reproducible.
X_tr, X_te, y_tr, y_te = train_test_split(x, y, random_state=1)

# Fit an ordinary least-squares linear regression on the training fold.
linreg = LinearRegression()
model = linreg.fit(X_tr, y_tr)
print(linreg.coef_)       # learned weights, one per feature
print(linreg.intercept_)  # bias term

# Score the model on the held-out fold.
predicted = linreg.predict(np.array(X_te))
print(mean_squared_error(y_te, predicted))   # MSE
print(mean_absolute_error(y_te, predicted))  # MAE
print(r2_score(y_te, predicted))             # R^2

# Overlay ground truth and predictions by sample index.
idx = np.arange(len(X_te))
plt.plot(idx, y_te, 'r-', linewidth=2, label='Test')
plt.plot(idx, predicted, 'g-', linewidth=2, label='Predict')
plt.legend(loc='upper right')
plt.grid()
plt.show()
# 1. LightGBM
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt  # fixed: was `import matplotlib as plt`, which breaks any plt.* plotting call
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn import datasets

# Load training features, training labels, and test features.
df_train = pd.read_csv(r"../data/train_data.csv")
df_train_label = pd.read_csv(r"../data/train_label.csv")
df_test = pd.read_csv(r"../data/test_data.csv")
label = 'deposit'
# Join features with their labels on the shared product_no key.
df_train = df_train.merge(df_train_label, on='product_no', how='inner')
df_train  # notebook-style echo of the merged frame
### Partition feature columns into categorical and numeric lists
# The label column is excluded from both; everything else is bucketed
# by dtype (object -> categorical, common int/float dtypes -> numeric).
obj_list = [c for c in df_train.columns
            if c != label and df_train[c].dtypes == 'object']
num_list = [c for c in df_train.columns
            if c != label and df_train[c].dtypes in ['int64', 'int32', 'float64', 'float32']]
print(obj_list)
print(num_list)
### One-hot encode the categorical features
# For each object-dtype column, build its dummy columns and append them
# all to the training frame (the raw columns are dropped later).
encoded = [pd.get_dummies(df_train[c], prefix=c) for c in obj_list]
df_train = pd.concat([df_train] + encoded, axis=1)
df_train  # notebook-style echo of the widened frame
df_train[['housing','housing_yes','housing_no']]  # spot-check the 'housing' encoding
### Build the feature matrix and split it
# Drop the raw categorical columns (their dummies remain), then separate
# the label; product_no is an identifier, not a feature.
df_train = df_train.drop(obj_list, axis=1)
y = df_train['deposit']
X = df_train.drop(['deposit', 'product_no'], axis=1)
# 70/30 hold-out split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, train_size=0.7, random_state=42)
# LightGBM binary classifier
import lightgbm as lgb

# Reference config for the native lgb.train API. NOTE(review): this dict is
# never passed anywhere — the sklearn-style LGBMClassifier below is what is
# actually trained; kept only as documentation of the intended settings.
params = {
    'task': 'train',
    'boosting_type': 'gbdt',       # gradient-boosted decision trees
    'objective': 'binary',         # binary classification objective
    'metric': {'binary_logloss'},  # was {'recall'}, which is not a built-in LightGBM metric
    'num_leaves': 31,              # max leaves per tree
    'learning_rate': 0.05,
    'feature_fraction': 0.9,       # fraction of features sampled per tree
    'bagging_fraction': 0.8,       # fraction of rows sampled
    'bagging_freq': 5,             # perform bagging every 5 iterations
    'verbose': 1,                  # <0 fatal only, =0 errors/warnings, >0 info
}

print('Start training...')
# Train with early stopping on the held-out fold.
gbm = lgb.LGBMClassifier(objective='binary', num_leaves=31,
                         learning_rate=0.05, n_estimators=20)
# 'binary_logloss' matches the binary objective (the original 'l1'/MAE is a
# regression metric). Early stopping goes through a callback: the
# early_stopping_rounds= keyword was removed in LightGBM 4.x.
gbm.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric='binary_logloss',
        callbacks=[lgb.early_stopping(stopping_rounds=5)])

# Predict hard class labels using the best iteration found by early stopping.
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)
y_pred  # notebook-style echo
### Predict on the held-out test file
# Drop the label and ID columns (test_data.csv apparently ships with a
# 'deposit' column — TODO confirm against the data source).
df_test_tmp = df_test.drop(['deposit', 'product_no'], axis=1)
# One-hot encode the categoricals with the same per-column scheme used for training.
for col in obj_list:
    dummy_df = pd.get_dummies(df_test_tmp[col], prefix=col)
    df_test_tmp = pd.concat([df_test_tmp, dummy_df], axis=1)
# Drop the raw categorical columns, then align to the training feature
# matrix: categories absent from the test set would otherwise leave the
# dummy columns missing or mis-ordered relative to what the model saw.
df_test_tmp = df_test_tmp.drop(obj_list, axis=1)
df_test_tmp = df_test_tmp.reindex(columns=X_train.columns, fill_value=0)
# Predict and pair each prediction with its product_no.
test_pred = gbm.predict(df_test_tmp)
result = pd.concat([df_test['product_no'],
                    pd.DataFrame(test_pred, columns=['pred'])], axis=1)
result  # notebook-style echo
# Persist predictions for submission (no index column).
result.to_csv('filename.csv', index=False)