提供的数据集是 txt 格式,可以先手动将文件后缀改为 csv 再用 pandas 读取(实际上 pd.read_csv 并不依赖文件后缀,直接传入 .txt 路径同样可以读取)。
import pandas as pd

# Location of the Tianchi steam-volume dataset (train / test splits).
path_train = 'D:\\Dataset\\Tianchi\\zhengqi_train.csv'
path_test = 'D:\\Dataset\\Tianchi\\zhengqi_test.csv'

# Load both splits into DataFrames.
df_train_data = pd.read_csv(path_train)
df_test_data = pd.read_csv(path_test)

# Print a structural summary of each split: column names, dtypes,
# non-null counts — confirms all columns are float with no missing values.
print(df_train_data.info())
print(df_test_data.info())
由 info() 的输出可见,提供的数据集包括训练集(2888条数据)和测试集(1925条数据)。训练集和测试集同样都有v0-v37一共38个属性,其中训练集还有target属性即需要预测的结果。所有的数据都为float型且无空值,可以直接用于LinearRegression训练。
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Features are every column except the last ('target'); the label is 'target'.
X = df_train_data.iloc[:, :-1]
y = df_train_data['target']

# Hold out 25% of the training data as a validation split
# (fixed seed so the split is reproducible).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=7)

# Fit an ordinary least-squares linear regression on the training split.
linreg = LinearRegression()
linreg.fit(X_train, y_train)

# Predict on the held-out split and report the fitted parameters.
X_test_predict = linreg.predict(X_test)
print('Params intercept_ are :\n', linreg.intercept_)
print('Params coef_ are :\n', linreg.coef_)
接下来使用 k 折交叉验证,扫描不同折数下交叉验证得分的均值与方差,寻找均值高且方差小的折数 k:
from sklearn.model_selection import KFold, cross_val_score
import numpy as np

# Sweep the fold count k from 2 to 19 and record the mean and variance of the
# cross-validation scores, to pick a k with high mean score and low variance.
mean = []
var = []
for n_folds in range(2, 20):
    # BUGFIX: pass the KFold *object* as cv.  The original code passed
    # KFold(...).get_n_splits(X_train), which just returns the integer
    # n_folds, so cv received a plain int and the shuffle=True /
    # random_state=1 settings were silently discarded (sklearn then built
    # its own unshuffled KFold).
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=1)
    # cross_val_score's default scoring for a regressor is R^2 (higher is
    # better) — the original variable name `mse` was misleading.
    scores = cross_val_score(linreg, X_train, y_train, cv=kf)
    mean.append(scores.mean())
    var.append(scores.var())

# Scatter-plot mean vs variance of the CV scores for visual inspection.
df_data = pd.DataFrame({'mean': mean, 'var': var})
df_data.plot(kind='scatter', x='mean', y='var', marker='x',
             facecolors='pink', label=('mean', 'var'))
得出在 k 为 5 的时候得分最高。用 k=5 在留出的验证集上评估误差,再对官方测试集进行预测得出最终结果,并保存到文件中:
# Re-evaluate with the chosen k=5 on the held-out validation split, then
# predict the official test set and save the result.
# BUGFIX: pass the KFold object itself as cv — get_n_splits() returns the
# integer 5, which made sklearn ignore shuffle/random_state.
kf = KFold(n_splits=5, shuffle=True, random_state=1)
# BUGFIX: request MSE explicitly.  cross_val_score's default scoring for a
# regressor is R^2, so the original printout labelled R^2 values as "MSE".
# sklearn returns *negated* MSE (greater-is-better convention), hence the
# leading minus sign.
mse = -cross_val_score(linreg, X_test, y_test, cv=kf,
                       scoring='neg_mean_squared_error')
print('MSE is :', mse)

# Predict the competition test set with the trained model and save one value
# per line, rounded to 3 decimals, as required by the submission format.
res = linreg.predict(df_test_data)
print(res)
np.savetxt('D:\\Dataset\\Tianchi\\zhengqi_res.txt', res, fmt="%.3f", delimiter="\n")