一、赛题描述
通过各种传感器获取了各项数据如温度、湿度、压力等等。
二、赛题目标
根据采集的数据预测产量。(所有数据经过脱敏处理)
三、数据说明
数据分为训练数据集和测试数据集,其中测试集没有target字段。需要利用训练数据集对模型进行训练,然后用训练好的模型对测试数据集预测目标变量target。
四、评估指标
均方误差MSE作为评判标准
**原始数据提取:** 链接: https://pan.baidu.com/s/1obZvmKnWmLRoW4Dq6Joj1g 提取码: m562
1.导入数据并读取前五行数据观察数据
2.通过箱线图查看是否存在异常值
3.接下来我们找出所有的异常值并把它们放到一个列表中
4.均值替换异常值
5.数据分割,划分训练集和测试集
6.随机森林建模与预测
最后上全部代码:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import Lasso,LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
import numpy as np
# Load the training data once.
# The string literals in this section originally used typographic quotes
# (‘ ’), which Python rejects with a SyntaxError; plain ASCII quotes are used.
TRAIN_PATH = 'C:/Users/86137/Desktop/互联网+行业训练/项目数据/train.txt'
b = pd.read_table(TRAIN_PATH)

# First five rows, kept only as a quick preview of the data. Taking them from
# `b` avoids parsing the same file a second time.
nn = b.head(5).copy()

# Frame used for the outlier analysis. It must be the full data set: slicing
# the five-row preview (as `nn.iloc[0:38]` did) means the box plots and all
# later steps only ever see five samples.
data1 = b.copy()
print(data1.head())

# Box plot drawn through pandas; return_type='dict' hands back the matplotlib
# artists, whose 'fliers' entries hold the outlier points of each column.
plt.figure(figsize=(12, 8))
p = data1.boxplot(return_type='dict')
# print(p)

# Same data drawn through matplotlib directly on a wider canvas. The tick
# labels are set with xticks() rather than the `labels=` keyword, which was
# renamed to `tick_labels` in Matplotlib 3.9; boxes sit at positions 1..N.
plt.figure(figsize=(18, 10))
plt.boxplot(x=data1.values)
plt.xticks(range(1, len(data1.columns) + 1), data1.columns)
plt.show()
# Collect the outliers reported by the pandas box plot and replace each one
# with the mean of the column it belongs to.
#
# Reads:  `p` (dict returned by data1.boxplot(return_type='dict')) and `data1`.
# Writes: `d` (flat list of every outlier value), `t` (mean of V0) and
#         `data2` (copy of data1 with the outliers replaced).
l = []  # not used anywhere in the visible script; kept so the name still exists
d = []

# Only the first 38 columns are cleaned, as in the original loop bound.
# Zipping columns with flier artists also stops early instead of raising
# IndexError when the frame has fewer columns.
# Assumes every column of data1 is numeric, so that the i-th flier artist
# belongs to the i-th column - TODO confirm against DataFrame.boxplot.
N_FEATURES = 38
outliers_by_column = {}
for column, flier in zip(data1.columns[:N_FEATURES], p['fliers']):
    # get_ydata() is a NumPy array; tolist() yields plain Python floats.
    values = flier.get_ydata().tolist()
    outliers_by_column[column] = values
    d.extend(values)
print(d)

# Mean of V0, kept because the original defined it. It is no longer used as
# the replacement value for every column.
t = data1['V0'].mean()
# print(t)

# Replace outliers column by column. Substituting each outlier value anywhere
# in the frame with the mean of V0 would overwrite legitimate values in other
# columns that happen to be equal, and would fill every column with a V0
# statistic.
data2 = data1.copy()
for column, values in outliers_by_column.items():
    if not values:
        continue
    # Exact matching is safe: the flier values are taken from this very column.
    is_outlier = data2[column].isin(values)
    data2[column] = data2[column].mask(is_outlier, data1[column].mean())
# print(data2)
# Data split helper import and per-column distribution plots.
from sklearn.model_selection import train_test_split

# A train_test_split(train_x, train_y, ...) call used to sit here, but neither
# name is defined anywhere, so it raised NameError. The actual split is done
# in the modelling section once x and y exist.

# NOTE(review): both frames are row slices of data2 (first 38 rows vs. all
# rows but the last), so they overlap and the two curves per subplot compare
# nearly the same samples - confirm what comparison was intended.
train_data = data2.iloc[0:38]
train_target = data2.iloc[0:-1]
# print(train_data.head())
# print(train_target.head())

# plt.figure() instead of plt.subplots(): the latter also creates one
# full-size axes that ends up hidden underneath the grid drawn below.
fig = plt.figure(figsize=(30, 20))

# Eight plots per row; at least five rows as before, more if data2 has more
# than 40 columns (plt.subplot raises ValueError for an index past the grid).
n_cols = 8
n_rows = max(5, -(-len(data2.columns) // n_cols))
for j, cols in enumerate(data2.columns, start=1):
    ax = plt.subplot(n_rows, n_cols, j)
    # histplot(kde=True, stat='density') is the documented replacement for the
    # deprecated sns.distplot (histogram plus density curve).
    sns.histplot(train_data[cols], kde=True, stat='density', ax=ax)
    sns.histplot(train_target[cols], kde=True, stat='density', ax=ax)
# Without show() the figure is never displayed when run as a script.
plt.show()
# Drop uninformative features and try a model.
# Optional feature pruning, left disabled as in the original:
# data2.drop(['V5', 'V9', 'V14', 'V12', 'V13', 'V20', 'V22', 'V27', 'V30', 'V31', 'V33', 'V35'], axis=1, inplace=True)

# Modelling
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb  # not used in the visible part of the script
# train_test_split is imported here as well so this section does not depend on
# an import made further up in the script.
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error

# Separate features from the label. The data set description names the label
# column `target`; fall back to the last column if that name is absent.
# Row-slicing data2 for both x and y would make them identical frames, so the
# label would be part of the features and the model would be fitted to
# reproduce its own input. It would also silently drop the last sample.
target_column = 'target' if 'target' in data2.columns else data2.columns[-1]
x = data2.drop(columns=[target_column])
y = data2[target_column]
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=0
)

# Random forest baseline, seeded so the reported MSE is reproducible.
m1 = RandomForestRegressor(random_state=0)
m1.fit(x_train, y_train)

# Mean squared error on the held-out 30 %.
scorel = mean_squared_error(y_test, m1.predict(x_test))
print(scorel)