数据
数据下载地址 http://archive.ics.uci.edu/ml/datasets/Bike+Sharing+Dataset
案例分析
1.拿到这个数据需要删除那些没有用的列,比如序号
2.进行独热编码
3.进行多项式扩展
4.标准化
5.切分数据集
6.建立模型(LinearRegression 和 Ridge),并在测试集上计算模型得分(R²)
代码如下
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures, StandardScaler
# Bike-sharing regression pipeline: load the hourly CSV, one-hot encode the
# categorical columns, polynomial-expand and standardize the numeric columns,
# then fit LinearRegression and Ridge and print their test-set scores.

# Load the data.
path = '../../datas/hour.csv'
data = pd.read_csv(path)

# Drop the columns that are not used as features.
data.drop(columns=["instant", "dteday", "casual", "registered"], inplace=True)

# Column groups, named once so the "select" and "drop" steps cannot drift apart.
onehot_cols = ["season", "mnth", "hr", "weekday"]
poly_cols = ["weathersit", "temp", "atemp", "hum", "windspeed"]

# One-hot encoding.
# The encoder is left at its default (sparse) output and densified with
# .toarray(): the `sparse=` keyword was renamed to `sparse_output=` in
# scikit-learn 1.2 and later removed, so passing either one ties the script to
# a version range. The columns get string names from the encoder because
# recent scikit-learn rejects feature frames with mixed int/str column labels.
hotCoder = OneHotEncoder()
hot = pd.DataFrame(
    hotCoder.fit_transform(data[onehot_cols]).toarray(),
    columns=hotCoder.get_feature_names_out(),
    index=data.index,
)
# Remove the raw columns that were just one-hot encoded.
data.drop(columns=onehot_cols, inplace=True)

# Polynomial expansion (degree 2, squares and pairwise products, no bias term).
ployCoder = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
ploy_values = ployCoder.fit_transform(data[poly_cols])
# get_feature_names() was removed in scikit-learn 1.2; use the *_out variant.
ploy_names = ployCoder.get_feature_names_out()

# Standardization of the expanded features.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics influence the scaling. Kept as in the original
# pipeline; consider fitting on the training split only.
ssCoder = StandardScaler()
ploy = pd.DataFrame(
    ssCoder.fit_transform(ploy_values),
    columns=ploy_names,
    index=data.index,
)
# Remove the raw columns that were just expanded.
data.drop(columns=poly_cols, inplace=True)

# Merge: encoded columns, expanded columns, then whatever is left in `data`.
df = pd.concat([hot, ploy, data], axis=1)

# Build X / Y: the last column is the target (presumably `cnt`, the last
# column of hour.csv - confirm against the file), everything else is a feature.
X = df.iloc[:, :-1]
Y = df.iloc[:, [-1]]

# Split into train and test sets (30% test, fixed seed for repeatability).
train_x, test_x, train_y, test_y = train_test_split(X, Y, test_size=0.3, random_state=2)

# Train the LinearRegression model and report its score on the test set.
model = LinearRegression()
model.fit(train_x, train_y)
print('LinearRegression精度为:', model.score(test_x, test_y))

# Train the Ridge model and report its score on the test set.
model = Ridge()
model.fit(train_x, train_y)
print('Ridge精度为:', model.score(test_x, test_y))
效果
LinearRegression精度为: 0.695209732049
Ridge精度为: 0.695128876927