包含全部示例的代码仓库见 GitHub
1 导入库
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
2 数据准备
# Load the seven Lianjia CSV exports and stack them into a single frame.
data_list = []
for i in range(1, 8):
    path = './dataset/lianjia{}.csv'.format(i)
    try:
        # Try GBK first.
        frame = pd.read_csv(path, encoding='gbk')
    except UnicodeDecodeError:
        # Not GBK: fall back to pandas' default encoding (UTF-8).
        frame = pd.read_csv(path)
    # Append only after a successful read, so a failed file can never
    # re-append the frame left over from the previous iteration.
    data_list.append(frame)
data = pd.concat(data_list)
# Discard every row that has at least one missing value.
data = data.dropna()
data.info()
# output
<class 'pandas.core.frame.DataFrame'>
Int64Index: 144531 entries, 0 to 6659
Data columns (total 14 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 cjtaoshu 144531 non-null int64
1 mendian 144531 non-null object
2 cjzongjia 144531 non-null float64
3 zhiwei 144531 non-null object
4 haoping 144531 non-null object
5 cjdanjia 144531 non-null object
6 cjxiaoqu 144531 non-null object
7 xingming 144531 non-null object
8 cjzhouqi 144531 non-null object
9 biaoqian 144531 non-null object
10 cjlouceng 144531 non-null object
11 cjshijian 144531 non-null object
12 congyenianxian 144531 non-null object
13 bankuai 144531 non-null object
dtypes: float64(1), int64(1), object(12)
memory usage: 16.5+ MB
# Preview the first five rows of the combined frame.
data.head(n=5)
# output
cjtaoshu mendian cjzongjia zhiwei haoping cjdanjia cjxiaoqu xingming cjzhouqi biaoqian cjlouceng cjshijian congyenianxian bankuai
0 37 红莲北里店 251.0 店经理 97% 141 43997元/平 红莲北里 3室1厅 57平 郭海龙 36 房东信赖;销售达人;带看活跃 南 北/高楼层/6层 签约时间:2015-05-24 4-5年 马连道
1 37 红莲北里店 159.0 店经理 97% 141 36969元/平 红莲南里 1室1厅 43平 郭海龙 36 房东信赖;销售达人;带看活跃 南/高楼层/7层 签约时间:2015-05-10 4-5年 马连道
2 37 红莲北里店 257.0 店经理 97% 141 39046元/平 常青藤嘉园 1室1厅 65平 郭海龙 36 房东信赖;销售达人;带看活跃 北/低楼层/16层 签约时间:2015-04-26 4-5年 马连道
3 37 红莲北里店 243.0 店经理 97% 141 41313元/平 红莲北里 2室1厅 58平 郭海龙 36 房东信赖;销售达人;带看活跃 南 北/高楼层/6层 签约时间:2015-04-04 4-5年 马连道
4 37 红莲北里店 372.5 店经理 97% 141 42053元/平 广安门外大街 3室1厅 88平 郭海龙 36 房东信赖;销售达人;带看活跃 东 南 西 北/中楼层/18层 签约时间:2015-04-01 4-5年 马连道
# Strip the "元/平" suffix, convert to float32, and rescale the unit price
# by 1/10000, rounded to two decimals.
unit_price = data.cjdanjia.str.replace('元/平', '').astype(np.float32)
scaled_price = unit_price.map(lambda price: price / 10000)
data.cjdanjia = np.round(scaled_price, 2)
data.cjdanjia
# output
0 4.40
1 3.70
2 3.90
3 4.13
4 4.21
...
6653 2.28
6656 8.10
6657 1.91
6658 1.21
6659 1.18
# Walk over the raw signing-time strings to inspect them: print each value's
# type and the text after its first five characters (the date part, as the
# sample output shows).
for value in data.cjshijian:
    print(type(value))
    print(value[5:])
# output
<class 'str'>
2015-05-24
<class 'str'>
2015-05-10
<class 'str'>
2015-04-26
# Drop the five-character label prefix, then parse the rest as a date.
date_text = data.cjshijian.map(lambda raw: raw[5:])
data.cjshijian = pd.to_datetime(date_text)
data.cjshijian
# output
0 2015-05-24
1 2015-05-10
2 2015-04-26
3 2015-04-04
4 2015-04-01
...
6653 2016-07-13
6656 2016-09-11
6657 2016-04-10
6658 2016-01-29
6659 2015-12-20
# Keep only listings whose community name contains "远洋山水", ordered by
# signing date.
in_community = data.cjxiaoqu.str.contains('远洋山水')
data = data[in_community].sort_values(by='cjshijian')
# Index by signing date and keep rows from 2012 onwards.
data = data.set_index('cjshijian').loc['2012':]
plt.figure(figsize=(10, 8))
plt.scatter(data.index, data.cjdanjia)
# Drop rows whose scaled unit price is not above 1.
price_above_one = data.cjdanjia > 1
data = data[price_above_one]
# Whole days elapsed since 2012-01-01 for every remaining row.
(data.index - pd.to_datetime('2012-01-01')).days
# output
Int64Index([ 6, 6, 43, 45, 49, 50, 50, 50, 50, 55,
...
1727, 1728, 1729, 1729, 1729, 1759, 1763, 1763, 1765, 1771],
dtype='int64', name='cjshijian', length=481)
处理成时间序列
# Add a "time" column holding the number of days since 2012-01-01, then
# keep only the two columns used for modelling.
origin = pd.to_datetime('2012-01-01')
days_since_origin = (data.index - origin).days
data = data.assign(time=days_since_origin)
data = data[['time', 'cjdanjia']]
data
# output
time cjdanjia
cjshijian
2012-01-07 6 2.54
2012-01-07 6 2.54
2012-02-13 43 2.22
2012-02-15 45 2.06
2012-02-19 49 2.23
... ... ...
2016-10-25 1759 6.43
2016-10-29 1763 6.47
2016-10-29 1763 6.62
2016-10-31 1765 6.60
2016-11-06 1771 8.22
# Feature (day counter) and target (scaled unit price) used by every model.
X = data.time
Y = data.cjdanjia
# Scatter of the raw observations.
plt.figure(figsize=(10, 8))
plt.scatter(X, Y)
3 模型构建
3.1 y = ax+b
# Straight-line fit. The Series is converted to a one-column DataFrame
# because the estimator expects 2-D input.
features = X.to_frame()
model = LinearRegression().fit(features, Y)
# Two endpoints are enough to draw the fitted line.
x = [[0], [1800]]
type(x)
# output
list
# Predict at the two endpoints and draw the fitted line over the observations.
endpoints = pd.DataFrame(x)
y = model.predict(endpoints)
plt.figure(figsize=(10, 8))
plt.plot(x, y, color='r')
plt.scatter(data.time, data.cjdanjia)
3.2 y = ax²+bx+c
# Quadratic fit: feed the squared and linear terms as two explicit columns.
train_features = pd.DataFrame({'x2': X * X, 'x1': X})
model2 = LinearRegression()
model2.fit(train_features, Y)
# Evaluate the fitted curve on an evenly spaced grid from 0 to 1800.
x = np.linspace(0, 1800)
grid_features = pd.DataFrame({'x2': x * x, 'x1': x})
y = model2.predict(grid_features)
plt.figure(figsize=(10, 8))
plt.plot(x, y, color='r')  # curve drawn from the grid ("test") points
plt.scatter(data.time, data.cjdanjia)
训练数据绘图
# Compare prediction shapes for the training points and for the grid.
train_features = pd.DataFrame({'x2': X * X, 'x1': X})
grid_features = pd.DataFrame({'x2': x * x, 'x1': x})
model2.predict(train_features).shape
model2.predict(grid_features).shape
# Draw the curve using predictions at the training points themselves.
y = model2.predict(pd.DataFrame({'x2': X * X, 'x1': X}))
plt.figure(figsize=(10, 8))
plt.plot(X, y, color='r')
plt.scatter(data.time, data.cjdanjia)
3.3 y = ax³+bx²+cx+d
# Cubic fit: expand the day counter into polynomial terms up to degree 3,
# then run ordinary linear regression on the expanded features.
q3 = PolynomialFeatures(degree=3)
X3 = q3.fit_transform(pd.DataFrame(X))
model3 = LinearRegression()
model3.fit(X3, Y)
# Evaluate the fitted curve on an evenly spaced grid from 0 to 1800.
x = np.linspace(0, 1800)
# Reuse the already-fitted transformer (transform, not fit_transform).
# The grid frame carries the training column name so the features match.
x_ = q3.transform(pd.DataFrame({X.name: x}))
y = model3.predict(x_)
plt.figure(figsize=(10, 8))
plt.plot(x, y, color='r')
plt.scatter(data.time, data.cjdanjia)
3.4 4阶多项式
# Quartic fit: expand the day counter into polynomial terms up to degree 4,
# then run ordinary linear regression on the expanded features.
q4 = PolynomialFeatures(degree=4)
X4 = q4.fit_transform(pd.DataFrame(X))
model4 = LinearRegression()
model4.fit(X4, Y)
# Evaluate the fitted curve on an evenly spaced grid from 0 to 1800.
x = np.linspace(0, 1800)
# Reuse the already-fitted transformer (transform, not fit_transform).
# The grid frame carries the training column name so the features match.
x_ = q4.transform(pd.DataFrame({X.name: x}))
y = model4.predict(x_)
plt.figure(figsize=(10, 8))
plt.plot(x, y, color='r')
plt.scatter(data.time, data.cjdanjia)
4 测试模型
# Chronological hold-out: train on everything through April 2016 and test on
# May 2016 onwards. Label slices on a DatetimeIndex include both endpoints,
# so ending the training slice at '2016-5' would place May 2016 in both sets.
# NOTE: the error figures recorded below were produced with the earlier
# overlapping split and will differ after re-running.
X_train, X_test = X[:'2016-4'], X['2016-5':]
Y_train, Y_test = Y[:'2016-4'], Y['2016-5':]
# Degree-3 model: fit on the training window, score on the held-out window.
q3 = PolynomialFeatures(degree=3)
X3 = q3.fit_transform(pd.DataFrame(X_train))
model3 = LinearRegression()
model3.fit(X3, Y_train)
# Only transform the test set; the transformer is fitted on training data.
X3_ = q3.transform(pd.DataFrame(X_test))
# Sum of squared errors on the held-out data.
np.sum((model3.predict(X3_) - Y_test)**2)
# output
78.51184688848316
# Degree-4 model: fit on the training window, score on the held-out window.
q4 = PolynomialFeatures(degree=4)
X4 = q4.fit_transform(pd.DataFrame(X_train))
model4 = LinearRegression()
model4.fit(X4, Y_train)
# Only transform the test set; the transformer is fitted on training data.
X4_ = q4.transform(pd.DataFrame(X_test))
# Sum of squared errors on the held-out data.
np.sum((model4.predict(X4_) - Y_test)**2)
# output
68.29625037920916
# Degree-5 model: fit on the training window, score on the held-out window.
q5 = PolynomialFeatures(degree=5)
X5 = q5.fit_transform(pd.DataFrame(X_train))
model5 = LinearRegression()
model5.fit(X5, Y_train)
# Only transform the test set; the transformer is fitted on training data.
X5_ = q5.transform(pd.DataFrame(X_test))
# Sum of squared errors on the held-out data.
np.sum((model5.predict(X5_) - Y_test)**2)
# output
140.7305646522754
# Degree-2 model: fit on the training window, score on the held-out window.
q2 = PolynomialFeatures(degree=2)
X2 = q2.fit_transform(pd.DataFrame(X_train))
model2 = LinearRegression()
model2.fit(X2, Y_train)
# Only transform the test set; the transformer is fitted on training data.
X2_ = q2.transform(pd.DataFrame(X_test))
# Sum of squared errors on the held-out data.
np.sum((model2.predict(X2_) - Y_test)**2)
# output
258.2695617180388
5 改变训练集大小
# Shorter training window: January through April 2016, tested on May 2016
# onwards. Label slices on a DatetimeIndex include both endpoints, so ending
# the training slice at '2016-5' would place May 2016 in both sets.
# NOTE: the error figures recorded below were produced with the earlier
# overlapping split and will differ after re-running.
X_train, X_test = X['2016-1':'2016-4'], X['2016-5':]
Y_train, Y_test = Y['2016-1':'2016-4'], Y['2016-5':]
# Straight-line model on the shorter window. The 1-D value arrays are
# reshaped into single-column 2-D arrays for the estimator.
train_col = X_train.values.reshape(-1, 1)
test_col = X_test.values.reshape(-1, 1)
model = LinearRegression().fit(train_col, Y_train)
# Sum of squared errors on the held-out data.
residuals = model.predict(test_col) - Y_test
np.sum(residuals ** 2)
# output
67.65526416726503
# Degree-2 model on the shorter training window.
q2 = PolynomialFeatures(degree=2)
X2 = q2.fit_transform(pd.DataFrame(X_train))
model2 = LinearRegression()
model2.fit(X2, Y_train)
# Only transform the test set; the transformer is fitted on training data.
X2_ = q2.transform(pd.DataFrame(X_test))
# Sum of squared errors on the held-out data.
np.sum((model2.predict(X2_) - Y_test)**2)
# output
343.769340055577
# Degree-3 model on the shorter training window.
q3 = PolynomialFeatures(degree=3)
X3 = q3.fit_transform(pd.DataFrame(X_train))
model3 = LinearRegression()
model3.fit(X3, Y_train)
# Only transform the test set; the transformer is fitted on training data.
X3_ = q3.transform(pd.DataFrame(X_test))
# Sum of squared errors on the held-out data.
np.sum((model3.predict(X3_) - Y_test)**2)
# output
38874.83884770577
# Degree-4 model on the shorter training window.
q4 = PolynomialFeatures(degree=4)
X4 = q4.fit_transform(pd.DataFrame(X_train))
model4 = LinearRegression()
model4.fit(X4, Y_train)
# Only transform the test set; the transformer is fitted on training data.
X4_ = q4.transform(pd.DataFrame(X_test))
# Sum of squared errors on the held-out data.
np.sum((model4.predict(X4_) - Y_test)**2)
# output
203047.92547841667