import xlrd
import numpy as np
import re
xls_file = "/Users/zhuji/Desktop/链家二手房房源信息采集.xlsx"
book = xlrd.open_workbook(xls_file)
sheet1 = book.sheet_by_index(0)
nrows = sheet1.nrows
shuju = []
need_number = [4, 7, 8, 9, 10, 11]
for i in need_number:
shuju.append([])
for i in range(1,nrows-5):
for j in range(6):
cell_value = sheet1.cell_value(i, need_number[j])
shuju[j].append(cell_value)
for i in range(len(shuju[1])):
shuju[0][i] = float(re.sub(r'元/平米', '', shuju[0][i]))
jj = re.search(r'精装', shuju[2][i])
if jj != None:
shuju[5][i] = 1.0
else:
shuju[5][i] = 0.0
jj = re.search(r'平层', shuju[2][i])
if jj != None:
shuju[2][i] = 0.0
else:
shuju[2][i] = 1.0
chaoxiang = re.search(r'南', shuju[1][i])
if chaoxiang != None:
shuju[1][i] = 1.0
else:
shuju[1][i] = 0.0
shuju[3][i] = float(re.sub(r'平米', '', shuju[3][i]))
shuju[4][i] = float(re.search(r'[0-9]*', shuju[4][i]).group())
b, w1, w2, w3, w4, w5, w6, w7 = 1, 1, 1, 1, 1, 1, 1, 1
lr = 1
iteration = 10000000
lr_b, lr_w1, lr_w2, lr_w3, lr_w4, lr_w5, lr_w6,lr_w7,= 0, 0, 0, 0, 0, 0,0,0
for i in range(iteration):
b_grad, w1_grad, w2_grad, w4_grad, w3_grad, w5_grad, w6_grad, w7_grad = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
for n in range(len(shuju[1])):
b_grad=b_grad-2.0*(shuju[0][n] - b - w1*shuju[1][n] - w2*shuju[2][n]- w3*shuju[3][n]/10 - w4*shuju[4][n]/100 - w5*shuju[5][n])*1.0
w1_grad=w1_grad-2.0*(shuju[0][n] - b - w1*shuju[1][n] - w2*shuju[2][n]- w3*shuju[3][n]/10 - w4*shuju[4][n]/100 - w5*shuju[5][n]\
-w6*(shuju[3][n]/10)**2 - w7*(shuju[4][n]/10)**2)*shuju[1][n]
w2_grad=w2_grad-2.0*(shuju[0][n] - b - w1*shuju[1][n] - w2*shuju[2][n]- w3*shuju[3][n]/10 - w4*shuju[4][n]/100 - w5*shuju[5][n]\
-w6*(shuju[3][n]/10)**2 - w7*(shuju[4][n]/10)**2)*shuju[2][n]
w3_grad=w3_grad-2.0*(shuju[0][n] - b - w1*shuju[1][n] - w2*shuju[2][n]- w3*shuju[3][n]/10 - w4*shuju[4][n]/100 - w5*shuju[5][n]\
-w6*(shuju[3][n]/10)**2 - w7*(shuju[4][n]/10)**2)*shuju[3][n]/10
w4_grad=w4_grad-2.0*(shuju[0][n] - b - w1*shuju[1][n] - w2*shuju[2][n]- w3*shuju[3][n]/10 - w4*shuju[4][n]/100 - w5*shuju[5][n]\
-w6*(shuju[3][n]/10)**2 - w7*(shuju[4][n]/10)**2)*shuju[4][n]/100
w5_grad=w5_grad-2.0*(shuju[0][n] - b - w1*shuju[1][n] - w2*shuju[2][n]- w3*shuju[3][n]/10 - w4*shuju[4][n]/100 - w5*shuju[5][n]\
-w6*(shuju[3][n]/10)**2 - w7*(shuju[4][n]/10)**2)*shuju[5][n]
w6_grad=w6_grad-2.0*(shuju[0][n] - b - w1*shuju[1][n] - w2*shuju[2][n]- w3*shuju[3][n]/10 - w4*shuju[4][n]/100 - w5*shuju[5][n]\
-w6*(shuju[3][n]/10)**2 - w7*(shuju[4][n]/10)**2)*(shuju[3][n]/10)**2
w7_grad=w7_grad-2.0*(shuju[0][n] - b - w1*shuju[1][n] - w2*shuju[2][n]- w3*shuju[3][n]/10 - w4*shuju[4][n]/100 - w5*shuju[5][n]\
-w6*(shuju[3][n]/10)**2 - w7*(shuju[4][n]/10)**2)*(shuju[4][n]/100)**2
lr_b = lr_b + b_grad**2
lr_w1 = lr_w1 + w1_grad**2
lr_w2 = lr_w2 + w2_grad**2
lr_w3 = lr_w3 + w3_grad**2
lr_w4 = lr_w4 + w4_grad**2
lr_w5 = lr_w5 + w5_grad**2
lr_w6 = lr_w6 + w6_grad**2
lr_w7 = lr_w7 + w7_grad**2
b = b - (lr/np.sqrt(lr_b))*b_grad
w1 = w1 - (lr/np.sqrt(lr_w1))*w1_grad
w2 = w2 - (lr/np.sqrt(lr_w2))*w2_grad
w3 = w3 - (lr/np.sqrt(lr_w3))*w3_grad
w4 = w4 - (lr/np.sqrt(lr_w4))*w4_grad
w5 = w5 - (lr/np.sqrt(lr_w5))*w5_grad
w6 = w6 - (lr/np.sqrt(lr_w6))*w6_grad
w7 = w7 - (lr/np.sqrt(lr_w7))*w7_grad
print(i,b,w1,w2,w3,w4,w5,w6,w7)
print(b, w1, w2, w3, w4, w5)
z=0
for n in range(len(shuju[0])):
z = z + (shuju[0][n] - b - w1*shuju[1][n] - w2*shuju[2][n]- w3*shuju[3][n]/10 - w4*shuju[4][n]/100 - w5*shuju[5][n]\
-w6*(shuju[3][n]/10)**2 - w7*(shuju[4][n]/10)**2)**2
z=z/len(shuju[0])
print(z)
线性方程考虑房屋朝向(南向记为1)、房屋平层跃层(跃层记为1)、房屋总面积、房屋建筑年份、房屋精简装情况(精装记为1)这五个主要参数,同时对房屋总面积、房屋建筑年份这两个参数进行二阶拟合。从你和数据可以发现,房屋价格与建筑年份的关联性相对较低,对于平均价格的影响在400元一下,建筑总面积的影响系数最大。
根据以上结果,对公式进行进一步分析,删除年份二次项,重新拟合。