使用 statsmodels 进行最小二乘法预测房租价格
# utf-8
import requests
from bs4 import BeautifulSoup
import lxml
import os
import csv
import pandas
import numpy as np
import matplotlib.pyplot as plt
import time
import patsy
import statsmodels.api as sm
# Base URL of the rental listing pages; the page number is appended to it.
url = 'https://tj.xxxxxx.com/zufang/pg'
# Data preparation: write the downloaded listings to a CSV file.
def write_to_csv(value_list):
    """Write scraped listing rows to ``data.csv`` in the current directory.

    The file is overwritten on every call and always starts with the
    header row.

    Args:
        value_list: Iterable of pages, where each page is an iterable of
            rows (one row per listing, in the same order as the header).
    """
    current_dir = os.path.abspath('.')
    file_name = os.path.join(current_dir, "data.csv")
    header = ['标题', '小区名称', '户型', '面积', '价格', '区域', '是否邻近地铁', '供暖方式']
    # newline='' lets the csv module control line endings (otherwise blank
    # rows appear on Windows); ``with`` closes the file even if a write
    # raises part-way through.
    with open(file_name, 'w', encoding='utf-8', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=",")
        writer.writerow(header)
        for item_list in value_list:
            writer.writerows(item_list)
# Data preparation: download one listing page and extract its fields.
def _optional_text(tag):
    """Return the stripped text of *tag*, or '' when the tag is absent."""
    return '' if tag is None else tag.text.strip()


def get_proced_data(url):
    """Fetch one rental-listing page and parse every listing on it.

    Args:
        url: Full URL of the listing page to download.

    Returns:
        A list with one entry per ``<li>`` of the ``house-lst`` element,
        each of the form ``[title, community, layout, area, price,
        district, subway, heating]`` (all strings). ``subway`` and
        ``heating`` are empty strings when the listing has no such tag.
        An empty list is returned when the page has no ``house-lst``
        element.

    Raises:
        requests.RequestException: on network failure, timeout, or an
            HTTP error status.
    """
    # A timeout keeps a stalled server from hanging the scraper forever.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'lxml')
    ul = soup.find(id='house-lst')
    if ul is None:
        # No listing container on this page; nothing to parse.
        return []

    value_list = []
    for item in ul.find_all('li'):
        title = item.find('div', attrs={"class": 'info-panel'}).h2.text.strip()

        # Community name, layout and area all live under the same "where" div.
        where = item.find('div', attrs={'class': 'where'})
        # Community name
        xiaoqu_name = where.find('span', attrs={'class': 'region'}).text.strip()
        # Layout, e.g. "4室1厅"
        guimo = where.find('span', attrs={'class': 'zone'}).text.strip()
        # Floor area with the "平米" unit suffix removed
        area = where.find('span', attrs={'class': 'meters'}).text.strip()
        area = area.replace('平米', '')

        # Price
        price_box = item.find('div', attrs={'class': 'col-3'}).find('div', attrs={'class': 'price'})
        price = price_box.find('span', attrs={'class': 'num'}).text.strip()

        # District
        zone = item.find('div', attrs={'class': 'con'}).a.text.strip()

        # Optional tags: near-subway marker and heating type.
        subway = _optional_text(item.find('span', attrs={'class': 'fang-subway-ex'}))
        heating = _optional_text(item.find('span', attrs={'class': 'heating-ex'}))

        value_list.append([title, xiaoqu_name, guimo, area, price, zone, subway, heating])
    return value_list
# Data preparation entry point.
def download_data_handle():
    """Download listing pages 1 through 9 and save them to the CSV file.

    Each page is fetched with ``get_proced_data``; a progress line is
    printed and the loop sleeps 3 seconds after every page. The collected
    pages are then passed to ``write_to_csv``.
    """
    pages = []
    for page_no in range(1, 10):
        page_rows = get_proced_data(url + str(page_no))
        pages.append(page_rows)
        print('正在处理第' + str(page_no) + "页")
        time.sleep(3)
    write_to_csv(pages)
# Data preprocessing.
def data_preprocess():
    """Load ``data.csv`` and fill in the two optional tag columns.

    Missing values (NaN) in the subway column become '不邻近地铁' and
    missing values in the heating column become '自采暖'.

    Returns:
        The loaded DataFrame with both columns filled.
    """
    frame = pandas.read_csv('data.csv')
    fallbacks = (
        ('是否邻近地铁', '不邻近地铁'),
        ('供暖方式', '自采暖'),
    )
    for column, fallback in fallbacks:
        frame[column] = frame[column].replace({np.nan: fallback})
    return frame
# Compute the model output for one sample.
def predict(params, sample_input):
    """Evaluate a fitted linear model on a single sample.

    Args:
        params: pandas Series of coefficients keyed by term name. An
            ``'Intercept'`` entry, if present, is used as the constant
            term; models without one are treated as having intercept 0.
        sample_input: pandas Series of feature values keyed by term name.
            Terms in ``params`` that are missing from it contribute
            nothing; any ``'Intercept'`` entry in it is ignored.

    Returns:
        The intercept plus the sum of coefficient * value over the terms
        present in both inputs.
    """
    coefficients = params.to_dict()
    features = sample_input.to_dict()
    # The intercept is added once at the end, never multiplied by a feature.
    weighted = sum(
        float(coef) * float(features[term])
        for term, coef in coefficients.items()
        if term != 'Intercept' and term in features
    )
    return weighted + coefficients.get('Intercept', 0.0)
# Fit the regression and print a prediction for one example listing.
def data_analyze(df):
    """Fit an OLS model of price on listing features and print a prediction.

    The model regresses the price column on the subway flag, community
    name, area and heating type. After fitting, one hard-coded example
    listing is encoded as a row of the design matrix and its predicted
    price is printed.

    Args:
        df: DataFrame containing at least the columns '价格',
            '是否邻近地铁', '小区名称', '面积' and '供暖方式'.
    """
    df = df[['价格', '是否邻近地铁', '小区名称', '面积', '供暖方式']]
    f = "价格~是否邻近地铁+小区名称+面积+供暖方式"
    y, X = patsy.dmatrices(f, data=df, return_type='dataframe')
    results = sm.OLS(y, X).fit()
    # Announce completion only after the fit has actually run.
    print('学习完成,开始预测')

    # Example listing, written as patsy treatment-coding level suffixes.
    subway = "[T.近地铁]"
    heatway = "[T.集中供暖]"
    area = 11
    zone = "[T.都市桃源]"
    area_str = " 面积:" + str(area)
    print('条件: ' + subway + " " + heatway + " " + area_str + " " + "小区名称:" + zone)

    # Start from an all-zero design row and switch on the matching dummies.
    sample = pandas.Series(0.0, index=X.columns)
    dummy_terms = (
        "小区名称" + zone,
        "是否邻近地铁" + subway,
        "供暖方式" + heatway,
    )
    for term in dummy_terms:
        # A term missing from the design matrix has no coefficient (it is
        # the baseline level or never occurred in the data), so skip it
        # instead of appending a stray entry.
        if term in sample.index:
            sample[term] = 1
    sample["面积"] = area

    result = predict(results.params, sample)
    print(result)
if __name__ == '__main__':
    # Scrape only when no data.csv exists in the working directory;
    # otherwise reuse the existing file.
    if not os.path.exists('data.csv'):
        download_data_handle()
    print('数据装载完成,开始数据预处理')
    df = data_preprocess()
    print('数据预处理完成,开始分析')
    data_analyze(df)