OLS 预测

最新推荐文章于 2024-09-21 13:22:00 发布

世纪殇

最新推荐文章于 2024-09-21 13:22:00 发布

阅读量1.9k

点赞数 2

分类专栏：统计学习方法

本文链接：https://blog.csdn.net/dasgk/article/details/82391482

版权

统计学习方法专栏收录该内容

2 篇文章 0 订阅

订阅专栏

使用 statsmodels 进行最小二乘法（OLS）回归，预测房租价格

# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import lxml
import os
import csv
import pandas
import numpy as np
import matplotlib.pyplot as plt

import time
import patsy
import statsmodels.api as sm

# Base URL of the paginated rental listings; download_data_handle() appends
# the page number to it to build each page's address.
url = 'https://tj.xxxxxx.com/zufang/pg'


# Data preparation: persist the downloaded listings to a CSV file.
def write_to_csv(value_list):
    """Write the scraped listings to ``data.csv`` in the current directory.

    Any existing ``data.csv`` is overwritten. A fixed header row is written
    first, followed by every row of every page.

    Args:
        value_list: Iterable of pages, where each page is an iterable of
            rows and each row is a sequence of cell values in header order.
    """
    file_name = os.path.join(os.path.abspath('.'), "data.csv")
    header = ['标题', '小区名称', '户型', '面积', '价格', '区域', '是否邻近地铁', '供暖方式']
    # newline='' lets the csv module control line endings itself (otherwise
    # blank rows appear on platforms that translate '\n'); the with-block
    # closes the file even if a write fails part-way through.
    with open(file_name, 'wt', encoding="UTF8", newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=",")
        writer.writerow(header)
        for item_list in value_list:
            writer.writerows(item_list)


# Data preparation: download one listing page and extract its rows.
def _optional_text(item, css_class):
    """Return the stripped text of the first <span class=css_class>, or ''."""
    tag = item.find('span', attrs={'class': css_class})
    return tag.text.strip() if tag is not None else ''


def get_proced_data(url, timeout=10):
    """Fetch one listing page and parse every ``<li>`` under ``#house-lst``.

    Args:
        url: Full URL of the listing page to download.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the whole scrape.

    Returns:
        A list of rows, each
        ``[title, community, layout, area, price, district, subway, heating]``
        as strings. ``subway`` and ``heating`` are ``''`` when the listing
        carries no such tag. Returns ``[]`` when the page has no element with
        id ``house-lst``.

    Raises:
        requests.RequestException: on connection failure, timeout, or an
            HTTP error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'lxml')
    ul = soup.find(id='house-lst')
    if ul is None:
        # No listing container on this page: nothing to extract.
        return []

    value_list = []
    for item in ul.find_all('li'):
        title = item.find('div', attrs={"class": 'info-panel'}).h2.text.strip()

        # The "where" panel holds community name, layout and floor area.
        where = item.find('div', attrs={'class': 'where'})
        # Community (residential compound) name.
        xiaoqu_name = where.find('span', attrs={'class': 'region'}).text.strip()
        # Layout, e.g. "4室1厅".
        guimo = where.find('span', attrs={'class': 'zone'}).text.strip()
        # Floor area with the "平米" unit suffix removed.
        area = where.find('span', attrs={'class': 'meters'}).text.strip()
        area = area.replace('平米', '')

        # Price.
        price = (
            item.find('div', attrs={'class': 'col-3'})
            .find('div', attrs={'class': 'price'})
            .find('span', attrs={'class': 'num'})
            .text.strip()
        )
        # District the listing belongs to.
        zone = item.find('div', attrs={'class': 'con'}).a.text.strip()
        # Optional tags: near-subway marker and heating type.
        subway = _optional_text(item, 'fang-subway-ex')
        heating = _optional_text(item, 'heating-ex')

        value_list.append([title, xiaoqu_name, guimo, area, price, zone, subway, heating])
    return value_list


# Data preparation entry point.
def download_data_handle():
    """Scrape listing pages 1 through 9 and store them in the CSV file.

    Each page is fetched with get_proced_data(), a progress line is printed,
    and the loop then sleeps 3 seconds before the next request. All pages are
    handed to write_to_csv() once the loop is done.
    """
    pages_data = []
    for page_no in range(1, 10):
        page_rows = get_proced_data(url + str(page_no))
        pages_data.append(page_rows)
        print('正在处理第' + str(page_no) + "页")
        # Pause between requests.
        time.sleep(3)
    write_to_csv(pages_data)


# Data preprocessing.
def data_preprocess():
    """Load ``data.csv`` and label the missing subway / heating cells.

    Returns:
        The DataFrame read from ``data.csv`` in the current directory, with
        NaN in the '是否邻近地铁' column replaced by '不邻近地铁' and NaN in
        the '供暖方式' column replaced by '自采暖'.
    """
    frame = pandas.read_csv('data.csv')
    # Column name -> label used where the cell is missing.
    missing_labels = (
        ('是否邻近地铁', '不邻近地铁'),
        ('供暖方式', '自采暖'),
    )
    for column, label in missing_labels:
        frame[column] = frame[column].replace({np.nan: label})
    return frame


# Compute a prediction from fitted coefficients.
def predict(params, sample_input):
    """Evaluate the linear model ``intercept + sum(coef * value)``.

    Args:
        params: Object with ``to_dict()`` (e.g. a pandas Series) mapping
            term name -> fitted coefficient. An ``'Intercept'`` entry, if
            present, is added once as the constant term.
        sample_input: Object with ``to_dict()`` mapping term name -> feature
            value for the sample to predict.

    Returns:
        The predicted value. Terms that appear in ``params`` but not in
        ``sample_input`` contribute nothing.
    """
    params = params.to_dict()
    sample_input = sample_input.to_dict()
    total = 0  # named ``total`` so the builtin ``sum`` is not shadowed
    for key, coefficient in params.items():
        if key == 'Intercept':
            # The intercept is not multiplied by a feature; added below.
            continue
        if key in sample_input:
            total += float(coefficient) * float(sample_input[key])
    # A model fitted without a constant term has no intercept entry.
    total += params.get('Intercept', 0)
    return total


# Data analysis: fit the model and print one sample prediction.
def data_analyze(df):
    """Fit an OLS rent model on ``df`` and print a prediction for one sample.

    The model regresses '价格' on '是否邻近地铁', '小区名称', '面积' and
    '供暖方式'. The sample is hard-coded: near the subway, central heating,
    area 11, community '都市桃源'.

    Args:
        df: DataFrame containing at least the five columns named above.
    """
    df = df[['价格', '是否邻近地铁', '小区名称', '面积', '供暖方式']]
    f = "价格~是否邻近地铁+小区名称+面积+供暖方式"
    y, X = patsy.dmatrices(f, data=df, return_type='dataframe')
    results = sm.OLS(y, X).fit()
    print('学习完成，开始预测')

    # One-column frame indexed by the design-matrix column names, all zeros;
    # the sample is described by switching individual entries on.
    tpdf = pandas.DataFrame(np.zeros(len(X.columns)), index=X.columns, columns=["价格"])

    # Level suffixes in the "[T.level]" form used for the design-matrix
    # column labels (presumably patsy treatment coding -- confirm).
    subway = "[T.近地铁]"
    heatway = "[T.集中供暖]"
    zone = "[T.都市桃源]"
    area = 11
    area_str = " 面积:" + str(area)
    print('条件: ' + subway + " " + heatway + " " + area_str + " " + "小区名称：" + zone)

    # Row labels must be exactly "<column name><level suffix>". The display
    # prefix "小区名称：" must not leak into the label, otherwise .loc appends
    # an unrelated row and the community effect is silently dropped.
    tpdf.loc["小区名称" + zone] = 1
    tpdf.loc["是否邻近地铁" + subway] = 1
    tpdf.loc["供暖方式" + heatway] = 1
    tpdf.loc["面积"] = area
    result = predict(results.params, tpdf['价格'])
    print(result)


if __name__ == '__main__':
    # Scrape only when there is no data.csv from an earlier run.
    if not os.path.exists('data.csv'):
        download_data_handle()
    print('数据装载完成,开始数据预处理')
    df = data_preprocess()
    print('数据预处理完成，开始分析')
    data_analyze(df)