前言:
小白代码,程序可以运行,算法的运行效率可能较低,但用来完成大作业应该没有问题。需要实验报告等资料的可以在评论区留言。
1.对网站进行爬取
首先请求链家重庆二手房列表页(https://cq.lianjia.com/ershoufang/,页面标题为“重庆二手房房源_重庆二手房出售|买卖|交易信息(重庆链家)”),将返回的HTML文本保存在page_text里;再通过xpath把标题、地址、户型信息、总价、单价等字段分别解析到不同的变量中;最后把每套房源整理成一个字典,通过追加的方式保存到a变量里(a变量需要先赋值为一个空的list),后续保存时再由a生成DataFrame。
代码如下:
def _split_house_info(text):
    """Split one '|'-separated house-info string into exactly seven fields.

    The fields are, in order: layout, area, orientation, decoration, floor,
    construction year and building type. A six-field record is treated as
    having no construction year, so an empty string is inserted in that slot.
    Shorter records are padded with empty strings on the right so that a
    single unusual listing cannot abort the whole crawl with an IndexError.
    """
    fields = [part.strip() for part in text.split('|')]
    if len(fields) == 6:
        fields.insert(5, '')
    fields.extend([''] * (7 - len(fields)))
    return fields[:7]


def paqu():
    """Scrape second-hand housing listings from cq.lianjia.com, pages 1-99.

    Every result page is downloaded with ``requests`` (using the module-level
    ``headers`` mapping) and parsed with ``lxml.etree``. All columns are
    collected page by page, so the house-info fields of a listing are always
    zipped with the title, address and prices read from the same page.

    Returns:
        list[dict]: one dict per listing with the keys ``lj_title``,
        ``lj_address_1``, ``lj_address_2``, ``lj_house_hx``, ``lj_house_mj``,
        ``lj_house_cx``, ``lj_house_zx``, ``lj_house_lc``, ``lj_house_nx``,
        ``lj_house_lx``, ``lj_house_total_price`` and ``lj_house_unit_price``;
        every value is a whitespace-stripped string.
    """
    # Common XPath prefix selecting the info container of every listing.
    item = '//*[@id="content"]/div[1]/ul/li/div[1]'
    a = []
    for i in range(1, 100):
        url = 'https://cq.lianjia.com/ershoufang/' + 'pg' + str(i) + '/'
        page_text = requests.get(url=url, headers=headers).text
        tree = etree.HTML(page_text)

        lj_title = tree.xpath(item + '/div[1]/a/text()')  # listing title
        lj_address_1 = tree.xpath(item + '/div[2]/div/a[1]/text()')
        lj_address_2 = tree.xpath(item + '/div[2]/div/a[2]/text()')
        lj_house_all = tree.xpath(item + '/div[3]/div/text()')
        lj_house_total_price = tree.xpath(item + '/div[6]/div[1]/span/text()')
        lj_house_unit_price = tree.xpath(item + '/div[6]/div[2]/span/text()')

        # Parsed per page: a list shared across pages would pair this page's
        # titles with house info scraped from the first page.
        house_fields = [_split_house_info(text) for text in lj_house_all]

        for title, address_1, address_2, house, total_price, unit_price in zip(
                lj_title, lj_address_1, lj_address_2, house_fields,
                lj_house_total_price, lj_house_unit_price):
            hx, mj, cx, zx, lc, nx, lx = house
            a.append({
                'lj_title': title.strip(),
                'lj_address_1': address_1.strip(),
                'lj_address_2': address_2.strip(),
                'lj_house_hx': hx,
                'lj_house_mj': mj,
                'lj_house_cx': cx,
                'lj_house_zx': zx,
                'lj_house_lc': lc,
                'lj_house_nx': nx,
                'lj_house_lx': lx,
                'lj_house_total_price': total_price.strip(),
                'lj_house_unit_price': unit_price.strip(),
            })
    print("成功爬取网页!!!")
    return a
2.爬取完成后保存代码如下:
def bc(a):
    """Save the scraped listings to a CSV file and append them to MySQL.

    Args:
        a: sequence of listing dicts keyed by the column names listed below.
    """
    field_names = [
        'lj_title', 'lj_address_1', 'lj_address_2',
        'lj_house_hx', 'lj_house_mj', 'lj_house_cx', 'lj_house_zx',
        'lj_house_lc', 'lj_house_nx', 'lj_house_lx',
        'lj_house_total_price', 'lj_house_unit_price',
    ]
    frame = pd.DataFrame(a, columns=field_names)
    # The CSV keeps the DataFrame index as its first column.
    frame.to_csv('重庆链家二手房.csv', encoding='utf-8-sig')
    engine = create_engine('mysql+mysqldb://root:123456@localhost:3306/cqershoufang?charset=utf8')
    # Rows are appended to the table; the index is not written to the database.
    frame.to_sql(
        name='cqershoufang1',
        con=engine,
        if_exists='append',
        index=False,
        index_label=False,
    )
    print("成功保存!!!")
3.对爬取的数据进行清洗
为了防止数据的丢失以及在清洗过程中出现错误数据,先备份一份重庆链家二手房.csv文件,并将备份命名为重庆链家二手房2.csv;然后通过pd.read_csv读取重庆链家二手房2.csv,通过切片、去空值、去除异常值等方式进行数据的清洗。
打开重庆链家二手房2.csv文件,可以粗略地观察一下哪些列有空值或异常值等(也可通过函数isna().sum()来检测共有多少个空值),大致了解哪些数据有问题后,再对相应的数据进行清洗,最后将清洗好的数据重新保存在重庆链家二手房2.csv中,后面可以用来对数据进行预测等等。
实现代码如下:
def qx():
    """Clean the construction-year column of '重庆链家二手房2.csv' in place.

    The '年建' suffix is removed from ``lj_house_nx`` and the column is
    converted to float. Missing years are replaced by the floor of the mean
    of the known years. When no year is known at all the column is left as
    NaN, because there is no mean to fill with. The cleaned table is written
    back to the same file without an extra index column.

    Raises:
        ValueError: if a year cannot be parsed as a number after the suffix
            has been removed.
    """
    path = '重庆链家二手房2.csv'
    # 'utf-8-sig' reads files with or without a BOM; this function itself
    # writes the file back with a BOM.
    data = pd.read_csv(path, encoding='utf-8-sig')

    # astype(str) first so the step also works when the column is already
    # numeric (for example when the function is run a second time).
    years = (
        data['lj_house_nx']
        .astype(str)
        .str.replace('年建', '', regex=False)
        .astype(float)
    )

    known = years.dropna()
    if not known.empty:
        average_year = int(known.sum() // len(known))
        years = years.fillna(average_year)

    data['lj_house_nx'] = years
    data.to_csv(path, index=False, encoding='utf-8-sig')
4.数据可视化:分别绘制了柱状图、饼图、折线图和散点图共4种图形
def ht_zxt():
    """Show a bar chart of the lowest total price in each of ten areas.

    Reads '重庆链家二手房2.csv', groups the rows by ``lj_address_2`` and
    plots the minimum ``lj_house_total_price`` of the first ten groups, with
    the value written above every bar.
    """
    # SimHei lets matplotlib render the Chinese labels.
    matplotlib.rcParams['font.sans-serif'] = ['SimHei']
    matplotlib.rcParams['axes.unicode_minus'] = False

    listings = pd.read_csv('重庆链家二手房2.csv', encoding='utf-8')
    cheapest = listings.groupby(['lj_address_2'])['lj_house_total_price'].min()
    areas = cheapest.index[:10]
    prices = cheapest.values[:10]
    print(areas, prices)

    plt.figure()
    plt.bar(areas, prices, alpha=0.8)
    plt.xlabel("地区")
    plt.ylabel("房价")
    plt.title('各地区最便宜的房价')
    # Put the numeric value slightly above each bar.
    for position, price in enumerate(prices):
        plt.text(position, price + 10, price, ha='center')
    plt.show()
def ht_bt():
    """Show a pie chart of the lowest total price in each of ten areas.

    Uses the same grouping as the bar chart: the minimum
    ``lj_house_total_price`` per ``lj_address_2`` for the first ten groups.
    """
    # SimHei lets matplotlib render the Chinese labels.
    matplotlib.rcParams['font.sans-serif'] = ['SimHei']
    matplotlib.rcParams['axes.unicode_minus'] = False

    listings = pd.read_csv('重庆链家二手房2.csv', encoding='utf-8')
    cheapest = listings.groupby(['lj_address_2'])['lj_house_total_price'].min()
    areas = cheapest.index[:10]
    prices = cheapest.values[:10]

    plt.figure()
    pie_options = dict(
        labels=areas,
        labeldistance=1.1,
        autopct="%1.1f%%",
        shadow=True,
        startangle=90,
        pctdistance=0.7,
    )
    plt.pie(prices, **pie_options)
    plt.axis("equal")  # equal axis scaling keeps the pie circular
    plt.title('各地最低房价', fontsize=24)
    plt.legend(bbox_to_anchor=(0.1, 1.1))  # place the legend near the top-left
    plt.show()
def ht_Tzxt():
    """Show a line chart of construction year per area for the first ten rows.

    Reads '重庆链家二手房2.csv' and plots ``lj_house_nx`` (vertical axis)
    against ``lj_address_2`` (horizontal axis).
    """
    # SimHei lets matplotlib render the Chinese labels.
    matplotlib.rcParams['font.sans-serif'] = ['SimHei']
    matplotlib.rcParams['axes.unicode_minus'] = False

    data = pd.read_csv('重庆链家二手房2.csv', encoding='utf-8')
    # Named after the axis they are drawn on.
    x_values = data.loc[:, 'lj_address_2'][:10]
    y_values = data.loc[:, 'lj_house_nx'][:10]

    plt.plot(x_values, y_values, 'bo-', label='', linewidth=1)
    plt.title('折线图', size=10)
    plt.legend()
    # The horizontal axis was previously labelled 'y轴' as well.
    plt.xlabel('x轴', size=10)
    plt.ylabel('y轴', size=10)
    plt.show()
def ht_sdt():
    """Show a scatter plot of construction year against area, first 50 rows."""
    # SimHei lets matplotlib render the Chinese labels.
    matplotlib.rcParams['font.sans-serif'] = ['SimHei']
    matplotlib.rcParams['axes.unicode_minus'] = False

    listings = pd.read_csv('重庆链家二手房2.csv', encoding='utf-8')
    years = listings['lj_house_nx'][:50]
    areas = listings['lj_address_2'][:50]
    plt.scatter(years, areas)
    plt.show()
5.预测算法
代码如下:
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from sklearn.svm import LinearSVR#回归函数
# Use the SimHei font so Chinese text renders, and keep the minus sign displayable.
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['axes.unicode_minus'] = False

# Basic cleaning of the listings table; runs when this module is imported.
data = pd.read_csv('重庆链家二手房2.csv')
data.drop(columns=['Unnamed: 0'], inplace=True)  # drop the saved index column
data.dropna(axis=0, how='any', inplace=True)  # drop every row containing a missing value

# Unit price: strip the '元/平' suffix and the thousands separator, then convert to float.
data['lj_house_unit_price'] = (
    data['lj_house_unit_price']
    .map(lambda price: price.replace('元/平', '').replace(',', ''))
    .astype(float)
)
# Total price is converted to float as-is.
data['lj_house_total_price'] = data['lj_house_total_price'].astype(float)
# Floor area: strip the '平米' suffix, then convert to float.
data['lj_house_mj'] = (
    data['lj_house_mj']
    .map(lambda area: area.replace('平米', ''))
    .astype(float)
)
def get_price_forecast():
    """Fit a linear SVR on the cleaned listings and predict total prices.

    Works on a copy of the module-level ``data`` frame. The features are the
    floor area (``lj_house_mj``) and the room / hall counts (``室`` / ``厅``)
    extracted from ``lj_house_hx``; the target is ``lj_house_total_price``.
    Listings with an area of 300 or more are excluded. Two hand-written
    listings without a price are appended after the training rows so that
    they receive a prediction too.

    Features and target are standardised with the training mean and standard
    deviation, the model is fitted on the standardised values, and its
    predictions are scaled back to the original price unit.

    Returns:
        tuple: ``(y, y_pred)``, two single-column DataFrames holding the
        actual and the predicted total price for the last eleven training
        rows followed by the two appended rows.
    """
    data_copy = data.copy()
    # Raw string so that \d is not treated as an invalid string escape.
    data_copy[['室', '厅']] = data_copy['lj_house_hx'].str.extract(r'(\d+)室(\d+)厅')
    data_copy[['室', '厅']] = data_copy[['室', '厅']].astype(float)
    data_copy = data_copy.drop(columns=[
        'lj_title', 'lj_address_2', 'lj_house_cx', 'lj_house_lc',
        'lj_house_zx', 'lj_address_1', 'lj_house_unit_price',
        'lj_house_lx', 'lj_house_hx',
    ])
    data_copy.dropna(axis=0, how='any', inplace=True)
    new_data = data_copy[data_copy['lj_house_mj'] < 300].reset_index(drop=True)

    # The row count replaces the row labels that were hard-coded for one
    # particular dataset size (2871 training rows).
    n_train = len(new_data)
    unknown_price = float('nan')
    extra_rows = pd.DataFrame([
        {'lj_house_mj': 234.12, 'lj_house_nx': 2015,
         'lj_house_total_price': unknown_price, '室': 4.0, '厅': 2.0},
        {'lj_house_mj': 212.12, 'lj_house_nx': 2018,
         'lj_house_total_price': unknown_price, '室': 4.0, '厅': 1.0},
    ])
    new_data = pd.concat([new_data, extra_rows], ignore_index=True)

    data_train = new_data.iloc[:n_train]
    x_list = ['lj_house_mj', '室', '厅']
    data_mean = data_train.mean()
    data_std = data_train.std()
    # Train on the standardised values: prediction below standardises its
    # input and de-standardises its output, so the fit must use the same scale.
    data_scaled = (data_train - data_mean) / data_std
    x_train = data_scaled[x_list].values
    y_train = data_scaled['lj_house_total_price'].values

    linearsvr = LinearSVR(C=0.0001)
    linearsvr.fit(x_train, y_train)

    x = ((new_data[x_list] - data_mean[x_list]) / data_std[x_list]).values
    new_data['y_pred'] = (
        linearsvr.predict(x) * data_std['lj_house_total_price']
        + data_mean['lj_house_total_price']
    )

    # Last eleven training rows plus the two appended rows.
    tail_start = max(n_train - 11, 0)
    y = new_data[['lj_house_total_price']][tail_start:]
    y_pred = new_data[['y_pred']][tail_start:]
    return y, y_pred
def broken_line(y, y_pred, title):
    """Plot actual and predicted prices as two line series in one figure.

    Args:
        y: actual total prices (drawn in red with circle markers).
        y_pred: predicted total prices (drawn in blue with star markers).
        title: text shown as the figure title.
    """
    plt.figure()
    series = (
        (y, 'r', 'o', '真实房价'),
        (y_pred, 'b', '*', '预测房价'),
    )
    for values, colour, marker, label in series:
        plt.plot(values, color=colour, marker=marker, label=label)
    plt.xlabel('数据量')
    plt.ylabel('房子总价')
    plt.title(title)
    plt.legend()
    plt.grid()
    plt.show()
if __name__ == '__main__':
    # Run the forecast and plot actual against predicted prices.
    actual, predicted = get_price_forecast()
    broken_line(actual, predicted, '二手房预测图')
完整代码为
第一个py文件里的代码为:
import requests
import re
import os
from bs4 import BeautifulSoup
from lxml import etree
import time
import mysql.connector as ms #导入链接mysql数据模块
import pandas as pd
import pymysql
pymysql.install_as_MySQLdb()
import math
from sqlalchemy import create_engine #写数据库
import matplotlib
import matplotlib.pyplot as plt
from sklearn.svm import LinearSVR
import random
import numpy as np
from sklearn.datasets import load_breast_cancer
import cleaning
def _split_house_info(text):
    """Split one '|'-separated house-info string into exactly seven fields.

    The fields are, in order: layout, area, orientation, decoration, floor,
    construction year and building type. A six-field record is treated as
    having no construction year, so an empty string is inserted in that slot.
    Shorter records are padded with empty strings on the right so that a
    single unusual listing cannot abort the whole crawl with an IndexError.
    """
    fields = [part.strip() for part in text.split('|')]
    if len(fields) == 6:
        fields.insert(5, '')
    fields.extend([''] * (7 - len(fields)))
    return fields[:7]


def paqu():
    """Scrape second-hand housing listings from cq.lianjia.com, pages 1-99.

    Every result page is downloaded with ``requests`` (using the module-level
    ``headers`` mapping) and parsed with ``lxml.etree``. All columns are
    collected page by page, so the house-info fields of a listing are always
    zipped with the title, address and prices read from the same page.

    Returns:
        list[dict]: one dict per listing with the keys ``lj_title``,
        ``lj_address_1``, ``lj_address_2``, ``lj_house_hx``, ``lj_house_mj``,
        ``lj_house_cx``, ``lj_house_zx``, ``lj_house_lc``, ``lj_house_nx``,
        ``lj_house_lx``, ``lj_house_total_price`` and ``lj_house_unit_price``;
        every value is a whitespace-stripped string.
    """
    # Common XPath prefix selecting the info container of every listing.
    item = '//*[@id="content"]/div[1]/ul/li/div[1]'
    a = []
    for i in range(1, 100):
        url = 'https://cq.lianjia.com/ershoufang/' + 'pg' + str(i) + '/'
        page_text = requests.get(url=url, headers=headers).text
        tree = etree.HTML(page_text)

        lj_title = tree.xpath(item + '/div[1]/a/text()')  # listing title
        lj_address_1 = tree.xpath(item + '/div[2]/div/a[1]/text()')
        lj_address_2 = tree.xpath(item + '/div[2]/div/a[2]/text()')
        lj_house_all = tree.xpath(item + '/div[3]/div/text()')
        lj_house_total_price = tree.xpath(item + '/div[6]/div[1]/span/text()')
        lj_house_unit_price = tree.xpath(item + '/div[6]/div[2]/span/text()')

        # Parsed per page: a list shared across pages would pair this page's
        # titles with house info scraped from the first page.
        house_fields = [_split_house_info(text) for text in lj_house_all]

        for title, address_1, address_2, house, total_price, unit_price in zip(
                lj_title, lj_address_1, lj_address_2, house_fields,
                lj_house_total_price, lj_house_unit_price):
            hx, mj, cx, zx, lc, nx, lx = house
            a.append({
                'lj_title': title.strip(),
                'lj_address_1': address_1.strip(),
                'lj_address_2': address_2.strip(),
                'lj_house_hx': hx,
                'lj_house_mj': mj,
                'lj_house_cx': cx,
                'lj_house_zx': zx,
                'lj_house_lc': lc,
                'lj_house_nx': nx,
                'lj_house_lx': lx,
                'lj_house_total_price': total_price.strip(),
                'lj_house_unit_price': unit_price.strip(),
            })
    print("成功爬取网页!!!")
    return a
def bc(a):
    """Save the scraped listings to a CSV file and append them to MySQL.

    Args:
        a: sequence of listing dicts keyed by the column names listed below.
    """
    field_names = [
        'lj_title', 'lj_address_1', 'lj_address_2',
        'lj_house_hx', 'lj_house_mj', 'lj_house_cx', 'lj_house_zx',
        'lj_house_lc', 'lj_house_nx', 'lj_house_lx',
        'lj_house_total_price', 'lj_house_unit_price',
    ]
    frame = pd.DataFrame(a, columns=field_names)
    # The CSV keeps the DataFrame index as its first column.
    frame.to_csv('重庆链家二手房.csv', encoding='utf-8-sig')
    engine = create_engine('mysql+mysqldb://root:123456@localhost:3306/cqershoufang?charset=utf8')
    # Rows are appended to the table; the index is not written to the database.
    frame.to_sql(
        name='cqershoufang1',
        con=engine,
        if_exists='append',
        index=False,
        index_label=False,
    )
    print("成功保存!!!")
def qx():
    """Clean the construction-year column of '重庆链家二手房2.csv' in place.

    The '年建' suffix is removed from ``lj_house_nx`` and the column is
    converted to float. Missing years are replaced by the floor of the mean
    of the known years. When no year is known at all the column is left as
    NaN, because there is no mean to fill with. The cleaned table is written
    back to the same file without an extra index column.

    Raises:
        ValueError: if a year cannot be parsed as a number after the suffix
            has been removed.
    """
    path = '重庆链家二手房2.csv'
    # 'utf-8-sig' reads files with or without a BOM; this function itself
    # writes the file back with a BOM.
    data = pd.read_csv(path, encoding='utf-8-sig')

    # astype(str) first so the step also works when the column is already
    # numeric (for example when the function is run a second time).
    years = (
        data['lj_house_nx']
        .astype(str)
        .str.replace('年建', '', regex=False)
        .astype(float)
    )

    known = years.dropna()
    if not known.empty:
        average_year = int(known.sum() // len(known))
        years = years.fillna(average_year)

    data['lj_house_nx'] = years
    data.to_csv(path, index=False, encoding='utf-8-sig')
def ht_zxt():
    """Show a bar chart of the lowest total price in each of ten areas.

    Reads '重庆链家二手房2.csv', groups the rows by ``lj_address_2`` and
    plots the minimum ``lj_house_total_price`` of the first ten groups, with
    the value written above every bar.
    """
    # SimHei lets matplotlib render the Chinese labels.
    matplotlib.rcParams['font.sans-serif'] = ['SimHei']
    matplotlib.rcParams['axes.unicode_minus'] = False

    listings = pd.read_csv('重庆链家二手房2.csv', encoding='utf-8')
    cheapest = listings.groupby(['lj_address_2'])['lj_house_total_price'].min()
    areas = cheapest.index[:10]
    prices = cheapest.values[:10]
    print(areas, prices)

    plt.figure()
    plt.bar(areas, prices, alpha=0.8)
    plt.xlabel("地区")
    plt.ylabel("房价")
    plt.title('各地区最便宜的房价')
    # Put the numeric value slightly above each bar.
    for position, price in enumerate(prices):
        plt.text(position, price + 10, price, ha='center')
    plt.show()
def ht_bt():
    """Show a pie chart of the lowest total price in each of ten areas.

    Uses the same grouping as the bar chart: the minimum
    ``lj_house_total_price`` per ``lj_address_2`` for the first ten groups.
    """
    # SimHei lets matplotlib render the Chinese labels.
    matplotlib.rcParams['font.sans-serif'] = ['SimHei']
    matplotlib.rcParams['axes.unicode_minus'] = False

    listings = pd.read_csv('重庆链家二手房2.csv', encoding='utf-8')
    cheapest = listings.groupby(['lj_address_2'])['lj_house_total_price'].min()
    areas = cheapest.index[:10]
    prices = cheapest.values[:10]

    plt.figure()
    pie_options = dict(
        labels=areas,
        labeldistance=1.1,
        autopct="%1.1f%%",
        shadow=True,
        startangle=90,
        pctdistance=0.7,
    )
    plt.pie(prices, **pie_options)
    plt.axis("equal")  # equal axis scaling keeps the pie circular
    plt.title('各地最低房价', fontsize=24)
    plt.legend(bbox_to_anchor=(0.1, 1.1))  # place the legend near the top-left
    plt.show()
def ht_Tzxt():
    """Show a line chart of construction year per area for the first ten rows.

    Reads '重庆链家二手房2.csv' and plots ``lj_house_nx`` (vertical axis)
    against ``lj_address_2`` (horizontal axis).
    """
    # SimHei lets matplotlib render the Chinese labels.
    matplotlib.rcParams['font.sans-serif'] = ['SimHei']
    matplotlib.rcParams['axes.unicode_minus'] = False

    data = pd.read_csv('重庆链家二手房2.csv', encoding='utf-8')
    # Named after the axis they are drawn on.
    x_values = data.loc[:, 'lj_address_2'][:10]
    y_values = data.loc[:, 'lj_house_nx'][:10]

    plt.plot(x_values, y_values, 'bo-', label='', linewidth=1)
    plt.title('折线图', size=10)
    plt.legend()
    # The horizontal axis was previously labelled 'y轴' as well.
    plt.xlabel('x轴', size=10)
    plt.ylabel('y轴', size=10)
    plt.show()
def ht_sdt():
    """Show a scatter plot of construction year against area, first 50 rows."""
    # SimHei lets matplotlib render the Chinese labels.
    matplotlib.rcParams['font.sans-serif'] = ['SimHei']
    matplotlib.rcParams['axes.unicode_minus'] = False

    listings = pd.read_csv('重庆链家二手房2.csv', encoding='utf-8')
    years = listings['lj_house_nx'][:50]
    areas = listings['lj_address_2'][:50]
    plt.scatter(years, areas)
    plt.show()
def yuce():
    """Run the price forecast from the ``cleaning`` module and plot the result."""
    actual, predicted = cleaning.get_price_forecast()
    cleaning.broken_line(actual, predicted, '二手房预测图')
if __name__ == "__main__":
    # Request headers read by paqu() as a module-level global.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36 Edg/96.0.1054.57',
    }
    # The scrape / save / clean steps are switched off here; only the charts
    # and the forecast run.
    # b=paqu()
    # bc(b)
    # qx()
    for step in (ht_zxt, ht_bt, ht_Tzxt, ht_sdt, yuce):
        step()
第二个py文件的代码为:
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from sklearn.svm import LinearSVR#回归函数
# Use the SimHei font so Chinese text renders, and keep the minus sign displayable.
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['axes.unicode_minus'] = False

# Basic cleaning of the listings table; runs when this module is imported.
data = pd.read_csv('重庆链家二手房2.csv')
data.drop(columns=['Unnamed: 0'], inplace=True)  # drop the saved index column
data.dropna(axis=0, how='any', inplace=True)  # drop every row containing a missing value

# Unit price: strip the '元/平' suffix and the thousands separator, then convert to float.
data['lj_house_unit_price'] = (
    data['lj_house_unit_price']
    .map(lambda price: price.replace('元/平', '').replace(',', ''))
    .astype(float)
)
# Total price is converted to float as-is.
data['lj_house_total_price'] = data['lj_house_total_price'].astype(float)
# Floor area: strip the '平米' suffix, then convert to float.
data['lj_house_mj'] = (
    data['lj_house_mj']
    .map(lambda area: area.replace('平米', ''))
    .astype(float)
)
def get_price_forecast():
    """Fit a linear SVR on the cleaned listings and predict total prices.

    Works on a copy of the module-level ``data`` frame. The features are the
    floor area (``lj_house_mj``) and the room / hall counts (``室`` / ``厅``)
    extracted from ``lj_house_hx``; the target is ``lj_house_total_price``.
    Listings with an area of 300 or more are excluded. Two hand-written
    listings without a price are appended after the training rows so that
    they receive a prediction too.

    Features and target are standardised with the training mean and standard
    deviation, the model is fitted on the standardised values, and its
    predictions are scaled back to the original price unit.

    Returns:
        tuple: ``(y, y_pred)``, two single-column DataFrames holding the
        actual and the predicted total price for the last eleven training
        rows followed by the two appended rows.
    """
    data_copy = data.copy()
    # Raw string so that \d is not treated as an invalid string escape.
    data_copy[['室', '厅']] = data_copy['lj_house_hx'].str.extract(r'(\d+)室(\d+)厅')
    data_copy[['室', '厅']] = data_copy[['室', '厅']].astype(float)
    data_copy = data_copy.drop(columns=[
        'lj_title', 'lj_address_2', 'lj_house_cx', 'lj_house_lc',
        'lj_house_zx', 'lj_address_1', 'lj_house_unit_price',
        'lj_house_lx', 'lj_house_hx',
    ])
    data_copy.dropna(axis=0, how='any', inplace=True)
    new_data = data_copy[data_copy['lj_house_mj'] < 300].reset_index(drop=True)

    # The row count replaces the row labels that were hard-coded for one
    # particular dataset size (2871 training rows).
    n_train = len(new_data)
    unknown_price = float('nan')
    extra_rows = pd.DataFrame([
        {'lj_house_mj': 234.12, 'lj_house_nx': 2015,
         'lj_house_total_price': unknown_price, '室': 4.0, '厅': 2.0},
        {'lj_house_mj': 212.12, 'lj_house_nx': 2018,
         'lj_house_total_price': unknown_price, '室': 4.0, '厅': 1.0},
    ])
    new_data = pd.concat([new_data, extra_rows], ignore_index=True)

    data_train = new_data.iloc[:n_train]
    x_list = ['lj_house_mj', '室', '厅']
    data_mean = data_train.mean()
    data_std = data_train.std()
    # Train on the standardised values: prediction below standardises its
    # input and de-standardises its output, so the fit must use the same scale.
    data_scaled = (data_train - data_mean) / data_std
    x_train = data_scaled[x_list].values
    y_train = data_scaled['lj_house_total_price'].values

    linearsvr = LinearSVR(C=0.0001)
    linearsvr.fit(x_train, y_train)

    x = ((new_data[x_list] - data_mean[x_list]) / data_std[x_list]).values
    new_data['y_pred'] = (
        linearsvr.predict(x) * data_std['lj_house_total_price']
        + data_mean['lj_house_total_price']
    )

    # Last eleven training rows plus the two appended rows.
    tail_start = max(n_train - 11, 0)
    y = new_data[['lj_house_total_price']][tail_start:]
    y_pred = new_data[['y_pred']][tail_start:]
    return y, y_pred
def broken_line(y, y_pred, title):
    """Plot actual and predicted prices as two line series in one figure.

    Args:
        y: actual total prices (drawn in red with circle markers).
        y_pred: predicted total prices (drawn in blue with star markers).
        title: text shown as the figure title.
    """
    plt.figure()
    series = (
        (y, 'r', 'o', '真实房价'),
        (y_pred, 'b', '*', '预测房价'),
    )
    for values, colour, marker, label in series:
        plt.plot(values, color=colour, marker=marker, label=label)
    plt.xlabel('数据量')
    plt.ylabel('房子总价')
    plt.title(title)
    plt.legend()
    plt.grid()
    plt.show()
if __name__ == '__main__':
    # Run the forecast and plot actual against predicted prices.
    actual, predicted = get_price_forecast()
    broken_line(actual, predicted, '二手房预测图')