1
、数据爬取
采用爬虫技术(
urllib
库,
BeautifulSoup
库,
re
库)从链家网站(参考网址:
https://cd.lianjia.com/zufang/jinrongcheng/pg2rt200600000001/#contentList
)获取
”
链家
/
成
都市
/
高新区
/
金融城
/
整租
”
租房信息(爬取
40
页数据),从各房屋信息中提取
”
楼盘名称
/
面积
/
装修
/
近地铁
/
楼层
/
总楼层
/
租金
”
信息,并将数据写入
lianjia.xls
文件(标题行为
['name', 'area', 'floor', 'totalFloor', 'decorate', 'nearSubway','rentFee']
)
2
、机器学习数据准备(数据清洗、统计分析和数据变换等)
读取
lianjia.xls
数据,存储到
dataframe
对象,完成以下内容:
1)
提取
['name','area','decorate','nearSubway','rentFee']
共
5
列数据,用于后续机器学习
2)
房屋面积
area
是回归分析的核心参数,不能有缺失值,过滤
area
列缺失值
3)
查看
'decorate'
和
'nearSubway'
缺失值,并将
: 'decorate'
缺失值填充为
'
非精装
',
'nearSubway'
缺失值填充为
'
非近地铁
'
4)
查看
’name’
列楼盘信息,提取特定楼盘数据共后续使用(学号末尾奇数提取
’
誉峰三
期
’
数据,偶数提取
’
招商大魔方
’
数据
5)
特征编码:
'decorate'
列,
'
非精装
'
编码为
0,'
精装
'
编码为
1
;
'nearSubway'
列,
'
非近地铁
'
编
码为
0,'
近地铁
'
编码为
1
6) 'rentFee'
列除以
1000
,单位为千元
3
、单变量回归分析
构建数据集:以
'area'
列数据为特征
,'rentFee'
列为标注信息,进行线性回归分析,分别
计算并打印训练和测试误差
4
、多变量回归分析
构建数据集:以
['area', 'decorate', 'nearSubway']
共
3
列数据为特征
,'rentFee'
列为标注
信息,特征数据经归一化处理后,进行线性回归分析,分别计算并打印训练和测试误
差
from gevent import monkey
# 猴子补丁
monkey.patch_all(thread=False)
import pandas as pd
import requests
from bs4 import BeautifulSoup
import xlwt
import gevent
from queue import Queue
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
# Detail-page URLs collected from the listing pages
Htmls = []
# URL queue consumed concurrently by the gevent workers
data_queue = Queue()
# Scraped rows: [name, area, floor, totalFloor, decorate, nearSubway, rentFee]
Data = []
# 先解析首页
def GetHtml_First():
    """Fetch all 40 listing pages (page 1 plus pages 2-40) and collect house links.

    Raises a generic Exception on any non-200 HTTP response.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1 Edg/122.0.0.0"
    }
    # Page 1 has its own URL shape; pages 2-40 are paginated with a pg<N> prefix.
    page_urls = ["https://cd.lianjia.com/zufang/jinrongcheng/rt200600000001/"]
    page_urls += [
        f"https://cd.lianjia.com/zufang/jinrongcheng/pg{n}rt200600000001/#contentList"
        for n in range(2, 41)
    ]
    for page_url in page_urls:
        resp = requests.get(page_url, headers=headers, timeout=60)
        if resp.status_code != 200:
            raise Exception("Error")
        parse_find_link(resp.text)
def GetHtml_Second():
    """Worker: drain the shared URL queue and parse every detail page it pops."""
    headers = {
        "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1 Edg/122.0.0.0"
    }
    while True:
        if data_queue.empty():
            break
        url = data_queue.get_nowait()
        resp = requests.get(url, headers=headers)
        if resp.status_code != 200:
            raise Exception("Error")
        print(url)
        parse_html(resp.text)
# 我先把每个房屋的链接爬取保存下来,再进入每个房子的主页里进行爬取房屋信息
# 不过好像也不用进入每个房子的主页里去爬取信息,本身页面就有信息
# 这里爬取会有点慢,可以自己再建立多个协程去爬取
def parse_find_link(html):
    """Extract per-house detail links from one listing page into the global Htmls.

    Links containing 'apartment' are sponsored/ad entries and are skipped.
    """
    soup = BeautifulSoup(html, "html.parser")
    for anchor in soup.find_all("a", class_="content__list--item--aside"):
        href = anchor.attrs["href"]
        if 'apartment' not in href:
            Htmls.append("https://cd.lianjia.com" + href)
# 获取数据
def parse_html(html):
    """Parse one house detail page and append a 7-field row to the global Data.

    Row layout: [name, area, floor, totalFloor, decorate, nearSubway, rentFee].
    Fields that cannot be found default to the string "None" so every row keeps
    the same width for the later Excel export.
    """
    soup = BeautifulSoup(html, "html.parser")
    # House name: first whitespace token of the title, minus the "整租·" prefix.
    name = soup.find("p", class_="content__title").get_text().strip().split(" ")[0].replace("整租·", "")
    li = soup.find("ul", class_="content__aside__list").find_all("li")
    a_d_ifo = li[1].get_text().split(" ")
    # Area: second token of the aside info line (e.g. "85㎡").
    # FIX: was a bare try/except around an always-true `is not None` check;
    # the only thing it guarded against was an IndexError, so test length instead.
    area = "None"
    if len(a_d_ifo) > 1:
        area = a_d_ifo[1]
    # Decoration: last token, unless that token is actually the area ("㎡").
    decorate = "None"
    if "㎡" not in a_d_ifo[-1]:
        decorate = a_d_ifo[-1]
    # Monthly rent (numeric string inside the aside title span).
    rentFee = soup.find("div", class_="content__aside--title").find("span").get_text()
    f_t_info = soup.find("div", id="info").find("ul").find_all("li")[7].get_text().replace(":", "/").split("/")
    # Floor / total floors come from the same "楼层" info entry.
    floor = f_t_info[1]
    totalFloor = f_t_info[-1]
    # Near-subway tag is optional; FIX: look it up once instead of twice,
    # and replace the bare except with an explicit None check.
    subway_tag = soup.find("i", class_="content__item__tag--is_subway_house")
    nearSubway = subway_tag.get_text() if subway_tag is not None else "None"
    Data.append([name, area, floor, totalFloor, decorate, nearSubway, rentFee])
# 写入excel
def Write_to_Excel(columns, data, sheet_name, xls_name):
    """Write a header row plus all data rows into a new .xls workbook.

    columns    -- header labels written to row 0
    data       -- iterable of rows; each row is indexable by column position
    sheet_name -- name of the single sheet created
    xls_name   -- output filename the workbook is saved under
    """
    workbook = xlwt.Workbook(encoding="utf-8")
    worksheet = workbook.add_sheet(sheet_name)
    # Header row
    for col, title in enumerate(columns):
        worksheet.write(0, col, title)
    # Data rows start at row 1, directly below the header
    for row, record in enumerate(data, start=1):
        print(record)
        for col in range(len(columns)):
            worksheet.write(row, col, record[col])
    workbook.save(xls_name)
    print(f"{xls_name}保存成功")
# 创建十个协程
def start_work():
    """Spawn ten gevent workers that concurrently drain the URL queue."""
    workers = [gevent.spawn(GetHtml_Second) for _ in range(10)]
    gevent.joinall(workers)
# ---- Crawl driver: collect links, persist them, then scrape every detail page ----
GetHtml_First()
# Persist the collected links to link.xls (one "link" column).
link_book = xlwt.Workbook(encoding="utf-8")
link_sheet = link_book.add_sheet("Sheet1")
link_sheet.write(0, 0, "link")
for idx, link in enumerate(Htmls, start=1):
    link_sheet.write(idx, 0, link)
link_book.save("link.xls")
# Reload the saved links and feed them into the worker queue.
df = pd.read_excel("link.xls")
for row in np.array(df).tolist():
    data_queue.put_nowait(row[0])
start_work()
columns = ['name', 'area', 'floor', 'totalFloor', 'decorate', 'nearSubway', 'rentFee']
Write_to_Excel(columns, Data, "Sheet2", "lianjia.xls")
# ---- Data preparation: load, select columns, clean, encode ----
df = pd.read_excel("lianjia.xls")
df = df[["name", "area", "decorate", "nearSubway", "rentFee"]]
# Count missing values per column before cleaning
print(df.isnull().sum())
# FIX: area is the core regression feature and must not be missing
# (a NaN area would later crash float(x[:-1])); drop such rows as the
# assignment requires.
df = df.dropna(subset=["area"])
# Fill missing categorical values with the "negative" category
df = df.fillna({"decorate": "非精装修", "nearSubway": "非近地铁"})
print(df.isnull().sum())
# Inspect the rows of one specific estate
print(df[df["name"] == "誉峰三期"])
# Feature encoding: map each category to 0/1
df.loc[df["decorate"] == "非精装修", ['decorate']] = 0
df.loc[df["decorate"] == "精装修", ['decorate']] = 1
df.loc[df["nearSubway"] == "非近地铁", ['nearSubway']] = 0
df.loc[df["nearSubway"] == "近地铁", ['nearSubway']] = 1
# Convert rent to thousands of yuan
df["rentFee"] = df["rentFee"] / 1000
print("*"*20)
# ---- Single-variable regression: area -> rentFee ----
x = np.array(df["area"])
# Area values look like "85㎡": strip the trailing unit and convert to float
for i in range(len(x)):
    x[i] = float(x[i][:-1])
y = np.array(df["rentFee"])
# sklearn expects a 2-D feature matrix
x = x.reshape(-1, 1)
# Split into 70% train / 30% test with a fixed seed for reproducibility
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=2)
# Fit the linear model
model = LinearRegression()
model.fit(x_train, y_train)
# Training error (MSE on the training split)
train_predictions = model.predict(x_train)
train_error = mean_squared_error(y_train, train_predictions)
print("训练误差:", train_error)
# Test error (MSE on the held-out split)
test_predictions = model.predict(x_test)
test_error = mean_squared_error(y_test, test_predictions)
# FIX: originally printed train_error under the "测试误差" label
print("测试误差:", test_error)
print(model.score(x_test, y_test))
print("*"*20)
print("*"*20)
# ---- Multi-variable regression: [area, decorate, nearSubway] -> rentFee ----
f = np.array(df[["area", "decorate", "nearSubway"]])
# Strip the "㎡" unit from the area column and convert to float
for i in range(len(f)):
    f[i][0] = float(f[i][0][:-1])
# Min-max normalization of the area column only (decorate/nearSubway are
# already 0/1 and need no scaling).
# FIX: np.max(f) took the max over ALL three columns, mixing the 0/1 flags
# into the area range; compute both extremes over column 0 only.
area_values = [row[0] for row in f]
max_area = max(area_values)
min_area = min(area_values)
normal = max_area - min_area
for i in range(len(f)):
    # min-max normalization
    f[i][0] = (f[i][0] - min_area) / normal
g = np.array(df["rentFee"])
# Split into 70% train / 30% test with a fixed seed for reproducibility
x_train, x_test, y_train, y_test = train_test_split(f, g, test_size=0.3, random_state=2)
# Fit the linear model
model = LinearRegression()
model.fit(x_train, y_train)
# Training error (MSE on the training split)
train_predictions = model.predict(x_train)
train_error = mean_squared_error(y_train, train_predictions)
print("训练误差:", train_error)
# Test error (MSE on the held-out split)
test_predictions = model.predict(x_test)
test_error = mean_squared_error(y_test, test_predictions)
# FIX: originally printed train_error under the "测试误差" label
print("测试误差:", test_error)
print(model.score(x_test, y_test))
print("*"*20)