实验三文本数据及其可视化

1 、数据爬取
采用爬虫技术( urllib 库, BeautifulSoup 库, re 库)从链家网站(参考网址:
https://cd.lianjia.com/zufang/jinrongcheng/pg2rt200600000001/#contentList )获取 链家 /
都市 / 高新区 / 金融城 / 整租 租房信息(爬取 40 页数据),从各房屋信息中提取 楼盘名称 /
面积 / 装修 / 近地铁 / 楼层 / 总楼层 / 租金 信息,并将数据写入 lianjia.xls 文件(标题行为
['name', 'area', 'floor', 'totalFloor', 'decorate', 'nearSubway','rentFee'])
2 、机器学习数据准备(数据清洗、统计分析和数据变换等)
读取 lianjia.xls 数据,存储到 dataframe 对象,完成以下内容:
1) 提取 ['name','area','decorate','nearSubway','rentFee'] 5 列数据,用于后续机器学习 2) 房屋面积 area 是回归分析的核心参数,不能有缺失值,过滤 area 列缺失值
3) 查看 'decorate' 'nearSubway' 缺失值,并将 : 'decorate' 缺失值填充为 ' 非精装 ',
'nearSubway' 缺失值填充为 ' 非近地铁 '
4) 查看 'name' 列楼盘信息,提取特定楼盘数据供后续使用(学号末尾奇数提取 誉峰三期
数据,偶数提取 招商大魔方 数据)
5) 特征编码: 'decorate' 列, ' 非精装 ' 编码为 0,' 精装 ' 编码为 1; 'nearSubway' 列, ' 非近地铁 '
编码为 0,' 近地铁 ' 编码为 1
6) 'rentFee' 列除以 1000 ,单位为千元
3 、单变量回归分析
构建数据集:以 'area' 列数据为特征 ,'rentFee' 列为标注信息,进行线性回归分析,分别
计算并打印训练和测试误差
4 、多变量回归分析
构建数据集:以 ['area', 'decorate', 'nearSubway'] 3 列数据为特征 ,'rentFee' 列为标注
信息,特征数据经归一化处理后,进行线性回归分析,分别计算并打印训练和测试误差
from gevent import monkey
# 猴子补丁
monkey.patch_all(thread=False)
import pandas as pd
import requests
from bs4 import BeautifulSoup
import xlwt
import gevent
from queue import Queue
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt


# Shared crawl state, mutated by the functions below.
Htmls = []            # detail-page URLs collected by parse_find_link
data_queue = Queue()  # URL work queue drained by the GetHtml_Second workers
Data = []             # parsed rows: [name, area, floor, totalFloor, decorate, nearSubway, rentFee]


# 先解析首页
# Crawl the listing pages first (page 1 has no "pg" URL segment).
def GetHtml_First():
    """Fetch all 40 Lianjia listing pages and collect detail-page links.

    Each page's HTML is handed to ``parse_find_link``, which appends
    the detail URLs to the global ``Htmls`` list.

    Raises:
        Exception: if any page responds with a non-200 status code.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1 Edg/122.0.0.0"
    }
    # Page 1 uses the bare path; pages 2-40 insert a "pg{i}" segment.
    urls = ["https://cd.lianjia.com/zufang/jinrongcheng/rt200600000001/"]
    urls += [
        f"https://cd.lianjia.com/zufang/jinrongcheng/pg{i}rt200600000001/#contentList"
        for i in range(2, 41)
    ]
    for url in urls:
        r = requests.get(url, headers=headers, timeout=60)
        if r.status_code != 200:
            # Abort the whole crawl (as before), but say which URL failed.
            raise Exception(f"Error: HTTP {r.status_code} for {url}")
        parse_find_link(r.text)


def GetHtml_Second():
    """Worker: drain the shared URL queue, parsing each detail page."""
    headers = {
        "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1 Edg/122.0.0.0"
    }
    while True:
        if data_queue.empty():
            break
        link = data_queue.get_nowait()
        resp = requests.get(link, headers=headers)
        if resp.status_code != 200:
            raise Exception("Error")
        print(link)
        parse_html(resp.text)


# 我先把每个房屋的链接爬取保存下来,再进入每个房子的主页里进行爬取房屋信息
# 不过好像也不用进入每个房子的主页里去爬取信息,本身页面就有信息
# 这里爬取会有点慢,可以自己再建立多个协程去爬取
# Collect each house's detail-page link from a listing page. (The listing
# page itself also carries the info, but we crawl the detail pages.)
def parse_find_link(html):
    """Extract house detail-page links from one listing page.

    Appends absolute URLs to the global ``Htmls`` list, skipping
    "apartment" entries (advertisements).

    Args:
        html: raw HTML text of a listing page.
    """
    soup = BeautifulSoup(html, "html.parser")
    for link in soup.find_all("a", class_="content__list--item--aside"):
        # .get() instead of attrs[...] so an anchor without an href is
        # skipped instead of raising KeyError.
        href = link.get("href")
        if not href or "apartment" in href:
            # Drop advertisement (apartment) links.
            continue
        Htmls.append("https://cd.lianjia.com" + href)


# 获取数据
# Parse one detail page into a data row.
def parse_html(html):
    """Parse one house detail page and append a row to the global ``Data``.

    Row layout: [name, area, floor, totalFloor, decorate, nearSubway,
    rentFee]; fields that cannot be located default to the string "None".

    Args:
        html: raw HTML text of a house detail page.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Estate name: first token of the title, with the "整租·" prefix removed.
    name = soup.find("p", class_="content__title").get_text().strip().split(" ")[0].replace("整租·", "")

    li = soup.find("ul", class_="content__aside__list").find_all("li")
    a_d_ifo = li[1].get_text().split(" ")

    # Area ("xx㎡"); keep "None" when the token is missing.
    area = "None"
    try:
        if a_d_ifo[1] is not None:
            area = a_d_ifo[1]
    except IndexError:
        pass

    # Decoration: the last token, unless it is actually the area string.
    decorate = "None"
    try:
        if "㎡" not in a_d_ifo[-1]:
            decorate = a_d_ifo[-1]
    except (IndexError, TypeError):
        pass

    # Monthly rent.
    rentFee = soup.find("div", class_="content__aside--title").find("span").get_text()

    # "楼层:中楼层/33层" -> ["楼层", "中楼层", "33层"] after normalizing ":".
    f_t_info = soup.find("div", id="info").find("ul").find_all("li")[7].get_text().replace(":", "/").split("/")
    floor = f_t_info[1]        # floor position (e.g. 中楼层)
    totalFloor = f_t_info[-1]  # total floors of the building

    # Subway-proximity tag; look it up once instead of twice.
    nearSubway = "None"
    subway_tag = soup.find("i", class_="content__item__tag--is_subway_house")
    if subway_tag is not None:
        tag_text = subway_tag.get_text()
        if tag_text is not None:
            nearSubway = tag_text

    Data.append([name, area, floor, totalFloor, decorate, nearSubway, rentFee])


# 写入excel
def Write_to_Excel(columns, data, sheet_name, xls_name):
    # 创建excel表对象
    workbook = xlwt.Workbook(encoding="utf-8")
    # 创建sheet表
    worksheet = workbook.add_sheet(sheet_name)
    # 将列名写入第一行
    for j in range(len(columns)):
        worksheet.write(0, j, columns[j])
    # 向excel中写入数据
    for x in range(len(data)):
        d = data[x]
        print(d)
        for y in range(len(columns)):
            worksheet.write((x + 1), y, d[y])

    # 保存文件,文件名为xls_name
    workbook.save(xls_name)
    print(f"{xls_name}保存成功")


# 创建十个协程
def start_work():
    jobs = []
    for item in range(10):
        job = gevent.spawn(GetHtml_Second)
        jobs.append(job)
    gevent.joinall(jobs)


# --- Stage 1: crawl ---
# Collect all detail-page links, then persist them to link.xls.
GetHtml_First()
workbook = xlwt.Workbook(encoding="utf-8")
worksheet = workbook.add_sheet("Sheet1")
worksheet.write(0, 0, "link")
for k in range(len(Htmls)):
    worksheet.write(k+1, 0, Htmls[k])
workbook.save("link.xls")

# Reload the saved links (round-trips through link.xls so a crawl can
# be resumed from the file) and flatten to a plain list of rows.
df = pd.read_excel("link.xls")
temp = np.array(df)
Html = temp.tolist()

# Enqueue every URL for the worker coroutines.
for i in Html:
    data_queue.put_nowait(i[0])

# Crawl all detail pages concurrently, then dump the rows to lianjia.xls.
start_work()
columns = ['name', 'area', 'floor', 'totalFloor', 'decorate', 'nearSubway', 'rentFee']
Write_to_Excel(columns, Data, "Sheet2", "lianjia.xls")

# --- Stage 2: data preparation (cleaning / stats / transforms) ---
df = pd.read_excel("lianjia.xls")
df = df[["name", "area", "decorate", "nearSubway", "rentFee"]]
# Missing-value counts per column, before cleaning.
print(df.isnull().sum())

# 'area' is the core regression feature and must not be missing: drop
# rows without it (the later float(x[:-1]) conversion would crash on NaN).
df = df.dropna(subset=["area"])

# Fill the remaining categorical gaps with explicit "not X" labels.
df = df.fillna({"decorate": "非精装修", "nearSubway": "非近地铁"})
print(df.isnull().sum())

# Inspect rows for the selected estate.
print(df[df["name"] == "誉峰三期"])

# Encode the two categorical features as 0/1.
df.loc[df["decorate"] == "非精装修", ['decorate']] = 0
df.loc[df["decorate"] == "精装修", ['decorate']] = 1
df.loc[df["nearSubway"] == "非近地铁", ['nearSubway']] = 0
df.loc[df["nearSubway"] == "近地铁", ['nearSubway']] = 1

# Express rent in thousands of yuan.
df["rentFee"] = df["rentFee"] / 1000

print("*"*20)
# 构建数据集
x = np.array(df["area"])
# 把area的类型转为float
for i in range(len(x)):
    x[i] = float(x[i][:-1])

y = np.array(df["rentFee"])
# 把x转为二维数组
x = x.reshape(-1, 1)

# 将数据集拆分为训练集和测试集
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=2)

# 进行线性回归模型的训练
model = LinearRegression()
model.fit(x_train, y_train)

# 用训练好的模型对测试集进行预测,并计算训练误差和测试误差
train_predictions = model.predict(x_train)
train_error = mean_squared_error(y_train, train_predictions)
print("训练误差:", train_error)

# 计算测试误差
test_predictions = model.predict(x_test)
test_error = mean_squared_error(y_test, test_predictions)
print("测试误差:", train_error)

# plt.scatter(y_test, test_predictions, s=5)
# plt.xlabel('real value')
# plt.ylabel('predict value')
# plt.title('predict value vs real value')
# std_x = np.arange(0, 30, 0.1)
# std_y = np.arange(0, 30, 0.1)
# plt.plot(std_x, std_y, c='r')
# plt.show()

print(model.score(x_test, y_test))
print("*"*20)

# --- Stage 4: multi-variable regression, [area, decorate, nearSubway] -> rentFee ---
f = np.array(df[["area", "decorate", "nearSubway"]])
# Convert the "xx㎡" area strings in column 0 to floats.
for i in range(len(f)):
    f[i][0] = float(f[i][0][:-1])

# Min-max normalise the area column only (decorate/nearSubway are already 0/1).
# BUG FIX: np.max(f) took the maximum over ALL three columns; the maximum
# must come from the area column alone, matching how the minimum is taken.
max_area = max(row[0] for row in f)
min_area = min(row[0] for row in f)
normal = max_area - min_area
for i in range(len(f)):
    f[i][0] = (f[i][0] - min_area) / normal
g = np.array(df["rentFee"])

# 70/30 split with the same seed as the single-variable model.
x_train, x_test, y_train, y_test = train_test_split(f, g, test_size=0.3, random_state=2)

# Fit the linear model on the training split.
model = LinearRegression()
model.fit(x_train, y_train)

# Training error: MSE on the training set.
train_predictions = model.predict(x_train)
train_error = mean_squared_error(y_train, train_predictions)
print("训练误差:", train_error)

# Test error: MSE on the held-out set.
# BUG FIX: the original printed train_error under the test-error label.
test_predictions = model.predict(x_test)
test_error = mean_squared_error(y_test, test_predictions)
print("测试误差:", test_error)
# R^2 on the test set.
print(model.score(x_test, y_test))
print("*"*20)

  • 9
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

qq_73931224

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值