1
、数据爬取
采用爬虫技术(
urllib
库,
BeautifulSoup
库,
re
库)从链家网站(参考网址:
https://cd.lianjia.com/zufang/jinrongcheng/pg2rt200600000001/#contentList
)获取
”
链家
/
成
都市
/
高新区
/
金融城
/
整租
”
租房信息(爬取
40
页数据),从各房屋信息中提取
”
楼盘名称
/
面积
/
装修
/
近地铁
/
楼层
/
总楼层
/
租金
”
信息,并将数据写入
lianjia.xls
文件(标题行为
['name', 'area', 'floor', 'totalFloor', 'decorate', 'nearSubway','rentFee']
)
2
、机器学习数据准备(数据清洗、统计分析和数据变换等)
读取
lianjia.xls
数据,存储到
dataframe
对象,完成以下内容:
1)
提取
['name','area','decorate','nearSubway','rentFee']
共
5
列数据,用于后续机器学习
2)
房屋面积
area
是回归分析的核心参数,不能有缺失值,过滤
area
列缺失值
3)
查看
'decorate'
和
'nearSubway'
缺失值,并将
: 'decorate'
缺失值填充为
'
非精装
',
'nearSubway'
缺失值填充为
'
非近地铁
'
4)
查看
’name’
列楼盘信息,提取特定楼盘数据共后续使用(学号末尾奇数提取
’
誉峰三
期
’
数据,偶数提取
’
招商大魔方
’
数据
5)
特征编码:
'decorate'
列,
'
非精装
'
编码为
0,'
精装
'
编码为
1
;
'nearSubway'
列,
'
非近地铁
'
编
码为
0,'
近地铁
'
编码为
1
6) 'rentFee'
列除以
1000
,单位为千元
3
、单变量回归分析
构建数据集:以
'area'
列数据为特征
,'rentFee'
列为标注信息,进行线性回归分析,分别
计算并打印训练和测试误差
4
、多变量回归分析
构建数据集:以
['area', 'decorate', 'nearSubway']
共
3
列数据为特征
,'rentFee'
列为标注
信息,特征数据经归一化处理后,进行线性回归分析,分别计算并打印训练和测试误
差
from gevent import monkey
# 猴子补丁
monkey.patch_all(thread=False)
import pandas as pd
import requests
from bs4 import BeautifulSoup
import xlwt
import gevent
from queue import Queue
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
# Detail-page URLs collected from the listing pages
Htmls = []
# URL queue consumed concurrently by the gevent workers
data_queue = Queue()
# Scraped rows: [name, area, floor, totalFloor, decorate, nearSubway, rentFee]
Data = []
# 先解析首页
def GetHtml_First():
    """Fetch all 40 listing pages (page 1 plus pages 2-40) and collect house links.

    Raises a generic Exception on any non-200 HTTP response.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1 Edg/122.0.0.0"
    }
    # Page 1 has its own URL shape; pages 2-40 are paginated with a pg<N> prefix.
    page_urls = ["https://cd.lianjia.com/zufang/jinrongcheng/rt200600000001/"]
    page_urls += [
        f"https://cd.lianjia.com/zufang/jinrongcheng/pg{n}rt200600000001/#contentList"
        for n in range(2, 41)
    ]
    for page_url in page_urls:
        resp = requests.get(page_url, headers=headers, timeout=60)
        if resp.status_code != 200:
            raise Exception("Error")
        parse_find_link(resp.text)
def GetHtml_Second():
    """Worker: drain the shared URL queue and parse every detail page it pops."""
    headers = {
        "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1 Edg/122.0.0.0"
    }
    while True:
        if data_queue.empty():
            break
        url = data_queue.get_nowait()
        resp = requests.get(url, headers=headers)
        if resp.status_code != 200:
            raise Exception("Error")
        print(url)
        parse_html(resp.text)
# 我先把每个房屋的链接爬取保存下来,再进入每个房子的主页里进行爬取房屋信息
# 不过好像也不用进入每个房子的主页里去爬取信息,本身页面就有信息
# 这里爬取会有点慢,可以自己再建立多个协程去爬取
def parse_find_link(html):
    """Extract per-house detail links from one listing page into the global Htmls.

    Links containing 'apartment' are sponsored/ad entries and are skipped.
    """
    soup = BeautifulSoup(html, "html.parser")
    for anchor in soup.find_all("a", class_="content__list--item--aside"):
        href = anchor.attrs["href"]
        if 'apartment' not in href:
            Htmls.append("https://cd.lianjia.com" + href)
# 获取数据
def parse_html(html):
    """Parse one house detail page and append a 7-field row to the global Data.

    Row layout: [name, area, floor, totalFloor, decorate, nearSubway, rentFee].
    Fields that cannot be found default to the string "None" so every row keeps
    the same width for the later Excel export.
    """
    soup = BeautifulSoup(html, "html.parser")
    # House name: first whitespace token of the title, minus the "整租·" prefix.
    name = soup.find("p", class_="content__title").get_text().strip().split(" ")[0].replace("整租·", "")
    li = soup.find("ul", class_="content__aside__list").find_all("li")
    a_d_ifo = li[1].get_text().split(" ")
    # Area: second token of the aside info line (e.g. "85㎡").
    # FIX: was a bare try/except around an always-true `is not None` check;
    # the only thing it guarded against was an IndexError, so test length instead.
    area = "None"
    if len(a_d_ifo) > 1:
        area = a_d_ifo[1]
    # Decoration: last token, unless that token is actually the area ("㎡").
    decorate = "None"
    if "㎡" not in a_d_ifo[-1]:
        decorate = a_d_ifo[-1]
    # Monthly rent (numeric string inside the aside title span).
    rentFee = soup.find("div", class_="content__aside--title").find("span").get_text()
    f_t_info = soup.find("div", id="info").find("ul").find_all("li")[7].get_text().replace(":", "/").split("/")
    # Floor / total floors come from the same "楼层" info entry.
    floor = f_t_info[1]
    totalFloor = f_t_info[-1]
    # Near-subway tag is optional; FIX: look it up once instead of twice,
    # and replace the bare except with an explicit None check.
    subway_tag = soup.find("i", class_="content__item__tag--is_subway_house")
    nearSubway = subway_tag.get_text() if subway_tag is not None else "None"
    Data.append([name, area, floor, totalFloor, decorate, nearSubway, rentFee])
# 写入excel
def Write_to_Excel(columns, data, sheet_name, xls_name):
    """Write a header row plus all data rows into a new .xls workbook.

    columns    -- header labels written to row 0
    data       -- iterable of rows; each row is indexable by column position
    sheet_name -- name of the single sheet created
    xls_name   -- output filename the workbook is saved under
    """
    workbook = xlwt.Workbook(encoding="utf-8")
    worksheet = workbook.add_sheet(sheet_name)
    # Header row
    for col, title in enumerate(columns):
        worksheet.write(0, col, title)
    # Data rows start at row 1, directly below the header
    for row, record in enumerate(data, start=1):
        print(record)
        for col in range(len(columns)):
            worksheet.write(row, col, record[col])
    workbook.save(xls_name)
    print(f"{xls_name}保存成功")
# 创建十个协程
def start_work():
    """Spawn ten gevent workers that concurrently drain the URL queue."""
    workers = [gevent.spawn(GetHtml_Second) for _ in range(10)]
    gevent.joinall(workers)
# ---- Crawl driver: collect links, persist them, then scrape every detail page ----
GetHtml_First()
# Persist the collected links to link.xls (one "link" column).
link_book = xlwt.Workbook(encoding="utf-8")
link_sheet = link_book.add_sheet("Sheet1")
link_sheet.write(0, 0, "link")
for idx, link in enumerate(Htmls, start=1):
    link_sheet.write(idx, 0, link)
link_book.save("link.xls")
# Reload the saved links and feed them into the worker queue.
df = pd.read_excel("link.xls")
for row in np.array(df).tolist():
    data_queue.put_nowait(row[0])
start_work()
columns = ['name', 'area', 'floor', 'totalFloor', 'decorate', 'nearSubway', 'rentFee']
Write_to_Excel(columns, Data, "Sheet2", "lianjia.xls")
# ---- Data preparation: load, select columns, clean, encode ----
df = pd.read_excel("lianjia.xls")
df = df[["name", "area", "decorate", "nearSubway", "rentFee"]]
# Count missing values per column before cleaning
print(df.isnull().sum())
# FIX: area is the core regression feature and must not be missing
# (a NaN area would later crash float(x[:-1])); drop such rows as the
# assignment requires.
df = df.dropna(subset=["area"])
# Fill missing categorical values with the "negative" category
df = df.fillna({"decorate": "非精装修", "nearSubway": "非近地铁"})
print(df.isnull().sum())
# Inspect the rows of one specific estate
print(df[df["name"] == "誉峰三期"])
# Feature encoding: map each category to 0/1
df.loc[df["decorate"] == "非精装修", ['decorate']] = 0
df.loc[df["decorate"] == "精装修", ['decorate']] = 1
df.loc[df["nearSubway"] == "非近地铁", ['nearSubway']] = 0
df.loc[df["nearSubway"] == "近地铁", ['nearSubway']] = 1
# Convert rent to thousands of yuan
df["rentFee"] = df["rentFee"] / 1000
print("*"*20)
# ---- Single-variable regression: area -> rentFee ----
x = np.array(df["area"])
# Area values look like "85㎡": strip the trailing unit and convert to float
for i in range(len(x)):
    x[i] = float(x[i][:-1])
y = np.array(df["rentFee"])
# sklearn expects a 2-D feature matrix
x = x.reshape(-1, 1)
# Split into 70% train / 30% test with a fixed seed for reproducibility
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=2)
# Fit the linear model
model = LinearRegression()
model.fit(x_train, y_train)
# Training error (MSE on the training split)
train_predictions = model.predict(x_train)
train_error = mean_squared_error(y_train, train_predictions)
print("训练误差:", train_error)
# Test error (MSE on the held-out split)
test_predictions = model.predict(x_test)
test_error = mean_squared_error(y_test, test_predictions)
# FIX: originally printed train_error under the "测试误差" label
print("测试误差:", test_error)
print(model.score(x_test, y_test))
print("*"*20)
print("*"*20)
# ---- Multi-variable regression: [area, decorate, nearSubway] -> rentFee ----
f = np.array(df[["area", "decorate", "nearSubway"]])
# Strip the "㎡" unit from the area column and convert to float
for i in range(len(f)):
    f[i][0] = float(f[i][0][:-1])
# Min-max normalization of the area column only (decorate/nearSubway are
# already 0/1 and need no scaling).
# FIX: np.max(f) took the max over ALL three columns, mixing the 0/1 flags
# into the area range; compute both extremes over column 0 only.
area_values = [row[0] for row in f]
max_area = max(area_values)
min_area = min(area_values)
normal = max_area - min_area
for i in range(len(f)):
    # min-max normalization
    f[i][0] = (f[i][0] - min_area) / normal
g = np.array(df["rentFee"])
# Split into 70% train / 30% test with a fixed seed for reproducibility
x_train, x_test, y_train, y_test = train_test_split(f, g, test_size=0.3, random_state=2)
# Fit the linear model
model = LinearRegression()
model.fit(x_train, y_train)
# Training error (MSE on the training split)
train_predictions = model.predict(x_train)
train_error = mean_squared_error(y_train, train_predictions)
print("训练误差:", train_error)
# Test error (MSE on the held-out split)
test_predictions = model.predict(x_test)
test_error = mean_squared_error(y_test, test_predictions)
# FIX: originally printed train_error under the "测试误差" label
print("测试误差:", test_error)
print(model.score(x_test, y_test))
print("*"*20)