python 爬取链家北京租房信息

最新推荐文章于 2024-06-24 16:06:48 发布

赵雷_

最新推荐文章于 2024-06-24 16:06:48 发布

阅读量1k

点赞数 1

分类专栏： python

本文链接：https://blog.csdn.net/qq_39238370/article/details/113887463

版权

python 专栏收录该内容

8 篇文章 0 订阅

订阅专栏

结果图镇楼。无图无真相。。。。嘿嘿

参考了一篇爬取链家石家庄房源的文章，但由于网站的页面规则已经改变，那篇文章里的代码已经无法使用，所以我重新写了一份。

https://blog.csdn.net/hihell/article/details/84029492

一、效果图

二、代码

import re
from fake_useragent import UserAgent
from lxml import etree
import asyncio
import aiohttp
import pandas as pd

# 定义一个类 定义使用的变量  定义get方法通过连接池进行网络请求
class LianjiaSpider(object):
    """Crawl Beijing rental listings from Lianjia and export them to a CSV file.

    Listing pages are downloaded with aiohttp and parsed with lxml. Every
    listing is reduced to a flat dict that is accumulated in ``self._data``;
    ``parse_html`` writes the accumulated rows to disk once all pages have
    been visited.
    """

    # Total time budget, in seconds, for a single HTTP request.
    REQUEST_TIMEOUT = 3

    # Extracts the numbers from a layout string such as "5室1厅2卫".
    _DIGITS = re.compile(r'\d+')

    def __init__(self):
        self.ua = UserAgent()  # source of random User-Agent strings
        self.head = {"User-Agent": self.ua.random}
        self._data = list()  # one dict per successfully parsed listing

    async def get_page_count(self):
        """Return the number of result pages, or 0 if it cannot be determined.

        The count is read from the ``data-totalpage`` attribute of the
        pagination ``div`` on the first result page.
        """
        result = await self.get("https://bj.lianjia.com/zufang/pg1")
        if not result:
            return 0
        page_html = etree.HTML(result)
        if page_html is None:
            return 0
        totals = page_html.xpath(".//div[@class='content__pg']/@data-totalpage")
        try:
            return int(totals[0])
        except (IndexError, ValueError):
            # Pagination element missing or not numeric.
            return 0

    async def get(self, url):
        """Fetch *url* and return the response body as text.

        Returns ``None`` when the request fails, times out, or the server
        answers with a status other than 200, so callers must check the
        result before parsing it.
        """
        timeout = aiohttp.ClientTimeout(total=self.REQUEST_TIMEOUT)
        try:
            async with aiohttp.ClientSession(timeout=timeout) as session:
                async with session.get(url, headers=self.head) as resp:
                    if resp.status == 200:
                        return await resp.text()
                    print("请求失败{}: HTTP {}".format(url, resp.status))
        except Exception as e:
            # Deliberately broad: a single failed page is reported and
            # skipped instead of aborting the whole crawl.
            print("请求失败{}: {}".format(url, e.args))
        return None

    async def parse_html(self):
        """Crawl every result page, then write the collected rows to CSV.

        Nothing is written when the page count cannot be determined, so an
        existing output file is not overwritten by a failed run.
        """
        count = await self.get_page_count()
        if count < 1:
            print("无法获取总页数，已终止爬取")
            return

        # The page count is inclusive, so the last page is ``count`` itself.
        for page in range(1, count + 1):
            url = "https://bj.lianjia.com/zufang/pg{}/".format(page)
            print("正在爬取{}".format(url))
            text = await self.get(url)
            if not text:
                print("页面获取失败，跳过{}".format(url))
                continue
            tree = etree.HTML(text)
            if tree is None:
                print("页面获取失败，跳过{}".format(url))
                continue
            await self.parse_page(tree)

        print("正在存储数据....")
        print(len(self._data))
        data = pd.DataFrame(self._data)
        # utf_8_sig writes a BOM so spreadsheet tools detect the encoding.
        data.to_csv("链家网租房数据.csv", encoding='utf_8_sig')

    def run(self):
        """Run the crawl to completion on a private event loop.

        Exceptions raised by the crawl propagate to the caller instead of
        being left unretrieved on a task.
        """
        loop = asyncio.new_event_loop()
        try:
            loop.run_until_complete(self.parse_html())
        finally:
            loop.close()

    async def parse_page(self, html):
        """Extract every listing found in *html* and append it to ``self._data``.

        *html* is a parsed page, i.e. any object exposing ``xpath()``.
        Listings that cannot be parsed are reported and skipped.
        """
        for div in html.xpath(".//div[@class='content__list--item']"):
            record = self._parse_item(div)
            if record is not None:
                self._data.append(record)

    def _parse_item(self, div):
        """Build the row dict for one listing node, or return None to skip it."""
        imgurl = self._first(div.xpath(".//a[@class='content__list--item--aside']/img/@src"))
        title = self._first(div.xpath(".//a[@class='content__list--item--aside']/img/@alt"))
        price = self._first(div.xpath(".//span[@class='content__list--item-price']/em/text()"))
        floor = div.xpath(".//span[@class='hide']/text()")
        description = div.xpath(".//p[@class='content__list--item--des']/text()")

        # The floor is read from the second text node; some listings omit it.
        current_floor = self._clean(floor[1]) if len(floor) > 1 else ''

        # Keep only the non-empty fragments: size, orientation, layout.
        parts = [self._clean(text).replace("-", "") for text in description]
        parts = [part for part in parts if part != '']
        if len(parts) < 3:
            print("跳过无法解析的房源: {}".format(title))
            return None
        size, direction = parts[0], parts[1]  # e.g. "30㎡", "南"
        numbers = self._DIGITS.findall(parts[2])  # e.g. "5室1厅2卫" -> 5, 1, 2

        if len(numbers) == 3:
            rooms, halls, baths = numbers
        elif len(numbers) == 2:
            # Two numbers are taken as rooms and bathrooms, with no hall.
            rooms, halls, baths = numbers[0], 0, numbers[1]
        else:
            print("跳过无法解析的房源: {}".format(title))
            return None

        return {
            "图片地址": imgurl,
            "标题": title,
            "价格": price,
            "楼层": current_floor,
            "大小": size,
            "朝向": direction,
            "室": rooms,
            "厅": halls,
            "卫": baths,
        }

    @staticmethod
    def _first(values, default=''):
        """Return the first element of *values*, or *default* when it is empty."""
        return values[0] if values else default

    @staticmethod
    def _clean(text):
        """Strip every space and newline from *text*."""
        return text.replace("\n", "").replace(" ", "")

if __name__ == '__main__':
    # Script entry point: build the spider and run the crawl to completion.
    spider = LianjiaSpider()
    spider.run()

三、总结

搜索for循环

替换字符串

len长度函数

etree 根据class 解析，

使用正则表达式 \d+ 提取字符串中的数字

使用 fake_useragent 生成随机的 User-Agent 请求头（headers）

协程的使用

使用 map 将 list 中的 str 转为 int：list(map(int, strList))；清空 list 的方法：clear()

if / elif / else 的使用（Python 中没有 elseif，对应的关键字是 elif）