import aiohttp
import asyncio
import aiomysql
import redis
import chardet
from lxml import etree
import hashlib
class carSpider:
    redisClient = redis.Redis()

    # Initialise the listing-page URL template, the detail-API URL template and the request headers
    def __init__(self):
        self.url = 'https://www.che168.com/china/a0_0msdgscncgpi1ltocsp{}exf4x0/?pvareaid=102179#currengpostion'
        self.api_url = 'https://cacheapigo.che168.com/CarProduct/GetParam.ashx?specid={}'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
        }

    # Extract the specid values from a listing page; they are needed to build the detail-API URL
    async def get_car_id(self, page, session, pool):
        async with session.get(self.url.format(page), headers=self.headers) as response:
            content = await response.read()  # raw bytes of the page
            # Detect the page encoding
            encoding = chardet.detect(content)['encoding']
            # print(encoding)  # print the detected encoding
            if encoding == 'GB2312' or encoding == 'ISO-8859-1':  # ISO-8859-1 is what gets reported on Windows
                html = content.decode('gbk')
            else:
                # When the encoding is not GB2312, the returned page carries no specid fields --
                # this is one of the server's anti-scraping measures
                html = content.decode(encoding)
                print('Got an anti-scraping response...')
            tree = etree.HTML(html)
            id_list = tree.xpath("//ul[@class='viewlist_ul']/li/@specid")
            # print(id_list)
            if id_list:
                tasks = [asyncio.create_task(self.get_car_info(specid, session, pool)) for specid in id_list]
                await asyncio.wait(tasks)

    # Plug the id into the detail-API URL and fetch the detail data
    async def get_car_info(self, specid, session, pool):
        tasks = []
        async with session.get(self.api_url.format(specid), headers=self.headers) as response:
            infos = await response.json()
            item = dict()
            if infos['result']['paramtypeitems']:
                paraInfo = infos['result']['paramtypeitems'][0]  # basic parameters
                item['name'] = paraInfo['paramitems'][0]['value']
                item['price'] = paraInfo['paramitems'][1]['value']
                item['brand'] = paraInfo['paramitems'][2]['value']
                bodyInfo = infos['result']['paramtypeitems'][1]  # body dimensions
                item['length'] = bodyInfo['paramitems'][0]['value']   # length
                item['breadth'] = bodyInfo['paramitems'][1]['value']  # width
                item['altitude'] = bodyInfo['paramitems'][2]['value']  # height
                tasks.append(asyncio.create_task(self.save_car_info(item, pool)))
            else:
                print('No data for this spec id...')
        if tasks:  # asyncio.wait() raises ValueError when given an empty set
            await asyncio.wait(tasks)

    # Compute an MD5 fingerprint of the item, used as its dedup key
    @staticmethod
    def getmd5(item):
        md5info = hashlib.md5(str(item).encode('utf-8')).hexdigest()
        return md5info

    # Save one record to MySQL
    async def save_car_info(self, item, pool):
        async with pool.acquire() as conn:
            async with conn.cursor() as cursor:
                # Deduplicate via the MD5 fingerprint: sadd returns 1 only if the value is new
                md5Info = self.getmd5(item)
                isMd5Insert = self.redisClient.sadd('car:filter', md5Info)
                if isMd5Insert:
                    sql = """insert into carInfo(
                        id, name, price, brand, length, breadth, altitude
                    ) values(%s, %s, %s, %s, %s, %s, %s)
                    """
                    try:
                        await cursor.execute(
                            sql,
                            (0,  # 0 lets auto_increment assign the real id
                             item['name'],
                             item['price'],
                             item['brand'],
                             item['length'],
                             item['breadth'],
                             item['altitude']
                             ))
                        await conn.commit()
                        print('Row inserted...')
                    except Exception as e:
                        print('Insert failed...', e)
                        await conn.rollback()
                else:
                    print('Duplicate record, skipped...')

    # Entry point: create the MySQL pool, make sure the table exists, then crawl the listing pages
    async def main(self):
        # Create the connection pool; connections and cursors are taken from it below
        async with aiomysql.create_pool(host='localhost', port=3306,
                                        user='root', password='admin', db='py_spider') as pool:
            # Connection used only to create the table
            async with pool.acquire() as conn:
                async with conn.cursor() as cursor:
                    # Table definition
                    create_sql = """create table carInfo(
                        id int primary key auto_increment,
                        name varchar(100),
                        price varchar(100),
                        brand varchar(100),
                        altitude varchar(100),
                        breadth varchar(100),  # width
                        length varchar(100)  # length
                    );
                    """
                    check_table_sql = "show tables like 'carInfo'"
                    # Create the table only if it does not exist yet
                    result = await cursor.execute(check_table_sql)
                    if not result:
                        print('Creating table carInfo...')
                        await cursor.execute(create_sql)
            # One shared aiohttp session for all listing pages
            async with aiohttp.ClientSession() as session:
                tasks = [asyncio.create_task(self.get_car_id(page, session, pool)) for page in range(1, 16)]
                await asyncio.wait(tasks)

if __name__ == '__main__':
    car = carSpider()
    # asyncio.run(car.main())
    loop = asyncio.get_event_loop()
    loop.run_until_complete(car.main())