import aiohttp
import asyncio
import aiomysql
import redis
import chardet
from lxml import etree
import hashlib
class carSpider:
    redisClient = redis.Redis()

    # Initialise the listing-page URL template, the detail-API URL template and the request headers
    def __init__(self):
        self.url = 'https://www.che168.com/china/a0_0msdgscncgpi1ltocsp{}exf4x0/?pvareaid=102179#currengpostion'
        self.api_url = 'https://cacheapigo.che168.com/CarProduct/GetParam.ashx?specid={}'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
        }

    # Extract the specid values from a listing page; they are needed to build the detail-API URL
    async def get_car_id(self, page, session, pool):
        async with session.get(self.url.format(page), headers=self.headers) as response:
            content = await response.read()  # raw bytes of the page
            # Detect the page encoding
            encoding = chardet.detect(content)['encoding']
            # print(encoding)  # print the detected encoding
            if encoding == 'GB2312' or encoding == 'ISO-8859-1':  # ISO-8859-1 is what gets reported on Windows
                html = content.decode('gbk')
            else:
                # When the encoding is not GB2312, the returned page carries no specid fields --
                # this is one of the server's anti-scraping measures
                html = content.decode(encoding)
                print('Got an anti-scraping response...')
            tree = etree.HTML(html)
            id_list = tree.xpath("//ul[@class='viewlist_ul']/li/@specid")
            # print(id_list)
            if id_list:
                tasks = [asyncio.create_task(self.get_car_info(specid, session, pool)) for specid in id_list]
                await asyncio.wait(tasks)

    # Plug the id into the detail-API URL and fetch the detail data
    async def get_car_info(self, specid, session, pool):
        tasks = []
        async with session.get(self.api_url.format(specid), headers=self.headers) as response:
            infos = await response.json()
            item = dict()
            if infos['result']['paramtypeitems']:
                paraInfo = infos['result']['paramtypeitems'][0]  # basic parameters
                item['name'] = paraInfo['paramitems'][0]['value']
                item['price'] = paraInfo['paramitems'][1]['value']
                item['brand'] = paraInfo['paramitems'][2]['value']
                bodyInfo = infos['result']['paramtypeitems'][1]  # body dimensions
                item['length'] = bodyInfo['paramitems'][0]['value']   # length
                item['breadth'] = bodyInfo['paramitems'][1]['value']  # width
                item['altitude'] = bodyInfo['paramitems'][2]['value']  # height
                tasks.append(asyncio.create_task(self.save_car_info(item, pool)))
            else:
                print('No data for this spec id...')
        if tasks:  # asyncio.wait() raises ValueError when given an empty set
            await asyncio.wait(tasks)

    # Compute an MD5 fingerprint of the item, used as its dedup key
    @staticmethod
    def getmd5(item):
        md5info = hashlib.md5(str(item).encode('utf-8')).hexdigest()
        return md5info

    # Save one record to MySQL
    async def save_car_info(self, item, pool):
        async with pool.acquire() as conn:
            async with conn.cursor() as cursor:
                # Deduplicate via the MD5 fingerprint: sadd returns 1 only if the value is new
                md5Info = self.getmd5(item)
                isMd5Insert = self.redisClient.sadd('car:filter', md5Info)
                if isMd5Insert:
                    sql = """insert into carInfo(
                        id, name, price, brand, length, breadth, altitude
                    ) values(%s, %s, %s, %s, %s, %s, %s)
                    """
                    try:
                        await cursor.execute(
                            sql,
                            (0,  # 0 lets auto_increment assign the real id
                             item['name'],
                             item['price'],
                             item['brand'],
                             item['length'],
                             item['breadth'],
                             item['altitude']
                             ))
                        await conn.commit()
                        print('Row inserted...')
                    except Exception as e:
                        print('Insert failed...', e)
                        await conn.rollback()
                else:
                    print('Duplicate record, skipped...')

    # Entry point: create the MySQL pool, make sure the table exists, then crawl the listing pages
    async def main(self):
        # Create the connection pool; connections and cursors are taken from it below
        async with aiomysql.create_pool(host='localhost', port=3306,
                                        user='root', password='admin', db='py_spider') as pool:
            # Connection used only to create the table
            async with pool.acquire() as conn:
                async with conn.cursor() as cursor:
                    # Table definition
                    create_sql = """create table carInfo(
                        id int primary key auto_increment,
                        name varchar(100),
                        price varchar(100),
                        brand varchar(100),
                        altitude varchar(100),
                        breadth varchar(100),  # width
                        length varchar(100)  # length
                    );
                    """
                    check_table_sql = "show tables like 'carInfo'"
                    # Create the table only if it does not exist yet
                    result = await cursor.execute(check_table_sql)
                    if not result:
                        print('Creating table carInfo...')
                        await cursor.execute(create_sql)
            # One shared aiohttp session for all listing pages
            async with aiohttp.ClientSession() as session:
                tasks = [asyncio.create_task(self.get_car_id(page, session, pool)) for page in range(1, 16)]
                await asyncio.wait(tasks)

if __name__ == '__main__':
    car = carSpider()
    # asyncio.run(car.main())
    loop = asyncio.get_event_loop()
    loop.run_until_complete(car.main())