前言
本插件是基于Nonebot2 的QQ聊天机器人。通过集成京东商品爬虫,将京东网站中的优质产品推荐给用户,本文对京东官网源码信息进行了分析,并附上了完整的源代码。文末有一个小彩蛋,感兴趣的小伙伴可以瞧一瞧!
商品页面分析
首先打开京东主页,搜索商品,找到对应的网络请求包!
可以发现这里发送的是一个get请求,携带的参数有3个,经过分析,对我们最重要的就是keyword参数,这个参数的值是所搜索商品名称经过URL编码后的结果。
分析商品信息在网页源代码中的位置(我们应如何提取代码中的信息)
通过观察可以发现:框框中的一串数字为商品的唯一标识,通过这个标识可以看到商品详细信息
所以我们只需进行url拼接,即可达到商品详细信息页面!
然后对商品详细信息页面进行分析,提取详细信息即可!
分析到这里可以先看一下,插件的效果图,然后进行代码的编写!(兴趣提升)
京东商品搜索功能!搜出京东网站中该类商品的排名前30,然后通过QQ机器人转发给用户。
每一个商品都有一个ID在上述排行中已经展示了出来,可以通过该ID查看商品详细信息!
插件编写
京东爬虫(负责爬取京东商品信息)
# 获取物品列表
import httpx
from random import choice
from bs4 import BeautifulSoup
from nonebot.adapters.onebot.v11 import MessageSegment
# Pool of browser User-Agent strings. The scraper functions in this module
# pick one at random with ``choice(user_agent)`` for every HTTP request they
# send, so that consecutive requests do not all carry the same header.
# NOTE(review): some entries are duplicated, which makes them proportionally
# more likely to be picked — confirm whether that is intentional.
user_agent = [
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
"Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
"Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
"Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
"Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
"Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
"Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
"Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
"Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
"Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
"Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
"MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
"Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
"Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
"Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
"Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
"Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
"UCWEB7.0.2.37/28/999",
"NOKIA5700/ UCWEB7.0.2.37/28/999",
"Openwave/ UCWEB7.0.2.37/28/999",
"Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999",
# iPhone 6:
"Mozilla/6.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/8.0 Mobile/10A5376e Safari/8536.25",
]
def _parse_search_item(item):
    """Extract ``(image_url, text)`` from one ``.gl-item`` node.

    Returns ``None`` when the node lacks any of the pieces the summary needs
    (image, name, price element, or their attributes), so that one malformed
    result tile does not abort the whole search.
    """
    img_tags = item.select('.p-img img')
    name_tags = item.select('.p-name em')
    price_tags = item.select('.p-price i')
    if not (img_tags and name_tags and price_tags):
        return None

    lazy_img = img_tags[0].get('data-lazy-img')
    # The product ID is read from the ``data-price`` attribute of the price
    # element. NOTE(review): presumably JD stores the SKU there — confirm.
    item_id = price_tags[0].get('data-price')
    if not lazy_img or not item_id:
        return None

    result = (
        '\n名称:' + name_tags[0].text.replace(' ', '\n\t\t')
        + '\nID:' + item_id
        + '\n价格:' + price_tags[0].text.replace(' ', '') + '¥'
        + '\n链接:' + 'https://item.jd.com/' + item_id + '.html'
    )
    return 'https:' + lazy_img, result


async def get_item_search_result_list(item_name: str):
    """Search JD for ``item_name`` and summarise every result found.

    Args:
        item_name: Search keyword; it is sent as the ``keyword`` query
            parameter (httpx takes care of URL-encoding it).

    Returns:
        A ``(search_result_list, search_result_str)`` tuple.
        ``search_result_list`` holds one entry per product, each being a list
        of two message-segment dicts (an image segment and a text segment).
        ``search_result_str`` is all text summaries joined into one string,
        delimited by ``==================`` lines.
    """
    url = 'https://search.jd.com/Search?'
    param = {
        'keyword': item_name
    }
    # Equivalent to requesting
    # https://search.jd.com/Search?keyword=%E8%8B%B9%E6%9E%9C%E6%89%8B%E6%9C%BA
    # i.e. https://search.jd.com/Search?keyword=苹果手机
    # An async client is used so the coroutine does not block the event loop
    # while waiting for the response.
    async with httpx.AsyncClient() as client:
        response = await client.get(
            url, params=param, headers={"User-Agent": choice(user_agent)}
        )
    item_list = BeautifulSoup(response.text, 'lxml').select('.gl-item')

    separator = '=================='
    text_parts = [separator]
    search_result_list = []
    for item in item_list:
        parsed = _parse_search_item(item)
        if parsed is None:
            # Skip tiles that do not have the expected structure.
            continue
        item_img_url, result = parsed
        text_parts.append(result + '\n' + separator)
        search_result_list.append([
            {"type": "image", "data": {"file": item_img_url, }},
            {"type": "text", "data": {"text": result}},
        ])
    return search_result_list, ''.join(text_parts)
async def get_item_detail(item_id: str):
    """Fetch the JD detail page of a product and format its parameters.

    Args:
        item_id: Product ID as typed by the user. Surrounding whitespace is
            ignored; anything other than ASCII digits is rejected before a
            request is made, because the ID is inserted into the URL path.

    Returns:
        On success, an image segment followed by the formatted text (brand,
        parameter list, product link); if the page has no usable picture, the
        formatted text alone. On failure, a plain string describing the
        problem (invalid ID, non-200 status code, or a page without the
        expected parameter nodes).
    """
    item_id = item_id.strip()
    if not (item_id.isascii() and item_id.isdigit()):
        return '获取失败,商品ID应为纯数字'

    url = 'https://item.jd.com/' + item_id + '.html'
    # An async client is used so the coroutine does not block the event loop
    # while waiting for the response.
    async with httpx.AsyncClient() as client:
        data = await client.get(url, headers={"User-Agent": choice(user_agent)})
    if data.status_code != 200:
        return '获取失败,错误代码' + str(data.status_code)

    search_soup = BeautifulSoup(data.text, 'lxml')
    brand_tags = search_soup.select('.p-parameter-list li')
    detail_tags = search_soup.select('.parameter2 li')
    if not brand_tags and not detail_tags:
        # A 200 response that carries none of the expected nodes is not a
        # product page we can parse.
        return '获取失败,未找到商品详细信息'

    item_detail_brand = brand_tags[0].text.strip() if brand_tags else ''
    result_str = item_detail_brand + '\n==================\n'
    result_str += ''.join('\t' + tag.text + '\n' for tag in detail_tags)
    result_str += '\t商品链接:' + url

    img_tags = search_soup.select('#spec-img')
    img_src = img_tags[0].get('data-origin') if img_tags else None
    if not img_src:
        # No picture available: still return the textual details.
        return result_str
    return MessageSegment.image('https:' + img_src) + result_str
Nonebot2 QQ机器人插件(负责衔接Nonebot机器人与爬虫插件)
from typing import List
from random import choice
from nonebot import on_command, get_bot
from nonebot.matcher import Matcher
from nonebot.adapters import Message
from nonebot.params import CommandArg, ArgPlainText
from nonebot.adapters.onebot.v11 import Bot, MessageEvent, GroupMessageEvent, PrivateMessageEvent
from .data_source import get_item_search_result_list, get_item_detail
# Matcher for the product search command ("jingdong_search" / "京东查询"),
# registered with priority 5.
jingdong_search = on_command("jingdong_search", aliases={"京东查询"}, priority=5)
# Matcher for the product detail command ("jingdong_item_detail" / "京东详情" /
# "京东商品"), registered with priority 5.
jingdong_item_detail = on_command("jingdong_item_detail", aliases={"京东详情", "京东商品"}, priority=5)
# Usage notes (kept as a bare string literal; it has no runtime effect):
#   京东查询 [product name]  e.g. 京东查询 笔记本
#   京东详情 [product ID]    e.g. 京东详情 123456789
# NOTE(review): the text below says private chats receive a long message, but
# the search handlers in this file answer private messages with
# "请在群聊内使用该功能!!!" instead — confirm which behaviour is intended.
"""
京东:
京东查询 [商品名称] 如:京东查询 笔记本
群聊中返回结果为合并转发消息
私聊、频道返回结果为长消息
京东详情 [商品ID] 如:京东详情 123456789
"""
# Argument supplied directly together with the command
@jingdong_search.handle()
async def handle_first_receive(event: MessageEvent, matcher: Matcher, args: Message = CommandArg()):
    """First step of the search command.

    Private chats are told to use the feature in a group. For any other
    event, a non-empty command argument is stored under the ``item`` key so
    the follow-up prompt is not needed.
    """
    if isinstance(event, PrivateMessageEvent):
        await jingdong_search.finish(message="请在群聊内使用该功能!!!")
        return

    has_keyword = bool(args.extract_plain_text())
    if has_keyword:
        matcher.set_arg("item", args)
# Second step: ask for the keyword if it was not supplied with the command
@jingdong_search.got("item", prompt="你想搜索什么商品呢?")
async def handle_item(bot: Bot, event: MessageEvent, item_name: str = ArgPlainText("item")):
    """Run the JD search and deliver the results.

    Args:
        bot: The bot instance handling this event. It is injected by the
            framework instead of being looked up with ``get_bot()``, so the
            reply goes out through the same bot that received the message.
        event: The incoming message event.
        item_name: Plain-text search keyword taken from the ``item`` argument.

    Group messages receive the results as a merged forward message; private
    messages are told to use the feature in a group; any other event type
    receives the results as one long text message.
    """
    # Reject private chats before doing any network work.
    if isinstance(event, PrivateMessageEvent):
        await jingdong_search.finish(message="请在群聊内使用该功能!!!")
        return

    item_list, list_str = await get_item_search_result_list(item_name)
    if not item_list:
        # Nothing could be parsed: avoid sending an empty forward message.
        await jingdong_search.finish(message="未找到相关商品,请换个关键词试试")
        return

    if isinstance(event, GroupMessageEvent):
        await send_forward_msg(bot, event, bot.self_id, item_list)
    else:
        await jingdong_search.finish(list_str)
# Argument supplied directly together with the command
@jingdong_item_detail.handle()
async def handle_first_receive(matcher: Matcher, args: Message = CommandArg()):
    """First step of the detail command.

    When the command carries a non-empty plain-text argument, store it under
    the ``itemid`` key so the follow-up prompt is not needed.
    """
    if not args.extract_plain_text():
        return
    matcher.set_arg("itemid", args)
# Second step: ask for the product ID if it was not supplied with the command
@jingdong_item_detail.got("itemid", prompt="你想查询哪个商品的信息呢?")
async def handle_item(item_id: str = ArgPlainText("itemid")):
    """Look up the product identified by ``item_id`` and reply with the result."""
    detail_message = await get_item_detail(item_id)
    await jingdong_item_detail.finish(detail_message)
async def send_forward_msg(
    bot: "Bot",
    event: "GroupMessageEvent",
    uin: str,
    msgs: List[list],
):
    """Send ``msgs`` to the event's group as one merged forward message.

    Args:
        bot: Bot used to send the message; its ``config.nickname`` collection
            supplies the display names of the forwarded nodes.
        event: Group message event; its ``group_id`` selects the target group.
        uin: Account ID shown as the sender of every forwarded node.
        msgs: One entry per forwarded node; each entry becomes the node's
            ``content`` (the callers in this file pass lists of
            message-segment dicts).

    Each node gets a display name picked at random from the configured
    nicknames. When no nickname is configured, ``uin`` is used as the name
    instead of failing on an empty choice.
    """
    nicknames = list(bot.config.nickname)

    def to_json(msg):
        name = choice(nicknames) if nicknames else str(uin)
        return {"type": "node", "data": {"name": name, "uin": uin, "content": msg}}

    messages = [to_json(msg) for msg in msgs]
    await bot.send_group_forward_msg(group_id=event.group_id, messages=messages)
彩蛋
除此功能之外,给大家提供一个笔者现在的思路,基于Nonebot2开发一个综合性商品查询平台,通过爬虫爬取主流网络电商平台信息,然后进行综合分析,得出几款搜索物品性能、销量、好评率中的佼佼者,然后推荐给用户,这种方法可以大大减轻用户的商品检索负担,同时可以拉取私有流量池,宣传自己的产品!