美食菜谱爬取
主要功能
多线程爬取美食APP菜谱分类中的菜谱数据,并存到mongoDB
框架
- whistle:分析数据包
- 夜神安卓模拟器:安装菜谱app
- python:编写爬虫代码
- vscode:编辑器
- mongoDB:存储数据
- Robo 3T:mongoDB 可视化工具
菜谱APP界面
一些截图
代码
spider_menu.py:
# spider_menu.py
import requests
import json
from multiprocessing import Queue
from handle_mongo import mongo_info
from concurrent.futures import ThreadPoolExecutor # 线程池
# Create the shared queue (a multiprocessing.Queue) that holds the category
# names collected by handle_cat() until they are handed to handle_search().
queue_list = Queue()
# 处理数据请求
def handle_request(url, data, timeout=10):
    """Send a form-encoded POST to the Douguo API using the captured app headers.

    Args:
        url: Full endpoint URL to post to.
        data: Mapping of form fields; ``requests`` URL-encodes it into the body.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the calling worker
            thread indefinitely.

    Returns:
        The ``requests.Response`` object; callers in this file read ``.text``.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    header = {
        "client": "4",
        "version": "6962.2",
        "device": "SM-G955N",
        "sdk": "25,7.1.2",
        "channel": "baidu",
        # Present in the captured request but deliberately not sent:
        # "resolution":"1600*900",
        # "display-resolution":"1600*900",
        # "dpi":"2.0",
        # "android-id":"784F438E43A20000",
        # "pseudo-id":"864394010787945",
        "brand": "samsung",
        "scale": "2.0",
        "timezone": "28800",
        "language": "zh",
        "cns": "2",
        "carrier": "CMCC",
        "User-Agent": "Mozilla/5.0 (Linux; Android 7.1.2; SM-G955N Build/N2G48H; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/75.0.3770.143 Mobile Safari/537.36",
        "imei": "864394010787945",
        "terms-accepted": "1",
        "newbie": "1",
        "reach": "10000",
        "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
        "Accept-Encoding": "gzip",
        "Connection": "Keep-Alive",
        "Host": "api.douguo.net",
        # Content-Length is intentionally omitted: the value 147 copied from
        # the capture only matches that one payload, and requests derives the
        # correct length from the encoded body on its own.
    }
    response = requests.post(url=url, headers=header, data=data, timeout=timeout)
    return response
# 抓取品类列表
def handle_cat():
    """Fetch the flat recipe catalog and enqueue every third-level entry name.

    The response JSON is walked through three nested ``"cs"`` lists under
    ``result``; the ``"name"`` of each innermost entry is put on
    ``queue_list`` in the order it appears.
    """
    catalog_url = 'http://api.douguo.net/recipe/flatcatalogs'
    form = {
        "client": "4",
        "_vs": "2305",
    }
    catalog = json.loads(handle_request(catalog_url, form).text)
    # Lazily flatten the three "cs" levels so each name is enqueued as soon
    # as it is reached.
    leaf_names = (
        leaf["name"]
        for top in catalog["result"]["cs"]
        for middle in top["cs"]
        for leaf in middle["cs"]
    )
    for name in leaf_names:
        queue_list.put(name)
# 关键词搜索
def handle_search(keyword):
    """Search recipes for one keyword and store each result with its details.

    For every recipe in the search response, the summary fields are copied
    into a dict, the detail endpoint is queried through ``handle_detail`` to
    add ``tips`` and ``cookstep``, and the finished record is passed to
    ``mongo_info.insert_item``.

    Args:
        keyword: Ingredient/category name to search for; also stored on each
            record under ``"shicai"``.
    """
    print("当前处理的食材是:", keyword)
    search_url = 'http://api.douguo.net/search/universalnew/0/10'
    form = {
        "client": "4",
        "keyword": keyword,
        "_vs": "400",
    }
    search_result = json.loads(handle_request(search_url, form).text)

    # (field name in the stored record, key in the search-result item)
    field_map = (
        ('caipu_name', "n"),
        ("author_name", "an"),
        ("caipu_id", "id"),
        ("cookstory", "cookstory"),
        ("img", "img"),
        ("major", "major"),
        ("detail_url", "au"),
    )

    for recipe in search_result["result"]["recipe"]["recipes"]:
        caipu_info = {"shicai": keyword}
        for field, source_key in field_map:
            caipu_info[field] = recipe[source_key]

        # handle_detail needs "shicai" and "caipu_id", both set above.
        detail = json.loads(handle_detail(caipu_info))
        caipu_info["tips"] = detail["result"]["recipe"]["tips"]
        caipu_info["cookstep"] = detail["result"]["recipe"]["cookstep"]

        print("当前入库的菜谱是:", caipu_info['caipu_name'])
        mongo_info.insert_item(caipu_info)
#菜谱详情
def handle_detail(item):
    """Request the detail record of one recipe and return the raw response text.

    Args:
        item: Recipe dict; must contain ``"caipu_id"`` (used in the URL and
            the ``_ext`` payload) and ``"shicai"`` (the search keyword).

    Returns:
        The response body as a string (JSON text, parsed by the caller).
    """
    url = "http://api.douguo.net/recipe/detail/" + str(item["caipu_id"])
    # Build _ext with json.dumps instead of string concatenation: the
    # hand-built version left the keyword unquoted and never closed the
    # outer brace, so the server received malformed JSON.
    ext = {
        "query": {
            "kw": str(item["shicai"]),
            "src": "11101",
            "idx": "1",
            "type": "13",
            "id": item["caipu_id"],
        }
    }
    data = {
        "client": "4",
        "_vs": "11101",
        # ensure_ascii=False keeps Chinese keywords as real characters
        # rather than \uXXXX escapes.
        "_ext": json.dumps(ext, ensure_ascii=False),
    }
    response = handle_request(url, data)
    return response.text
from queue import Empty  # raised by multiprocessing.Queue.get() on timeout

# Fill queue_list with category names before starting the workers.
handle_cat()

# The context manager waits for all submitted searches and shuts the pool down.
with ThreadPoolExecutor(max_workers=20) as pool:
    # Drain with a blocking get() + timeout rather than polling qsize() or
    # empty(): qsize() raises NotImplementedError on macOS, and empty() can
    # briefly report True right after a put() while the queue's feeder thread
    # is still flushing, which would end the loop early and skip keywords.
    while True:
        try:
            keyword = queue_list.get(timeout=1)
        except Empty:
            break
        pool.submit(handle_search, keyword)  # callable followed by its argument
mongoDB存储数据:
# handle_mongo.py
import pymongo
from pymongo.collection import Collection
class Connect_mongo(object):
    """Thin wrapper around a local MongoDB connection used to store recipes."""

    def __init__(self):
        """Connect to MongoDB on 127.0.0.1:27017 and select the target database."""
        self.client = pymongo.MongoClient(host="127.0.0.1", port=27017)
        self.db_data = self.client["dougou_meishi"]

    def insert_item(self, item):
        """Insert one document into the ``t_douguo_item`` collection.

        Args:
            item: A single document (dict) to store.
        """
        db_collection = Collection(self.db_data, 't_douguo_item')
        # Collection.insert() is deprecated since PyMongo 3.0 and removed in
        # 4.0; insert_one() is the supported call for a single document.
        db_collection.insert_one(item)
# Module-level shared instance; spider_menu.py imports it as ``mongo_info``.
mongo_info = Connect_mongo()
做个笔记
- 粘贴抓包得到的 header,在编辑器里用正则表达式处理成 key-value 的形式
抓到的Header
client: 4
version: 6962.2
device: SM-G955N
sdk: 25,7.1.2
channel: baidu
resolution: 1600*900
display-resolution: 1600*900
dpi: 2.0
brand: samsung
scale: 2.0
timezone: 28800
Content-Type: application/x-www-form-urlencoded; charset=utf-8
Accept-Encoding: gzip
Connection: Keep-Alive
Cookie: duid=64275234
Host: api.douguo.net
Content-Length: 147
处理后:
"client":" 4",
"version":" 6962.2",
"device":" SM-G955N",
"sdk":" 25,7.1.2",
"channel":" baidu",
"resolution":" 1600*900",
"display-resolution":" 1600*900",
"dpi":" 2.0",
"brand":" samsung",
"scale":" 2.0",
"timezone":" 28800",
"Content-Type":" application/x-www-form-urlencoded; charset=utf-8",
"Accept-Encoding":" gzip",
"Connection":" Keep-Alive",
"Cookie":" duid=64275234",
"Host":" api.douguo.net",
"Content-Length":" 147",
- 同样把 url 参数处理成 key-value
client=4&_session=123&keyword=%E5%9C%9F%E8%B1%86&_vs=11110&sign_ran=123123&code=123123
先用换行替换 & 符号
替换结果:
client=4
_session=123
keyword=%E5%9C%9F%E8%B1%86
_vs=11110
sign_ran=123123
code=123123
再处理为 key-value 的格式
处理结果:
"client":"4"
"_session":"123"
"keyword":"%E5%9C%9F%E8%B1%86"
"_vs":"11110"
"sign_ran":"123123"
"code":"123123"
项目代码地址(可运行)
遇到的问题
Q:报错信息
while queue_list.qsize() > 0:
File “/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/multiprocessing/queues.py”, line 120, in qsize
return self._maxsize - self._sem._semlock._get_value()
A:
macOS 没有实现 sem_getvalue(),所以 multiprocessing.Queue.qsize() 会抛出 NotImplementedError。暂时的解决办法是改用 queue.empty() 判断队列是否为空(注意:empty() 在多线程/多进程环境下并不完全可靠)。
原代码:
while queue_list.qsize() > 0:
pool.submit(handle_search,queue_list.get()) # 函数 和参数
修改后:
....
while not queue_list.empty():
pool.submit(handle_search,queue_list.get()) # 函数 和参数
.....