# ===== Crawler code =====
import requests
import time
import json
import queue
from handle_mongo import mongo_info
from handle_proxy import proxy
import random
# 导入线程池模块
from concurrent.futures import ThreadPoolExecutor
# Work queue of search payloads: filled by handle_index() and drained by the
# thread-pool submission loop at the bottom of this script.
qqq = queue.Queue()
# Proxy list fetched once when the module is loaded; handel_requests() picks
# a random entry from it for every request.
ip = proxy()
def handel_requests(url, data, timeout=10):
    """POST ``data`` to ``url`` through a randomly chosen proxy and return the body.

    Every call sends the fixed header set below and routes the request
    through one entry picked at random from the module-level ``ip`` list.

    Side effect: rebinds the module-level global ``dl`` to the proxy mapping
    used for this call; ``handle_caipu_list`` prints it. Because the global
    is shared by all worker threads, the printed value may belong to a
    different, concurrent request.

    Args:
        url: Endpoint to POST to.
        data: Form fields for the request body.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so one dead proxy could
            block a worker thread forever.

    Returns:
        The response body as text.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout expires.
    """
    # The following headers are deliberately left out (they were commented
    # out): imsi, lon, lat, cid, Cookie, Content-Length.
    headers = {
        "client": "4",
        "version": "6932.2",
        "device": "SM-G955N",
        "sdk": "19,4.4.2",
        "imei": "354730010722829",
        "channel": "qqkp",
        "mac": "72:1C:E7:AA:15:BD",
        "resolution": "1280*720",
        "dpi": "1.5",
        "android-id": "721ce7aa15bd3673",
        "pseudo-id": "7aa15bd3673721ce",
        "brand": "samsung",
        "scale": "1.5",
        "timezone": "28800",
        "language": "zh",
        "cns": "3",
        "carrier": "CMCC",
        "user-agent": "Mozilla/5.0 (Linux; Android 4.4.2; SM-G955N Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/30.0.0.0 Mobile Safari/537.36",
        "reach": "1",
        "newbie": "0",
        "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "Keep-Alive",
        "Host": "api.douguo.net",
    }
    # Pick a proxy for this request.
    p = random.choice(ip)
    # Kept as a global because handle_caipu_list() reads it for logging.
    global dl
    dl = {
        'http': '{}:{}'.format(p['ip'], p['port'])
    }
    html = requests.post(
        url=url,
        data=data,
        headers=headers,
        proxies=dl,
        timeout=timeout,
    )
    return html.text
def handle_index():
    """Fetch the Douguo flat catalogue and queue one search payload per entry.

    The catalogue response is walked three ``cs`` levels deep; for every
    entry on the third level a search form (keyed by the entry's ``name``)
    is pushed onto the module-level queue ``qqq``.
    """
    # Douguo category endpoint.
    catalog_url = 'http://api.douguo.net/recipe/flatcatalogs'
    catalog_form = {
        "client": "4",
        "_session": "1550551890907354730010722829",
        "v": str(int(time.time())),
        "_vs": "2305",
    }
    catalog = json.loads(handel_requests(catalog_url, catalog_form))

    # Flatten the three nested "cs" levels into a single stream of entries.
    leaves = (
        leaf
        for top in catalog['result']['cs']
        for middle in top['cs']
        for leaf in middle['cs']
    )
    for leaf in leaves:
        # The search payload is sent without a "_session" field.
        qqq.put({
            "client": "4",
            "keyword": leaf['name'],
            "order": "0",
            "_vs": "400",
        })
def handle_caipu_list(data):
    """Search recipes for one ingredient keyword and store each hit in MongoDB.

    Posts ``data`` to the search endpoint, then for every result entry whose
    ``type`` is 13 (presumably the recipe entries -- confirm against the API)
    fetches the detail page, merges the detail fields into the record and
    hands it to ``mongo_info.insert_item``.

    NOTE(review): the indentation of this function was reconstructed; it is
    assumed that only the ``tags`` override depends on ``tags`` being
    present, and that every type-13 entry is inserted.

    Args:
        data: Search form; must contain the key ``'keyword'``.
    """
    print('当前处理的食材:', data['keyword'])
    caipu_url = 'http://api.douguo.net/recipe/v2/search/0/20'
    caipu_response = handel_requests(url=caipu_url, data=data)
    json_caipu = json.loads(caipu_response)
    for item in json_caipu['result']['list']:
        # Only type-13 entries are processed; everything else is skipped.
        if item['type'] != 13:
            continue
        recipe = item['r']
        # Holds the information collected for one dish.
        caipu_info = {
            'shicai': data['keyword'],
            'user_name': recipe['an'],  # author's user name
            'id': recipe['id'],  # dish id
            # description, with newlines and spaces stripped
            'describe': recipe['cookstory'].replace('\n', '').replace(' ', ''),
            'caipu_name': recipe['n'],  # dish name
            'cailiao_list': recipe['major'],  # ingredients
        }
        # Build the detail URL for this dish.
        detail_url = 'http://api.douguo.net/recipe/detail/{}'.format(caipu_info['id'])
        # Serialise with json.dumps so quotes or backslashes in the keyword
        # are escaped; compact separators and ensure_ascii=False keep the
        # output identical to the hand-built string for ordinary keywords.
        ext = json.dumps(
            {
                "query": {
                    "id": str(caipu_info['id']),
                    "kw": caipu_info['shicai'],
                    "idx": "3",
                    "src": "2801",
                    "type": "13",
                }
            },
            ensure_ascii=False,
            separators=(',', ':'),
        )
        detail_data = {
            "client": '4',
            "author_id": '0',
            "_vs": '2801',
            "_ext": ext,
        }
        info = handel_requests(url=detail_url, data=detail_data)
        # Decode the detail response into a dict.
        detail_recipe = json.loads(info)['result']['recipe']
        # Prefer the detail page's tags over the search result's "major"
        # list when tags are present.
        if detail_recipe.get('tags'):
            caipu_info['cailiao_list'] = detail_recipe['tags']
        caipu_info['cookstep'] = detail_recipe['cookstep']
        print('当前入库的菜谱是:', caipu_info['caipu_name'])
        # `dl` is the global set by handel_requests(); with several worker
        # threads it may already describe another thread's request.
        print('使用的代理是:', dl)
        mongo_info.insert_item(caipu_info)
# Fill the queue with one search payload per catalogue entry.
handle_index()
# Create the thread pool; the `with` block waits for all submitted work and
# shuts the pool down deterministically.
with ThreadPoolExecutor(30) as pool:
    futures = []
    while not qqq.empty():
        futures.append(pool.submit(handle_caipu_list, qqq.get()))
    # Exceptions raised inside a worker are stored on its future and are
    # otherwise lost silently, so report them explicitly.
    for future in futures:
        error = future.exception()
        if error is not None:
            print('worker failed:', repr(error))
# ===== Database code (presumably the handle_mongo module imported by the crawler) =====
import pymongo
from pymongo.collection import Collection
class Connect_mongo:
    """Small wrapper that stores crawled items in a local MongoDB.

    Connects to ``127.0.0.1:27017`` and uses the database
    ``dou_guo_mei_shi``. The class no longer derives from
    ``pymongo.collection.Collection``: the base ``__init__`` was never
    called, so the inherited collection API was not usable on instances.
    """

    def __init__(self):
        self.client = pymongo.MongoClient(host='127.0.0.1', port=27017)
        self.db_data = self.client['dou_guo_mei_shi']

    def insert_item(self, item):
        """Insert ``item`` into the ``don_guo_mei_shi_item`` collection.

        Args:
            item: A single document (mapping) or a list of documents.
        """
        # NOTE(review): the collection name is spelled "don_guo" while the
        # database is "dou_guo"; left unchanged so existing data stays
        # reachable -- confirm whether this is intentional.
        db_collection = self.db_data['don_guo_mei_shi_item']
        # Collection.insert() was deprecated in PyMongo 3 and removed in
        # PyMongo 4; insert_one/insert_many are its replacements.
        if isinstance(item, list):
            db_collection.insert_many(item)
        else:
            db_collection.insert_one(item)
# Shared module-level instance; presumably the object the crawler imports
# via `from handle_mongo import mongo_info`.
mongo_info = Connect_mongo()
# ===== Proxy code (presumably the handle_proxy module imported by the crawler) =====
import requests
import json
def proxy(timeout=10):
    """Fetch a batch of proxies from the zhimacangku web API.

    Args:
        timeout: Seconds to wait for the API before giving up; without it
            the call could block indefinitely.

    Returns:
        The ``data`` member of the decoded JSON response (the crawler reads
        the ``'ip'`` and ``'port'`` keys of its entries), or ``None`` when
        the decoded response is empty.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout expires.
    """
    # The last query parameter is `&regions=`; it had been corrupted into
    # the character "®" followed by "ions=" by HTML-entity decoding.
    url = 'http://webapi.http.zhimacangku.com/getip?num=5&type=2&pro=&city=0&yys=0&port=1&pack=34365&ts=1&ys=1&cs=1&lb=1&sb=0&pb=45&mr=2&regions='
    h = requests.get(url, timeout=timeout).text
    html = json.loads(h)
    if html:
        return html['data']
    return None