Python: crawling the Douguo Meishi (豆果美食) mobile app and saving the data to MongoDB


Crawler code

import requests
import time
import json
import queue
from handle_mongo import mongo_info
from handle_proxy import proxy
import random
# Thread pool for concurrent crawling
from concurrent.futures import ThreadPoolExecutor
# Queue holding one search task per ingredient keyword
qqq = queue.Queue()
# Fetch a list of proxy dicts ({'ip': ..., 'port': ...}) once at startup
ip = proxy()

def handle_requests(url, data):
    headers = {
        "client": "4",
        "version": "6932.2",
        "device": "SM-G955N",
        "sdk": "19,4.4.2",
        "imei": "354730010722829",
        "channel": "qqkp",
        "mac": "72:1C:E7:AA:15:BD",
        "resolution": "1280*720",
        "dpi": "1.5",
        "android-id": "721ce7aa15bd3673",
        "pseudo-id": "7aa15bd3673721ce",
        "brand": "samsung",
        "scale": "1.5",
        "timezone": "28800",
        "language": "zh",
        "cns": "3",
        "carrier": "CMCC",
        # "imsi": "460077228231170",
        "user-agent": "Mozilla/5.0 (Linux; Android 4.4.2; SM-G955N Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/30.0.0.0 Mobile Safari/537.36",
        "reach": "1",
        "newbie": "0",
        # "lon": "104.569743",
        # "lat": "39.002595",
        # "cid": "152900",
        "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "Keep-Alive",
        # "Cookie": "duid=58851623",
        "Host": "api.douguo.net",
        # "Content-Length": "68"
    }
    # Pick a random proxy for this request
    p = random.choice(ip)
    global dl
    dl = {
        'http': 'http://{}:{}'.format(p['ip'], p['port'])
    }
    html = requests.post(url=url, data=data, headers=headers, proxies=dl)
    return html.text

def handle_index():
    # Douguo category listing endpoint
    url = 'http://api.douguo.net/recipe/flatcatalogs'
    data = {
        "client": "4",
        "_session": "1550551890907354730010722829",
        "v": str(int(time.time())),
        "_vs": "2305",
    }
    response = handle_requests(url, data)
    index_response = json.loads(response)
    # Walk the three-level category tree; each leaf name is an ingredient keyword
    for i in index_response['result']['cs']:
        for x in i['cs']:
            for a in x['cs']:
                data2 = {
                    "client": "4",
                    # "_session": "1550551890907354730010722829",
                    "keyword": a['name'],
                    "order": "0",
                    "_vs": "400",
                }
                qqq.put(data2)


def handle_caipu_list(data):
    print('Now processing ingredient:', data['keyword'])
    caipu_url = 'http://api.douguo.net/recipe/v2/search/0/20'
    caipu_response = handle_requests(url=caipu_url, data=data)
    json_caipu = json.loads(caipu_response)
    for item in json_caipu['result']['list']:
        # Dict holding the information of one recipe
        caipu_info = {}
        caipu_info['shicai'] = data['keyword']
        if item['type'] == 13:
            caipu_info['user_name'] = item['r']['an']  # author name
            caipu_info['id'] = item['r']['id']  # recipe id
            caipu_info['describe'] = item['r']['cookstory'].replace('\n', '').replace(' ', '')  # description
            caipu_info['caipu_name'] = item['r']['n']  # recipe name
            caipu_info['cailiao_list'] = item['r']['major']  # ingredient list
            # print(caipu_info)
            # Build the detail URL for this recipe
            detail_url = 'http://api.douguo.net/recipe/detail/{}'.format(caipu_info['id'])
            detail_data = {
                "client": '4',
                # "_session": '1550551890907354730010722829',
                "author_id": '0',
                "_vs": '2801',
                # Build the _ext JSON payload instead of concatenating quoted strings by hand
                "_ext": json.dumps({"query": {"id": str(caipu_info['id']),
                                              "kw": caipu_info['shicai'],
                                              "idx": "3", "src": "2801", "type": "13"}},
                                   ensure_ascii=False, separators=(',', ':')),
            }
            # print(detail_url)
            # print(detail_data)
            info = handle_requests(url=detail_url, data=detail_data)
            # Parse the JSON response into a dict
            info_dict = json.loads(info)
            # Prefer the tag list from the detail response when it is available
            if info_dict['result']['recipe'].get('tags'):
                caipu_info['cailiao_list'] = info_dict['result']['recipe']['tags']
            caipu_info['cookstep'] = info_dict['result']['recipe']['cookstep']  # cooking steps

            print('Saving recipe:', caipu_info['caipu_name'])
            print('Proxy used:', dl)
            mongo_info.insert_item(caipu_info)

        else:
            continue

handle_index()

# Create a thread pool with 30 workers
pool = ThreadPoolExecutor(30)
# handle_index() has already filled the queue, so drain it and submit one task per keyword
while qqq.qsize() > 0:
    pool.submit(handle_caipu_list, qqq.get())
# Wait for all submitted tasks to finish
pool.shutdown(wait=True)

Database code

import pymongo


class Connect_mongo(object):
    def __init__(self):
        # Connect to the local MongoDB instance and select the database
        self.client = pymongo.MongoClient(host='127.0.0.1', port=27017)
        self.db_data = self.client['dou_guo_mei_shi']

    def insert_item(self, item):
        # Insert one recipe document; insert_one replaces the deprecated insert()
        self.db_data['dou_guo_mei_shi_item'].insert_one(item)

mongo_info = Connect_mongo()

Proxy code

import requests
import json

def proxy():
    # Zhima proxy API: returns a batch of proxy IPs as JSON
    url = 'http://webapi.http.zhimacangku.com/getip?num=5&type=2&pro=&city=0&yys=0&port=1&pack=34365&ts=1&ys=1&cs=1&lb=1&sb=0&pb=45&mr=2&regions='
    h = requests.get(url).text
    html = json.loads(h)
    if html:
        # 'data' is a list of dicts, each carrying at least an 'ip' and a 'port' field
        return html['data']
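
For context, the crawler above only relies on each proxy entry exposing 'ip' and 'port' fields. The sample entry below is made up purely to show how the returned list is consumed; it is not real output of the Zhima API.

import random

# Made-up sample entry; only the 'ip' and 'port' keys are actually used by the crawler
ip_list = [{'ip': '1.2.3.4', 'port': 4216}]

p = random.choice(ip_list)
proxies = {'http': 'http://{}:{}'.format(p['ip'], p['port'])}
# proxies is then passed to requests.post(..., proxies=proxies) inside handle_requests()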

 

A web crawler is an automated program for collecting information from the Internet. Its main job is to visit web pages, extract data and store it for later analysis or presentation. Crawlers are commonly used by search engines, data-mining tools, monitoring systems and other applications that harvest web data. A crawler's workflow consists of the following key steps; a minimal code sketch follows the list.

URL collection: the crawler starts from one or more seed URLs and discovers new URLs recursively or iteratively, building a URL queue. New URLs can come from link analysis, sitemaps, search engines and so on.

Requesting pages: the crawler sends HTTP (or other protocol) requests to the target URLs and fetches the HTML content, usually through an HTTP client library such as Requests in Python.

Parsing content: the crawler parses the fetched HTML and extracts the useful information. Common parsing tools include regular expressions, XPath and Beautiful Soup; they help locate and extract target data such as text, images and links.

Storing data: the extracted data is saved to a database, file or other storage medium for later analysis or presentation. Common choices include relational databases, NoSQL databases and JSON files.

Following the rules: to avoid overloading a site or triggering its anti-crawling defences, a crawler should respect the site's robots.txt, limit its request rate and crawl depth, and behave like a normal visitor, for example by setting a User-Agent.

Handling anti-crawling measures: because crawlers exist, many sites deploy countermeasures such as CAPTCHAs and IP bans, and crawler engineers have to design strategies to cope with them.

Crawlers are widely used for search-engine indexing, data mining, price monitoring, news aggregation and more. Crawling must nevertheless comply with legal and ethical norms, respect each site's usage policy, and avoid putting undue load on the servers being visited.
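
As an illustration of the steps above, here is a minimal, self-contained sketch of the fetch, parse and store loop with robots.txt handling. The start URL, the link-extraction regex and the output file name are placeholders invented for this illustration; a real project would typically use a proper HTML parser such as Beautiful Soup instead of a regex.

import json
import re
import time
import urllib.robotparser

import requests

START_URL = 'http://example.com/'          # hypothetical seed URL
USER_AGENT = 'Mozilla/5.0 (demo crawler)'  # identify the crawler politely

# Respect robots.txt before crawling
rp = urllib.robotparser.RobotFileParser()
rp.set_url('http://example.com/robots.txt')
rp.read()

def fetch(url):
    # Request the page with a normal-looking User-Agent
    return requests.get(url, headers={'User-Agent': USER_AGENT}, timeout=10).text

def parse(html):
    # Extract outgoing links with a simple regular expression
    return re.findall(r'href="(http[^"]+)"', html)

def store(item):
    # Append each extracted record to a JSON-lines file
    with open('items.jsonl', 'a', encoding='utf-8') as f:
        f.write(json.dumps(item, ensure_ascii=False) + '\n')

if rp.can_fetch(USER_AGENT, START_URL):
    html = fetch(START_URL)
    for link in parse(html):
        store({'source': START_URL, 'link': link})
        time.sleep(1)  # rate limiting to avoid overloading the site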