python 多线程 代理 爬取 豆果美食app

python 多线程 代理 爬取 豆果美食

爬取代码

以下是我在 2019 年 6 月 18 日最新测试时仍然可以爬取的方式。

import requests
from multiprocessing import Queue
import json
from save_mongo import mongo_info
from concurrent.futures import ThreadPoolExecutor
# Work queue of search-form dicts: filled by header_index() and drained by
# the submit loop at the end of the script.
queues_list = Queue()
def heandel_request(url, data, timeout=10):
    """POST form data to the Douguo API using a fixed set of app headers.

    Args:
        url: Endpoint URL to post to.
        data: Form fields, handed to ``requests.post`` as ``data``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the calling thread
            indefinitely.

    Returns:
        The ``requests.Response`` object; its status code is not checked here.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    header = {
        "client": "4",
        "version": "6940.2",
        "device": "HUAWEI MLA-AL10",
        "sdk": "22,5.1.1",
        "imei": "863064011228246",
        "channel": "baidu",
        "mac": "E4:F8:9C:F7:4F:22",
        "resolution": "1280*720",
        "dpi": "1.5",
        "android-id": "1e4f89cf74f22378",
        "pseudo-id": "9cf74f223781e4f8",
        "brand": "HUAWEI",
        "scale": "1.5",
        "timezone": "28800",
        "language": "zh",
        "cns": "3",
        "carrier": "CHINA+MOBILE",
        "imsi": "460071228248156",
        "User-Agent": "Mozilla/5.0 (Linux; Android 5.1.1; HUAWEI MLA-AL10 Build/HUAWEIMLA-AL10; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/74.0.3729.136 Mobile Safari/537.36",
        "act-code": "e9d3a060cf2741ba937adda1c9f03fa2",
        "act-timestamp": "1558788732",
        #"uuid": "5dd43ba9-e5ce-44a6-9766-9df287e8fe83",
        "reach": "10000",
        "newbie": "0",
        "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "Keep-Alive",
        # "Cookie": "duid=59950651",
        "Host": "api.douguo.net",
        # Content-Length is deliberately not set: a fixed value only matches
        # one specific payload, and requests derives the header from the
        # encoded form body.
    }
    respone = requests.post(url=url, headers=header, data=data, timeout=timeout)
    return respone

def header_index():
    """Fetch the recipe catalog and queue one search form per catalog entry.

    Posts to the ``flatcatalogs`` endpoint, walks the three nested ``"cs"``
    levels of the JSON reply and, for every entry on the innermost level,
    puts a search form whose ``keyword`` is that entry's ``"name"`` on
    ``queues_list``.
    """
    url = "http://api.douguo.net/recipe/flatcatalogs"
    data = {
        "client": "4",
        "_session": "1560752474235863064011228246",
        # NOTE(review): this value is already percent-encoded and requests
        # will form-encode it again; presumably the catalog endpoint ignores
        # the keyword - confirm.
        "keyword": "%E5%9C%9F%E8%B1%86",
        "order": "0",
        "_vs": "400",
        "type": "0",
    }
    response = heandel_request(url, data)
    catalog = json.loads(response.text)["result"]["cs"]
    for data_list in catalog:
        for names in data_list["cs"]:
            for name in names["cs"]:
                queues_list.put({
                    "client": "4",
                    # "_session": "1560752474235863064011228246",
                    "keyword": name["name"],
                    "order": "0",
                    "_vs": "400",
                    "type": "0",
                })


def heaher_shicai_content(data):
    """Search recipes for one keyword and store each recipe with its details.

    For every search hit of type 13 the recipe detail endpoint is queried as
    well, and the merged record is handed to ``mongo_info.insert_item``.

    Args:
        data: Search form dict; ``data["keyword"]`` is the term searched for
            and is stored as ``"name"`` on every record.
    """
    keyword = data["keyword"]
    print("当前处理的食材:", keyword)
    # Six pages of 20 results each (offsets 0, 20, ..., 100), i.e. up to
    # 120 search hits per keyword.
    for page in range(6):
        shicai_url = "http://api.douguo.net/recipe/v2/search/{}/20".format(page * 20)
        shicai = heandel_request(shicai_url, data)
        for item in json.loads(shicai.text)["result"]["list"]:
            # Only type-13 entries are processed; their payload is under "r".
            if item["type"] != 13:
                continue
            recipe = item["r"]
            shicai_info = {
                "name": keyword,
                "user_name": recipe["an"],
                "caipu_id": recipe["id"],
                "caipu_name": recipe["n"],
                "context": recipe["cookstory"].replace("\n", "").replace(" ", ""),
                "shicai": recipe["major"],
            }
            detial_url = "http://api.douguo.net/recipe/detail/" + str(recipe["id"])
            # Serialise with json.dumps so that quotes or backslashes in the
            # keyword are escaped; compact separators and ensure_ascii=False
            # keep the output identical to the hand-built string for
            # ordinary keywords.
            ext = {
                "query": {
                    "kw": keyword,
                    "src": "2801",
                    "idx": "1",
                    "type": "13",
                    "id": str(recipe["id"]),
                }
            }
            data3 = {
                "client": "4",
                #"_session": "1560771406377863064011228246",
                "author_id": "0",
                "_vs": "5900",
                "_ext": json.dumps(ext, ensure_ascii=False, separators=(",", ":")),
            }
            response = heandel_request(detial_url, data3)
            detial = json.loads(response.text)
            shicai_info["tips"] = detial["result"]["recipe"]["tips"]
            shicai_info["buzhu"] = detial["result"]["recipe"]["cookstep"]
            mongo_info.insert_item(shicai_info)
            print("当前写入的菜谱是:" + shicai_info["caipu_name"])


# Fill the queue with one search form per catalog entry, then fan the forms
# out to a pool of worker threads.
header_index()
# The context manager shuts the pool down once all submitted work is done.
with ThreadPoolExecutor(max_workers=20) as pool:
    futures = []
    # NOTE: multiprocessing.Queue.qsize() is documented to raise
    # NotImplementedError on some platforms (e.g. macOS).
    while queues_list.qsize() > 0:
        futures.append(pool.submit(heaher_shicai_content, queues_list.get()))
    # An exception raised inside a worker is stored on its future and would
    # otherwise vanish silently; report each one.
    for future in futures:
        error = future.exception()
        if error is not None:
            print("worker failed:", repr(error))
# Single-threaded variant, handy for debugging one queue entry:
# heaher_shicai_content(queues_list.get())

加上代理

使用自己搭建的 IP 代理
我在下面这篇博客里介绍了一种爬取代理 IP 的方式。

https://blog.csdn.net/qq_40423339/article/details/92759849

开始使用代理爬

import requests
from multiprocessing import Queue
import json
from save_mongo import mongo_info
from concurrent.futures import ThreadPoolExecutor
import random
# Work queue of search-form dicts: filled by header_index() and drained by
# the submit loop at the end of the script.
queues_list = Queue()
def heandel_request(url, data, timeout=10):
    """POST form data to the Douguo API through the first proxy that answers 200.

    Proxies are read from ``verified_proxies.json``, which holds one JSON
    object per line with ``type``, ``host`` and ``port`` keys. They are tried
    in file order until one returns HTTP 200.

    Args:
        url: Endpoint URL to post to.
        data: Form fields, handed to ``requests.post`` as ``data``.
        timeout: Seconds to wait on each proxy attempt, so that a dead proxy
            cannot block the calling thread indefinitely.

    Returns:
        The first ``requests.Response`` whose status code is 200.

    Raises:
        RuntimeError: If no proxy in the file produced a 200 response.
    """
    header = {
        "client": "4",
        "version": "6940.2",
        "device": "HUAWEI MLA-AL10",
        "sdk": "22,5.1.1",
        "imei": "863064011228246",
        "channel": "baidu",
        "mac": "E4:F8:9C:F7:4F:22",
        "resolution": "1280*720",
        "dpi": "1.5",
        "android-id": "1e4f89cf74f22378",
        "pseudo-id": "9cf74f223781e4f8",
        "brand": "HUAWEI",
        "scale": "1.5",
        "timezone": "28800",
        "language": "zh",
        "cns": "3",
        "carrier": "CHINA+MOBILE",
        "imsi": "460071228248156",
        "User-Agent": "Mozilla/5.0 (Linux; Android 5.1.1; HUAWEI MLA-AL10 Build/HUAWEIMLA-AL10; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/74.0.3729.136 Mobile Safari/537.36",
        "act-code": "e9d3a060cf2741ba937adda1c9f03fa2",
        "act-timestamp": "1558788732",
        #"uuid": "5dd43ba9-e5ce-44a6-9766-9df287e8fe83",
        "reach": "10000",
        "newbie": "0",
        "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "Keep-Alive",
        # "Cookie": "duid=59950651",
        "Host": "api.douguo.net",
        # Content-Length is deliberately not set: a fixed value only matches
        # one specific payload, and requests derives the header from the
        # encoded form body.
    }
    with open("verified_proxies.json", "r", encoding="utf-8") as f:
        # Drop blank lines (e.g. the one after a trailing newline) so they
        # are not fed to json.loads.
        proxy_lines = [line for line in f.read().split("\n") if line.strip()]

    for line in proxy_lines:
        ip_json = json.loads(line)
        proxy = {ip_json["type"]: ip_json["host"] + ":" + str(ip_json["port"])}
        print(proxy)
        try:
            respone = requests.post(
                url=url, headers=header, data=data, proxies=proxy, timeout=timeout
            )
        except Exception:
            # Deliberate best effort: whatever goes wrong with this proxy,
            # move on to the next one.
            continue
        if respone.status_code == 200:
            return respone

    raise RuntimeError(
        "no proxy in verified_proxies.json returned HTTP 200 for " + url
    )

def header_index():
    """Fetch the recipe catalog and queue one search form per catalog entry.

    Posts to the ``flatcatalogs`` endpoint, walks the three nested ``"cs"``
    levels of the JSON reply and, for every entry on the innermost level,
    puts a search form whose ``keyword`` is that entry's ``"name"`` on
    ``queues_list``.
    """
    url = "http://api.douguo.net/recipe/flatcatalogs"
    data = {
        "client": "4",
        "_session": "1560752474235863064011228246",
        # NOTE(review): this value is already percent-encoded and requests
        # will form-encode it again; presumably the catalog endpoint ignores
        # the keyword - confirm.
        "keyword": "%E5%9C%9F%E8%B1%86",
        "order": "0",
        "_vs": "400",
        "type": "0",
    }
    response = heandel_request(url, data)
    catalog = json.loads(response.text)["result"]["cs"]
    for data_list in catalog:
        for names in data_list["cs"]:
            for name in names["cs"]:
                queues_list.put({
                    "client": "4",
                    # "_session": "1560752474235863064011228246",
                    "keyword": name["name"],
                    "order": "0",
                    "_vs": "400",
                    "type": "0",
                })


def heaher_shicai_content(data):
    """Search recipes for one keyword and assemble each recipe with its details.

    For every search hit of type 13 the recipe detail endpoint is queried as
    well and the merged record is built; the record is only printed by name,
    since the database insert is disabled in this listing.

    Args:
        data: Search form dict; ``data["keyword"]`` is the term searched for
            and is stored as ``"name"`` on every record.
    """
    keyword = data["keyword"]
    print("当前处理的食材:", keyword)
    # Six pages of 20 results each (offsets 0, 20, ..., 100), i.e. up to
    # 120 search hits per keyword.
    for page in range(6):
        shicai_url = "http://api.douguo.net/recipe/v2/search/{}/20".format(page * 20)
        shicai = heandel_request(shicai_url, data)
        for item in json.loads(shicai.text)["result"]["list"]:
            # Only type-13 entries are processed; their payload is under "r".
            if item["type"] != 13:
                continue
            recipe = item["r"]
            shicai_info = {
                "name": keyword,
                "user_name": recipe["an"],
                "caipu_id": recipe["id"],
                "caipu_name": recipe["n"],
                "context": recipe["cookstory"].replace("\n", "").replace(" ", ""),
                "shicai": recipe["major"],
            }
            detial_url = "http://api.douguo.net/recipe/detail/" + str(recipe["id"])
            # Serialise with json.dumps so that quotes or backslashes in the
            # keyword are escaped; compact separators and ensure_ascii=False
            # keep the output identical to the hand-built string for
            # ordinary keywords.
            ext = {
                "query": {
                    "kw": keyword,
                    "src": "2801",
                    "idx": "1",
                    "type": "13",
                    "id": str(recipe["id"]),
                }
            }
            data3 = {
                "client": "4",
                #"_session": "1560771406377863064011228246",
                "author_id": "0",
                "_vs": "5900",
                "_ext": json.dumps(ext, ensure_ascii=False, separators=(",", ":")),
            }
            response = heandel_request(detial_url, data3)
            detial = json.loads(response.text)
            shicai_info["tips"] = detial["result"]["recipe"]["tips"]
            shicai_info["buzhu"] = detial["result"]["recipe"]["cookstep"]
            # NOTE(review): persistence is switched off here, yet the message
            # below still reports the recipe as written - confirm intent.
            # mongo_info.insert_item(shicai_info)
            print("当前写入的菜谱是:" + shicai_info["caipu_name"])


# Fill the queue with one search form per catalog entry, then fan the forms
# out to a pool of worker threads.
header_index()
# The context manager shuts the pool down once all submitted work is done.
with ThreadPoolExecutor(max_workers=20) as pool:
    futures = []
    # NOTE: multiprocessing.Queue.qsize() is documented to raise
    # NotImplementedError on some platforms (e.g. macOS).
    while queues_list.qsize() > 0:
        futures.append(pool.submit(heaher_shicai_content, queues_list.get()))
    # An exception raised inside a worker is stored on its future and would
    # otherwise vanish silently; report each one.
    for future in futures:
        error = future.exception()
        if error is not None:
            print("worker failed:", repr(error))
# Single-threaded variant, handy for debugging one queue entry:
# heaher_shicai_content(queues_list.get())
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值