爬取代码
以下是我在2019年6月18日测试时仍然可用的爬取方式。
import requests
from multiprocessing import Queue
import json
from save_mongo import mongo_info
from concurrent.futures import ThreadPoolExecutor
# Work queue for this script: header_index() puts one search payload per
# catalog entry on it, and the main loop at the bottom of the script takes
# them off again and hands them to the thread pool.
queues_list = Queue()
def heandel_request(url, data, timeout=30):
    """Send a form-encoded POST to the Douguo API and return the response.

    The request carries a fixed set of headers describing an Android
    (HUAWEI MLA-AL10) client.

    Args:
        url: Endpoint to post to.
        data: Form fields, passed to ``requests.post`` as ``data``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the calling worker
            thread indefinitely.

    Returns:
        The ``requests.Response``; the status code is not checked here.

    Raises:
        requests.exceptions.RequestException: If the request fails or times
            out.
    """
    # NOTE: "Content-Length" is deliberately not listed. requests derives it
    # from the encoded body, so a hard-coded value could only ever be wrong.
    header = {
        "client": "4",
        "version": "6940.2",
        "device": "HUAWEI MLA-AL10",
        "sdk": "22,5.1.1",
        "imei": "863064011228246",
        "channel": "baidu",
        "mac": "E4:F8:9C:F7:4F:22",
        "resolution": "1280*720",
        "dpi": "1.5",
        "android-id": "1e4f89cf74f22378",
        "pseudo-id": "9cf74f223781e4f8",
        "brand": "HUAWEI",
        "scale": "1.5",
        "timezone": "28800",
        "language": "zh",
        "cns": "3",
        "carrier": "CHINA+MOBILE",
        "imsi": "460071228248156",
        "User-Agent": "Mozilla/5.0 (Linux; Android 5.1.1; HUAWEI MLA-AL10 Build/HUAWEIMLA-AL10; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/74.0.3729.136 Mobile Safari/537.36",
        "act-code": "e9d3a060cf2741ba937adda1c9f03fa2",
        "act-timestamp": "1558788732",
        # Not sent: "uuid": "5dd43ba9-e5ce-44a6-9766-9df287e8fe83"
        "reach": "10000",
        "newbie": "0",
        "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "Keep-Alive",
        # Not sent: "Cookie": "duid=59950651"
        "Host": "api.douguo.net",
    }
    return requests.post(url=url, headers=header, data=data, timeout=timeout)
def header_index():
    """Fetch the recipe catalog and queue one search payload per entry.

    Posts to the ``flatcatalogs`` endpoint, walks the three nested ``cs``
    levels of the JSON response, and for every innermost entry puts a search
    form dict (with the entry's ``name`` as ``keyword``) onto
    ``queues_list``.
    """
    # The URL must not start with whitespace; not every HTTP client strips it.
    url = "http://api.douguo.net/recipe/flatcatalogs"
    data = {
        "client": "4",
        "_session": "1560752474235863064011228246",
        "keyword": "%E5%9C%9F%E8%B1%86",
        "order": "0",
        "_vs": "400",
        "type": "0",
    }
    response = heandel_request(url, data)
    catalog = json.loads(response.text)["result"]["cs"]
    for data_list in catalog:
        for names in data_list["cs"]:
            for name in names["cs"]:
                # Search payloads are queued without a "_session" field.
                queues_list.put({
                    "client": "4",
                    "keyword": name["name"],
                    "order": "0",
                    "_vs": "400",
                    "type": "0",
                })
def heaher_shicai_content(data):
    """Crawl the search results for one ingredient and store each recipe.

    Requests six search pages for the payload, and for every result entry
    whose ``type`` is 13 fetches the recipe detail, merges both into one
    record and inserts it through ``mongo_info.insert_item``.

    Args:
        data: Search form dict as queued by ``header_index``; its
            ``"keyword"`` value is the ingredient name.
    """
    keyword = data["keyword"]
    print("当前处理的食材:", keyword)
    # Six requests with path values 0, 20, ..., 100 followed by "/20"
    # (presumably offset and page size, i.e. up to 120 entries - confirm).
    for i in range(6):
        shicai_url = "http://api.douguo.net/recipe/v2/search/{}/20".format(i * 20)
        shicai = heandel_request(shicai_url, data)
        for item in json.loads(shicai.text)["result"]["list"]:
            # Only type-13 entries are read as recipes; skip everything else
            # before building any record for it.
            if item["type"] != 13:
                continue
            recipe = item["r"]
            shicai_info = {
                "name": keyword,
                "user_name": recipe["an"],
                "caipu_id": recipe["id"],
                "caipu_name": recipe["n"],
                "context": recipe["cookstory"].replace("\n", "").replace(" ", ""),
                "shicai": recipe["major"],
            }
            detial_url = "http://api.douguo.net/recipe/detail/" + str(recipe["id"])
            # Serialize with json.dumps so a keyword containing quotes or
            # backslashes still yields valid JSON. Compact separators and
            # ensure_ascii=False keep the output identical to the previous
            # hand-built string for ordinary keywords.
            ext = json.dumps(
                {
                    "query": {
                        "kw": shicai_info["name"],
                        "src": "2801",
                        "idx": "1",
                        "type": "13",
                        "id": str(shicai_info["caipu_id"]),
                    }
                },
                ensure_ascii=False,
                separators=(",", ":"),
            )
            data3 = {
                "client": "4",
                "author_id": "0",
                "_vs": "5900",
                "_ext": ext,
            }
            response = heandel_request(detial_url, data3)
            detial = json.loads(response.text)
            shicai_info["tips"] = detial["result"]["recipe"]["tips"]
            shicai_info["buzhu"] = detial["result"]["recipe"]["cookstep"]
            mongo_info.insert_item(shicai_info)
            print("当前写入的菜谱是:" + shicai_info["caipu_name"])
# Fill the queue with one search payload per catalog entry.
header_index()

# Crawl the queued ingredients on 20 worker threads. The ``with`` block waits
# for every task and shuts the executor down. The futures are kept so that an
# exception raised inside a worker is reported instead of being dropped.
with ThreadPoolExecutor(max_workers=20) as pool:
    submitted = []
    while queues_list.qsize() > 0:
        payload = queues_list.get()
        submitted.append(
            (payload["keyword"], pool.submit(heaher_shicai_content, payload))
        )
    for keyword, future in submitted:
        # Future.exception() blocks until the task is done and returns None
        # when the task finished without raising.
        error = future.exception()
        if error is not None:
            print("食材处理失败:", keyword, repr(error))
加上代理
使用自己搭建的IP代理
我在这篇博客里介绍了一种爬取代理ip的方式。
https://blog.csdn.net/qq_40423339/article/details/92759849
下面开始使用代理进行爬取
import requests
from multiprocessing import Queue
import json
from save_mongo import mongo_info
from concurrent.futures import ThreadPoolExecutor
import random
# Work queue for this script: header_index() puts one search payload per
# catalog entry on it, and the main loop at the bottom of the script takes
# them off again and hands them to the thread pool.
queues_list = Queue()
def heandel_request(url, data, timeout=10):
    """POST form data to the Douguo API through the first working proxy.

    Proxies are read from ``verified_proxies.json`` (one JSON object per line
    with ``type``, ``host`` and ``port`` keys) and tried in file order until
    one returns HTTP 200.

    Args:
        url: Endpoint to post to.
        data: Form fields, passed to ``requests.post`` as ``data``.
        timeout: Seconds to wait on each proxy attempt, so that a dead proxy
            is abandoned instead of blocking the caller forever.

    Returns:
        The first ``requests.Response`` whose status code is 200.

    Raises:
        RuntimeError: If no proxy in the file produced an HTTP 200 response.
            The last request error, if any, is attached as the cause.
    """
    # NOTE: "Content-Length" is deliberately not listed. requests derives it
    # from the encoded body, so a hard-coded value could only ever be wrong.
    header = {
        "client": "4",
        "version": "6940.2",
        "device": "HUAWEI MLA-AL10",
        "sdk": "22,5.1.1",
        "imei": "863064011228246",
        "channel": "baidu",
        "mac": "E4:F8:9C:F7:4F:22",
        "resolution": "1280*720",
        "dpi": "1.5",
        "android-id": "1e4f89cf74f22378",
        "pseudo-id": "9cf74f223781e4f8",
        "brand": "HUAWEI",
        "scale": "1.5",
        "timezone": "28800",
        "language": "zh",
        "cns": "3",
        "carrier": "CHINA+MOBILE",
        "imsi": "460071228248156",
        "User-Agent": "Mozilla/5.0 (Linux; Android 5.1.1; HUAWEI MLA-AL10 Build/HUAWEIMLA-AL10; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/74.0.3729.136 Mobile Safari/537.36",
        "act-code": "e9d3a060cf2741ba937adda1c9f03fa2",
        "act-timestamp": "1558788732",
        # Not sent: "uuid": "5dd43ba9-e5ce-44a6-9766-9df287e8fe83"
        "reach": "10000",
        "newbie": "0",
        "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "Keep-Alive",
        # Not sent: "Cookie": "duid=59950651"
        "Host": "api.douguo.net",
    }
    with open("verified_proxies.json", "r") as f:
        # Drop blank lines (e.g. the one after a trailing newline); they are
        # not JSON and used to crash json.loads.
        proxy_lines = [line for line in f.read().splitlines() if line.strip()]

    last_error = None
    # Iterating the list bounds the retries: when every proxy has failed the
    # loop ends and a clear error is raised, instead of indexing past the end.
    for line in proxy_lines:
        ip_json = json.loads(line)
        proxy = {ip_json["type"]: ip_json["host"] + ":" + str(ip_json["port"])}
        print(proxy)
        try:
            respone = requests.post(
                url=url,
                headers=header,
                data=data,
                proxies=proxy,
                timeout=timeout,
            )
        except requests.exceptions.RequestException as error:
            # Unreachable, refused or timed-out proxy: try the next one.
            last_error = error
            continue
        if respone.status_code == 200:
            return respone
    raise RuntimeError(
        "no proxy in verified_proxies.json returned HTTP 200 for " + str(url)
    ) from last_error
def header_index():
    """Fetch the recipe catalog and queue one search payload per entry.

    Posts to the ``flatcatalogs`` endpoint, walks the three nested ``cs``
    levels of the JSON response, and for every innermost entry puts a search
    form dict (with the entry's ``name`` as ``keyword``) onto
    ``queues_list``.
    """
    # The URL must not start with whitespace; not every HTTP client strips it.
    url = "http://api.douguo.net/recipe/flatcatalogs"
    data = {
        "client": "4",
        "_session": "1560752474235863064011228246",
        "keyword": "%E5%9C%9F%E8%B1%86",
        "order": "0",
        "_vs": "400",
        "type": "0",
    }
    response = heandel_request(url, data)
    catalog = json.loads(response.text)["result"]["cs"]
    for data_list in catalog:
        for names in data_list["cs"]:
            for name in names["cs"]:
                # Search payloads are queued without a "_session" field.
                queues_list.put({
                    "client": "4",
                    "keyword": name["name"],
                    "order": "0",
                    "_vs": "400",
                    "type": "0",
                })
def heaher_shicai_content(data):
    """Crawl the search results for one ingredient and report each recipe.

    Requests six search pages for the payload, and for every result entry
    whose ``type`` is 13 fetches the recipe detail and merges both into one
    record. The MongoDB insert is disabled in this version of the script, so
    the record is only announced on stdout.

    Args:
        data: Search form dict as queued by ``header_index``; its
            ``"keyword"`` value is the ingredient name.
    """
    keyword = data["keyword"]
    print("当前处理的食材:", keyword)
    # Six requests with path values 0, 20, ..., 100 followed by "/20"
    # (presumably offset and page size, i.e. up to 120 entries - confirm).
    for i in range(6):
        shicai_url = "http://api.douguo.net/recipe/v2/search/{}/20".format(i * 20)
        shicai = heandel_request(shicai_url, data)
        for item in json.loads(shicai.text)["result"]["list"]:
            # Only type-13 entries are read as recipes; skip everything else
            # before building any record for it.
            if item["type"] != 13:
                continue
            recipe = item["r"]
            shicai_info = {
                "name": keyword,
                "user_name": recipe["an"],
                "caipu_id": recipe["id"],
                "caipu_name": recipe["n"],
                "context": recipe["cookstory"].replace("\n", "").replace(" ", ""),
                "shicai": recipe["major"],
            }
            detial_url = "http://api.douguo.net/recipe/detail/" + str(recipe["id"])
            # Serialize with json.dumps so a keyword containing quotes or
            # backslashes still yields valid JSON. Compact separators and
            # ensure_ascii=False keep the output identical to the previous
            # hand-built string for ordinary keywords.
            ext = json.dumps(
                {
                    "query": {
                        "kw": shicai_info["name"],
                        "src": "2801",
                        "idx": "1",
                        "type": "13",
                        "id": str(shicai_info["caipu_id"]),
                    }
                },
                ensure_ascii=False,
                separators=(",", ":"),
            )
            data3 = {
                "client": "4",
                "author_id": "0",
                "_vs": "5900",
                "_ext": ext,
            }
            response = heandel_request(detial_url, data3)
            detial = json.loads(response.text)
            shicai_info["tips"] = detial["result"]["recipe"]["tips"]
            shicai_info["buzhu"] = detial["result"]["recipe"]["cookstep"]
            # NOTE(review): persistence is switched off here, although the
            # message below still says the recipe is being written.
            # mongo_info.insert_item(shicai_info)
            print("当前写入的菜谱是:" + shicai_info["caipu_name"])
# Fill the queue with one search payload per catalog entry.
header_index()

# Crawl the queued ingredients on 20 worker threads. The ``with`` block waits
# for every task and shuts the executor down. The futures are kept so that an
# exception raised inside a worker is reported instead of being dropped.
with ThreadPoolExecutor(max_workers=20) as pool:
    submitted = []
    while queues_list.qsize() > 0:
        payload = queues_list.get()
        submitted.append(
            (payload["keyword"], pool.submit(heaher_shicai_content, payload))
        )
    for keyword, future in submitted:
        # Future.exception() blocks until the task is done and returns None
        # when the task finished without raising.
        error = future.exception()
        if error is not None:
            print("食材处理失败:", keyword, repr(error))