前言:今天讲下高德地图全国美食爬虫。
反爬点:
高德地图反爬主要是:
1.IP代理。
2.headers(referer、cookie 为必需字段)。referer:随便填一个即可,可以固定不变。cookie 中必需的参数:isg、l、cna(可自行用数字和大小写字母组合生成)、uab_collina(固定值)。
3.最多只能翻到第45页。可以把地区精确到区、把二级分类精确到具体品类(如火锅),以尽可能多地获取数据,规避最大页数限制。
import requests,random,string,time,pymongo,re,json,datetime,logging
from Config import Config
from urllib import parse
# Root logger setup: append INFO-and-above records to show.log, each line
# formatted as "<time>-<logger name>-<level>-<message>".
logging.basicConfig(
    level=logging.INFO,
    filename="show.log",
    filemode="a",
    format="%(asctime)s-%(name)s-%(levelname)s-%(message)s",
)
class Amap(object):
def __init__(self):
self.isg = 'XXXX'
self.l = 'XXX'
self.cna = 'XXXX'
def get_pro(self):
get_pro_list = self.post_city.find({})
for get_pro in get_pro_list[9:]:
print('begin......{}'.format(get_pro['pro_name']))
pro_name = get_pro['pro_name']
for every_city in get_pro['city_list']:
choose_city = every_city
city_name = choose_city['city_name']
print('begin city ....{}'.format(city_name))
city_adcode = choose_city['city_adcode']
# 1获取城市所有区及美食二级分类
show_url = 'https://www.amap.com/service/poiInfo?query_type=TQUERY&pagesize=20&pagenum=1&qii=true&cluster_state=5&need_utd=true&utd_sceneid=1000&div=PC1000&addr_poi_merge=true&is_classify=true&zoom=17&city={}&geoobj=121.9098|25.510585|111.923414|24.516816&_src=around&keywords=美食'.format(city_adcode)
headers = self.get_headers(city_adcode)
show_json = self.request_url(show_url,headers)
# print('11111',show_json)
if show_json:
# 区分类
area_list = []
if 'bizAreaData' in show_json:
districts = show_json['bizAreaData']['districts']
for k in districts:
area_dict = {}
area_dict['area_name'] = k['name']
area_dict['area_value'] = k['districts']
area_list.append(area_dict)
self.deal_areas(pro_name, city_name, city_adcode, area_list