按照网格划分一个大区域,使每个网格内的POI数量小于100*20。参数包括爬取区域的经纬度范围,以及沿经度、纬度方向各自划分的网格数。通过高德地图API爬取POI,至少要先了解高德地图API的一些用法;下面直接上代码,这里不再赘述原理:
import random
import requests
import pandas as pd
import json
import numpy as np
import time
# Request URL template for the AMap (Gaode) v5 polygon place search.
# Placeholders: {0} = polygon, {1} = page_size, {2} = page_num, {3} = API key.
url="https://restapi.amap.com/v5/place/polygon?key={3}&polygon={0}&page_size={1}&page_num={2}"
# Output path; the result is saved in xlsx format
savePath="BeiJingSelect_Pois_.xlsx"
# Fill in your own AMap Web API keys here (one is picked at random per request)
key_list=['高德地图webAPI密钥1','高德地图webAPI密钥2','高德地图webAPI密钥3']
# Longitude/latitude extent to crawl (X values are passed first in each polygon corner)
Xmin=116.215275
Ymin=39.762179
Xmax=116.575401
Ymax=40.057342
# Split the extent into 10*10 cells; adjust to suit the crawl extent
x_partion_num=10
y_partion_num=10
# The bare string below is the author's note: high-frequency access opens many
# connections and puts pressure on the server, so some measures are taken to reduce it.
'''高频访问会建立造成大量连接,对服务器造成压力,因此需要进行一些操作来减少压力'''
# Intended to raise the number of reconnection attempts.
# NOTE(review): this only assigns an attribute on the top-level requests package;
# confirm the library actually reads it - retries are normally configured on an HTTPAdapter.
requests.DEFAULT_RETRIES = 5
def get_gridRegion_list(Xmin,Ymin,Xmax,Ymax,x_partion_num,y_partion_num):
    '''
    Split a rectangular extent into x_partion_num * y_partion_num equal cells.

    :param Xmin: lower X bound of the extent
    :param Ymin: lower Y bound of the extent
    :param Xmax: upper X bound of the extent
    :param Ymax: upper Y bound of the extent
    :param x_partion_num: number of cells along the X axis
    :param y_partion_num: number of cells along the Y axis
    :return: list of cells, each [x_low, y_low, x_high, y_high]; cells are
             ordered X-major (all Y cells of the first X column come first)
    :raises ZeroDivisionError: if either partition count is 0
    '''
    delta_x = (Xmax - Xmin) / x_partion_num
    delta_y = (Ymax - Ymin) / y_partion_num
    # Compute each interior edge once so neighbouring cells share identical
    # boundaries, and pin the final edge to the requested upper bound so
    # floating-point rounding cannot push the grid past (or short of) the extent.
    x_edges = [Xmin + i * delta_x for i in range(x_partion_num)] + [Xmax]
    y_edges = [Ymin + j * delta_y for j in range(y_partion_num)] + [Ymax]
    return [
        [x_edges[i], y_edges[j], x_edges[i + 1], y_edges[j + 1]]
        for i in range(x_partion_num)
        for j in range(y_partion_num)
    ]
def _load_poi_page(polygon, page, headers):
    '''
    Request one result page for a polygon and decode it.

    :param polygon: polygon string already formatted for the request URL
    :param page: page number to request
    :param headers: HTTP headers sent with the request
    :return: (payload, problem) - payload is the decoded JSON dict and problem
             is None on success; payload is None and problem is a short
             description when the page could not be used
    '''
    try:
        # timeout is (connect timeout, read timeout) in seconds;
        # pass verify=False here if SSL verification must be skipped
        response = requests.get(
            url.format(polygon, 25, page, random.choice(key_list)),
            headers=headers, timeout=(3, 7))
    except requests.exceptions.RequestException as exc:
        return None, repr(exc)
    if response.status_code != 200:
        return None, 'HTTP {}'.format(response.status_code)
    try:
        payload = json.loads(response.text)
    except ValueError as exc:  # json.JSONDecodeError is a ValueError
        return None, repr(exc)
    if not isinstance(payload, dict):
        return None, 'unexpected response body'
    # An API-level failure must not be mistaken for "no more POIs", and a
    # missing 'count' must not crash the whole crawl with a KeyError.
    if str(payload.get('status')) == '0' or 'count' not in payload:
        return None, 'info={0!r} infocode={1!r}'.format(
            payload.get('info'), payload.get('infocode'))
    return payload, None


def _poi_to_row(poi):
    '''Flatten one POI record into [id, name, x, y, type, cityname].'''
    location = poi['location'].split(',')
    return [poi['id'], poi['name'], float(location[0]), float(location[1]),
            poi['type'], poi['cityname']]


def get_A_grid_poi(grid_region: "list | tuple", order):
    '''
    Collect the POIs of a single grid cell, page by page.

    Up to 100 pages of 25 records are requested. Paging stops as soon as a
    page reports a count of 0. A page that fails (network error, non-200
    status, undecodable body or API-level error) is reported and skipped.

    :param grid_region: cell as [x_low, y_low, x_high, y_high]
    :param order: sequence number of the cell, used only in progress messages
    :return: list of rows [id, name, x, y, type, cityname]
    '''
    poi_list = []
    # With many requests in flight, avoid persistent (keep-alive) connections
    headers = {'Connection': 'close'}
    polygon = '{0},{1}|{2},{3}'.format(*(str(v) for v in grid_region[:4]))
    # Pages are requested from 1: starting at 0 falls outside the documented
    # page_num range and risks fetching the first page twice.
    for page in range(1, 101):
        payload, problem = _load_poi_page(polygon, page, headers)
        if payload is None:
            print('第{}个格网区域访问错误'.format(order), problem)
        elif int(payload['count']) == 0:
            print('第{0}个格网{1}页结束'.format(order, page))
            if page <= 2:
                # The cell finished within two requests; pause so that a run
                # of sparse cells does not fire requests in a rapid burst.
                time.sleep(5)
            return poi_list
        else:
            poi_list.extend(_poi_to_row(poi) for poi in payload.get('pois', []))
        time.sleep(1)
    return poi_list
def get_grids_pois(Xmin,Ymin,Xmax,Ymax,x_partion_num,y_partion_num):
    '''
    Crawl every grid cell of an extent and gather all POI rows into one table.

    :param Xmin: lower X bound of the extent
    :param Ymin: lower Y bound of the extent
    :param Xmax: upper X bound of the extent
    :param Ymax: upper Y bound of the extent
    :param x_partion_num: number of cells along the X axis
    :param y_partion_num: number of cells along the Y axis
    :return: list of rows; the first row is the header
             ['id', 'name', 'x', 'y', 'type', 'cityname'] and every following
             row is one POI
    '''
    all_pois_attrs = ['id', 'name', 'x', 'y', 'type', 'cityname']
    all_pois = [all_pois_attrs]
    grid_list = get_gridRegion_list(Xmin, Ymin, Xmax, Ymax, x_partion_num, y_partion_num)
    # Cells are numbered from 1; the number is only used in progress messages
    for order, grid_region in enumerate(grid_list, start=1):
        all_pois.extend(get_A_grid_poi(grid_region, order))
    return all_pois
def save_as_excel(all_pois):
    '''
    Write the collected POI table to the Excel file named by savePath.

    :param all_pois: list whose first element is the list of column names and
                     whose remaining elements are the data rows
    '''
    header, rows = all_pois[0], all_pois[1:]
    # Build the frame straight from the row lists. Going through np.array
    # would coerce the mixed str/float rows to a single string dtype (so the
    # coordinates end up as text in the sheet), and an empty row list would
    # become a 1-D array that no longer matches the column count.
    df = pd.DataFrame(data=rows, columns=header)
    df.to_excel(excel_writer=savePath)
if __name__=='__main__':
    # Crawl the configured extent, then write the resulting table to disk.
    all_pois=get_grids_pois(Xmin,Ymin,Xmax,Ymax,x_partion_num,y_partion_num)
    save_as_excel(all_pois)