####这两天工作中需要获取地图 POI 数据,发现百度、腾讯都有类似的接口,但都对返回的数据量有限制:百度限制单次检索最多返回400条,腾讯限制200条。为了突破这一限制,思路是取原来圆形检索范围的外切矩形,再将该矩形分割成四个小矩形,分别获取其中的数据,最后将结果集拼接,返回完整的结果。实现依赖 geopy 库,它可以根据已知的经纬度、距离和方向生成新的经纬度,这样就可以通过中心点和半径得到矩形检索所需的左下角和右上角坐标。下面是通过 geopy 库将一个中心点的经纬度变成4个矩形的过程。
def get_distance_point(lat, lon, distance, direction):
    """Return the point reached by travelling from (lat, lon) along a bearing.

    :param lat: latitude of the starting point
    :param lon: longitude of the starting point
    :param distance: distance to travel, in kilometres
    :param direction: bearing in degrees (north: 0, east: 90, south: 180, west: 270)
    :return: the destination point as returned by geopy's ``destination``
    """
    start = geopy.Point(lat, lon)
    # ``VincentyDistance`` was removed in geopy 2.0. ``geopy.distance.distance``
    # is the library's default ellipsoidal distance class and exists in both
    # old and new geopy releases, so this works regardless of installed version.
    d = geopy.distance.distance(kilometers=distance)
    return d.destination(point=start, bearing=direction)
def get_square(lat, lon, distance):
    """Split the square circumscribing a search circle into four rectangles.

    The circle is centred on (lat, lon) with a radius of ``distance``
    kilometres. Every rectangle is a 4-tuple
    ``(sw_lat, sw_lon, ne_lat, ne_lon)``.

    :param lat: latitude of the centre
    :param lon: longitude of the centre
    :param distance: circle radius in kilometres
    :return: list of four rectangles in the order north-west, north-east,
        south-west, south-east. Values in the north-east rectangle and the
        centre corner of the south-west rectangle are strings (``str()`` of
        the value with quotes and newlines removed); all other values are the
        latitude/longitude attributes of the computed points.
    """
    def _clean(value):
        # Same normalisation the original applied inline to each value.
        return str(value).replace("'", "").replace("\n", "")

    # Each compass point is computed once and shared by the rectangles
    # that need it.
    west = get_distance_point(lat, lon, distance, 270)
    north = get_distance_point(lat, lon, distance, 0)
    east = get_distance_point(lat, lon, distance, 90)
    south = get_distance_point(lat, lon, distance, 180)
    # Outer corners: go east then north, and west then south.
    north_east = get_distance_point(east.latitude, east.longitude, distance, 0)
    south_west = get_distance_point(west.latitude, west.longitude, distance, 180)

    center_lat = _clean(lat)
    center_lon = _clean(lon)

    square_1 = (west.latitude, west.longitude, north.latitude, north.longitude)
    square_2 = (center_lat, center_lon,
                _clean(north_east.latitude), _clean(north_east.longitude))
    square_3 = (south_west.latitude, south_west.longitude, center_lat, center_lon)
    square_4 = (south.latitude, south.longitude, east.latitude, east.longitude)
    return [square_1, square_2, square_3, square_4]
因为要检索的 POI 在范围内较少,所以写得比较粗糙,只分成了4个矩形框;其实可以通过计算经纬度把范围分成更多的小块。爬下来的数据:
下面放完整的脚本:
# -*- coding: utf-8 -*-
import datetime
import json
import random
import pandas as pd
import requests
import sys
import time
import geopy
import geopy.distance
import geopy.distance
def get_distance_point(lat, lon, distance, direction):
    """Return the point reached by travelling from (lat, lon) along a bearing.

    :param lat: latitude of the starting point
    :param lon: longitude of the starting point
    :param distance: distance to travel, in kilometres
    :param direction: bearing in degrees (north: 0, east: 90, south: 180, west: 270)
    :return: the destination point as returned by geopy's ``destination``
    """
    start = geopy.Point(lat, lon)
    # ``VincentyDistance`` was removed in geopy 2.0. ``geopy.distance.distance``
    # is the library's default ellipsoidal distance class and exists in both
    # old and new geopy releases, so this works regardless of installed version.
    d = geopy.distance.distance(kilometers=distance)
    return d.destination(point=start, bearing=direction)
def get_square(lat, lon, distance):
    """Split the square circumscribing a search circle into four rectangles.

    The circle is centred on (lat, lon) with a radius of ``distance``
    kilometres. Every rectangle is a 4-tuple
    ``(sw_lat, sw_lon, ne_lat, ne_lon)``.

    :param lat: latitude of the centre
    :param lon: longitude of the centre
    :param distance: circle radius in kilometres
    :return: list of four rectangles in the order north-west, north-east,
        south-west, south-east. Values in the north-east rectangle and the
        centre corner of the south-west rectangle are strings (``str()`` of
        the value with quotes and newlines removed); all other values are the
        latitude/longitude attributes of the computed points.
    """
    def _clean(value):
        # Same normalisation the original applied inline to each value.
        return str(value).replace("'", "").replace("\n", "")

    # Each compass point is computed once and shared by the rectangles
    # that need it.
    west = get_distance_point(lat, lon, distance, 270)
    north = get_distance_point(lat, lon, distance, 0)
    east = get_distance_point(lat, lon, distance, 90)
    south = get_distance_point(lat, lon, distance, 180)
    # Outer corners: go east then north, and west then south.
    north_east = get_distance_point(east.latitude, east.longitude, distance, 0)
    south_west = get_distance_point(west.latitude, west.longitude, distance, 180)

    center_lat = _clean(lat)
    center_lon = _clean(lon)

    square_1 = (west.latitude, west.longitude, north.latitude, north.longitude)
    square_2 = (center_lat, center_lon,
                _clean(north_east.latitude), _clean(north_east.longitude))
    square_3 = (south_west.latitude, south_west.longitude, center_lat, center_lon)
    square_4 = (south.latitude, south.longitude, east.latitude, east.longitude)
    return [square_1, square_2, square_3, square_4]
class BaiDuPOI(object):
    """Keyword place search inside one rectangle via the Tencent Maps API.

    Despite the class name, the request URLs target
    ``apis.map.qq.com/ws/place/v1/search``.
    """

    def __init__(self, itemy, loc, api_key=None, timeout=10):
        """
        :param itemy: search keyword
        :param loc: rectangle as the comma-separated string
            ``sw_lat,sw_lng,ne_lat,ne_lng``
        :param api_key: API key; when ``None`` the module-level ``tx_api``
            is looked up at request time (the original behaviour)
        :param timeout: seconds to wait for each HTTP response
        """
        self.itemy = itemy
        self.loc = loc
        self.api_key = api_key
        self.timeout = timeout

    def urls(self):
        """Return the ten paged search URLs (page_index 1-10, 20 results each)."""
        # Fall back to the global only when no key was passed in, so existing
        # ``BaiDuPOI(itemy, loc)`` callers keep working unchanged.
        api_key = self.api_key if self.api_key is not None else tx_api
        template = ("https://apis.map.qq.com/ws/place/v1/search?boundary=rectangle({loc})"
                    "&keyword={keyword}&page_size=20&page_index={page}"
                    "&orderby=_distance&key={key}")
        return [
            template.format(loc=self.loc, keyword=self.itemy, page=page, key=api_key)
            for page in range(1, 11)
        ]

    def baidu_search(self):
        """Fetch result pages in order and return their ``data`` lists.

        Paging stops at the first response that has no ``data`` key or whose
        ``data`` is empty.

        :return: list with one entry per fetched page, each entry being that
            page's ``data`` value
        """
        json_sel = []
        for url in self.urls():
            # Without a timeout a stalled connection would block the run forever.
            response = requests.get(url, timeout=self.timeout)
            result = json.loads(response.text)
            if 'data' not in result or not result['data']:
                break
            json_sel.append(result['data'])
        return json_sel
if __name__ == '__main__':
    # Script entry point: for every branch listed in the input file, search the
    # four rectangles around it for the keyword and append the hits to xxx.csv.
    import csv  # only this script section needs it, so the import stays local

    # Not read in the visible part of the script; kept in case later code
    # reports the elapsed time.
    start_time = time.time()
    tx_api = "**********************************"
    # Search radius around each branch, in kilometres.
    distance_km = 1
    # Characters removed from str(tuple) to get "sw_lat,sw_lng,ne_lat,ne_lng".
    strip_chars = str.maketrans("", "", "()\n' ")

    # The file is only read, so plain 'r' is enough.
    with open(r'C:\Users\Administrator\Desktop\m.txt', 'r', encoding='UTF-8') as f:
        wangdian = f.readlines()

    for line in wangdian:
        fields = line.rstrip("\n").split(',')
        # Skip blank/short lines and rows whose second field is the empty
        # marker '[]' (also when that marker is the last field on the line).
        if len(fields) < 2 or fields[1] == '[]':
            continue

        # Latitude is taken from the last field, longitude from the second.
        square_list = get_square(fields[-1], fields[1], float(distance_km))

        zhongjie_list = []
        for square in square_list:
            print(str(square))
            par = BaiDuPOI(u'中介', str(square).translate(strip_chars))
            # Each page is a list of place records; flatten them into one list.
            for page in par.baidu_search():
                zhongjie_list.extend(page)

        num = len(zhongjie_list)
        print(num)
        if not zhongjie_list:
            continue

        # Open the output once per branch instead of once per record.
        with open('xxx.csv', 'a', encoding='utf-8') as fh:
            # csv.writer quotes fields containing commas or quotes, which plain
            # string concatenation did not. lineterminator='\n' keeps the line
            # ending the original wrote (the text layer still applies the
            # platform's newline translation).
            writer = csv.writer(fh, lineterminator='\n')
            for zhongjie in zhongjie_list:
                ad_info = zhongjie.get('ad_info') or {}
                location = zhongjie.get('location') or {}
                Obtain_time = datetime.datetime.now().strftime("%Y-%m-%d")
                writer.writerow([
                    fields[0],
                    str(distance_km),
                    str(num),
                    zhongjie.get('title', ''),
                    zhongjie.get('address', ''),
                    ad_info.get('province', ''),
                    ad_info.get('city', ''),
                    ad_info.get('district', ''),
                    str(location.get('lat', '')),
                    str(location.get('lng', '')),
                    str(Obtain_time),
                ])