突破数据限制爬取百度、腾讯poi数据

这两天工作中需要获取地图poi数据,发现百度、腾讯都有类似的接口,但都对返回的数据量有限制:百度限制单次返回400条,腾讯限制200条。为了突破这一限制,思路是取原来圆形检索的外切矩形,再将矩形分割成四个小矩形,分别获取其中的数据,再将结果集拼接,返回完整的结果。实现依赖geopy库,它可以通过已知的经纬度、距离和方向生成新的经纬度,这样就可以通过中心点和半径来获取矩形检索的左下和右上点。下面是通过geopy库将一个经纬度变成4个矩形的过程。


def get_distance_point(lat, lon, distance, direction):
    """Return the point reached by travelling ``distance`` km from (lat, lon).

    :param lat: latitude of the starting point
    :param lon: longitude of the starting point
    :param distance: distance to travel, in kilometres
    :param direction: bearing in degrees (north: 0, east: 90, south: 180, west: 270)
    :return: the destination point computed by geopy; callers read its
        ``latitude`` and ``longitude`` attributes
    """
    start = geopy.Point(lat, lon)
    # VincentyDistance was removed in geopy 2.0; geodesic is its replacement.
    # Fall back to the old class only on geopy releases that predate geodesic.
    distance_cls = getattr(geopy.distance, "geodesic", None)
    if distance_cls is None:
        distance_cls = geopy.distance.VincentyDistance
    d = distance_cls(kilometers=distance)
    return d.destination(point=start, bearing=direction)


def get_square(lat, lon, distance):
    """Split the square circumscribing a search circle into four quadrants.

    The circle is centred on (lat, lon) with radius ``distance``; each
    quadrant is described by its south-west and north-east corners.

    :param lat: latitude of the centre (number or numeric string)
    :param lon: longitude of the centre (number or numeric string)
    :param distance: circle radius in kilometres
    :return: list of four ``(sw_lat, sw_lon, ne_lat, ne_lon)`` float tuples,
        ordered north-west, north-east, south-west, south-east
    """
    # Normalise once so every tuple element is a float; the caller may pass
    # raw text fields (possibly with a trailing newline, which float() ignores).
    center_lat = float(lat)
    center_lon = float(lon)

    # One offset point per compass direction is enough: the quadrants share
    # the centre and these four extremes, so their edges coincide exactly.
    north_lat = get_distance_point(center_lat, center_lon, distance, 0).latitude
    east_lon = get_distance_point(center_lat, center_lon, distance, 90).longitude
    south_lat = get_distance_point(center_lat, center_lon, distance, 180).latitude
    west_lon = get_distance_point(center_lat, center_lon, distance, 270).longitude

    return [
        (center_lat, west_lon, north_lat, center_lon),    # north-west
        (center_lat, center_lon, north_lat, east_lon),    # north-east
        (south_lat, west_lon, center_lat, center_lon),    # south-west
        (south_lat, center_lon, center_lat, east_lon),    # south-east
    ]

因为要检索的poi在范围内较少,所以写得比较粗糙,只分为了4个矩形框,其实可以通过数学计算经纬度分成更多的小块。爬下来的数据:
在这里插入图片描述
下面放完整的脚本:

# -*- coding: utf-8 -*-

import datetime
import json
import random
import pandas as pd
import requests
import sys
import time
import geopy
import geopy.distance
import geopy.distance

def get_distance_point(lat, lon, distance, direction):
    """Return the point reached by travelling ``distance`` km from (lat, lon).

    :param lat: latitude of the starting point
    :param lon: longitude of the starting point
    :param distance: distance to travel, in kilometres
    :param direction: bearing in degrees (north: 0, east: 90, south: 180, west: 270)
    :return: the destination point computed by geopy; callers read its
        ``latitude`` and ``longitude`` attributes
    """
    start = geopy.Point(lat, lon)
    # VincentyDistance was removed in geopy 2.0; geodesic is its replacement.
    # Fall back to the old class only on geopy releases that predate geodesic.
    distance_cls = getattr(geopy.distance, "geodesic", None)
    if distance_cls is None:
        distance_cls = geopy.distance.VincentyDistance
    d = distance_cls(kilometers=distance)
    return d.destination(point=start, bearing=direction)

def get_square(lat, lon, distance):
    """Split the square circumscribing a search circle into four quadrants.

    The circle is centred on (lat, lon) with radius ``distance``; each
    quadrant is described by its south-west and north-east corners.

    :param lat: latitude of the centre (number or numeric string)
    :param lon: longitude of the centre (number or numeric string)
    :param distance: circle radius in kilometres
    :return: list of four ``(sw_lat, sw_lon, ne_lat, ne_lon)`` float tuples,
        ordered north-west, north-east, south-west, south-east
    """
    # Normalise once so every tuple element is a float; the caller may pass
    # raw text fields (possibly with a trailing newline, which float() ignores).
    center_lat = float(lat)
    center_lon = float(lon)

    # One offset point per compass direction is enough: the quadrants share
    # the centre and these four extremes, so their edges coincide exactly.
    north_lat = get_distance_point(center_lat, center_lon, distance, 0).latitude
    east_lon = get_distance_point(center_lat, center_lon, distance, 90).longitude
    south_lat = get_distance_point(center_lat, center_lon, distance, 180).latitude
    west_lon = get_distance_point(center_lat, center_lon, distance, 270).longitude

    return [
        (center_lat, west_lon, north_lat, center_lon),    # north-west
        (center_lat, center_lon, north_lat, east_lon),    # north-east
        (south_lat, west_lon, center_lat, center_lon),    # south-west
        (south_lat, center_lon, center_lat, east_lon),    # south-east
    ]

class BaiDuPOI(object):
    """Paged keyword search inside a rectangle via the Tencent place-search API.

    The class name is kept for existing callers; the request URLs target
    ``apis.map.qq.com``.
    """

    # Results requested per page and number of pages walked per rectangle.
    PAGE_SIZE = 20
    MAX_PAGES = 10
    # Seconds to wait for each HTTP response; without a timeout a stalled
    # connection would block the whole crawl indefinitely.
    REQUEST_TIMEOUT = 10

    def __init__(self, itemy, loc, api_key=None):
        """
        :param itemy: search keyword
        :param loc: rectangle bounds as a comma-separated string, inserted
            verbatim into ``rectangle(...)`` in the request URL
        :param api_key: API key; when omitted, the module-level ``tx_api``
            set by the script entry point is used
        """
        self.itemy = itemy
        self.loc = loc
        self.api_key = api_key

    def urls(self):
        """Return the request URLs for pages 1..MAX_PAGES of this search."""
        # Resolve the key lazily so the global fallback is only needed when
        # no key was passed to the constructor.
        api_key = self.api_key if self.api_key is not None else tx_api
        prefix = (
            "https://apis.map.qq.com/ws/place/v1/search?boundary=rectangle("
            + self.loc + ")&keyword=" + self.itemy
            + "&page_size=" + str(self.PAGE_SIZE) + "&page_index="
        )
        suffix = "&orderby=_distance&key=" + api_key
        return [
            prefix + str(page) + suffix
            for page in range(1, self.MAX_PAGES + 1)
        ]

    def baidu_search(self):
        """Fetch result pages until one is empty or carries no ``data`` key.

        :return: list of pages, each page being the ``data`` value of one
            response
        :raises requests.exceptions.RequestException: on network failure or
            when a response does not arrive within ``REQUEST_TIMEOUT`` seconds
        """
        pages = []
        for url in self.urls():
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            result = json.loads(response.text)

            # A missing or empty 'data' means there is nothing further to
            # page through (or the request was rejected).
            if 'data' not in result or not result['data']:
                break
            pages.append(result['data'])

        return pages

if __name__ == '__main__':
    # Script entry point: for every outlet listed in the input file, query the
    # four quadrants around it and append one CSV row per POI found.
    import csv  # only the script entry point writes CSV

    start_time = time.time()
    # Must stay a module-level name: BaiDuPOI reads it when building URLs.
    tx_api = "**********************************"
    # Search radius around each outlet, in kilometres; also written to the output.
    distance_km = 1
    # Characters removed from str(tuple) to obtain "lat,lon,lat,lon".
    bounds_cleanup = str.maketrans("", "", "()\n' ")

    # The input is only read, so plain 'r' is enough. The output is opened once
    # instead of once per record. newline handling is left at the default so
    # line endings stay what a plain "\n" write would have produced.
    with open(r'C:\Users\Administrator\Desktop\m.txt', 'r', encoding='UTF-8') as f, \
            open('xxx.csv', 'a', encoding='utf-8') as fh:
        # csv.writer quotes fields containing commas or quotes, so free-text
        # titles and addresses cannot shift the columns.
        writer = csv.writer(fh, lineterminator="\n")

        for line in f:
            # NOTE(review): presumably each line is "<outlet id>,<lon>,...,<lat>";
            # confirm against the file that produces m.txt.
            fields = line.strip().split(',')
            # Skip blank lines and outlets whose coordinate field is '[]'.
            if len(fields) < 2 or fields[1] == '[]':
                continue

            square_list = get_square(fields[-1], fields[1], float(distance_km))

            zhongjie_list = []
            for square in square_list:
                print(str(square))

                bounds = str(square).translate(bounds_cleanup)
                par = BaiDuPOI(u'中介', bounds)
                # Each page is a list of POI dicts; flatten them into one list.
                for page in par.baidu_search():
                    zhongjie_list.extend(page)

            num = len(zhongjie_list)
            print(num)

            for zhongjie in zhongjie_list:
                # Missing or partial sub-objects yield empty columns rather
                # than a KeyError.
                ad_info = zhongjie.get('ad_info') or {}
                location = zhongjie.get('location') or {}
                obtain_time = datetime.datetime.now().strftime("%Y-%m-%d")

                writer.writerow([
                    fields[0],
                    distance_km,
                    num,
                    zhongjie.get('title', ''),
                    zhongjie.get('address', ''),
                    ad_info.get('province', ''),
                    ad_info.get('city', ''),
                    ad_info.get('district', ''),
                    location.get('lat', ''),
                    location.get('lng', ''),
                    obtain_time,
                ])


  • 0
    点赞
  • 9
    收藏
    觉得还不错? 一键收藏
  • 5
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 5
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值