CASE
当有多个key时,实现单线程用多个key爬取搜索POI2.0数据
搜索POI2.0在哪里
高德开放平台链接: https://lbs.amap.com/
根据官网步骤申请API key
访问的数据长什么样
- 正常数据
查询数据得到:这里的count是本页返回的数据条数(最大为设置的page_size),有用数据在pois中,数据类型为list
如果一个type的所有数据已经查询完,再访问下一页会显示"count":"0",以此作为是否已遍历所有数据的判断依据
- 一个key用完当日调用量时,详细提示在'info'中(值为"USER_DAILY_QUERY_OVER_LIMIT")
代码实现
用到的库
import pandas as pd
import numpy as np
import csv
import requests
import json
import time
访问网页读取结果
header的意义:对于反爬虫,模拟成浏览器访问页面,防止被当成机器人
def Get_poi(key, region, code, page, page_size=25):
    """Fetch one page of AMap POI 2.0 text-search results as raw JSON text.

    Args:
        key: AMap Web Service API key.
        region: Region to search in (the article uses the city code "021").
        code: POI type code, sent as the ``types`` query parameter.
        page: Page number, sent as ``page_num``; callers start at 1.
        page_size: Number of results per page. Defaults to 25, the value
            that was previously hard-coded.

    Returns:
        The response body decoded as UTF-8 text (a JSON document).

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # Browser-like User-Agent so the request is not treated as a bot.
    header = {'User-Agent': "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50"}
    # Let requests build and escape the query string. The hand-formatted URL
    # had "&region" corrupted into "®ion", which broke the query.
    params = {
        'types': code,
        'region': region,
        'page_size': page_size,
        'page_num': page,
        'key': key,
    }
    r = requests.get(
        'https://restapi.amap.com/v5/place/text',
        params=params,
        headers=header,
        timeout=10,  # never hang forever on a stalled connection
    )
    r.encoding = 'utf-8'
    return r.text
本代码中url需要设置的参数有:key,code,region,page_size,page
key:需实名制申请,本代码涉及多个key,需循环调取不同key,且考虑调用量上限
code:指定地点类型,需要读取本地文件,遍历所有type
region:搜索范围,本代码设定为上海市"021",无需循环遍历
page_size,page:page_size设定为最大值,在一页中显示最多数据;page用循环遍历,直至页面不显示数据,认为已经将数据全部爬取
页面需要反复申请读取,来遍历所有数据
涉及:
1、日调用量达到上限后,需要更换key
2、遍历一个类别的所有数据后,更换类别继续遍历
def change_key(keys, i, flag):
    """Advance to the next API key, or report that none are left.

    Args:
        keys: Mapping/table whose 'Key' entry is the sequence of API keys.
        i: Index of the key that has just run out of quota.
        flag: Current exhaustion flag (0 = keys remain, 1 = all used up).

    Returns:
        A ``(index, flag)`` tuple. When ``i`` is the last key (or beyond the
        end, which includes an empty key list) the index is returned
        unchanged and the flag is 1; otherwise the index is ``i + 1`` and
        the flag is passed through untouched.
    """
    # ">=" rather than "==" so an empty list or an out-of-range index is
    # reported as "exhausted" instead of producing an invalid next index.
    if i >= len(keys['Key']) - 1:
        return i, 1
    return i + 1, flag
def Get_times(keys, region, code, i, flag):
    """Collect all POIs of one type code, switching API keys on quota errors.

    Pages are requested one after another starting at page 1 until the API
    answers with ``"count": "0"`` (or an empty ``pois`` list). When a key
    reports ``USER_DAILY_QUERY_OVER_LIMIT`` the next key is selected through
    ``change_key`` and the same page is requested again.

    Args:
        keys: Table with a 'Key' column holding the API keys.
        region: Region passed through to ``Get_poi``.
        code: POI type code to crawl.
        i: Index into ``keys['Key']`` of the key to start with.
        flag: 0 while usable keys remain, 1 once every key is exhausted.
            If it is not 0 on entry, nothing is requested.

    Returns:
        ``(poilist, i, flag)``: the POI dicts gathered so far, the index of
        the key currently in use, and the updated exhaustion flag.

    Raises:
        RuntimeError: If the API returns an error other than the daily
            quota limit (the response carries no 'count' field).
    """
    page = 1
    poilist = []
    while flag == 0:
        key = keys['Key'][i]
        # One request per page: the same response is used both for the quota
        # check and for the data. Fetching twice doubled the quota spent and
        # left the second response unchecked.
        content = json.loads(Get_poi(key, region, code, page))
        time.sleep(0.5)  # AMap limits how fast requests may be issued
        if content.get('info') == "USER_DAILY_QUERY_OVER_LIMIT":
            i, flag = change_key(keys, i, flag)
            if flag == 1:
                print("今日key已用完,遍历至code:", code, "第", page,"页")
            # Retry this page with the next key; the loop ends if flag is 1.
            continue
        if 'count' not in content:
            raise RuntimeError(
                "AMap request failed for code {} page {}: {} ({})".format(
                    code, page, content.get('info'), content.get('infocode')
                )
            )
        pois = content.get('pois') or []
        # "count": "0" marks the page after the last one; the empty-list check
        # additionally guarantees the loop cannot spin without progress.
        if content['count'] == '0' or not pois:
            break
        poilist.extend(pois)
        page += 1
    return poilist, i, flag
完整代码
import pandas as pd
import numpy as np
import csv
import requests
import json
import time
def Get_poi(key, region, code, page, page_size=25):
    """Fetch one page of AMap POI 2.0 text-search results as raw JSON text.

    Args:
        key: AMap Web Service API key.
        region: Region to search in (the article uses the city code "021").
        code: POI type code, sent as the ``types`` query parameter.
        page: Page number, sent as ``page_num``; callers start at 1.
        page_size: Number of results per page. Defaults to 25, the value
            that was previously hard-coded.

    Returns:
        The response body decoded as UTF-8 text (a JSON document).

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # Browser-like User-Agent so the request is not treated as a bot.
    header = {'User-Agent': "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50"}
    # Let requests build and escape the query string. The hand-formatted URL
    # had "&region" corrupted into "®ion", which broke the query.
    params = {
        'types': code,
        'region': region,
        'page_size': page_size,
        'page_num': page,
        'key': key,
    }
    r = requests.get(
        'https://restapi.amap.com/v5/place/text',
        params=params,
        headers=header,
        timeout=10,  # never hang forever on a stalled connection
    )
    r.encoding = 'utf-8'
    return r.text
def change_key(keys, i, flag):
    """Advance to the next API key, or report that none are left.

    Args:
        keys: Mapping/table whose 'Key' entry is the sequence of API keys.
        i: Index of the key that has just run out of quota.
        flag: Current exhaustion flag (0 = keys remain, 1 = all used up).

    Returns:
        A ``(index, flag)`` tuple. When ``i`` is the last key (or beyond the
        end, which includes an empty key list) the index is returned
        unchanged and the flag is 1; otherwise the index is ``i + 1`` and
        the flag is passed through untouched.
    """
    # ">=" rather than "==" so an empty list or an out-of-range index is
    # reported as "exhausted" instead of producing an invalid next index.
    if i >= len(keys['Key']) - 1:
        return i, 1
    return i + 1, flag
def Get_times(keys, region, code, i, flag):
    """Collect all POIs of one type code, switching API keys on quota errors.

    Pages are requested one after another starting at page 1 until the API
    answers with ``"count": "0"`` (or an empty ``pois`` list). When a key
    reports ``USER_DAILY_QUERY_OVER_LIMIT`` the next key is selected through
    ``change_key`` and the same page is requested again.

    Args:
        keys: Table with a 'Key' column holding the API keys.
        region: Region passed through to ``Get_poi``.
        code: POI type code to crawl.
        i: Index into ``keys['Key']`` of the key to start with.
        flag: 0 while usable keys remain, 1 once every key is exhausted.
            If it is not 0 on entry, nothing is requested.

    Returns:
        ``(poilist, i, flag)``: the POI dicts gathered so far, the index of
        the key currently in use, and the updated exhaustion flag.

    Raises:
        RuntimeError: If the API returns an error other than the daily
            quota limit (the response carries no 'count' field).
    """
    page = 1
    poilist = []
    while flag == 0:
        key = keys['Key'][i]
        # One request per page: the same response is used both for the quota
        # check and for the data. Fetching twice doubled the quota spent and
        # left the second response unchecked.
        content = json.loads(Get_poi(key, region, code, page))
        time.sleep(0.5)  # AMap limits how fast requests may be issued
        if content.get('info') == "USER_DAILY_QUERY_OVER_LIMIT":
            i, flag = change_key(keys, i, flag)
            if flag == 1:
                print("今日key已用完,遍历至code:", code, "第", page,"页")
            # Retry this page with the next key; the loop ends if flag is 1.
            continue
        if 'count' not in content:
            raise RuntimeError(
                "AMap request failed for code {} page {}: {} ({})".format(
                    code, page, content.get('info'), content.get('infocode')
                )
            )
        pois = content.get('pois') or []
        # "count": "0" marks the page after the last one; the empty-list check
        # additionally guarantees the loop cannot spin without progress.
        if content['count'] == '0' or not pois:
            break
        poilist.extend(pois)
        page += 1
    return poilist, i, flag
def main():
    """Crawl every POI type code for one city and write results.xlsx.

    Reads the API keys from 'keys.xlsx' (column 'Key') and the type codes
    from 'POI_code.xlsx' (column 'NEW_TYPE'), crawls each code through
    ``Get_times``, and saves the name, type, address and location of every
    POI to 'results.xlsx'. Crawling stops early once all keys are exhausted.
    """
    keys = pd.read_excel('keys.xlsx')  # local list of AMap API keys
    codes = pd.read_excel(r'POI_code.xlsx', dtype={'NEW_TYPE': str})
    codes = codes['NEW_TYPE']  # POI type codes to query
    city = '021'  # Shanghai; replace with another city/district code as needed
    data = []
    i = 0  # index of the API key currently in use
    flag = 0  # becomes 1 once every API key has used up its daily quota
    for code in codes:
        # Carry the key index forward. It used to be bound to an unused name,
        # so every type code started over from the first, exhausted key.
        poi, i, flag = Get_times(keys, city, code, i, flag)
        if flag == 1:
            # The interrupted code's partial pages are not kept.
            # NOTE(review): presumably so that code can be re-crawled from
            # page 1 on the next run - confirm this is intended.
            break
        data.extend(poi)
    # Convert the collected POIs to a DataFrame and write it to Excel.
    columns = ['name', 'type', 'address', 'location']
    results = pd.DataFrame(
        {col: [poi[col] for poi in data] for col in columns},
        columns=columns,
    )
    results.to_excel('results.xlsx')
# Run the crawler only when this file is executed as a script.
if __name__ == "__main__":
    main()
代码小趴菜,欢迎各路大佬讨论指正