爬虫——百度POI爬取(python3)
学习过程:
爬虫:
本人对python的了解只在hello world水平,能写出这个爬虫主要感谢大佬@陈修一 的文章《基于python的百度迁徙1——迁入、迁出数据(附代码)》。
url:
查看百度地图开放平台Web服务API服务文档
https://lbsyun.baidu.com/index.php?title=webapi/guide/webservice-placeapi
POI分类:
http://lbsyun.baidu.com/index.php?title=lbscloud/poitags
没有什么python爬虫基础,代码可能不够完善,希望有大佬可以多多交流指正。
输出结果:
完整代码:
使用前请先申请AK,并修改参数,申请方法自行百度。
import importlib, sys
import xlwt
import time
import json
import requests
importlib.reload(sys)
# Browser-like User-Agent string sent with outgoing HTTP requests.
user_agent = 'Mozilla/5.0 (Windows NT 6.1 WOW:64)'
# HTTP header names are hyphenated: a 'User_Agent' key is just an unknown
# custom header and leaves the real User-Agent at the library default.
headers = {'User-Agent': user_agent}
def _poi_row(poi):
    """Flatten one POI record from the API response into the nine sheet columns."""
    location = poi.get('location') or {}
    # NOTE(review): detail_info is presumably only returned when the request
    # asks for detailed results (e.g. scope=2) -- confirm against the Place
    # API docs. Missing fields become empty cells instead of raising KeyError.
    detail = poi.get('detail_info') or {}
    children = detail.get('children')
    return (
        poi.get('name', ''),
        location.get('lat', ''),
        location.get('lng', ''),
        poi.get('province', ''),
        poi.get('city', ''),
        poi.get('area', ''),
        poi.get('address', ''),
        detail.get('tag', ''),
        # xlwt cannot write a list into a cell, so serialise children as JSON.
        json.dumps(children, ensure_ascii=False) if children else '',
    )


def get_POIdata(query, region, ak='您的AK'):
    """Download Baidu Place API search results and save them as an .xls file.

    The API caps a single query at 400 records, so the function fetches at
    most 40 pages of 10 results; larger areas are covered by calling it once
    per POI category rather than by tiling the region.

    Args:
        query: POI category / keyword to search for (e.g. "文化传媒").
        region: Administrative region to search in (e.g. "武汉").
        ak: Baidu Maps Web-service API key. The default is a placeholder and
            must be replaced with a real key.

    Side effects:
        Writes ``{region}_{query}.xls`` into the current working directory and
        prints progress messages.

    Raises:
        requests.RequestException: on network failure or a non-2xx response.
    """
    workbook = xlwt.Workbook(encoding='utf-8')
    worksheet = workbook.add_sheet('Sheet', cell_overwrite_ok=True)

    # Header row.
    column_titles = ('名称', '纬度', '经度', '省', '市', '区', '地址', '标签', '备注')
    for col, title in enumerate(column_titles):
        worksheet.write(0, col, label=title)

    search_url = 'http://api.map.baidu.com/place/v2/search'
    page_size = 10
    max_pages = 40  # 40 pages * 10 results = the 400-record cap per query
    next_row = 1

    for page in range(max_pages):
        # Let requests build and percent-encode the query string; the
        # hand-concatenated URL had lost its '&region=' separator.
        params = {
            'query': query,
            'page_size': page_size,
            'page_num': page,
            'region': region,
            'output': 'json',
            'ak': ak,
        }
        response = requests.get(search_url, params=params, headers=headers, timeout=10)
        response.raise_for_status()
        payload = json.loads(response.text)

        # Error responses (bad AK, quota exceeded, ...) carry a non-zero
        # status and no 'total'/'results' keys; stop paging but keep the rows
        # collected so far.
        if payload.get('status', 0) != 0:
            message = payload.get('message', '')
            print(f'第{page}页请求失败: {message}')
            break

        results = payload.get('results') or []
        if not payload.get('total') or not results:
            # No more data: do not burn the remaining pages and sleeps.
            break

        print(f'第{page}页完成')
        for poi in results:
            for col, value in enumerate(_poi_row(poi)):
                worksheet.write(next_row, col, label=value)
            next_row += 1

        # Throttle between pages (same total pause as the original 5 s + 1 s).
        time.sleep(6)

    workbook.save(f"{region}_{query}.xls")
    print(f'{region}_{query}全部完成')
if __name__ == "__main__":
    # POI categories to fetch for Wuhan; uncomment an entry to include it in
    # the same run. A 10 s pause follows every category.
    categories = (
        # "购物",
        "文化传媒",
        # "医疗",
    )
    for category in categories:
        get_POIdata(category, "武汉")
        time.sleep(10)
    print('全部完成')