之前在网上看到过用 JavaScript 写的抓取百度地图 POI 的程序,效果还不错。作为 Python 初学者,我尝试用 Python 写了下面的代码,如有错误,还望大神们提供建议。
# -*- coding: utf-8 -*-
'''Created on 2014年12月18日
@author: LiXiang
'''
import math
import time
import urllib
import json
import sys
import urllib2
from threading import Thread
from Queue import Queue
class BaiduSpider(object):
    """Collect Baidu Map POIs for one keyword inside a rectangular area.

    The rectangle given by ``bounds`` (xmin, xmax, ymin, ymax) is cut into
    square cells.  A small pool of worker threads queries each cell through
    the ``place/v2/search`` endpoint using its ``bounds`` parameter, and every
    POI found is appended to ``<keyword>.txt`` as one tab-separated line.

    Public counters, readable after ``_start()`` returns:

    * ``resultCount``  - number of POI lines written
    * ``requestTimes`` - number of HTTP requests actually issued
    * ``maxPageNum``   - largest (unclamped) page count seen for a cell
    """

    PAGE_SIZE = 20   # records requested per page
    MAX_PAGES = 75   # a cell's page count is clamped to this value

    def __init__(self, city='武汉', bounds=[113.6, 115.1, 29.9, 31.4],
                 keyword='ATM', maxRequire=1000):
        """Store the search settings and open the output file.

        :param city: kept on the instance only; the search is done by
            ``bounds``, the city is not sent to the API.
        :param bounds: four numbers (or numeric strings)
            ``xmin, xmax, ymin, ymax``.  The default list is never mutated.
        :param keyword: search keyword; also the output file's base name.
        :param maxRequire: upper limit on the number of HTTP requests.
        :raises ValueError: if ``maxRequire`` is not an integer.
        """
        # Local imports: the module header only provides ``Thread``.
        import io
        from threading import Lock

        # Validate before opening the file so a bad value leaks no handle.
        self.MaxRequire = int(maxRequire)
        self.ak = "***"
        self.baseUrl = "http://api.map.baidu.com/place/v2/search"
        # Explicit UTF-8 text file: writing POI names must not depend on the
        # interpreter's default encoding.
        self.file = io.open(u'%s.txt' % self._toText(keyword), 'w',
                            encoding='utf-8')
        self.resultCount = 0    # POIs actually written
        self.requestTimes = 0   # HTTP requests issued (one per page)
        self.maxPageNum = 0
        self.city = city
        self.keyword = keyword
        self.bounds = list(bounds)
        self._lock = Lock()     # guards the counters, the stop flag and the file
        self._stopped = False   # set once the request budget is used up

    @staticmethod
    def _toText(value):
        """Return ``value`` as text, decoding byte strings as UTF-8."""
        if isinstance(value, bytes):
            return value.decode('utf-8')
        return value

    @staticmethod
    def _cellCount(low, high, delta):
        """Return how many ``delta``-wide cells are needed to cover [low, high].

        The quotient is rounded before taking the ceiling so that float noise
        (``0.3 / 0.1 == 2.9999999999999996``) neither drops nor adds a cell.
        """
        return max(0, int(math.ceil(round((high - low) / delta, 6))))

    @classmethod
    def _pageCount(cls, total):
        """Return the number of pages needed for ``total`` records."""
        # Integer ceiling; ``math.ceil(total / 20)`` floors first on Python 2.
        return (int(total) + cls.PAGE_SIZE - 1) // cls.PAGE_SIZE

    def _gridShape(self, delta):
        """Return ``(rows, cols)``: cell counts along x and along y.

        :raises ValueError: if ``delta`` is not positive.
        """
        if delta <= 0:
            raise ValueError('delta must be positive')
        xmin, xmax, ymin, ymax = [float(v) for v in self.bounds[:4]]
        return (self._cellCount(xmin, xmax, delta),
                self._cellCount(ymin, ymax, delta))

    def _cellBound(self, i, j, delta):
        """Return the ``bounds`` query string for cell (x index i, y index j).

        The order is ``ylow,xlow,yhigh,xhigh``.  The upper edges are clipped
        to the requested rectangle so the last row/column never reaches
        outside it.
        """
        xmin, xmax, ymin, ymax = [float(v) for v in self.bounds[:4]]
        y_low = round(ymin + j * delta, 6)
        x_low = round(xmin + i * delta, 6)
        y_high = min(round(ymin + (j + 1) * delta, 6), ymax)
        x_high = min(round(xmin + (i + 1) * delta, 6), xmax)
        return ','.join(str(v) for v in (y_low, x_low, y_high, x_high))

    def _reserveRequest(self):
        """Claim one request from the budget.

        Returns ``True`` and increments ``requestTimes`` when a request may
        be sent.  Returns ``False`` and raises the stop flag once
        ``MaxRequire`` requests have been issued.
        """
        with self._lock:
            if self._stopped or self.requestTimes >= self.MaxRequire:
                self._stopped = True
                return False
            self.requestTimes += 1
            return True

    def _start(self, delta=0.1, workers=5):
        """Crawl every cell of the grid, then close the output file.

        :param delta: cell edge length, in the same unit as ``bounds``.
        :param workers: number of worker threads.
        """
        try:
            rows, cols = self._gridShape(delta)
            print(u'分割方块数%d*%d' % (rows, cols))
            jobs = Queue()  # holds one flat cell index per job

            def working():
                # Worker loop: take cell indexes off the queue until the
                # program ends (the threads are daemons).
                while True:
                    index = jobs.get()
                    try:
                        if self._stopped:
                            continue  # budget exhausted: just drain the queue
                        # i walks the x axis (rows), j the y axis (cols).
                        i, j = divmod(index, cols)
                        bound = self._cellBound(i, j, delta)
                        print('--%d,%d-----%s--------------------' % (i, j, bound))
                        self._downPatch(bound)
                        print('-----------------------------------\n')
                        time.sleep(0.1)  # small pause between cells
                    except Exception as e:
                        # One bad cell must not kill the worker.
                        print(u'---该区域处理失败-->%r' % (e,))
                    finally:
                        # Always acknowledge the job, otherwise join()
                        # below would wait forever.
                        jobs.task_done()

            for _ in range(max(1, int(workers))):
                t = Thread(target=working)
                t.daemon = True
                t.start()
            for index in range(rows * cols):
                jobs.put(index)
            jobs.join()  # wait until every queued cell is acknowledged
        finally:
            self.file.close()

    def _downPatch(self, coord):
        """Download all pages for one cell and write its POIs to the file.

        :param coord: the cell's ``bounds`` string as built by ``_cellBound``.

        Every request, including the first page, is counted against
        ``MaxRequire``.  When the budget runs out, what was collected so far
        is still written and the stop flag keeps further cells from starting.
        """
        query = {
            'ak': self.ak,
            'query': self._toText(self.keyword).encode('utf-8'),
            'bounds': coord,
            'page_size': str(self.PAGE_SIZE),
            'page_num': '0',
            'output': 'json',
            'scope': '2',
        }
        if not self._reserveRequest():
            return
        js = self._fetch(query)
        if not isinstance(js, dict):
            # Network error or unparsable body: nothing to process.
            return
        total = int(js.get('total') or 0)
        pageNum = self._pageCount(total)
        print(u"---该区域总记录条数-->" + str(total))
        print(u"---该区域分页数" + str(pageNum))
        poiList = list(js.get('results') or [])
        with self._lock:
            if pageNum > self.maxPageNum:
                self.maxPageNum = pageNum
        pageNum = min(pageNum, self.MAX_PAGES)
        is_break = False
        for i in range(1, pageNum):  # page 0 was fetched above
            if not self._reserveRequest():
                is_break = True
                break
            query['page_num'] = str(i)
            page = self._fetch(query)
            if isinstance(page, dict):
                poiList.extend(page.get('results') or [])
        print(u"---该区域查询到的记录条数-->" + str(len(poiList)))
        with self._lock:
            for poi in poiList:
                if self._flatten(poi):
                    self._save(poi)
                    self.resultCount += 1
            self.file.flush()
        if not is_break:
            print(u"---该区域处理完毕")

    @staticmethod
    def _flatten(poi):
        """Copy lat/lng/type/tag to the top level of ``poi`` (in place).

        Returns ``False`` when the POI has no usable ``location``, in which
        case it cannot be written and should be skipped.
        """
        loc = poi.get('location') or {}
        if 'lat' not in loc or 'lng' not in loc:
            return False
        poi['lat'] = loc['lat']
        poi['lng'] = loc['lng']
        details = poi.get('detail_info') or {}
        poi['type'] = details.get('type', u'')
        poi['tag'] = details.get('tag', u'')
        return True

    def _fetch(self, query=None, json=True):
        """Send one request and return the response.

        :param query: dict of query parameters.
        :param json: when true return the parsed JSON, otherwise the raw body.
        :returns: the parsed object / raw body, or ``None`` if the request
            failed or the body could not be parsed.
        """
        param = urllib.urlencode(query or {})
        url = self.baseUrl + '?' + param
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36'
                          '(KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36'
        }
        try:
            # NOTE(review): passing ``param`` as data makes this a POST while
            # the same parameters are also in the URL; kept as in the
            # original - confirm whether a plain GET is intended.
            req = urllib2.Request(url, param, headers)
            # A real 2 second timeout.  Assigning to ``socket.timeout`` (as
            # the code did before) only overwrote the exception class.
            res = urllib2.urlopen(req, timeout=2)
            try:
                data = res.read()
            finally:
                res.close()
        except IOError as e:
            # HTTPError, URLError and socket timeouts all derive from IOError.
            print(getattr(e, 'code', e))
            return None
        if json:
            return self._tojson(data)
        return data

    def _tojson(self, data):
        """Parse ``data`` as JSON; return ``None`` if it is not valid JSON."""
        try:
            return json.loads(data)
        except (TypeError, ValueError):
            return None

    def _save(self, c):
        """Write one POI dict as a tab-separated line to the output file.

        ``lat`` and ``lng`` are required; the text columns default to an
        empty string when the API omitted them.
        """
        _data = u'%f\t%f\t%s\t%s\t%s\t%s\t%s\n' % (
            c['lat'], c['lng'],
            c.get('uid', u''), c.get('name', u''), c.get('address', u''),
            c.get('type', u''), c.get('tag', u''))
        self.file.write(_data)
def main():
    """Ask for keyword, request limit and rectangle, run the spider, print totals.

    A blank answer for the request limit or the rectangle falls back to the
    values shown in the prompt / used as class defaults, and malformed
    numbers are reported instead of crashing deep inside the crawl.
    """
    # Python 2 only: make implicit str <-> unicode conversions use UTF-8
    # instead of ASCII, so Chinese keywords and POI names do not raise.
    reload(sys)
    sys.setdefaultencoding('utf-8')

    print(u"请输入搜索关键字:")
    keyword = raw_input()
    print(u"请输入最大请求次数:")
    maxRequire = raw_input().strip() or '1000'
    print(u"请输入矩形框,以逗号分开(113.6,115.1,29.9,31.4):")
    bounds = raw_input().strip() or '113.6,115.1,29.9,31.4'
    bounds = bounds.split(',')

    # Validate up front: the spider opens its output file on construction.
    try:
        int(maxRequire)
        if len(bounds) != 4:
            raise ValueError(bounds)
        for value in bounds:
            float(value)
    except ValueError:
        print(u"输入格式有误:请求次数应为整数,矩形框应为4个以逗号分开的数字")
        sys.exit(1)

    baiduSpider = BaiduSpider(None, bounds, keyword, maxRequire)
    baiduSpider._start()
    print(u'---返回的结果数-->' + str(baiduSpider.resultCount))
    print(u'---请求次数-->' + str(baiduSpider.requestTimes))
    print(u'---请求的最大页面数(不能超过76)' + str(baiduSpider.maxPageNum))


if __name__ == '__main__':
    main()