景点门票爬取的速度提升
因为爬取的数据过多,所以速度较慢,这次完成的工作就是提高爬虫的速度。
采用多线程的方式,检索每一个平台时都创建一个线程,待数据返回,再进行合并即可。
def init():
    '''
    Build one spider object per supported ticket site and keep them in
    module-level globals so that later searches can reuse them.

    Sets the module-level flag ``inited`` to ``True`` once every spider
    has been constructed.

    :return: None
    '''
    global inited
    global feizhu, xiecheng, tuniu, quna
    global lvmama, dahe, klook, tongcheng

    # Spiders are constructed one after another, in a fixed order.
    feizhu = menpiao.zwf_menpiao_feizhu.FeizhuSpider()
    xiecheng = menpiao.zwf_menpiao_xiecheng.XiechengSpider()
    tuniu = menpiao.zwf_menpiao_tuniu.TuNiuSpider()
    quna = menpiao.zwf_menpiao_qunar.QunaSpider()
    lvmama = menpiao.zwf_menpiao_lvmama.LvmamaSpider()
    dahe = menpiao.zwf_menpiao_dahe.DaheSpider()
    klook = menpiao.zwf_menpiao_klook.KlookSpider()
    tongcheng = menpiao.zwf_menpiao_tongcheng.TongchengSpider()

    # The flag is raised only after all constructors have returned.
    inited = True
def searchSpots(keyword, city):
    '''
    Query every supported ticket site for a spot in parallel and store the
    merged result.

    Sites listed by the original author: Feizhu, Qunar, Xiecheng, Tuniu,
    Klook, Lvmama, Tongcheng and Dahe.

    One thread per site runs ``run_search(spider, keyword, city)``. After
    all threads have finished, the spiders are handed to ``merge`` and the
    merged value is appended to the module-level ``spotsInfo`` list, while
    ``city + keyword`` is appended to the module-level ``city_keywords``
    list.

    :param keyword: search keyword, forwarded unchanged to ``run_search``
    :param city: city, forwarded unchanged to ``run_search``; it is also
        concatenated with ``keyword``, so both are presumably ``str``
    :return: None; results are published through ``spotsInfo`` and
        ``city_keywords``
    '''
    # Create the shared spiders lazily on first use.
    if not inited:
        init()

    # Resolved before the try block so that a missing spider global still
    # raises NameError to the caller. The order is the thread start order.
    spiders = (xiecheng, feizhu, tuniu, quna, lvmama, dahe, klook, tongcheng)

    try:
        # One worker thread per site, so the site lookups run concurrently.
        workers = [
            threading.Thread(target=run_search, args=(spider, keyword, city))
            for spider in spiders
        ]
        for worker in workers:
            worker.start()
        # The calling thread waits here until every site has finished.
        for worker in workers:
            worker.join()

        # Merge the per-site results.
        merged = merge(feizhu, xiecheng, tuniu, quna, lvmama, dahe, klook, tongcheng)
        # Original author's note on the entry layout: spot name, tickets
        # (name, type, price, url, buy = sold count, from = travel agency,
        # isReturnable = refundable, bookTime = booking time,
        # outTime = ticket issue time, useTime = valid time,
        # discription = description).
        spotsInfo.append(merged)
        # The key is the plain concatenation, with no separator.
        city_keywords.append(city + keyword)
    except Exception as e:
        # Best effort: report the failure and carry on. An exception raised
        # inside a worker thread does not propagate to this handler.
        print(e)
因为创建爬虫对象耗时较长,所以在程序初始化的时候就把各平台的对象一次性创建好,之后每次搜索直接复用,而不用重复创建,可以大大缩减程序运行时长。
采用缓存的方式暂存用户在本次页面打开期间搜索过的关键词及其结果,再次搜索相同关键词时可以直接使用缓存,也可以提高搜索速度。