python多线程实现访问页面_python实现多线程爬取酒店网页动态加载信息

最近学习threading库,接触了线程相关的知识。如题所示,在同一台机器、爬取深度相同的条件下,单线程需要33.72s,多线程仅需1.78s。可见多开线程对于提高I/O密集型爬虫的效率行之有效。

声明:

0.本代码仅限学习交流使用

1.转载附作者ID及原作品链接

2.自觉遵守协议 维护网站权益

单线程完整代码(修改path和请求头header后可用):

import re

import requests

import json

import time

def get_detail(url,header,form_data):
    """POST the search form and extract hotel records from the JSON reply.

    The response text is decoded as JSON, the string stored under
    ``['value']['hotelListHtml']`` is split into chunks, and a name, price and
    score are looked up in every chunk with regular expressions.

    Args:
        url: Address the form is posted to.
        header: Request headers handed to ``requests.post``.
        form_data: Form fields handed to ``requests.post`` as ``data``.

    Returns:
        A list of dicts with the keys ``"name"``, ``"price"`` and ``"score"``;
        chunks for which any lookup raises are skipped.
    """
    response=requests.post(url,headers=header,data=form_data)
    # Decode the body with the charset requests detects from its content.
    response.encoding=response.apparent_encoding
    # Convert the JSON text into a dict.
    dictionary=json.loads(response.text)
    # Pick the hotel-list string out of the decoded dict.
    text=dictionary['value']['hotelListHtml']
    # Split the string into chunks.
    # NOTE(review): the delimiter literal below spans several lines and the
    # name/price patterns further down end directly after ``(.*?)``; this looks
    # like markup that was lost when the article was extracted. Restore the
    # literals from the original post before running.
    hotel_list=text.split('

')
    dict_list=[]
    for each_hotel in hotel_list:
        try:
            name=re.findall(r'class="info_cn">(.*?)',each_hotel)[0]
            price=re.findall(r'(.*?)',each_hotel)[0]
            score=re.findall(r'data-score="(.*?)"',each_hotel)[0]
            new_dict={}
            new_dict["name"]=name
            new_dict["price"]=price
            new_dict["score"]=score
            dict_list.append(new_dict)
        # NOTE(review): a bare ``except`` also hides errors other than the
        # IndexError raised when a pattern has no match; consider narrowing it.
        except:
            continue
    return dict_list

def save(detail_list,path):
    """Append hotel records to the CSV file at ``path``.

    Every record becomes one ``name,price,score`` line. Fields that contain a
    comma, a quote or a line break are quoted by the ``csv`` module so that
    they cannot shift the columns.

    Args:
        detail_list: Iterable of dicts, each providing the keys ``"name"``,
            ``"price"`` and ``"score"``.
        path: Destination file. It is opened in append mode, so rows written
            earlier are kept.

    Raises:
        KeyError: If a record lacks one of the three keys.
        OSError: If the file cannot be opened or written.
    """
    import csv  # local import: csv is not among the file's top-level imports

    # The context manager closes the handle even when a record is malformed.
    with open(path,"a") as file:
        # lineterminator="\n" keeps the line endings the hand-built rows had;
        # the text layer still applies the platform's newline translation.
        writer=csv.writer(file,lineterminator="\n")
        for each_detail in detail_list:
            writer.writerow(
                (each_detail["name"],each_detail["price"],each_detail["score"])
            )

def main(pages=10,path=r"C:\Users\asus\Desktop\CS\PYTHON\实例\酒店信息.csv"):
    """Crawl the hotel list page by page and store the rows in a CSV file.

    For every page the captured search form is posted through ``get_detail``
    and the returned records are appended to ``path`` through ``save``.

    Args:
        pages: Crawl depth, i.e. how many page indexes are requested.
        path: CSV file that receives the rows; created if it does not exist.
    """
    import os  # local import: os is not among the file's top-level imports

    # Request headers captured from a browser session.
    header={
        'Accept':'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding':'gzip, deflate',
        'Accept-Language':'zh-CN,zh;q=0.9,ja;q=0.8,en;q=0.7',
        'Connection':'keep-alive',
        'Content-Length':'2273',
        'Content-Type':'application/x-www-form-urlencoded; charset=UTF-8',
        'Cookie':'CookieGuid=2cbbf4d7-ed57-43f8-8842-4a74ad939a46; H5CookieId=db6aea90-999f-4be0-9930-7cfcea066214; _fid=2cbbf4d7-ed57-43f8-8842-4a74ad939a46; CitySearchHistory=0101%23%E5%8C%97%E4%BA%AC%23beijing%23; firsttime=1583163644996; SessionGuid=abe14edf-638e-44e5-a26d-94807cbf9e7a; Esid=038b4b4a-fd3a-4c88-bb54-fe60f6a9bf12; com.eLong.CommonService.OrderFromCookieInfo=Orderfromtype=1&Parentid=1000&Status=1&Cookiesdays=30&Coefficient=0.0&Pkid=1105&Priority=9001&Isusefparam=0&Makecomefrom=0&Savecookies=0; fv=pcweb; anti_token=4D71FDF2-2357-461A-984C-56958AE1A7CE; ShHotel=InDate=2020-03-03&CityID=0101&CityNameEN=beijing&CityNameCN=%E5%8C%97%E4%BA%AC&OutDate=2020-03-04&CityName=%E5%8C%97%E4%BA%AC; ext_param=bns%3D4%26ct%3D3; s_cc=true; __tctmc=0.215881358; __tctmc=20377580.26050747; __tctmd=20377580.254392154; __tctma=20377580.1583163637156064.1583163637230.1583163637230.1583200239719.2; __tctmu=20377580.0.0; __tctmz=20377580.1583200239719.2.1.utmccn=(referral)|utmcsr=bing.com|utmcct=|utmcmd=referral; longKey=1583163637156064; __tctrack=0; __tctmd=0.1; lasttime=1583202216497; s_visit=1; User-Ref-SessionId=78bd-e80d-13ad-2b0a-92aa-532a; trace_extend={"deviceid":"2cbbf4d7-ed57-43f8-8842-4a74ad939a46","appid":"6","userid":"2cbbf4d7-ed57-43f8-8842-4a74ad939a46","orderfromid":1105,"sessionid":"78bd-e80d-13ad-2b0a-92aa-532a","pvid":"c1b8aaeb"}; __tctmb=0.1284443720926117.1583203761350.1583203761350.1; s_sq=elongcom%3D%2526pid%253Dhotel.elong.com%25252Fbeijing%2526pidt%253D1%2526oid%253Djavascript%25253Avoid(0)%2526ot%253DA; __tccgd=0.0; JSESSIONID=1F53BCB446035ADE53A945505FBF2D47',
        'Host':'hotel.elong.com',
        'Origin':'http://hotel.elong.com',
        'Referer':'http://hotel.elong.com/beijing/',
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36',
        'X-Requested-With':'XMLHttpRequest'
    }
    # Search form captured from the same session; only pageIndex is changed
    # between requests.
    form_data={
        'code':'7611836',
        'listRequest.areaID':'',
        'listRequest.bedLargeTypes':'',
        'listRequest.bookingChannel':'1',
        'listRequest.breakfasts':'0',
        'listRequest.cancelFree':'false',
        'listRequest.cardNo':'192928',
        'listRequest.checkInDate':'2020-03-03 00:00:00',
        'listRequest.checkOutDate':'2020-03-04 00:00:00',
        'listRequest.cityID':'0101',
        'listRequest.cityName':'北京',
        'listRequest.crawledFlag':'0',
        'listRequest.customLevel':'11',
        'listRequest.discountIds':'',
        'listRequest.distance':'20000',
        'listRequest.endLat':'0',
        'listRequest.endLng':'0',
        'listRequest.epcCreateOrderGuideVersion':'C',
        'listRequest.facilityIds':'',
        'listRequest.guokaoFlag':'false',
        'listRequest.highPrice':'0',
        'listRequest.hotelBrandIDs':'',
        'listRequest.hotelIDs':'',
        'listRequest.interceptAction':'0',
        'listRequest.isAdvanceSave':'false',
        'listRequest.isAfterCouponPrice':'true',
        'listRequest.isCoupon':'false',
        'listRequest.isDebug':'false',
        'listRequest.isLimitTime':'false',
        'listRequest.isLogin':'false',
        'listRequest.isMobileOnly':'true',
        'listRequest.isNeed5Discount':'true',
        'listRequest.isNeedNotContractedHotel':'false',
        'listRequest.isNeedSimilarPrice':'false',
        'listRequest.isReturnNoRoomHotel':'true',
        'listRequest.isStaySave':'false',
        'listRequest.isTrace':'false',
        'listRequest.isUnionSite':'false',
        'listRequest.isnstantConfirm':'false',
        'listRequest.keywords':'',
        'listRequest.keywordsType':'0',
        'listRequest.language':'cn',
        'listRequest.lat':'39.9059093',
        'listRequest.listType':'0',
        'listRequest.lng':'116.3913489',
        'listRequest.lowPrice':'0',
        'listRequest.orderFromID':'1105',
        'listRequest.pageIndex':'1',
        'listRequest.pageSize':'20',
        'listRequest.payMethod':'0',
        'listRequest.personOfRoom':'0',
        'listRequest.poiId':'0',
        'listRequest.poiName':'',
        'listRequest.productTypes':'1,6,26',
        'listRequest.promotionChannelCode':'0000',
        'listRequest.promotionSwitch':'-1',
        'listRequest.proxyID':'ZD',
        'listRequest.rankType':'0',
        'listRequest.returnFilterItem':'true',
        'listRequest.sectionId':'',
        'listRequest.sellChannel':'1',
        'listRequest.seoHotelStar':'0',
        'listRequest.sortDirection':'1',
        'listRequest.sortMethod':'1',
        'listRequest.standBack':'-1',
        'listRequest.starLevels':'',
        'listRequest.startLat':'0',
        'listRequest.startLng':'0',
        'listRequest.sug_act_info':'',
        'listRequest.taRecommend':'false',
        'listRequest.themeIds':'',
        'listRequest.traceId':'b19bcbae-5495-4ce0-ad0e-3db4778d75e7',
        'listRequest.wordId':'',
        'listRequest.wordType':'-1',
        'listRequest.elongToken':'2cbbf4d7-ed57-43f8-8842-4a74ad939a46',
        'listRequest.trace_token':'|*|cityId:101|*|qId:1b8e11de-115a-4191-a0ce-b9e679ae66cb|*|st:city|*|sId:101|*|'
    }
    url='http://hotel.elong.com/ajax/tmapilist/asyncsearch'

    # Write the header row only into a new or empty file; appending it on
    # every run would bury extra header lines between the data rows.
    if not os.path.exists(path) or os.path.getsize(path)==0:
        with open(path,"a") as file:
            file.write("名称"+","+"价格"+","+"评分"+"\n")

    # NOTE(review): pages are requested as 0..pages-1 although the captured
    # form carried pageIndex '1'; confirm whether the endpoint is 1-based.
    for page in range(pages):
        # Point the form at the page that is loaded next.
        form_data['listRequest.pageIndex']=str(page)
        detail_list=get_detail(url,header,form_data)
        save(detail_list,path)

if __name__ == "__main__":
    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments the way a time.time() difference can.
    started=time.perf_counter()
    main()
    print("Execute over")
    finished=time.perf_counter()
    # Elapsed seconds for the whole crawl.
    print(finished-started)
    # The author measured 33.72 s for this single-threaded version.

单线程代码至此结束。

多线程完整代码(同样需要修改path和请求头header):

import re

import requests

import json

import time

import threading

def get_detail(url,header,form_data,path):
    """POST the search form, extract hotel records and pass them to ``save``.

    The response text is decoded as JSON, the string stored under
    ``['value']['hotelListHtml']`` is split into chunks, and a name, price and
    score are looked up in every chunk with regular expressions. The collected
    records are then written by calling ``save(dict_list, path)``; nothing is
    returned.

    Args:
        url: Address the form is posted to.
        header: Request headers handed to ``requests.post``.
        form_data: Form fields handed to ``requests.post`` as ``data``.
        path: File path forwarded to ``save``.
    """
    response=requests.post(url,headers=header,data=form_data)
    # Decode the body with the charset requests detects from its content.
    response.encoding=response.apparent_encoding
    # Convert the JSON text into a dict.
    dictionary=json.loads(response.text)
    # Pick the hotel-list string out of the decoded dict.
    text=dictionary['value']['hotelListHtml']
    # Split the string into chunks.
    # NOTE(review): the delimiter literal below spans several lines and the
    # name/price patterns further down end directly after ``(.*?)``; this looks
    # like markup that was lost when the article was extracted. Restore the
    # literals from the original post before running.
    hotel_list=text.split('

')
    dict_list=[]
    for each_hotel in hotel_list:
        try:
            name=re.findall(r'class="info_cn">(.*?)',each_hotel)[0]
            price=re.findall(r'(.*?)',each_hotel)[0]
            score=re.findall(r'data-score="(.*?)"',each_hotel)[0]
            new_dict={}
            new_dict["name"]=name
            new_dict["price"]=price
            new_dict["score"]=score
            dict_list.append(new_dict)
        # NOTE(review): a bare ``except`` also hides errors other than the
        # IndexError raised when a pattern has no match; consider narrowing it.
        except:
            continue
    # Hand-off point: this function does the writing itself instead of
    # returning the records.
    save(dict_list,path)

# Serialises appends: save() is reached from several crawler threads at once.
_SAVE_LOCK=threading.Lock()


def save(detail_list,path):
    """Append hotel records to the CSV file at ``path``.

    Every record becomes one ``name,price,score`` line. Fields that contain a
    comma, a quote or a line break are quoted by the ``csv`` module so that
    they cannot shift the columns. Calls are serialised with a module-level
    lock so rows written by different threads are not interleaved.

    Args:
        detail_list: Iterable of dicts, each providing the keys ``"name"``,
            ``"price"`` and ``"score"``.
        path: Destination file. It is opened in append mode, so rows written
            earlier are kept.

    Raises:
        KeyError: If a record lacks one of the three keys.
        OSError: If the file cannot be opened or written.
    """
    import csv  # local import: csv is not among the file's top-level imports

    with _SAVE_LOCK:
        # The context manager closes the handle even when a record is
        # malformed.
        with open(path,"a") as file:
            # lineterminator="\n" keeps the line endings the hand-built rows
            # had; the text layer still applies the platform's newline
            # translation.
            writer=csv.writer(file,lineterminator="\n")
            for each_detail in detail_list:
                writer.writerow(
                    (each_detail["name"],each_detail["price"],each_detail["score"])
                )

def main(pages=10,path=r"C:\Users\asus\Desktop\酒店信息.csv"):
    """Crawl the hotel list with one thread per page and store the rows as CSV.

    A thread running ``get_detail`` is started for every page index; that
    function posts the form and appends the extracted records to ``path``.
    ``main`` returns after all threads have finished.

    Args:
        pages: Crawl depth, i.e. how many page indexes are requested.
        path: CSV file that receives the rows; created if it does not exist.
    """
    import os  # local import: os is not among the file's top-level imports

    # Request headers captured from a browser session.
    header={
        'Accept':'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding':'gzip, deflate',
        'Accept-Language':'zh-CN,zh;q=0.9,ja;q=0.8,en;q=0.7',
        'Connection':'keep-alive',
        'Content-Length':'2273',
        'Content-Type':'application/x-www-form-urlencoded; charset=UTF-8',
        'Cookie':'CookieGuid=2cbbf4d7-ed57-43f8-8842-4a74ad939a46; H5CookieId=db6aea90-999f-4be0-9930-7cfcea066214; _fid=2cbbf4d7-ed57-43f8-8842-4a74ad939a46; CitySearchHistory=0101%23%E5%8C%97%E4%BA%AC%23beijing%23; firsttime=1583163644996; SessionGuid=abe14edf-638e-44e5-a26d-94807cbf9e7a; Esid=038b4b4a-fd3a-4c88-bb54-fe60f6a9bf12; com.eLong.CommonService.OrderFromCookieInfo=Orderfromtype=1&Parentid=1000&Status=1&Cookiesdays=30&Coefficient=0.0&Pkid=1105&Priority=9001&Isusefparam=0&Makecomefrom=0&Savecookies=0; fv=pcweb; anti_token=4D71FDF2-2357-461A-984C-56958AE1A7CE; ShHotel=InDate=2020-03-03&CityID=0101&CityNameEN=beijing&CityNameCN=%E5%8C%97%E4%BA%AC&OutDate=2020-03-04&CityName=%E5%8C%97%E4%BA%AC; ext_param=bns%3D4%26ct%3D3; s_cc=true; __tctmc=0.215881358; __tctmc=20377580.26050747; __tctmd=20377580.254392154; __tctma=20377580.1583163637156064.1583163637230.1583163637230.1583200239719.2; __tctmu=20377580.0.0; __tctmz=20377580.1583200239719.2.1.utmccn=(referral)|utmcsr=bing.com|utmcct=|utmcmd=referral; longKey=1583163637156064; __tctrack=0; __tctmd=0.1; lasttime=1583202216497; s_visit=1; User-Ref-SessionId=78bd-e80d-13ad-2b0a-92aa-532a; trace_extend={"deviceid":"2cbbf4d7-ed57-43f8-8842-4a74ad939a46","appid":"6","userid":"2cbbf4d7-ed57-43f8-8842-4a74ad939a46","orderfromid":1105,"sessionid":"78bd-e80d-13ad-2b0a-92aa-532a","pvid":"c1b8aaeb"}; __tctmb=0.1284443720926117.1583203761350.1583203761350.1; s_sq=elongcom%3D%2526pid%253Dhotel.elong.com%25252Fbeijing%2526pidt%253D1%2526oid%253Djavascript%25253Avoid(0)%2526ot%253DA; __tccgd=0.0; JSESSIONID=1F53BCB446035ADE53A945505FBF2D47',
        'Host':'hotel.elong.com',
        'Origin':'http://hotel.elong.com',
        'Referer':'http://hotel.elong.com/beijing/',
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36',
        'X-Requested-With':'XMLHttpRequest'
    }
    # Search form captured from the same session; it serves as a template
    # that is copied for every page.
    form_data={
        'code':'7611836',
        'listRequest.areaID':'',
        'listRequest.bedLargeTypes':'',
        'listRequest.bookingChannel':'1',
        'listRequest.breakfasts':'0',
        'listRequest.cancelFree':'false',
        'listRequest.cardNo':'192928',
        'listRequest.checkInDate':'2020-03-03 00:00:00',
        'listRequest.checkOutDate':'2020-03-04 00:00:00',
        'listRequest.cityID':'0101',
        'listRequest.cityName':'北京',
        'listRequest.crawledFlag':'0',
        'listRequest.customLevel':'11',
        'listRequest.discountIds':'',
        'listRequest.distance':'20000',
        'listRequest.endLat':'0',
        'listRequest.endLng':'0',
        'listRequest.epcCreateOrderGuideVersion':'C',
        'listRequest.facilityIds':'',
        'listRequest.guokaoFlag':'false',
        'listRequest.highPrice':'0',
        'listRequest.hotelBrandIDs':'',
        'listRequest.hotelIDs':'',
        'listRequest.interceptAction':'0',
        'listRequest.isAdvanceSave':'false',
        'listRequest.isAfterCouponPrice':'true',
        'listRequest.isCoupon':'false',
        'listRequest.isDebug':'false',
        'listRequest.isLimitTime':'false',
        'listRequest.isLogin':'false',
        'listRequest.isMobileOnly':'true',
        'listRequest.isNeed5Discount':'true',
        'listRequest.isNeedNotContractedHotel':'false',
        'listRequest.isNeedSimilarPrice':'false',
        'listRequest.isReturnNoRoomHotel':'true',
        'listRequest.isStaySave':'false',
        'listRequest.isTrace':'false',
        'listRequest.isUnionSite':'false',
        'listRequest.isnstantConfirm':'false',
        'listRequest.keywords':'',
        'listRequest.keywordsType':'0',
        'listRequest.language':'cn',
        'listRequest.lat':'39.9059093',
        'listRequest.listType':'0',
        'listRequest.lng':'116.3913489',
        'listRequest.lowPrice':'0',
        'listRequest.orderFromID':'1105',
        'listRequest.pageIndex':'1',
        'listRequest.pageSize':'20',
        'listRequest.payMethod':'0',
        'listRequest.personOfRoom':'0',
        'listRequest.poiId':'0',
        'listRequest.poiName':'',
        'listRequest.productTypes':'1,6,26',
        'listRequest.promotionChannelCode':'0000',
        'listRequest.promotionSwitch':'-1',
        'listRequest.proxyID':'ZD',
        'listRequest.rankType':'0',
        'listRequest.returnFilterItem':'true',
        'listRequest.sectionId':'',
        'listRequest.sellChannel':'1',
        'listRequest.seoHotelStar':'0',
        'listRequest.sortDirection':'1',
        'listRequest.sortMethod':'1',
        'listRequest.standBack':'-1',
        'listRequest.starLevels':'',
        'listRequest.startLat':'0',
        'listRequest.startLng':'0',
        'listRequest.sug_act_info':'',
        'listRequest.taRecommend':'false',
        'listRequest.themeIds':'',
        'listRequest.traceId':'b19bcbae-5495-4ce0-ad0e-3db4778d75e7',
        'listRequest.wordId':'',
        'listRequest.wordType':'-1',
        'listRequest.elongToken':'2cbbf4d7-ed57-43f8-8842-4a74ad939a46',
        'listRequest.trace_token':'|*|cityId:101|*|qId:1b8e11de-115a-4191-a0ce-b9e679ae66cb|*|st:city|*|sId:101|*|'
    }
    url='http://hotel.elong.com/ajax/tmapilist/asyncsearch'

    # Write the header row only into a new or empty file; appending it on
    # every run would bury extra header lines between the data rows.
    if not os.path.exists(path) or os.path.getsize(path)==0:
        with open(path,"a") as file:
            file.write("名称"+","+"价格"+","+"评分"+"\n")

    # Worker threads, kept so they can be joined below.
    threads=[]
    # NOTE(review): pages are requested as 0..pages-1 although the captured
    # form carried pageIndex '1'; confirm whether the endpoint is 1-based.
    for page in range(pages):
        # Every thread gets its own copy of the form. Sharing one dict and
        # rewriting pageIndex in this loop lets a thread that has not sent its
        # request yet post the index meant for a later page.
        page_form=dict(form_data)
        page_form['listRequest.pageIndex']=str(page)
        thread=threading.Thread(target=get_detail,args=(url,header,page_form,path),name=str(page))
        thread.start()
        threads.append(thread)
    # Wait for every page to be fetched and written before returning.
    for each_thread in threads:
        each_thread.join()

if __name__ == "__main__":
    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments the way a time.time() difference can.
    started=time.perf_counter()
    main()
    print("Execute over")
    finished=time.perf_counter()
    # Elapsed seconds for the whole crawl.
    print(finished-started)
    # The author measured 1.78 s for this multi-threaded version.

多线程代码至此结束

此栏目旨在共享自学之乐,共勉求知之友,共塑网站和谐好学的形象。

欢迎大家在评论区发表合理的意见和指正。

如果觉得该栏目对您有帮助,望不吝点赞收藏。

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值