抓取需要登录才能看到的数据

抓取需要登录才能看到的数据时，可以通过请求头（headers）里的 Cookie 来绕过登录，具体方法如下：

from bs4 import BeautifulSoup
import requests

# Request headers sent with the scrape request.
# Both values are copied from the browser's Network panel for a logged-in
# session. The Cookie carries the login session, so treat it as a credential:
# it is tied to one account, and a real one should not be published.
# The Cookie is written as adjacent string literals (one per cookie) that
# Python concatenates into a single line; a header value must not contain a
# raw line break.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/58.0.3029.110 Safari/537.36'
    ),
    'Cookie': (
        'TATravelInfo=V2*A.2*MG.-1*HP.2*FL.3*RS.1; '
        'TAUnique=%1%enc%3Aw15YmyxtSRbYDnhd0rD1zZZLVA3dDpFFQBl0spaJFvYnuvWISCXjiA%3D%3D; '
        'ServerPool=R; '
        'CM=%1%ViatorMCPers%2C%2C-1%7Csesssticker%2C%2C-1%7CRCPers%2C%2C-1%7CHomeAPers%2C%2C-1%7CWShadeSeen%2C%2C-1%7CRCSess%2C%2C-1%7CViatorMCSess%2C%2C-1%7CMC_IB_UPSELL_IB_LOGOS2%2C%2C-1%7CTheForkMCCPers%2C%2C-1%7CPremiumMCPers%2C%2C-1%7CHomeASess%2C1%2C-1%7CMC_IB_UPSELL_IB_LOGOS%2C%2C-1%7CLaFourchette+MC+Banners%2C%2C-1%7CLaFourchette+Banners%2C%2C-1%7Csess_rev%2C%2C-1%7Csessamex%2C%2C-1%7CPremiumMCSess%2C%2C-1%7Csh%2C%2C-1%7Cpssamex%2C%2C-1%7CTheForkMCCSess%2C%2C-1%7Cpers_rev%2C%2C-1%7CRBASess%2C%2C-1%7Cperssticker%2C%2C-1%7CRBAPers%2C%2C-1%7C; '
        'TAReturnTo=%1%%2F; '
        'TASSK=enc%3AAFPjB7Jktwh1TkKavaE7vtJzClaMI8sXAwFrnRywEl5wNf426sYGiWhFq2I4tyB25w6Vol%2F6%2FFH6mCLkv2DtA%2FbZFkfiUTIbfkT%2BkbgN6zryQbhEMC5lbRs0qiGybgc87g%3D%3D; '
        'VRMCID=%1%V1*id.16631*llp.%2F-a_ttcampaign%5C.MTYpc-a_ttgroup%5C.title-m16631*e.1496638674583; '
        '_smt_uid=592baa45.296ff39d; '
        '_jzqy=1.1496033862.1496033862.1.jzqsr=baidu|jzqct=%E7%8C%AB%E9%80%94%E9%B9%B0.-; '
        '_jzqckmp=1; '
        '__gads=ID=8a9baaad1bc0df20:T=1496033877:S=ALNI_MYY85sopQAfz89ygeIDbSKZU-_BYg; '
        'SecureLogin2=3.4%3AAKAhhA8gsEbqZyAfE4GEEx44Qebj73PnxO5Yid37AjKUHbgFuUZhZoyoMv%2FpUb%2FIKVVgh0gEqQYQwBPsvKhM8O1usQcTZ%2FTlL43UGSxHw0i9wXLf0skJB5h3ChqGgFExbWA8MI1pjWpWMXUrrWr4lw1Q2%2FWq6DHz3EAXOe6mwdAOoqej8ry43YgM1MnB%2B3IMZhf4Xg2U5w%2FgtXjZnAIJiJs%3D; '
        'TAAuth2=%1%3%3A302d9e3897ea78dd161d4dec7f187a80%3AAC7j3Agxd7vYxGP62vCD6hcMK8SL55M2Pd1S62hRVQDwV4ecx1q4YQWJIato4a7RW9%2BBqX7MwcbEJe%2FVP81w9HA8%2BvtWwAgvY0dHDyAVh3jMUU57z3EkT7IudGt%2BdQ%2Fi0BMg6CIzAwb48EAZ%2BHN%2BGWRTIVHwuWgxvd1Qy3goNtE2s26m6b6yXGHpFQIIWrrkBA%3D%3D; '
        'CommercePopunder=SuppressAll*1496033936642; '
        'roybatty=TNI1625!AMR93nCA6sy4YyDiG8e51IlkvzJtxCacBwrt2DTiq%2BJGKV%2BhqEYAo81gXMKq%2FhxZScLdy2AieEbHtBK1IXvsYauEtQFUtAVGuZStB4XkUp6LDIqp%2F4%2BZxvPKC48LVfNev6cIzH6q5uUEP4NUQmlIb4WpWEmoSXW4f3tcfMia%2FR29%2C1; '
        '_ga=GA1.2.1770722532.1496033862; '
        '_gid=GA1.2.765886644.1496034352; '
        'Hm_lvt_2947ca2c006be346c7a024ce1ad9c24a=1495974342,1495974449,1496033862; '
        'Hm_lpvt_2947ca2c006be346c7a024ce1ad9c24a=1496034353; '
        'ki_t=1496033871168%3B1496033871168%3B1496034353011%3B1%3B6; '
        'ki_r=; '
        '_qzja=1.679875992.1496033861978.1496033861978.1496033861979.1496034136107.1496034353045..0.0.6.1; '
        '_qzjc=1; '
        '_qzjto=6.1.0; '
        '_jzqa=1.2338869597233829000.1496033862.1496033862.1496033862.1; '
        '_jzqc=1; '
        'TASession=%1%V2ID.B14E3E64A48BA335E2B99AB18125439B*SQ.29*MC.16631*LR.http%3A%2F%2Fbzclk%5C.baidu%5C.com%2Fadrc%5C.php%3Ftpl%3Dtpl_10085_14394_1%26l%3D1053011480%26ie%3Dutf-8%26f%3D8%26tn%3D97312939_hao_pg%26wd%3D%25E7%258C%25AB%25E9%2580%2594%25E9%25B9%25B0%26oq%3D%25E7%258C%25AB%25E9%2580%2594%25E9%25B9%25B0%26rqlang%3Dcn%26pid%3Dsogou-site-9f4e3847f075d1e7*LP.%2F-a_ttcampaign%5C.MTYpc-a_ttgroup%5C.title-m16631*PR.427%7C*LS.Saves*GR.67*TCPAR.15*TBR.6*EXEX.97*ABTR.28*PHTB.98*FS.84*CPU.47*HS.popularity*ES.popularity*AS.popularity*DS.5*SAS.popularity*FPS.oldFirst*TS.AC63A8B05C6C5597D75E1E3C48CA7087*FA.1*DF.0*TRA.true; '
        'TAUD=LA-1496033874562-1*RDD-1-2017_05_29*LG-3811955-2.1.F.*LD-3811956-.....'
    ),
}
# Page that is only visible to a logged-in user.
url_saves = 'https://www.tripadvisor.cn/Saves/all'

# Send the browser's Cookie and User-Agent along with the request.
# The timeout stops the script from blocking forever on a stalled connection,
# and raise_for_status() turns an HTTP error response into an exception
# instead of silently parsing an error page.
web_data = requests.get(url_saves, headers=headers, timeout=10)
web_data.raise_for_status()
soup = BeautifulSoup(web_data.text, 'lxml')

titles = soup.select('div.location_summary > a[target="_blank"]')
# `[style*="..."]` is a substring match. The exact-match form
# `[style="background-image"]` only selects elements whose whole style
# attribute is literally "background-image", so it would select nothing and
# the zip() below would produce no rows.
imgs = soup.select('div.media-left > a[style*="background-image"]')
metas = soup.select('div.pop_index')

# zip() stops at the shortest list, so one row is printed per complete
# (title, image, meta) triple.
for title, img, meta in zip(titles, imgs, metas):
    data = {
        'title': title.get_text(),
        # NOTE(review): this reads the link target of the <a>, not the image
        # URL inside its style attribute -- confirm which one is wanted.
        'img': img.get('href'),
        # NOTE(review): returns None unless the div really has a `url`
        # attribute; get_text() may have been intended -- confirm against
        # the page markup.
        'meta': meta.get('url'),
    }
    print(data)

1. 导入 BeautifulSoup 和 requests 库
2. 在浏览器开发者工具的 Network 面板里，找到请求头（Request Headers）中的 Cookie 和 User-Agent
3. 通过 requests 请求目标网址并获取数据，调用时传入 headers=headers（即刚才复制的 Cookie 和 User-Agent）
4. 用 BeautifulSoup 解析返回的内容
5. 提取想要的信息

转载于:https://www.cnblogs.com/gttpython/p/6926311.html

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值