服务器爬虫网页数据,爬虫(二)—解析真实网页(猫途鹰)

from bs4 import BeautifulSoup

import requests

import time

# One listing-page URL per value of the `oa` path segment: 0, 30, ..., 150
# (six pages). Presumably `oa` is the result offset -- confirm against the site.
urls = list(map(
    'https://www.tripadvisor.cn/Attractions-g187147-Activities-c47-oa{}-Paris_Ile_de_France.html#FILTERED_LIST'.format,
    range(0, 180, 30),
))

# HTTP request headers sent with every page fetch.
# NOTE(review): the Cookie value looks like a session captured from a browser
# (it carries entries such as TAAuth3 / TASession) -- it is probably stale and
# should not be committed to shared code; confirm and replace as needed.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
    # The cookie is one logical header value. It is written as adjacent string
    # literals so that no raw line break ends up inside a quoted string.
    'Cookie': (
        'ServerPool=X; TART=%1%enc%3AJrLr2lvxwNlLbH9Cmhye81h4fhzdErdMYRa5jIgQ%2BzMdQzRRGJHP%2BEwOn0Pk%2B7RjAypFF0poxcI%3D; TAUnique=%1%enc%3A2cU7ADHy9Eo%2BkIWbO8dIhRvcFs06Zy%2F3vKdc%2Bd3i34gVAETMq8nxvA%3D%3D; TASSK=enc%3AAO0kkqxQ6UQrxO%2Fhkulabq0%2FgYgi6LuHCDMDfxtJkh4LERyb5A9E2%2FKatL80BtAkileXZDy3kvSOK7CHrCLzCQ23W40ydDWAbiH2fJ1WXXdRpNYcX%2FqFl3XA4gaaqM6ZeA%3D%3D; VRMCID=%1%V1*id.16631*llp.%2F-a_ttcampaign%5C.MTYpc-a_ttgroup%5C.title-m16631*e.1529666639404; _ga=GA1.2.130210118.1529061841; _gid=GA1.2.969501517.1529061841; _smt_uid=5b23a1d2.4e54e361; __gads=ID=826c32b0d192b76d:T=1529061847:S=ALNI_MaQj-S3SBC0F86Wrv6BWEmJRhlB0A; CommercePopunder=SuppressAll*1529061865893; ki_r=; TAAuth3=3%3Adf9baacbcf8f189f276b1a5c29e15b62%3AABSHViFhFqb1vgGz0nQ1zKy3RFlL3VHov1qFBzyJY1diYONpPht1Vnv2LCsUNojv60oiLMYJzj8gWWMB1Gkji%2FNpJw%2FwPFAZ7lkigK3UdltaJehxgMM1MGd7i%2BbXmId%2Fs7HB5w%2F1ezojK0b7n9MQXUdQliAXeStS1SzWK%2BRMop3nNuU3H6o3oOHl9Rt4ltQKUw%3D%3D; MobileLastViewedList=%1%%2FAttractions-g187147-Activities-c47-Paris_Ile_de_France.html; interstitialCounter=-1; TATravelInfo=V2*AY.2018*AM.6*AD.24*DY.2018*DM.6*DD.25*A.2*MG.-1*HP.2*FL.3*DSM.1529067170928; '
        'CM=%1%PremiumMobSess%2C%2C-1%7Ct4b-pc%2C%2C-1%7CSPHRSess%2C%2C-1%7CRestAds%2FRPers%2C%2C-1%7CRCPers%2C%2C-1%7CWShadeSeen%2C%2C-1%7CTheForkMCCPers%2C%2C-1%7CHomeASess%2C4%2C-1%7CPremiumSURPers%2C%2C-1%7CPremiumMCSess%2C%2C-1%7CRestPartSess%2C%2C-1%7CRestPremRSess%2C%2C-1%7CCpmPopunder_1%2C1%2C1529148251%7CCCSess%2C%2C-1%7CCpmPopunder_2%2C1%2C-1%7CPremRetPers%2C%2C-1%7CViatorMCPers%2C%2C-1%7Csesssticker%2C%2C-1%7C%24%2C%2C-1%7Ct4b-sc%2C%2C-1%7CRestAdsPers%2C%2C-1%7CMC_IB_UPSELL_IB_LOGOS2%2C%2C-1%7Cb2bmcpers%2C%2C-1%7CMC_IB_UPSELL_IB_LOGOS%2C%2C-1%7CPremMCBtmSess%2C%2C-1%7CPremiumSURSess%2C%2C-1%7CLaFourchette+Banners%2C%2C-1%7Csess_rev%2C%2C-1%7Csessamex%2C%2C-1%7CPremiumRRSess%2C%2C-1%7CSPMCSess%2C%2C-1%7CTheForkORSess%2C%2C-1%7CTheForkRRSess%2C%2C-1%7Cpers_rev%2C%2C-1%7Cmds%2C%2C-1%7CRBAPers%2C%2C-1%7CRestAds%2FRSess%2C%2C-1%7CHomeAPers%2C%2C-1%7CPremiumMobPers%2C%2C-1%7CSPHRPers%2C%2C-1%7CRCSess%2C%2C-1%7CLaFourchette+MC+Banners%2C%2C-1%7CRestAdsCCSess%2C%2C-1%7CRestPartPers%2C%2C-1%7CRestPremRPers%2C%2C-1%7Csh%2C%2C-1%7CLastPopunderId%2C137-1859-null%2C-1%7Cpssamex%2C%2C-1%7CTheForkMCCSess%2C%2C-1%7CCCPers%2C%2C-1%7Cb2bmcsess%2C%2C-1%7CSPMCPers%2C%2C-1%7CPremRetSess%2C%2C-1%7CViatorMCSess%2C%2C-1%7CPremiumMCPers%2C%2C-1%7CPremiumRRPers%2C%2C-1%7CRestAdsCCPers%2C%2C-1%7CTheForkORPers%2C%2C-1%7CPremMCBtmPers%2C%2C-1%7CTheForkRRPers%2C%2C-1%7CRestAdsSess%2C%2C-1%7CRBASess%2C%2C-1%7CSPORPers%2C%2C-1%7Cperssticker%2C%2C-1%7CCPNC%2C%2C-1%7C; TAReturnTo=%1%%2FAttractions-g187147-Activities-Paris_Ile_de_France.html; roybatty=TNI1625!AHvfFP6GU%2Blwk4iVZ0AzyrpCCufht6MXowsnGvilj0IjbceNq1euKmzBt2GMOqFWaUSHiMCOUhrHs%2Fiu0fHYMWBajyJ97jRyEttR9yaX840tAKQUND6vW0o3JIcYXgjdkO3J4lFTseSHKDIZem%2FBrHlR1JF9frXGbBh3kQvWi8Xk%2C1; ki_t=1529061844713%3B1529061844713%3B1529067216226%3B1%3B28; '
        'TASession=%1%V2ID.21FD898339223DC08F21308BF888E17F*SQ.134*MC.16631*LR.https%3A%2F%2Fsp0%5C.baidu%5C.com%2F9q9JcDHa2gU2pMbgoY3K%2Fadrc%5C.php%3Ftpl%3Dtpl_11534_17355_13016%26l%3D1504452536%26wd%3D%25E7%258C%25AB%25E9%2580%2594%25E9%25B9%25B0%26issp%3D1%26f%3D8%26ie%3Dutf-8%26rqlang%3Dcn%26tn%3Dbaiduhome_pg%26inputT%3D3211*LP.%2F-a_ttcampaign%5C.MTYpc-a_ttgroup%5C.title-m16631*PR.427%7C*LS.DemandLoadAjax*GR.75*TCPAR.41*TBR.76*EXEX.67*ABTR.62*PHTB.85*FS.25*CPU.80*HS.recommended*ES.popularity*AS.popularity*DS.5*SAS.popularity*FPS.oldFirst*TS.7067C40EA7A60B512E55A582616B88D6*FA.1*DF.0*MS.-1*RMS.-1*FLO.187147*TRA.true*LD.187147; TAUD=LA-1529057833220-1*RDD-1-2018_06_15*HDD-4144951-2018_06_24.2018_06_25*LD-9463016-2018.6.24.2018.6.25*LG-9463017-2.0.F.'
    ),
}

def get_data(url, data=None):
    """Download one listing page and print a record for each listing found.

    Args:
        url: Address of the page to fetch.
        data: Unused; kept only so existing callers that pass it still work.

    Returns:
        None. For every matched listing a dict with the keys 'title', 'star'
        and 'view' is printed to stdout.
    """
    # A timeout keeps a stalled connection from blocking the crawl forever.
    wb_data = requests.get(url, headers=headers, timeout=30)

    # Pause after each request; presumably a politeness delay -- confirm.
    time.sleep(4)

    soup = BeautifulSoup(wb_data.text, 'lxml')
    titles = soup.select('div.listing_title > a')
    stars = soup.select('div.wrap > div.rs.rating > span[alt]')
    views = soup.select('span.more > a')

    # zip() stops at the shortest of the three result lists, so a listing
    # missing any one of the elements shortens (and may misalign) the output.
    for title, star, view in zip(titles, stars, views):
        # Use a fresh local name so the `data` parameter is not rebound.
        record = {
            'title': title.get_text(),
            'star': star.get('alt'),
            'view': view.get_text(),
        }
        print(record)

# Fetch each listing page in turn.
for single_url in urls:
    get_data(single_url)

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值