1、安装库
确保 Python 的位数与操作系统一致(32 位或 64 位)。
pip --version 查看pip版本
Soup = BeautifulSoup(html,'lxml')
汤 汤料 食谱
安装lxml 库 pip install lxml Successfully installed lxml-4.2.5
一般情况下使用lxml可获得高效率和易用性。参考文献:https://www.cnblogs.com/qijj/p/6265308.html
安装beautifulsoup4库 pip install beautifulsoup4 Successfully installed beautifulsoup4-4.6.3
安装requests 库 pip install requests
2、选择简单网页进行爬取
css.selector 谁 在哪 第几个 样式
xpath 谁 在哪 第几个
:nth-of-type(2) 选择同类型兄弟元素中的第 2 个;去掉该伪类即可选择全部同类元素
print(p,h1,sep='\n----------------\n') 步进
from bs4 import BeautifulSoup

# Parse a locally saved HTML file (replace '文件名' with the real file name).
# BUGFIX: the original opened the file as `web` but then parsed an undefined
# name `wb_data` — the file content is now actually read.
with open('文件名') as web:
    wb_data = web.read()

soup = BeautifulSoup(wb_data, 'html.parser')
ps = soup.select('body > p')
# iterate over every matched <p> and print its text content
for p in ps:
    print(p.get_text())

# re-parse with the faster lxml parser and grab the first list item's images.
# BUGFIX: :nth-of-type takes parentheses, not square brackets.
soup = BeautifulSoup(wb_data, 'lxml')
images = soup.select('body>div.main-content>ul>li:nth-of-type(1)>img')
# BUGFIX: removed a stray trailing ')'; sep only matters with multiple args
print(images, sep='\n----------------\n')
# Build dictionaries from paired tag lists, then filter by rating.
def build_pairs(ps, h1s):
    """Pair <p> and <h1> nodes into {'p': ..., 'h1': ...} dicts.

    Each dict holds the text of one <p> and the matching <h1>; every dict is
    printed as it is built and the full list is returned.
    """
    info = []
    for p, h1 in zip(ps, h1s):
        data = {
            'p': p.get_text(),
            'h1': h1.get_text(),
            # 'images': image.get('src'),           # .get() reads a tag attribute
            # 'cate': list(cate.stripped_strings),  # all text under the tag, listed;
            #     when several tags match, scope the selector via its parent
        }
        print(data)
        info.append(data)  # BUGFIX: original appended to a typo'd name `infp`
    return info


def print_high_rated(info, threshold=3.0):
    """Print and return the titles of entries whose 'rate' exceeds *threshold*.

    BUGFIX: the original read `if float(i['rate']>3)` — it compared a string
    to 3 and floated the boolean, and was missing the trailing colon. The
    rate string is now converted to float BEFORE comparing.
    """
    titles = []
    for i in info:
        if float(i['rate']) > threshold:
            print(i['title'])
            titles.append(i['title'])
    return titles
find()查找第一个匹配结果出现的地方,find_all()找到所有匹配结果出现的地方。
#####################################
3、爬取真实网页
分析Request 中的多种方法,主要get post
from bs4 import BeautifulSoup
import requests
import time
# NOTE(review): everything between the triple quotes below is an inert string
# literal — deliberately disabled reference code, it never executes. Left
# byte-identical; enable by removing the quotes and fixing the indentation.
'''
url="https://www.tripadvisor.cn/Hotels-g297435-Zhengzhou_Henan-Hotels.html"
wb_date = requests.get(url)
#time.sleep(4)
#soup = BeautifulSoup(wb_date.text,'html.parser')
soup = BeautifulSoup(wb_date.text,'lxml')
#print(soup)
#特征枚举 a[target="_blank"]
#titles = soup.select('div.listing_title > a')
titles = soup.select('div.listing_title > a[target="_blank"]')
#属性枚举,也可以是img[width="160"]
images = soup.select('img[width="160"]')
#images = soup.select('div.aspect.is-shown-at-tablet > div')
cates = soup.select('ul.icons_list.easyClear.vertical ')
#cates = soup.select('div.prw_rup.prw_common_hotel_icons_list.linespace.is-shown-at-tablet > ul.icons_list.easyClear.vertical ')
print(titles,images,cates,sep='\n----------------\n')
for title,image,cate in zip(titles,images,cates):
data={
'title' : title.get_text(),
'image' : image.get('style'),
'cate' : list(cate.stripped_strings), #计取该url里面的字符
}
print(data)
'''
图片地址一致,因为做了爬取保护,后续进行js相关拆解
#############################
最主要的分析selector,有些div里面的style不能获取了,需要后续用xpath进行特别解析,后续在学习
需要注意time.sleep 规避风险
添加headers,然后规避登录爬取
# Browser-masquerading headers: a desktop User-Agent plus the session Cookie
# copied from a logged-in browser, so the personal "Saves" page can be
# fetched without going through the login flow.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36',
    # BUGFIX: the Cookie is ONE header value. In the notes it had been
    # hard-wrapped across three lines (an unterminated-string SyntaxError)
    # and the value mistakenly began with a duplicate "Cookie:" prefix
    # (compare the correct second headers dict later in this file). Rejoined
    # here via implicit string-literal concatenation.
    'Cookie': 'ServerPool=X; TART=%1%enc%3AWNNiywND7UDd4mtKuGrbic701EqT0AcfEyAaNC44B6H%2FjXTq9TwDjSvLr98FqbW3YyDA5hAqKF0%3D; TAUnique=%1%enc%3A77tOz8SXtDmNIOhtYd%2Fb4lOFokdL0I8%2BcjHDjIu3EivLH0eO5X1%2BTQ%3D%3D; TASSK=enc%3AANo2G5t7Uh9fwFdwPxMcYfKRwZiAkMEUZ72ZSL%2FaZTIl2O02UObg5VjVU7V8T00eIi2D0HDEfyqaughFkPNWG%2FUfVzTRdTVoLo2Woe39wNkbXOFwpR5ruOKXQ%2Bor2FVb7w%3D%3D; TAPD=tripadvisor.cn; __gads=ID=cd1c4823fdd8b65c:T=1539416662:S=ALNI_MZHvSkLFR17eMnRyunj7Wy0mD8OBw; BEPIN=%1%1666c64c72a%3Busr02t.daodao.com%3A10023%3B; VRMCID=%1%V1*id.12019*llp.%2F*e.1540023072752; TATravelInfo=V2*AY.2018*AM.10*AD.22*DY.2018*DM.10*DD.23*A.2*MG.-1*HP.2*FL.3*DSM.1539418304313*AZ.1*RS.1; CommercePopunder=SuppressAll*1539568223481; TAAuth3=3%3A3beb3899e89d6095fd7c36a1a60eaed4%3AAA2GuKLiIuC8%2BBxIBl3rfOZId1civRJB5HsoiAf2mw4UBClcrZmVGpXZfuWpoX2jzQkiylTE2mHvlQkNhooIRo2IUE9T7RKlzILBKKVhw%2FbS07egfquPgL0mNC6a7Nu8l6GCWLcRblaTgXKiX3O1ee2AWAMIr2lnglZ0byy4%2Biqi5PW%2F6ixEfWcIQtDnWeDPJQ%3D%3D; _smt_uid=5bc1a252.5b8541f4; '
              'CM=%1%HanaPersist%2C%2C-1%7CPremiumMobSess%2C%2C-1%7Ct4b-pc%2C%2C-1%7CHanaSession%2C%2C-1%7CRestAds%2FRPers%2C%2C-1%7CRCPers%2C%2C-1%7CWShadeSeen%2C%2C-1%7CFtrPers%2C%2C-1%7CTheForkMCCPers%2C%2C-1%7CHomeASess%2C7%2C-1%7CPremiumSURPers%2C%2C-1%7CPremiumMCSess%2C%2C-1%7CRestPartSess%2C%2C-1%7CRestPremRSess%2C%2C-1%7CCCSess%2C%2C-1%7CPremRetPers%2C%2C-1%7CViatorMCPers%2C%2C-1%7Csesssticker%2C%2C-1%7CPremiumORSess%2C%2C-1%7Ct4b-sc%2C%2C-1%7CRestAdsPers%2C%2C-1%7CMC_IB_UPSELL_IB_LOGOS2%2C%2C-1%7Cb2bmcpers%2C%2C-1%7CPremMCBtmSess%2C%2C-1%7CPremiumSURSess%2C%2C-1%7CMC_IB_UPSELL_IB_LOGOS%2C%2C-1%7CLaFourchette+Banners%2C%2C-1%7Csess_rev%2C%2C-1%7Csessamex%2C%2C-1%7CPremiumRRSess%2C%2C-1%7CTADORSess%2C%2C-1%7CAdsRetPers%2C%2C-1%7CTARSWBPers%2C%2C-1%7CSaveFtrPers%2C%2C-1%7CSPMCSess%2C%2C-1%7CTheForkORSess%2C%2C-1%7CTheForkRRSess%2C%2C-1%7Cpers_rev%2C%2C-1%7CMetaFtrSess%2C%2C-1%7CSPMCWBPers%2C%2C-1%7CRBAPers%2C%2C-1%7CWAR_RESTAURANT_FOOTER_PERSISTANT%2C%2C-1%7CFtrSess%2C%2C-1%7CRestAds%2FRSess%2C%2C-1%7CHomeAPers%2C%2C-1%7CPremiumMobPers%2C%2C-1%7CRCSess%2C%2C-1%7CLaFourchette+MC+Banners%2C%2C-1%7CRestAdsCCSess%2C%2C-1%7CRestPartPers%2C%2C-1%7CRestPremRPers%2C%2C-1%7Csh%2C%2C-1%7Cpssamex%2C%2C-1%7CTheForkMCCSess%2C%2C-1%7CCCPers%2C%2C-1%7CWAR_RESTAURANT_FOOTER_SESSION%2C%2C-1%7Cb2bmcsess%2C%2C-1%7CSPMCPers%2C%2C-1%7CPremRetSess%2C%2C-1%7CViatorMCSess%2C%2C-1%7CPremiumMCPers%2C%2C-1%7CAdsRetSess%2C%2C-1%7CPremiumRRPers%2C%2C-1%7CRestAdsCCPers%2C%2C-1%7CTADORPers%2C%2C-1%7CTheForkORPers%2C%2C-1%7CPremMCBtmPers%2C%2C-1%7CTheForkRRPers%2C%2C-1%7CTARSWBSess%2C%2C-1%7CSaveFtrSess%2C%2C-1%7CPremiumORPers%2C%2C-1%7CRestAdsSess%2C%2C-1%7CRBASess%2C%2C-1%7CSPORPers%2C%2C-1%7Cperssticker%2C%2C-1%7CSPMCWBSess%2C%2C-1%7CMetaFtrPers%2C%2C-1%7C; TAReturnTo=%1%%2FHotel_Review-g297435-d299580-Reviews-Sofitel_Zhengzhou-Zhengzhou_Henan.html; '
              'roybatty=TNI1625!AKKWGBebJveNg6PZTbvfSQcOX3XuNnB55%2Foe83gabZH9o3Fjt2lxuFIHMaYjr3181E%2Fa9bWfbKeo%2BLKBVOz7UIheCDCRn1TkWGHZ7cFWUn7qpFcByvhrSPmmQBBO6T%2FIp4MmnQR58jZkKkRgoorxFjn41jsR%2BAiM0x97I2DvYeTA%2C1; _ga=GA1.2.1112689405.1539416640; _gid=GA1.2.1530960083.1539568227; ki_t=1539416658753%3B1539568238567%3B1539590215170%3B2%3B20; ki_r=; TASession=%1%V2ID.CE28DFC444EE9ABD2568A44D8846D16E*SQ.245*MC.12019*LR.https%3A%2F%2Fwww%5C.baidu%5C.com%2Flink%3Furl%3D5UiQPWfIhWfHq2_A2lSu_2T1c565rgmWjmGVcfYuoYibBfelpBsxgjZkMS6AqgjJ%26wd%3D%26eqid%3Df990134c00004212000000045bc1a899*LP.%2F*LS.DemandLoadAjax*PD13481.1*GR.59*TCPAR.5*TBR.64*EXEX.96*ABTR.38*PHTB.38*FS.14*CPU.55*HS.recommended*ES.popularity*AS.popularity*DS.5*SAS.popularity*FPS.oldFirst*TS.89131EFDCAF9A3688643020151043A67*LF.zhCN*FA.1*DF.0*IR.3*OD.zh*MS.-1*RMS.-1*FLO.1580992*TRA.false*LD.299580; TAUD=LA-1539416640622-1*RDD-1-2018_10_13*HC-1635559*HDD-1648076-2018_10_21.2018_10_22*HD-1665296-2018_10_22.2018_10_23.297435*G-1665297-2.1.297435.*LD-176188859-2018.10.22.2018.10.23*LG-176188862-2.1.T.*ARDD-176188863-2018_10_222018_10_23',
}
url_saves = 'https://www.tripadvisor.cn/Saves/1394680'
wb_data = requests.get(url_saves, headers=headers)
soup = BeautifulSoup(wb_data.text, 'html.parser')
print(soup)
# saved-entry titles and thumbnail links
titles = soup.select('a.title')
images = soup.select('a.thumbnail')
print(titles, images)
对于需要爬取多页的情况,应构造链接列表并用循环函数依次抓取每个页面
# -*- coding: utf-8 -*-
###### compare with server.py — shared setup for the scraping functions below
from bs4 import BeautifulSoup
import requests
import time

# Single list page plus the generated paginated pages (offsets oa30..oa900,
# one page every 30 listings).
url = "https://www.tripadvisor.cn/Hotels-g297435-Zhengzhou_Henan-Hotels.html"
urls = ["https://www.tripadvisor.cn/Hotels-g297435-oa{}-Zhengzhou_Henan-Hotels.html".format(str(i)) for i in range(30, 930, 30)]
url_saves = 'https://www.tripadvisor.cn/Saves/1394680'
url_mobile = 'https://www.tripadvisor.cn/Hotels-g297435-oa90-Zhengzhou_Henan-Hotels.html'
url_zuoye = 'http://bj.xiaozhu.com/fangzi/31387618603.html'

# Desktop headers with the logged-in session Cookie (used by get_favs).
# BUGFIX: the Cookie is ONE header value; in the notes it had been
# hard-wrapped mid-string across two lines (an unterminated-string
# SyntaxError). Rejoined here via implicit string-literal concatenation.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36',
    'Cookie': 'ServerPool=X; TART=%1%enc%3AWNNiywND7UDd4mtKuGrbic701EqT0AcfEyAaNC44B6H%2FjXTq9TwDjSvLr98FqbW3YyDA5hAqKF0%3D; TAUnique=%1%enc%3A77tOz8SXtDmNIOhtYd%2Fb4lOFokdL0I8%2BcjHDjIu3EivLH0eO5X1%2BTQ%3D%3D; TASSK=enc%3AANo2G5t7Uh9fwFdwPxMcYfKRwZiAkMEUZ72ZSL%2FaZTIl2O02UObg5VjVU7V8T00eIi2D0HDEfyqaughFkPNWG%2FUfVzTRdTVoLo2Woe39wNkbXOFwpR5ruOKXQ%2Bor2FVb7w%3D%3D; TAPD=tripadvisor.cn; __gads=ID=cd1c4823fdd8b65c:T=1539416662:S=ALNI_MZHvSkLFR17eMnRyunj7Wy0mD8OBw; BEPIN=%1%1666c64c72a%3Busr02t.daodao.com%3A10023%3B; VRMCID=%1%V1*id.12019*llp.%2F*e.1540023072752; TATravelInfo=V2*AY.2018*AM.10*AD.22*DY.2018*DM.10*DD.23*A.2*MG.-1*HP.2*FL.3*DSM.1539418304313*AZ.1*RS.1; CommercePopunder=SuppressAll*1539568223481; TAAuth3=3%3A3beb3899e89d6095fd7c36a1a60eaed4%3AAA2GuKLiIuC8%2BBxIBl3rfOZId1civRJB5HsoiAf2mw4UBClcrZmVGpXZfuWpoX2jzQkiylTE2mHvlQkNhooIRo2IUE9T7RKlzILBKKVhw%2FbS07egfquPgL0mNC6a7Nu8l6GCWLcRblaTgXKiX3O1ee2AWAMIr2lnglZ0byy4%2Biqi5PW%2F6ixEfWcIQtDnWeDPJQ%3D%3D; _smt_uid=5bc1a252.5b8541f4; CM=%1%HanaPersist%2C%2C-1%7CPremiumMobSess%2C%2C-1%7Ct4b-pc%2C%2C-1%7CHanaSession%2C%2C-1%7CRestAds%2FRPers%2C%2C-1%7CRCPers%2C%2C-1%7CWShadeSeen%2C%2C-1%7CFtrPers%2C%2C-1%7CTheForkMCCPers%2C%2C-1%7CHomeASess%2C7%2C-1%7CPremiumSURPers%2C%2C-1%7CPremiumMCSess%2C%2C-1%7CRestPartSess%2C%2C-1%7CRestPremRSess%2C%2C-1%7CCCSess%2C%2C-1%7CPremRetPers%2C%2C-1%7CViatorMCPers%2C%2C-1%7Csesssticker%2C%2C-1%7CPremiumORSess%2C%2C-1%7Ct4b-sc%2C%2C-1%7CRestAdsPers%2C%2C-1%7CMC_IB_UPSELL_IB_LOGOS2%2C%2C-1%7Cb2bmcpers%2C%2C-1%7CPremMCBtmSess%2C%2C-1%7CPremiumSURSess%2C%2C-1%7CMC_IB_UPSELL_IB_LOGOS%2C%2C-1%7CLaFourchette+Banners%2C%2C-1%7Csess_rev%2C%2C-1%7Csessamex%2C%2C-1%7CPremiumRRSess%2C%2C-1%7CTADORSess%2C%2C-1%7CAdsRetPers%2C%2C-1%7CTARSWBPers%2C%2C-1%7CSaveFtrPers%2C%2C-1%7CSPMCSess%2C%2C-1%7CTheForkORSess%2C%2C-1%7CTheForkRRSess%2C%2C-1%7Cpers_rev%2C%2C-1%7CMetaFtrSess%2C%2C-1%7CSPMCWBPers%2C%2C-1%7CRBAPers%2C%2C-1%7CWAR_RESTAURANT_FOOTER_PERSISTANT%2C%2C-1%7CFtrSess%2C%2C-1%7CRestAds%2FRSess%2C%2C-1'
              '%7CHomeAPers%2C%2C-1%7CPremiumMobPers%2C%2C-1%7CRCSess%2C%2C-1%7CLaFourchette+MC+Banners%2C%2C-1%7CRestAdsCCSess%2C%2C-1%7CRestPartPers%2C%2C-1%7CRestPremRPers%2C%2C-1%7Csh%2C%2C-1%7Cpssamex%2C%2C-1%7CTheForkMCCSess%2C%2C-1%7CCCPers%2C%2C-1%7CWAR_RESTAURANT_FOOTER_SESSION%2C%2C-1%7Cb2bmcsess%2C%2C-1%7CSPMCPers%2C%2C-1%7CPremRetSess%2C%2C-1%7CViatorMCSess%2C%2C-1%7CPremiumMCPers%2C%2C-1%7CAdsRetSess%2C%2C-1%7CPremiumRRPers%2C%2C-1%7CRestAdsCCPers%2C%2C-1%7CTADORPers%2C%2C-1%7CTheForkORPers%2C%2C-1%7CPremMCBtmPers%2C%2C-1%7CTheForkRRPers%2C%2C-1%7CTARSWBSess%2C%2C-1%7CSaveFtrSess%2C%2C-1%7CPremiumORPers%2C%2C-1%7CRestAdsSess%2C%2C-1%7CRBASess%2C%2C-1%7CSPORPers%2C%2C-1%7Cperssticker%2C%2C-1%7CSPMCWBSess%2C%2C-1%7CMetaFtrPers%2C%2C-1%7C; TAReturnTo=%1%%2FHotel_Review-g297435-d299580-Reviews-Sofitel_Zhengzhou-Zhengzhou_Henan.html; roybatty=TNI1625!AKKWGBebJveNg6PZTbvfSQcOX3XuNnB55%2Foe83gabZH9o3Fjt2lxuFIHMaYjr3181E%2Fa9bWfbKeo%2BLKBVOz7UIheCDCRn1TkWGHZ7cFWUn7qpFcByvhrSPmmQBBO6T%2FIp4MmnQR58jZkKkRgoorxFjn41jsR%2BAiM0x97I2DvYeTA%2C1; _ga=GA1.2.1112689405.1539416640; _gid=GA1.2.1530960083.1539568227; ki_t=1539416658753%3B1539568238567%3B1539590215170%3B2%3B20; ki_r=; TASession=%1%V2ID.CE28DFC444EE9ABD2568A44D8846D16E*SQ.245*MC.12019*LR.https%3A%2F%2Fwww%5C.baidu%5C.com%2Flink%3Furl%3D5UiQPWfIhWfHq2_A2lSu_2T1c565rgmWjmGVcfYuoYibBfelpBsxgjZkMS6AqgjJ%26wd%3D%26eqid%3Df990134c00004212000000045bc1a899*LP.%2F*LS.DemandLoadAjax*PD13481.1*GR.59*TCPAR.5*TBR.64*EXEX.96*ABTR.38*PHTB.38*FS.14*CPU.55*HS.recommended*ES.popularity*AS.popularity*DS.5*SAS.popularity*FPS.oldFirst*TS.89131EFDCAF9A3688643020151043A67*LF.zhCN*FA.1*DF.0*IR.3*OD.zh*MS.-1*RMS.-1*FLO.1580992*TRA.false*LD.299580; TAUD=LA-1539416640622-1*RDD-1-2018_10_13*HC-1635559*HDD-1648076-2018_10_21.2018_10_22*HD-1665296-2018_10_22.2018_10_23.297435*G-1665297-2.1.297435.*LD-176188859-2018.10.22.2018.10.23*LG-176188862-2.1.T.*ARDD-176188863-2018_10_222018_10_23',
}

# Mobile User-Agent only — used to get the simpler mobile markup (get_mobile).
headers_mobile = {
    'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1',
}
def get_attractions(url, data=None):
    """Fetch one hotel-list page and print a {'title', 'cate'} dict per hotel."""
    response = requests.get(url)
    time.sleep(2)  # pause between requests to avoid anti-scraping blocks
    page = BeautifulSoup(response.text, 'lxml')
    # hotel-name anchors and the matching amenity-icon lists
    name_links = page.select('div.listing_title > a')
    icon_lists = page.select('ul.icons_list.easyClear.vertical')
    for name_link, icon_list in zip(name_links, icon_lists):
        record = {
            'title': name_link.get_text(),
            'cate': list(icon_list.stripped_strings),  # every text fragment under the tag
        }
        print(record)
def get_favs(url_saves, data=None):
    """Fetch the saved-items page (authenticated via the module-level
    `headers` cookie) and print a {'title', 'image'} dict per saved entry.

    `data` acts as a guard: pass anything non-None to skip the printing pass.
    """
    wb_data = requests.get(url_saves, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    titles = soup.select('a.title')
    images = soup.select('a.thumbnail')
    if data is None:  # idiom fix: compare to None with `is`, not `==`
        for title, image in zip(titles, images):
            data = {
                'title': title.get_text(),
                'image': image.get('src'),
            }
            print(data)
def get_mobile(url_mobile, data=None):
    """Fetch the list page with a mobile User-Agent and print each hotel title."""
    page_text = requests.get(url_mobile, headers=headers_mobile).text
    soup = BeautifulSoup(page_text, 'lxml')
    # NOTE(review): the original local was named "images", but this selector
    # actually matches the hotel-title anchors.
    title_links = soup.select(' div.listing_title > a')
    for link in title_links:
        print(link.get_text())
def get_zuoye(url_zuoye, data=None):
    """Scrape one xiaozhu.com room listing and print its details.

    Prints a dict with the room title, address, nightly price, host photo
    URL, host gender icon and host name.
    """
    soup = BeautifulSoup(requests.get(url_zuoye).text, 'lxml')
    texts = soup.select('div.pho_info > h4 > em')
    adds = soup.select('span.pr5')
    moneys = soup.select('div.day_l > span')
    images = soup.select('div.member_pic > a > img')
    sexs = soup.select('div.member_ico')
    names = soup.select('a.lorder_name')
    for text, add, money, image, name, sex in zip(texts, adds, moneys, images, names, sexs):
        record = {
            'text': text.get_text(),
            'add': add.get_text(),
            'money': money.get_text(),
            'image': image.get('src'),
            # NOTE(review): reads the tag's `background` attribute — presumably
            # the gender-icon URL; confirm against the live page markup.
            'sex': sex.get('background'),
            'name': name.get_text(),
        }
        print(record)
#get_attractions(url)
#get_favs(url)
#print (urls)
# visit the listing pages at offsets 30-930 (disabled; one request per page)
#for single_url in urls:
# get_attractions(single_url)
get_zuoye(url_zuoye)  # only the homework scraper runs by default
异步数据爬取
动态数据,选取network,可以查看到多个网页,
# -*- coding: utf-8 -*-
'''
Scraping a page whose content is loaded asynchronously (infinite scroll).
'''
from bs4 import BeautifulSoup
import requests
import time
# base listing URL; the page number is appended by the caller
url = 'https://knewone.com/things?page='
def get_page(url, data=None):
    """Fetch one paginated listing and print {'img', 'title', 'link'} per item.

    `data` acts as a guard: pass anything non-None to skip the printing pass.
    """
    print(url)
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    imgs = soup.select('a.cover-inner > img ')
    # title and link come from the same anchors, read via different attributes
    titles = soup.select('section.content > h4 > a')
    links = soup.select('section.content > h4 > a')
    if data is None:  # idiom fix: compare to None with `is`, not `==`
        for img, title, link in zip(imgs, titles, links):
            data = {
                'img': img.get('src'),
                'title': title.get('title'),
                'link': link.get('href'),
            }
            print(data)
    #print(soup)
def get_more_page(start, end):
    """Scrape listing pages `start` .. `end`-1, pausing 1 s between requests."""
    for page_no in range(start, end):
        get_page(url + str(page_no))
        time.sleep(1)  # be polite to the server
get_more_page(1, 10)
网页源代码中间的浏览量为0,怀疑是js获得,然后就需要
刷新网页,查看network,然后看url的response值,看看那个值包含浏览量
关于保存图片,参考下面的视频,但是貌似里面也有土鳖的教学,自己体会
# -*- coding: utf-8 -*-
'''
Created on 2018-10-17
@author: zz
'''
from bs4 import BeautifulSoup
import requests, urllib.request
import time
def get_meimei(url, save_dir=r'C:\Users\zz\Desktop\test'):
    """Download every entry thumbnail found on *url* into *save_dir*.

    Generalized: the target folder is now a parameter (default preserved).
    BUGFIX: the original raw-string path ended in a backslash plus a space,
    so every saved file name started with a space.
    """
    import os  # local import: only needed here for path joining
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    imgs = soup.select('img.entry-thumbnail')
    download_links = [img.get('src') for img in imgs]
    for one in download_links:
        print(one)
        # name each file after the last 10 characters of its URL
        urllib.request.urlretrieve(one, os.path.join(save_dir, one[-10:] + '.jpg'))
# slicing reminder: index 0 counts from the left, -1 from the right
url = 'https://weheartit.com/inspirations/taylorswift?scrolling=true&page='
def get_meimei_20(s, e):
    """Download thumbnails from result pages s .. e-1, sleeping 1 s apiece."""
    for page_no in range(s, e):
        get_meimei(url + str(page_no))
        time.sleep(1)
get_meimei_20(1, 20)