from bs4 import BeautifulSoup
from lxml import etree
# Sample HTML document used by all the BeautifulSoup demos below.
# FIX: in the original paste the two <a> tags were garbled — the
# class="sister" id="linkN" attributes had fallen outside the tag and
# into the text, so the attribute-based searches further down found
# nothing.  The attributes are restored inside the tags here.
doc = '''<html>
<body>
<title>
A story
</title>
<p class="title">
<b>
Story begin!
</b>
</p>
<p class="story">
Once upon a time there were three little sister.
<a href="http://example.com/elsie" class="sister" id="link1">Elsie
</a>
,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie
</a>
;
and they lived in a castle
</p>
</body>
</html>'''

# Parse with the lxml backend and show the nicely indented tree.
soup = BeautifulSoup(doc, "lxml")
print(soup.prettify())
查找title元素
# Demo: find() returns the FIRST matching element (a Tag object).
soup = BeautifulSoup(doc, "lxml")
title_tag = soup.find("title")
print(type(title_tag), title_tag)
查找所有a元素
# Demo: find every <a> element in the document.
# Uses find_all — the modern name, consistent with the later examples;
# findAll is a legacy BeautifulSoup 3 alias.
soup = BeautifulSoup(doc, "lxml")
for anchor in soup.find_all("a"):
    print(anchor)
查找含title的所有p元素
# Demo: find all <p> elements whose class attribute is "title".
# find_all replaces the legacy findAll alias, matching the rest of the file.
soup = BeautifulSoup(doc, "lxml")
tags = soup.find_all("p", attrs={"class": "title"})
print(tags)
查找class为sister的元素
# Demo: find elements of ANY tag type whose class is "sister".
# name=None is the default for find_all, so passing it explicitly was
# redundant; omitting it (and the legacy findAll spelling) is equivalent.
soup = BeautifulSoup(doc, "lxml")
tags = soup.find_all(attrs={"class": "sister"})
print(tags)
去掉tag元素的信息
# Demo: .text strips the markup, leaving only the element's text content.
soup = BeautifulSoup(doc, "lxml")
for anchor in soup.find_all("a"):
    print(anchor.text)
如果tag还含有子节点
# Demo: .text also works when the tag has child nodes — it concatenates
# the text of the whole subtree.
soup = BeautifulSoup(doc, "lxml")
for paragraph in soup.find_all("p"):
    print(paragraph.text)
利用select方法
写 CSS 时,标签名不加任何修饰
类名(class="className"引号内即为类名)前加点
id名(id="idName"引号前即为id名)前加 #
于是可以利用类似的方法来筛选元素,比如 soup.select(),返回类型是 list
print(soup.select('title'))
[<title>The Dormouse's story</title>]
爬取图片
# Scrape image URLs from a Pexels search-results page.
from bs4 import BeautifulSoup
import requests

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"}
user_path = "https://www.pexels.com/search/"
word = input("请输入你要下载的图片:")
url = user_path + word + '/'
wb_data = requests.get(url, headers=headers)
soup = BeautifulSoup(wb_data.text, 'lxml')
imgs = soup.select('article>a>img')
# Collect the src of every matched <img>.  The original stored these in
# a variable named `list`, shadowing the builtin; a comprehension into a
# properly named variable is both safer and shorter.
photo_urls = [img.get('src') for img in imgs]
保存图片
# Scrape image URLs from a Pexels search-results page and save each
# image to disk.
from bs4 import BeautifulSoup
from lxml import etree  # kept from original; not used below
import requests

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"}
user_path = "https://www.pexels.com/search/"
word = input("请输入你要下载的图片:")
url = user_path + word + '/'
wb_data = requests.get(url, headers=headers)
soup = BeautifulSoup(wb_data.text, 'lxml')
imgs = soup.select('article>a>img')

# Collect the src URLs (renamed from `list`, which shadowed the builtin).
photo_urls = []
for img in imgs:
    photo = img.get('src')
    photo_urls.append(photo)
    print(photo)

path = "C://Users/14760/Desktop/photo/"
for item in photo_urls:
    # Send the same User-Agent as the page request so the image fetch
    # isn't rejected as a bot.
    data = requests.get(item, headers=headers)
    # BUG FIX: the original used [-10] — a SINGLE character at index -10 —
    # as the file name, so every image overwrote one one-character file.
    # [-10:] keeps the last ten characters of the URL (query string
    # stripped) as intended.
    filename = item.split("?")[0][-10:]
    # Context manager guarantees the file handle is closed even if the
    # write raises.
    with open(path + filename, 'wb') as fp:
        fp.write(data.content)
w 以写的方式打开(会覆盖原有的文件)
r 以只读的方式打开
a 以追加的模式打开(在原文件的末尾追加要写入的数据,不覆盖原文件)
b 以二进制文件的方式打开
r+ w+ a+ 都是以读写的方式打开
rb 以二进制读的方式打开
wb 以二进制写的方式打开
ab 以二进制追加的模式打开
rb+ wb+ ab+ 以二进制读写的方式打开
保存图片