1. BeautifulSoup对象:通过bsObj获取标题
from urllib.request import urlopen
from urllib.error import HTTPError ,URLError
from bs4 import BeautifulSoup
def getTitle(url):
    """Download ``url`` and return ``bsObj.body.h1`` from the parsed page.

    Args:
        url: Address of the page to fetch with ``urlopen``.

    Returns:
        The value of ``bsObj.body.h1``, or ``None`` when ``urlopen`` raises
        ``HTTPError``/``URLError`` or when the attribute lookup raises
        ``AttributeError``.
    """
    try:
        html = urlopen(url)
    except (HTTPError, URLError):
        # The page could not be opened at all; report "no title".
        return None
    try:
        # Close the response as soon as the body has been read, and name the
        # parser explicitly so the result does not depend on which optional
        # parsers happen to be installed.
        with html:
            bsObj = BeautifulSoup(html.read(), "html.parser")
        title = bsObj.body.h1
    except AttributeError:
        # Raised when an intermediate lookup (e.g. ``bsObj.body``) is None.
        return None
    return title
# Demo: fetch a sample page and report its <h1> heading.
title = getTitle("http://www.pythonscraping.com/pages/page1.html")
if title is None:
    # getTitle returned None; this prints True to flag the missing title.
    print(title is None)
else:
    print(title)
2.获取标签内容
find,findAll,bsobj.span.text
# Create a BeautifulSoup object from the downloaded page.
# The response is closed once its body has been read, and the parser is
# named explicitly so the parse does not depend on the installed parsers.
with urlopen("http://www.pythonscraping.com/pages/warandpeace.html") as html:
    bsobj = BeautifulSoup(html.read(), "html.parser")
# find_all(tagName, tagAttributes) collects the matching tags on the page.
namelist = bsobj.find_all("span", {"class": "green"})
for name in namelist:
    # get_text() returns the text content of the tag.
    print(name.get_text())
# Difference between find and find_all:
#   find_all(tag, attributes, recursive, text, limit, keywords)
#   find(tag, attributes, recursive, text, keywords)
# The first two arguments, tag and attributes, are the ones used most often.
# The tag argument may be a single tag name or a list of tag names.

# tag
testlist = bsobj.find_all(["h1", "h2", "h3"])
for head in testlist:
    print(head)

# attributes: a dict describing attributes of the tag to match.
testlist = bsobj.find_all("span", {"class": ["red", "green"]})
for cell in testlist:
    print(cell.text)

# recursive: a bool. True searches child tags at every depth, False only looks
# at first-level tags; find_all defaults to True.
# text: matches on the text content of tags.
# NOTE(review): newer bs4 releases appear to prefer the name `string` for this
# argument; `text` is kept for compatibility -- confirm against the installed
# version.
textlists = bsobj.find_all(text="the prince")
print(len(textlists))

# find is equivalent to find_all with limit=1; limit=x keeps only the first x
# matches found on the page.
# keyword: select tags that carry a specific attribute.
keystext = bsobj.find_all(id="text")
print(keystext[0].text)
# Keyword arguments are somewhat redundant:
#   find_all(id="text") is the same as find_all("", {"id": "text"}).
# They are awkward when searching by class, because `class` is a reserved
# word in Python and cannot be used as a variable or argument name. Write
#   find_all(class_="green")
# or
#   find_all("", {"class": "green"})
# instead.

# Read the text of the first <span> tag directly.
print(bsobj.span.text)
3.beautifulsoup的其他对象
3.1NavigableString对象
用来表示标签的文字,不是标签
3.2 Comment对象
用来查找html中的注释
eg:<!-- 像这样 -->