BeautifulSoup 功能

最新推荐文章于 2024-07-10 17:28:32 发布

weixin_30675247

最新推荐文章于 2024-07-10 17:28:32 发布

阅读量293

点赞数

文章标签： python

原文链接：http://www.cnblogs.com/kylechen/p/8557589.html

版权

#coding:utf-8
from urllib.request import urlopen
from bs4 import BeautifulSoup
from urllib.error import HTTPError,URLError

def getinfo(url):
    """Fetch a web page and extract a few items from it with BeautifulSoup.

    Args:
        url: Address of the page to download.

    Returns:
        A 3-tuple ``(title, nameList, all_theprince)`` where
        ``title`` is the page's first ``<h1>`` tag (``None`` if there is none),
        ``nameList`` is the list of ``<span class="green">`` tags, and
        ``all_theprince`` is the list of text nodes equal to "the prince".
        Returns ``None`` if the page cannot be retrieved or parsed.
    """
    try:
        # The context manager closes the HTTP response even if parsing fails.
        with urlopen(url) as html:
            bsobj = BeautifulSoup(html.read(), "lxml")
        title = bsobj.h1
        nameList = bsobj.find_all("span", {"class": "green"})
        all_theprince = bsobj.find_all(string="the prince")
    except (HTTPError, URLError, AttributeError):
        # HTTPError: the server answered with an error status.
        # URLError: the server could not be reached / the resource is missing.
        # AttributeError: an attribute was looked up on a missing object.
        return None
    return title, nameList, all_theprince
# Download the sample page and print what was scraped from it.
url_Info = getinfo("http://www.pythonscraping.com/pages/warandpeace.html")
if url_Info is None:
    # getinfo signals a failed download/parse by returning None; test for
    # that explicitly rather than relying on a catch-all except clause.
    print("URL could not be found")
else:
    title, nameList, all_theprince = url_Info
    print(title)
    for name in nameList:
        # get_text() drops the tag markup and leaves only the text content.
        print(name.get_text())
    print(len(all_theprince))