#-*- coding: UTF-8 -*-
import urllib2, BeautifulSoup
def getWebPage(url, usr=None, pwd=None):
    """Download ``url`` and return its markup pretty-printed by BeautifulSoup.

    When neither ``usr`` nor ``pwd`` is given, the page is requested with a
    plain ``urllib2.urlopen``. Otherwise the request goes through an opener
    that carries an HTTP Basic-auth handler loaded with the credentials.
    Both paths end in the same BeautifulSoup formatting step, so the result
    is formatted regardless of whether credentials were used.

    @param url: complete URL of the page to fetch
    @param usr: account name, only needed when the page requires a login
    @param pwd: password that goes with ``usr``
    @return: the page content as produced by ``BeautifulSoup.prettify()``

    No exception is caught here; anything raised while opening or reading
    the URL propagates to the caller.
    """
    if not usr and not pwd:
        response = urllib2.urlopen(url)
    else:
        # Register the credentials for this URL under realm ``None`` so the
        # caller does not have to know the realm name the server announces.
        pwdMgr = urllib2.HTTPPasswordMgrWithDefaultRealm()
        pwdMgr.add_password(None, url, usr, pwd)
        handler = urllib2.HTTPBasicAuthHandler(pwdMgr)
        response = urllib2.build_opener(handler).open(url)

    # Always release the connection, even if reading the body fails.
    try:
        page = response.read()
    finally:
        response.close()

    # Single formatting step shared by both branches: previously only the
    # authenticated branch assigned ``page``, so the plain branch never
    # produced formatted output.
    return BeautifulSoup.BeautifulSoup(page).prettify()
# Example usage: fetch the CSDN front page and print whatever getWebPage
# returns for it.
url = 'http://www.csdn.net/'
print(getWebPage(url))
[python]抓取网页的内容
最新推荐文章于 2021-08-07 06:47:22 发布