一、 BeautifulSoup搜索文档
1、 使用find_all方法获取指定节点元素的列表对象
taglist = bs.find_all('元素名称')
示例如下:
from bs4 import BeautifulSoup
import urllib.request
def getUrlHtml(strurl):
    """Fetch a page and print the text and link target of every <a> tag.

    Args:
        strurl: URL of the page to download.

    Returns:
        None. Results are written to standard output: first the type of the
        object returned by ``find_all``, then, for each anchor, its stripped
        text followed by its ``href`` value (when the anchor has one).

    Raises:
        urllib.error.URLError: if the page cannot be retrieved.
    """
    # Send a browser-like User-Agent with the request.
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
        )
    }
    request = urllib.request.Request(strurl, headers=headers)
    # The parser reads the response inside the constructor, so the
    # connection can be closed as soon as the soup has been built.
    with urllib.request.urlopen(request) as html:
        bs = BeautifulSoup(html, "html.parser")

    atags = bs.find_all("a")  # all <a> tags on the page
    print(type(atags))  # show what kind of object find_all returns
    for astr in atags:
        print(astr.text.strip())  # visible text of the anchor
        # Anchors without an href (e.g. named anchors) would raise KeyError
        # with astr['href'], so look the attribute up defensively.
        href = astr.get('href')
        if href is not None:
            print(href)
# Run the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
    getUrlHtml("http://bbs.tianya.cn/list-free-1.shtml")
2、 使用属性参数进行查找
Tags = bs.find_all("标签名称", attrs={'属性': '属性对应的值', '属性2': '值2', ...})
示例如下:
from bs4 import BeautifulSoup
import urllib.request
import time
def getUrlHtml(strurl):
    """Print each ``<tr class="bg">`` row of a list page and its post body.

    For every matching row the function prints the row text, resolves the
    first link in the row to an absolute URL, and, unless that URL contains
    the substring ``"do"``, downloads the linked page and prints the text of
    its ``<div class="bbs-content clearfix">`` element.

    Args:
        strurl: URL of the list page to download.

    Returns:
        None. All results are written to standard output.

    Raises:
        urllib.error.URLError: if the list page or a linked page cannot be
            retrieved.
    """
    # Send a browser-like User-Agent with every request.
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
        )
    }
    separator = "======================================================================"

    request = urllib.request.Request(strurl, headers=headers)
    # The parser reads the response inside the constructor, so the
    # connection can be closed as soon as the soup has been built.
    with urllib.request.urlopen(request) as html:
        bs = BeautifulSoup(html, "html.parser")

    # Select rows by tag name plus attribute value.
    atags = bs.find_all("tr", attrs={'class': 'bg'})
    for trtag in atags:
        print(trtag.text.strip())

        tr_a = trtag.find("a")
        titl_url = tr_a.get('href') if tr_a is not None else None
        if not titl_url:
            # Row without a usable link: nothing to follow.
            print(separator)
            continue

        # Site-relative links are prefixed with the forum host.
        if titl_url.startswith('http'):
            titlt_full_url = titl_url
        else:
            titlt_full_url = "http://bbs.tianya.cn" + titl_url
        print("标题链接地址:", titlt_full_url)

        # Links containing "do" are not fetched.
        # NOTE(review): presumably these are dynamic ".do" endpoints - confirm.
        titlfullurl_index = titlt_full_url.find("do")
        print(titlfullurl_index)
        if titlfullurl_index == -1:
            post_request = urllib.request.Request(titlt_full_url, headers=headers)
            with urllib.request.urlopen(post_request) as html1:
                # Name the parser explicitly so the result does not depend
                # on which optional parsers happen to be installed.
                post_bs = BeautifulSoup(html1, "html.parser")
            contenttage = post_bs.find("div", attrs={'class': 'bbs-content clearfix'})
            if contenttage is not None:
                print(contenttage.text.strip())
            else:
                # The page has no post-body element to print.
                print("不能抓取!")
            time.sleep(4)  # pause between successive page downloads
        else:
            print("不能抓取!")
        print(separator)
# Run the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
    getUrlHtml("http://bbs.tianya.cn/list-free-1.shtml")