Beautiful Soup
# 选择方法
1、find # 返回第一个匹配的标签(单个元素,找不到时返回 None)
2、find_all # 返回所有匹配的标签(列表)
- def find_all(self,name = None,attrs={},recursive = True,text = None,limit = None,**kwargs)
# css常用选择器
soup = BeautifulSoup(html_doc, 'lxml')
# 1、通过标签名查找
print(soup.select('a'))
# 2、通过类名查找
print(soup.select('.sister'))
# 3、通过id查找
print(soup.select('#link1'))
# 4、组合查找(p标签中,id 等于link1的内容,二者需要用空格分开)
print(soup.select('p #link1'))
# 5、直接子标签查找,使用 > 分隔
print(soup.select('head > title'))
# 6、通过属性查找
print(soup.select('a[href = "http://example.com/elsie"]'))
soup = BeautifulSoup(html_doc, 'lxml')
# 1、获取所有tr标签
trs = soup.find_all('tr')
for tr in trs:
print(tr)
# 2、获取第2个tr 标签
tr = soup.find_all('tr',limit = 2)[1]
# 3、获取所有class等于even的tr标签
- trs = soup.find_all('tr',class_ = 'even')
- trs = soup.find_all('tr',attrs={'class':'even'})
# 4、将所有id 等于test,class 也等于test的a标签提取出来
- alist = soup.find_all('a',attrs = {'id':'test','class':'test'})
- alist = soup.find_all('a',id = 'test',class_ = 'test')
# 5、获取所有的a标签的href属性
hrefs = soup.find_all('a')
for a in hrefs:
href = a['href'] # 方式1
href = a.get('href') # 方式2
href = a.attrs['href'] # 方式3
# 6、获取纯文本信息
trs = soup.find_all('tr')[1:]
for tr in trs:
# 获取当前tr下的所有td标签
tds = tr.find_all('td')
title = tds[0]
print(title.string) # 方式1
# 获取tr标签的所有文本
infos = list(tr.strings)
print(infos)
# 去除空格取文本
infos = list(tr.stripped_strings)
print(infos)
import requests
from bs4 import BeautifulSoup
# Fetch one regional forecast page from weather.com.cn and print, for every
# data row of every table in the first "conMidtab" div, the text of the city
# cell followed by the text of the second-to-last cell of that row.
url = 'http://www.weather.com.cn/textFC/db.shtml'
# URL template with a "{}" placeholder; it is not used by the code below.
new_url = 'http://www.weather.com.cn/textFC/{}.shtml'
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'}

# A timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() prevents parsing an HTTP error page as if it were
# the forecast.
response = requests.get(url, headers=header, timeout=10)
response.raise_for_status()
# Force UTF-8 decoding of the body instead of relying on the guessed encoding.
response.encoding = 'utf-8'
html = response.text
soup = BeautifulSoup(html, 'lxml')

# find() returns None when no matching element exists; fail with a clear
# message instead of an AttributeError on the next line.
conMidtab = soup.find('div', attrs={'class': 'conMidtab'})
if conMidtab is None:
    raise RuntimeError("no <div class='conMidtab'> found in " + url)
tables = conMidtab.find_all('table')

for table in tables:
    # The first two rows of each table are skipped
    # (presumably header rows -- verify against the page markup).
    trs = table.find_all('tr')[2:]
    for index, tr in enumerate(trs):
        tds = tr.find_all('td')
        # Both tds[1] and tds[-2] need at least two cells; skip short rows
        # rather than crashing with IndexError.
        if len(tds) < 2:
            continue
        # In the first data row the city is read from the second cell
        # (presumably the first cell is a region/province cell spanning
        # the rows below -- TODO confirm); in later rows it is the first cell.
        city_td = tds[1] if index == 0 else tds[0]
        # Second-to-last cell (presumably a temperature column -- verify).
        temp_td = tds[-2]
        # Take the first non-blank text fragment of each cell; None means
        # the cell has no text at all.
        city = next(city_td.stripped_strings, None)
        temp = next(temp_td.stripped_strings, None)
        if city is None or temp is None:
            continue
        print(city, temp)