爬虫基础（四）Beautiful Soup

最新推荐文章于 2021-08-13 12:01:23 发布

LCL-2019

最新推荐文章于 2021-08-13 12:01:23 发布

阅读量114

点赞数

分类专栏：爬虫技术

本文链接：https://blog.csdn.net/weixin_43056654/article/details/104205784

版权

爬虫技术专栏收录该内容

7 篇文章 0 订阅

订阅专栏

Beautiful Soup

# 选择方法
1、find   # 单选
2、find_all   # 多项选择  返回列表
 	- def find_all(self,name = None,attrs={},recursive = True,text = None,limit = None,**kwargs)

# css常用选择器
soup = BeautifulSoup(html_doc, 'lxml')
# 1、通过标签名查找
	print(soup.select('a'))
# 2、通过类名查找
	print(soup.select('.sister'))
# 3、通过id查找
	print(soup.select('#link1'))
# 4、组合查找（p标签中，id 等于link1的内容，二者需要用空格分开）
	print(soup.select('p #link1'))
# 5、直接子标签查找，使用 > 分隔
	print(soup.select('head > title'))
# 6、通过属性查找
	print(soup.select('a[href = "http://example.com/elsie"]'))


soup = BeautifulSoup(html_doc, 'lxml')
# 1、获取所有tr标签
	trs = soup.find_all('tr')
	for tr in trs:
		print(tr)
# 2、获取第2个tr 标签
	tr =  soup.find_all('tr',limit = 2)[1]
# 3、获取所有class等于even的tr标签
 	- trs = soup.find_all('tr',class_ = 'even')
 	- trs = soup.find_all('tr',attrs={'class':'even'})
# 4、将所有id 等于test,class 也等于test的a标签提取出来
 	- alist = soup.find_all('a',attrs = {'id':'test','class':'test'})
 	- alist = soup.find_all('a',id = 'test',class_ = 'test')
# 5、获取所有的a标签的href属性
	hrefs = soup.find_all('a')
	for a in hrefs:
		href = a['href']  # 方式1
		href = a.get('href')  # 方式2
		href =  a.attrs['href']  # 方式3

# 6、获取纯文本信息
	trs = soup.find_all('tr')[1:]
	for tr in trs:
		# 获取该行中所有的td标签
		tds = tr.find_all('td')
		title = tds[0]
		print(title.string)   # 方式1
	
		# 获取tr标签的所有文本
		infos = list(tr.strings)
		print(infos)

		# 去除空格取文本
		infos = list(tr.stripped_strings)
		print(infos)

import requests
from bs4 import BeautifulSoup

# Fetch one regional forecast page from weather.com.cn and print, for every
# data row of every table inside the "conMidtab" container, the first text of
# the city cell followed by the first text of the second-to-last cell.
url = 'http://www.weather.com.cn/textFC/db.shtml'
# URL template with a placeholder for the page name; not used below.
new_url = 'http://www.weather.com.cn/textFC/{}.shtml'
header ={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'}

# requests never times out by default, so an unresponsive server would hang
# the script forever; bound the wait explicitly.
response = requests.get(url, headers=header, timeout=10)
# Fail loudly on 4xx/5xx instead of trying to parse an error page.
response.raise_for_status()
response.encoding = 'utf-8'
html = response.text
soup = BeautifulSoup(html, 'lxml')

conMidtab = soup.find('div', attrs={'class': 'conMidtab'})
if conMidtab is None:
    # find() returns None when nothing matches; report that directly rather
    # than failing later with an AttributeError on None.
    raise RuntimeError("no <div class='conMidtab'> found in {}".format(url))
tables = conMidtab.find_all('table')

for table in tables:
    # The first two rows are skipped (presumably table header rows).
    trs = table.find_all('tr')[2:]
    for index, tr in enumerate(trs):
        tds = tr.find_all('td')
        if len(tds) < 2:
            # Both the city cell and tds[-2] need at least two cells.
            continue

        # In the first data row the city is read from the second cell
        # (presumably because the first cell holds a row-spanning region
        # name -- verify against the page markup); elsewhere it is the first.
        city_td = tds[1] if index == 0 else tds[0]
        # Second-to-last cell; presumably a temperature column.
        temp_td = tds[-2]

        # next(..., None) avoids an IndexError when a cell has no text.
        city = next(city_td.stripped_strings, None)
        temp = next(temp_td.stripped_strings, None)
        if city is None or temp is None:
            continue
        print(city,temp)