简介
beautifulsoup和lxml都是解析器
bs4的优点:接口人性化,使用方便
bs4的缺点:效率没有lxml高
bs4基本语法(bs4解析本地文件)
html文件如下:
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Title</title>
</head>
<body>
<div>
<ul>
<li id = 'l1'>张三</li>
<li id = 'l2'>李四</li>
<li>王五</li>
<a href="" id = "a1" class = "a1">小王</a>
<span>嘿嘿嘿</span>
</ul>
</div>
<a href="" title="a2">小张</a>
<div id = 'd1'>
<span>
哈哈哈哈
</span>
</div>
<p id = 'p1' class = 'p1'>呵呵呵</p>
</body>
</html>
解析代码如下:
from bs4 import BeautifulSoup
# Parse a local HTML file.
# open() defaults to the platform encoding (gbk on Chinese Windows), so the
# encoding must be given explicitly for a UTF-8 file.
soup = BeautifulSoup(open('爬虫_解析_bs4.html',encoding='utf-8'),'lxml') # BeautifulSoup driven by the lxml parser
##################### Find nodes by tag name ###############################################
# Attribute access returns only the FIRST element with that tag name.
print(soup.a)
# .attrs returns the tag's attributes as a dict of name -> value.
print(soup.a.attrs)
######################## Common bs4 functions #################################################
# (1) find: returns the first match; can also filter by attribute values.
print(soup.find('a'))
print(soup.find('a', title = 'a2'))
print(soup.find('a', id = 'a1'))
# print(soup.find('a', class = 'a1'))  # "class" is a Python keyword, so it cannot be used as a keyword argument
print(soup.find('a', class_ = 'a1'))
# (2) find_all: returns a list of every match.
print(soup.find_all('a'))
print(soup.find_all(['a', 'span'])) # pass a list to match several tag names at once
print(soup.find_all('li'))
print(soup.find_all('li', limit=2)) # limit caps how many results are returned
# (3) select: CSS selectors; returns a list of every match.
print(soup.select('a'))
print(soup.select('.a1')) # "." is the class selector
print(soup.select('#l1')) # "#" is the id selector
print(soup.select('li[id]')) # attribute selector: li tags that HAVE an id attribute
print(soup.select('li[id = "l2"]')) # li tags whose id equals "l2"
# Hierarchy selectors
print(soup.select('div li')) # descendant selector: any li anywhere under a div
print(soup.select('div>li')) # child selector: direct children only, no grandchildren
print(soup.select('div>ul>li'))
print(soup.select('a,li')) # union: every a tag and every li tag
############################ Node information ######################################################
# (1) Node content.
obj = soup.select('#d1')[0]
# .string returns the text only when the tag contains nothing but text;
# if the tag contains child tags, .string returns None while .get_text() still works.
print(obj.string)
print(obj.get_text())
# (2) Node attributes.
obj = soup.select('#p1')[0]
print(obj.name) # the tag's name
print(obj.attrs) # all attributes as a dict
print(obj.attrs.get('class'))
print(obj.get('class'))
print(obj['class'])
bs4解析星巴克咖啡(bs4解析服务器响应文件)
网页url:https://www.starbucks.com.cn/menu/
需求:爬取星巴克所有咖啡的名称
获取bs4选择器路径的方法:可以先写出xpath路径,再将其转换为等价的CSS选择器供bs4的select使用
代码段:
from bs4 import BeautifulSoup
import urllib.request

# Goal: scrape the name of every coffee on the Starbucks China menu page.
url = 'https://www.starbucks.com.cn/menu/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Mobile Safari/537.36'
}
request = urllib.request.Request(url=url, headers = headers)
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')
soup = BeautifulSoup(content, 'lxml')
# xpath equivalent: //ul[@class="grid padded-3 product"]//strong/text()
# name_list = soup.select('.grid padded-3 product strong')  # won't work: the spaces inside the class value would be parsed as descendant selectors
name_list = soup.select('ul[class = "grid padded-3 product"] strong')
for name in name_list:
    print(name.get_text())
beautifulsoup解析greasyfork第一页数据
# greasyfork, page 1: print the title of every userscript listed for baidu.com.
from bs4 import BeautifulSoup
import urllib.request

url = 'https://greasyfork.org/en/scripts/by-site/baidu.com'
# Headers captured from browser devtools. The HTTP/2 pseudo-headers
# (':'-prefixed) and accept-encoding stay commented out: urllib rejects the
# former and cannot transparently decompress the latter.
headers = {
    # ':authority':' greasyfork.org',
    # ':method':' GET',
    # ':path':' /en/scripts/by-site/baidu.com',
    # ':scheme':' https',
    'accept':' text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    # 'accept-encoding':' gzip, deflate, br',
    'accept-language':' en,zh-CN;q=0.9,zh;q=0.8',
    'cache-control':' max-age=0',
    'cookie': 'locale_messaged=true; __gads=ID=c84c23cee74af959-227333407cd0009a:T=1644241468:RT=1644241468:S=ALNI_MafhGVZx3djQVgwV_Fp52xuwMcn7g; _gid=GA1.2.1325938447.1646020797; _gat_gtag_UA_48197018_1=1; _greasyfork_session=lVhZp0ukoE5h10ssVu7H72TKDRls2ih0JrTWy77HrMDo922TYfHskNjN4e2TYL01Nrq381%2BvxHoGKWzJSynIWNBEqmDciFwajs3P8G3QUDK6l9lqKyn0tOxq%2B0dDdrJ%2BjUKED6OFDo3Dc3ebI0bpUspEKHq5PomYDB9%2FuOVbPfUXGv9u36Y756X1qBFgTt4vSqGF%2FXfjPHM%2F8wYi1P3U4KZfiE%2FL3VvdvL2NOUxDJDsYZrn8eBrSKNpAcUbDK2PqMEc9A20KAznZPGN%2Fd1%2BIY8Oao%2FU%2BSzf3AWwYz7C7HosEsPcGFRrV2SF0jYYpNXGTqfAsVwd2%2BHXdluOfuY5YhOOrfY7ZOyLRvb0ePAj2jP%2BaTWw%3D--4Tk1XItq95lIS4t%2B--T78JZ9AmhZtQ8hXEpwRULA%3D%3D; _ga_7NMRNRYW7C=GS1.1.1646020797.7.1.1646020804.0; _ga=GA1.2.1849916989.1644240830',
    'dnt':' 1',
    'if-none-match':' W/"53c820b762aed724d66f3f524dfa96f5"',
    'referer': 'https://greasyfork.org/en',
    'sec-ch-ua':' " Not A;Brand";v="99", "Chromium";v="98", "Google Chrome";v="98"',
    'sec-ch-ua-mobile':' ?1',
    'sec-ch-ua-platform':' "Android"',
    'sec-fetch-dest':' document',
    'sec-fetch-mode':' navigate',
    'sec-fetch-site':' same-origin',
    'sec-fetch-user':' ?1',
    'upgrade-insecure-requests':' 1',
    'user-agent':' Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Mobile Safari/537.36',
}
request = urllib.request.Request(url = url, headers = headers)
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')
soup = BeautifulSoup(content, 'lxml')
# xpath equivalent: //ol[@id="browse-script-list"]//article//a[@class="script-link"]/text()
name_list = soup.select('ol[id = "browse-script-list"] article a[class="script-link"]')
for name in name_list:
    print(name.get_text())
beautifulsoup解析greasyfork前十页数据
import urllib.request
from bs4 import BeautifulSoup
def create_request(page):
    """Build the urllib Request for one greasyfork listing page.

    Page 1 is the bare listing URL; later pages append ``?page=N``:
        https://greasyfork.org/en/scripts/by-site/baidu.com
        https://greasyfork.org/en/scripts/by-site/baidu.com?page=2

    :param page: 1-based page number.
    :return: a ``urllib.request.Request`` ready to be opened.
    """
    url = 'https://greasyfork.org/en/scripts/by-site/baidu.com'
    # Only pages after the first carry an explicit page parameter.
    if page != 1:
        url = url + '?page=' + str(page)
    # Headers captured from browser devtools. The HTTP/2 pseudo-headers
    # (':'-prefixed) and accept-encoding stay commented out: urllib rejects
    # the former and cannot transparently decompress the latter.
    headers = {
        # ':authority':' greasyfork.org',
        # ':method':' GET',
        # ':path':' /en/scripts/by-site/baidu.com',
        # ':scheme':' https',
        'accept': ' text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        # 'accept-encoding':' gzip, deflate, br',
        'accept-language': ' en,zh-CN;q=0.9,zh;q=0.8',
        'cache-control': ' max-age=0',
        'cookie': 'locale_messaged=true; __gads=ID=c84c23cee74af959-227333407cd0009a:T=1644241468:RT=1644241468:S=ALNI_MafhGVZx3djQVgwV_Fp52xuwMcn7g; _gid=GA1.2.1325938447.1646020797; _gat_gtag_UA_48197018_1=1; _greasyfork_session=lVhZp0ukoE5h10ssVu7H72TKDRls2ih0JrTWy77HrMDo922TYfHskNjN4e2TYL01Nrq381%2BvxHoGKWzJSynIWNBEqmDciFwajs3P8G3QUDK6l9lqKyn0tOxq%2B0dDdrJ%2BjUKED6OFDo3Dc3ebI0bpUspEKHq5PomYDB9%2FuOVbPfUXGv9u36Y756X1qBFgTt4vSqGF%2FXfjPHM%2F8wYi1P3U4KZfiE%2FL3VvdvL2NOUxDJDsYZrn8eBrSKNpAcUbDK2PqMEc9A20KAznZPGN%2Fd1%2BIY8Oao%2FU%2BSzf3AWwYz7C7HosEsPcGFRrV2SF0jYYpNXGTqfAsVwd2%2BHXdluOfuY5YhOOrfY7ZOyLRvb0ePAj2jP%2BaTWw%3D--4Tk1XItq95lIS4t%2B--T78JZ9AmhZtQ8hXEpwRULA%3D%3D; _ga_7NMRNRYW7C=GS1.1.1646020797.7.1.1646020804.0; _ga=GA1.2.1849916989.1644240830',
        'dnt': ' 1',
        'if-none-match': ' W/"53c820b762aed724d66f3f524dfa96f5"',
        'referer': 'https://greasyfork.org/en',
        'sec-ch-ua': ' " Not A;Brand";v="99", "Chromium";v="98", "Google Chrome";v="98"',
        'sec-ch-ua-mobile': ' ?1',
        'sec-ch-ua-platform': ' "Android"',
        'sec-fetch-dest': ' document',
        'sec-fetch-mode': ' navigate',
        'sec-fetch-site': ' same-origin',
        'sec-fetch-user': ' ?1',
        'upgrade-insecure-requests': ' 1',
        'user-agent': ' Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Mobile Safari/537.36',
    }
    return urllib.request.Request(url=url, headers=headers)
def get_content(request):
    """Open *request* and return the response body decoded as UTF-8.

    :param request: a prepared ``urllib.request.Request``.
    :return: the page source as a ``str``.
    """
    # Context manager closes the underlying socket even if read/decode fails
    # (the original leaked the response object).
    with urllib.request.urlopen(request) as response:
        return response.read().decode('utf-8')
def get_name(content):
    """Extract every userscript-title link tag from a listing page.

    :param content: HTML source of a greasyfork listing page.
    :return: list of ``<a class="script-link">`` tag objects.
    """
    # xpath equivalent: //ol[@id="browse-script-list"]//article//a[@class="script-link"]/text()
    page = BeautifulSoup(content, 'lxml')
    return page.select('ol[id = "browse-script-list"] article a[class="script-link"]')
def write_file(name_list, filename):
    """Append the text of each tag in *name_list* to *filename*, one per line.

    Bug fix: the original body referenced the module-level global ``file_name``
    instead of the ``filename`` parameter, silently ignoring the argument.
    The file is now also opened once instead of once per name.
    """
    with open(filename, 'a+', encoding='utf-8') as fp:
        for name in name_list:
            fp.write(name.get_text() + '\n')
if __name__ == '__main__':
    # Page range is inclusive on both ends and supplied interactively.
    start_page = int(input('请输入起始页码:'))
    end_page = int(input('请输入结束页码:'))
    # Output file; assigned before the first write_file call.
    file_name = '爬虫_解析_bs4_greasy_pages.txt'
    for page in range(start_page, end_page + 1):
        req = create_request(page)        # (1) build the per-page request
        html = get_content(req)           # (2) download the page source
        names = get_name(html)            # (3) parse out the script links
        write_file(names, file_name)      # (4) append the names to the file