关于xpath和bs4的使用（自用）

最新推荐文章于 2024-04-17 18:30:51 发布

百无￥禁忌

最新推荐文章于 2024-04-17 18:30:51 发布

阅读量122

点赞数

分类专栏： python 文章标签： html python 前端

本文链接：https://blog.csdn.net/qq_52025594/article/details/130085486

版权

python 专栏收录该内容

5 篇文章 0 订阅

订阅专栏


import re
import requests
from bs4 import BeautifulSoup
from lxml import html
etree = html.etree
import json
import time

# Request headers sent with the fetch: a desktop-browser User-Agent string.
heads = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36",
}
# Page to download. A plain literal: there are no placeholders, so no f-prefix.
baseurl = 'https://www.baidu.com/'

# Download the page. Note that `response` holds the decoded body text (a str),
# not the requests.Response object. The explicit timeout stops the script from
# blocking forever if the server never answers; requests has no default timeout.
response = requests.get(baseurl, headers=heads, timeout=10).text
# print(response)

#使用xpath
# tree = etree.HTML(response)
# print(tree)
# r_list = tree.xpath('//*[@id="s-top-left"]/a[1]/text()')
# r_list = tree.xpath('/html//title')  # // 表示多个层级
# r_list = tree.xpath('//title')     # every <title> element anywhere in the document
# r_list = tree.xpath('//div[@class="nav"]')   # 定位 <div class="nav"> 后面的内容
# r_list = tree.xpath('//span[@id="nav_a"]/a[2]')  # the second <a> child of <span id="nav_a"> (XPath positions start at 1)
# r_list = tree.xpath('/html/body/div/li/ul/li[3]/a/text()')[0]  # 取 <li><a href="#" class="shezhi">关闭预测</a></li>
# r_list = tree.xpath('/html/body/div/li/ul//text()')  # //text()所有的文本内容
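# r_list = tree.xpath('(/html//@href)[1]')  # first href attribute in the document, e.g. href="css/base.css"; without the parentheses, [1] applies per element and every href is returned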
# print(r_list)

# 使用bs4
# soup = BeautifulSoup(response,'lxml')   # 第一步：实体化BS4 对象
# t_list = soup.select('title') # 通过标签来查找
# print(t_list)
# print(soup.img)   # 查找img标签内容
# print(soup.find('div',class_='form'))   # 查找div下面的class_='form'内的内容  查找单个
# print(soup.find_all('div',class_='form')) # 查找全部符合的条件
# soup.select('.index') # 选择器select(‘某种选择器（id,class....）’) 返回的是个列表
# css选择器
# t_list = soup.select('title') # 通过标签来查找
# t_list = soup.select(".nav-a") # select by class name (the leading dot is required; a bare "nav-a" matches a tag named nav-a)
# t_list =soup.select("#nav-a") # 通过id来查找
# t_list = soup.select("a[class='nav-a']") # 通过属性来查找
# t_list = soup.select("head > link") # 通过子标签来查找 可一直>...
# t_list = soup.select("head > link a") # 获取多个a标签的属性
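# t_list = [a.text for a in soup.select("head > link a")]  # text of each match (select() returns a list, which has no .text); .string yields a tag's own text only when it has a single child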