import re
import requests
from bs4 import BeautifulSoup
from lxml import html
etree = html.etree
import json
import time
# Request headers: send a desktop-browser User-Agent string with the request.
heads = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36",
}
# Page whose HTML is downloaded below (plain string; no interpolation needed).
baseurl = 'https://www.baidu.com/'
# Download the page and keep only the decoded body text. Without a timeout,
# requests waits indefinitely on an unresponsive server, so cap the wait.
response = requests.get(baseurl, headers=heads, timeout=10).text
# print(response)
#使用xpath
# tree = etree.HTML(response)
# print(tree)
# r_list = tree.xpath('//*[@id="s-top-left"]/a[1]/text()')
# r_list = tree.xpath('/html//title') # // 表示多个层级
# r_list = tree.xpath('//title') # 所有title后末的内容
# r_list = tree.xpath('//div[@class="nav"]') # 定位 <div class="nav"> 后面的内容
# r_list = tree.xpath('//span[@id="nav_a"]/a[2]') # the second <a> under <span id="nav_a"> (XPath indices start at 1)
# r_list = tree.xpath('/html/body/div/li/ul/li[3]/a/text()')[0] # 取 <li><a href="#" class="shezhi">关闭预测</a></li>
# r_list = tree.xpath('/html/body/div/li/ul//text()') # //text()所有的文本内容
# r_list = tree.xpath('/html//@href[1]') # 取属性 href="css/base.css"/>
# print(r_list)
# 使用bs4
# soup = BeautifulSoup(response,'lxml') # 第一步:实体化BS4 对象
# t_list = soup.select('title') # 通过标签来查找
# print(t_list)
# print(soup.img) # 查找img标签内容
# print(soup.find('div',class_='form')) # 查找div下面的class_='form'内的内容 查找单个
# print(soup.find_all('div',class_='form')) # 查找全部符合的条件
# soup.select('.index') # 选择器select(‘某种选择器(id,class....)’) 返回的是个列表
# css选择器
# t_list = soup.select('title') # 通过标签来查找
# t_list = soup.select(".nav-a") # select by class name (a class selector needs a leading dot)
# t_list =soup.select("#nav-a") # 通过id来查找
# t_list = soup.select("a[class='nav-a']") # 通过属性来查找
# t_list = soup.select("head > link") # 通过子标签来查找 可一直>...
# t_list = soup.select("head > link a") # 获取多个a标签的属性
# t_list = soup.select_one("head > link a").text # get the text content (select() returns a list, which has no .text); .string gets only the tag's own direct text
# About using XPath and bs4 (personal notes)
# Latest recommended article published on 2024-04-17 18:30:51