1. XPath: extracting data from XML/HTML files
Browser: install an XPath plugin (e.g. XPath Helper)
PyCharm: install the lxml package (pip install lxml)
1) Parsing a local file
# import the package
from lxml import etree
# parse the file (etree.parse expects well-formed XML by default;
# passing an HTMLParser lets it handle real-world HTML)
tree = etree.parse("data/ip.html", etree.HTMLParser())
# extract data: the text of every <li> node
li_list = tree.xpath('//li/text()')
Basic XPath syntax (a quick demo follows the list):
1. Path queries
// : select all descendant nodes, regardless of nesting level
/  : select direct child nodes
2. Predicate queries
//div[@id]
//div[@id="maincontent"]
3. Attribute queries
//@class
4. Fuzzy queries
//div[contains(@id, "he")]
//div[starts-with(@id, "he")]
5. Content queries
//div/h1/text()
6. Logical operators
//div[@id="head" and @class="s_down"]
//title | //price
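A quick demo tying these patterns together, using a made-up inline HTML snippet (the ids and classes below are invented for illustration):
from lxml import etree
html = """
<html><body>
<div id="head" class="s_down"><h1>Title</h1></div>
<div id="maincontent">
    <ul>
        <li class="item">one</li>
        <li>two</li>
    </ul>
</div>
</body></html>
"""
tree = etree.HTML(html)
print(tree.xpath('//li/text()'))                     # ['one', 'two'] -- all descendants
print(tree.xpath('//div[@id="maincontent"]/ul/li'))  # predicate + direct children
print(tree.xpath('//li/@class'))                     # ['item'] -- attribute values
print(tree.xpath('//div[starts-with(@id, "main")]/ul/li/text()'))    # fuzzy query
print(tree.xpath('//div[@id="head" and @class="s_down"]/h1/text()')) # ['Title']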
2) Parsing a fetched web page
# fetch the page content
response = urllib.request.urlopen(request)
content = response.read().decode("utf-8")
# parse the page
tree = etree.HTML(content)
# extract data
data_list = tree.xpath('//input[@id="su"]/@value')
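The selector above reads the value attribute of the input with id="su" (Baidu's search button). A minimal end-to-end sketch, assuming https://www.baidu.com as the target:
import urllib.request
from lxml import etree
url = "https://www.baidu.com"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
}
request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request)
content = response.read().decode("utf-8")
tree = etree.HTML(content)
print(tree.xpath('//input[@id="su"]/@value'))  # e.g. ['百度一下']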
Example: scraping image data
import os
import urllib.request
from lxml import etree
from tqdm import tqdm
"""
First-page URL:
https://sc.chinaz.com/tupian/qinglvtupian.html
Second-page URL:
https://sc.chinaz.com/tupian/qinglvtupian_2.html
"""
def create_request(page):
    # the first page has no "_<page>" suffix in its URL
    if page == 1:
        url = "https://sc.chinaz.com/tupian/qinglvtupian.html"
    else:
        url = "https://sc.chinaz.com/tupian/qinglvtupian_" + str(page) + ".html"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
    }
    request = urllib.request.Request(url=url, headers=headers)
    return request
def get_content(request):
    response = urllib.request.urlopen(request)
    content = response.read().decode("utf-8")
    return content
def down_load(content):
    tree = etree.HTML(content)
    # the images are lazy-loaded, so the real URL sits in @data-original rather than @src
    link_list = tree.xpath('//div[@class="tupian-list com-img-txt-list"]//img/@data-original')
    name_list = tree.xpath('//div[@class="tupian-list com-img-txt-list"]//img/@alt')
    os.makedirs("./photo", exist_ok=True)  # make sure the output directory exists
    for i in tqdm(range(len(link_list)), desc="downloading images"):
        link = "https:" + link_list[i]  # the scraped URLs are protocol-relative
        name = name_list[i]
        urllib.request.urlretrieve(link, filename="./photo/" + name + ".jpg")
if __name__ == "__main__":
    start_page = int(input("Start page: "))
    end_page = int(input("End page: "))
    for page in range(start_page, end_page + 1):
        # build the request object
        request = create_request(page)
        # fetch the page
        content = get_content(request)
        # download the images
        down_load(content)
2. JsonPath: extracting data from JSON files
Usage:
# using jsonpath (pip install jsonpath):
import json
import jsonpath
obj = json.load(open('file.json', 'r', encoding='utf-8'))
ret = jsonpath.jsonpath(obj, '<jsonpath expression>')
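A self-contained sketch of common JsonPath expressions, using a made-up dict so nothing is read from disk:
import jsonpath
obj = {
    "store": {
        "book": [
            {"title": "A", "price": 10},
            {"title": "B", "price": 20}
        ]
    }
}
print(jsonpath.jsonpath(obj, '$..title'))               # ['A', 'B'] -- $.. recurses
print(jsonpath.jsonpath(obj, '$.store.book[0].title'))  # ['A'] -- index access
print(jsonpath.jsonpath(obj, '$.store.book[*].price'))  # [10, 20] -- wildcard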
Example: reading city data from Taopiaopiao (淘票票)
import urllib.request
import json
import jsonpath
url = "https://dianying.taobao.com/cityAction.json?activityId&_ksTS=1665842537566_108&jsoncallback=jsonp109&action=cityAction&n_s=new&event_submit_doGetAllRegion=true"
headers = {
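    # NOTE: the ':'-prefixed entries below are HTTP/2 pseudo-headers copied from
    # DevTools; urllib cannot send them, so they must stay commented out.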
# ':authority': 'dianying.taobao.com',
# ':method': 'GET',
# ':path': '/cityAction.json?activityId&_ksTS=1665842537566_108&jsoncallback=jsonp109&action=cityAction&n_s=new&event_submit_doGetAllRegion=true',
# ':scheme': 'https',
'accept': 'text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01',
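    # NOTE: keep 'accept-encoding' commented out -- if the server responds with
    # gzip, the decode("utf-8") below would fail on the compressed bytes.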
# 'accept-encoding': 'gzip, deflate, br',
'accept-language': 'zh-CN,zh;q=0.9',
'bx-v': '2.2.3',
'cookie': 't=01e264dc462c7ec31fa81c964480ef71; cna=PQQNG2ID00wCAcom+Kx+6VFW; sgcookie=E100NV67fuPefRFxmJ0rvIhweAZoab6bysNUqOllPpeLt7x9bpzqS%2BymW2%2Bx44dTkXaSrvn9kbzwh6Zx%2BIsDmj8dYMBa5bqwE2Skvf1Cy6Xrvyysien1uEZxct1eOkjuavSS; tracknick=%5Cu674E%5Cu4E8C%5Cu5E06%5Cu5475%5Cu5475%5Cu5475; _cc_=VFC%2FuZ9ajQ%3D%3D; cookie2=1dfb433f9bd17bd74b43a203c5d6e815; v=0; _tb_token_=577eb3bb87f08; xlly_s=1; tb_city=110100; tb_cityName="sbG+qQ=="; tfstk=cfK1BVYee5V61Om0j1ME_WX9wgscZBw5hV1MCX7g6-W3mBJ1iirPN2i9oSIVH91..; l=eBQNq69gL8OhvXN6BO5Cnurza7792QRb4sPzaNbMiInca6iATFaNYNCU_UxJ7dtjgtCAuetzv3EoxdLHR3AgCc0c07kqm0SZUxvO.; isg=BGVlVF3rai5ub480JRB2COKOdCGfohk0ct6phGdIkhyrfoXwL_CvBHoQCOII_jHs',
'referer': 'https://dianying.taobao.com/',
'sec-ch-ua': '"Chromium";v="106", "Google Chrome";v="106", "Not;A=Brand";v="99"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'sec-fetch-dest': 'empty',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'same-origin',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
'x-requested-with': 'XMLHttpRequest'
}
# build the request object
req = urllib.request.Request(url=url, headers=headers)
# fetch the data
response = urllib.request.urlopen(req)
content = response.read().decode("utf-8")
# strip the JSONP wrapper, e.g. jsonp109({...}) -> {...}
content = content.split("(")[1].split(")")[0]
# extract data
json_data = json.loads(content)
data_list = jsonpath.jsonpath(json_data, "$..regionName")
for i in data_list:
print(i)
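One caveat: jsonpath.jsonpath returns False rather than an empty list when nothing matches (behavior of the jsonpath package), so it is safer to guard before iterating:
data_list = jsonpath.jsonpath(json_data, "$..regionName")
if data_list:  # False when there is no match
    for name in data_list:
        print(name)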
3. BeautifulSoup: parsing HTML/XML
Basic usage
# 1. import the package
from bs4 import BeautifulSoup
# 2. create the object
# from a server response
soup = BeautifulSoup(response.read().decode(), 'lxml')
# from a local file
soup = BeautifulSoup(open('1.html', encoding='utf-8'), 'lxml')
# NOTE: open() defaults to the system encoding (gbk on Chinese Windows),
# so pass the encoding explicitly
# 3. locating nodes: 3 methods
# find: returns the first matching object; attributes can narrow the search
soup.find("img", id="...", class_="...")  # fill in the attribute values
# find_all: returns every match; limit caps the number of results
soup.find_all(["img", "a"], limit=2)
# select: returns every match for a CSS selector
soup.select('selector')
# 1. element selector
#    e.g. p
# 2. .class selector
#    e.g. .firstname
# 3. #id selector
#    e.g. #firstname
# 4. attribute selectors
#    [attribute]
#    e.g. li = soup.select('li[class]')
#    [attribute=value]
#    e.g. li = soup.select('li[class="hengheng1"]')
# 5. combinators
#    element element   -> descendants
#    e.g. div p
#    element > element -> direct children
#    e.g. div > p
#    element,element   -> matches either selector
#    e.g. tags = soup.select('a,span')
# 4. reading node content
# (1) node text (get_text also works when the tag nests other tags;
#     .string returns None in that case)
#     obj.string
#     obj.get_text()  [recommended]
# (2) node properties
#     tag.name returns the tag name
#     e.g. tag = soup.find('li')
#          print(tag.name)
#     tag.attrs returns the attributes as a dict
# (3) reading a single attribute
#     obj.attrs.get('title')  [most common]
#     obj.get('title')
#     obj['title']
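A self-contained sketch of the calls above, run against a made-up HTML snippet (the ids, classes, and title attribute are invented for illustration):
from bs4 import BeautifulSoup
html = """
<ul>
    <li class="hengheng1" title="first"><span>one</span></li>
    <li id="second">two</li>
</ul>
"""
soup = BeautifulSoup(html, 'lxml')
print(soup.find('li').get_text())            # one -- get_text reads nested tags too
print(soup.find_all('li', limit=2))          # first two <li> tags
print(soup.select('li[class="hengheng1"]'))  # attribute selector
print(soup.select('ul > li'))                # direct children
tag = soup.find('li')
print(tag.name, tag.attrs)                   # li {'class': ['hengheng1'], 'title': 'first'}
print(tag.get('title'))                      # first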
Example: scraping the Starbucks product list
from bs4 import BeautifulSoup
import urllib.request
url = "https://www.starbucks.com.cn/menu/"
response = urllib.request.urlopen(url)
content = response.read().decode("utf-8")
soup = BeautifulSoup(content, 'lxml')
# XPath equivalent: //ul[@class="grid padded-3 product"]//strong
name_list = soup.select("ul[class='grid padded-3 product'] strong")
for e in name_list:
print(e.get_text())
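Note on the selector: [class='grid padded-3 product'] matches the class attribute string exactly, so it breaks if the site reorders the classes; assuming the class names stay as scraped above, the dot form matches the same elements regardless of order:
name_list = soup.select("ul.grid.padded-3.product strong")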