day5-xpath和多线程

xpath和xml数据格式

# 绝对路径:html/body/div/a
# 相对路径:./a(../div/a)

数据格式

from lxml import etree

# 1. 专业术语
"""
树:整个html或xml结构
节点:html中的每个标签,xml中标签就是节点
根节点:树的第一个节点,html的根节点就是html标签
属性:节点属性(html中就是标签属性)
"""
# 2.xml数据格式
# json数据和xml数据是两种通用的数据格式,用于不同语言之间进行数据交流
"""
将一个超市的商品数据进行传输:
json:
{
    "name": "永辉超市",
    "address": "肖家河大厦",
    "goods": [
        {"name": "泡面", "price": 3.5, "count": 20},
        {"name": "矿泉水", "price": 2, "count": 50},
        {"name": "面包", "price": 5, "count": 15}
    ]
}

xml:
<supermarket>
    <name>永辉超市</name>
    <address>肖家河大厦</address>
    <goodsList>
        <goods name="泡面" price="3.5" count="20"></goods>
        <goods name="矿泉水" price="2" count="50"></goods>
        <goods name="面包" price="5" count="15"></goods>
    </goodsList>
</supermarket>
"""

# 1) Prepare the sample XML document.
# Note that besides the three <goods> inside <goodsList>, there is one more
# <goods> sitting directly under <supermarket>; the later examples rely on it
# to show the difference between "./goods" and "//goods".
xml_str = """
<supermarket>
    <name>永辉超市</name>
    <address>肖家河大厦</address>
    <goodsList>
        <goods name="泡面" price="3.5" count="20"></goods>
        <goods name="矿泉水" price="2" count="50"></goods>
        <goods name="面包" price="5" count="15"></goods>
    </goodsList>
    <worker_list>
        <cashier name="张三" pay="4000"></cashier>
        <shoppingGuide name="李四" pay="3500"></shoppingGuide>
    </worker_list>
    <goods price="50" count="15">
         <name>烟</name>
    </goods>
</supermarket>
"""

# 2) Parse the string into a tree; etree.XML returns the root node
#    (here the <supermarket> element).
supermarket = etree.XML(xml_str)
print(supermarket)

# 3) Selecting elements (nodes)
# node.xpath(path)    -     evaluate the path and return a list of the matching node objects
# a. Absolute path: no matter which node xpath() is called on, the path is
#    written starting from the root node of the tree.
# Syntax: /absolute/path
cashier = supermarket.xpath('/supermarket/worker_list/cashier')
print(cashier)

worker_list = supermarket.xpath('/supermarket/worker_list')[0]
print(worker_list)  # <Element worker_list at 0x105da5b40>

# Still resolved from the document root, which is <supermarket>, not
# <worker_list> - so nothing matches even though we call it on worker_list.
result = worker_list.xpath('/worker_list/cashier')
print(result)  # []

# b. Relative path:
# "."  is the current node, i.e. whichever node xpath() is called on
# ".." is the parent of the current node
# Note: the leading "./" may be omitted
cashier = supermarket.xpath('./worker_list/cashier')
print(cashier)  # [<Element cashier at 0x106769c00>]

cashier = worker_list.xpath('./cashier')
print(cashier)  # [<Element cashier at 0x106769c00>]

# Same two lookups with the "./" prefix left out
cashier = supermarket.xpath('worker_list/cashier')
print(cashier)  # [<Element cashier at 0x1047e5c80>]

cashier = worker_list.xpath('cashier')
print(cashier)  # [<Element cashier at 0x1047e5c80>]

# c. //path   -      search the whole document, starting from any position
# The result does not depend on which node xpath() is called on
result = supermarket.xpath('//cashier')
print(result)  # [<Element cashier at 0x10616ecc0>]

# Matches all four <goods>: the three in <goodsList> plus the top-level one
result = supermarket.xpath('//goods')
print(
    result)  # [<Element goods at 0x109011e40>, <Element goods at 0x109011e80>, <Element goods at 0x109011ec0>, <Element goods at 0x109011f00>]

# Only the <goods> that are direct children of a <goodsList>
result = supermarket.xpath('//goodsList/goods')
print(result)

# 4) Getting a node's text content
# Syntax: path/to/node/text()
# Only the <name> that is a direct child of <supermarket>
name = supermarket.xpath('./name/text()')
print(name)

# Every <name> in the document, including the one nested in the top-level <goods>
names = supermarket.xpath('//name/text()')
print(names)

# 5) Getting a node's attribute value
# Syntax: path/to/node/@attribute_name
# Only the <goods> directly under <supermarket>; values come back as strings
result = supermarket.xpath('./goods/@price')
print(result)

# The price attribute of every <goods> anywhere in the document
result = supermarket.xpath('//goods/@price')
print(result)

解析html

from lxml import etree

# Load the sample page and parse it into an element tree. The context manager
# closes the file handle as soon as its text has been read.
with open('test.html', encoding='utf-8') as html_file:
    html = etree.HTML(html_file.read())

# Absolute path, written from the document root
h1 = html.xpath('/html/body/h1')
print(h1)

# Relative path, starting at the node xpath() is called on
h1 = html.xpath('./body/h1')
print(h1)

# Global search: every <h1> anywhere in the document
h1 = html.xpath('//h1')
print(h1)

# 1. Predicates (conditions)
# Syntax: path/to/tag[predicate]
# 1) [N]  - the N-th tag (1-based) among the matching siblings
p = html.xpath('./body/p[1]/text()')
print(p)

result = html.xpath('./body/ul/li[2]/p/text()')
print(result)

# 2)
# [last()]    -  the last matching sibling
# [last()-N]  -  the (N+1)-th matching sibling counted from the end
# Second-to-last <p> inside every <li>
result = html.xpath('./body/ul/li/p[last()-1]/text()')
print(result)

# Last <p> of the second-to-last <li>
result = html.xpath('./body/ul/li[last()-1]/p[last()]/text()')
print(result)

# 3) Position ranges
# [position()>N]
# [position()<N]
# [position()>=N]
# [position()<=N]

# The first two <li>
result = html.xpath('./body/ul/li[position()<=2]/p/text()')
print(result)

# Every <li> after the second one
result = html.xpath('./body/ul/li[position()>2]/p/text()')
print(result)


# 4) [@attr]     -    tags that have the given attribute
# p[@class]  - <p> tags that carry a class attribute
result = html.xpath('./body/div/p[@class]/text()')
print(result)

# [@attr=value]  -  tags whose attribute equals the given value
# (the comparison is against the whole attribute string)
result = html.xpath('./body/div/p[@class="c1"]/text()')
print(result)

result = html.xpath('//p[@class="c1"]/text()')
print(result)

result = html.xpath('./body/div/p[@id="p1"]/text()')
print(result)


# 5)
# [child >/</>=/<=/= value]   -   filter tags by the content of a given child tag

# <li> whose second <p> holds a number greater than 4
result = html.xpath('./body/ul/li[p[2]>4]/p/text()')
print(result)

# First <p> of each <li> whose third <p> holds a number greater than 30
result = html.xpath('./body/ul/li[p[3]>30]/p[1]/text()')
print(result)


# <li> whose first <p> has exactly the text "面包"
result = html.xpath('./body/ul/li[p[1]="面包"]/p/text()')
print(result)

# 2. Wildcard: *
# 1) In place of a tag name it matches any tag
# Every child element of the <div> whose id is "div1"
result = html.xpath('./body/div[@id="div1"]/*')
print(result)

# ...restricted to the children that have a class attribute
result = html.xpath('./body/div[@id="div1"]/*[@class]')
print(result)

# Any tag anywhere whose class attribute equals "c1"
result = html.xpath('//*[@class="c1"]')
print(result)

# 2) In place of an attribute name it matches any attribute
# <p> tags in the last <div> that have at least one attribute
result = html.xpath('./body/div[last()]/p[@*]/text()')
print(result)

# <p> tags in the last <div> with some attribute whose value is "p"
result = html.xpath('./body/div[last()]/p[@*="p"]/text()')
print(result)

# The values of all attributes of every <img>
result = html.xpath('//img/@*')
print(result)

# 3. Union of several paths - |
# Note: each side of a | must be a complete, independent path
result = html.xpath('./body/ul/li/p[1]/text()|./body/ul/li/p[3]/text()')
print(result)

解析html

from lxml import etree

# Load the sample page and parse it into an element tree. The context manager
# closes the file handle as soon as its text has been read.
with open('test.html', encoding='utf-8') as html_file:
    html = etree.HTML(html_file.read())

# Absolute path, written from the document root
h1 = html.xpath('/html/body/h1')
print(h1)

# Relative path, starting at the node xpath() is called on
h1 = html.xpath('./body/h1')
print(h1)

# Global search: every <h1> anywhere in the document
h1 = html.xpath('//h1')
print(h1)

# 1. Predicates (conditions)
# Syntax: path/to/tag[predicate]
# 1) [N]  - the N-th tag (1-based) among the matching siblings
p = html.xpath('./body/p[1]/text()')
print(p)

result = html.xpath('./body/ul/li[2]/p/text()')
print(result)

# 2)
# [last()]    -  the last matching sibling
# [last()-N]  -  the (N+1)-th matching sibling counted from the end
# Second-to-last <p> inside every <li>
result = html.xpath('./body/ul/li/p[last()-1]/text()')
print(result)

# Last <p> of the second-to-last <li>
result = html.xpath('./body/ul/li[last()-1]/p[last()]/text()')
print(result)

# 3) Position ranges
# [position()>N]
# [position()<N]
# [position()>=N]
# [position()<=N]

# The first two <li>
result = html.xpath('./body/ul/li[position()<=2]/p/text()')
print(result)

# Every <li> after the second one
result = html.xpath('./body/ul/li[position()>2]/p/text()')
print(result)


# 4) [@attr]     -    tags that have the given attribute
# p[@class]  - <p> tags that carry a class attribute
result = html.xpath('./body/div/p[@class]/text()')
print(result)

# [@attr=value]  -  tags whose attribute equals the given value
# (the comparison is against the whole attribute string)
result = html.xpath('./body/div/p[@class="c1"]/text()')
print(result)

result = html.xpath('//p[@class="c1"]/text()')
print(result)

result = html.xpath('./body/div/p[@id="p1"]/text()')
print(result)


# 5)
# [child >/</>=/<=/= value]   -   filter tags by the content of a given child tag

# <li> whose second <p> holds a number greater than 4
result = html.xpath('./body/ul/li[p[2]>4]/p/text()')
print(result)

# First <p> of each <li> whose third <p> holds a number greater than 30
result = html.xpath('./body/ul/li[p[3]>30]/p[1]/text()')
print(result)


# <li> whose first <p> has exactly the text "面包"
result = html.xpath('./body/ul/li[p[1]="面包"]/p/text()')
print(result)

# 2. Wildcard: *
# 1) In place of a tag name it matches any tag
# Every child element of the <div> whose id is "div1"
result = html.xpath('./body/div[@id="div1"]/*')
print(result)

# ...restricted to the children that have a class attribute
result = html.xpath('./body/div[@id="div1"]/*[@class]')
print(result)

# Any tag anywhere whose class attribute equals "c1"
result = html.xpath('//*[@class="c1"]')
print(result)

# 2) In place of an attribute name it matches any attribute
# <p> tags in the last <div> that have at least one attribute
result = html.xpath('./body/div[last()]/p[@*]/text()')
print(result)

# <p> tags in the last <div> with some attribute whose value is "p"
result = html.xpath('./body/div[last()]/p[@*="p"]/text()')
print(result)

# The values of all attributes of every <img>
result = html.xpath('//img/@*')
print(result)

# 3. Union of several paths - |
# Note: each side of a | must be a complete, independent path
result = html.xpath('./body/ul/li/p[1]/text()|./body/ul/li/p[3]/text()')
print(result)

豆瓣电影

from selenium.webdriver import Chrome, ChromeOptions
import csv
from lxml import etree


def get_net_data():
    """Scrape the Douban movie explore page and return one row per movie card.

    Opens the page in a Chrome WebDriver session, parses the browser's page
    source with lxml, and reads each card's title, score and cover image URL.

    Returns:
        A list of ``[name, score, img_url]`` lists, in page order. A field
        that is missing from a card is stored as an empty string.
    """
    def first_or_blank(values):
        # xpath() returns a list; a card lacking the node yields [] and a
        # bare [0] would raise IndexError and abort the whole scrape.
        return values[0] if values else ''

    b = Chrome()
    try:
        b.get("https://movie.douban.com/explore#!type=movie&tag=%E7%83%AD%E9%97%A8&sort=recommend&page_limit=20&page_start=0")
        # NOTE(review): the page source is read right after get() returns;
        # if the list is filled in by JavaScript later, it may still be
        # empty here - confirm whether an explicit wait is needed.
        page_source = b.page_source
    finally:
        # Always end the session so the browser and driver processes are not
        # left running, even when loading the page fails.
        b.quit()

    html = etree.HTML(page_source)
    all_data = []
    for movie in html.xpath('//div[@class="list"]/a'):
        img_url = first_or_blank(movie.xpath('./div/img/@src'))
        name = first_or_blank(movie.xpath('./div/img/@alt'))
        score = first_or_blank(movie.xpath('./p/strong/text()'))
        all_data.append([name, score, img_url])

    return all_data

def save_data(data: list):
    """Write movie rows to ``files/电影.csv`` with a header line.

    Args:
        data: Rows to write after the header; each row is a sequence of cell
            values in the order name, score, cover image URL.
    """
    import os

    # Create the output directory on demand instead of failing with
    # FileNotFoundError when it does not exist yet.
    os.makedirs('files', exist_ok=True)

    # newline='' lets the csv module control line endings itself (otherwise
    # blank rows appear on platforms that translate '\n'); the with-block
    # guarantees the file is flushed and closed.
    with open('files/电影.csv', 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['名称', '分数', '封面'])
        writer.writerows(data)

# Script entry point: scrape the movie list and write it straight to the CSV file.
save_data(get_net_data())
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
XPath是一种用于定位XML文档节点的方法,也可以用于HTML文档的定位,Selenium中也可以使用XPath来定位网页元素。下面是使用XPath定位元素的详细步骤:

1. 打开浏览器并访问网页:

```python
from selenium import webdriver

driver = webdriver.Chrome()
driver.get("http://www.example.com")
```

2. 使用XPath定位元素:

```python
# 通过元素id定位
element = driver.find_element_by_xpath('//*[@id="element_id"]')
# 通过元素name定位
element = driver.find_element_by_xpath('//*[@name="element_name"]')
# 通过元素class定位
element = driver.find_element_by_xpath('//*[@class="element_class"]')
# 通过元素标签名定位
element = driver.find_element_by_xpath('//tag_name')
# 通过元素属性定位
element = driver.find_element_by_xpath('//*[@attribute_name="attribute_value"]')
# 通过元素文本内容定位
element = driver.find_element_by_xpath('//*[text()="text_content"]')
# 通过元素部分文本内容定位
element = driver.find_element_by_xpath('//*[contains(text(), "text_content")]')
```

3. 对元素进行操作:

```python
# 输入文本
element.send_keys("text_input")
# 点击元素
element.click()
# 获取元素文本
print(element.text)
# 获取元素属性值
print(element.get_attribute("attribute_name"))
```

注意事项:
- 可以借助浏览器的开发者工具查看元素的XPath路径。
- XPath表达式内部的引号要与包裹整个表达式的Python字符串引号不同,例如外层用单引号、内层用双引号。
- 以单斜杠(/)开头的路径是从根节点出发的绝对路径;以双斜杠(//)开头的路径会在整个文档中查找匹配的节点。
- 在XPath路径中没有找到元素时,会抛出NoSuchElementException异常。
- Selenium 4.3起已移除find_element_by_xpath,新版本应改用 driver.find_element(By.XPATH, "...")(By 来自 selenium.webdriver.common.by)。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值