| 表达式 | 描述 |
| --- | --- |
| nodename | 选取此节点的所有子节点 |
| / | 从当前节点选取直接子节点 |
| // | 从当前节点选取子孙节点 |
| . | 选取当前节点 |
| .. | 选取当前节点的父节点 |
| @ | 选取属性 |
from lxml import etree

# Build an XPath-capable document from an HTML string.
# The etree module automatically repairs malformed HTML.
# NOTE(review): `text` is an example placeholder — assumed to hold HTML source.
html = etree.HTML(text)
# Alternatively, parse a file directly, forcing the HTML parser.
html = etree.parse('./ex.html', etree.HTMLParser())

result = html.xpath('//*')
# Select every node in the document.
result = html.xpath('//li')
# All li nodes.
result = html.xpath('//li/a')
# Direct a children of every li node.
result = html.xpath('//li//a')
# All a descendants of every li node.
result = html.xpath('//a[@href="link.html"]/../@class')
# class attribute of the parent of every a whose href is "link.html".
result = html.xpath('//li[@class="ni"]')
# All li nodes whose class attribute is exactly "ni".
result = html.xpath('//li/text()')
# Text content of every li node.
result = html.xpath('//li/a/@href')
# href attribute of every li's a child.
result = html.xpath('//li[contains(@class,"li")]/a/text()')
# When the class attribute holds multiple values, match with contains().
result = html.xpath('//li[contains(@class,"li") and @name="item"]/a/text()')
# Matching on several attributes at once.
result = html.xpath('//li[1]/a/text()')
result = html.xpath('//li[last()]/a/text()')
result = html.xpath('//li[position()<3]/a/text()')
result = html.xpath('//li[last()-2]/a/text()')
# Selecting by position; the brackets hold XPath-provided functions.
result = html.xpath('//li[1]/ancestor::*')
# All ancestor nodes.
result = html.xpath('//li[1]/ancestor::div')
result = html.xpath('//li[1]/attribute::*')
# All attribute values.
result = html.xpath('//li[1]/child::a[@href="link1.html"]')
# Direct children (filtered by href).
result = html.xpath('//li[1]/descendant::span')
# All descendants (filtered to span).
result = html.xpath('//li[1]/following::*[2]')
# Second of all nodes that follow the current node in document order.
result = html.xpath('//li[1]/following-sibling::*')
# All following siblings at the same level.
爬取 bangumi 动画排行榜并逐条写入文件的完整示例:
import json
import time

import requests
from requests.exceptions import RequestException
from lxml import etree


def get_one_page(url):
    """Fetch *url* and return its body decoded as UTF-8, or None on any failure."""
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.content.decode('utf-8')
        return None
    except RequestException:
        return None


def parse_one_page(html):
    """Parse one ranking page and return a list of record dicts.

    Each record carries rank, name, other (broadcast info), score and
    people (vote count), all as strings extracted via XPath.
    """
    newhtml = etree.HTML(html)
    ranks = newhtml.xpath('//div/span[@class="rank"]/text()')
    names = newhtml.xpath('//div/h3/a/text()')
    others = newhtml.xpath('//div/p[@class="info tip"]/text()')
    scores = newhtml.xpath('//div/p[@class="rateInfo"]/small/text()')
    peoples = newhtml.xpath('//div/p[@class="rateInfo"]/span/text()')
    # zip stops at the shortest list, so an entry missing one field can no
    # longer raise IndexError the way parallel indexing by range(len(...)) did.
    return [
        {
            'rank': rank,
            'name': name,
            'other': other,
            'score': score,
            'people': people,
        }
        for rank, name, other, score, people in zip(ranks, names, others, scores, peoples)
    ]


def main(offset):
    """Fetch ranking page *offset*, print each record and append it to the file."""
    url = 'http://bangumi.tv/anime/browser?sort=rank&page=' + str(offset)
    html = get_one_page(url)
    if html is None:
        # Network error or non-200 response: skip this page instead of
        # crashing inside etree.HTML(None).
        return
    for text in parse_one_page(html):
        print(text)
        write_to_file(text)


def write_to_file(content):
    """Append *content* as one JSON line to bangumi.txt, keeping non-ASCII text."""
    with open('bangumi.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


if __name__ == '__main__':
    for i in range(1, 20):
        main(offset=i)
        time.sleep(1)