| 表达式 | 描述 |
| --- | --- |
| nodename | 选取此节点的所有子节点 |
| / | 从当前节点选取直接子节点 |
| // | 从当前节点选取子孙节点 |
| . | 选取当前节点 |
| .. | 选取当前节点的父节点 |
| @ | 选取属性 |
from lxml import etree

# Build an XPath-capable document from an HTML string.
# The etree module automatically repairs malformed HTML.
# NOTE(review): `text` is an example placeholder — assumed to hold HTML source.
html = etree.HTML(text)
# Alternatively, parse a file directly, forcing the HTML parser.
html = etree.parse('./ex.html', etree.HTMLParser())

result = html.xpath('//*')
# Select every node in the document.
result = html.xpath('//li')
# All li nodes.
result = html.xpath('//li/a')
# Direct a children of every li node.
result = html.xpath('//li//a')
# All a descendants of every li node.
result = html.xpath('//a[@href="link.html"]/../@class')
# class attribute of the parent of every a whose href is "link.html".
result = html.xpath('//li[@class="ni"]')
# All li nodes whose class attribute is exactly "ni".
result = html.xpath('//li/text()')
# Text content of every li node.
result = html.xpath('//li/a/@href')
# href attribute of every li's a child.
result = html.xpath('//li[contains(@class,"li")]/a/text()')
# When the class attribute holds multiple values, match with contains().
result = html.xpath('//li[contains(@class,"li") and @name="item"]/a/text()')
# Matching on several attributes at once.
result = html.xpath('//li[1]/a/text()')
result = html.xpath('//li[last()]/a/text()')
result = html.xpath('//li[position()<3]/a/text()')
result = html.xpath('//li[last()-2]/a/text()')
# Selecting by position; the brackets hold XPath-provided functions.
result = html.xpath('//li[1]/ancestor::*')
# All ancestor nodes.
result = html.xpath('//li[1]/ancestor::div')
result = html.xpath('//li[1]/attribute::*')
# All attribute values.
result = html.xpath('//li[1]/child::a[@href="link1.html"]')
# Direct children (filtered by href).
result = html.xpath('//li[1]/descendant::span')
# All descendants (filtered to span).
result = html.xpath('//li[1]/following::*[2]')
# Second of all nodes that follow the current node in document order.
result = html.xpath('//li[1]/following-sibling::*')
# All following siblings at the same level.
爬取 bangumi 动画排行榜并逐条写入文件的完整示例:
import json
import time

import requests
from requests.exceptions import RequestException
from lxml import etree


def get_one_page(url):
    """Fetch *url* and return its body decoded as UTF-8, or None on any failure."""
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.content.decode('utf-8')
        return None
    except RequestException:
        return None


def parse_one_page(html):
    """Parse one ranking page and return a list of record dicts.

    Each record carries rank, name, other (broadcast info), score and
    people (vote count), all as strings extracted via XPath.
    """
    newhtml = etree.HTML(html)
    ranks = newhtml.xpath('//div/span[@class="rank"]/text()')
    names = newhtml.xpath('//div/h3/a/text()')
    others = newhtml.xpath('//div/p[@class="info tip"]/text()')
    scores = newhtml.xpath('//div/p[@class="rateInfo"]/small/text()')
    peoples = newhtml.xpath('//div/p[@class="rateInfo"]/span/text()')
    # zip stops at the shortest list, so an entry missing one field can no
    # longer raise IndexError the way parallel indexing by range(len(...)) did.
    return [
        {
            'rank': rank,
            'name': name,
            'other': other,
            'score': score,
            'people': people,
        }
        for rank, name, other, score, people in zip(ranks, names, others, scores, peoples)
    ]


def main(offset):
    """Fetch ranking page *offset*, print each record and append it to the file."""
    url = 'http://bangumi.tv/anime/browser?sort=rank&page=' + str(offset)
    html = get_one_page(url)
    if html is None:
        # Network error or non-200 response: skip this page instead of
        # crashing inside etree.HTML(None).
        return
    for text in parse_one_page(html):
        print(text)
        write_to_file(text)


def write_to_file(content):
    """Append *content* as one JSON line to bangumi.txt, keeping non-ASCII text."""
    with open('bangumi.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


if __name__ == '__main__':
    for i in range(1, 20):
        main(offset=i)
        time.sleep(1)