XPath(XML Path Language)是一门在XML文档中查找信息的语言;可用来在XML文档中对元素和属性进行遍历。
python中如何安装使用XPath:
①: 安装 lxml 库。
②: from lxml import etree
③: Selector = etree.HTML(网页源代码)
④: Selector.xpath(一段神奇的符号)
1.2.1. 准备工作:
要使用XPath首先要先安装lxml库:
pip install lxml
谷歌浏览器配置XPath插件
配置参考链接:谷歌浏览器插件xpath helper 的安装和使用
如果安装时报 程序包无效,解决办法:参考文章《xpath helper插件安装提示程序包无效》
<!DOCTYPE html> <html> <head> <title>我的网页</title> </head> <body> <h3 id="hid">我的常用链接</h3> <ul> <li class="item-0"><a href="百度一下,你就知道">百度</a></li> <li class="item-1 shop"><a href="京东(JD.COM)-正品低价、品质保障、配送及时、轻松购物!">京东</a></li> <li class="item-2"><a href="搜狐">搜狐</a></li> <li class="item-3"><a href="WWW.SINA.COM">新浪</a></li> <li class="item-4 shop"><a href="http://www.taobao.com">淘宝</a></li> </ul> </body> </html>
# 获取节点
" //* ":# 获取网页中所有标签并遍历输出标签名
" //li ": # 获取所有li节点
" //li/a " :# 获取所有li节点下的所有直接a子节点
" //ul//a ":# 效果同上(ul下所有子孙节点)
" //a/..": #获取所有a节点的父节点
# 获取属性和文本内容
" //li/a/@href ": #获取所有li下所有直接子a节点的href属性值 ---->"京东(JD.COM)-正品低价、品质保障、配送及时、轻松购物!"
" //li/a/text() ": #获取所有li下所有直接子a节点内的文本内容 ----->['百度', '京东', '搜狐', '新浪', '淘宝']
" //li/a[@class]/text() ": #获取所有li下所有直接含有class属性子a节点内的文本内容 ----->#['百度', '搜狐', '新浪']
" //li/a[@class='aa']/text() ":#获取所有li下所有直接含有class属性值为aa的子a节点内的文本内容 -----> #['搜狐', '新浪']
" //li[contains(@class,'shop')]/a/text() ":#获取class属性值中含有shop的li节点下所有直接a子节点内的文本内容 --->#['搜狐', '新浪']
re.findall 与 xpath 互转 实例
import re
import requests
from time import sleep
from lxml import etree
from selenium import webdriver


class llll():
    """Scraping demo for ahfy.chinacourt.gov.cn (an Anhui court news site).

    Three independent demos:
      * one()    -- drive the site with Selenium and click a navigation link
      * sdfsdf() -- fetch a single article page and extract fields via XPath
      * gy()     -- fetch an article-list page and extract titles/links/dates
    """

    def one(self):
        """Open the site in Chrome via Selenium and click the '司法公开' link.

        Side effects: launches a Chrome browser window; requires chromedriver
        on PATH. The driver is stored on ``self.dr`` and left open.
        """
        self.dr = webdriver.Chrome()
        self.dr.get('http://ahfy.chinacourt.gov.cn/index.shtml')
        # NOTE(review): find_element_by_link_text was removed in Selenium 4;
        # migrate to self.dr.find_element(By.LINK_TEXT, '司法公开') when upgrading.
        self.dr.find_element_by_link_text('司法公开').click()

    def sdfsdf(self):
        """Fetch one article detail page and extract title/author/content.

        Prints the extracted body-text fragments; nothing is returned.
        """
        url = 'http://ahfy.chinacourt.gov.cn/article/detail/2020/01/id/4754356.shtml'
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) 37abc/2.0.6.16 Chrome/60.0.3112.113 Safari/537.36'
        }
        result = requests.get(url=url, headers=headers).text
        shuju = etree.HTML(result)
        # xpath() returns a list of matching text nodes (possibly empty).
        title = shuju.xpath('//div[@class="b_title"]/text()')
        # First <span> under the byline div holds author/date text.
        author_1 = shuju.xpath('//div[@class="sth_a"]/span/text()')[0]
        true_author = author_1.strip()  # drop surrounding whitespace
        # Slice [3:6] assumes a fixed 3-char prefix (e.g. "作者:") followed by
        # a 3-character name -- brittle; a regex such as 作者:(\S+) is safer.
        author = true_author[3:6]
        # Article body: text inside spans styled with the 宋体 font family.
        content = shuju.xpath('//p//span[@style="font-family:宋体"]/text()')
        print(content)

    def gy(self):
        """Fetch an article-list page; extract titles, links and dates via XPath.

        Prints the list of date strings; nothing is returned.
        """
        url = 'http://ahfy.chinacourt.gov.cn/article/index/id/Myi2NDAwNjAwNCACAAA/page/1.shtml'
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
        }
        resullll = requests.get(url=url, headers=headers).text
        shuju = etree.HTML(resullll)
        # Article titles come from the @title attribute of the list links.
        title = shuju.xpath('//div[@id="category"]//ul/li//a/@title')
        # Relative article URLs.
        http = shuju.xpath('//div[@id="category"]//ul//li//span[@class="left"]/a/@href')
        # Publication dates sit in elements with class="right".
        time = shuju.xpath('//*[@class="right"]/text()')
        print(time)


if __name__ == '__main__':
    aaa = llll()
    aaa.sdfsdf()
参考:https://zhuanlan.zhihu.com/p/90911772