XPath 确实是一个很方便的工具。
在将 XPath 与 BeautifulSoup(bs4)配合使用时,需要先用 XPath 定位出 HTML 中的某一块内容,再交给 bs4 解析。下面给出源码:
"""Demo: combining XPath (lxml) with BeautifulSoup.

XPath first narrows the document down to the elements of interest; each
matched element is then re-serialized to a string and handed to
BeautifulSoup for further parsing.
"""
from lxml import html, etree
import html as HTML  # stdlib html, aliased uppercase to avoid clashing with lxml's html
from bs4 import BeautifulSoup

text = """
<body>
<div id="aa">aa啊</div>
<div id="ab">ab啊</div>
<div id="ac">ac啊</div>
</body>
"""

# Parse the markup once with lxml, then locate the target nodes via XPath.
selector = etree.HTML(text)
content = selector.xpath('//div[@id="aa" or @id="ac"]')
# xpath() returns a list of Element nodes, so we iterate over it.
print(content)

for node in content:
    # FIX: serialize with encoding="unicode" so lxml returns a str directly,
    # keeping the Chinese characters intact. This replaces the old
    # bytes -> str(..., "utf-8") -> HTML.unescape() round-trip, which was
    # only needed because byte serialization escapes non-ASCII characters
    # into numeric character references.
    fragment = etree.tostring(node, encoding="unicode")
    print(fragment)
    # Hand the serialized fragment to BeautifulSoup; it parses it normally.
    soup = BeautifulSoup(fragment, 'lxml')
    print(soup.find_all('div'))