不知道为什么，突然碰到一个页面，用 etree 的 xpath 获取到的中文是乱码。最后通过给 etree.HTML 传入一个指定了 encoding='utf-8' 的 HTMLParser 解决了问题。代码如下：
@staticmethod
def getXpath(xpath, content):
    """Evaluate an XPath expression against HTML markup and return the matches.

    The markup is parsed with an ``HTMLParser`` whose encoding is forced to
    UTF-8 instead of letting the parser guess it.

    Args:
        xpath: XPath expression to evaluate against the parsed document.
        content: HTML markup to parse.

    Returns:
        A list with one entry per match. Element matches are serialized
        with ``etree.tostring`` (default arguments); every other match
        (text or attribute strings, etc.) is returned unchanged. When the
        expression yields a single scalar instead of a node set (e.g.
        ``count(...)`` or ``string(...)``), the scalar is returned wrapped
        in a one-item list. An empty list is returned when parsing
        produced no document root.
    """
    parser = etree.HTMLParser(encoding='utf-8')
    tree = etree.HTML(content, parser)
    # NOTE(review): etree.HTML appears to return None for some inputs with
    # no parseable markup; guard so we don't fail on None.xpath -- confirm.
    if tree is None:
        return []

    results = tree.xpath(xpath)
    # Scalar XPath expressions return a bare float/bool/string rather than
    # a list; normalize so the loop below never iterates a string
    # character by character or tries to iterate a number.
    if not isinstance(results, list):
        results = [results]

    # Only real elements can be serialized; dispatching on iselement avoids
    # matching on class names and lets non-element values pass through.
    return [
        etree.tostring(item) if etree.iselement(item) else item
        for item in results
    ]