# 题目:学习 XPath,使用 lxml + XPath 提取内容。使用 XPath 提取丁香园论坛的回复内容。
from lxml import etree
import requests


def main():
    """Download one forum thread page and print the replies found in it.

    The page is fetched with ``get_html`` and the resulting markup is handed
    to ``get_A``, which does the extraction and printing.
    """
    url = 'http://www.dxy.cn/bbs/thread/626626'
    # Send a desktop-browser User-Agent with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3521.2 Safari/537.36',
    }
    html = get_html(url, headers)
    get_A(html)
def get_html(url, headers, timeout=10):
    """Return the body of ``url`` as text.

    Args:
        url: Address of the page to download.
        headers: HTTP headers sent with the GET request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The response body decoded to ``str`` by ``requests``.

    Raises:
        requests.exceptions.RequestException: On connection failure or
            timeout.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.text


def get_A(html):
    """Print the user / comment pairs extracted from ``html``.

    Anchor texts are printed as the user and table-cell texts as the
    comment, paired by position. ``zip`` stops at the shorter of the two
    result lists, so unmatched trailing entries are not printed.

    Args:
        html: Markup of the thread page, as returned by ``get_html``.
    """
    tree = etree.HTML(html)
    # NOTE(review): both paths require literal <tbody> elements in the served
    # markup; browsers insert <tbody> into the DOM but the lxml HTML parser
    # presumably does not -- confirm against the raw page source.
    users = tree.xpath('//*/table/tbody/tr/td/div/a/text()')
    comments = tree.xpath('//*/table/tbody/tr/td/div/div/table/tbody/tr/td/text()')
    # ``user`` instead of ``id`` so the builtin is not shadowed.
    for user, comment in zip(users, comments):
        print('用户:' + user, '评价:' + comment.strip())
# Run the scraper only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()