【Task4 学习xpath】
学习xpath,使用lxml+xpath提取内容。
使用xpath提取丁香园论坛的回复内容。
from lxml import etree
import requests
# Scrape one forum thread and print, for every post, the author's name,
# level, score, votes, "叮当" balance and the post text.
url = "http://www.dxy.cn/bbs/thread/626626#626626"


def _first_match(doc, path):
    """Return the first result of XPath *path* evaluated on *doc*, or None.

    Using this instead of ``doc.xpath(path)[0]`` means a post that lacks one
    field no longer aborts the whole run with an IndexError.
    """
    matches = doc.xpath(path)
    return matches[0] if matches else None


def _text_or_blank(doc, path):
    """Return the first result of XPath *path* on *doc*, or "" when absent."""
    found = _first_match(doc, path)
    return "" if found is None else found


# A timeout keeps the script from hanging forever on an unresponsive server,
# and raise_for_status() stops us from parsing an HTTP error page as a thread.
response = requests.get(url=url, timeout=10)
response.raise_for_status()
html = etree.HTML(response.text)

# The number of <div> children of #postcontainer is used as the post count;
# the posts themselves are addressed by id: post_1 .. post_N.
post_total = len(html.xpath('//*[@id="postcontainer"]/div'))

for floor in range(1, post_total + 1):
    row = '//*[@id="post_{}"]/table/tbody/tr'.format(floor)
    author_cell = row + '/td[1]'    # left column: author details
    stats = author_cell + '/div[4]/ul'

    user_name = _text_or_blank(html, author_cell + '/div[2]/a/text()')

    # The level is looked up under p/span first; when that yields nothing,
    # fall back to the text of a plain <div> in the same slot.
    user_level = _first_match(html, author_cell + '/div[3]/p/span/text()')
    if user_level is None:
        user_level = _text_or_blank(html, author_cell + '/div[3]/div/text()')

    user_score = _text_or_blank(html, stats + '/li[1]/div/a/text()')
    user_votes = _text_or_blank(html, stats + '/li[2]/div/a/text()')
    user_home_money = _text_or_blank(html, stats + '/li[3]/div/a/text()')

    # The first post's text sits in the second inner <div> of the content
    # column; every later post uses the first inner <div>.
    body_div = 2 if floor == 1 else 1
    user_comment = _text_or_blank(
        html,
        row + '/td[2]/div[2]/div[{}]/table/tbody/tr/td/text()'.format(body_div),
    )

    print(
        "第{}楼复人的信息>>姓名:{} 等级:{} 积分:{} 得票:{} 叮当:{} 评论内容:{}".format(
            floor,
            user_name,
            user_level.strip(),
            user_score,
            user_votes,
            user_home_money,
            user_comment.strip(),
        )
    )
原文地址:https://blog.csdn.net/u011757108/article/details/88128116