from lxml import etree
import requests
def get_html(url, headers, timeout=10):
    """Fetch *url* and return the response body as text.

    Args:
        url: Address of the page to download.
        headers: Mapping of HTTP request headers sent with the request.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script indefinitely.

    Returns:
        The response text when the server answers with HTTP 200, otherwise
        ``None`` (non-200 status or a network-level failure).
    """
    try:
        # The request is the call that can raise, so it must live inside
        # the try block for the handler to have any effect.
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None
def get_parse(html):
    """Print each poster's user name next to their reply text.

    User names and reply texts are collected with two separate XPath
    queries and paired positionally with ``zip``, so output stops at the
    shorter of the two result lists.

    Args:
        html: HTML source of the thread page. ``None`` or an empty string
            is treated as "nothing to parse" and the function returns
            without printing anything.
    """
    if not html:
        # A failed download yields no markup; parsing it would only raise.
        return
    tree = etree.HTML(html)
    user_names = tree.xpath('//*/table/tbody/tr/td/div/a/text()')
    reply_texts = tree.xpath('//*/table/tbody/tr/td/div/div/table/tbody/tr/td/text()')
    # NOTE(review): pairing is by position only; if a reply spans several
    # text nodes the two lists may drift out of step. Verify against the page.
    for user_name, reply_text in zip(user_names, reply_texts):
        print('用户名:'+user_name, '回复内容:'+ reply_text.strip())
def main():
    """Download the hard-coded dxy.cn forum thread and print its replies.

    Raises:
        SystemExit: If no HTML could be obtained for the thread URL.
    """
    url = 'http://www.dxy.cn/bbs/thread/626626'
    # Send a desktop Chrome User-Agent string with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
    }
    html = get_html(url, headers)
    if not html:
        # Stop with a clear message instead of handing nothing to the parser.
        raise SystemExit('failed to fetch ' + url)
    get_parse(html)
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
```
爬取结果:![在这里插入图片描述](https://img-blog.csdnimg.cn/20190304160450654.png)
利用 XPath 爬取丁香园论坛帖子的所有回复内容
最新推荐文章于 2020-04-14 11:09:38 发布