"""Download one Baidu Tieba thread (post text plus replies) into a text file.

The script fetches the thread page, extracts the text of every post, looks up
the replies attached to each post through the ``totalComment`` JSON endpoint,
strips HTML tags from the combined text and writes it to
``./贴吧/LOL/<thread title>.txt``.
"""
import os
import re

import requests
from lxml import etree

home_url = "https://tieba.baidu.com/p/6428562248"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
}
# Without a timeout, requests waits forever on an unresponsive server.
REQUEST_TIMEOUT = 10

page_response = requests.get(url=home_url, headers=headers, timeout=REQUEST_TIMEOUT)
# Fail here with a clear HTTP error instead of parsing an error page below.
page_response.raise_for_status()
html_text = page_response.text
tree = etree.HTML(html_text)
# The trailing space inside the class value is part of the attribute matched.
contents = tree.xpath('//div[@class="d_post_content j_d_post_content "]')

# Replies attached to the posts.
# The query string lives only in ``params``: the original repeated it in the
# URL as well, and used empty-string keys where "tid" and "fid" belong.
ans_url = "https://tieba.baidu.com/p/totalComment"
params = {
    "t": "1578396061786",
    "tid": "6428562248",
    "fid": "280050",
    "pn": "1",
    "see_lz": "0",
}
comment_response = requests.get(
    url=ans_url, params=params, headers=headers, timeout=REQUEST_TIMEOUT
)
comment_response.raise_for_status()
# ``or {}`` keeps the ``.get`` lookup below valid when the endpoint reports an
# empty value (presumably an empty list or null when there are no replies --
# TODO confirm against the live API).
comment_list = comment_response.json()["data"]["comment_list"] or {}

# Collect the output pieces in a list and join once at the end.
# NOTE(review): the original indentation was lost; the nesting below (posts
# without text are skipped entirely, one separator per post) is a
# reconstruction -- confirm against the intended output.
text_parts = []
for div in contents:
    msg_top = " ".join(div.xpath('./text()')).strip()
    if not msg_top:
        continue
    text_parts.append(msg_top + "\n")

    # Drop the first 13 characters of the element id (presumably the
    # "post_content_" prefix) to get the key used by ``comment_list``.
    detail_id = div.xpath('./@id')[0][13:]
    comment_data = comment_list.get(detail_id)
    if comment_data:
        text_parts.append("回复:" + "\n")
        for comm in comment_data["comment_info"]:
            text_parts.append(" " + comm["username"] + ":" + comm["content"] + "\n")
    text_parts.append("---------------------------------\n")

# Remove HTML tags. This pattern was commented out in the original, so the
# ``pa.sub`` call raised NameError.
pa = re.compile(r"<.*?>")
content_all = pa.sub("", "".join(text_parts))

title = tree.xpath('//div[@id="j_core_title_wrap"]/h3/text()')[0]
# The title comes from the page, so replace characters that are not allowed in
# file names (a "/" would otherwise be read as a directory separator).
safe_title = re.sub(r'[\\/:*?"<>|]', "_", title).strip()
file_name = "./贴吧/LOL/{}.txt".format(safe_title)
# open() does not create missing parent directories.
os.makedirs(os.path.dirname(file_name), exist_ok=True)
with open(file_name, "w", encoding="utf-8") as f:
    f.write(content_all)
print("数据已下载完成!!!")