How to scrape eps data with Python: three ways to parse data in a Python crawler

The example below takes the XPath route: it downloads a Baidu Tieba thread, extracts each floor's reply text with XPath expressions, fetches the floor comments from Tieba's JSON endpoint, strips leftover HTML tags with a regex, and writes everything to a text file.

import requests
from lxml import etree
import re

home_url= "https://tieba.baidu.com/p/6428562248"headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"}

# Fetch the thread page
html_text = requests.get(url=home_url, headers=headers).text
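The call above assumes the request always succeeds. A slightly more defensive variant (a sketch; the timeout value is my own choice, not from the original) would fail fast instead of parsing an error page:

resp = requests.get(url=home_url, headers=headers, timeout=10)
resp.raise_for_status()  # raise on 4xx/5xx rather than silently parsing an error page
html_text = resp.text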

tree = etree.HTML(html_text)

# Each floor's reply text lives in a div with this class (note the trailing space inside the class attribute)
contents = tree.xpath('//div[@class="d_post_content j_d_post_content "]')
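Matching on the exact class string is brittle: @class must match character for character, trailing space included, so any change to Tieba's markup breaks the selector silently. A looser alternative (my suggestion, not from the original) matches on one class token:

contents = tree.xpath('//div[contains(@class, "j_d_post_content")]')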

ans_url = "https://tieba.baidu.com/p/totalComment?t=1578396061786&tid=6428562248&fid=280050&pn=1&see_lz=0"params={"t": "1578396061786","": "6428562248","": "280050","pn": "1","see_lz": "0"}

# Pull the comment list, keyed by the post id of each floor
comment_list = requests.get(url=ans_url, params=params, headers=headers).json()["data"]["comment_list"]
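The chained indexing assumes the endpoint always returns the expected shape. When a thread has no floor comments at all, comment_list may come back as an empty list rather than a dict (an assumption based on common Tieba behavior, not confirmed by the original), so a guarded version might look like:

payload = requests.get(url=ans_url, params=params, headers=headers).json()
comment_list = payload.get("data", {}).get("comment_list") or {}
if not isinstance(comment_list, dict):
    comment_list = {}  # guard: treat a non-dict response as "no floor comments"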

content_all = ""
for div in contents:
    # Top-level text of the floor
    msg_top = " ".join(div.xpath('./text()')).strip()
    if msg_top:
        content_all += msg_top + "\n"
    # div ids look like "post_content_12345"; [13:] strips the "post_content_" prefix
    detail_id = div.xpath('./@id')[0][13:]
    if comment_list.get(detail_id):
        comment_data = comment_list[detail_id]
        content_all += "回复:" + "\n"  # "Reply:" header line before the floor's comments
        for comm in comment_data["comment_info"]:
            username = comm["username"]
            content = comm["content"]
            con_all = " " + username + ":" + content
            content_all += con_all + "\n"
    content_all += "---------------------------------\n"  # separator between floors

# Floor comments come back as HTML fragments; strip the tags with a non-greedy regex
# (this line was commented out in the original, but pa is used just below, so it must run)
pa = re.compile(r"<.*?>")
content_all = pa.sub("", content_all)
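The non-greedy <.*?> removes each tag individually rather than everything between the first < and the last >. A quick check with my own sample input:

demo = pa.sub("", 'nice play<img class="BDE_Smiley" src="x">, gg<br>')
# demo == "nice play, gg"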

title = tree.xpath('//div[@id="j_core_title_wrap"]/h3/text()')[0]

file_name= "./贴吧/LOL/{}.txt".format(title)

# Write the result; the with-block closes the file even if write() raises
with open(file_name, "w", encoding="utf-8") as f:
    f.write(content_all)
print("数据已下载完成!!!")  # "Data download complete!!!"
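Both the thread page and the totalComment endpoint are paginated through the pn parameter seen above, so a long thread needs a loop. A sketch along these lines (the page count and the markup of later pages are assumptions, not from the original):

all_text = []
for pn in range(1, 4):  # assumed 3 pages; the real count sits in the thread's pager markup
    page = requests.get(home_url, headers=headers, params={"pn": pn}).text
    page_tree = etree.HTML(page)
    all_text.extend(page_tree.xpath('//div[contains(@class, "j_d_post_content")]/text()'))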
