# coding=utf-8
"""Fetch and print the danmaku (bullet comments) of a Bilibili video.

The script downloads a video page, extracts the ``cid`` value embedded in
its HTML, requests ``https://comment.bilibili.com/<cid>.xml`` and prints the
text of every ``<d p="...">`` element found in that document.
"""
import html
import re

import requests
from lxml import etree  # noqa: F401  (kept from the original; unused here)

# Page that is scraped when main() is called without an argument.
DEFAULT_URL = 'https://www.bilibili.com/video/BV1CL411F7r6?spm_id_from=333.934.0.0'

# Browser-like user agent sent with every request.
HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36'
}

# Seconds to wait for a response; without a timeout requests can block forever.
REQUEST_TIMEOUT = 10

# Compiled once at import time instead of on every call.
CID_PATTERN = re.compile(r'"cid=(?P<cid1>.*?)&aid')
DANMAKU_PATTERN = re.compile(r'<d p=".*?">(.*?)</d>')


def main(url: str = DEFAULT_URL) -> None:
    """Download the page at *url* and print every comment attached to it.

    Args:
        url: Address of the video page to scrape. Defaults to the page the
            original script was written for.

    Raises:
        requests.RequestException: If a request fails, times out or returns
            an HTTP error status.
        IndexError: If no cid can be found in the page.
    """
    response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    # Fail loudly on 4xx/5xx instead of searching an error page for a cid.
    response.raise_for_status()

    cid = get_cid(response.text)
    url2 = 'https://comment.bilibili.com/' + cid + '.xml'
    comments = get_target(get_content(url2))
    _print(comments)


def get_cid(res: str) -> str:
    """Return the first cid found in the page source *res*.

    The value is whatever sits between ``"cid=`` and the following ``&aid``.

    Raises:
        IndexError: If *res* contains no such fragment. The exception type
            matches what the original list indexing raised; only the
            message is more helpful.
    """
    match = CID_PATTERN.search(res)
    if match is None:
        raise IndexError('no cid found in the page source')
    return match.group('cid1')


def get_content(url2: str) -> str:
    """Download *url2* and return its body decoded as UTF-8.

    Raises:
        requests.RequestException: If the request fails, times out or
            returns an HTTP error status.
    """
    response = requests.get(url2, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    # Force the decoding rather than relying on the guessed encoding.
    response.encoding = 'Utf-8'
    return response.text


def get_target(content1: str) -> list[str]:
    """Extract the text of every ``<d p="...">...</d>`` element.

    Args:
        content1: The XML document as text.

    Returns:
        The element texts in document order, with character entities such
        as ``&amp;`` decoded. Empty when nothing matches.
    """
    return [html.unescape(text) for text in DANMAKU_PATTERN.findall(content1)]


def _print(comments) -> None:
    """Print each item of *comments* on its own line."""
    for comment in comments:
        print(comment)


if __name__ == "__main__":
    main()
# 爬虫最终练习(初版)
# 最新推荐文章于 2024-08-10 10:09:09 发布