"""Download the bullet comments of one Bilibili video and show them as a word cloud."""
import html
import re

import matplotlib.pyplot as plt
import requests
from lxml import etree
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# Browser-like User-Agent header sent with every request made by this script.
_HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36'
}

# Seconds to wait for a response; without a timeout a stalled connection
# would block the script forever.
_REQUEST_TIMEOUT = 10

# Captures the text between '"cid=' and '&aid' in the video page source.
_CID_PATTERN = re.compile(r'"cid=(?P<cid1>.*?)&aid')

# Captures the text content of every <d p="..."> element of the comment XML.
_COMMENT_PATTERN = re.compile('<d p=".*?">(.*?)</d>')


def main():
    """Run the whole pipeline: fetch page, find cid, fetch comments, save, plot.

    Raises:
        requests.RequestException: if either HTTP request fails or returns an
            error status.
        IndexError: if no cid can be found in the video page.
    """
    url = 'https://www.bilibili.com/video/BV1uL41157Nd?spm_id_from=333.851.b_7265636f6d6d656e64.5'
    response = requests.get(url, headers=_HEADERS, timeout=_REQUEST_TIMEOUT)
    # Fail here rather than trying to parse an HTTP error page further down.
    response.raise_for_status()

    cid = get_cid(response.text)
    url2 = 'https://comment.bilibili.com/' + cid + '.xml'
    content1 = get_content(url2)
    comments = get_target(content1)
    _write(comments)
    _wordcloud()


def get_cid(res: str) -> str:
    """Return the cid embedded in the video page source.

    Args:
        res: Text of the video page.

    Returns:
        The text captured between ``"cid=`` and ``&aid`` at the first
        occurrence of that pattern.

    Raises:
        IndexError: if the pattern does not occur in ``res``. The exception
            type matches what indexing an empty match list raised before, but
            the message now says what is actually missing.
    """
    match = _CID_PATTERN.search(res)
    if match is None:
        raise IndexError('no "cid=...&aid" fragment found in the page source')
    return match.group('cid1')


def get_content(url2: str) -> str:
    """Download ``url2`` and return the response body decoded as UTF-8.

    Args:
        url2: Address of the comment XML document.

    Raises:
        requests.RequestException: on connection failure, timeout, or an HTTP
            error status.
    """
    response = requests.get(url2, headers=_HEADERS, timeout=_REQUEST_TIMEOUT)
    response.raise_for_status()
    # Force the decoding instead of relying on what requests guesses.
    response.encoding = 'Utf-8'
    return response.text


def get_target(content1: str) -> list:
    """Extract the comment texts from the comment XML.

    Args:
        content1: The XML document as text.

    Returns:
        A list with one string per ``<d p="...">...</d>`` element, in document
        order. Entity references (``&amp;``, ``&lt;``, ...) are decoded,
        because the text is taken straight out of XML markup and would
        otherwise add tokens such as "amp" to the word cloud.
    """
    return [html.unescape(text) for text in _COMMENT_PATTERN.findall(content1)]


def _write(commeents, path='Barrage.txt'):
    """Append every comment to ``path``, one per line.

    The file is opened once for the whole batch instead of once per comment.
    It is opened in append mode, so comments from earlier runs stay in the
    file and are counted again by ``_wordcloud``.

    Args:
        commeents: Iterable of comment strings.
        path: Output file; defaults to ``Barrage.txt`` in the working directory.
    """
    with open(path, 'a', encoding='utf-8') as f:
        f.writelines(comment + '\n' for comment in commeents)


def _wordcloud(path='./Barrage.txt', font_path="C:/Windows/Fonts/simfang.ttf"):
    """Render the text stored in ``path`` as a word cloud and display it.

    Args:
        path: Text file to read; defaults to the file ``_write`` produces.
        font_path: Font file used to draw the words. The default is a Windows
            font location; pass another font file on other systems.

    Raises:
        ValueError: if the file contains no text to plot.
    """
    with open(path, 'r', encoding='utf-8') as f:
        cut_text = f.read()
    if not cut_text.strip():
        # WordCloud.generate also rejects empty input; raise the same
        # exception type up front with a message that names the file.
        raise ValueError('no comment text found in ' + repr(path))

    word_cloud = WordCloud(
        font_path=font_path,  # font file path
        background_color="white",
        width=1920,
        height=1080,
    ).generate(cut_text)
    plt.imshow(word_cloud, interpolation="bilinear")
    plt.axis('off')
    plt.show()


if __name__ == "__main__":
    main()
爬虫练习(昨天的美化)
最新推荐文章于 2024-08-10 10:09:09 发布