爬取B站“冰冰vlog.001”评论并制作词云

Python小白,纯属来玩的(狗头)

库准备(推荐清华镜像安装)

  1. requests
  2. bs4
  3. jieba
  4. wordcloud
  5. imageio
  6. matplotlib

具体步骤

爬取评论

代码直接贴上了,自行研究

import requests
import time
from bs4 import BeautifulSoup
import json

def get_html(url):
    """Fetch *url* with a GET request and return the response body as text.

    The request carries browser-like ``accept`` and ``user-agent`` headers
    and uses a 30-second timeout.

    Args:
        url: Address to request.

    Returns:
        The response body, decoded as UTF-8.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
    """
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    }

    r = requests.get(url, timeout=30, headers=headers)
    r.raise_for_status()
    # ``encoding`` is the attribute that controls how ``r.text`` is decoded.
    # The previous spelling ("endcodding") only created an unused attribute,
    # so the UTF-8 override never took effect.
    r.encoding = 'utf-8'
    return r.text


def get_content(url):
    """Download one page of replies and return them as a list of dicts.

    The body returned by ``get_html(url)`` is parsed as JSON and every entry
    of ``data.replies`` is reduced to the fields that get written to disk.

    Args:
        url: Address of one page of the reply API.

    Returns:
        A list with one dict per reply, holding the keys ``'Uname'``,
        ``'Like'``, ``'Content'`` and ``'Time'``. ``'Time'`` is the reply's
        ``ctime`` rendered in local time as ``"%Y-%m-%d %H:%M:%S"``.

    Raises:
        ValueError: If the body is not valid JSON (``json.JSONDecodeError``
            is a subclass of ``ValueError``).
        KeyError, TypeError: If the payload lacks the expected fields or
            ``data.replies`` is not iterable (for example ``null``).
    """
    html = get_html(url)
    try:
        payload = json.loads(html)
    except ValueError:
        # Report the failure and let the real error propagate. Previously it
        # was swallowed and the function then crashed on an unbound local,
        # hiding the actual cause.
        print("jsonload error")
        raise

    # NOTE(review): a null ``replies`` deliberately still raises TypeError
    # here; the crawl loop in this file stops on any exception, which is
    # presumably how the end of the comment list is detected - confirm.
    return [
        {
            'Uname': comment['member']['uname'],
            'Like': comment['like'],
            'Content': comment['content']['message'],
            'Time': time.strftime("%Y-%m-%d %H:%M:%S",
                                  time.localtime(comment['ctime'])),
        }
        for comment in payload['data']['replies']
    ]
def Out2File(dict):
    """Append the given comments to ``BiliBiliComments.txt``.

    Each comment is written as a two-line record followed by a separator
    line. A comment that cannot be formatted or encoded is reported with
    ``"out2File error"`` and skipped; the remaining comments are still
    written.

    Args:
        dict: Iterable of comment dicts with the keys ``'Uname'``,
            ``'Like'``, ``'Content'`` and ``'Time'``. The name shadows the
            builtin but is kept so existing keyword callers keep working.

    Raises:
        OSError: If the output file cannot be opened or written.
    """
    with open('BiliBiliComments.txt', 'a+', encoding='utf-8') as f:
        for comment in dict:
            try:
                f.write('姓名:{}\t  点赞数:{}\t \n 评论内容:{}\t  评论时间:{}\t \n '.format(
                    comment['Uname'], comment['Like'], comment['Content'], comment['Time']))
                f.write("-----------------\n")
            except (KeyError, TypeError, UnicodeError):
                # Only per-record problems are skipped (missing field, wrong
                # type, text that cannot be encoded). A bare ``except`` would
                # also trap KeyboardInterrupt and hide disk errors.
                print("out2File error")
        print('当前页面保存完成')

if __name__ == '__main__':
    # Crawl the reply pages one after another, appending each page to the
    # output file, until a page fails to load or contains no replies.
    page = 1
    while True:
        url = "https://api.bilibili.com/x/v2/reply?pn=" + str(page) + "&type=1&oid=800760067&sort=2"
        try:
            print()
            content = get_content(url)
            if not content:
                # An empty page would otherwise be requested forever.
                print("page {} has no comments, stopping".format(page))
                break
            print("page:", page)
            Out2File(content)
        except Exception as err:
            # Any failure ends the crawl, as before, but the reason is now
            # reported and Ctrl+C is no longer swallowed.
            print("stopped at page {}: {!r}".format(page, err))
            break
        page = page + 1
        # To lower the risk of an IP ban, rest 5 seconds whenever the next
        # page number is a multiple of 10 (the old comment said 20 pages,
        # which did not match the code).
        if page % 10 == 0:
            time.sleep(5)
生成词云

生成词云需要先准备一张图片(网上随便找一张即可)作为词云的形状蒙版,并把它和 .py 脚本放在同一个文件夹下。

代码附上(文本文件、图片和字体文件的名称请改成自己的)


import jieba.analyse
from wordcloud import WordCloud,ImageColorGenerator
from imageio import imread
import matplotlib.pyplot as plt

class wc:
    """Build a word cloud from the keywords of a text file and display it.

    All work happens in the constructor: the text is read, keywords are
    extracted with ``jieba.analyse.extract_tags`` and rendered with
    ``WordCloud`` using the given image as mask.

    Attributes set by ``__init__``:
        f: The (already closed) file object the text was read from.
        txt: Full contents of the text file.
        tags: Keywords returned by ``extract_tags`` with ``topK=100``.
        text: The keywords joined with single spaces.
        img: The image loaded by ``imread``, used as the mask.
        wc: The configured ``WordCloud`` object.
        word_cloud: The result of ``WordCloud.generate``.
    """

    def __init__(self, txt_file, img_file, font_file):
        """Read the text, extract keywords and generate the word cloud.

        Args:
            txt_file: Path of the UTF-8 text file to analyse.
            img_file: Path of the image used as the word-cloud mask.
            font_file: Font path handed to ``WordCloud`` as ``font_path``.
        """
        # The context manager closes the file even if reading fails; the
        # previous open/read/close sequence leaked the handle in that case.
        with open(txt_file, encoding='utf-8') as f:
            self.txt = f.read()
        # Kept only because the attribute existed before; the handle is closed.
        self.f = f
        self.tags = jieba.analyse.extract_tags(self.txt, topK=100)
        self.text = ' '.join(self.tags)
        self.img = imread(img_file)
        self.wc = WordCloud(font_path=font_file, background_color='white',
                            max_words=100, mask=self.img, max_font_size=80)
        self.word_cloud = self.wc.generate(self.text)

    def show_wc(self):
        """Display the generated word cloud in a matplotlib window, axes hidden."""
        plt.imshow(self.word_cloud)
        plt.axis("off")
        plt.show()


if __name__ == '__main__':
    # Text file, image file and font file are handed to wc by keyword;
    # replace them with your own files.
    mywc = wc(
        txt_file='BiliBiliComments.txt',
        img_file='u=2959490536,2877096479&fm=26&gp=0.jpg',
        font_file='simsun.ttc',
    )
    mywc.show_wc()

还是那句话,冰冰真可爱(狗头)

评论 8
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值