用 Java 爬取 B 站弹幕并生成词云

先添加以下 Maven 依赖,然后直接使用下面的代码即可。

    <!-- Apache HttpClient: supplies the org.apache.http.* classes the Java class below uses for HTTP requests. -->
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
        <version>4.5.12</version>
    </dependency>
        <!-- Kumo core: word-cloud rendering (com.kennycason.kumo.*). -->
        <dependency>
            <groupId>com.kennycason</groupId>
            <artifactId>kumo-core</artifactId>
            <version>1.27</version>
        </dependency>
        <!-- Kumo tokenizers: presumably supplies the tokenizer classes such as ChineseWordTokenizer; keep its version in step with kumo-core. -->
        <dependency>
            <groupId>com.kennycason</groupId>
            <artifactId>kumo-tokenizers</artifactId>
            <version>1.27</version>
        </dependency>
package com.yang;

import com.kennycason.kumo.CollisionMode;
import com.kennycason.kumo.WordCloud;
import com.kennycason.kumo.WordFrequency;
import com.kennycason.kumo.bg.CircleBackground;
import com.kennycason.kumo.font.KumoFont;
import com.kennycason.kumo.font.scale.SqrtFontScalar;
import com.kennycason.kumo.nlp.FrequencyAnalyzer;
import com.kennycason.kumo.nlp.tokenizers.ChineseWordTokenizer;
import com.kennycason.kumo.palette.LinearGradientColorPalette;
import org.apache.http.HttpEntity;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.awt.*;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Downloads the danmaku (bullet comments) of a Bilibili video and renders them as a
 * word-cloud image.
 *
 * <p>The steps are: {@link #getBofqi(String)} resolves the cid of a video from its av number,
 * {@link #ReaderBiBi(String)} downloads the danmaku XML for that cid and counts identical
 * comments, and {@link #GreaderImg(Map)} draws the counts as a PNG. {@link #ComeKna(String, String)}
 * chains the three steps.
 */
public class PanBibi {
    /** Captures the numeric cid from a {@code cid=<digits>&aid=} fragment of the video page. */
    private static final Pattern CID_PATTERN = Pattern.compile("cid=(\\d+)&aid=");

    /** Captures the text of one {@code <d ...>text</d>} danmaku element. */
    private static final Pattern DANMAKU_PATTERN = Pattern.compile("<d(?:\\s[^>]*)?>(.*?)</d>");

    /** Output file used by {@link #GreaderImg(Map)}; kept for backward compatibility. */
    private static final String DEFAULT_OUTPUT_PATH = "d://3.png";

    /** Width and height of the generated image, in pixels. */
    private static final int CANVAS_SIZE = 900;

    /**
     * Looks up the cid of a video from its av number by scanning the video page.
     *
     * @param aid the av number, without the {@code av} prefix
     * @return the first cid found in the page, or an empty string when the page contains none
     * @throws Exception if the HTTP request fails
     */
    public static String getBofqi(String aid) throws Exception {
        RequestConfig requestConfig =
                RequestConfig.custom().setCookieSpec(CookieSpecs.STANDARD).build();
        HttpGet request = new HttpGet("https://www.bilibili.com/video/av" + aid + "/");
        request.setConfig(requestConfig);

        Matcher matcher = CID_PATTERN.matcher(fetchUtf8(request));
        // Only the first occurrence is needed. The digit-only group keeps the match from
        // running on to a later "&aid=" on the same line.
        if (matcher.find()) {
            String cid = matcher.group(1);
            System.out.println(cid);
            return cid;
        }
        return "";
    }

    /**
     * Downloads the danmaku XML for a cid and counts how often each comment text occurs.
     *
     * @param x the cid identifying the danmaku file
     * @return a map from comment text to its number of occurrences; empty when none are found
     * @throws Exception if the HTTP request fails
     */
    public static Map<String, Integer> ReaderBiBi(String x) throws Exception {
        String xml = fetchUtf8(new HttpGet("http://comment.bilibili.com/" + x + ".xml"));

        Map<String, Integer> counts = new HashMap<>(1024);
        int total = 0;
        Matcher matcher = DANMAKU_PATTERN.matcher(xml);
        while (matcher.find()) {
            counts.merge(matcher.group(1), 1, Integer::sum);
            total++;
        }
        // Report the number of comments read, not the number of distinct texts.
        System.out.println("共" + total + "条弹幕");
        return counts;
    }

    /**
     * Renders the given counts as a circular word cloud and writes it to the default
     * location ({@code d://3.png}).
     *
     * @param strs map from text to its weight (occurrence count)
     * @throws Exception if the image cannot be built or written
     */
    public static void GreaderImg(Map<String, Integer> strs) throws Exception {
        GreaderImg(strs, DEFAULT_OUTPUT_PATH);
    }

    /**
     * Renders the given counts as a circular word cloud and writes it to {@code outputPath}.
     *
     * <p>Each map key is drawn as one unit; no word segmentation is applied to it.
     *
     * @param strs map from text to its weight (occurrence count)
     * @param outputPath path of the image file to write
     * @throws Exception if the image cannot be built or written
     */
    public static void GreaderImg(Map<String, Integer> strs, String outputPath) throws Exception {
        List<WordFrequency> wordFrequencies = new ArrayList<>(strs.size());
        for (Map.Entry<String, Integer> entry : strs.entrySet()) {
            wordFrequencies.add(new WordFrequency(entry.getKey(), entry.getValue()));
        }

        // A font must be set explicitly, otherwise Chinese text comes out garbled.
        Font font = new Font("STSong-Light", Font.ITALIC, 18);
        Dimension dimension = new Dimension(CANVAS_SIZE, CANVAS_SIZE);

        WordCloud wordCloud = new WordCloud(dimension, CollisionMode.PIXEL_PERFECT);
        wordCloud.setPadding(2);
        // The circle is positioned from the top-left corner, so a radius of half the canvas
        // size is what makes the whole circle fit inside the image.
        wordCloud.setBackground(new CircleBackground(CANVAS_SIZE / 2));
        wordCloud.setBackgroundColor(Color.WHITE);
        wordCloud.setFontScalar(new SqrtFontScalar(12, 42));
        // Three gradient colours; the earlier a colour is listed, the higher the frequency
        // of the words drawn with it.
        wordCloud.setColorPalette(
                new LinearGradientColorPalette(Color.RED, Color.BLUE, Color.GREEN, 30, 30));
        wordCloud.setKumoFont(new KumoFont(font));
        wordCloud.build(wordFrequencies);
        wordCloud.writeToFile(outputPath);
    }

    /**
     * Generates the word cloud for a video.
     *
     * @param av the av number; only used when {@code cid} is {@code null} or empty
     * @param cid the danmaku id; when given, the lookup from the av number is skipped
     * @throws Exception if any HTTP request or the image generation fails
     */
    public static void ComeKna(String av, String cid) throws Exception {
        String resolvedCid = (cid != null && !cid.isEmpty()) ? cid : getBofqi(av);
        // getBofqi signals "not found" with an empty string, so test for that as well as null.
        if (resolvedCid == null || resolvedCid.isEmpty()) {
            return;
        }
        GreaderImg(ReaderBiBi(resolvedCid));
    }

    /** Executes the request and returns the whole response body decoded as UTF-8. */
    private static String fetchUtf8(HttpGet request) throws IOException {
        try (CloseableHttpClient client = HttpClients.createDefault();
             CloseableHttpResponse response = client.execute(request)) {
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                return "";
            }
            // Decode once over the complete body so that multi-byte characters are never
            // split across read chunks.
            return new String(EntityUtils.toByteArray(entity), StandardCharsets.UTF_8);
        }
    }
}

调用

// When the cid is passed as the second argument, the lookup from the av number is skipped and the danmaku are fetched and rendered directly.
 PanBibi.ComeKna("626634287","221674886");

1. 通过 Java 获取 B 站视频的弹幕数据

[图片] 可以在浏览器控制台直接输入 aid 获取视频的 av 号。
[图片] 可以通过开发者工具的网络(Network)面板找到 cid,这样就能定位存放弹幕的文件了。
[图片]
然后访问这个地址获取数据,再根据每条弹幕出现的次数(权重)生成词云。

2. 再用 Java 实现词云生成

开源库:kumo(Maven groupId 为 com.kennycason)
GitHub:https://github.com/kennycason/kumo
码云
在这里插入图片描述

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值