爬取京东评论、分词+词频统计、词云图展示

一、爬取京东评论

京东评论竟然全部对外开放

/**
 * Downloads the comments of one JD.com product page by page from the
 * productPageComments endpoint and stores them through CommentService.
 */
public class CommentCrawler {
    /** Connection pool shared by every client returned from {@link #getClient()}. */
    final static PoolingHttpClientConnectionManager httpClientConnectionManager = new PoolingHttpClientConnectionManager();
    /** Upper bound on the number of comment pages fetched for a single product. */
    final static int MAX_PAGE = 50;

    /** Builds an HTTP client backed by the shared connection pool. */
    static HttpClient getClient() {
        return HttpClients.custom().setConnectionManager(httpClientConnectionManager).build();
    }

    /**
     * Builds the comment-listing URL for a product.
     *
     * @param productId product identifier placed in the query string
     * @param page      zero-based page index (10 comments per page)
     */
    static String getUrl(String productId, int page) {
        return String.format(
                "http://sclub.jd.com/comment/productPageComments.action?productId=%s&score=0&sortType=3&page=%d&pageSize=10",
                productId, page);
    }

    /** Maps one entry of the "comments" array (id, score, content) to a Comment. */
    static Comment commentFromJson(JSONObject json, String productId) {
        return new Comment(json.getLongValue("id"), productId, json.getString("score"), json.getString("content"));
    }

    /**
     * Crawls and stores the comments of one product, then runs
     * ProductJudger.judge on it.
     *
     * @param productId product whose comments are fetched
     * @return true when every requested page was stored and the product was
     *         judged; false when a page came back without comments or any
     *         exception occurred (the stack trace is printed)
     */
    public static boolean crawlComments(String productId) {
        try {
            int maxPage = 1;
            int nowPage = 0;
            HttpClient client = getClient();
            while (nowPage < maxPage) {
                String url = getUrl(productId, nowPage);
                HttpGet get = new HttpGet(url);
                HttpResponse resp = client.execute(get);
                JSONObject json = JSON.parseObject(EntityUtils.toString(resp.getEntity()));
                JSONArray comments = json.getJSONArray("comments");
                // A missing array is treated like an empty one instead of raising an NPE.
                // NOTE(review): an empty page after the first also returns false and
                // skips ProductJudger.judge, although earlier pages are already stored.
                if (comments == null || comments.size() == 0)
                    return false;
                CommentService ser = new CommentService();
                for (int i = 0; i < comments.size(); i++) {
                    Comment comment = commentFromJson(comments.getJSONObject(i), productId);
                    ser.insertComment(comment);
                }
                if (nowPage == 0) {
                    // The page count is only known after the first response; cap it
                    // with MAX_PAGE so one product cannot trigger an unbounded crawl.
                    maxPage = Math.min(json.getInteger("maxPage"), MAX_PAGE);
                    ser.insertProduct(new Product(productId, comments.getJSONObject(0).getString("referenceName")));
                }
                nowPage++;
            }
            ProductJudger.judge(productId);
            return true;
        } catch (Exception e) {
            e.printStackTrace();
        }
        return false;
    } 
}

二、结巴分词

jieba分词原本是Python版的,有人把它改成了Java版,名字也改成了jieba-analysis

// Tokenize str and count how often each token occurs, skipping stop words.
JiebaSegmenter segmenter = new JiebaSegmenter();
List<Word> list = segmenter.sentenceProcess(str);
for (Word i : list) {
    String token = i.getToken();
    if (stopWords.contains(token)) {
        continue;
    }
    Integer cnt = map.get(token);
    // First occurrence starts at 1; autoboxing replaces the deprecated
    // Integer constructor.
    map.put(token, cnt == null ? 1 : cnt + 1);
}

三、词云图

用到 d3.js 和 d3-cloud.js 这两个库。d3.js 的全称是“Data-Driven Documents”;d3-cloud 这个库比较难用,主要是官方示例代码太少。

这里给出一个例子:每一个词云图都对应一个字典,这个字典就是“词语:频率”这样的键值对。给定多个字典,每一个字典都要渲染成一个词云图。

<div id="word-clouds" style="text-align: center"></div>
<script>
    // NOTE(review): the request attribute is written into the page verbatim, so
    // it must already be a valid, safely escaped JavaScript/JSON literal.
    var wordClouds = <%=request.getAttribute("wordClouds")%>;
    $(document).ready(function () {
        var host = $("#word-clouds");
        // Own keys only (works for an array or a plain object); a null
        // attribute renders nothing instead of throwing.
        Object.keys(wordClouds || {}).forEach(function (key) {
            var it = wordClouds[key];
            var divId = "product_wordcloud" + it['productId'];
            // Build the nodes with .attr()/.text() so the product id is never
            // parsed as HTML.
            host.append($("<h3>").attr("align", "center").text("商品" + it['productId'] + "词云图"))
                    .append($("<div>").attr("id", divId));
            createWordCloud(transformWordFraquency(it['words']), "#" + divId);
        });
    });
</script>

还需要编写如下JS代码

// Colour scale with 20 colours, indexed by word position when rendering
var fill = d3.scale.category20();

// Width and height given to the rendered <svg> element
var wordCloudWidth = 800;
var wordCloudHeight = 400;

// Font settings shared by the layout and the rendered text
var font_name = "楷体";
var font_weight = "bold";
var max_font_size = 50;

// Number of words shown in one cloud
var word_count = 50;
// Largest font size assigned to a word
var word_max_size = 60;
// Smallest font size assigned to a word
var word_min_size = 10;
/*
 * Function: transformWordFraquency (the spelling is kept because the page
 * script calls it under this name).
 *
 * Turns a word-count dictionary into the word list used for the cloud.
 *
 * words: dictionary shaped like {word1: cnt1, word2: cnt2, word3: cnt3};
 *        null/undefined yields an empty list.
 * Returns: array of {text, size}, most frequent word first, at most
 *          word_count entries. size is a font size assigned by rank: the top
 *          word gets word_max_size and each following word is smaller by
 *          (word_max_size - word_min_size) / <number of kept words>.
 */
function transformWordFraquency(words) {
    // Own keys only, so properties inherited through the prototype chain
    // never show up as words.
    var keys = words == null ? [] : Object.keys(words);
    var ranked = keys.map(function(word) {
        return {
            "text" : word,
            "size" : words[word]
        };
    });
    // Sort by count, largest first, and keep only the most frequent words
    ranked.sort(function(x, y) {
        return y['size'] - x['size'];
    });
    ranked = ranked.slice(0, Math.min(word_count, ranked.length));
    // Replace each raw count with a font size that shrinks linearly with rank
    var step = (word_max_size - word_min_size) / ranked.length;
    ranked.forEach(function(entry, rank) {
        entry['size'] = word_max_size - step * rank;
    });
    return ranked;
}
/*
 * Runs the d3-cloud layout for one word list and renders the result when the
 * layout reports "end".
 *
 * wordMap: array shaped like [{text: "", size: ""}]
 * selector: target location the cloud will be rendered into
 */
function createWordCloud(wordMap, selector) {
    // Each word already carries its font size
    var sizeOfWord = function(word) {
        return word.size;
    };
    // Every word is kept horizontal
    var keepHorizontal = function() {
        return 0;
    };
    // Receives the words handed over by the layout's "end" event
    var drawPlacedWords = function(placedWords) {
        renderWordCloud(placedWords, selector)
    };

    d3.layout.cloud()
            .size([ wordCloudWidth * 2 - 100, wordCloudHeight * 2 - 100 ])
            .words(wordMap)
            .font(font_name)
            .fontWeight(font_weight)
            .fontSize(sizeOfWord)
            .rotate(keepHorizontal)
            .on("end", drawPlacedWords)
            .start();
}
/*
 * Draws the given words as SVG <text> elements inside the element matched by
 * selector. The d3 calls below are order-sensitive: each one operates on the
 * selection returned by the previous call, so the sequence must not change.
 *
 * words: word objects carrying text, size, x, y and rotate
 * selector: target location for the new <svg>
 */
function renderWordCloud(words, selector) {
    var svg = d3.select(selector).append("svg")
            .attr("width", wordCloudWidth)
            .attr("height", wordCloudHeight);

    // Shift the drawing origin to the middle of the svg
    var centerOffset = "translate(" + wordCloudWidth / 2 + ","
            + wordCloudHeight / 2 + ")";
    var canvas = svg.append("g").attr("transform", centerOffset);

    // enter() walks the words, much like a for loop, adding one <text> each
    var labels = canvas.selectAll("text").data(words).enter().append("text");

    // NOTE(review): text-anchor is left at its default here; the d3-cloud
    // example sets it to "middle" - confirm which placement is intended.
    labels.style("font-family", font_name)
            .style("font-weight", font_weight)
            .style("font-size", function(word) {// font size
                return word.size + "px";
            })
            .style("fill", function(word, index) {// font colour
                return fill(index);
            })
            .attr("transform", function(word) {
                return "translate(" + [ word.x, word.y ] + ") rotate("
                        + word.rotate + ")";
            })
            .text(function(word) {
                return word.text;
            });
}
  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值