期末大作业07

由于nodejieba一直安装不成功,换了一个分词器——node-analyzer

命令行输入npm install node-analyzer --save即可

在server.js头部添加:

var Segmenter = require('node-analyzer');
var segmenter = new Segmenter();

 在search路由下先生成新闻表格,再顺便对每一条新闻做分词,分词结果存储在segs中:

// Build one table row per fetched news record and run the segmenter on its text.
// NOTE(review): `cnt`, `vals`, `regex`, `$` and `segmenter` are defined outside this
// snippet (surrounding route handler / module scope) — confirm their shape there.
for (var x = 0; x < cnt; ++x) {
		let newsfetch = vals[x];
        // Remove whatever `regex` matches, then every occurrence of "编辑:", from the author field.
        newsfetch.author = newsfetch.author.replace(regex, "");
		newsfetch.author = newsfetch.author.replace(/编辑:/g, "");
		// Apply the same `regex` clean-up to the article body.
		newsfetch.content = newsfetch.content.replace(regex, "");
        // Append the row; element ids are suffixed with the row index x.
        // NOTE(review): field values are interpolated into the markup without HTML
        // escaping — confirm the crawled data is trusted or escape it first.
        $("table").append(`
		<tr id="line${x}">
        <td>${x}</td>
        <td><a href="${newsfetch.url}" id="t${x}">${newsfetch.title}</a></td>
        <td id="c${x}">${newsfetch.content}</td>
        <td id="a${x}">${newsfetch.author}</td>
        <td id="s${x}">${newsfetch.source_name}</td>
        <td id="p${x}">${newsfetch.publish_date}</td>
        <td>${newsfetch.crawltime}</td>
        <td id="k${x}">${newsfetch.keywords}</td>
        <td id="d${x}">${newsfetch.description}</td>
        </tr>
		`);
        // Segment title and content as one text; `segs` is block-scoped and is not
        // used further in this version of the loop.
        let segs = segmenter.analyze(newsfetch.title + "。\n" + newsfetch.content)
}

为了给后续做图表准备数据,对分词后的结果继续处理

对新闻标题和内容中的词做词频统计

调试时卡在一个问题上:每个seg.length都是1,输出也是每行一个字。原因是analyze()返回的是以空格分隔的字符串,直接用for...of遍历字符串会逐字符迭代,需要先用split(' ')切成词数组

经过不断调试,分词的最终版本是:

// Segment title + content, cut the segmenter output on single spaces, and
// tally every token longer than one character in `wordcount`.
let segs = segmenter.analyze(newsfetch.title + "。\n" + newsfetch.content).split(' ');
segs
	.filter((word) => word.length > 1)
	.forEach((word) => {
		// First sighting starts at 1, later sightings increment the stored count.
		wordcount.set(word, wordcount.has(word) ? wordcount.get(word) + 1 : 1);
	});

为了去除停用词,在网上下载了一个停用词表

读入文件:

var fs = require('fs');
// Load the stop-word list as an array with one entry per line, so that
// `stop_words.includes(seg)` is an exact-word test. Keeping the file as a single
// string would make `.includes` a substring test and wrongly drop any word that
// merely appears inside a longer stop word.
// NOTE(review): assumes StopWords.txt holds one stop word per line — confirm.
const stop_words = fs.readFileSync("./public/StopWords.txt", "utf8")
	.replace(/^\uFEFF/, "") // drop a UTF-8 BOM if the downloaded file has one
	.split(/\r?\n/)
	.map(function (word) { return word.trim(); })
	.filter(Boolean);

在统计词频的条件里加上

!stop_words.includes(seg)

除了统计内容的词频,我们再写两个map,统计作者的出现次数和报刊的出现次数:

// Tally how often each author and each source (publication) occurs.
// Records whose author is empty or the placeholder "未知" are left out of the
// author tally only. The previous `continue` also skipped the source tally (and
// anything else later in the loop body) for those records, which skewed the
// per-source statistics.
if (newsfetch.author && newsfetch.author !== "未知") {
	authorcount.set(newsfetch.author, (authorcount.get(newsfetch.author) || 0) + 1);
}

if (newsfetch.source_name) {
	sourcecount.set(newsfetch.source_name, (sourcecount.get(newsfetch.source_name) || 0) + 1);
}

准备三个js文件,生成不同类型的可视化图表

addChart.js生成高频词直方图:

/**
 * Render a single-series category chart into the element with id `chartId`.
 *
 * @param {string[]} xElems - Category labels for the x axis.
 * @param {number[]} yElems - Values, index-aligned with `xElems`.
 * @param {string} chartId - id of the DOM element that hosts the chart.
 * @param {string} xName - Series name (first line of the tooltip).
 * @param {string} yName - Value label used in the tooltip and as legend entry.
 * @param {string} Title - Chart title text.
 * @param {string} [type_name='bar'] - ECharts series type; defaults to a bar
 *   chart so callers that omit it still get a renderable series.
 */
function addChart(xElems, yElems, chartId, xName, yName, Title, type_name = 'bar') {
	var option = {
		tooltip: {
			trigger: 'axis',
			axisPointer: {
				type: 'shadow',
				crossStyle: {
					color: '#FFD700'
				}
			},
			formatter: function(params) {
				// With trigger 'axis' ECharts passes an array (one entry per series);
				// with trigger 'item' it passes a single object. Accept both.
				var point = Array.isArray(params) ? params[0] : params;
				if (!point) {
					return "";
				}
				return point.seriesName + "<br />" + yName + ":" + point.value;
			}
		},
		title:{
			text: Title,
			x: 'center'
		},
		// NOTE(review): the legend lists yName while the series is named xName;
		// ECharts matches legend entries against series names — confirm this is intended.
		legend:{
			x: 'center',
			y: 'bottom',
			data: [yName]
		},
		grid: {
			x: 50,
			x2: 50,
			y: 50,
			y2: 100
		},
		xAxis:{
			data: xElems,
			axisLabel: {
				interval: 0,
				rotate: 0,
				// Stack the label vertically, one character per line. Array.from
				// splits by code point so surrogate pairs are not torn apart.
				formatter: function(value){
					return Array.from(String(value)).join("\n");
				}
			}
		},
		yAxis:{},
		series:[{
			data: yElems,
			type: type_name,
			name: xName
		}]
	};

	var myChart = echarts.init(document.getElementById(chartId));

	myChart.setOption(option);
}

addFloatChart.js生成文章数-日期流图:

/**
 * Render a smoothed area line chart of `yElems` over the categories `xElems`
 * into the element with id `chartId`.
 *
 * The earlier version also started a 500 ms timer that called `addData`, `date`
 * and `data`, none of which exist in this function; every tick would raise a
 * ReferenceError. It also assigned `option` without declaring it (an implicit
 * global). Both are removed; the data passed in is static, so one setOption
 * call is enough.
 *
 * @param {string[]} xElems - Category labels for the x axis (e.g. dates).
 * @param {number[]} yElems - Values, index-aligned with `xElems`.
 * @param {string} chartId - id of the DOM element that hosts the chart.
 * @param {string} xName - Unused; kept for signature parity with addChart.
 * @param {string} yName - Unused; kept for signature parity with addChart.
 * @param {string} Title - Chart title text.
 * @param {string} type_name - Unused; the series is always a line.
 */
function addFloatChart(xElems, yElems, chartId, xName, yName, Title, type_name) {
    var myChart = echarts.init(document.getElementById(chartId));
    var option = {
        xAxis: {
            type: 'category',
            boundaryGap: false,
            data: xElems
        },
        yAxis: {
            boundaryGap: [0, '50%'],
            type: 'value'
        },
        title:{
            text: Title,
            x: 'center'
        },
        series: [
            {
                name:'成交',
                type:'line',
                smooth:true,
                symbol: 'none',
                stack: 'a',
                areaStyle: {
                    normal: {}
                },
                data: yElems
            }
        ]
    };

    // Second argument true = notMerge: replace any option previously set on this chart.
    myChart.setOption(option, true);
}

addPieChart.js生成文章来源报刊占比饼图:

/**
 * Render a pie chart into the element with id `chartId` and cycle a
 * highlight + tooltip over its slices once per second.
 *
 * One slice is created per entry of `xElems`. The earlier version hard-coded
 * exactly three slices, so extra entries were dropped (while still listed in
 * the legend) and shorter inputs produced slices with undefined name/value.
 *
 * @param {string[]} xElems - Slice names (also used as legend entries).
 * @param {number[]} yElems - Slice values, index-aligned with `xElems`.
 * @param {string} chartId - id of the DOM element that hosts the chart.
 * @param {string} xName - Unused; kept for signature parity with addChart.
 * @param {string} yName - Unused; kept for signature parity with addChart.
 * @param {string} Title - Chart title text.
 * @param {string} type_name - Unused; the series is always a pie.
 */
function addPieChart(xElems, yElems, chartId, xName, yName, Title, type_name) {
    var myChart = echarts.init(document.getElementById(chartId));

    var slices = xElems.map(function (name, index) {
        return { value: yElems[index], name: name };
    });

    var option = {
        title : {
            text: Title,
            x: 'center'
        },
        tooltip: {
            trigger: 'item',
            formatter: "{a} <br/>{b} : {c} ({d}%)"
        },
        legend: {
            orient: 'vertical',
            left: 'left',
            data: xElems
        },
        series : [
            {
                name: "来源报刊",
                type: 'pie',
                radius : '55%',
                center: ['50%', '60%'],
                data: slices,
                itemStyle: {
                    emphasis: {
                        shadowBlur: 10,
                        shadowOffsetX: 0,
                        shadowColor: 'rgba(0, 0, 0, 0.5)'
                    }
                }
            }
        ]
    };

    // Second argument true = notMerge: replace any option previously set on this chart.
    myChart.setOption(option, true);

    // Nothing to highlight; also avoids `% 0` turning the index into NaN.
    if (slices.length === 0) {
        return;
    }

    var currentIndex = -1;
    setInterval(function () {
        // Un-highlight the slice highlighted on the previous tick.
        myChart.dispatchAction({
            type: 'downplay',
            seriesIndex: 0,
            dataIndex: currentIndex
        });
        currentIndex = (currentIndex + 1) % slices.length;
        // Highlight the current slice.
        myChart.dispatchAction({
            type: 'highlight',
            seriesIndex: 0,
            dataIndex: currentIndex
        });
        // Show its tooltip.
        myChart.dispatchAction({
            type: 'showTip',
            seriesIndex: 0,
            dataIndex: currentIndex
        });
    }, 1000);
}

在主界面html中引入这三个js文件并准备好对应的div:

<script type="text/javascript" src="/js/addChart.js"></script>
<script type="text/javascript" src="/js/addPieChart.js"></script>
<script type="text/javascript" src="/js/addFloatChart.js"></script>

<!-- Container for the chart drawn with chartId 'word-chart' -->
<section id="image1">
	<div id="word-chart" style="height: 100%;min-height:400px;">
	</div>
</section>


<!-- Container for the chart drawn with chartId 'date-chart' -->
<section id="image2">
	<div id="date-chart" style="height: 100%;min-height:400px;">
	</div>
</section>

<!-- Container for the chart drawn with chartId 'source-chart' -->
<section id="image3">
	<div id="source-chart" style="height: 100%;min-height:400px;">
	</div>
</section>

在server.js的search路由下:

// Serialise a server-side value as a JavaScript literal that is safe to place
// inside an inline <script>: JSON.stringify quotes/escapes strings correctly
// (words containing quotes or commas no longer break the array), and escaping
// "<" keeps a value from closing the script tag early.
// NOTE(review): assumes topWords/topHeats/xdates/ynums/topComes/topGoods are arrays — confirm.
const toScriptLiteral = (value) => JSON.stringify(value).replace(/</g, "\\u003c");

// Inject the chart bootstrap calls. All three helpers share the signature
// (xElems, yElems, chartId, xName, yName, Title, type_name), so the title must be
// the sixth argument; previously the date and pie charts received it in the
// xName slot and rendered without a title, and the word chart got no series type.
$("body").append(`
<script type="text/javascript">
	addChart(${toScriptLiteral(topWords)}, ${toScriptLiteral(topHeats)}, 'word-chart', '高频词', '热度', 'top20高频词', 'bar');
	addFloatChart(${toScriptLiteral(xdates)}, ${toScriptLiteral(ynums)}, 'date-chart', '日期', '新闻数', '爬取新闻数随日期变化趋势');
	addPieChart(${toScriptLiteral(topComes)}, ${toScriptLiteral(topGoods)}, 'source-chart', '来源报刊', '新闻数', '爬取新闻来源报刊占比');
</script>
`);

目前效果如下图:

 

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
NodeJieba "结巴"分词的Node.js版本Introduction NodeJieba只是CppJieba简单包装而成的node扩展,用来进行中文分词。 详见NodeJiebaBlogInstallnpm install nodejieba 因为npm速度很慢而且经常因为墙的原因出现莫名其妙的问题,在此强烈建议使用cnpm,命令如下:npm --registry=http://r.cnpmjs.org install nodejieba默认分词算法初始化var segment = require("nodejieba"); segment.loadDict("./node_modules/nodejieba/dict/jieba.dict.utf8", "./node_modules/nodejieba/dict/hmm_model.utf8");阻塞式调用var wordList = segment.cutSync("阻塞模式分词"); if (wordList.constructor == Array) // just for tutorial, this is always be true  {     wordList.forEach(function(word) { console.log(word);          }); }非阻塞式调用segment.cut("非阻塞模式分词", function(wordList) {     wordList.forEach(function(word) { console.log(word);          }); });初始化var segment = require("nodejieba"); segment.queryLoadDict("./node_modules/nodejieba/dict/jieba.dict.utf8", "./node_modules/nodejieba/dict/hmm_model.utf8");阻塞式调用var wordList = segment.queryCutSync("阻塞模式分词"); if (wordList.constructor == Array) // just for tutorial, this is always be true  {     wordList.forEach(function(word) { console.log(word);          }); }非阻塞式调用segment.queryCut("非阻塞模式分词", function(wordList) {     wordList.forEach(function(word) { console.log(word);          }); }); 具体用法可以参考 test/segment.js test/query_segment.jsTesting 在node v0.10.2下测试通过http://cppjieba-webdemo.herokuapp.com/ (chrome is suggested)ThanksJieba中文分词 标签:nodejieba

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值