由于nodejieba一直安装不成功,换了一个分词器——node-analyzer
在命令行输入 npm install node-analyzer --save 即可
在server.js头部添加:
var Segmenter = require('node-analyzer');
var segmenter = new Segmenter();
在search路由下先生成新闻表格,再顺便对每一条新闻做分词,分词结果存储在segs中:
// Escapes the characters that are special in HTML text and in double-quoted
// attribute values, so crawled fields are inserted as text and can never
// break the row markup or inject tags/scripts into the page.
const escapeHtml = (value) =>
  String(value)
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;")
    .replace(/'/g, "&#39;");

// Append one table row per fetched news record, then segment its text.
for (let x = 0; x < cnt; ++x) {
  const newsfetch = vals[x];
  // Clean the record in place: the cleaned (unescaped) values are what the
  // segmentation below and any later statistics read.
  newsfetch.author = newsfetch.author.replace(regex, "").replace(/编辑:/g, "");
  newsfetch.content = newsfetch.content.replace(regex, "");
  // Cell ids (t/c/a/s/p/k/d + row index) are kept so each cell stays addressable.
  $("table").append(`
<tr id="line${x}">
<td>${x}</td>
<td><a href="${escapeHtml(newsfetch.url)}" id="t${x}">${escapeHtml(newsfetch.title)}</a></td>
<td id="c${x}">${escapeHtml(newsfetch.content)}</td>
<td id="a${x}">${escapeHtml(newsfetch.author)}</td>
<td id="s${x}">${escapeHtml(newsfetch.source_name)}</td>
<td id="p${x}">${escapeHtml(newsfetch.publish_date)}</td>
<td>${escapeHtml(newsfetch.crawltime)}</td>
<td id="k${x}">${escapeHtml(newsfetch.keywords)}</td>
<td id="d${x}">${escapeHtml(newsfetch.description)}</td>
</tr>
`);
  // Segment title and body together; "。\n" keeps the two from running into
  // each other. The raw analyzer result is stored as-is in segs.
  let segs = segmenter.analyze(newsfetch.title + "。\n" + newsfetch.content);
}
为了给后续做图表准备数据,对分词后的结果继续处理
对新闻标题和内容中的词做词频统计
调试时卡在一个问题上:seg.length 始终是 1,输出也是每行一个字。原因是 analyze 返回的是用空格分隔的字符串,直接用 for...of 遍历字符串会逐字符迭代,所以需要先用 split(' ') 切成词数组
经过不断调试,分词的最终版本是:
// Segment the headline plus body text and cut the analyzer output on single
// spaces to obtain the individual tokens.
let segs = segmenter.analyze(newsfetch.title + "。\n" + newsfetch.content).split(' ');
// Tally every token longer than one character in the wordcount map; shorter
// tokens (single characters, empty strings) are ignored.
segs
  .filter((token) => token.length > 1)
  .forEach((token) => {
    const seenSoFar = wordcount.has(token) ? wordcount.get(token) : 0;
    wordcount.set(token, seenSoFar + 1);
  });
为了去除停用词,在网上下载了一个停用词表
读入文件:
var fs = require('fs');
// Stop-word list loaded once at startup (the relative path is resolved
// against the process working directory).
// The file is parsed into an array of trimmed, non-empty lines so that
// `stop_words.includes(seg)` is an exact whole-word match. Keeping the file as
// one big string would make `includes` a substring search, which also drops
// any word that merely appears inside a longer stop word.
// NOTE(review): assumes one stop word per line, UTF-8 encoded — confirm against the downloaded file.
const stop_words = fs.readFileSync("./public/StopWords.txt", "utf8")
  .replace(/^\uFEFF/, "") // strip a leading byte-order mark if the file has one
  .split(/\r?\n/)
  .map((word) => word.trim())
  .filter((word) => word.length > 0);
在统计词频的条件里加上
!stop_words.includes(seg)
除了统计内容的词频,我们再写两个map,统计作者的出现次数和报刊的出现次数:
// Tally the article's author, leaving out empty values and the "未知"
// (unknown) placeholder. The placeholder is filtered in this condition rather
// than with `continue`, because `continue` would also skip the source tally
// below (and the rest of the loop body) for that article.
if (newsfetch.author && newsfetch.author !== "未知") {
  authorcount.set(newsfetch.author, (authorcount.get(newsfetch.author) || 0) + 1);
}
// Tally the publication the article came from.
if (newsfetch.source_name) {
  sourcecount.set(newsfetch.source_name, (sourcecount.get(newsfetch.source_name) || 0) + 1);
}
准备三个js文件,生成不同类型的可视化图表
addChart.js生成高频词直方图:
/**
 * Draws a single-series category chart with ECharts inside an existing element.
 *
 * Relies on the globals `echarts` and `document`.
 *
 * @param {string[]} xElems - Category labels for the x axis.
 * @param {number[]} yElems - One value per label.
 * @param {string} chartId - id of the container element.
 * @param {string} xName - Series name (also shown in the legend and tooltip).
 * @param {string} yName - Label printed in front of the value in the tooltip.
 * @param {string} Title - Chart title.
 * @param {string} [type_name='bar'] - ECharts series type; defaults to a bar
 *   chart so callers that omit it still get a rendered series.
 */
function addChart(xElems, yElems, chartId, xName, yName, Title, type_name = 'bar') {
  const option = {
    tooltip: {
      trigger: 'axis',
      axisPointer: {
        type: 'shadow',
        crossStyle: {
          color: '#FFD700'
        }
      },
      formatter: function (params) {
        // With trigger 'axis' ECharts passes an array (one entry per series);
        // item-style triggers pass a single object. Accept both.
        const point = Array.isArray(params) ? params[0] : params;
        if (!point) {
          return '';
        }
        return point.seriesName + "<br />" + yName + ":" + point.value;
      }
    },
    title: {
      text: Title,
      x: 'center'
    },
    legend: {
      x: 'center',
      y: 'bottom',
      // Legend entries are matched against series names, so this must be the
      // same name the series below uses.
      data: [xName]
    },
    grid: {
      x: 50,
      x2: 50,
      y: 50,
      y2: 100
    },
    xAxis: {
      data: xElems,
      axisLabel: {
        interval: 0,
        rotate: 0,
        // Stack the label one character per line so every label stays
        // visible; Array.from keeps surrogate pairs intact.
        formatter: function (value) {
          return Array.from(String(value)).join("\n");
        }
      }
    },
    yAxis: {},
    series: [{
      data: yElems,
      type: type_name,
      name: xName
    }]
  };
  const myChart = echarts.init(document.getElementById(chartId));
  myChart.setOption(option);
}
addFloatChart.js生成文章数-日期流图:
/**
 * Draws a smoothed, filled line chart (value over category) with ECharts
 * inside an existing element.
 *
 * Relies on the globals `echarts` and `document`. The chart is static: it is
 * rendered once from the arrays passed in. The earlier periodic refresh called
 * `addData` and read `date` / `data`, none of which this script defines, so
 * every tick threw a ReferenceError; it has been removed.
 *
 * @param {string[]} xElems - Category labels for the x axis (e.g. dates).
 * @param {number[]} yElems - One value per label.
 * @param {string} chartId - id of the container element.
 * @param {string} xName - Unused; kept so the signature matches addChart.
 * @param {string} [yName] - Series name; falls back to the previous
 *   hard-coded name when omitted.
 * @param {string} Title - Chart title.
 * @param {string} [type_name] - Unused; the series is always a line.
 */
function addFloatChart(xElems, yElems, chartId, xName, yName, Title, type_name) {
  const myChart = echarts.init(document.getElementById(chartId));
  // Declared locally: assigning an undeclared `option` would create a global
  // shared (and overwritten) across every chart helper on the page.
  const option = {
    xAxis: {
      type: 'category',
      boundaryGap: false,
      data: xElems
    },
    yAxis: {
      boundaryGap: [0, '50%'],
      type: 'value'
    },
    title: {
      text: Title,
      x: 'center'
    },
    series: [
      {
        name: yName !== undefined ? yName : '成交',
        type: 'line',
        smooth: true,
        symbol: 'none',
        stack: 'a',
        areaStyle: {
          normal: {}
        },
        data: yElems
      }
    ]
  };
  // Second argument `true`: replace any previous option instead of merging.
  myChart.setOption(option, true);
}
addPieChart.js生成文章来源报刊占比饼图:
/**
 * Draws a pie chart with ECharts inside an existing element and cycles the
 * highlight/tooltip through its slices once per second.
 *
 * Relies on the globals `echarts`, `document` and `setInterval`.
 *
 * @param {string[]} xElems - Slice names (also used as legend entries).
 * @param {number[]} yElems - One value per slice name.
 * @param {string} chartId - id of the container element.
 * @param {string} xName - Unused; kept so the signature matches addChart.
 * @param {string} yName - Unused; kept so the signature matches addChart.
 * @param {string} Title - Chart title.
 * @param {string} type_name - Unused; the series is always a pie.
 */
function addPieChart(xElems, yElems, chartId, xName, yName, Title, type_name) {
  const myChart = echarts.init(document.getElementById(chartId));
  // One slice per label, however many labels were passed, so the slices
  // always agree with the legend (which lists every label).
  const slices = xElems.map((label, index) => ({ value: yElems[index], name: label }));
  // 指定图表的配置项和数据 -> chart configuration and data
  const option = {
    title: {
      text: Title,
      x: 'center'
    },
    tooltip: {
      trigger: 'item',
      formatter: "{a} <br/>{b} : {c} ({d}%)"
    },
    legend: {
      orient: 'vertical',
      left: 'left',
      data: xElems
    },
    series: [
      {
        name: "来源报刊",
        type: 'pie',
        radius: '55%',
        center: ['50%', '60%'],
        data: slices,
        itemStyle: {
          emphasis: {
            shadowBlur: 10,
            shadowOffsetX: 0,
            shadowColor: 'rgba(0, 0, 0, 0.5)'
          }
        }
      }
    ]
  };
  // Second argument `true`: replace any previous option instead of merging.
  myChart.setOption(option, true);

  // Nothing to cycle through; also avoids `% 0` producing NaN indices.
  if (slices.length === 0) {
    return;
  }

  let currentIndex = -1;
  setInterval(() => {
    // Remove the highlight from the previously emphasized slice.
    myChart.dispatchAction({
      type: 'downplay',
      seriesIndex: 0,
      dataIndex: currentIndex
    });
    currentIndex = (currentIndex + 1) % slices.length;
    // Highlight the current slice.
    myChart.dispatchAction({
      type: 'highlight',
      seriesIndex: 0,
      dataIndex: currentIndex
    });
    // Show its tooltip.
    myChart.dispatchAction({
      type: 'showTip',
      seriesIndex: 0,
      dataIndex: currentIndex
    });
  }, 1000);
}
在主界面html中引入这三个js文件并准备好对应的div:
<!-- Chart helper scripts: each file defines one global function (addChart, addPieChart, addFloatChart). -->
<!-- NOTE(review): the helpers call the global `echarts`, so the ECharts library must be loaded before they run; its script tag is not part of this fragment, so confirm it is included. -->
<script type="text/javascript" src="/js/addChart.js"></script>
<script type="text/javascript" src="/js/addPieChart.js"></script>
<script type="text/javascript" src="/js/addFloatChart.js"></script>
<!-- Container the word-frequency chart is drawn into (its id is passed to addChart as chartId). -->
<section id="image1">
<div id="word-chart" style="height: 100%;min-height:400px;">
</div>
</section>
<!-- Container for the articles-per-date chart (id passed to addFloatChart). -->
<section id="image2">
<div id="date-chart" style="height: 100%;min-height:400px;">
</div>
</section>
<!-- Container for the source-share pie chart (id passed to addPieChart). -->
<section id="image3">
<div id="source-chart" style="height: 100%;min-height:400px;">
</div>
</section>
在server.js的search路由下:
// Serializes a list of labels as a JavaScript array-of-strings literal that is
// safe to embed in the inline <script> below: JSON.stringify escapes quotes
// and backslashes, and "<" is written as \u003c so a label can never close the
// script element. Non-array input keeps the earlier comma-splitting behavior.
const toScriptStringArray = (items) => {
  const labels = Array.isArray(items) ? items.map(String) : String(items).split(',');
  return JSON.stringify(labels).replace(/</g, '\\u003c');
};

// Inject the chart calls at the end of <body>, after the chart containers.
// Every helper takes (xElems, yElems, chartId, xName, yName, Title, type_name),
// so the title must be the 6th argument; addChart also gets its series type.
$("body").append(`
<script type="text/javascript">
addChart(${toScriptStringArray(topWords)}, [${topHeats.toString()}], 'word-chart', '高频词', '热度', 'top20高频词', 'bar');
addFloatChart(${toScriptStringArray(xdates)}, [${ynums.toString()}], 'date-chart', '日期', '新闻数', '爬取新闻数随日期变化趋势');
addPieChart(${toScriptStringArray(topComes)}, [${topGoods.toString()}], 'source-chart', '来源报刊', '新闻数', '爬取新闻来源报刊占比');
</script>
`);
目前效果如下图: