Node.js实现网络新闻爬虫及搜索增加功能(二):可视化及查询优化
Node.js实现网络新闻爬虫及搜索增加功能(二):可视化及查询优化
系列文章查看不到可能是CSDN审核原因,可以在我的知乎专栏看到所有文章:https://www.zhihu.com/column/c_1370026160999415808
项目要求
基于第一个项目爬虫爬取的数据,完成数据展示网站。
基本要求:
1、用户可注册登录网站,非注册用户不可登录查看数据
2、用户注册、登录、查询等操作记入数据库中的日志
3、爬虫数据查询结果列表支持分页和排序
4、用Echarts或者D3实现3个以上的数据分析图表展示在网站中
5、实现一个管理端界面,可以查看(查看用户的操作记录)和管理(停用启用)注册用户。
扩展要求(非必须):
1、实现对爬虫数据中文分词的查询
2、实现查询结果按照主题词打分的排序
3、用Elastic Search+Kibana展示爬虫的数据结果
本文是该项目增加功能第二部分:可视化及查询优化。包括实现网站数据可视化、支持中文分词查询、对查询结果按照时间排序、对查询结果分页等功能。
二、支持分词查询、查询结果排序和分页及网站数据可视化
1. 网站数据可视化
首先增加数据可视化页面注册路由:
// Serve the static page that hosts the data-visualisation charts.
app.get('/charts.html', function(req, res) {
    var charts_page = __dirname + "/templates/charts.html";
    res.sendFile(charts_page);
});
分别选取了网站新闻数据和用户操作数据进行统计,并使用ECharts展示在前端页面当中。
其中,网站新闻数据统计了爬取到的新闻条数与时间的关系,使用折线图展示,以及爬取到的新闻条数与新闻来源的关系,使用条形图展示。
用户行为数据统计了用户登录次数与时间的关系,使用折线图展示,以及用户搜索次数与时间的关系,使用折线图展示。
四个图表均在页面渲染时设置,前端渲染代码为:
<script type="text/javascript">
/**
 * Page bootstrap for charts.html.
 *
 * Redirects visitors without a cookie to the login page, hides the menu
 * entries `li#0` / `li#1` unless the cookie value is '000', and then loads
 * four charts by requesting `{x: [...], y: [...]}` from the server and handing
 * the result to ECharts.
 */
$(document).ready(function() {
    // Stop here for unauthenticated visitors: the original kept going after the
    // redirect and still issued all four chart requests.
    if (document.cookie == "") {
        alert("请登录!");
        window.location = "/login.html";
        return;
    }
    if (document.cookie.split('=')[1] != '000') {
        $("li#0").hide();
        $("li#1").hide();
    }

    // Build the four dataZoom controls (x slider, x inside, y slider, y inside).
    function zoom_controls(x_start, x_end, y_end) {
        return [
            {type: 'slider', start: x_start, end: x_end},
            {type: 'inside', start: 30, end: 80},
            {type: 'slider', yAxisIndex: 0, start: 30, end: y_end},
            {type: 'inside', yAxisIndex: 0, start: 30, end: y_end}
        ];
    }

    // Turn a chart description plus the server payload into an ECharts option.
    function build_option(spec, data) {
        var series = {
            name: spec.name,
            type: spec.type,
            data: data.y
        };
        if (spec.point_color) {
            series.itemStyle = {
                normal: {
                    color: spec.point_color,
                    lineStyle: {
                        color: spec.line_color
                    }
                }
            };
        }
        var option = {
            legend: {
                data: [spec.name]
            },
            xAxis: {
                data: data.x
            },
            yAxis: {},
            tooltip: {
                trigger: 'item'
            },
            series: [series]
        };
        if (spec.data_zoom) {
            option.dataZoom = spec.data_zoom;
        }
        return option;
    }

    // Initialise one chart and fill it once its data request succeeds.
    function draw_chart(spec) {
        var chart = echarts.init(document.getElementById(spec.element_id));
        $.ajax({
            type: "GET",
            url: spec.url,
            data: {
            },
            success: function(data){
                chart.setOption(build_option(spec, data));
            },
            // NOTE(review): reloading the same page on failure will retry the
            // request; if the endpoint keeps failing this loops. Kept as in the
            // original - confirm whether the reload is wanted.
            error: function(XMLHttpRequest, textStatus, errorThrown) {
                alert(XMLHttpRequest.status);
                alert(XMLHttpRequest.readyState);
                alert(textStatus);
                window.location = "/charts.html";
            }
        });
    }

    // News count per date (line chart with zoom controls).
    draw_chart({
        element_id: 'date-num',
        url: "/chart_date_num",
        name: '新闻条数',
        type: 'line',
        point_color: 'black',
        line_color: 'blue',
        data_zoom: zoom_controls(60, 100, 100)
    });
    // News count per source (bar chart with zoom controls).
    draw_chart({
        element_id: 'pos-num',
        url: "/chart_pos_num",
        name: '新闻条数',
        type: 'bar',
        data_zoom: zoom_controls(10, 60, 80)
    });
    // Login count over time.
    draw_chart({
        element_id: 'login-time',
        url: "/chart_login_time",
        name: '登录次数',
        type: 'line',
        point_color: 'grey',
        line_color: 'yellow'
    });
    // Search count over time.
    draw_chart({
        element_id: 'search-time',
        url: "/chart_search_time",
        name: '搜索次数',
        type: 'line',
        point_color: 'grey',
        line_color: 'green'
    });
});
/**
 * Ask the server to drop the current search state, then open the news list.
 * The news page is opened whether the request succeeds or fails; on failure
 * the HTTP status, ready state and status text are shown first.
 */
function clear_search() {
    var open_news_page = function() {
        window.location = "/news.html";
    };
    var request = {
        type: "GET",
        url: "/clear_search",
        data: {},
        success: function(data){
            open_news_page();
        },
        error: function(XMLHttpRequest, textStatus, errorThrown) {
            var details = [XMLHttpRequest.status, XMLHttpRequest.readyState, textStatus];
            for (var k = 0; k < details.length; k++) {
                alert(details[k]);
            }
            open_news_page();
        }
    };
    $.ajax(request);
}
</script>
注意到该页面首先判断了cookie信息是否正确,只有当cookie中储存了手机号时才可以继续渲染该页面。
对应的后端路由代码为:
/**
 * GET /chart_date_num
 * Responds with `{x: [...], y: [...]}` where x holds the first 10 characters of
 * each row's `date` (stringified) and y the number of news rows for that date.
 * Responds with HTTP 500 and `{code: 0, msg}` when the query fails.
 */
app.get('/chart_date_num', function(req, res) {
    // Local variable: the original assigned an undeclared name here.
    var news_search_sql = "select date, count(*) as cnt from news group by date order by date";
    crawler_sql.query(news_search_sql, function(err, result, fields) {
        // Without this guard a failed query left `result` undefined and the
        // loop below threw inside the callback.
        if (err || !result) {
            res.status(500).json({code: 0, msg: "chart query failed"});
            return;
        }
        var x = [];
        var y = [];
        for (var i = 0; i < result.length; i++) {
            // NOTE(review): if `date` is returned as a JS Date, toString() starts
            // with the weekday and slice(0, 10) drops the year - confirm the type.
            x.push(result[i].date.toString().slice(0, 10));
            y.push(result[i].cnt);
        }
        res.json({x: x, y: y});
    });
});
/**
 * GET /chart_pos_num
 * Responds with `{x: [...], y: [...]}` where x holds each news `source` and y
 * the number of news rows from that source.
 * Responds with HTTP 500 and `{code: 0, msg}` when the query fails.
 */
app.get('/chart_pos_num', function(req, res) {
    // Local variable: the original assigned an undeclared name here.
    var news_search_sql = "select source, count(*) as cnt from news group by source order by source";
    crawler_sql.query(news_search_sql, function(err, result, fields) {
        // Without this guard a failed query left `result` undefined and the
        // loop below threw inside the callback.
        if (err || !result) {
            res.status(500).json({code: 0, msg: "chart query failed"});
            return;
        }
        var x = [];
        var y = [];
        for (var i = 0; i < result.length; i++) {
            x.push(result[i].source);
            y.push(result[i].cnt);
        }
        res.json({x: x, y: y});
    });
});
/**
 * GET /chart_login_time
 * Counts `useraction` rows whose action contains "login successfully" and
 * responds with `{x: [...], y: [...]}`: x is the first 10 characters of the
 * stringified `time`, y the summed count for that label. Rows arrive ordered by
 * time, so consecutive rows with the same label are merged.
 * Responds with HTTP 500 and `{code: 0, msg}` when the query fails.
 */
app.get('/chart_login_time', function(req, res) {
    var useraction_search_sql = "select time, count(*) as cnt from useraction where action like '%login successfully%' group by time order by time";
    crawler_sql.query(useraction_search_sql, function(err, result, fields) {
        if (err || !result) {
            res.status(500).json({code: 0, msg: "chart query failed"});
            return;
        }
        var x = [];
        var y = [];
        // Walk every row (the original stopped one row early and lost the last
        // one) and fold it into the previous bucket when the label repeats.
        // An empty result now yields empty arrays instead of [undefined].
        for (var i = 0; i < result.length; i++) {
            var label = result[i].time.toString().slice(0, 10);
            var last = x.length - 1;
            if (last >= 0 && x[last] == label) {
                y[last] += result[i].cnt;
            } else {
                x.push(label);
                y.push(result[i].cnt);
            }
        }
        res.json({x: x, y: y});
    });
});
/**
 * GET /chart_search_time
 * Counts `useraction` rows whose action contains "search" and responds with
 * `{x: [...], y: [...]}`: x is the first 10 characters of the stringified
 * `time`, y the summed count for that label. Rows arrive ordered by time, so
 * consecutive rows with the same label are merged.
 * Responds with HTTP 500 and `{code: 0, msg}` when the query fails.
 */
app.get('/chart_search_time', function(req, res) {
    var useraction_search_sql = "select time, count(*) as cnt from useraction where action like '%search%' group by time order by time";
    crawler_sql.query(useraction_search_sql, function(err, result, fields) {
        if (err || !result) {
            res.status(500).json({code: 0, msg: "chart query failed"});
            return;
        }
        var x = [];
        var y = [];
        // Walk every row (the original stopped one row early and lost the last
        // one) and fold it into the previous bucket when the label repeats.
        // An empty result now yields empty arrays instead of [undefined].
        for (var i = 0; i < result.length; i++) {
            var label = result[i].time.toString().slice(0, 10);
            var last = x.length - 1;
            if (last >= 0 && x[last] == label) {
                y[last] += result[i].cnt;
            } else {
                x.push(label);
                y.push(result[i].cnt);
            }
        }
        res.json({x: x, y: y});
    });
});
2. 支持中文分词查询
中文分词查询支持OR和AND连接查询。选择NONE时仍为单个关键词查询。
前端需要对输入的关键词进行一次判断,NONE时不得输入两个关键词,OR或AND时必须输入两个关键词,相应的前端交互代码如下:
/**
 * Validate the search form for the given type and submit it to /search_news.
 *
 * @param search_type 'title' or 'content'; selects which inputs are read.
 *
 * Rules checked before sending: the first keyword is required; a second
 * keyword is required when a connective other than NONE is chosen; a second
 * keyword is rejected when the connective is NONE. On success the server
 * message is shown and the browser moves to the news list (code 1) or back to
 * the search page.
 */
function search_news(search_type){
    // Which group of form controls to read; stays empty for unknown types so
    // every value below keeps its "" default.
    var group = "";
    if (search_type == 'title') {
        group = 'title';
    } else if (search_type == 'content') {
        group = 'content';
    }

    var search_info = "";
    var search_info1 = "";
    var search_bool = "";
    var search_sort = "";
    if (group != "") {
        search_info = $('input#' + group).val();
        search_info1 = $('input#' + group + '1').val();
        search_bool = $("#" + group + "_bool").val();
        search_sort = $("#" + group + "_sort").val();
    }

    if (search_info == "") {
        alert("请输入查询信息!");
        return;
    }
    var wants_second = (search_bool != "NONE");
    if (wants_second && search_info1 == "") {
        alert("请输入连接查询信息!");
        return;
    }
    if (!wants_second && search_info1 != "") {
        alert("请选择连接查询!");
        return;
    }

    var payload = {
        search_type: search_type,
        search_info: search_info,
        search_info1: search_info1,
        search_bool: search_bool,
        search_sort: search_sort
    };
    $.ajax({
        type: "GET",
        url: "/search_news",
        data: payload,
        success: function(data){
            alert(data.msg);
            window.location = (data.code == 1) ? "/news.html" : "/search.html";
        },
        error: function(XMLHttpRequest, textStatus, errorThrown) {
            alert(XMLHttpRequest.status);
            alert(XMLHttpRequest.readyState);
            alert(textStatus);
            window.location = "/search.html";
        }
    });
}
需要修改搜索接口:
// Build the news search statement for one keyword (NONE) or two keywords
// joined with AND / OR. Both keywords are matched against the same column,
// chosen by search_type. The original matched the second keyword of a
// *content* search against `title`, a copy-paste slip.
// news_search_sql is left untouched for any other search_type / search_bool.
//
// NOTE(review): search_info / search_info1 are concatenated straight into the
// SQL text. If they come from the request they allow SQL injection; switch to
// the database driver's placeholder/escaping support once crawler_sql's API is
// confirmed.
if (search_type == 'title' || search_type == 'content') {
    var search_column = (search_type == 'title') ? "title" : "content";
    var search_select = "select id_news, url, source, title, author, date from news where ";
    var first_match = search_column + " like '%" + search_info + "%'";
    var second_match = search_column + " like '%" + search_info1 + "%'";
    if (search_bool == 'NONE') {
        news_search_sql = search_select + first_match;
    } else if (search_bool == 'AND') {
        news_search_sql = search_select + first_match + " and " + second_match;
    } else if (search_bool == 'OR') {
        news_search_sql = search_select + first_match + " or " + second_match;
    }
}
3. 支持查询结果排序
查询结果支持按照时间升序或降序排序。选择随机时不添加排序条件,结果按数据库默认顺序返回。
需要修改搜索接口:
// Append an ORDER BY clause for the two time-based sort choices; any other
// choice leaves the statement as it is.
var order_clause = "";
if (search_sort == "按照时间升序") {
    order_clause = " order by date";
} else if (search_sort == "按照时间降序") {
    order_clause = " order by date DESC";
}
if (order_clause != "") {
    news_search_sql += order_clause;
}
4. 支持查询结果分页
对查询结果进行动态分页,避免查询结果较多时页面展示不友好。
这里分页的跳转与逻辑实现没有使用任何框架,全部为原生实现,需要与后端进行交互。
修改新闻展示页面渲染路由如下:
/**
 * GET /show_news
 * Returns the current page of news for the news list page.
 *
 * Reads the shared variables search_info_past, search_type_past, news_list,
 * page_num, max_page and min_news_per_page, which are defined outside this
 * handler. When there is no cached result list or no remembered search, all
 * news are loaded and cached in news_list; otherwise the cached list is paged
 * and a per-date count for the remembered keyword is attached.
 *
 * Response: {search_info, search_type, news_list, page_num, pages,
 * news_statistics}. news_statistics is always an array now; the original
 * omitted it in several branches and the page script then failed on
 * `data.news_statistics.length`. Query failures answer HTTP 500 with
 * {code: 0, msg}.
 *
 * NOTE(review): the search state lives in shared variables, so it appears to
 * be shared by every visitor - confirm whether per-user state is needed.
 */
app.get('/show_news', function(req, res) {
    var search_info = search_info_past;
    var search_type = search_type_past;

    // Cut the requested page out of all_news and build the response body.
    function build_page(all_news, statistics) {
        var total = all_news.length;
        var pages;
        var first;
        var last;
        if (total > max_page * min_news_per_page) {
            // More news than max_page pages of the minimum size: use exactly
            // max_page pages and put the remainder on the first page.
            var news_per_page = Math.floor(total / max_page);
            var begin_news = news_per_page + total % max_page;
            pages = max_page;
            if (page_num == 1) {
                first = 0;
                last = begin_news;
            } else {
                first = begin_news + (page_num - 2) * news_per_page;
                last = first + news_per_page;
            }
        } else {
            // Fixed-size pages; slice() clamps the end index on the last page.
            pages = Math.ceil(total / min_news_per_page);
            first = (page_num - 1) * min_news_per_page;
            last = page_num * min_news_per_page;
        }
        return {
            search_info: search_info,
            search_type: search_type,
            news_list: all_news.slice(first, last),
            page_num: page_num,
            pages: pages,
            news_statistics: statistics
        };
    }

    function fail() {
        res.status(500).json({code: 0, msg: "news query failed"});
    }

    if (news_list.length == 0 || search_info_past == "") {
        var all_news_sql = "select id_news, url, source, title, author, date from news";
        search_info_past = "";
        search_type_past = "";
        crawler_sql.query(all_news_sql, function(err, result, fields) {
            if (err || !result) {
                fail();
                return;
            }
            news_list = result;
            res.json(build_page(result, []));
        });
        return;
    }

    // Only the two known columns are searched; anything else has no statistics
    // (the original ran an empty SQL string in that case).
    var stats_column = "";
    if (search_type == 'title') {
        stats_column = "title";
    } else if (search_type == 'content') {
        stats_column = "content";
    }
    if (stats_column == "") {
        res.json(build_page(news_list, []));
        return;
    }
    // NOTE(review): search_info is concatenated into the SQL text and so allows
    // SQL injection; switch to the driver's placeholder/escaping support once
    // crawler_sql's API is confirmed.
    var news_search_sql = "select date, count(*) as cnt from news where " + stats_column + " like '%" + search_info + "%' group by date order by date";
    crawler_sql.query(news_search_sql, function(err, result, fields) {
        if (err || !result) {
            fail();
            return;
        }
        res.json(build_page(news_list, result));
    });
});
相应的前端渲染交互代码如下:
/**
 * Page bootstrap for news.html.
 *
 * Redirects visitors without a cookie to the login page, hides the menu
 * entries `li#0` / `li#1` unless the cookie value is '000', then requests
 * /show_news and renders the news rows, the page navigation and, for a search,
 * the per-date statistics table.
 */
$(document).ready(function() {
    // Escape text before it is concatenated into markup: titles, sources and
    // authors come from crawled pages and search_info from user input.
    function escape_html(value) {
        return String(value)
            .replace(/&/g, "&amp;")
            .replace(/</g, "&lt;")
            .replace(/>/g, "&gt;")
            .replace(/"/g, "&quot;")
            .replace(/'/g, "&#39;");
    }

    // Stop here for unauthenticated visitors: the original kept going after
    // the redirect and still requested the news list.
    if (document.cookie == "") {
        alert("请登录!");
        window.location = "/login.html";
        return;
    }
    if (document.cookie.split('=')[1] != '000') {
        $("li#0").hide();
        $("li#1").hide();
    }
    var url = "/show_news";
    $.ajax({
        type: "GET",
        url: url,
        data: {
        },
        success: function(data){
            for (var i = 0; i < data.news_list.length; i++) {
                var news = data.news_list[i];
                var news_id = escape_html(news.id_news);
                var begin_str = "<tr id='" + news_id + "'>";
                var source_str = "<td class='text-left'>" + escape_html(news.source) + "</td>";
                var title_str = "<td class='text-left'><a href='" + escape_html(news.url) + "'>" + escape_html(news.title.slice(0, 10)) + "..." + "</a></td>";
                var author_str = "<td class='text-left'>" + escape_html(news.author) + "</td>";
                var date_str = "<td class='text-left'>" + escape_html(news.date.slice(0, 10)) + "</td>";
                // The attribute must be spelled with a Latin "o": the original
                // text carried a Greek omicron, so the handler never fired.
                var info_str = "<td class='text-right'><i class='fa fa-paper-plane'></i>  <input type='submit' value='详情' onclick='news_info(" + news_id + ")' /></td>";
                $("#tbody-title").append($(begin_str + source_str + title_str + author_str + date_str + info_str + "</tr>"));
            }
            for (var p = 1; p <= data.pages; p++) {
                // Highlight the entry of the page currently shown.
                var item_class = (p == data.page_num) ? "page-item active" : "page-item";
                var nav_str = "<li class='" + item_class + "'><a class='page-link' onclick='goto_page(" + p + ")'>" + p + "</a></li>";
                $("#navs").append($(nav_str));
            }
            if (data.search_info == "") {
                $("#statistics").hide();
                $("#tab-content-1").hide();
                $('#special').hide();
            } else {
                $('#normal').hide();
                $('#special').html("搜索内容: " + escape_html(data.search_info) + "<br>搜索类型: " + escape_html(data.search_type));
                // The server does not always send statistics; treat a missing
                // list as empty instead of throwing.
                var statistics = data.news_statistics || [];
                for (var s = 0; s < statistics.length; s++) {
                    var news_date = "<td class='text-left'>" + escape_html(statistics[s].date.slice(0, 10)) + "</td>";
                    var news_cnt = "<td class='text-left'>" + escape_html(statistics[s].cnt.toString()) + "</td>";
                    $("#tbody-statistics").append($("<tr>" + news_date + news_cnt + "<td class='text-left'></td><td class='text-left'></td></tr>"));
                }
            }
        },
        // NOTE(review): reloading the same page on failure retries the request;
        // if /show_news keeps failing this loops. Kept as in the original.
        error: function(XMLHttpRequest, textStatus, errorThrown) {
            alert(XMLHttpRequest.status);
            alert(XMLHttpRequest.readyState);
            alert(textStatus);
            window.location = "/news.html";
        }
    });
});
相应的前端跳转交互代码如下:
/**
 * Tell the server which result page to show next, then reload the news list.
 *
 * @param page_num page number to switch to; sent as the `page_num` query value.
 *
 * The news page is reloaded whether the request succeeds or fails; on failure
 * the HTTP status, ready state and status text are shown first.
 */
function goto_page(page_num) {
    var reload_news = function() {
        window.location = "/news.html";
    };
    var request = {
        type: "GET",
        url: "/news_page",
        data: {
            page_num: page_num
        },
        success: function(data){
            reload_news();
        },
        error: function(XMLHttpRequest, textStatus, errorThrown) {
            var details = [XMLHttpRequest.status, XMLHttpRequest.readyState, textStatus];
            for (var k = 0; k < details.length; k++) {
                alert(details[k]);
            }
            reload_news();
        }
    };
    $.ajax(request);
}
增加展示全部新闻的接口:
// GET /clear_search
// Forget the remembered search (keyword, type, cached result list) and go back
// to the first page, then acknowledge with {code: 1}.
app.get('/clear_search', function(req, res) {
    page_num = 1;
    news_list = [];
    search_type_past = "";
    search_info_past = "";
    var reply = {code: 1};
    res.json(reply);
});
相应的前端交互代码如下(登录系统后的所有页面均包含此段代码):
/**
 * Ask the server to drop the current search state, then open the news list.
 * The news page is opened whether the request succeeds or fails; on failure
 * the HTTP status, ready state and status text are shown first.
 */
function clear_search() {
    var open_news_page = function() {
        window.location = "/news.html";
    };
    var request = {
        type: "GET",
        url: "/clear_search",
        data: {},
        success: function(data){
            open_news_page();
        },
        error: function(XMLHttpRequest, textStatus, errorThrown) {
            var details = [XMLHttpRequest.status, XMLHttpRequest.readyState, textStatus];
            for (var k = 0; k < details.length; k++) {
                alert(details[k]);
            }
            open_news_page();
        }
    };
    $.ajax(request);
}
至此,增加功能(二):可视化及查询优化功能就已经全部实现了。