Node.js实现网络新闻爬虫及搜索功能(四)
Node.js实现网络新闻爬虫及搜索功能(四)
系列文章查看不到可能是CSDN审核原因,可以在我的知乎专栏看到所有文章:https://www.zhihu.com/column/c_1370026160999415808
项目要求
一、爬虫部分
1、完成目标网站的网页分析和爬虫设计。
2、爬取不少于100条数据(每条数据包括7个字段,新闻关键词、新闻标题、新闻日期、新闻作者、新闻来源、新闻摘要、新闻内容),并存储在数据库中。
二、搜索网站部分
1、完成对数据库中爬取新闻内容和标题的搜索功能,搜索结果以表格形式展示在前端页面中。
2、完成对搜索内容的时间热度分析,使用表格展示爬取数据内容中每一天包含搜索内容的条数。
本文是该项目第四部分:搜索展示新闻和搜索内容时间热度表格
三、搜索网站部分
1. 建立前端网页
前端网页代码已开源在本项目的GitHub仓库中:Node.js实现网络新闻爬虫及搜索功能GitHub仓库地址。在此不再赘述,只介绍与展示相关的JavaScript代码的编写。
根据需要,我建立了三个前端网页,其名称和作用如下表格所示:
网页名称 | 作用 |
---|---|
search.html | 提供新闻搜索功能,分为两个Tab,分别可以根据新闻标题和新闻内容搜索 |
news.html | 提供新闻展示功能,当没有搜索信息时,展示全部已爬取新闻;当指定搜索信息时,展示搜索后的结果和搜索内容时间热度表格 |
news_info.html | 提供新闻详情展示功能,展示某条新闻的所有具体信息 |
search.html
分为两个Tab,分别可以根据新闻标题和新闻内容搜索。前后端交互JavaScript代码为:
<script text="text/javascript">
/**
 * Read the keyword from the input box that belongs to `search_type` and
 * send it to GET /search_news.
 *
 * @param {string} search_type - 'title' reads input#title, 'content' reads
 *     input#content; any other value is treated as an empty keyword.
 */
function search_news(search_type) {
    var keyword = search_type == 'title' ? $('input#title').val()
        : search_type == 'content' ? $('input#content').val()
        : "";

    // Nothing to look for: tell the user and send no request.
    if (keyword == "") {
        alert("请输入搜索信息!");
        return;
    }

    // Show the server's message, then go to the result page when code is 1
    // and back to the search page otherwise.
    function on_success(data) {
        alert(data.msg);
        window.location = data.code == 1 ? "/news.html" : "/search.html";
    }

    // Report the failure details and return to the search page.
    function on_error(XMLHttpRequest, textStatus, errorThrown) {
        alert(XMLHttpRequest.status);
        alert(XMLHttpRequest.readyState);
        alert(textStatus);
        window.location = "/search.html";
    }

    $.ajax({
        type: "GET",
        url: "/search_news",
        data: {
            search_type: search_type,
            search_info: keyword
        },
        success: on_success,
        error: on_error
    });
}
</script>
两个确认button为:
<!-- Confirm button that starts a search by title: calls search_news('title'). -->
<input type="submit" value="开始查询" class="mt-2 btn btn-primary" onclick="search_news('title')">
<!-- Confirm button that starts a search by content: calls search_news('content'). -->
<input type="submit" value="开始查询" class="mt-2 btn btn-primary" onclick="search_news('content')">
news.html
页面加载时渲染HTML的JavaScript函数:
<script text="text/javascript">
// Fill news.html once the DOM is ready: request GET /show_news, render one
// table row per news item and, when the response carries a search keyword,
// the per-date hit counts as well.
$(document).ready(function() {
    // Escape a value for use as HTML text or inside a quoted attribute.
    // The rendered fields come from crawled pages and from the user's own
    // query, so they must not be spliced into markup verbatim.
    function escape_html(value) {
        return String(value)
            .replace(/&/g, "&amp;")
            .replace(/</g, "&lt;")
            .replace(/>/g, "&gt;")
            .replace(/"/g, "&quot;")
            .replace(/'/g, "&#39;");
    }

    var url = "/show_news";
    $.ajax({
        type: "GET",
        url: url,
        data: {
        },
        success: function(data) {
            for (var i = 0; i < data.news_list.length; i++) {
                var news = data.news_list[i];
                var id_news = escape_html(news.id_news);
                var begin_str = "<tr id='" + id_news + "'>";
                var source_str = "<td class='text-left'>" + escape_html(news.source) + "</td>";
                // Only the first 10 characters of the title are shown.
                var title_str = "<td class='text-left'><a href='" + escape_html(news.url) + "'>" + escape_html(news.title.slice(0, 10)) + "..." + "</a></td>";
                var author_str = "<td class='text-left'>" + escape_html(news.author) + "</td>";
                // Keep the first 10 characters of the date string (the YYYY-MM-DD part of an ISO timestamp).
                var date_str = "<td class='text-left'>" + escape_html(news.date.slice(0, 10)) + "</td>";
                // The attribute name must be the plain ASCII word "onclick";
                // a look-alike non-ASCII letter makes it an unknown attribute
                // and the button then never calls news_info().
                var info_str = "<td class='text-right'><i class='fa fa-paper-plane'></i>  <input type='submit' value='详情' onclick='news_info(" + id_news + ")' /></td>";
                // Each row is inserted directly after #tbody-title, so the
                // rows end up in the reverse of the list order.
                $("#tbody-title").after($(begin_str + source_str + title_str + author_str + date_str + info_str + "</tr>"));
            }
            if (data.search_info == "") {
                // No search is active: hide everything related to statistics.
                $("#statistics").hide();
                $("#tab-content-1").hide();
                $('#special').hide();
            } else {
                $('#normal').hide();
                $('#special').html("搜索内容: " + escape_html(data.search_info) + "<br>搜索类型: " + escape_html(data.search_type));
                for (var j = 0; j < data.news_statistics.length; j++) {
                    var stat = data.news_statistics[j];
                    var news_date = "<td class='text-left'>" + escape_html(stat.date.slice(0, 10)) + "</td>";
                    var news_cnt = "<td class='text-left'>" + escape_html(stat.cnt.toString()) + "</td>";
                    // Two empty cells pad the row out to four columns.
                    $("#tbody-statistics").after($("<tr>" + news_date + news_cnt + "<td class='text-left'></td><td class='text-left'></td></tr>"));
                }
            }
        },
        error: function(XMLHttpRequest, textStatus, errorThrown) {
            alert(XMLHttpRequest.status);
            alert(XMLHttpRequest.readyState);
            alert(textStatus);
            window.location = "/news.html";
        }
    });
});
</script>
提供新闻展示功能,当没有搜索信息时,展示全部已爬取新闻;当指定搜索信息时,展示搜索后的结果和搜索内容时间热度表格。点击详情跳转到news_info.html网页。
点击详情时的跳转函数:
<script text="text/javascript">
/**
 * Send the chosen news id to GET /news_info and, once the server has
 * answered, open the detail page.
 *
 * @param {*} id_news - identifier of the news item, passed through as the
 *     `id_news` query parameter.
 */
function news_info(id_news) {
    var request = {
        type: "GET",
        url: "/news_info",
        data: {
            id_news: id_news
        }
    };

    // The response body is not inspected; any successful answer opens the page.
    request.success = function(data) {
        window.location = "/news_info.html";
    };

    // Report the failure details and fall back to the news list.
    request.error = function(XMLHttpRequest, textStatus, errorThrown) {
        alert(XMLHttpRequest.status);
        alert(XMLHttpRequest.readyState);
        alert(textStatus);
        window.location = "/news.html";
    };

    $.ajax(request);
}
</script>
news_info.html
提供新闻详情展示功能,展示某条新闻的所有具体信息,页面加载时渲染HTML的JavaScript函数:
<script text="text/javascript">
// Fill news_info.html once the DOM is ready with the fields of the news item
// returned by GET /show_news_info.
$(document).ready(function() {
    // Ids of the elements that each show the response field of the same name.
    var text_fields = ["source", "title", "keywords", "author", "date", "crawler_time", "summary", "content"];
    var url = "/show_news_info";
    $.ajax({
        type: "GET",
        url: url,
        data: {
        },
        success: function(data) {
            // The values originate from crawled pages, so they are inserted
            // as text / attribute values and never parsed as HTML.
            // NOTE(review): if a field is ever meant to carry trusted markup,
            // sanitise it and switch that single field back to .html().
            var link = $("<a>");
            link.attr("href", data.url);
            link.text(data.url);
            $("#url").empty().append(link);
            for (var i = 0; i < text_fields.length; i++) {
                $("#" + text_fields[i]).text(data[text_fields[i]]);
            }
        },
        error: function(XMLHttpRequest, textStatus, errorThrown) {
            alert(XMLHttpRequest.status);
            alert(XMLHttpRequest.readyState);
            alert(textStatus);
            window.location = "/news_info.html";
        }
    });
});
</script>
其他前端小工具功能:
折叠侧边导航栏
个性化定制网页颜色
2. 建立后端路由
注意:JavaScript是异步执行的语言,因此要避免思维定式把代码顺序当做真实的指令执行顺序。
后端由Node.js的express库搭建。新建app.js,并引入相关包:
var express = require('express');
新建一个express实例:
// The Express application object; every route below is registered on it.
var app = express();
将静态文件夹设为static,使前端可以访问到其中的文件:
// Serve the files in the "static" directory next to this script as static assets.
app.use(express.static(__dirname + '/static'));
连接数据库:
// Connect to the database: load the project's SQL helper module, whose
// query() function the routes below use.
var crawler_sql = require("./crawlers/crawler_sql.js");
部署HTML文件路由:
// Each front-end page is served under its own file name from the
// "templates" directory next to this script.
var html_pages = ["search.html", "news.html", "news_info.html"];
html_pages.forEach(function(page) {
    app.get('/' + page, function(req, res) {
        res.sendFile(__dirname + "/templates/" + page);
    });
});
实现搜索路由逻辑:
// State handed from /search_news to /show_news: the rows of the last
// successful search plus the keyword and type that produced them.
// NOTE(review): these are module-level variables, so they are shared by
// every client of this process.
var news_list = [];
var search_info_past = "";
var search_type_past = "";

/**
 * GET /search_news?search_type=title|content&search_info=<keyword>
 *
 * Looks for news whose title or content contains the keyword.
 * Responds with {code: 1, msg} and remembers the result when at least one
 * row matches; otherwise clears the remembered search and responds with
 * {code: 0, msg}.
 */
app.get('/search_news', function(req, res) {
    var search_type = req.query.search_type;
    var search_info = req.query.search_info;

    // Forget any remembered search and report `msg` with code 0.
    function reject(msg) {
        search_info_past = "";
        search_type_past = "";
        news_list = [];
        res.json({code: 0, msg: msg});
    }

    // Only these two column names may ever reach the SQL text.
    var column = "";
    if (search_type === 'title') {
        column = "title";
    } else if (search_type === 'content') {
        column = "content";
    }
    // An unknown type used to run an empty SQL string; a non-string keyword
    // (e.g. a repeated query parameter) is refused as well.
    if (column === "" || typeof search_info !== "string") {
        reject("搜索参数无效,请重新查询!");
        return;
    }

    // The keyword is passed as a placeholder value (the same three-argument
    // query form the /news_info route uses) so it cannot alter the statement.
    var news_search_sql = "select id_news, url, source, title, author, date from news where " + column + " like ?";
    crawler_sql.query(news_search_sql, ['%' + search_info + '%'], function(err, result, fields) {
        if (err) {
            // Without this check `result` is undefined and reading
            // result.length throws inside the callback.
            console.error(err);
            reject("查询失败,请稍后重试!");
            return;
        }
        if (result.length > 0) {
            news_list = result;
            search_info_past = search_info;
            search_type_past = search_type;
            res.json({code: 1, msg: "查询成功!"});
        } else {
            reject("相关新闻不存在,请更换信息查询!");
        }
    });
});
实现新闻展示路由逻辑:
/**
 * GET /show_news
 *
 * Without a pending search: responds with every news row.
 * With a pending search (left behind by /search_news): responds with the
 * remembered result rows plus `news_statistics`, the number of matching
 * news per date. The pending search is cleared either way, so it is
 * reported only once.
 */
app.get('/show_news', function(req, res) {
    var search_info = search_info_past;
    var search_type = search_type_past;
    search_info_past = "";
    search_type_past = "";

    // Log a failed query and answer with HTTP 500 instead of sending a
    // response that lacks the lists the page expects.
    function fail(err) {
        console.error(err);
        res.status(500).json({code: 0, msg: "查询失败,请稍后重试!"});
    }

    if (news_list.length == 0 || search_info == "") {
        var list_sql = "select id_news, url, source, title, author, date from news";
        crawler_sql.query(list_sql, function(err, result, fields) {
            if (err) {
                // news_list is left untouched so it stays an array for the
                // next request.
                fail(err);
                return;
            }
            news_list = result;
            res.json({search_info: search_info, search_type: search_type, news_list: news_list});
        });
        return;
    }

    // Only these two column names may ever reach the SQL text.
    var column = "";
    if (search_type == 'title') {
        column = "title";
    } else if (search_type == 'content') {
        column = "content";
    }
    if (column == "") {
        fail(new Error("unexpected search_type: " + search_type));
        return;
    }

    // The keyword is passed as a placeholder value so it cannot alter the statement.
    var statistics_sql = "select date, count(*) as cnt from news where " + column + " like ? group by date order by date";
    crawler_sql.query(statistics_sql, ['%' + search_info + '%'], function(err, result, fields) {
        if (err) {
            fail(err);
            return;
        }
        res.json({search_info: search_info, search_type: search_type, news_list: news_list, news_statistics: result});
    });
});
实现新闻详情路由逻辑:
// Rows returned for the news item selected most recently through
// /news_info; /show_news_info reports the first one.
var news_info = [];

/**
 * GET /news_info?id_news=<id>
 *
 * Loads the full record of one news item and remembers it for
 * /show_news_info. Responds with {code: 1}, or with HTTP 500 and {code: 0}
 * when the query fails.
 */
app.get('/news_info', function(req, res) {
    var news_search_sql = "select url, source, title, keywords, author, date, crawler_time, summary, content from news where id_news=?";
    var news_search = [req.query.id_news];
    crawler_sql.query(news_search_sql, news_search, function(err, result, fields) {
        if (err) {
            // Keep news_info an array so /show_news_info can still index it.
            console.error(err);
            news_info = [];
            res.status(500).json({code: 0});
            return;
        }
        news_info = result;
        res.json({code: 1});
    });
});

/**
 * GET /show_news_info
 *
 * Responds with the remembered news record, or with an empty object when
 * none is available, so the client always receives a JSON object.
 */
app.get('/show_news_info', function(req, res) {
    res.json(news_info[0] || {});
});
在8080端口监听:
// Print the entry URL once the server is accepting connections.
function announce_address() {
    console.log("访问地址为 http://127.0.0.1:8080/search.html");
}

// Listen on port 8080.
var server = app.listen(8080, announce_address);
app.js文件中的全局变量是为了在不同界面隐式传递参数而设置的。