在搜索框输入关键词并点击搜索,会跳转到 /result 路由下的结果页,匹配关键词的新闻同样被分页展示;搜索时可以选择新闻的排序方式,共5种,分别是标题、内容、作者、来源报刊、发表时间。
下面尝试搜索关键词“人类”,排序原则选择发表时间,网站按照发表时间从远到近展示了8条新闻:
同样搜索关键词 人类,排序原则选择作者,网站按照作者的字典序展示了8条新闻:
实现以上功能的相关代码有:
search.html:
<!DOCTYPE html>
<!--
  search.html: template for the search page.
  server.js loads this file with cheerio for the /search route, replaces the
  #cur_page and #total_page spans, and appends rows to #main-table.
-->
<html>
<head>
<meta charset="utf-8"/>
<title>搜索</title>
<script>
// Submits the keyword form; wired to the search button's onclick below.
function SearchForKeyword() {
document.getElementById("search-form").submit();
}
</script>
</head>
<body>
<div id="user-search">
<!-- Keyword search: sent as GET /KeywordSearch?sort=...&keyword=... -->
<form id="search-form" action="/KeywordSearch" method="get">
<p>每页包含的新闻数:10 排序原则:<select name="sort">
<option value="title">标题</option>
<option value="content">内容</option>
<option value="author">作者</option>
<option value="source_name">来源报刊</option>
<option value="publish_date">发表时间</option>
</select> <input type="checkbox" name="similarity"/>相似度最高优先
 <input type="checkbox" name="jieba"/>中文分词查询</p>
<input id="search-key" type="text" name="keyword"/>
<button type="button" onclick="SearchForKeyword()">搜索</button>
</form>
</div>
<div id="select-page">
<!-- The two spans are placeholders; the server overwrites them per request. -->
<p>当前处于第<span id="cur_page">0</span>页,共<span id="total_page">0</span>页,您可以选择在搜索框中跳转</p>
<!-- Page jump: sent as GET /search?page=N -->
<form id="page-form" action="/search" method="get">
<p><input type="submit" value="转到"/> 第<input id="page-search" type="text" name="page"/>页</p>
</form>
</div>
<!-- Hidden by default; the /search handler removes "hidden" when myUser.id == 0. -->
<table border="1" id="main-table" hidden="true">
</table>
</body>
</html>
result.html:
<!DOCTYPE html>
<!--
  result.html: template for the search-result page.
  server.js loads this file with cheerio for the /result route, replaces the
  #cur_page and #total_page spans, and appends rows to #main-table.
-->
<html>
<head>
<meta charset="utf-8"/>
<title>结果</title>
<script>
// Returns to the search page. A root-relative path is used so the link keeps
// working when the site is not served from http://localhost:3000.
function searchTable() {
window.location.href = '/search';
}
</script>
</head>
<body>
<div id="select-page">
<!-- The two spans are placeholders; the server overwrites them per request. -->
<p>当前处于第<span id="cur_page">0</span>页,共<span id="total_page">0</span>页,您可以选择在搜索框中跳转</p>
<!-- Page jump: sent as GET /result?page=N -->
<form id="page-form" action="/result" method="get">
<p><input type="submit" value="转到"/> 第<input id="page-search" type="text" name="page"/>页</p>
</form>
<p>或点击这里进入搜索界面 <button onclick="searchTable()" class="center">进入搜索界面</button></p>
</div>
<!-- Hidden by default; the /result handler removes "hidden" when myUser.id == 0. -->
<table border="1" id="main-table" hidden="true">
</table>
</body>
</html>
server.js中:
/**
 * GET /search: renders the search page together with a paginated listing of
 * every row in the `xueqiu1` table.
 *
 * Query parameters:
 *   page - 1-based page number. Missing, non-numeric, zero or negative values
 *          fall back to page 1; values past the end are clamped to the last
 *          page (previously these indexed outside the result set and threw).
 *
 * Responds with the rendered HTML (200), or a plain-text 500 when the
 * database query fails (previously the request was left hanging).
 */
server.get('/search', function (req, res) {
  // Crawled text is untrusted, so it is escaped before being placed in markup.
  const escapeHtml = function (value) {
    const text = value === null || value === undefined ? '' : String(value);
    return text
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;')
      .replace(/"/g, '&quot;')
      .replace(/'/g, '&#39;');
  };

  let $ = myCheerio.load(fs.readFileSync("./html/search.html"));
  // Loose equality is kept on purpose: the type of myUser.id is not visible here.
  if (myUser.id == 0) {
    $('table[id="main-table"]').removeAttr("hidden");
  }
  $("table").append(`<tr>
<td>序号</td>
<td>标题</td>
<td>内容</td>
<td>作者</td>
<td>来源</td>
<td>发表时间</td>
<td>爬取时间</td>
<td>关键字</td>
<td>摘要</td>
</tr>`);

  // Zero-based page index; anything that is not a positive integer means page 1.
  const requestedPage = parseInt(req.query.page, 10);
  let cur_page = requestedPage >= 1 ? requestedPage - 1 : 0;

  connection.query("SELECT * from xueqiu1", function (qerr, vals, fields) {
    if (qerr) {
      console.error(`数据库查询失败-${qerr}`);
      res.writeHead(500, { 'Content-type': 'text/plain; charset=utf-8' });
      res.end('数据库查询失败');
      return;
    }

    const totalPages = Math.ceil(vals.length / pagesize);
    // Clamp so that a page past the end shows the last page instead of nothing.
    cur_page = Math.min(cur_page, Math.max(totalPages - 1, 0));
    const first = cur_page * pagesize;
    const last = Math.min(first + pagesize, vals.length);

    $('span[id="cur_page"]').replaceWith(`<span id="cur_page">${cur_page + 1}</span>`);
    $('span[id="total_page"]').replaceWith(`<span id="total_page">${totalPages}</span>`);

    for (let i = first; i < last; ++i) {
      const newsfetch = vals[i];
      // Strip the "编辑:" / "作者:" prefixes; a NULL author is treated as empty.
      const author = String(newsfetch.author || '')
        .replace(/编辑:/g, "")
        .replace(/作者:/g, "");
      $("table").append(`
<tr id="line${i}">
<td>${i}</td>
<td><a href="${escapeHtml(newsfetch.url)}" id="t${i}">${escapeHtml(newsfetch.title)}</a></td>
<td id="c${i}">${escapeHtml(newsfetch.content)}</td>
<td id="a${i}">${escapeHtml(author)}</td>
<td id="s${i}">${escapeHtml(newsfetch.source_name)}</td>
<td id="p${i}">${escapeHtml(newsfetch.publish_date)}</td>
<td>${escapeHtml(newsfetch.crawltime)}</td>
<td id="k${i}">${escapeHtml(newsfetch.keywords)}</td>
<td id="d${i}">${escapeHtml(newsfetch.description)}</td>
</tr>
`);
    }

    let body = $.html();
    res.writeHead(200, { 'Content-type': 'text/html' });
    res.end(body);
  });
});
// State written by the /KeywordSearch handler and read by the /result handler.
// These are file-level variables, so every request sees the same values.

// Raw keyword string taken from the query (req.query.keyword).
var search_text;
// search_text split on single spaces.
var keywords;
// Column name used in ORDER BY (req.query.sort).
var sortmethod;
// Number of result pages for the most recent search.
var max_page_cnt;
// Rows returned by the most recent search; starts out empty.
var q_items = [];
/**
 * GET /KeywordSearch: runs a keyword search against `xueqiu1`, stores the
 * matching rows in the shared `q_items`, and redirects to /result?page=1.
 *
 * Query parameters:
 *   keyword - space-separated keywords; a row matches when every keyword
 *             appears in its title or content. Missing/empty means "all rows".
 *   sort    - column to order by; must be one of the columns offered by the
 *             search form, otherwise the first one (title) is used.
 *
 * The search is also recorded in the `log` table; a failure there is only
 * logged and does not abort the search.
 *
 * Responds with a redirect on success, or a plain-text 500 when the search
 * query fails (previously the request was left hanging).
 */
server.get('/KeywordSearch', function (req, res) {
  // ORDER BY cannot take a placeholder, so the column name is whitelisted
  // instead of being concatenated straight from the query string.
  const SORTABLE_COLUMNS = ['title', 'content', 'author', 'source_name', 'publish_date'];
  sortmethod = SORTABLE_COLUMNS.includes(req.query.sort) ? req.query.sort : SORTABLE_COLUMNS[0];

  // A missing or repeated `keyword` parameter is not a string; treat it as empty.
  search_text = typeof req.query.keyword === 'string' ? req.query.keyword : '';
  // Drop the empty tokens produced by leading, trailing or repeated spaces.
  keywords = search_text.split(' ').filter(function (kw) { return kw !== ''; });

  var myDate = new Date();
  connection.query('insert into log(name,action,content,date) VALUES(?,?,?,?)', [myUser.name, 'SEARCH', search_text, myDate.toLocaleString()], function (qerr, vals, fields) {
    if (qerr) {
      console.error(`用户行为数据库查询失败-${qerr}`);
      return;
    }
  });

  // Keywords are passed as bound parameters rather than spliced into the SQL
  // text, so quotes in user input cannot alter the statement.
  const conditions = [];
  const params = [];
  for (const kw of keywords) {
    conditions.push('(title like ? or content like ?)');
    params.push(`%${kw}%`, `%${kw}%`);
  }
  let current_query_sql = "SELECT * from xueqiu1";
  if (conditions.length > 0) {
    current_query_sql += " where " + conditions.join(" and ");
  }
  current_query_sql += " order by " + sortmethod;

  connection.query(current_query_sql, params, function (qerr, vals, fields) {
    if (qerr) {
      console.error(`数据库查询失败-${qerr}`);
      res.writeHead(500, { 'Content-type': 'text/plain; charset=utf-8' });
      res.end('数据库查询失败');
      return;
    }
    q_items = vals;
    max_page_cnt = Math.ceil(q_items.length / pagesize);
    res.redirect('/result?page=1');
  });
});
/**
 * GET /result: renders one page of the rows stored in the shared `q_items`
 * array by the most recent /KeywordSearch request.
 *
 * Query parameters:
 *   page - 1-based page number. Missing, non-numeric, zero or negative values
 *          fall back to page 1; values past the end are clamped to the last
 *          page (previously these indexed outside `q_items` and threw).
 *
 * Always responds with the rendered HTML (200). The page count is derived
 * from `q_items` itself, so it is well-defined even before any search ran.
 */
server.get('/result', function (req, res) {
  // Crawled text is untrusted, so it is escaped before being placed in markup.
  const escapeHtml = function (value) {
    const text = value === null || value === undefined ? '' : String(value);
    return text
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;')
      .replace(/"/g, '&quot;')
      .replace(/'/g, '&#39;');
  };

  let $ = myCheerio.load(fs.readFileSync("./html/result.html"));
  // Loose equality is kept on purpose: the type of myUser.id is not visible here.
  if (myUser.id == 0) {
    $('table[id="main-table"]').removeAttr("hidden");
  }
  $("table").append(`<tr>
<td>序号</td>
<td>标题</td>
<td>内容</td>
<td>作者</td>
<td>来源</td>
<td>发表时间</td>
<td>爬取时间</td>
<td>关键字</td>
<td>摘要</td>
</tr>`);

  const totalPages = Math.ceil(q_items.length / pagesize);
  // Zero-based page index; anything that is not a positive integer means page 1.
  const requestedPage = parseInt(req.query.page, 10);
  let cur_page = requestedPage >= 1 ? requestedPage - 1 : 0;
  // Clamp so that a page past the end shows the last page instead of nothing.
  cur_page = Math.min(cur_page, Math.max(totalPages - 1, 0));
  const first = cur_page * pagesize;
  const last = Math.min(first + pagesize, q_items.length);

  $('span[id="cur_page"]').replaceWith(`<span id="cur_page">${cur_page + 1}</span>`);
  $('span[id="total_page"]').replaceWith(`<span id="total_page">${totalPages}</span>`);

  for (let i = first; i < last; ++i) {
    const newsfetch = q_items[i];
    // Strip the "编辑:" / "作者:" prefixes into a local copy so the shared
    // q_items rows are not modified; a NULL author is treated as empty.
    const author = String(newsfetch.author || '')
      .replace(/编辑:/g, "")
      .replace(/作者:/g, "");
    $("table").append(`
<tr id="line${i}">
<td>${i}</td>
<td><a href="${escapeHtml(newsfetch.url)}" id="t${i}">${escapeHtml(newsfetch.title)}</a></td>
<td id="c${i}">${escapeHtml(newsfetch.content)}</td>
<td id="a${i}">${escapeHtml(author)}</td>
<td id="s${i}">${escapeHtml(newsfetch.source_name)}</td>
<td id="p${i}">${escapeHtml(newsfetch.publish_date)}</td>
<td>${escapeHtml(newsfetch.crawltime)}</td>
<td id="k${i}">${escapeHtml(newsfetch.keywords)}</td>
<td id="d${i}">${escapeHtml(newsfetch.description)}</td>
</tr>
`);
  }

  let body = $.html();
  res.writeHead(200, { 'Content-type': 'text/html' });
  res.end(body);
});
除此之外,还添加了按主题词相似度打分排序的功能:关键词在标题中每出现一次计2分,在内容中每出现一次计1分;勾选“相似度最高优先”时,结果按得分从高到低排序:
// Relevance scoring: every occurrence of a keyword in the title is worth 2
// points and every occurrence in the content is worth 1 point.
//
// Keywords are counted as literal substrings. Building a RegExp from raw user
// input would throw on text such as "(" or "c++" and would treat characters
// like "." as wildcards. Empty keywords and non-string fields score 0.
const countOccurrences = function (text, kw) {
  if (typeof text !== 'string' || typeof kw !== 'string' || kw === '') {
    return 0;
  }
  // N non-overlapping occurrences split the text into N + 1 pieces.
  return text.split(kw).length - 1;
};

for (const item of q_items) {
  item.similarity = 0;
  for (const kw of keywords) {
    item.similarity += countOccurrences(item.title, kw) * 2;
    item.similarity += countOccurrences(item.content, kw);
  }
}

if (req.query.similarity !== undefined) {
  // The checkbox is only present in the query when ticked; when it is, the
  // highest score comes first and overrides the SQL ordering.
  q_items.sort((na, nb) => nb.similarity - na.similarity);
}
后面将添加中文分词查询:若用户勾选了“中文分词查询”,则使用jieba分词后的结果进行匹配。