爬虫实现
连接mysql数据库,爬取网易,新华,凤凰网的首页新闻
-
获取种子界面,使用cheerio模块对html页面进行解析,并使用正则表达式进行url地址的规范化,筛选爬取的url
-
// Crawler dependencies.
const request = require('request');
const cheerio = require('cheerio');
const iconv = require('iconv-lite');
const mysql = require('./mysql.js');
const schedule = require('node-schedule');

// Seed page per site. Exactly one site is active at a time; the others are
// kept commented out (timestamps are carried over from the original notes).
// const url = 'https://news.163.com/'; // NetEase 2022-06-25 20:09:00
// const url = 'http://www.news.cn/';   // Xinhua  2022-07-10 07:09:02
const url = 'https://www.ifeng.com/';   // ifeng   2022-07-08 06:40:02
const web = "凤凰";

// Pattern an <a href> must match to be treated as a news article link.
// const url_reg = /news\/article\/(\w{16}).html/;                  // NetEase
// const url_reg = /\/(\d{4})-(\d{2})\/(\d{2})\/c_(\d{10}).htm/;    // Xinhua
const url_reg = /c\/8(\w{10})/;                                     // ifeng

/**
 * Fetch a page as a raw Buffer (encoding: null) with a browser user agent.
 *
 * @param {string} url - Address to fetch.
 * @param {function(Error, object, Buffer)} callback - Standard `request` callback.
 */
function Request(url, callback) {
    const options = {
        url: url,
        encoding: null,
        headers: {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'},
        timeout: 10000
    };
    request(options, callback);
}

/**
 * Download the seed page, collect every link matching `url_reg`, and crawl
 * each one that is not yet stored in `newsTable`.
 */
function seedUrl() {
    Request(url, function (err, res, body) {
        if (err) {
            console.log('种子页面请求出错:' + err);
            return;
        }
        if (body == undefined) return;

        const html = iconv.decode(body, 'utf-8');
        const $ = cheerio.load(html, { decodeEntities: true });

        let news;
        try {
            news = $('a');
        } catch (e) {
            console.log('html块识别出错:' + e);
            return; // nothing to iterate over
        }

        // The same article is often linked several times on one page; without
        // this set all duplicate-check queries would run before any insert
        // finishes and the article would be stored more than once.
        const seen = new Set();

        news.each(function (i, e) {
            try {
                const rawHref = $(e).attr("href");
                if (rawHref == undefined || !url_reg.test(rawHref)) return;

                // Resolve relative / protocol-relative links against the seed URL.
                const href = new URL(rawHref, url).href;
                if (seen.has(href)) return;
                seen.add(href);

                const news_url = 'select url from newsTable where url=?';
                const news_url_Params = [href];
                mysql.query(news_url, news_url_Params, function (qerr, vals, fields) {
                    if (qerr) {
                        console.log(qerr);
                        return;
                    }
                    if (vals.length > 0) {
                        console.log('URL duplicate!');
                    } else {
                        // Crawl only when the URL is not in the database yet.
                        newsGet(href);
                    }
                });
            } catch (e) {
                console.log('新闻链接出错:' + e);
            }
        });
    });
}
-
解析新闻页面的内容,将其结构化存入数据库当中,运用request,iconv-lite,cheerio解析获取信息
-
/**
 * Download one news page, extract its structured fields and insert a row
 * into `newsTable`.
 *
 * @param {string} href - Address of the news article.
 */
function newsGet(href) {
    Request(href, function (err, res, body) {
        if (err) {
            console.log('新闻页面请求出错:' + err);
            return;
        }
        if (body == undefined) return;

        const html = body;
        // const html = iconv.decode(body, 'utf-8');
        const $ = cheerio.load(html, { decodeEntities: false });

        // NetEase selectors
        // var title = $('meta[property="og:title"]').eq(0).attr("content");
        // var keywords = $('meta[name="keywords"]').eq(0).attr("content");
        // var source = $('.post_info a:first-child').text();
        // var author = $('meta[name="author"]').eq(0).attr("content");
        // var date = ($('#ne_wrap').prop("data-publishtime"))
        // var content = $('.post_body').text();

        // Xinhua selectors
        // var keywords = $('meta[name="keywords"]').eq(0).attr("content");
        // var info = $('.info').text().split("\n");
        // var date = info[1];
        // var title = $('title').text();
        // title = title.replace("\n", "")
        // title = title.replace("\n", "")
        // var source = $('meta[name="source"]').eq(0).attr("content");
        // var author = source;
        // var content = $('#detail').text();

        // ifeng selectors (the meta names really are queried with a trailing space)
        const keywords = $('meta[name="keywords"]').eq(0).attr("content");
        let source = $(".source").text().trim();
        if (source === "") source = $("[class^='sourceTitleText']").text().trim();
        const title = $('meta[property="og:title"]').eq(0).attr("content");
        const date = $('meta[name="og:time "]').eq(0).attr("content");
        let author = $('meta[name="og:category "]').eq(0).attr("content");
        const content = $('.text-3w2e3DBc').text();

        // .attr() yields undefined when the tag is missing, so test for any
        // empty value rather than only the empty string.
        if (!author) author = web;
        if (!source) source = web;

        const crawltime = new Date();
        const newsSQL = 'INSERT INTO newsTable(url,source,encoding,title,keywords,author,date,crawltime,content) VALUES(?,?,?,?,?,?,?,?,?)';
        const newsSQL_Params = [href, source, 'utf-8', title, keywords, author, date, crawltime, content];
        mysql.query(newsSQL, newsSQL_Params, function (qerr, vals, fields) {
            if (qerr) {
                console.log(qerr);
            }
        });
    });
}

// Start crawling from the seed page.
seedUrl();
-
爬取内容存入news数据库的newsTable表格,通过workbench导出
网页功能
1、用户可注册登录网站,未登录没有权限访问其他页面
界面效果如下
前端实现 login.html
-
登录部分由username和password组成,通过form表单编写。按钮点击时会根据类名获取dom的value值,同时阻止form表单的默认事件;先对两个值进行非空判断,若存在空值则通过提示框提醒,提前完成数据校验;满足条件则通过ajax请求后端/login接口进行登录。验证通过时在sessionStorage中记录用户名用于权限控制,同时通过定时器控制页面跳转的时机。验证失败则提示信息。
-
// Login form handling for login.html.
const loginform = document.querySelector(".sign-in-form");
const loginbtn = loginform.querySelector(".btn");

/**
 * Validate the form, post the credentials to the `login` endpoint and, on
 * success, remember the user name in sessionStorage before redirecting.
 */
loginbtn.addEventListener("click", function (e) {
    // Use the resolved event object for both branches; the original read
    // `e.preventDefault` even when it had fallen back to window.event.
    const event = e || window.event;
    if (event.preventDefault) {
        event.preventDefault();
    } else {
        event.returnValue = false;
    }

    const inputs = loginform.querySelectorAll("input");
    const username = inputs[0].value;
    const password = inputs[1].value;
    if (!username || !password) {
        messageplugin({ message: "请完善所有信息!", type: "warning" });
        return;
    }

    const showNetworkError = function () {
        messageplugin({ message: "网络异常,请检查后重试!", type: "error" });
    };

    ajax({
        url: "login",
        type: "POST",
        data: { username, password },
        dataType: "json",
        success: function (res) {
            let data;
            try {
                ({ data } = JSON.parse(res));
            } catch (parseError) {
                // A malformed response is reported like a network failure.
                showNetworkError();
                return;
            }
            if (!data) {
                showNetworkError();
                return;
            }

            if (Number(data.status) === 200) {
                messageplugin({ message: data.msg, type: "success" });
                sessionStorage.setItem("username", username);
                // Delay the redirect so the success toast is visible.
                setTimeout(function () {
                    window.open("./home.html", "_self");
                }, 1000);
            } else {
                messageplugin({ message: data.msg, type: "error" });
            }
        },
        fail: function (error) {
            showNetworkError();
        }
    });
});
后端实现 router.js /login
-
通过express的router获取前端传递过来的参数req,首先编写sql语句,在确保连接到数据库的前提下执行这条sql语句。这里主要为查询语句,根据传递过来的用户名进行查询:若不存在则返回“用户名不存在”的提示信息,若存在则将数据库中查到的记录与前端传递过来的值进行比较判断。
-
/**
 * POST /login
 * Body: { username, password }.
 * Responds with { data: { msg, status } } where status is 200 on success,
 * 403 for an unknown user or wrong password and 400 for a failed request.
 */
router.post('/login', (req, res) => {
    const getUser = req.body || {};

    // Reject non-string credentials up front so objects/arrays can never
    // reach the SQL layer.
    if (typeof getUser.username !== 'string' || typeof getUser.password !== 'string') {
        res.send({
            "data": {
                "msg": "请求错误,请检查后重试!",
                "status": 400
            }
        });
        return;
    }

    // Parameterised query: the user name is never concatenated into the SQL.
    const selUser_sql = "select * from `login_table` where username = ?";
    conn1.query(selUser_sql, [getUser.username], (error, results1) => {
        if (error) {
            res.send({
                "data": {
                    "msg": "请求错误,请检查后重试!",
                    "status": 400
                }
            });
            return;
        }

        const user = results1[0];
        if (user === undefined) {
            res.send({
                "data": {
                    "msg": "用户名不存在",
                    "status": 403
                }
            });
            return;
        }

        // NOTE(review): passwords are stored and compared in plain text here;
        // consider hashing them (and a timing-safe comparison).
        const matches = user.username === getUser.username &&
            String(user.password) === getUser.password;
        if (matches) {
            res.send({
                "data": {
                    "msg": "恭喜你,登录成功!",
                    "status": 200,
                }
            });
        } else {
            res.send({
                "data": {
                    "msg": "不好意思,您的密码错误!",
                    "status": 403
                }
            });
        }
    });
});
2、用户操作成功/错误出现相应提示框
效果如下 实现见notification.js
-
提示框提示主要包括
-
网络异常,请检查后重试!
-
请完善所有信息!
-
用户名不存在
-
恭喜你,登录成功!
-
不好意思,您的密码错误!
-
请输入要查找的内容
-
-
此提示框进行了功能封装messageplugin,将其挂载到全局对象window上,方便调用,同时通过类的方式进行有参构造以及函数初始化,动态添加删除元素实现提示框的效果;在用户点击时生成dom元素挂载到body中,同时启动定时任务,在显示一定时间后自动销毁该元素。
-
/**
 * Toast-style notification box.
 *
 * Every own property of `options` is copied onto the instance, so the
 * configuration (status, message, type, duration, oninit/onopen/onclose) is
 * available from every method.
 */
class MessageBox {
    /**
     * @param {object} options - Merged configuration (see `messageplugin`).
     */
    constructor(options) {
        // Copy the configuration onto the instance.
        for (const key of Object.keys(options)) {
            this[key] = options[key];
        }
        this.init();
    }

    /** Entry point: builds and shows the box when status is "message". */
    init() {
        if (this.status !== "message") return;
        this.createMessage();
        this.open();
    }

    /** Create the DOM element, attach it to <body> and wire the close button. */
    createMessage() {
        const box = document.createElement('div');
        box.className = `dpn-message dpn-${this.type}`;

        // The message may come from a server response, so it is inserted as a
        // text node and never parsed as HTML.
        box.appendChild(document.createTextNode(` ${this.message} `));

        const closeButton = document.createElement('i');
        closeButton.className = 'dpn-close';
        closeButton.textContent = 'X';
        box.appendChild(closeButton);

        // Event delegation: only clicks on the close button dismiss the box.
        box.onclick = (ev) => {
            if (ev.target.className === "dpn-close") {
                this.close();
            }
        };

        this.messageBox = box;
        document.body.appendChild(box);

        // Lifecycle hook.
        this.oninit();
    }

    /** Position the box below the ones already shown and arm the auto-close timer. */
    open() {
        if (this.status !== "message") return;

        // Each box already on screen pushes the new one 70px further down.
        const shown = document.querySelectorAll('.dpn-message').length;
        this.messageBox.style.top = `${20 + Math.max(shown - 1, 0) * 70}px`;

        // A duration of 0 (or less) means "stay until closed manually".
        if (this.duration > 0) {
            this.autoTimer = setTimeout(() => {
                this.close();
            }, this.duration);
        }

        // Lifecycle hook.
        this.onopen();
    }

    /** Slide the box out and remove it from the DOM once the transition ends. */
    close() {
        if (this.status !== "message") return;
        clearTimeout(this.autoTimer);

        // Ignore repeated close requests (e.g. double click on the X).
        if (this.isClosing) return;
        this.isClosing = true;

        this.messageBox.style.top = '-200px';
        const removeBox = () => {
            const parent = this.messageBox.parentNode;
            if (parent) {
                parent.removeChild(this.messageBox);
            }
            // Lifecycle hook.
            this.onclose();
        };
        // `once` prevents a second run when several CSS properties transition.
        this.messageBox.addEventListener('transitionend', removeBox, { once: true });
    }
}

/**
 * Global helper: show a notification.
 *
 * @param {object|string} [options] - Configuration, or just the message text.
 * @returns {MessageBox} The created box.
 */
(typeof window !== 'undefined' ? window : globalThis).messageplugin = function (options = {}) {
    // Allow passing only a string.
    if (typeof options === "string") {
        options = {
            message: options
        };
    }
    // User-supplied settings override the defaults.
    options = Object.assign({
        status: 'message',
        message: '我是默认信息',
        type: 'info',
        duration: 3000,
        // Lifecycle hooks.
        oninit() {},
        onopen() {},
        onclose() {},
    }, options);
    return new MessageBox(options);
};
2、新闻数量随时间变化以及七日内新闻占比分析
页面效果如下:
前端页面 home.html
-
页面主要由echarts的折线图以及饼状图组成,数据在页面加载时通过向后端请求进行options的配置,然后将其通过echarts绑在页面html对应id的标签下。
-
<!-- Site header: logo, title, navigation and the logout control. -->
<header id="header">
    <div class="website-header">
        <img src="../frontend/images/logo.png" alt="NEWS TODAY logo">
        <h1>NEWS TODAY</h1>
        <nav class="navbar navbar-expand-md ">
            <div class="collapse navbar-collapse" id="collapsibleNavbar">
                <ul class="navbar-nav">
                    <li class="nav-item">
                        <a class="nav-link" href="./home.html">首页</a>
                    </li>
                    <li class="nav-item">
                        <a class="nav-link" href="./search.html">全文搜索</a>
                    </li>
                    <li class="nav-item">
                        <a class="nav-link" href="./analyse.html">时间热度分析</a>
                    </li>
                </ul>
            </div>
        </nav>
        <!-- The page script binds the logout click handler to .fa-instagram. -->
        <div class="social-icons" style="display:flex;align-items:center;justify-content:right;">
            <i class="fab fa-instagram"> <em></em> Logout</i>
        </div>
    </div>
</header>
<!-- Chart containers: #test and #test2 are each bound to an echarts instance by the page script. -->
<main id="main">
    <div class="echarts" style=" width:90%;display:flex; justify-content:space-around;">
        <div id="test" style="width:48%; height:400px;"></div>
        <div id="test2" style="width:48%; height:400px;"></div>
    </div>
</main>
-
js部分先利用echarts.init()方法对echarts进行绑定初始化,然后调用ajax函数请求/getDataByTime获取新闻信息,然后将获取到的信息中日期与数量分别对应存在两个数组中,然后对option进行初始化,对具体参数进行配置,最后利用setOption()方法完成数据渲染
-
由于其中需要近七日的日期,因此对其进行函数封装getWeek():先根据new Date()获取现在的日期,然后通过for循环进行时间戳的倒推,进而实现动态获取近七日的日期;在处理近七日信息的option时,对得到的新闻进行遍历,利用indexOf判断其是否属于近七日范围,同时对相应数据进行更新。
-
<script type="text/javascript">
    /**
     * Build the labels of the last seven days, oldest first and today last.
     *
     * @returns {string[]} Seven strings in "MM-DD" form.
     */
    function getWeek() {
        const DAY_MS = 1000 * 60 * 60 * 24;
        const today = new Date();
        const dateArr = [];
        for (let i = 6; i >= 0; i--) {
            const current = new Date(today.getTime() - i * DAY_MS);
            const month = String(current.getMonth() + 1).padStart(2, "0");
            const day = String(current.getDate()).padStart(2, "0");
            dateArr.push(`${month}-${day}`);
        }
        return dateArr;
    }
    var week = getWeek();

    // Logout: show a toast, then clear the session and return to the login page.
    let layoutbtn = document.querySelector(".fa-instagram");
    layoutbtn.addEventListener("click", function () {
        messageplugin({ message: "退出成功!", type: "success" });
        setTimeout(function () {
            sessionStorage.clear();
            window.open("./login.html", "_self");
        }, 1000);
    });

    var myChart = echarts.init(document.getElementById('test'));
    var myChart2 = echarts.init(document.getElementById('test2'));

    // Shared error toast for both chart requests.
    function showChartLoadError() {
        messageplugin({ message: "网络错误,请刷新后重试!", type: "error" });
    }

    // Line chart: number of news items per date.
    ajax({
        url: "getDataByTime",
        type: "get",
        success: function (res) {
            const { data } = JSON.parse(res);
            if (data.status == 200) {
                const xlist = [];
                const ylist = [];
                const list = data.list || [];
                for (const item of list) {
                    // Rows without a date cannot be placed on the axis.
                    if (item.date) {
                        xlist.push(item.date.substring(5, 10)); // keep "MM-DD"
                        ylist.push(item.num);
                    }
                }
                const option = {
                    title: {
                        text: '新闻数量随时间的变化图',
                        left: "center",
                        textStyle: { color: '#777' }
                    },
                    tooltip: {},
                    xAxis: { name: "时间", data: xlist },
                    yAxis: { name: "新闻数量" },
                    series: [
                        { name: '新闻数量', type: 'line', data: ylist }
                    ]
                };
                myChart.setOption(option);
            } else {
                showChartLoadError();
            }
        },
        // The original registered no failure handler for this request.
        fail: function (error) {
            showChartLoadError();
        }
    });

    // Pie chart: share of the ten largest news sources.
    ajax({
        url: "getDataBySource",
        type: "get",
        success: function (res) {
            const { data } = JSON.parse(res);
            if (data.status == 200) {
                const list = (data.list || [])
                    .sort((a, b) => b.value - a.value)
                    .slice(0, 10);
                // NOTE(review): the 4th-largest source is relabelled by hand;
                // presumably its stored name is wrong or blank — confirm and
                // fix the data instead. Guarded so fewer than 4 sources no
                // longer throws.
                if (list[3]) {
                    list[3].name = "网易";
                }
                const option2 = {
                    title: {
                        text: '新闻来源数量占比图',
                        left: "center",
                        textStyle: { color: '#777' }
                    },
                    grid: { top: 20, },
                    tooltip: {},
                    series: [
                        { type: 'pie', data: list }
                    ]
                };
                myChart2.setOption(option2);
            } else {
                showChartLoadError();
            }
        },
        fail: function (error) {
            showChartLoadError();
        }
    });
</script>

</html>
后端部分 router.js /getDataByTime /getDataBySource
-
通过select查询语句获取新闻的数量以及日期,同时利用group by 根据日期进行分组,从而实现获取每日的新闻数据信息,设置get请求以及不需要参数即可完成
-
/**
 * Run an aggregate query and send the rows to the client.
 * Success: { data: { list, status: 200 } }; failure: { data: { msg, status: 400 } }.
 *
 * @param {object} res - Express response.
 * @param {string} sql - Parameter-free SELECT statement.
 */
function sendAggregateResult(res, sql) {
    conn.query(sql, (err, result) => {
        if (err) {
            console.log(err);
            res.send({
                "data": {
                    "msg": "请求错误,请检查后重试!",
                    "status": 400
                }
            });
            return;
        }
        res.send({
            "data": {
                "list": result,
                "status": 200
            }
        });
    });
}

// GET /getDataByTime — news count per `date` value, ordered by date.
router.get("/getDataByTime", (req, res) => {
    const sql = "select count(*) as `num`,`date` from `newstable` GROUP BY `date` ORDER BY `date` ";
    sendAggregateResult(res, sql);
});

// GET /getDataBySource — news count per source, as { value, name } rows.
router.get("/getDataBySource", (req, res) => {
    const sql = "select count(*) as `value`,`source` as name from `newstable` GROUP BY `source` ";
    sendAggregateResult(res, sql);
});
3、关键词搜索内容展示
页面效果如下:
前端页面 search.html
-
主要由搜索框input以及表格table组成,在输入框输入信息的前提下点击搜索会将输入框中的信息进行提取并通过ajax请求传递给后端接口获取模糊查询的结果,然后利用createElement动态生成表格内容与数据。同时还设置有分页按钮,其通过搜索的结果数量以每页十条数据为基准进行动态展示。
-
js部分由于分页功能的原因,将绘制表格的功能封装提取为drawList()函数;为了防止每次切换分页时appendChild都在上一次的数据记录之后继续追加,在每次执行时都会先将tbody的innerHTML清空。
-
对于分页功能将ajax请求到的数据存放于全局变量allList与nowList中,前者保存所有数据后者保存当前展示的数据,同时设置paginationIndex记录当前处于的页数。通过获取到的所有数据的长度即可得到要分的页数,通过在生成分页按钮的同时添加对应的点击事件,根据页数的改变切换nowList中的内容,然后调用drawList()对表格进行重绘。
-
根据时间进行排序主要是对全局的nowList进行操作,通过数组的sort方法巧妙的对对象数组中的对象根据某一key进行排序。
-
// Search page state: every hit of the last search, the page currently shown,
// and the zero-based index of that page.
var allList = [], nowList = [];
var paginationIndex;
const PAGE_SIZE = 10;

// Pages other than login require a logged-in user.
let username = sessionStorage.getItem("username");
if (!username) {
    window.open("./login.html", "_self");
}

/** Pin the footer to the bottom of the window when the page is shorter than it. */
function resize() {
    var windowHeight = window.innerHeight;
    var footer = document.getElementById("footer");
    var main = document.getElementById("main");
    var header = document.getElementById("header");
    var bodyHeight = footer.offsetHeight + header.offsetHeight + main.offsetHeight;
    if (windowHeight > bodyHeight) {
        footer.style.position = "absolute";
        footer.style.bottom = "0";
    } else {
        // The original assigned to the misspelled `positon`, so the footer
        // was never released again.
        footer.style.position = "";
        footer.style.bottom = "";
    }
}
window.onload = function () {
    resize();
};

// Logout: show a toast, then clear the session and return to the login page.
let layoutbtn = document.querySelector(".fa-instagram");
layoutbtn.addEventListener("click", function () {
    messageplugin({ message: "退出成功!", type: "success" });
    setTimeout(function () {
        sessionStorage.clear();
        window.open("./login.html", "_self");
    }, 1000);
});

let search_btn = document.querySelector("#search_btn");
let search_content = document.querySelector("#search_content");
let tbody = document.querySelector("#tbody");
let pagination = document.querySelector(".pagination");
let orderByUp = document.querySelector("#orderByUp");
let orderByDown = document.querySelector("#orderByDowm");

/**
 * Render one page of results into the table body.
 *
 * @param {object[]} list - Rows with url, title, source, author, date, content.
 */
function drawList(list) {
    pagination.classList.remove("hidden");
    // Clear the previous page so rows are not appended after it.
    tbody.innerHTML = "";
    for (let i = 0; i < list.length; i++) {
        const obj = list[i];
        const tr = document.createElement("tr");
        tr.addEventListener("click", function () {
            window.open(obj.url, "_blank");
        });

        const th = document.createElement("th");
        th.textContent = paginationIndex * PAGE_SIZE + i + 1;
        tr.appendChild(th);

        // Crawled text is untrusted: insert it as text, never as HTML.
        const cells = [obj.title, obj.source, obj.author, obj.date?.substring(5, 10), obj.content];
        cells.forEach(function (value, column) {
            const td = document.createElement("td");
            if (column === cells.length - 1) {
                td.classList.add("onlyline");
            }
            td.textContent = value ?? "";
            tr.appendChild(td);
        });

        tbody.appendChild(tr);
    }
}

/** Rebuild the page buttons for `allList` and show the first page. */
function drawPagination() {
    pagination.classList.remove("hidden");
    pagination.innerHTML = "";
    const pageCount = Math.ceil(allList.length / PAGE_SIZE);
    paginationIndex = 0;

    for (let i = 0; i < pageCount; i++) {
        const li = document.createElement("li");
        li.classList.add("page-item");
        const div = document.createElement("div");
        div.classList.add("page-link");
        div.innerHTML = i + 1;
        div.addEventListener("click", function () {
            const links = pagination.querySelectorAll(".page-link");
            links[paginationIndex].classList.remove("active");
            paginationIndex = i;
            links[paginationIndex].classList.add("active");
            nowList = allList.slice(i * PAGE_SIZE, i * PAGE_SIZE + PAGE_SIZE);
            drawList(nowList);
        });
        li.appendChild(div);
        pagination.appendChild(li);
    }

    // With no hits there is no button to highlight.
    const firstLink = pagination.querySelector(".page-link");
    if (firstLink) {
        firstLink.classList.add("active");
    }
    nowList = allList.slice(0, PAGE_SIZE);
    drawList(nowList);
}

/**
 * Sort the page currently shown by publication date and redraw it.
 *
 * @param {number} direction - 1 for ascending, -1 for descending.
 */
function sortCurrentPage(direction) {
    if (nowList.length === 0) return;
    const toTime = (item) => (item.date ? new Date(item.date).getTime() : 0);
    nowList.sort((a, b) => direction * (toTime(a) - toTime(b)));
    drawList(nowList);
}

// Registered once at load time; the original re-registered both listeners on
// every search, so each click re-sorted once per previous search.
orderByUp.addEventListener("click", function () {
    sortCurrentPage(1);
});
orderByDown.addEventListener("click", function () {
    sortCurrentPage(-1);
});

// Search: post the keyword to `getLike` and render the paginated result.
search_btn.addEventListener("click", function (e) {
    const event = e || window.event;
    if (event.preventDefault) {
        event.preventDefault();
    } else {
        event.returnValue = false;
    }
    const content = search_content.value;
    if (!content) {
        messageplugin({ message: "请输入要查找的内容!", type: "warning" });
        return;
    }

    ajax({
        url: "getLike",
        type: "POST",
        data: { "key": content },
        dataType: "json",
        success: function (res) {
            const { data } = JSON.parse(res);
            if (data.status == 200) {
                allList = data.list || [];
                drawPagination();
                resize();
                search_content.value = "";
            } else {
                messageplugin({ message: "获取数据失败!", type: "error" });
            }
        },
        fail: function (error) {
            messageplugin({ message: "网络异常,请检查后重试!", type: "error" });
        }
    });
});
> 后端部分 router.js /getLike - 接收post请求同时获取到前端传来的key值,将其带入到sql语句中通过mysql的like进行模糊查询,从而获取到新闻内容中含相关key的新闻信息,并以数据的形式返回。 - ```javascript //like搜索 router.post("/getLike", (req, res) => { const key = req.body.key; const like_sql = "select * from `newstable` where `content` like '%" + key + "%'"; conn.query(like_sql, (err, result) => { if (err) { res.send({ "data": { "msg": "请求错误,请检查后重试!", "status": 400 } }) return } res.send({ "data": { "length": result.length, "list": result, "status": 200 } }) }) })
4、关键词热度分析
页面效果如下: