作业要求
第一部分:
爬虫源码
爬取的网站是:人/民/网
爬取的数据:标题,发布时间,作者,关键词
1.首先引入模块
最先开始看老师的代码什么也看不懂,各种模块怎么用也不清楚,然后就疯狂百度,然后在W3school上学习到头晕目眩终于能看懂了
database名称:renming(前后鼻音不分)
table名称:fetches
var mysql = require('./mysql'),//database:renming table:fetches
fs = require('fs'),
myRequest = require('request'),
myCheerio = require('cheerio'),
myIconv = require('iconv-lite');
require('date-utils');
2.一些需要爬取的信息
最先开始使用的是utf-8编码爬取,出现了问题,询问了老师以后发现这个网页使用的是gbk,修改之后就正常啦!
分析网页发现信息格式很统一 :
非常明了 直接分析就能写出代码
全部是meta name形式 这里我只爬取了 title keywords(有的有有的没有) publishdate author description 还有source来源
这个地方唯一卡住我的就是正则表达式的写法, 我搜索了一下发现网上的看起来很复杂没有示例代码那么简洁,然后就仔细学习观察了老师的写法,然后灵光乍现仿照着修改了一下 尝试了一下就爬取成功了。
// Crawl-target configuration for people.com.cn (the site is gbk-encoded).
var source_name = '人民网',
    domain = 'http://www.people.com.cn/',
    myEncoding = 'gbk',
    seedURL = 'http://www.people.com.cn/';
// cheerio selector strings, later executed with eval() against the loaded page.
// On this site every field sits in a <meta name="..."> tag, so the selectors
// are all of the same shape.
var seedURL_format = " $('a')",
    keywords_format = "$('meta[name=\"keywords\"]').eq(0).attr(\"content\")",
    title_format = "$('title').eq(0).text()",
    date_format = "$('meta[name=\"publishdate\"]').eq(0).attr(\"content\")",
    author_format = "$('meta[name=\"author\"]').eq(0).attr(\"content\")",
    content_format = "$('.box_con').text()",
    desc_format = "$('meta[name=\"description\"]').eq(0).attr(\"content\")",
    source_format = "$('meta[name=\"source\"]').eq(0).attr(\"content\")",
    // News URLs look like /n1/2021/0415/c1001-32078177.html.
    // FIX: the '.' before "html" is now escaped — previously it matched any
    // character, letting non-.html URLs slip through.
    url_reg = /\/(\w{2})\/(\d{4})\/(\d{4})\/(\w{5,10}-\w{8,15})\.html/;
// Pulls a YYYY-MM-DD date out of the publishdate meta content.
var regExp = /(\d{4}-\d{2}-\d{2})/;
(太难了!而且最先开始犯了很多拼写错误,检查的时候对自己好无语……)
3.防止网站屏蔽爬虫的代码
这个地方我也是在网上学习的:网络爬虫--防止爬虫被屏蔽的集中方式的总结_after_you的博客-CSDN博客
// Request headers used to look like a normal desktop browser so the site
// doesn't block the crawler.
// FIX: the User-Agent had typos ("Moziilla", "Inyel") that made it an
// obviously fake UA string, defeating its purpose.
var headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36'
}
4.正式的爬取部分
包括对网页的分析,fetches表格创建 以及写入数据库
这里本来我是一窍不通的,对于很多语句都不熟悉。然后发现老师上课提起的jQuery,并且在一次课上的代码也有提示,就去学习了jQuery
看这美丽的收藏夹
// Thin wrapper around the `request` module: applies the shared anti-blocking
// headers, keeps the body as a raw Buffer (encoding: null) so iconv-lite can
// decode the site's gbk bytes later, and gives up after 10 seconds.
function request(url, callback) {
    myRequest({
        url: url,
        encoding: null, // raw Buffer — decoded by iconv-lite downstream
        headers: headers,
        timeout: 10000
    }, callback);
}
seedget();

// Crawl entry point: download the seed page, collect every <a> link that
// matches the news-URL pattern, skip URLs already stored in `fetches`,
// and hand the rest to newsGet().
function seedget() {
    request(seedURL, function(err, res, body) { // fetch the seed page
        var html, $;
        try {
            // convert the raw gbk bytes with iconv
            html = myIconv.decode(body, myEncoding);
            // parse the html with cheerio
            $ = myCheerio.load(html, { decodeEntities: true });
        } catch (e) {
            console.log('读种子页面并转码出错:' + e);
            return; // FIX: nothing to parse — bail out instead of crashing below
        }
        var seedurl_news;
        try {
            // seedURL_format is a selector string ("$('a')") executed via eval
            seedurl_news = eval(seedURL_format);
        } catch (e) {
            console.log('url列表所处的html块识别出错:' + e);
            return; // FIX: seedurl_news would be undefined — .each would throw
        }
        seedurl_news.each(function(i, e) { // every <a> link on the seed page
            var myURL = "";
            try {
                var href = $(e).attr("href");
                if (href == undefined) return;
                if (href.toLowerCase().indexOf('http://') >= 0) myURL = href; // absolute http:// link
                // FIX: the comment marker was lost here, leaving the bare token
                // `开头的` as a statement — a ReferenceError on every link.
                else if (href.startsWith('//')) myURL = 'http:' + href; // protocol-relative ('//...') link
                else myURL = seedURL.substr(0, seedURL.lastIndexOf('/') + 1) + href; // relative link
            } catch (e) { console.log('识别种子页面中的新闻链接出错:' + e) }
            if (!url_reg.test(myURL)) return; // not a news URL — skip
            // skip URLs we have already crawled (parameterized query)
            var fetch_url_Sql = 'select url from fetches where url=?';
            var fetch_url_Sql_Params = [myURL];
            mysql.query(fetch_url_Sql, fetch_url_Sql_Params, function(qerr, vals, fields) {
                if (vals.length > 0) {
                    console.log('URL duplicate!') // already in the table
                } else newsGet(myURL); // fetch the news page itself
            });
        });
    });
};
// Download one news page, extract title / keywords / author / date / content
// via the eval'd selector strings, and insert one row into `fetches`.
function newsGet(myURL) {
    request(myURL, function(err, res, body) { // fetch the news page
        var html_news, $;
        try {
            html_news = myIconv.decode(body, myEncoding); // gbk -> js string
            $ = myCheerio.load(html_news, { decodeEntities: true });
            // FIX: removed `myhtml = html_news;` — an implicit global whose
            // only use (fetch.html) was commented out.
        } catch (e) {
            console.log('读新闻页面并转码出错:' + e);
            return; // FIX: page unreadable — skip instead of eval'ing undefined $
        }
        console.log("转码读取成功:" + myURL);

        // Build the record; defaults cover pages missing a given <meta> tag.
        var fetch = {};
        fetch.title = "";
        fetch.content = "";
        fetch.publish_date = new Date().toLocaleDateString().split('/').join('-'); // fallback: today
        fetch.url = myURL;
        fetch.source_name = source_name;
        fetch.source_encoding = myEncoding; // encoding of the crawled page
        fetch.crawltime = new Date();

        if (keywords_format == "") fetch.keywords = source_name;
        else fetch.keywords = eval(keywords_format) || source_name; // FIX: meta tag may be absent — don't store undefined
        if (title_format == "") fetch.title = ""
        else fetch.title = eval(title_format); // title
        if (date_format != "") fetch.publish_date = eval(date_format); // publish date
        console.log('date: ' + fetch.publish_date);

        if (fetch.publish_date) {
            // Normalize 中文 dates (2021年4月15日) BEFORE matching, then pull
            // out the YYYY-MM-DD part if present.
            fetch.publish_date = fetch.publish_date.replace('年', '-');
            fetch.publish_date = fetch.publish_date.replace('月', '-');
            fetch.publish_date = fetch.publish_date.replace('日', '');
            // FIX: was regExp.exec(...)[0] with no null check — crashed on any
            // date string that didn't contain YYYY-MM-DD.
            var dateMatch = regExp.exec(fetch.publish_date);
            if (dateMatch) fetch.publish_date = dateMatch[0];
            fetch.publish_date = new Date(fetch.publish_date).toFormat("YYYY-MM-DD");
        }
        if (author_format == "") fetch.author = source_name; // no author meta -> use source name
        else fetch.author = eval(author_format) || source_name; // FIX: don't store undefined

        if (content_format == "") fetch.content = "";
        else {
            var content = eval(content_format);
            if (content) fetch.content = content.replace("\r\n" + fetch.author, ""); // body text
        }
        if (source_format == "") fetch.source = fetch.source_name;
        else {
            var src = eval(source_format);
            if (src) fetch.source = src.replace("\r\n", ""); // source / 来源
        }
        if (desc_format == "") fetch.desc = fetch.title;
        else {
            var desc = eval(desc_format);
            if (desc) fetch.desc = desc.replace("\r\n", ""); // abstract
        }

        // 9 columns, 9 placeholders — parameterized insert.
        var fetchAddSql = 'INSERT INTO fetches(url,source_name,source_encoding,title,' +
            'keywords,author,publish_date,crawltime,content) VALUES(?,?,?,?,?,?,?,?,?)';
        var fetchAddSql_Params = [fetch.url, fetch.source_name, fetch.source_encoding,
            fetch.title, fetch.keywords, fetch.author, fetch.publish_date,
            fetch.crawltime.toFormat("YYYY-MM-DD HH24:MI:SS"), fetch.content
        ];
        mysql.query(fetchAddSql, fetchAddSql_Params, function(qerr, vals, fields) {
            if (qerr) {
                console.log(qerr);
            }
        }); // write to mysql
    });
}
以上就是爬虫代码。
5.数据库爬取成功展示
总共爬取了124条信息
第二部分:在前端对数据库搜索
这里需要构建前端和后端:
前端因为做得好看可以酌情加分
那我学的css不就有用处了吗
1.首先前端网页代码:
<!-- Search front-end: a single form that GETs the title keyword to the
     Express backend's /process_get route (port 3000). -->
<!DOCTYPE html>
<html>
<head>
<style>
/* Centered page over a full-cover background image. */
body {
color:rgb(0, 0, 0);
/* NOTE(review): absolute local path — the background only renders on the
   author's machine; use a relative/static asset path for anyone else. */
background-image: url("C:/Users/melodrama/Pictures/Saved Pictures/1.jpg");
background-repeat: no-repeat;
background-size: cover;
text-align: center;
text-decoration: rgb(0, 0, 0);
font-size: larger;
margin-top: 60px;
margin-bottom: 60px;
}
h1{
font-family:'Times New Roman', Times, serif;
letter-spacing: 3px;
margin-top: 50px;
}
p{
font-size: small;
font-family: 'Lucida Sans', 'Lucida Sans Regular', 'Lucida Grande', 'Lucida Sans Unicode', Geneva, Verdana, sans-serif;
}
</style>
</head>
<body>
<h1>Welcome to the search website!</h1>
<!-- The `title` field name must match req.query.title in the backend. -->
<form action="http://127.0.0.1:3000/process_get" method="GET">
<br> 标题:<input type="text" name="title">
<input type="submit" value="Submit">
<p>Enter keywords you want to search in the database</p>
</form>
<!-- <script>
</script> -->
</body>
</html>
网页展示:
2.后端Javascript代码:
最先开始我写的是这一段:
// First attempt: a plain Node http server. Kept as-is for the write-up —
// it was replaced because of the problems noted inline below.
var http = require('http');
var fs = require('fs');
var url = require('url');
var mysql = require('./mysql.js');
http.createServer(function(request, response) {
var pathname = url.parse(request.url).pathname;
var params = url.parse(request.url, true).query;
// Serve whatever file the path names (e.g. /search.html -> search.html).
fs.readFile(pathname.substr(1), function(err, data) {
// BUG(review): 'gkb' is a typo — it is not a valid charset (intended 'gbk').
response.writeHead(200, { 'Content-Type': 'text/html; charset=gkb' });
if ((params.title === undefined) && (data !== undefined))
response.write(data.toString());
else {
response.write(JSON.stringify(params));
// SECURITY(review): the title is concatenated straight into the SQL —
// SQL-injection risk; use a parameterized query instead.
var select_Sql = "select title,author,publish_date from fetches where title like '%" +
params.title + "%'";
mysql.query(select_Sql, function(qerr, vals, fields) {
// NOTE(review): by the time this async callback fires, response.end()
// below has already run — so results only ever reach the server console,
// never the browser. This is why the Express version replaced it.
console.log(vals);
});
}
response.end();
});
}).listen(3000);
console.log('Server running at http://127.0.0.1:3000/');
但是这一段代码无法在前端显示搜索内容,只能在vscode上面显示 不方便也不符合要求
学习了老师的代码后 我改成了下面这样:
// Final backend: Express serves the search page and a JSON search endpoint.
var express = require('express');
var mysql = require('./mysql.js');
var app = express();

// Serve the front-end page (the file sits next to this script).
app.get('/search.html', function(req, res) {
    res.sendFile(__dirname + "/" + "search.html");
})

// Search endpoint: returns rows whose title contains the submitted keyword.
app.get('/process_get', function(req, res) {
    res.writeHead(200, { 'Content-Type': 'text/html;charset=utf-8' }); // respond in utf-8
    // FIX: the keyword was concatenated into the SQL string (injection risk);
    // it is now passed as a bound parameter, like the crawler's dedup query.
    var fetchSql = 'select url,source_name,title,author,publish_date from fetches where title like ?';
    var fetchSql_Params = ['%' + (req.query.title || '') + '%'];
    mysql.query(fetchSql, fetchSql_Params, function(err, result, fields) {
        if (err) { // FIX: a failed query previously sent `undefined` to the page
            console.log(err);
            res.end('[]');
            return;
        }
        console.log(result);
        res.end(JSON.stringify(result));
    });
})

var server = app.listen(3000, function() {
    console.log("访问地址为http://127.0.0.1:3000/search.html")
})
最后我们来随机搜索一下几个关键词
1.脱//贫
2.国/际
以上就是这次作业内容。