第一次爬虫大作业
一.基础爬虫代码(老师的)
/* Crawler configuration for chinanews.com (teacher's sample code). */
var source_name = "中国新闻网";
var domain = 'http://www.chinanews.com/';
var myEncoding = "utf-8";
var seedURL = 'http://www.chinanews.com/';

// Cheerio expressions, evaluated later via eval(), that extract each field
// from an article page.
var seedURL_format = "$('a')";
var keywords_format = " $('meta[name=\"keywords\"]').eq(0).attr(\"content\")";
var title_format = "$('title').text()";
var date_format = "$('#pubtime_baidu').text()";
var author_format = "$('#editor_baidu').text()";
var content_format = "$('.left_zw').text()";
var desc_format = " $('meta[name=\"description\"]').eq(0).attr(\"content\")";
var source_format = "$('#source_baidu').text()";

// Article URLs look like /2021/04-25/1234567.shtml.
// FIX: escape the dot before "shtml" — unescaped it matched any character
// (e.g. "1234567Xshtml"), letting non-article URLs through.
var url_reg = /\/(\d{4})\/(\d{2})-(\d{2})\/(\d{7})\.shtml/;
// Matches dates such as 2021-04-25, 21/4/25 or 2021年4月25日.
var regExp = /((\d{4}|\d{2})(\-|\/|\.)\d{1,2}\3\d{1,2})|(\d{4}年\d{1,2}月\d{1,2}日)/

var fs = require('fs');
var myRequest = require('request')
var myCheerio = require('cheerio')
var myIconv = require('iconv-lite')
require('date-utils');

// Browser-like User-Agent so the site does not block the crawler.
var headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.65 Safari/537.36'
}
//request模块异步fetch url
// Fetch `url` asynchronously through the request module and hand the
// response to `callback(err, res, body)`. `encoding: null` keeps the body
// as a raw Buffer so iconv-lite can decode it later.
function request(url, callback) {
myRequest({
url: url,
encoding: null,
//proxy: 'http://x.x.x.x:8080',
headers: headers,
timeout: 10000
}, callback);
}
// Crawl the seed page, collect every candidate link, and fetch the ones
// whose URL matches the article pattern.
request(seedURL, function(err, res, body) {
// FIX: bail out on a network error instead of decoding an undefined body.
if (err || !body) {
console.log('种子页面请求失败:' + err);
return;
}
var html = myIconv.decode(body, myEncoding);
var $ = myCheerio.load(html, { decodeEntities: true });
var seedurl_news;
try {
seedurl_news = eval(seedURL_format);
//console.log(seedurl_news);
} catch (e) { console.log('url列表所处的html块识别出错:' + e) };
seedurl_news.each(function(i, e) {
var myURL = "";
try {
var href = $(e).attr("href");
// FIX: anchors without an href used to throw on href.toLowerCase().
if (!href) return;
if (href.toLowerCase().indexOf('http://') >= 0) myURL = href;
else if (href.startsWith('//')) myURL = 'http:' + href;
else myURL = seedURL.substr(0, seedURL.lastIndexOf('/') + 1) + href;
} catch (e) { console.log('识别种子页面中的新闻链接出错:' + e) }
if (!url_reg.test(myURL)) return; // keep only article-shaped URLs
newsGet(myURL);
});
});
// Download one article page, extract the configured fields via eval()'d
// cheerio expressions, and save the record to
// <source>_<today>_<article-id>.json.
function newsGet(myURL) {
request(myURL, function(err, res, body) {
// FIX: a failed request used to crash myIconv.decode on undefined.
if (err || !body) {
console.log('新闻页面请求失败:' + myURL);
return;
}
var html_news = myIconv.decode(body, myEncoding);
var $ = myCheerio.load(html_news, { decodeEntities: true });
var myhtml = html_news; // FIX: was an implicit global
console.log("转码读取成功:" + myURL);

var fetch = {};
fetch.title = "";
fetch.content = "";
fetch.publish_date = (new Date()).toFormat("YYYY-MM-DD");
fetch.url = myURL;
fetch.source_name = source_name;
fetch.source_encoding = myEncoding;
fetch.crawltime = new Date();

if (keywords_format == "") fetch.keywords = source_name; // no keywords: fall back to source name
else fetch.keywords = eval(keywords_format);
if (title_format == "") fetch.title = ""
else fetch.title = eval(title_format);
if (date_format != "") fetch.publish_date = eval(date_format);
console.log('date: ' + fetch.publish_date);

// FIX: regExp.exec() returns null when no date is found; the old code
// dereferenced [0] unconditionally and crashed. Fall back to today.
var dateMatch = regExp.exec(fetch.publish_date);
if (dateMatch) {
fetch.publish_date = dateMatch[0];
fetch.publish_date = fetch.publish_date.replace('年', '-')
fetch.publish_date = fetch.publish_date.replace('月', '-')
fetch.publish_date = fetch.publish_date.replace('日', '')
fetch.publish_date = new Date(fetch.publish_date).toFormat("YYYY-MM-DD");
} else {
fetch.publish_date = (new Date()).toFormat("YYYY-MM-DD");
}

if (author_format == "") fetch.author = source_name;
else fetch.author = eval(author_format);
if (content_format == "") fetch.content = "";
else fetch.content = eval(content_format).replace("\r\n" + fetch.author, "");
if (source_format == "") fetch.source = fetch.source_name;
else fetch.source = eval(source_format).replace("\r\n", "");
if (desc_format == "") fetch.desc = fetch.title;
else fetch.desc = eval(desc_format).replace("\r\n", "");

var filename = source_name + "_" + (new Date()).toFormat("YYYY-MM-DD") +
"_" + myURL.substr(myURL.lastIndexOf('/') + 1) + ".json";
fs.writeFileSync(filename, JSON.stringify(fetch));
});
}
二.爬虫代码
不得不说这个爬虫代码非常折磨人。我最初爬的是新浪竞技风暴:老师的代码有点长且比较复杂,而我刚好在B站上看了一个视频,里面的代码很简单,于是就用了下面这段代码。这段代码爬第一页时很成功,但是第一页里面只有七十条数据,不满足Web编程作业的要求,后来我又改回了老师的代码。
下面是b站代码:
var https = require("https"); // https module (the target site is https-only)
var cheerio = require("cheerio");
var fs = require("fs");
var uri = "https://sports.sina.com.cn/nba/1.shtml";

// Minimal https fetcher: buffers the whole response body as a string and
// passes it to `cb` when the response ends.
function httpsGet(uri, cb) {
var html = "";
https.get(uri, function(res) {
res.on("data", function(chunk) {
html += chunk; // accumulate body chunks
});
res.on("end", function() {
cb(html);
});
}).on("error", function(e) {
console.log(e.message);
});
// FIX: the old `return html` always returned "" because the request is
// asynchronous; the result is only ever delivered through `cb`.
}
// Crawl the seed list page, then fetch every linked article and dump its
// body text to a numbered .txt file.
httpsGet(uri, function(html) {
var $ = cheerio.load(html);
$("#S_Cont_11 a").each(function(index) {
var newsUri = $(this).attr("href");
// FIX: skip anchors without an href, and normalise protocol-relative
// ("//...") and http:// links so https.get() can fetch them — this is
// the http/https mismatch the original ran into.
if (!newsUri) return;
if (newsUri.indexOf('//') === 0) newsUri = 'https:' + newsUri;
else if (newsUri.indexOf('http://') === 0) newsUri = 'https://' + newsUri.substr(7);
httpsGet(newsUri, function(body) {
var jq = cheerio.load(body);
fs.writeFile('./.vscode/内容文档/' + index + '.txt', jq("#artibody").text(), function(err) {
if (err) {
// FIX: err.messsage was a typo and always logged undefined
return console.log(err.message);
}
console.log("完成");
});
});
});
});
这个只爬了新闻的内容部分,还没来得及改成标题,时间,作者,内容的形式就发现它不行了(捂脸哭)。
当我爬第二页时:
是http和https的缘故吗?但我改完是这样:
我又和B站视频对照了一下:那位老师用 "#right a" 爬了全部6页的内容,而我用 "#S_Cont_11 a" 只能爬到一页。
令人困惑,这是为什么呢?
下面是在老师给出代码基础上改的
先
// Quick smoke test: fetch the seed page once and print the raw HTML so the
// page structure can be inspected.
var myRequest = require('request')
var myCheerio = require('cheerio')
var myURL = 'https://sports.sina.com.cn/nba/1.shtml'

// Thin wrapper around the request module (raw Buffer body, no headers).
function request(url, callback) {
var options = { url: url, encoding: null, headers: null };
myRequest(options, callback);
}

request(myURL, function (err, res, body) {
var page = myCheerio.load(body, { decodeEntities: false });
console.log(page.html());
})
得出:
完整代码:
// Crawler configuration for 新浪竞技风暴 (Sina sports).
var myRequest = require('request');
var myIconv = require('iconv-lite');
var myCheerio = require('cheerio');
var mysql = require('./mysql.js'); // project-local MySQL helper

var source_name = "新浪竞技风暴";
var domain = 'https://sports.sina.com.cn/nba/1.shtml';
var seedURL = 'https://sports.sina.com.cn/nba/1.shtml';
// FIX: myEncoding was declared twice with the same value; keep one.
var myEncoding = "utf-8";
// Browser-like User-Agent so the site does not block the crawler.
var headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.65 Safari/537.36'
}

// Fetch `url` through the request module and pass the raw Buffer body to
// `callback(err, res, body)`; character decoding is left to iconv-lite.
function request(url, callback) {
myRequest({
url: url,
encoding: null,
//proxy: 'http://x.x.x.x:8080',
headers: headers,
timeout: 10000 //
}, callback);
};
seedget();

// Fetch the seed list page, extract every news link under #right, and crawl
// each article with newsget().
function seedget() {
request(seedURL, function(err, res, body) {
// FIX: guard network failures before decoding an undefined body.
if (err || !body) {
console.log('种子页面请求失败:' + err);
return;
}
var html = myIconv.decode(body, myEncoding);
var $ = myCheerio.load(html, { decodeEntities: true });
var seedurl_news;
try {
// FIX: select the <a> elements inside #right — the old code selected
// the #right container itself (so attr("href") was undefined) and
// wrapped the cheerio call in a pointless eval().
seedurl_news = $("#right a");
} catch (e) { console.log('url列表所处的html块识别出错:' + e) };
seedurl_news.each(function(i, e) {
var myURL = "";
try {
var href = $(e).attr("href");
if (!href) return; // anchor without a link
console.log(href);
// FIX: resolve the href properly — appending it to the full seed
// page URL ('.../1.shtml' + href) produced invalid addresses.
if (href.startsWith('http')) myURL = href;
else if (href.startsWith('//')) myURL = 'https:' + href;
else myURL = seedURL.substr(0, seedURL.lastIndexOf('/') + 1) + href;
} catch (e) { console.log('识别种子页面中的新闻链接出错:' + e) }
if (myURL === "") return;
newsget(myURL)
});
});
}
// Download one article page, extract its title and meta description, and
// insert the record into the MySQL `fetches` table (parameterised query).
function newsget(myURL) {
request(myURL, function(err, res, body) {
// FIX: guard network failures before decoding an undefined body.
if (err || !body) {
console.log('新闻页面请求失败:' + myURL);
return;
}
var html_news = myIconv.decode(body, myEncoding);
var $ = myCheerio.load(html_news, { decodeEntities: true });
console.log("转码读取成功:" + myURL);

var fetch = {};
fetch.url = myURL;
fetch.title = $("title").text();
// the meta description doubles as the article "content" here
fetch.content = $('meta[name="description"]').attr("content");

var fetchadd = 'INSERT INTO fetches(url,title,content )VALUES(?,?,?)';
var fetchadd_params = [fetch.url, fetch.title, fetch.content];
mysql.query(fetchadd, fetchadd_params, function(qerr, vals, fields) {
if (qerr) console.log('入库出错:' + qerr); // FIX: errors were silently dropped
});
});
}
还有mysql.js等,npm install ***等
三.运行并存入数据库中及.txt文件
建表是在命令提示符里
txt格式是这样的:
四.前端,后端——用网页发送请求到后端查询
前端示例:
<!DOCTYPE html>
<html>
<body>
<!-- Simple search form: submits the title keyword as a GET parameter to the
     Node backend listening on 127.0.0.1:8080. -->
<form action="http://127.0.0.1:8080/7.02.html" method="GET">
<br> 标题:<input type="text" name="title">
<input type="submit" value="Submit">
</form>
<script>
</script>
</body>
</html>
后端示例:
// Minimal query backend: serves static files, and when a ?title= parameter
// is present, looks up matching rows in the `fetches` table (results are
// logged to the console, matching the original behaviour).
var http = require('http');
var fs = require('fs');
var url = require('url');
var mysql = require('./mysql.js');

http.createServer(function(request, response) {
var pathname = url.parse(request.url).pathname;
var params = url.parse(request.url, true).query;
fs.readFile(pathname.substr(1), function(err, data) {
response.writeHead(200, { 'Content-Type': 'text/html; charset=utf-8' });
if ((params.title === undefined) && (data !== undefined))
response.write(data.toString()); // no query: serve the requested file
else {
response.write(JSON.stringify(params));
// FIX: the title used to be concatenated straight into the SQL string
// (SQL injection); use a ? placeholder like the crawler's INSERT does.
var select_Sql = "select title,author,publish_date from fetches where title like ?";
mysql.query(select_Sql, ['%' + params.title + '%'], function(qerr, vals, fields) {
console.log(vals);
});
}
response.end();
});
}).listen(8080);
console.log('Server running at http://127.0.0.1:8080/');
五.总结
我遇到的问题:
1.刚开始用B站代码爬新浪竞技风暴时只能爬第一页(应该是http和https的问题吧?),共70条数据;后来改成其他新闻网站也不太成功。其实我觉得新浪竞技风暴(哈登·休斯顿火箭_NBA|NBA直播|新浪竞技风暴_新浪网 (sina.com.cn))挺好用的,里面基本没什么图文。
2.后来用老师的,主要是mysql这里问题比较多,npm install,建表等让我很心累。
3.就是用网页发送请求到后端查询,html这些,虽然老师给了示例,我还是搞了很长时间。