Node.js 爬虫项目
DDL is coming.
Homework gathers, and now my work begins. It shall not end until all the work is done. I pledge my honor to the curriculum I have been studying, for this night and all the nights to come.
爬虫要求:
1.爬虫代码
- 需要的模块
安装所需的模块包
- npm install request
- npm install cheerio
- npm install iconv-lite
- npm install date-utils
- mysql.js是老师给的
p.s:写的爬虫文件一定要和这些模块在一个文件夹下,我已经吃过好几次这个苦头了-_-
var fs = require('fs');
var myRequest = require('request');
var myCheerio = require('cheerio');
var myIconv = require('iconv-lite');
require('date-utils');
var mysql = require('./mysql.js');
- 要爬取的网站 (看看新闻网)
// Crawl target: display name of the source, site root, page encoding and seed URL.
var source_name = '看看新闻网',
    domain = "http://www.kankanews.com/",
    myEncoding = 'utf-8',
    seedURL = domain; // the seed URL is the same address as the site root
3.想要爬取出来的内容的格式
// Extraction rules for the target site.
// Each *_format value is a cheerio expression stored as source text; the seed-page
// reader runs seedURL_format through eval() with `$` bound to the loaded page.
// NOTE(review): the remaining *_format strings are presumably evaluated the same
// way on article pages - confirm against the code that consumes them.
var seedURL_format = "$('a')";
var keywords_format = " $('meta[name=\"Keywords\"]').eq(0).attr(\"content\")";
var title_format = "$('title').text()";
var date_format = "$('.time').text()";
var author_format = "$('.resource').text()";
var content_format = "$('.textBody').text()";
var desc_format = " $('meta[name=\"Description\"]').eq(0).attr(\"content\")";
var source_format = "$('.resource').text()";
// Article links look like a/YYYY-MM-DD/<10 digits>.shtml. The dot is escaped so
// that only a literal ".shtml" suffix matches (an unescaped "." accepts any character).
var url_reg = /a\/(\d{4})-(\d{2})-(\d{2})\/(\d{10})\.shtml/;
// Date matcher: either Y-M-D / Y/M/D / Y.M.D with a 4- or 2-digit year, where \3
// forces the second separator to equal the first, or the Chinese form YYYY年M月D日.
var regExp = /((\d{4}|\d{2})(\-|\/|\.)\d{1,2}\3\d{1,2})|(\d{4}年\d{1,2}月\d{1,2}日)/;
注意:若要修改需要爬取的网站,则以上这段代码的最后一行不能随意复制。
引入概念:正则表达式 Regular Expression
在代码中常简写为regex、regexp或RE。需要到所需爬取的网页看源码,找到表达式的规律。
菜鸟教程介绍正则表达式
4.一个神奇的操作——防止网站屏蔽我们的爬虫
// Request headers sent with every fetch. A desktop-browser User-Agent is used so
// that the target site is less likely to block the crawler.
var headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.65 Safari/537.36'
};
5.request模块异步fetch url
概念:
- 同步:一定要等任务执行完了,得到结果,才执行下一个任务。
- 异步:不等任务执行完,直接执行下一个任务。
/**
 * Fetch a URL through the `request` module using the shared crawler settings.
 *
 * @param {string} url - Address to fetch.
 * @param {Function} callback - Handed to myRequest unchanged; the seed-page
 *     reader passes a function taking (err, res, body).
 */
function request(url, callback) {
    myRequest({
        url: url,
        // null encoding: the body is not decoded here; the seed-page reader
        // decodes it afterwards with iconv-lite using myEncoding.
        encoding: null,
        // To route through a proxy, add e.g. proxy: 'http://x.x.x.x:8080'
        headers: headers,
        timeout: 10000 // presumably milliseconds - see the request module docs
    }, callback);
}
6.读取种子页面的函数实现
seedget();
function seedget() {
request(seedURL, function(err, res, body) {
//读取种子页面
// try {
//用iconv转换编码
var html = myIconv.decode(body, myEncoding);
//console.log(html);
//准备用cheerio解析html
var $ = myCheerio.load(html, {
decodeEntities: true });
// } catch (e) { console.log('读种子页面并转码出错:' + e) };
var seedurl_news;
try {
seedurl_news = eval(seedURL_format);
} catch (e) {
console.log('url列表所处的html块识别出错:' + e) };
seedurl_news.each(function(i, e) {
//遍历种子页面里所有的a链接
var myURL = "";
try {
//得到具体新闻url
var href = "";
href = $(e).attr("href");
if (href == undefined) return;
if ((href.toLowerCase().indexOf('http://') >= 0) || (href.toLowerCase().indexOf('https://') >= 0)) myURL = href; //http://开头的
else if (href.startsWith('//')) myURL = 'http:' + href; // "//"开头的
else myURL = seedURL.substr(0, seedURL.lastIndexOf('/') + 1) + href; //其他
} catch (e) {
console.log('识别种子页面中的新闻链接出错:' + e) }
if (!url_reg.test(myURL)) return