nodejs+request+request-promise+cheerio做个爬虫系统

最新推荐文章于 2022-08-02 18:10:12 发布

程序员-小许

最新推荐文章于 2022-08-02 18:10:12 发布

阅读量1k

点赞数 1

文章标签： request request-promise nodejs cheerio

本文链接：https://blog.csdn.net/m0_37820751/article/details/103086190

版权

request爬取网页数据、cheerio解析网页数据

1.request：是一个流行的 Node.js 第三方 HTTP 请求工具，目前很多网络爬虫应用都在使用它。通过 request 请求 url，返回的 body 里就是整个页面的 html 文档。

2.cheerio：因为node.js中没有DOM操作，但我们要获取返回来的html文档的某个节点下的数据，就需要用到cheerio这个库，它可以像jquery一样操作dom节点，写法也基本一致。

因为需要把爬取的数据存在数据库中，所以要连接数据库，这里用的是mongodb

// Connect to the local MongoDB database "news" and define the model used to
// store crawled articles.
const mongoose = require('mongoose');
mongoose.connect('mongodb://localhost/news');

// Default connection; log connection failures and the first successful open.
const db = mongoose.connection;
db.on('error', console.error.bind(console, 'connection error:'));
db.once('open', function() {
	console.log('连接成功');
});

// Schema is exported by the mongoose module, not by the connection object,
// so it has to be built with mongoose.Schema (db.Schema is undefined).
const newInfoSchema = new mongoose.Schema({
	// title carries a unique index, so an article with an already-stored title cannot be inserted.
	// NOTE(review): dropDups is deprecated in recent MongoDB releases and is likely ignored — confirm whether it can be removed.
	title: {type: String, unique: true, dropDups: true},
	postTime: Number,
	content: String,
	className: String,
	source: String,
	url: String,
}, {collection: 'newInfo'});

// Model bound to the default connection; documents go to the 'newInfo' collection.
const model = db.model('newInfo', newInfoSchema);

对自己想爬取的页面进行爬取。

// Crawl the Sina news sports list page with request, then fetch every linked
// article page and store its title, content and publish time.
// `model` is the mongoose model created by db.model('newInfo', ...) earlier in the file.
// NOTE(review): `request`, `rp` (request-promise) and `cheerio` are not required in the
// visible snippet — presumably they are loaded elsewhere; confirm.
request('http://sports.sina.com.cn/', function(error, response, body) {
	// Without a body there is nothing to parse; report and stop instead of failing inside cheerio.
	if (error || !body) {
		console.log(error || new Error('empty response body'));
		return;
	}
	// body is the HTML of the whole page; cheerio.load returns a jQuery-like
	// object ($) for querying its nodes.
	const $ = cheerio.load(body, {
		decodeEntities: false
	});
	// Anchors in the list that lead to the article detail pages.
	const aDom = $('.ty-card-tt a');
	const aArray = [];
	// Collect the article URLs, skipping anchors that have no href.
	for (let i = 0; i < aDom.length; i++) {
		const href = aDom.eq(i).attr('href');
		if (!href) {
			continue;
		}
		// Resolve against the list page so protocol-relative hrefs ("//host/path")
		// become "http://host/path" and already-absolute hrefs are left intact.
		aArray.push(new URL(href, 'http://sports.sina.com.cn/').href);
	}
	// Fetch every article page. rp is the request-promise client (same usage as
	// request, but promise based), so each iteration keeps its own url and result.
	for (const url of aArray) {
		rp(url).then(body1 => {
			const $1 = cheerio.load(body1, {
				decodeEntities: false
			});
			// Turn the page's "YYYY年MM月DD日 ..." date text into "YYYY-MM-DD ..." so Date can parse it.
			const postTime = $1('.date').text()
				.replace('月', '-')
				.replace('年', '-')
				.replace('日', '');
			// Build the document holding the scraped title, content and publish time.
			const newInfo = new model({
				title: $1('.main-title').text(),
				postTime: new Date(postTime).getTime(),
				content: $1('#artibody').html(),
				className: '体育',
				source: '新浪新闻',
				url: url,
			});
			// Returning the save promise routes save failures (e.g. a duplicate
			// title) to the catch handler below.
			return newInfo.save();
		}).catch(err => {
			console.log(err);
		});
	}
});