package.json:
{
"name": "crawler",
"version": "0.0.0",
"private": true,
"scripts": {
"start": "node ./bin/www"
},
"dependencies": {
"express": "~4.9.0",
"body-parser": "~1.8.1",
"cookie-parser": "~1.3.3",
"morgan": "~1.3.0",
"serve-favicon": "~2.1.3",
"debug": "~2.0.0",
"ejs": "~0.8.5",
"cheerio": "~0.18.0",
"request": "~2.51.0"
}
}
关于cheerio
这里准备爬取 https://cnodejs.org/ 首页的帖子列表。
要爬取的是页面中如下所示的帖子标题链接(class 为 topic_title 的 a 标签):
<a class="topic_title" href="/topic/5493e6c59b158a790e21dc5e" title="新手请教node.js不断回调会吃光内存吗?">
新手请教node.js不断回调会吃光内存吗?
</a>
routes/index.js
var express = require('express');
var router = express.Router();
var request = require('request');
var cheerio = require('cheerio')
/**
 * GET home page.
 *
 * Fetches https://cnodejs.org/, collects every `a.topic_title` anchor on the
 * page, and responds with an array of `{ title, href }` objects taken from
 * each anchor's `title` and `href` attributes.
 *
 * If the upstream request fails, the error is logged and a 502 response is
 * sent instead of attempting to parse a missing body.
 */
router.get('/', function (req, res) {
  request.get({
    url: 'https://cnodejs.org/'
  }, function (err, response, body) {
    if (err) {
      console.log(err);
      // Stop here: `body` is not usable when the request itself failed, and
      // the client still needs a response.
      return res.status(502).send({ error: 'Failed to fetch https://cnodejs.org/' });
    }

    var $ = cheerio.load(body);
    var items = [];

    $('a.topic_title').each(function (idx, element) {
      var $element = $(element);
      items.push({
        title: $element.attr('title'),
        href: $element.attr('href')
      });
    });

    res.send(items);
  });
});
module.exports = router;
启动应用后访问首页,看看爬取到的帖子标题和链接。