一个简单的 Node 爬虫示例。这里用到了 cheerio 模块,它能够很方便地操作 DOM;本例读取的是百度搜索结果页的内容。
test1.js
const http = require('http');
const cheerio = require('cheerio');
const express = require('express');
const path = require('path');
// Express application instance shared by all routes below.
const app = express();

// Expose every file located next to this script as a static asset.
const staticRoot = path.join(__dirname);
app.use(express.static(staticRoot));

// GET / : answers with a fixed JSON payload.
app.get('/', (req, res, next) => {
  const payload = { name: 'test' };
  res.json(payload);
});

// GET /getData : forwards the query-string parameters to getData() and
// replies with whatever data it hands to the callback.
app.get('/getData', (req, res) => {
  const params = req.query || {};
  console.log('params', params);

  const respond = (data) => {
    const payload = {
      ok: true,
      name: 'test',
      data: data
    };
    res.json(payload);
  };

  getData(params, respond);
});

// Start listening on port 9999 and log once the server is up.
app.listen('9999', () => {
  console.log('服务器启动 监听9999端口');
});
/**
 * Download a page over HTTP and extract search-result data from it with cheerio.
 *
 * The extracted data is delivered asynchronously through `callback`. The
 * synchronous return value is always the initial empty object; it is kept only
 * so that existing callers relying on a return value keep working.
 *
 * @param {Object} params - Request options. `params.url`, when set, replaces
 *   the built-in Baidu search URL.
 * @param {Function} [callback] - Called at most once. On success it receives
 *   the extracted data (`{ left, right, img }`, or `undefined` when the
 *   response body was empty). On failure it receives an empty object as the
 *   first argument and the Error as the second.
 * @returns {Object} Always an empty object.
 */
function getData(params, callback) {
    const defaultUrl = 'http://www.baidu.com/s?wd=word&rsv_spt=1&rsv_iqid=0xde0f6c3c000402ed&issp=1&f=8&rsv_bp=1&rsv_idx=2&ie=utf-8&rqlang=cn&tn=baiduhome_pg&rsv_enter=1&oq=aa&inputT=547&rsv_t=94d3Z7tMbG%2BpuEnOPrHr1UamFC8HB4%2FDItvkDQxqcgl9VqrcHCHrYr1wf1iLMCQxg5II&rsv_pq=f47fcffb00042e7a&rsv_sug3=7&rsv_sug1=6&rsv_sug7=100&rsv_sug2=0&rsv_sug4=547&rsv_sug=2';
    const url = params.url || defaultUrl;
    const listData = {};
    let finished = false;

    /* Report the outcome to the caller, guaranteeing a single invocation. */
    function finish(data, err) {
        if (finished) {
            return;
        }
        finished = true;
        if (!callback) {
            return;
        }
        if (err) {
            callback(data, err);
        } else {
            callback(data);
        }
    }

    /* Report a failed request: log it and hand the caller an empty result. */
    function fail(err) {
        console.log('获取数据出错', err);
        finish({}, err);
    }

    /* Strip carriage returns, line feeds and spaces; non-strings become ''. */
    function filterSpecialChars(str) {
        if (typeof str !== 'string') {
            return '';
        }
        return str.replace(/[\r\n ]/g, '');
    }

    /* Resolve a possibly relative link against the requested page URL. */
    function resolveLink(href) {
        if (!href) {
            return '';
        }
        try {
            return new URL(href, url).href;
        } catch (err) {
            return '';
        }
    }

    /* Extract the result lists and the logo image from the page markup. */
    function filter(html) {
        if (!html) {
            console.log('无数据传入!');
            return undefined;
        }
        // Follow the jQuery convention and name the cheerio root `$`.
        const $ = cheerio.load(html);
        // Locate the left and right result columns by id.
        const listLeft = $('#content_left');
        const listRight = $('#content_right');
        const result = {
            left: [],
            right: []
        };
        listLeft.find('.c-container').each(function (i, item) {
            // Only the first three entries are kept.
            if (i >= 3) {
                return;
            }
            const $item = $(this);
            const text = $item.find('.t a').text();
            const src = $item.find('.t a').attr('href');
            result.left.push({
                text: text ? text.replace(/\r\n/g, '') : '',
                src: filterSpecialChars(src)
            });
        });
        listRight.find('.FYB_RD tbody').eq(0).find('tr').each(function (i, item) {
            // Only the first three rows are kept.
            if (i >= 3) {
                return;
            }
            const $item = $(this);
            const index = $item.find('span .c-index').text();
            const desc = $item.find('span a').text();
            const href = $item.find('span a').attr('href');
            result.right.push({
                index: index,
                desc: desc,
                src: filterSpecialChars(resolveLink(href))
            });
        });
        result['img'] = filterSpecialChars($('#lg').find('img').attr('src'));
        return result;
    }

    let request;
    try {
        request = http.get(url, function (res) {
            let html = '';
            // Decode as UTF-8 up front so multi-byte characters split across
            // chunks are not corrupted by per-chunk string conversion.
            res.setEncoding('utf8');
            res.on('data', function (chunk) {
                html += chunk;
            });
            // Whole body received: parse it and report the result.
            res.on('end', function () {
                let parsed;
                try {
                    parsed = filter(html);
                } catch (err) {
                    fail(err);
                    return;
                }
                finish(parsed);
            });
        });
    } catch (err) {
        // http.get throws synchronously for malformed or non-http URLs.
        fail(err);
        return listData;
    }
    request.on('error', fail);
    return listData;
}
index.html
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
</head>
<body>
test
<script src="jquery-3.0.0.min.js"></script>
<script>
    // Request the scraped data from the local server and log the response.
    function getData() {
        var request = {
            url: 'http://127.0.0.1:9999/getData',
            data: {},
            success: function (data) {
                console.log('data', data);
            }
        };
        $.ajax(request);
    }

    // Fire the request as soon as the script is evaluated.
    getData();
</script>
</body>
</html>
文件目录是: