先去了解了解cheerio和request模块
cheerio: https://www.npmjs.com/package/cheerio
request: https://www.npmjs.com/package/request
我用vue的webpack模板习惯了，所以还是在这个模板项目里用npm安装这两个包
在src下新建splider.js文件用来写脚本代码
新建data目录用来存放爬到的数据
新建images文件夹用来存放图片
爬取的是北大软件与微电子学院新闻
1.引入依赖
const http = require('http')
const fs = require('fs')
const cheerio = require('cheerio')
const request = require('request')
// Running count of pages handled so far; startRequest increments it once per
// page and compares it against its page limit.
let i = 0;
// Initial URL: the article page the crawl starts from.
const url = "http://www.ss.pku.edu.cn/index.php/newscenter/news/2391";
// Kick off the crawl from the initial URL.
startRequest(url)
/**
 * Fetches one news article page, logs its metadata, saves its text and
 * images, and then follows the page's "next article" link.
 *
 * Uses the module-level counter `i`: it is incremented once per fetched page,
 * and the next link is only followed while `i <= 500`. The crawl also stops
 * when the page has no "next" link.
 *
 * @param {string} x - Absolute URL of the article page to fetch.
 */
function startRequest(x) {
  http.get(x, function (res) {
    let html = '';
    res.setEncoding('utf-8');

    // Accumulate the response body chunk by chunk.
    res.on('data', function (chunk) {
      html += chunk;
    });

    res.on('end', function () {
      const $ = cheerio.load(html);
      const titleLink = $('div.article-title a');
      const news_title = titleLink.text().trim();
      const time = $('.article-info a:first-child').next().text().trim();

      const new_item = {
        title: news_title,
        Time: time,
        link: "http://www.ss.pku.edu.cn" + titleLink.attr('href'),
        author: $('[title=供稿]').text().trim(),
        i: i++
      };
      console.log(new_item);

      savedContent($, news_title);
      savedImg($, news_title);

      // Without a "next" link there is nothing left to crawl; concatenating
      // an undefined href would otherwise yield the bogus URL "...cnundefined".
      const nextHref = $('li.next a').attr('href');
      if (!nextHref) {
        return;
      }

      // Keep only the part of the link before the first '-' and percent-encode it.
      const nextLink = 'http://www.ss.pku.edu.cn' + nextHref;
      const str = encodeURI(nextLink.split('-')[0]);

      if (i <= 500) {
        // Recurse into this function; the previously called `fetchPage`
        // is not defined in this file.
        startRequest(str);
      }
    });
  }).on('error', function (err) {
    console.log(err);
  });
}
/**
 * Appends the article's body text to `./data/<news_title>.txt`.
 *
 * Only paragraphs whose first two characters are whitespace (or that are
 * empty) are kept; each kept paragraph is followed by a newline. All kept
 * paragraphs are written with a single append so they land in document
 * order. Issuing one asynchronous append per paragraph gives no ordering
 * guarantee between the writes.
 *
 * @param {Function} $ - Cheerio instance loaded with the article page.
 * @param {string} news_title - Article title, used as the output file name.
 */
function savedContent($, news_title) {
  const paragraphs = [];

  $('.article-content p').each(function (index, item) {
    const text = $(item).text();
    // Keep the paragraph only when its first two characters are blank.
    if (text.substring(0, 2).trim() === '') {
      paragraphs.push(text + '\n');
    }
  });

  // Nothing matched: do not create an empty file.
  if (paragraphs.length === 0) {
    return;
  }

  fs.appendFile('./data/' + news_title + '.txt', paragraphs.join(''), 'utf-8', function (err) {
    if (err) {
      console.log(err);
    }
  });
}
/**
 * Downloads every image in the article body into
 * `./images/<news_title>---<caption>.jpg`.
 *
 * The caption is the trimmed text of the element following the image's
 * parent; when it is empty or longer than 35 characters, "Null" is used
 * instead. Images without a `src` attribute are skipped. Download and
 * file-write errors are logged rather than left to crash the process as
 * unhandled stream 'error' events.
 *
 * @param {Function} $ - Cheerio instance loaded with the article page.
 * @param {string} news_title - Article title, used as the file name prefix.
 */
function savedImg($, news_title) {
  $('.article-content img').each(function (index, item) {
    const img = $(item);

    // No src means nothing to download; concatenating it would request
    // the bogus URL "...cnundefined".
    const src = img.attr('src');
    if (!src) {
      return;
    }

    let img_title = img.parent().next().text().trim();
    if (img_title.length > 35 || img_title === '') {
      img_title = 'Null';
    }

    const img_filename = img_title + '.jpg';
    const img_src = 'http://www.ss.pku.edu.cn' + src;
    const target = './images/' + news_title + '---' + img_filename;

    // Stream the image straight to disk. The former HEAD request was dropped:
    // its response was never used, so it only doubled the network traffic.
    request(img_src)
      .on('error', function (err) {
        console.log(err);
      })
      .pipe(fs.createWriteStream(target))
      .on('error', function (err) {
        console.log(err);
      });
  });
}