Node.js 小爬虫
// Dependencies: Node's built-in HTTP client, cheerio for querying the fetched
// HTML, and bluebird as the Promise implementation.
var http = require('http');
var cheerio = require('cheerio');
// Not read by any function below (getPageasync's parameter shadows it).
var url = "http://www.imooc.com/learn/348";
// Prefix every course id is appended to in order to build the page URL.
var baseurl = "http://www.imooc.com/learn/";
// Ids of the course pages to fetch.
var vediosId = [637, 348, 259, 197, 134, 75];
var Promise = require('bluebird');

/**
 * Extracts the course title, the numeric counter and the chapter/video
 * listing from the HTML of one course page.
 *
 * @param {string} html - Markup of a single course page.
 * @returns {{couseTitle: string, number: number, videos: Array}} Course data;
 *   `videos` holds one `{chapterTitle, videos: [{title, id, adress}]}` entry
 *   per `.chapter` element. `number` is NaN when the counter element is
 *   missing or not numeric.
 */
function filterChapters(html) {
    var $ = cheerio.load(html);
    var courseData = {
        couseTitle: $('.course-infos .path span').text(),
        // The counter is read from the fourth <strong> under .meta-value.
        number: parseInt($('.meta-value strong').eq(3).text().trim(), 10),
        videos: []
    };

    $('.chapter').each(function(chapterIndex, chapterItem) {
        var chapter = $(chapterItem);
        var chapterData = {
            'chapterTitle': chapter.find('strong').text(),
            'videos': []
        };

        chapter.find('li').each(function(videoIndex, videoItem) {
            var video = $(videoItem).find('.J-media-item');
            var adress = video.attr('href');

            // A list item without a media link has no href; skip it instead
            // of throwing a TypeError on `undefined.split`.
            if (typeof adress !== 'string') {
                return;
            }

            chapterData.videos.push({
                'title': video.text(),
                'id': adress.split('video/')[1],
                'adress': adress
            });
        });

        courseData.videos.push(chapterData);
    });

    return courseData;
}

/**
 * Prints every course with its chapters and videos to the console.
 *
 * @param {Array} coursesData - Objects shaped like the return value of
 *   filterChapters.
 */
function printCourseInfo(coursesData) {
    coursesData.forEach(function(courseData) {
        console.log('@@' + courseData.couseTitle + ':' + courseData.number + '人学过!' + '@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n');
        courseData.videos.forEach(function(item) {
            console.log('###' + item.chapterTitle);
            item.videos.forEach(function(t) {
                console.log(' ' + t.title + ' ' + t.id + ' ' + t.adress + '\n');
            });
        });
    });
}

/**
 * Fetches a page over HTTP and resolves with its body as a string.
 *
 * @param {string} url - Address to request.
 * @returns {Promise<string>} Resolves with the full response body once the
 *   response ends; rejects with the request error if the request fails.
 */
function getPageasync(url) {
    return new Promise(function(resolve, reject) {
        console.log(url);
        http.get(url, function(res) {
            var html = '';
            // Decode as UTF-8 in the stream so a multi-byte character split
            // across two chunks is not corrupted by per-chunk concatenation.
            res.setEncoding('utf8');
            res.on('data', function(data) {
                html += data;
            });
            res.on('end', function() {
                resolve(html);
            });
        }).on('error', function(e) {
            reject(e);
            console.log("失败");
        });
    });
}

// Start one request per course id; the promises keep the order of vediosId.
var fecthCourseArray = vediosId.map(function(id) {
    return getPageasync(baseurl + id);
});
// Once every page has been fetched, parse each one, order the courses by
// their `number` field (largest first) and print the result.
Promise.all(fecthCourseArray).then(function(pages) {
    var cousesData = pages.map(function(html) {
        return filterChapters(html);
    });

    // filterChapters yields NaN when the counter cannot be parsed; map that
    // to -1 so such courses sort last and the comparator stays consistent.
    function sortableNumber(course) {
        return isNaN(course.number) ? -1 : course.number;
    }

    // A sort comparator must return a negative/zero/positive number; the
    // previous boolean result (`a.number < b.number`) never signalled
    // "a before b", so the descending order was not guaranteed.
    cousesData.sort(function(a, b) {
        return sortableNumber(b) - sortableNumber(a);
    });

    printCourseInfo(cousesData);
}).catch(function(err) {
    // Surface fetch/parse failures instead of leaving an unhandled rejection.
    console.error('Failed to fetch or process course pages:', err);
});