前端学习笔记:Node.js 爬虫

一、代码

// Modules used by the crawler: HTTPS client, HTML parser, MySQL driver,
// date formatting and URL/query-string helpers.
var https = require('https');
var cheerio = require('cheerio');
var mysql  = require('mysql');
// NOTE(review): `table` is not referenced by the visible code; the insert
// statement in filterJobItem() hard-codes the table name instead.
var table = "job";
var moment = require('moment');
var today = moment();
// Year prefix (e.g. "2017年") that filterJobItem() prepends to month-day strings.
// It is read here, before `today` is shifted by the subtract() call below.
var year = today.format('YYYY年');
// moment's subtract() changes `today` in place, so after this line `today`
// itself holds yesterday's date.
var yesterday = today.subtract(1, 'days').format('YYYY年MM月DD日');
var urlTool = require("url");
var qs = require('querystring');

// MySQL connection settings passed to mysql.createConnection() by handleDisconnect().
// NOTE(review): the credentials are hard-coded; consider reading them from the
// environment before using this outside a local experiment.
var db_config = {
    host: '127.0.0.1',    
    user: 'root',
    password:'123456',
    port:'3306',
    database:'node'
};
// Current MySQL connection; (re)assigned every time handleDisconnect() runs.
var connection;
/**
 * Opens a MySQL connection from `db_config`, stores it in the module-level
 * `connection` variable and keeps it alive:
 *  - a failed connect is retried after two seconds;
 *  - a later 'PROTOCOL_CONNECTION_LOST' error reconnects immediately;
 *  - any other connection error is rethrown.
 */
function handleDisconnect() {
    var client = mysql.createConnection(db_config);
    connection = client;

    // Connect callback: nothing to do on success, schedule a retry on failure.
    function onConnect(err) {
        if (!err) {
            return;
        }
        console.log('进行断线重连:' + new Date());
        setTimeout(handleDisconnect, 2000);
    }

    // Runtime error callback: only a lost connection is recoverable.
    function onError(err) {
        console.log('db error', err);
        if (err.code !== 'PROTOCOL_CONNECTION_LOST') {
            throw err;
        }
        handleDisconnect();
    }

    client.connect(onConnect);
    client.on('error', onError);
}

/**
 * Parses one job-listing page and inserts every job found on it into the
 * `job` table through the module-level MySQL `connection`.
 *
 * For each `<li>` under `.job-list ul` it reads the detail link (id, title,
 * salary), the primary info paragraph (city, work years, education), the
 * company block (name, industry, financing, size) and the publish time.
 * List items that lack the detail link or one of the info paragraphs are
 * skipped instead of aborting the whole page with a TypeError.
 *
 * @param {string} html Raw HTML of a listing page.
 * @returns {Array<Array>} One parameter array per job handed to the insert
 *     statement, in the column order of that statement.
 */
function filterJobItem(html){
	var $ = cheerio.load(html, {decodeEntities: false});
	// Markup that separates the fields inside the two info paragraphs.
	var separator = '<em class="vline"></em>';
	var addJob = 'insert into job(titleId, title, salary, city, workYear, education, companyName, industry, financing, companySize,date) values(?,?,?,?,?,?,?,?,?,?,?)';
	var jobData = [];

	// Connect once and reuse the connection for every row. Calling
	// handleDisconnect() per list item opened (and never closed) a new
	// MySQL connection for every single job.
	if(!connection){
		handleDisconnect();
	}

	$('.job-list').find('ul').children('li').each(function(){
		var item = $(this);
		var primaryInfo = item.find('.job-primary').find('.info-primary');
		var companyInfo = item.find('.job-primary').find('.info-company');
		var link = primaryInfo.find('.name').find('a');
		var href = link.attr('href');
		var msg = primaryInfo.find('p').html();
		var companyMsg = companyInfo.find('p').html();
		// The id is the file name (without extension) in the third path segment of the link.
		var idSegment = typeof href === 'string' ? href.split('/')[2] : undefined;

		// Not a regular job entry (missing link or info paragraph): skip it.
		if(!idSegment || typeof msg !== 'string' || typeof companyMsg !== 'string'){
			return;
		}

		var id = idSegment.split('.')[0];
		var titleAndSalarys = link.text().split(' ');
		var msgs = msg.split(separator);
		var companyShortName = companyInfo.find('.company-text').find('.name').find('a').text();
		var companyMsgs = companyMsg.split(separator);
		// Drop the first three characters of the time label (presumably a
		// "published on" prefix); the rest is either "昨天" or a month-day string.
		var date = item.find('.job-time').text().slice(3);
		date = ('昨天' === date) ? yesterday : year + date;

		// Declared locally; the original leaked `param` as an implicit global.
		var param = [id,titleAndSalarys[0],titleAndSalarys[1],msgs[0],msgs[1],msgs[2],companyShortName,companyMsgs[0],companyMsgs[1],companyMsgs[2],date];
		jobData.push(param);
		connection.query(addJob, param, function(error, result){
			if(error){
				console.log(error.message);
			}else{
				// The mysql driver reports the generated key as `insertId`.
				console.log('insert id: ' + result.insertId);
			}
		});
	});
	return jobData;
}

// Base URL of the listing page; robot() appends a query string to it.
// NOTE(review): never assigned a value in this file — set it (ending where a
// query string can be appended) before robot() runs.
var url;

/**
 * Downloads one listing page over HTTPS and passes the complete HTML to
 * filterJobItem() once the response has ended.
 *
 * @param {string} url Absolute https:// URL of the page to fetch.
 */
function httpGet(url){
	https.get(url,function(res){
		var html = '';
		// Decode as UTF-8 at the stream level. Otherwise each chunk arrives as
		// a Buffer and is stringified on its own, which corrupts any multi-byte
		// character (e.g. Chinese text) that straddles two chunks.
		res.setEncoding('utf8');
		res.on('data',function(chunk){
			html += chunk;
		});
		res.on('end',function(){
			filterJobItem(html);
		});
	}).on('error', function(err){
		// Same message as before, plus the cause so failures can be diagnosed.
		console.log('出错了!', err && err.message);
	});
}

// Page numbers to crawl; fill this in before running, e.g. [1, 2, 3].
var pages = [];

/**
 * Requests every listing page whose number appears in `pages`.
 *
 * Each page URL is built from the unchanged base `url` plus that page's own
 * query string. The base is never reassigned, so the query strings of
 * earlier pages no longer pile up in the URLs of later requests.
 */
function robot(){
	pages.forEach(function(page){
		var pageUrl = url + qs.stringify({ query: 'Java', page: page, ka: 'page-1' },'&');
		httpGet(pageUrl);
	});
}

robot();

二、操作

1.1、引入依赖

var https = require('https');// use 'http' instead if the site you crawl is served over plain HTTP
var cheerio = require('cheerio');// jQuery-like API for querying the fetched HTML
var mysql  = require('mysql');
var moment = require('moment');// date formatting
var urlTool = require("url");
var qs = require('querystring');// url and querystring are the two Node.js helpers for working with URLs

1.2、依赖的简单使用

1.2.1  moment

var today = moment();
var year = today.format('YYYY年');// e.g. 2017年
var yesterday = today.subtract(1, 'days').format('YYYY年MM月DD日');// yesterday's date, formatted like 2017年11月24日

1.2.2 url和querystring

// Take the query-string part of a URL.
var getQuery = urlTool.parse(url).query;
// Parse the query string into an object.
var getData = qs.parse(getQuery);// e.g. { query: 'Java', page: '1', ka: 'page-1' }
// Serialize an object back into a query string, joining pairs with '&'.
getData = qs.stringify({ query: 'Java', page: '4', ka: 'page-1' },'&');// query=Java&page=4&ka=page-1

1.2.3 mysql

// Reconnecting to MySQL from Node.js after the connection drops.
// Connection settings passed to mysql.createConnection().
var db_config = {
    host: '127.0.0.1',    
    user: 'root',
    password:'123456',
    port:'3306',
    database:'node'
};
// Holds the current connection; reassigned on every (re)connect.
var connection;
/**
 * Opens a MySQL connection from `db_config`, stores it in the module-level
 * `connection` variable and keeps it alive:
 *  - a failed connect is retried after two seconds;
 *  - a later 'PROTOCOL_CONNECTION_LOST' error reconnects immediately;
 *  - any other connection error is rethrown.
 */
function handleDisconnect() {
    var client = mysql.createConnection(db_config);
    connection = client;

    // Connect callback: nothing to do on success, schedule a retry on failure.
    function onConnect(err) {
        if (!err) {
            return;
        }
        console.log('进行断线重连:' + new Date());
        setTimeout(handleDisconnect, 2000);
    }

    // Runtime error callback: only a lost connection is recoverable.
    function onError(err) {
        console.log('db error', err);
        if (err.code !== 'PROTOCOL_CONNECTION_LOST') {
            throw err;
        }
        handleDisconnect();
    }

    client.connect(onConnect);
    client.on('error', onError);
}

1.2.4  cheerio(用法类似jquery)

// Wrap the current element so the jQuery-style API can be used on it.
item = $(this);
// The job id is the third '/'-separated segment of the link's href, without its file extension.
id = primaryInfo.find('.name').find('a').attr('href').split('/')[2].split('.')[0];
// Text of the same link; filterJobItem() splits it on a space into title and salary.
titleAndSalary = primaryInfo.find('.name').find('a').text();

三、爬虫的使用

将要爬取的页码以数组的形式填入 pages(例如 var pages = [1, 2, 3];),然后在命令行执行 node server.js 即可。

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值