基于 Node.js 的网页抓取

从无到有耗时2天,蓝馊!香菇!

// Download the page at `url`, decode it as GBK, clean the markup with replace(),
// pull out the inner HTML of the element with id "content_zjlxtable" and save
// it to c.html.
var chunks = []; // raw response Buffers; decoded only once the body is complete
var tmp = '';
var count = 0;
var url = 'http://data.eastmoney.com/zjlx/600050.html';
var http = require('http');
var iconv = require('iconv-lite');
var req = http.request(url, function(res){
    // Collect the raw bytes of the response body.
    res.on('data',function(data){
        count++;
        chunks.push(data);
        console.log('Count\t'+count);
    });
    res.on('end', function() {
        // Decode the whole body in one go: GBK is a multi-byte encoding, so
        // decoding chunk by chunk corrupts any character whose bytes are
        // split across two network chunks.
        tmp = iconv.decode(Buffer.concat(chunks), 'GBK');
        if(tmp){
            //writeFile('a.txt',tmp);
            tmp = replace(tmp);
            //writeFile('b.txt',tmp);
            var cheerio = require('cheerio'),
            $ = cheerio.load(tmp);
            tmp = $('#content_zjlxtable').html();
            if(tmp === null){
                // Nothing matched the selector; do not hand null to fs.writeFile.
                console.log('Element #content_zjlxtable not found');
                return;
            }
            writeFile('c.html',tmp);
            //tmp = TableToCsv(tmp);
            //writeFile('d.csv',tmp);
        }else{
            console.log('Empty');
        }
    });
});
req.on('error', function(e) {
    console.log('problem with request: ' + e.message);
});
req.end();
// Write `str` to the file `file` asynchronously. An error reported to the
// completion callback is logged to the console.
function writeFile(file,str){
    var onDone = function(error){
        if(error){
            console.log("fail " + error);
        }
    };
    require('fs').writeFile(file, str, onDone);
}
/**
 * Normalise raw page markup before it is parsed.
 *
 * Steps, in order:
 *   1. turn every single quote into a double quote,
 *   2. remove all line breaks (CRLF, lone CR and lone LF),
 *   3. remove HTML comments (<!-- ... -->),
 *   4. remove block comments,
 *   5. remove runs of spaces that directly precede a '<'.
 *
 * @param {string} source - markup to clean
 * @returns {string} the cleaned markup
 */
function replace(source){
    var rep0 = /'/g;
    // Line breaks must go before the comment patterns run: '.' does not match
    // a newline, so a comment spanning several lines would otherwise survive.
    // Windows uses \r\n, Unix uses \n, and a lone \r is handled as well.
    var rep1 = /\r\n|\r|\n/g;
    var rep2 = /<!--.*?-->/ig;
    var rep3 = /\/\*.*?\*\//ig;
    var rep4 = /[ ]+</ig;
    var source0 = source.replace(rep0,'\"');
    var source1 = source0.replace(rep1,'');
    var source2 = source1.replace(rep2,'');
    var source3 = source2.replace(rep3,'');
    var source4 = source3.replace(rep4,'<');
    return source4;
}
/**
 * Convert HTML table markup into comma-separated text.
 *
 * All whitespace is removed first; each closing </th> and </td> becomes a
 * comma, each closing </tr> becomes "\r\n", and every remaining tag is
 * dropped. Each row therefore ends with a trailing comma before the line
 * break.
 *
 * @param {string} source - HTML table markup
 * @returns {string} the comma-separated text
 */
function TableToCsv(source){
    // Ordered substitution steps: [pattern, replacement].
    var steps = [
        [/\s/g, ''],
        [/<\/th>/g, ','],
        [/<\/td>/g, ','],
        [/<\/tr>/g, '\r\n'],
        [/<[^>]+>/g, ''],
        // Cannot match here: every space was already removed by the first step.
        [/ /g, '']
    ];
    return steps.reduce(function(text, step){
        return text.replace(step[0], step[1]);
    }, source);
}

这里写图片描述

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值