"use strict";
var request = require('request');
// request: HTTP client library used for GET/POST requests
var cheerio = require('cheerio');
// cheerio: jQuery-like library for querying HTML strings
var iconv = require('iconv-lite');
// iconv-lite: character-encoding conversion library
/*
Install the dependencies:
npm install request cheerio iconv-lite --save
This crawler scrapes news entries from the Sina gold-news listing page.
*/
// Sina gold-news listing page to scrape.
const url = 'http://roll.finance.sina.com.cn/finance/gjs/hjzx/index_1.shtml';

// Extra request headers. None are sent by default; add browser-like headers
// (for example "User-Agent", "Accept-Language" or "cookie") here if the
// site stops answering the default client.
const header = {};

// Options handed to `request`.
const option = {
    url,
    method: 'GET',
    headers: header,
    // `json: true` is intentionally NOT set: the target is an HTML page, and
    // that flag would advertise a JSON Accept header and try to JSON-parse
    // the response body.
    //
    // request decodes the body as UTF-8 by default. The page is served as
    // GB2312, so ask for the raw bytes (a Buffer) and decode them manually.
    encoding: null,
};
// Fetch the listing page described by `option`, decode it from GB2312 and
// print the page title followed by the scraped news entries.
//
// Each entry is `{ url, title }`, taken from the first link inside an <li>.
// On a transport error, a non-200 status, or a body that is not a raw Buffer,
// the problem is logged and nothing is parsed.
request(option, function (error, response, body) {
    if (error) {
        console.log('error');
        console.log(error);
        // There is no body to decode after a transport error.
        return;
    }

    const status = response ? response.statusCode : undefined;
    if (status !== 200 || !Buffer.isBuffer(body)) {
        // iconv.decode needs the raw bytes; anything else means the request
        // did not return the page we expect.
        console.log('error');
        console.log('unexpected response, status: ' + status);
        return;
    }

    // Decode the raw bytes as GB2312 (iconv.decode already returns a string).
    const html = iconv.decode(body, 'gb2312');
    const $ = cheerio.load(html);

    console.log('页面title: ' + $('title').text());
    console.log('新闻列表数据:');

    const result = [];
    // The first two <li> elements are skipped, as in the original loop;
    // presumably they are not news items -- TODO confirm against the page markup.
    $('li').slice(2).each(function (position, item) {
        const link = $(item).find('a').first();
        if (link.length === 0) {
            // An <li> without a link carries no news entry; skip it instead
            // of crashing on a missing node.
            return;
        }
        result.push({
            url: link.attr('href'),
            title: link.text(),
        });
    });

    console.log(result);
    // The data is now available and could be stored in a database from here.
});