【Node.js】爬虫--抓取新闻标题、图片、文字描述,支持QQ、iFeng

原创 2016年05月31日 15:59:36
文章目录
  1. app.js
  2. img-spider.js
  3. img.gallery.js
  4. ifengImgs.js
  5. ifengPictures.js
  6. qqImgs.js
  7. imgs.html

先上效果图:

图片上部分为待解析的网页新闻链接,支持一次输入多个。
图片下部分为解析的进度日志打印。

qq.img.spider02

点击“commit”按钮之后,对比效果图如下。左边为腾讯新闻原网页,右边为抓取后的整合效果。

qq.img.spider

工程结构:

文件名 | 描述
app.js | 程序启动
img-spider.js | 爬虫爬取管理
ifengImgs.js | 爬取iFeng下game/fashion的实现
ifengPictures.js | 爬取iFeng下game高清图的实现
qqImgs.js | 爬取腾讯新闻图的实现
img.gallery.js | 爬取图片的汇总
imgs.html | 提交爬取链接的html界面

应用到的知识点:

  • express:搭建Web服务
  • cheerio:类似jQuery的快速解析网页工具
  • iconv-lite:解决中文乱码问题
  • 正则表达式:网址匹配、内容匹配/过滤
  • Charles:抓包工具

更多细节看源码吧….

GitHub源码链接:Sodino#ImgSpider


app.js

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
var fs = require('fs');
var express = require('express');
var img_spider = require('./img-spider.js');
var app = express();
/**
 * GET /imgs.html — streams the url-submission form to the browser.
 *
 * The response header is only sent once the file has actually been opened, so
 * an unreadable ./imgs.html yields a 500 instead of an unhandled stream
 * 'error' event (which would terminate the process).
 */
app.get('/imgs.html', (req, resp) => {
    var file = fs.createReadStream('./imgs.html');
    file.on('error', (err) => {
        console.log(err);
        // If the failure happens mid-stream the 200 header is already out;
        // all that is left to do is close the response.
        if (!resp.headersSent) {
            resp.writeHead(500, {'Content-Type' : 'text/plain'});
        }
        resp.end();
    });
    file.on('open', () => {
        resp.writeHead(200, {'Content-Type' : 'text/html'});
        resp.write('<head><meta charset="utf-8"/></head>');
        file.pipe(resp);
    });
});
/**
 * POST /imgs.html — receives the submitted form, runs the image spider over
 * every url found in the `txtUrls` field and answers with one HTML page that
 * lists each gallery (title, then index / description / big image per entry).
 *
 * - The body is accumulated until 'end'; a single 'data' chunk is not
 *   guaranteed to hold the complete form.
 * - The form encoding is decoded with the core `querystring` module.
 * - Every piece of text that ends up in the page (scraped titles and
 *   descriptions, image urls, error messages that echo the submitted url) is
 *   HTML-escaped.
 * - The response is written at most once, even if the spider reports twice.
 */
app.post('/imgs.html', (req, resp) => {
    // Required locally so this handler does not depend on the file's import list.
    var querystring = require('querystring');
    var arrChunks = [];

    req.on('data', (chunk) => {
        arrChunks.push(chunk);
    });

    req.on('end', () => {
        var responded = false;
        // Writes the complete page; later calls are ignored.
        var sendPage = function (arrHtml) {
            if (responded) {
                return;
            }
            responded = true;
            resp.writeHead(200, {'Content-Type' : 'text/html'});
            resp.write('<head><meta charset="utf-8"/></head>');
            resp.write('<body>');
            arrHtml.forEach((strHtml) => {
                resp.write(strHtml);
            });
            resp.write('</body>');
            resp.end();
        };

        var txtUrls = querystring.parse(Buffer.concat(arrChunks).toString()).txtUrls;
        if (Array.isArray(txtUrls)) {
            // The field was submitted more than once: treat it as one list.
            txtUrls = txtUrls.join('\n');
        }
        var arrUrl = String(txtUrls || '').split(/\s+/).filter((strUrl) => strUrl.length > 0);
        if (arrUrl.length === 0) {
            sendPage([escapeHtml('No url was submitted.')]);
            return;
        }

        var imgSpider = new img_spider();
        try {
            imgSpider.spider(arrUrl, (err, arrImgGallery) => {
                if (err) {
                    sendPage([escapeHtml(err.toString())]);
                    return;
                }
                sendPage(renderGalleries(arrImgGallery));
            });
        } catch (err) {
            // spider() throws synchronously when its arguments are invalid.
            sendPage([escapeHtml(err.toString())]);
        }
    });
});

/** Escapes the five characters that are significant in HTML text and attributes. */
function escapeHtml(value) {
    return String(value)
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;')
        .replace(/'/g, '&#39;');
}

/** Turns the list of galleries into the HTML fragments of the result page, in output order. */
function renderGalleries(arrImgGallery) {
    var arrHtml = [];
    arrImgGallery.forEach((gallery) => {
        arrHtml.push('<p>============================================</p>');
        arrHtml.push('<p>' + escapeHtml(gallery.title) + '</p>');
        gallery.arrImgs.forEach((img, idx) => {
            arrHtml.push('<p>idx=' + idx + "</p>");
            arrHtml.push('<p>' + escapeHtml(img.desc) + '</p>');
            arrHtml.push('<p><img id="bigPic" src="' + escapeHtml(img.imgBig) + '" style="opacity: 1;"></img></p>');
            arrHtml.push('<p>------------------------</p>');
        });
    });
    return arrHtml;
}
// Start accepting connections on port 1024, then print where the form lives.
var startupMessage = 'server running on http://localhost:1024/imgs.html';
app.listen(1024);
console.log(startupMessage);

img-spider.js

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
var ifengImgs = require('./ifengImgs.js');
var ifengPictures = require('./ifengPictures.js');
var qqImgs = require('./qqImgs.js');
/**
 * Coordinates one batch of urls: picks the site-specific spider for each url
 * and gathers the galleries they produce.
 *
 * Fields:
 *   arrUrls       - urls for which a spider has been started
 *   arrImgGallery - galleries delivered so far
 *   callback      - function(err, arrImgGallery) of the current batch
 */
var ImgSpider = function(){
    this.arrUrls = [];
    this.arrImgGallery = [];
    this.callback = null;
};

/**
 * Starts a spider for every url in `arrUrl`.
 *
 * The whole list is validated before any spider is started, so `callback`
 * receives exactly one error (and no spider runs) when a url is unsupported
 * or when the list holds nothing but blank entries. Blank entries are
 * otherwise skipped.
 *
 * @param {Array} arrUrl - urls to crawl; entries are trimmed before use
 * @param {function(?Error, ?Array)} callback - stored on the instance and
 *        handed to runSpider through it
 * @throws {Error} when `arrUrl` is not an array or is empty
 */
ImgSpider.prototype.spider = function(arrUrl, callback){
    if (!Array.isArray(arrUrl)) {
        throw new Error("arrUrl isn't a array!");
    }
    if (arrUrl.length == 0) {
        throw new Error("arrUrl is empty.");
    }
    this.callback = callback;

    var reportError = function (err) {
        if (Object.prototype.toString.call(callback) === '[object Function]') {
            callback(err, null);
        }
    };

    var arrJobs = [];
    for (var i = 0; i < arrUrl.length; i++) {
        var url = String(arrUrl[i]).trim();
        if (url.length === 0) {
            continue;
        }
        var constructor = selectSpider(url);
        if (constructor === null) {
            reportError(new Error("Can't support this url:[" + url + ']'));
            return;
        }
        arrJobs.push({url: url, constructor: constructor});
    }
    if (arrJobs.length === 0) {
        reportError(new Error("arrUrl contains no url."));
        return;
    }

    var self = this;
    arrJobs.forEach(function (job) {
        runSpider(job.url, job.constructor, self);
    });
};

/** Resets the instance so that it can be used for another batch. */
ImgSpider.prototype.clean = function () {
    this.arrUrls = [];
    this.arrImgGallery = [];
    this.callback = null;
};

/**
 * Returns the spider constructor whose url pattern accepts `url`, or null.
 * ifengPictures is asked first: its pattern is the narrowest of the iFeng
 * ones, so a broader pattern cannot claim its urls.
 */
function selectSpider(url) {
    var arrCandidates = [ifengPictures, ifengImgs, qqImgs];
    for (var i = 0; i < arrCandidates.length; i++) {
        if (arrCandidates[i].prototype.RegExp.test(url)) {
            return arrCandidates[i];
        }
    }
    return null;
}
/**
 * Runs one site-specific spider for `url` on behalf of `imgSpider`.
 *
 * The url is registered in `imgSpider.arrUrls` first; each delivered gallery
 * is appended to `imgSpider.arrImgGallery`, and once both lists have the same
 * length `imgSpider.callback(null, arrImgGallery)` is invoked.
 *
 * A spider failure is logged and passed to `imgSpider.callback` as its first
 * argument. The callback is detached before that call so a batch reports at
 * most one failure and cannot report success afterwards.
 *
 * @param {string} url - page to crawl
 * @param {Function} constructor - spider class; instances expose spider(url, cb)
 * @param {Object} imgSpider - owner holding arrUrls, arrImgGallery and callback
 */
function runSpider(url, constructor, imgSpider) {
    imgSpider.arrUrls.push(url);
    // Must be a local: an undeclared assignment would create one global shared
    // by every running request (and is a ReferenceError in strict mode).
    var spider = new constructor();
    spider.spider(url, (err, imgGallery) => {
        if (err) {
            console.log('error');
            console.log(err);
            var onError = imgSpider.callback;
            imgSpider.callback = null;
            if (Object.prototype.toString.call(onError) === '[object Function]') {
                onError(err, null);
            }
            return;
        }
        console.log('Done:', imgGallery.url, imgGallery.title);
        imgSpider.arrImgGallery.push(imgGallery);
        if (imgSpider.arrImgGallery.length == imgSpider.arrUrls.length) {
            if (Object.prototype.toString.call(imgSpider.callback) === '[object Function]') {
                imgSpider.callback(null, imgSpider.arrImgGallery);
            }
        }
    });
}
module.exports = ImgSpider;

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
/**
 * One crawled page: its url, its title (filled in by the spider) and the
 * pictures found on it.
 */
function ImgGallery(url) {
    this.url = url;
    this.title = '';
    this.arrImgs = [];
}

/**
 * Appends a picture to the gallery.
 * Returns the number of pictures held after the insertion.
 */
ImgGallery.prototype.push = function(idx, imgBig, imgSmall, desc) {
    return this.arrImgs.push(new Img(idx, imgBig, imgSmall, desc));
};

/**
 * A single picture: big and small image urls, its description and its
 * position within the page.
 */
function Img(idx, imgBig, imgSmall, desc) {
    this.imgBig = imgBig;
    this.imgSmall = imgSmall;
    this.desc = desc;
    this.index = idx;
}
module.exports = ImgGallery;

ifengImgs.js

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
var url = 'http://games.ifeng.com/a/20160504/41603363_0.shtml';
var cheerio = require("cheerio");
var http = require("http");
var iconv = require('iconv-lite');
var img_gallery = require('./img.gallery.js');
/**
 * Spider for the picture list embedded in iFeng "games" / "fashion" article
 * pages (the `G_listdata` script variable).
 */
var SpiderIfengImgs = function() {
};

// Sample urls this spider accepts:
// http://games.ifeng.com/a/20160504/41603363_0.shtml
// http://fashion.ifeng.com/a/20160519/40162307_0.shtml#p=1
// Both sub-domains share one group and every literal dot is escaped; with an
// ungrouped `(games)|(fashion)...` any text containing "http://games" matches.
SpiderIfengImgs.prototype.RegExp = /http:\/\/(games|fashion)\.ifeng\.com\/a\/\d{8}\/\d+_\d+\.shtml/;

/**
 * Downloads `url` and converts its picture list into a gallery.
 *
 * @param {string} url - article page to crawl
 * @param {function(?Error, ?Object)} callback - called once: with
 *        (null, gallery) on success, or (error, null) when the request fails
 *        or the page cannot be parsed. Ignored when it is not a function.
 */
SpiderIfengImgs.prototype.spider = function(url, callback){
    var isDone = false;
    // Delivers the outcome at most once.
    var finish = function(err, imgGallery) {
        if (isDone) {
            return;
        }
        isDone = true;
        if (Object.prototype.toString.call(callback) === '[object Function]') {
            callback(err, imgGallery);
        }
    };

    http.get(url, function(res){
        var arrBuf = [];
        var bufLength = 0;
        res.on("data", function(chunk){
            arrBuf.push(chunk);
            bufLength += chunk.length;
        })
        .on("end", function(){
            var imgGallery = null;
            try {
                imgGallery = parseIfengImgsPage(url, Buffer.concat(arrBuf, bufLength));
            } catch (err) {
                finish(err, null);
                return;
            }
            // Outside the try block so an exception thrown by the callback
            // itself is not mistaken for a parse failure.
            finish(null, imgGallery);
        })
        .on("error", function(err){
            finish(err, null);
        });
    }).on("error", function(err){
        finish(err, null);
    });
};

/**
 * Builds the gallery from the raw page bytes.
 * Throws when the `G_listdata` literal is missing or is not valid JSON after
 * the key-quoting pass.
 */
function parseIfengImgsPage(url, chunkAll) {
    var imgGallery = new img_gallery(url);
    var html = iconv.decode(chunkAll,'utf-8');
    var $ = cheerio.load(html);
    imgGallery.title = $("title").text();

    var strStart = 'var G_listdata= ';
    var strEnd = '</script>';
    var idxStart = html.indexOf(strStart);
    if (idxStart < 0) {
        throw new Error("Can't find G_listdata in [" + url + ']');
    }
    var idxEnd = html.indexOf(strEnd, strStart.length + idxStart);
    if (idxEnd < 0) {
        throw new Error("Can't find the end of G_listdata in [" + url + ']');
    }

    // The page holds a JavaScript literal, not JSON: switch to double quotes
    // and quote the known keys before parsing.
    var jsListData = html.slice(idxStart + strStart.length, idxEnd)
        .replace(/'/g, "\"")
        .replace(/title/g, '\"title\"')
        .replace(/big_img/g, '\"big_img\"')
        .replace(/originalimg/g, '\"originalimg\"')
        .replace(/picwidth/g, '\"picwidth\"')
        .replace(/picheight/g, '\"picheight\"')
        .replace(/morelink/g, '\"morelink\"')
        .replace(/img:/g, '\"img\":')
        .replace('];', ']');

    JSON.parse(jsListData).forEach(function (element, index) {
        imgGallery.push(index, element.big_img, element.img, element.title);
    });
    return imgGallery;
}
module.exports = SpiderIfengImgs;

ifengPictures.js

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
var url = 'http://games.ifeng.com/picture/gaoqing/detail_2015_09/11/41081883_0.shtml';
var cheerio = require("cheerio");
var http = require("http");
var iconv = require('iconv-lite');
var img_gallery = require('./img.gallery.js');
/**
 * Spider for iFeng "games" high-resolution picture pages (the `_listdata[n]`
 * script entries).
 */
var SpiderIfengPictures = function(){
};

// Sample url: http://games.ifeng.com/picture/gaoqing/detail_2015_09/11/41081883_0.shtml
// Literal dots are escaped so they cannot stand for an arbitrary character.
SpiderIfengPictures.prototype.RegExp = /http:\/\/games\.ifeng\.com\/picture\/gaoqing\/detail_\d{4}_\d{2}\/\d{2}\/\d+_\d+\.shtml/;

/**
 * Downloads `strUrl` and converts its picture list into a gallery.
 *
 * @param {string} strUrl - picture page to crawl
 * @param {function(?Error, ?Object)} callback - called once: with
 *        (null, gallery) on success, or (error, null) when the request fails
 *        or the page cannot be parsed. Ignored when it is not a function.
 */
SpiderIfengPictures.prototype.spider = function (strUrl, callback) {
    var isDone = false;
    // Delivers the outcome at most once.
    var finish = function(err, imgGallery) {
        if (isDone) {
            return;
        }
        isDone = true;
        if (Object.prototype.toString.call(callback) === '[object Function]') {
            callback(err, imgGallery);
        }
    };

    http.get(strUrl, function(res){
        var arrBuf = [];
        var bufLength = 0;
        res.on("data", function(chunk){
            arrBuf.push(chunk);
            bufLength += chunk.length;
        })
        .on("end", function(){
            var imgGallery = null;
            try {
                imgGallery = parseIfengPicturesPage(strUrl, Buffer.concat(arrBuf, bufLength));
            } catch (err) {
                finish(err, null);
                return;
            }
            // Outside the try block so an exception thrown by the callback
            // itself is not mistaken for a parse failure.
            finish(null, imgGallery);
        })
        .on("error", function(err){
            finish(err, null);
        });
    }).on("error", function(err){
        finish(err, null);
    });
};

/**
 * Builds the gallery from the raw page bytes.
 * Throws when the `_listdata` entries are missing or are not valid JSON after
 * the key-quoting pass.
 */
function parseIfengPicturesPage(strUrl, chunkAll) {
    var imgGallery = new img_gallery(strUrl);
    var html = iconv.decode(chunkAll,'utf-8');
    var $ = cheerio.load(html);
    imgGallery.title = $("title").text();

    var strStart = '_listdata[0] = ';
    var strEnd = 'new ifeng.Gallery';
    var idxStart = html.indexOf(strStart);
    if (idxStart < 0) {
        throw new Error("Can't find _listdata in [" + strUrl + ']');
    }
    var idxEnd = html.indexOf(strEnd, strStart.length + idxStart);
    if (idxEnd < 0) {
        throw new Error("Can't find the end of _listdata in [" + strUrl + ']');
    }

    // The page holds a run of `_listdata[n] = {...};` statements: quote the
    // known keys, turn the statement separators into commas and wrap the
    // result in brackets so it parses as one JSON array.
    var jsListData = html.slice(idxStart + strStart.length, idxEnd)
        .replace(/'/g, "\"")
        .replace(/title/g, '\"title\"')
        .replace(/morelink/g, '\"morelink\"')
        .replace(/picwidth/g, '\"picwidth\"')
        .replace(/picheight/g, '\"picheight\"')
        .replace(/listimg/g, '\"listimg\"')
        .replace(/timg:/g, '\"timg\":')
        .replace(/img:/g, '\"img\":')
        .replace(/\};_listdata\[\d*\] = /g, '},')
        .replace('};', '}');

    JSON.parse('[' + jsListData + ']').forEach(function (element, index) {
        imgGallery.push(index, element.timg, element.img, element.title);
    });
    return imgGallery;
}
module.exports = SpiderIfengPictures;

qqImgs.js

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
var img_gallery = require('./img.gallery.js');
var http = require("http");
var iconv = require('iconv-lite');
var cheerio = require("cheerio");
var url = 'http://news.qq.com/a/20160512/009639.htm';
var url = 'http://news.qq.com/a/20160512/009639.hdBigPic.js';
/**
 * Spider for Tencent (news.qq.com) picture articles. The title comes from the
 * article page and the pictures from the sibling `.hdBigPic.js` resource; the
 * two requests run side by side and the result is delivered when both are in.
 *
 * Fields:
 *   title      - page title, null until the article page has been read
 *   imgGallery - gallery built from the picture resource, null until read
 *   callback   - function(err, gallery) of the crawl in progress
 */
var SpiderQQImgs = function() {
    this.title = null;
    this.imgGallery = null;
    this.callback = null;
};

// Sample url: http://news.qq.com/a/20160512/009639.htm
// Literal dots are escaped so they cannot stand for an arbitrary character.
SpiderQQImgs.prototype.RegExp = /http:\/\/news\.qq\.com\/a\/\d{8}\/\d+\.htm/;

/**
 * Delivers the gallery once both the title and the pictures are available.
 * "Available" for the title means "is a string": a page with an empty title
 * must still complete. The callback is detached before it is invoked, so it
 * runs at most once per crawl.
 */
SpiderQQImgs.prototype.send2callback = function() {
    if (typeof this.title !== 'string' || this.imgGallery == null) {
        return;
    }
    if (Object.prototype.toString.call(this.callback) !== '[object Function]') {
        return;
    }
    var deliver = this.callback;
    this.callback = null;
    this.imgGallery.title = this.title;
    deliver(null, this.imgGallery);
};

/**
 * Starts the crawl of one article.
 *
 * @param {string} url - article url (…/NNN.htm)
 * @param {function(?Error, ?Object)} callback - receives (null, gallery) on
 *        success or (error, null) on the first request/parse failure
 */
SpiderQQImgs.prototype.spider = function (url, callback) {
    // Forget the outcome of any earlier crawl made with this instance.
    this.title = null;
    this.imgGallery = null;
    this.callback = callback;
    this.spiderTitle(url);
    var strGalleryUrl = url.replace('.htm', '.hdBigPic.js');
    this.spiderImgGallery(strGalleryUrl);
};

/** Reads the article page at `url` and stores its <title> text. */
SpiderQQImgs.prototype.spiderTitle = function (url) {
    var spider = this;
    http.get(url, function(res){
        var arrBuf = [];
        var bufLength = 0;
        res.on("data", function(chunk){
            arrBuf.push(chunk);
            bufLength += chunk.length;
        })
        .on("end", function(){
            var strTitle = null;
            try {
                var html = iconv.decode(Buffer.concat(arrBuf, bufLength),'gb2312');
                var $ = cheerio.load(html);
                strTitle = $("title").text();
            } catch (err) {
                failQQSpider(spider, err);
                return;
            }
            spider.title = strTitle;
            spider.send2callback();
        })
        .on("error", function(err){
            failQQSpider(spider, err);
        });
    }).on("error", function(err){
        failQQSpider(spider, err);
    });
};

/** Reads the `.hdBigPic.js` resource at `url` and stores the gallery built from it. */
SpiderQQImgs.prototype.spiderImgGallery = function (url) {
    var spider = this;
    http.get(url, function(res){
        var arrBuf = [];
        var bufLength = 0;
        res.on("data", function(chunk){
            arrBuf.push(chunk);
            bufLength += chunk.length;
        })
        .on("end", function(){
            var imgGallery = null;
            try {
                imgGallery = parseQQImgGallery(url, Buffer.concat(arrBuf, bufLength));
            } catch (err) {
                failQQSpider(spider, err);
                return;
            }
            spider.imgGallery = imgGallery;
            spider.send2callback();
        })
        .on("error", function(err){
            failQQSpider(spider, err);
        });
    }).on("error", function(err){
        failQQSpider(spider, err);
    });
};

/**
 * Reports `err` through the spider's callback. The callback is detached first,
 * so a crawl reports at most one outcome.
 */
function failQQSpider(spider, err) {
    var deliver = spider.callback;
    spider.callback = null;
    if (Object.prototype.toString.call(deliver) === '[object Function]') {
        deliver(err, null);
    }
}

/**
 * Builds the gallery from the raw bytes of the picture resource.
 * Throws when the payload is not valid JSON or lacks the expected nesting.
 */
function parseQQImgGallery(url, chunkAll) {
    var imgGallery = new img_gallery(url);
    var strJson = iconv.decode(chunkAll,'gb2312')   // gb2312 keeps the Chinese text readable
        .replace(/\/\*[\s\S]+?\*\//,'')             // drop the leading comment
        .replace(/\'/g, '"');                       // JSON.parse needs double quotes
    var objJson = JSON.parse(strJson);
    deleteEmptyProperty(objJson);

    var arrNodes = objJson.Children[0].Children;
    arrNodes.shift();                               // first node is skipped; the picture list follows it
    var arrImgs = arrNodes.shift().Children;
    arrImgs.forEach(function (element, index) {
        // Positions 1, 2 and 3 hold small url, big url and description.
        var arrFields = element.Children;
        var smallUrl = arrFields[1].Children[0].Content;
        var bigUrl = arrFields[2].Children[0].Content;
        var strText = arrFields[3].Children[0].Content;
        imgGallery.push(index, bigUrl, smallUrl, strText);
    });
    return imgGallery;
}
/**
 * Recursively strips "empty" members from `object`, in place: empty strings,
 * undefined, null, empty arrays, and objects/arrays that end up with no
 * enumerable member after their own clean-up. Members of arrays are removed
 * with `delete`, so the remaining elements keep their indices.
 */
function deleteEmptyProperty(object){
    for (var key in object) {
        var current = object[key];

        if (typeof current !== 'object') {
            // Non-object member: only '' and undefined count as empty.
            if (current === '' || current === undefined) {
                delete object[key];
            }
            continue;
        }

        // Object-typed member (includes arrays and null).
        var isBlankArray = Array.isArray(current) && current.length == 0;
        if (!isBlankArray) {
            deleteEmptyProperty(current);
        }
        if (isBlankArray || isEmpty(current)) {
            delete object[key];
        }
    }
}

/**
 * Tells whether `object` exposes no enumerable member at all.
 */
function isEmpty(object) {
    var hasMember = false;
    for (var name in object) {
        hasMember = true;
        break;
    }
    return !hasMember;
}
module.exports = SpiderQQImgs;

imgs.html

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
<!DOCTYPE html><html lang="zh-CN">
<head>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type">
<title>Images spider</title>
</head>
<body>
<!-- Submission form: posts the whitespace-separated urls typed into txtUrls back to imgs.html. -->
<form id="form1" action="imgs.html" method="POST">
Please input urls:<br/>
<textarea name="txtUrls" style="width:500px;height:120px;">http://news.qq.com/a/20160531/018019.htm#p=1
http://games.ifeng.com/a/20160530/41615842_0.shtml#p=1
</textarea><br/>
<br/>
<input type="submit" value="commit"/><br/>
<br/>
</form>
</body>
</html>

About Sodino

vfp9.0通过WebBrowser控件获取QQ新闻标题及链接

*!*       by:十豆三 *!*  vfp版本:vfp9.0 *!* 控件名称: Microsoft Web Browser(Microsoft Internet Controls) *...

一个爬去搜狐新闻标题的简单爬虫。

先看简短代码: #!user/bin/python # coding: utf-8 import urllib2 import re #1. def get_html(url): req ...

JS 新闻标题单行向上滚动效果

var marqueeContent=new Array(); //滚动新闻 marqueeContent[0]='14:25 小泉称若自民党在议会选举中失败'; marqueeContent[1...

python3爬虫-爬取新浪新闻首页所有新闻标题

准备工作:安装requests和BeautifulSoup4。打开cmd,输入如下命令pip install requests pip install BeautifulSoup4打开我们要爬取的页面...

4张图片焦点图,带文字描述

  • 2014年02月20日 10:56
  • 257KB
  • 下载

Generative Adversarial Text to Image Synthesis --- 根据文字描述生成对应的图片

根据文字描述生成对应的图片

点击某个按钮切换图片、文字描述(按钮颜色跟着改变)(简化代码)

html: 数量正在加载... 文字正在加载... css:body {background-color: #333;} ul {padding: 0;margin...

android发布图片加文字描述

android发布图片加文字描述        一个应用总是要上传点图片的,所以自己写个总结的小demo,后台的话没钱买服务器,就用bmob存图片好了,以后再换后台。其实写这个小demo的时候也遇到...
内容举报
返回顶部
收藏助手
不良信息举报
您举报文章:【Node.js】爬虫--抓取新闻标题、图片、文字描述,支持QQ、iFeng
举报原因:
原因补充:

(最多只允许输入30个字)