先上效果图:

图片上部分为待解析的网页新闻链接,支持一次输入多个。
图片下部分为解析的进度日志打印。

qq.img.spider02

点击“commit”按钮之后,对比效果图如下。左边为腾讯新闻原网页,右边为抓取后的整合效果。

qq.img.spider

工程结构:

文件名 描述
app.js 程序启动
img-spider.js 爬虫爬取管理
ifengImgs.js 爬取iFeng下game/fashion的实现
ifengPictures.js 爬取iFeng下game高清图的实现
qqImgs.js 爬取腾讯新闻图的实现
img.gallery.js 爬取图片的汇总
imgs.html 提交爬取链接的html界面

应用到的知识点:

  • express:搭建Web服务
  • cheerio:类似jQuery的快速解析网页工具
  • iconv-lite:解决中文乱码问题
  • 正则表达式:网址匹配、内容匹配/过滤
  • Charles:抓包工具

更多细节看源码吧….

GitHub源码链接:Sodino#ImgSpider


app.js

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
var fs = require('fs');
var express = require('express');
var img_spider = require('./img-spider.js');
var app = express();


/**
 * GET /imgs.html - serves the URL submission form.
 *
 * A small <head> fragment declaring UTF-8 is written first, then the content
 * of ./imgs.html is streamed into the response.
 *
 * @param req  incoming request (unused)
 * @param resp response the form is written to
 */
app.get('/imgs.html', (req, resp) => {
  const file = fs.createReadStream('./imgs.html');

  // A stream without an 'error' listener raises an unhandled 'error' event
  // (for example when imgs.html is missing), which would stop the whole server.
  file.on('error', (err) => {
    console.log(err);
    if (!resp.headersSent) {
      resp.writeHead(500, {'Content-Type' : 'text/html'});
    }
    resp.end();
  });

  // Only send the 200 header once the file could actually be opened.
  file.on('open', () => {
    resp.writeHead(200, {'Content-Type' : 'text/html'});
    resp.write('<head><meta charset="utf-8"/></head>');
    file.pipe(resp);
  });
});

/**
 * Escapes the characters that are significant in HTML, so that text scraped
 * from remote pages (or echoed back from the request) cannot inject markup.
 *
 * @param text any value; it is converted with String() first
 * @returns {string} the escaped text
 */
function escapeHtml(text) {
  return String(text)
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;');
}

/**
 * POST /imgs.html - reads the form field `txtUrls`, crawls every URL in it
 * and renders the collected galleries (title, description and big image of
 * every picture) as one HTML page. Failures are rendered as the error text.
 *
 * @param req  incoming request carrying the url-encoded form body
 * @param resp response the result page is written to
 */
app.post('/imgs.html', (req, resp) => {
  const arrChunk = [];

  // The body may arrive in several chunks; collect them and start working
  // only once the request is complete.
  req.on('data', (chunk) => {
    arrChunk.push(chunk);
  });

  req.on('end', () => {
    let responded = false;

    // Writes the page frame around `writeBody`. The flag guarantees that the
    // response is produced once, even if the spider reports more than once.
    const sendPage = (writeBody) => {
      if (responded) {
        return;
      }
      responded = true;
      resp.writeHead(200, {'Content-Type' : 'text/html'});
      resp.write('<head><meta charset="utf-8"/></head>');
      resp.write('<body>');
      writeBody();
      resp.write('</body>');
      resp.end();
    };

    const sendError = (err) => {
      sendPage(() => {
        resp.write(escapeHtml(err.toString()));
      });
    };

    // querystring decodes percent-escapes as UTF-8 and '+' as a space, which
    // the deprecated unescape() does not do.
    const querystring = require('querystring');
    const fields = querystring.parse(Buffer.concat(arrChunk).toString());
    const content = [].concat(fields.txtUrls || '').join('\n');

    const arrUrl = content.split(/\s+/).filter((url) => url.length > 0);
    if (arrUrl.length == 0) {
      sendError(new Error('No url submitted.'));
      return;
    }

    const imgSpider = new img_spider();
    try {
      imgSpider.spider(arrUrl, (err, arrImgGallery) => {
        if (err) {
          sendError(err);
          return;
        }
        sendPage(() => {
          arrImgGallery.forEach((gallery) => {
            resp.write('<p>============================================</p>');
            resp.write('<p>' + escapeHtml(gallery.title) + '</p>');
            gallery.arrImgs.forEach((img, idx) => {
              resp.write('<p>idx=' + idx + "</p>");
              resp.write('<p>' + escapeHtml(img.desc) + '</p>');
              // e.g. <img id="bigPic" src="http://img1.gtimg.com/16/1615/161596/16159645_980x1200_0.jpg" style="opacity: 1;">
              resp.write('<p><img id="bigPic" src="' + escapeHtml(img.imgBig) + '" style="opacity: 1;"></img></p>');
              resp.write('<p>------------------------</p>');
            });
          });
        });
      });
    } catch (err) {
      // spider() throws synchronously on invalid arguments.
      sendError(err);
    }
  });
});



// Start accepting HTTP connections on port 1024 and print the address of the
// submission form.
var listenPort = 1024;
app.listen(listenPort);
console.log('server running on http://localhost:1024/imgs.html');

img-spider.js

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
var ifengImgs = require('./ifengImgs.js');
var ifengPictures = require('./ifengPictures.js');
var qqImgs = require('./qqImgs.js');


/**
 * Coordinates one crawl over a list of gallery URLs.
 *
 * Fields:
 *   arrUrls       - URLs handed to a site spider (filled by runSpider)
 *   arrImgGallery - galleries collected so far (filled by runSpider)
 *   callback      - function(err, arrImgGallery) given to spider()
 */
var ImgSpider = function(){
  this.arrUrls = [];
  this.arrImgGallery = [];
  this.callback = null;
};

/**
 * Starts crawling every URL of `arrUrl` with the site spider whose pattern
 * matches it. Blank entries are ignored.
 *
 * All entries are validated before any crawl is started, so that the callback
 * receives at most one error from this method: if an entry matches no site
 * spider, or if no non-blank entry exists, `callback(err, null)` is called
 * once and nothing is crawled.
 *
 * @param {Array} arrUrl      URLs to crawl
 * @param {Function} callback function(err, arrImgGallery)
 * @throws {Error} when arrUrl is not an array or has no elements
 */
ImgSpider.prototype.spider = function(arrUrl, callback){
  if (!Array.isArray(arrUrl)) {
    throw new Error("arrUrl isn't a array!");
  }
  if (arrUrl.length == 0) {
    throw new Error("arrUrl is empty.");
  }
  this.callback = callback;

  var report = function(err) {
    if (typeof callback === 'function') {
      callback(err, null);
    }
  };

  // Order matters: the first constructor whose pattern matches is used.
  var arrConstructor = [ifengImgs, ifengPictures, qqImgs];
  var arrJob = [];

  for (var i = 0; i < arrUrl.length; i++) {
    var url = String(arrUrl[i]).trim();
    if (url.length == 0) {
      continue;
    }

    var matched = null;
    for (var j = 0; j < arrConstructor.length && matched === null; j++) {
      if (arrConstructor[j].prototype.RegExp.test(url)) {
        matched = arrConstructor[j];
      }
    }

    if (matched === null) {
      report(new Error("Can't support this url:[" + url + ']'));
      return;
    }
    arrJob.push({url: url, constructor: matched});
  }

  if (arrJob.length == 0) {
    // Only blank entries: without this the callback would never be called.
    report(new Error('arrUrl contains no url.'));
    return;
  }

  arrJob.forEach((job) => {
    runSpider(job.url, job.constructor, this);
  });
};

/**
 * Resets the instance to the state it had right after construction.
 */
ImgSpider.prototype.clean = function () {
  this.arrUrls = [];
  this.arrImgGallery = [];
  this.callback = null;
};

/**
 * Crawls one URL with a new instance of `constructor` and records the result
 * on `imgSpider`.
 *
 * The URL is appended to `imgSpider.arrUrls` immediately. When the site spider
 * succeeds, its gallery is appended to `imgSpider.arrImgGallery`; once as many
 * galleries as URLs have been collected, `imgSpider.callback(null, galleries)`
 * is called. When the site spider fails, the error is logged and forwarded as
 * `imgSpider.callback(err, null)`.
 *
 * `imgSpider.callback` is set to null before it is invoked, so it is called at
 * most once per crawl even if several URLs fail.
 *
 * @param {string} url             URL to crawl
 * @param {Function} constructor   site spider constructor; instances expose spider(url, cb)
 * @param {Object} imgSpider       owner holding arrUrls, arrImgGallery and callback
 */
function runSpider(url, constructor, imgSpider) {
  var finish = function(err, arrImgGallery) {
    var callback = imgSpider.callback;
    if (typeof callback !== 'function') {
      return;
    }
    imgSpider.callback = null;
    callback(err, arrImgGallery);
  };

  imgSpider.arrUrls.push(url);
  // Declared locally: assigning an undeclared `spider` creates a global and
  // is a ReferenceError in strict mode.
  var spider = new constructor();
  spider.spider(url, (err, imgGallery) => {
    if (err) {
      console.log('error');
      console.log(err);
      // Without forwarding the error the gallery count could never reach the
      // url count and the caller would wait forever.
      finish(err, null);
      return;
    }
    console.log('Done:', imgGallery.url, imgGallery.title);
    imgSpider.arrImgGallery.push(imgGallery);

    if (imgSpider.arrImgGallery.length == imgSpider.arrUrls.length) {
      finish(null, imgSpider.arrImgGallery);
    }
  });
}

module.exports = ImgSpider;

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
/**
 * One picture of a gallery: big and small image URLs, a description and the
 * position given by the caller.
 */
function Img(idx, imgBig, imgSmall, desc) {
  Object.assign(this, {
    imgBig: imgBig,
    imgSmall: imgSmall,
    desc: desc,
    index: idx
  });
}

/**
 * The pictures collected from one page.
 *
 * @param url address the gallery was read from
 */
function ImgGallery(url) {
  this.url = url;
  this.title = '';
  this.arrImgs = [];
}

/**
 * Appends a picture to the gallery.
 *
 * @returns {number} the number of pictures after the insertion
 */
ImgGallery.prototype.push = function(idx, imgBig, imgSmall, desc) {
  return this.arrImgs.push(new Img(idx, imgBig, imgSmall, desc));
};




module.exports = ImgGallery;

ifengImgs.js

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
// Sample games.ifeng.com article URL. SpiderIfengImgs.prototype.spider takes
// its own `url` parameter, which shadows this variable, so the code visible in
// this module does not read it.
var url = 'http://games.ifeng.com/a/20160504/41603363_0.shtml';

var cheerio = require("cheerio");
var http = require("http");
var iconv = require('iconv-lite');
var img_gallery = require('./img.gallery.js');

/**
 * Spider for picture articles on games.ifeng.com / fashion.ifeng.com.
 * Instances carry no state of their own.
 */
var SpiderIfengImgs = function() {

};

// Pattern of the article URLs this spider accepts, for example:
//   http://games.ifeng.com/a/20160504/41603363_0.shtml
//   http://fashion.ifeng.com/a/20160519/40162307_0.shtml#p=1
// The two host names must share one group: written as `(games)|(fashion)...`
// the alternation splits the whole pattern, and every string containing
// "http://games" matches. Dots are escaped so they only match a literal dot.
SpiderIfengImgs.prototype.RegExp = /http:\/\/(?:games|fashion)\.ifeng\.com\/a\/\d{8}\/\d+_\d+\.shtml/;

/**
 * Downloads an ifeng article page and builds a gallery from it.
 *
 * The page title becomes the gallery title. The picture list is taken from the
 * text between 'var G_listdata= ' and the next '</script>', rewritten into
 * JSON (single quotes to double quotes, known keys quoted) and parsed; every
 * entry is stored with its `big_img`, `img` and `title` values.
 *
 * Request errors, response errors and parse failures are passed to the
 * callback instead of being thrown from an event handler.
 *
 * @param {string} url        page to download
 * @param {Function} callback function(err, imgGallery); called at most once
 */
SpiderIfengImgs.prototype.spider = function(url, callback){
  var finished = false;
  var finish = function(err, imgGallery) {
    if (finished) {
      return;
    }
    finished = true;
    if (typeof callback === 'function') {
      callback(err, imgGallery);
    }
  };

  http.get(url, function(res){
    var arrBuf = [];
    res.on("data", function(chunk){
      arrBuf.push(chunk);
    })
    .on("error", function(err){
      finish(err, null);
    })
    .on("end", function(){
      var imgGallery = new img_gallery(url);

      try {
        var html = iconv.decode(Buffer.concat(arrBuf), 'utf-8');

        var $ = cheerio.load(html);
        imgGallery.title = $("title").text();

        var strStart = 'var G_listdata= ';
        var strEnd = '</script>';
        var idxStart = html.indexOf(strStart);
        var idxEnd = idxStart < 0 ? -1 : html.indexOf(strEnd, strStart.length + idxStart);
        if (idxEnd < 0) {
          throw new Error("Can't find G_listdata in:[" + url + ']');
        }

        // The list is a JavaScript literal, not JSON: quote the strings and
        // the known keys so that JSON.parse accepts it.
        var jsListData = html.slice(idxStart + strStart.length, idxEnd)
          .replace(/'/g, "\"")
          .replace(/title/g, '\"title\"')
          .replace(/big_img/g, '\"big_img\"')
          .replace(/originalimg/g, '\"originalimg\"')
          .replace(/picwidth/g, '\"picwidth\"')
          .replace(/picheight/g, '\"picheight\"')
          .replace(/morelink/g, '\"morelink\"')
          .replace(/img:/g, '\"img\":')
          .replace('];', ']');

        JSON.parse(jsListData).forEach(function(element, index) {
          imgGallery.push(index, element.big_img, element.img, element.title);
        });
      } catch (err) {
        finish(err, null);
        return;
      }

      // Called outside the try block so that an exception thrown by the
      // callback itself is not reported back to it as a parse failure.
      finish(null, imgGallery);
    });
  }).on('error', function(err){
    // e.g. DNS failure or refused connection
    finish(err, null);
  });

};


module.exports = SpiderIfengImgs;

ifengPictures.js

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
// Sample URL of a games.ifeng.com "gaoqing" picture page. The code visible in
// this module does not read this variable (spider() receives `strUrl`).
var url = 'http://games.ifeng.com/picture/gaoqing/detail_2015_09/11/41081883_0.shtml';

var cheerio = require("cheerio");
var http = require("http");
var iconv = require('iconv-lite');
var img_gallery = require('./img.gallery.js');

/**
 * Spider for the "gaoqing" picture pages of games.ifeng.com.
 * Instances carry no state of their own.
 */
var SpiderIfengPictures = function(){

};


// Pattern of the page URLs this spider accepts, for example:
//   http://games.ifeng.com/picture/gaoqing/detail_2015_09/11/41081883_0.shtml
// Dots are escaped so that they only match a literal dot.
SpiderIfengPictures.prototype.RegExp = /http:\/\/games\.ifeng\.com\/picture\/gaoqing\/detail_\d{4}_\d{2}\/\d{2}\/\d+_\d+\.shtml/;

/**
 * Downloads an ifeng "gaoqing" picture page and builds a gallery from it.
 *
 * The page title becomes the gallery title. The picture list is taken from the
 * text between '_listdata[0] = ' and 'new ifeng.Gallery'; the sequence of
 * `_listdata[n] = {...};` assignments is rewritten into one JSON array and
 * parsed. Every entry is stored with its `timg`, `img` and `title` values.
 *
 * Request errors, response errors and parse failures are passed to the
 * callback instead of being thrown from an event handler. The debug dump of
 * the complete page to the console was removed.
 *
 * @param {string} strUrl     page to download
 * @param {Function} callback function(err, imgGallery); called at most once
 */
SpiderIfengPictures.prototype.spider = function (strUrl, callback) {
  var finished = false;
  var finish = function(err, imgGallery) {
    if (finished) {
      return;
    }
    finished = true;
    if (typeof callback === 'function') {
      callback(err, imgGallery);
    }
  };

  http.get(strUrl, function(res){
    var arrBuf = [];
    res.on("data", function(chunk){
      arrBuf.push(chunk);
    })
    .on("error", function(err){
      finish(err, null);
    })
    .on("end", function(){
      var imgGallery = new img_gallery(strUrl);

      try {
        var html = iconv.decode(Buffer.concat(arrBuf), 'utf-8');

        var $ = cheerio.load(html);
        imgGallery.title = $("title").text();

        var strStart = '_listdata[0] = ';
        var strEnd = 'new ifeng.Gallery';
        var idxStart = html.indexOf(strStart);
        var idxEnd = idxStart < 0 ? -1 : html.indexOf(strEnd, strStart.length + idxStart);
        if (idxEnd < 0) {
          throw new Error("Can't find _listdata in:[" + strUrl + ']');
        }

        // The list is JavaScript source, not JSON: quote the strings and the
        // known keys, and turn the separate assignments into array elements.
        // `timg:` is handled before `img:`; once quoted it no longer matches.
        var jsListData = html.slice(idxStart + strStart.length, idxEnd)
          .replace(/'/g, "\"")
          .replace(/title/g, '\"title\"')
          .replace(/morelink/g, '\"morelink\"')
          .replace(/picwidth/g, '\"picwidth\"')
          .replace(/picheight/g, '\"picheight\"')
          .replace(/listimg/g, '\"listimg\"')
          .replace(/timg:/g, '\"timg\":')
          .replace(/img:/g, '\"img\":')
          .replace(/\};_listdata\[\d*\] = /g, '},')
          .replace('\};', '}');
        jsListData = '[' + jsListData + ']';

        JSON.parse(jsListData).forEach(function(element, index) {
          imgGallery.push(index, element.timg, element.img, element.title);
        });
      } catch (err) {
        finish(err, null);
        return;
      }

      // Called outside the try block so that an exception thrown by the
      // callback itself is not reported back to it as a parse failure.
      finish(null, imgGallery);
    });
  }).on('error', function(err){
    // e.g. DNS failure or refused connection
    finish(err, null);
  });

};

module.exports = SpiderIfengPictures;

qqImgs.js

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
var img_gallery = require('./img.gallery.js');
var http = require("http");
var iconv = require('iconv-lite');
var cheerio = require("cheerio");

// Sample URLs: a news.qq.com article page and a `.hdBigPic.js` address with the
// same base name. The second declaration overwrites the first. Every function
// visible in this module takes its own `url` parameter, so the visible code
// does not read this module-level value.
var url = 'http://news.qq.com/a/20160512/009639.htm';
var url = 'http://news.qq.com/a/20160512/009639.hdBigPic.js';

/**
 * Spider for picture articles on news.qq.com.
 *
 * Fields:
 *   title      - page title; null until the article page has been read
 *   imgGallery - gallery; null until the picture data has been read
 *   callback   - function(err, imgGallery) given to spider()
 */
var SpiderQQImgs = function() {
  this.title = null;
  this.imgGallery = null;
  this.callback = null;
};

// Pattern of the article URLs this spider accepts, for example:
//   http://news.qq.com/a/20160512/009639.htm
// Dots are escaped so that they only match a literal dot.
SpiderQQImgs.prototype.RegExp = /http:\/\/news\.qq\.com\/a\/\d{8}\/\d+\.htm/;

/**
 * Delivers the gallery once both the title and the gallery are available.
 *
 * It does nothing while one of the two is still missing. `title` is null until
 * the article page has been read, so any string - including an empty one from
 * a page without a title - counts as "available"; requiring a non-empty title
 * would leave such a crawl unfinished forever.
 */
SpiderQQImgs.prototype.send2callback = function() {
  if (typeof this.title !== 'string') {
    return;
  }
  if (this.imgGallery == null) {
    return;
  }
  if (typeof this.callback !== 'function') {
    return;
  }
  this.imgGallery.title = this.title;
  this.callback(null, this.imgGallery);
};

/**
 * Entry point: remembers the callback, then requests the title from the
 * article page and the pictures from the address obtained by replacing the
 * first '.htm' of the URL with '.hdBigPic.js'.
 *
 * @param {string} url        article address
 * @param {Function} callback function(err, imgGallery)
 */
SpiderQQImgs.prototype.spider = function (url, callback) {
  this.callback = callback;

  // title comes from the article page itself
  this.spiderTitle(url);

  // pictures come from the companion data file
  var galleryUrl = url.replace('.htm', '.hdBigPic.js');
  this.spiderImgGallery(galleryUrl);
};


/**
 * Downloads the article page, decodes it as gb2312 and stores the text of its
 * <title> element in `this.title`, then calls send2callback().
 *
 * A request or response error is passed to `this.callback` as
 * `callback(err, null)`. The callback is set to null first, so that neither a
 * later success nor a second failure reports again.
 *
 * @param {string} url article address
 */
SpiderQQImgs.prototype.spiderTitle = function (url) {
  var spider = this;

  var fail = function(err) {
    var callback = spider.callback;
    spider.callback = null;
    if (typeof callback === 'function') {
      callback(err, null);
    }
  };

  http.get(url, function(res){
    var arrBuf = [];
    res.on("data", function(chunk){
      arrBuf.push(chunk);
    })
    .on("error", fail)
    .on("end", function(){
      var html = iconv.decode(Buffer.concat(arrBuf), 'gb2312');

      var $ = cheerio.load(html);
      spider.title = $("title").text();
      spider.send2callback();
    });
  }).on('error', fail); // e.g. DNS failure or refused connection
};


/**
 * Downloads the picture data file, converts it to JSON and builds the gallery
 * in `this.imgGallery`, then calls send2callback().
 *
 * The body is decoded as gb2312, the first block comment is removed and single
 * quotes are turned into double quotes before parsing. Empty properties are
 * removed with deleteEmptyProperty(). The picture entries are then read from
 * the second element of `Children[0].Children`; in every entry, elements 1, 2
 * and 3 of `Children` supply the small URL, the big URL and the text.
 *
 * Request errors, response errors and parse failures are passed to
 * `this.callback` as `callback(err, null)`. The callback is set to null first,
 * so that neither a later success nor a second failure reports again.
 *
 * @param {string} url address of the picture data file
 */
SpiderQQImgs.prototype.spiderImgGallery = function (url) {
  var spider = this;

  var fail = function(err) {
    var callback = spider.callback;
    spider.callback = null;
    if (typeof callback === 'function') {
      callback(err, null);
    }
  };

  http.get(url, function(res){
    var arrBuf = [];
    res.on("data", function(chunk){
      arrBuf.push(chunk);
    })
    .on("error", fail)
    .on("end", function(){
      var imgGallery = new img_gallery(url);

      try {
        var strJson = iconv.decode(Buffer.concat(arrBuf), 'gb2312') // keeps Chinese text readable
          .replace(/\/\*[\s\S]+?\*\//, '') // remove the comment
          .replace(/\'/g, '"'); // single quotes must become double quotes for JSON.parse

        var objJson = JSON.parse(strJson);

        deleteEmptyProperty(objJson);

        var arr = objJson.Children[0].Children;
        // The first entry is skipped; the original code read an "imgCount"
        // value from it that was never used.
        arr.shift();
        var arrImgs = arr.shift().Children;

        arrImgs.forEach(function(element, index) {
          var arrField = element.Children;
          var smallUrl = arrField[1].Children[0].Content;
          var bigUrl = arrField[2].Children[0].Content;
          var strText = arrField[3].Children[0].Content;

          imgGallery.push(index, bigUrl, smallUrl, strText);
        });
      } catch (err) {
        fail(err);
        return;
      }

      spider.imgGallery = imgGallery;
      spider.send2callback();
    });
  }).on('error', fail); // e.g. DNS failure or refused connection
};



/**
 * Recursively removes "empty" properties from `object`, in place.
 *
 * Removed are: '' / null / undefined values, arrays of length 0, and objects
 * or arrays that have no enumerable property left after their own clean-up.
 * Array elements are removed with `delete`, so the array keeps its length and
 * the remaining elements keep their indices.
 */
function deleteEmptyProperty(object){
  for (var key in object) {
    var current = object[key];

    if (typeof current !== 'object') {
      // primitive (or function): only empty values are dropped
      if (current === '' || current === null || current === undefined) {
        delete object[key];
      }
      continue;
    }

    if (Array.isArray(current) && current.length == 0) {
      delete object[key];
      continue;
    }

    // clean the children first, then drop the container if nothing is left
    deleteEmptyProperty(current);
    if (isEmpty(current)) {
      delete object[key];
    }
  }
}


/**
 * Tells whether `object` has no enumerable property (own or inherited).
 */
function isEmpty(object) {
  for (var anyKey in object) {
    return false;
  }
  return true;
}


module.exports = SpiderQQImgs;

imgs.html

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
<!DOCTYPE html><html lang="zh-CN">
<head>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type">
<title>Images spider</title>
</head>

<body>
<!-- Posts the textarea content (field "txtUrls") to imgs.html; the URLs are separated by whitespace. -->
<form id="form1" action="imgs.html" method="POST">
Please input urls:<br/>
<textarea name="txtUrls" style="width:500px;height:120px;">http://news.qq.com/a/20160531/018019.htm#p=1
http://games.ifeng.com/a/20160530/41615842_0.shtml#p=1
</textarea><br/>
<br/>
<input type="submit" value="commit"/><br/>
<br/>
</form>
</body>
</html>

About Sodino