// Spider for QQ News (news.qq.com) photo articles: collects the page title and its image gallery.
var img_gallery = require('./img.gallery.js');
var http = require("http");
var iconv = require('iconv-lite');
var cheerio = require("cheerio");

// Sample gallery data URL. It is the article page
// http://news.qq.com/a/20160512/009639.htm with '.htm' swapped for
// '.hdBigPic.js', the same substitution spider() performs.
// Declared once: the earlier duplicate `var url` was immediately overwritten.
var url = 'http://news.qq.com/a/20160512/009639.hdBigPic.js';
/**
 * Spider for one QQ News photo article.
 *
 * An instance holds the state of a crawl; every field starts out as null.
 */
var SpiderQQImgs = function () {
    this.title = null;      // page <title> text, filled in by spiderTitle
    this.imgGallery = null; // gallery object, filled in by spiderImgGallery
    this.callback = null;   // result receiver, stored by spider
};

/**
 * Shape of an article URL: http://news.qq.com/a/<8 digits>/<digits>.htm
 *
 * Every dot is escaped so it matches a literal '.' only; unescaped, the
 * pattern also accepted strings such as "newsXqq.com" or "...009639xhtm".
 * The pattern is intentionally left unanchored, as before.
 */
SpiderQQImgs.prototype.RegExp = /http:\/\/news\.qq\.com\/a\/\d{8}\/\d+\.htm/;
/**
 * Hands the finished gallery to the caller once everything is available.
 *
 * Does nothing unless `title` is a non-empty string, `imgGallery` has been
 * set and `callback` is callable. When all three hold, the title is copied
 * onto the gallery and `callback(null, imgGallery)` is invoked.
 */
SpiderQQImgs.prototype.send2callback = function () {
    var titleReady = typeof this.title === 'string' && this.title.length > 0;
    var galleryReady = this.imgGallery != null; // loose check: rejects null and undefined
    // `typeof` also accepts async and generator functions; the previous
    // Object.prototype.toString test ('[object Function]') silently ignored them.
    var callbackReady = typeof this.callback === 'function';

    if (!titleReady || !galleryReady || !callbackReady) {
        return;
    }

    this.imgGallery.title = this.title;
    this.callback(null, this.imgGallery);
};
/**
 * Starts a crawl of one article: requests the page title and the gallery
 * data, whose address is the page URL with '.htm' replaced by '.hdBigPic.js'.
 *
 * @param {string} url Article page URL.
 * @param {Function} callback Stored on the instance; send2callback invokes it
 *     as `callback(null, imgGallery)` once both title and gallery are set.
 */
SpiderQQImgs.prototype.spider = function (url, callback) {
    // Drop the results of any earlier run on this instance. Otherwise the
    // first response of the new run would find the old title/gallery still
    // set and deliver a mix of old and new data.
    this.title = null;
    this.imgGallery = null;
    this.callback = callback;

    this.spiderTitle(url);
    this.spiderImgGallery(url.replace('.htm', '.hdBigPic.js'));
};
/**
 * Downloads the article page, decodes it as gb2312 and stores the text of
 * its <title> element in `this.title`, then calls send2callback().
 *
 * Request, response and decode/parse failures are passed to the stored
 * callback as `callback(err)` instead of surfacing as an unhandled 'error'
 * event or an exception thrown inside the event handler.
 *
 * @param {string} url Article page URL.
 */
SpiderQQImgs.prototype.spiderTitle = function (url) {
    var spider = this;

    // Report a failure at most once: the stored callback is cleared first so
    // a later send2callback() cannot also deliver a success for this run.
    function fail(err) {
        var callback = spider.callback;
        spider.callback = null;
        if (typeof callback === 'function') {
            callback(err);
        }
    }

    http.get(url, function (res) {
        var chunks = [];

        res.on("data", function (chunk) {
            chunks.push(chunk);
        }).on("error", fail).on("end", function () {
            var title;
            try {
                // The page is decoded as gb2312 before being parsed.
                var html = iconv.decode(Buffer.concat(chunks), 'gb2312');
                var $ = cheerio.load(html);
                title = $("title").text();
            } catch (err) {
                fail(err);
                return;
            }
            spider.title = title;
            // Kept outside the try block so an exception thrown by the
            // caller's callback is not mistaken for a crawl failure.
            spider.send2callback();
        });
    }).on("error", fail);
};
/**
 * Downloads the gallery data file, converts it to JSON and fills a new
 * img_gallery with one entry per image, then stores it in `this.imgGallery`
 * and calls send2callback().
 *
 * Request, response and parse failures (including an unexpected data shape)
 * are passed to the stored callback as `callback(err)` instead of surfacing
 * as an unhandled 'error' event or an exception inside the event handler.
 *
 * @param {string} url Address of the '.hdBigPic.js' data file; it is also
 *     passed to the img_gallery constructor.
 */
SpiderQQImgs.prototype.spiderImgGallery = function (url) {
    var spider = this;

    // Report a failure at most once: the stored callback is cleared first so
    // a later send2callback() cannot also deliver a success for this run.
    function fail(err) {
        var callback = spider.callback;
        spider.callback = null;
        if (typeof callback === 'function') {
            callback(err);
        }
    }

    http.get(url, function (res) {
        var chunks = [];

        res.on("data", function (chunk) {
            chunks.push(chunk);
        }).on("error", fail).on("end", function () {
            var imgGallery;
            try {
                imgGallery = new img_gallery(url);

                var strJson = iconv.decode(Buffer.concat(chunks), 'gb2312') // gb2312 so Chinese text is not garbled
                    .replace(/\/\*[\s\S]+?\*\//, '') // strip the first block comment
                    .replace(/\'/g, '"'); // single quotes -> double quotes so JSON.parse accepts it

                var objJson = JSON.parse(strJson);
                deleteEmptyProperty(objJson);

                var sections = objJson.Children[0].Children;
                // Discard the leading section; the earlier code read an
                // image count from it but never used the value.
                sections.shift();
                var arrImgs = sections.shift().Children;

                // NOTE(review): assumes each image node keeps its small URL,
                // big URL and caption at child positions 1, 2 and 3. A node
                // that lacks one of them raises a TypeError here, which is
                // reported through the callback.
                arrImgs.forEach(function (element, index) {
                    var fields = element.Children;
                    var smallUrl = fields[1].Children[0].Content;
                    var bigUrl = fields[2].Children[0].Content;
                    var strText = fields[3].Children[0].Content;
                    imgGallery.push(index, bigUrl, smallUrl, strText);
                });
            } catch (err) {
                fail(err);
                return;
            }

            spider.imgGallery = imgGallery;
            // Kept outside the try block so an exception thrown by the
            // caller's callback is not mistaken for a crawl failure.
            spider.send2callback();
        });
    }).on("error", fail);
};
/**
 * Recursively removes "empty" properties from `object`, in place.
 *
 * A property is removed when its value is '', null, undefined, an empty
 * array, or an object/array that isEmpty() reports as empty once its own
 * contents have been cleaned. Array elements are removed with `delete`, so
 * the array keeps its length and the remaining elements keep their indexes.
 *
 * @param {Object|Array} object Structure to clean; mutated directly.
 */
function deleteEmptyProperty(object) {
    for (var key in object) {
        var value = object[key];

        // Primitives (and null, handled explicitly rather than by falling
        // through the object branch): drop only the empty ones.
        if (value === null || typeof value !== 'object') {
            if (value === '' || value === null || value === undefined) {
                delete object[key];
            }
            continue;
        }

        // An empty array needs no recursion.
        if (Array.isArray(value) && value.length === 0) {
            delete object[key];
            continue;
        }

        // Clean the children first; the container may become empty as a result.
        deleteEmptyProperty(value);
        if (isEmpty(value)) {
            delete object[key];
        }
    }
}
/**
 * Tells whether `object` has no enumerable properties at all.
 *
 * Uses for...in, so inherited enumerable properties count as content and
 * null/undefined are reported as empty.
 *
 * @param {*} object Value to inspect.
 * @returns {boolean} true when for...in visits no property.
 */
function isEmpty(object) {
    var sawProperty = false;
    for (var key in object) {
        sawProperty = true;
        break;
    }
    return !sawProperty;
}
// The spider constructor is this module's sole export.
module.exports = SpiderQQImgs;