爬取网站的图片之一

mutongwu

浏览: 452505 次
性别:
来自: 广州

最近访客 更多访客>>

lxswfc

xjg9083

oszerone

micai_1234

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

Nodejs

nodejs JavaScript

    var http = require('http');  
    var fs = require('fs');
    var request = require('request');

/**
 * Decode a Base64 string and interpret the decoded bytes as UTF-8 text.
 *
 * Uses Node's built-in Buffer instead of a hand-rolled table lookup. The
 * previous implementation emitted spurious NUL characters for unpadded input
 * (`indexOf('')` is 0, not 64), produced garbage for characters outside the
 * Base64 alphabet, and could not decode 4-byte UTF-8 sequences.
 *
 * @param {*} data - Base64 text; non-string values are coerced to a string.
 * @returns {*} The decoded string, or `data` itself when it is falsy
 *     (empty string, null, undefined, 0, ...).
 */
function base64_decode (data) {
    // Falsy input is handed back untouched, as callers may pass '' or null.
    if (!data) {return data;}
    return Buffer.from(data + '', 'base64').toString('utf8');
}
/**
 * Encode a string as UTF-8 and return the Base64 representation of those
 * bytes.
 *
 * Uses Node's built-in Buffer instead of a hand-rolled encoder. The previous
 * implementation encoded each half of a surrogate pair separately, so any
 * character outside the Basic Multilingual Plane produced Base64 that
 * standard decoders cannot read back.
 *
 * @param {*} data - Text to encode; non-string values are coerced to a string.
 * @returns {*} The padded Base64 string, or `data` itself when it is falsy.
 */
function base64_encode (data) {
    // Falsy input is handed back untouched, mirroring base64_decode.
    if (!data){return data;}
    return Buffer.from(data + '', 'utf8').toString('base64');
}
/**
 * Decode a "binary string" (one character per byte) holding UTF-8 data into
 * a regular JavaScript string.
 *
 * Lead bytes 0xC0-0xDF start a 2-byte sequence and lead bytes 0xF0-0xF7 start
 * a 4-byte sequence (emitted as a UTF-16 surrogate pair). Every other
 * non-ASCII lead value is treated as the start of a 3-byte sequence, which is
 * how this function has always handled them. Missing continuation bytes
 * contribute zero bits rather than raising an error.
 *
 * @param {*} str_data - Byte string to decode; coerced to a string.
 * @returns {string} The decoded text.
 */
function utf8_decode ( str_data ) {
    var bytes = str_data + '';
    var out = [];
    var i = 0;
    var c1, c2, c3, c4, cp;

    while ( i < bytes.length ) {
        c1 = bytes.charCodeAt(i);
        if (c1 < 128) {
            out.push(String.fromCharCode(c1));
            i++;
        } else if ((c1 > 191) && (c1 < 224)) {
            c2 = bytes.charCodeAt(i+1);
            out.push(String.fromCharCode(((c1 & 31) << 6) | (c2 & 63)));
            i += 2;
        } else if ((c1 > 239) && (c1 < 248)) {
            // 4-byte sequence: previously mis-read as a 3-byte one, which
            // corrupted every character above U+FFFF and desynchronised the
            // rest of the string.
            c2 = bytes.charCodeAt(i+1);
            c3 = bytes.charCodeAt(i+2);
            c4 = bytes.charCodeAt(i+3);
            cp = ((c1 & 7) << 18) | ((c2 & 63) << 12) | ((c3 & 63) << 6) | (c4 & 63);
            if (cp < 0x10000) {
                // Overlong encoding: still fits in a single UTF-16 unit.
                out.push(String.fromCharCode(cp));
            } else if (cp > 0x10FFFF) {
                // Beyond the Unicode range: emit the replacement character.
                out.push('\uFFFD');
            } else {
                cp -= 0x10000;
                out.push(String.fromCharCode(0xD800 + (cp >> 10), 0xDC00 + (cp & 0x3FF)));
            }
            i += 4;
        } else {
            c2 = bytes.charCodeAt(i+1);
            c3 = bytes.charCodeAt(i+2);
            out.push(String.fromCharCode(((c1 & 15) << 12) | ((c2 & 63) << 6) | (c3 & 63)));
            i += 3;
        }
    }
    return out.join('');
}

/**
 * Encode a JavaScript string as UTF-8 and return it as a "binary string"
 * (one character per encoded byte).
 *
 * A valid surrogate pair is combined into its code point and written as one
 * 4-byte sequence. A lone surrogate is still written as a 3-byte sequence,
 * exactly as every non-ASCII UTF-16 unit above U+07FF was before.
 *
 * @param {*} argString - Text to encode; coerced to a string.
 * @returns {string} Byte string containing the UTF-8 encoding.
 */
function utf8_encode ( argString ) {
    var string = (argString+'');
    var parts = [];
    var stringl = string.length;

    for (var n = 0; n < stringl; n++) {
        var c1 = string.charCodeAt(n);

        // Combine a high/low surrogate pair into one code point. Previously
        // each half was encoded on its own, producing CESU-8 rather than UTF-8.
        if (c1 >= 0xD800 && c1 <= 0xDBFF && n + 1 < stringl) {
            var low = string.charCodeAt(n + 1);
            if (low >= 0xDC00 && low <= 0xDFFF) {
                c1 = 0x10000 + ((c1 - 0xD800) << 10) + (low - 0xDC00);
                n++;
            }
        }

        if (c1 < 128) {
            parts.push(String.fromCharCode(c1));
        } else if (c1 < 2048) {
            parts.push(String.fromCharCode((c1 >> 6) | 192, (c1 & 63) | 128));
        } else if (c1 < 0x10000) {
            parts.push(String.fromCharCode((c1 >> 12) | 224, ((c1 >> 6) & 63) | 128, (c1 & 63) | 128));
        } else {
            parts.push(String.fromCharCode(
                (c1 >> 18) | 240,
                ((c1 >> 12) & 63) | 128,
                ((c1 >> 6) & 63) | 128,
                (c1 & 63) | 128
            ));
        }
    }
    return parts.join('');
}


// Module-level tallies, both starting at zero. Nothing else in the visible
// source reads or writes them.
var counter = 0,
    totalNum = 0;

/**
 * Stream one remote file to disk.
 *
 * Exactly one continuation runs per call: `fn` once the file stream has
 * closed successfully, or the error path if either the HTTP stream or the
 * file stream fails. Previously only the file stream had an 'error' listener,
 * so a network failure surfaced as an unhandled 'error' event, and a 'close'
 * arriving after an 'error' could run both continuations.
 *
 * On error the message is logged and the crawl moves on to the next chapter
 * when one is known and the chapter limit has not been reached; otherwise
 * `chapterNow` is incremented.
 * NOTE(review): unlike downloadFiles, the error path does not increment
 * `chapterNow` before starting the next chapter - confirm that is intended.
 *
 * @param {string} url - Address of the picture to fetch.
 * @param {string} filePath - Destination path on disk.
 * @param {Function} fn - Called with no arguments after a successful download.
 */
function downloadPic(url,filePath,fn){
    var settled = false;

    function fail(){
        if(settled){
            return;
        }
        settled = true;
        log('download pic error!');
        if(nextChapterPath && chapterNow < chapterNum){
            downloadChapter(nextChapterPath);
        }else{
            chapterNow++;
        }
    }

    var source = request(url);
    var target = fs.createWriteStream(filePath);

    source.on('error', function(){
        fail();
        // pipe() does not end the destination when the source errors, so
        // close the file explicitly to release the descriptor.
        target.end();
    });

    target.on('close', function(){
        if(settled){
            return;
        }
        settled = true;
        fn();
    }).on('error', fail);

    source.pipe(target);
}

/**
 * Download every URL in `arr` one after another into `baseDir`, naming the
 * files `0.jpg`, `1.jpg`, ... by array position.
 *
 * When the list is exhausted, or was empty to begin with, the chapter is
 * reported as done, `chapterNow` is incremented, and the next chapter is
 * started if one is known and the chapter limit has not been reached.
 * Previously an empty list still issued a download for `undefined`.
 *
 * @param {string[]} arr - Picture URLs in page order.
 * @param {string} baseDir - Directory that receives the files.
 */
function downloadFiles(arr,baseDir){
    var length = arr.length;
    var i = 0;

    // Wrap up the current chapter and continue the crawl if allowed.
    function finishChapter(){
        console.log('Chapter ' + chapterNow + ' All done!');
        chapterNow++;
        if(nextChapterPath && chapterNow < chapterNum){
            downloadChapter(nextChapterPath);
        }
    }

    function loadNext(index){
        downloadPic(arr[index], baseDir + '/' + index + '.jpg',function(){
            i++;
            if(i < length){
                loadNext(i);
            }else{
                finishChapter();
            }
        });
    }

    if(length === 0){
        finishChapter();
        return;
    }
    loadNext(i);
}



// Root folder for downloaded chapters and the log file. The trailing slash
// matters because names are appended by plain string concatenation.
var baseDir = 'F:/node_test/Pictures/';

// Captures the Base64 payload assigned to `qTcms_S_m_murl_e` in the page.
var urlReg = new RegExp('qTcms_S_m_murl_e\\s*=\\s*"([\\w+/=]+)"');
// Keeps only entries that look like absolute web URLs. https is accepted as
// well as http so pictures served over TLS are not silently dropped.
var isUrlReg = /^https?:\/\//;
// Captures the digits inside the quoted value assigned to `qTcms_S_m_playm`.
var chapterReg = new RegExp('qTcms_S_m_playm\\s*=\\s*"\\W+(\\d+)\\W+"');
// Captures the quoted path assigned to `qTcms_Pic_nextArr`. The character
// class previously lacked `+` and required a literal `html"`, so it could
// never match a real path.
var nextChapterReg = new RegExp('qTcms_Pic_nextArr\\s*=\\s*"([/\\w.]+)"');

// Crawl progress: chapters handled so far and the limit on how many to fetch.
var chapterNow = 0,
    chapterNum = 10;
// Path of the chapter to fetch next; empty or null when none is known.
var nextChapterPath = '';

/**
 * Append one line to `log.txt` inside `baseDir`.
 *
 * The write is asynchronous; if it fails, the error is printed to the
 * console and otherwise ignored.
 *
 * @param {*} data - Value to record; a CRLF line ending is appended.
 */
function log(data){
    var logFile = baseDir + 'log.txt';
    var line = data + '\r\n';

    fs.appendFile(logFile, line, 'utf8', function(writeErr){
        if(!writeErr){
            return;
        }
        console.log(writeErr);
    });
}



/**
 * Fetch one chapter page, extract its picture list and start downloading it.
 *
 * Steps once the page body has arrived:
 *   1. read the chapter number and derive the chapter directory; stop if the
 *      number is missing or the directory already exists;
 *   2. remember the path of the next chapter in `nextChapterPath` (null when
 *      the page does not name one);
 *   3. Base64-decode the picture list, split it on '$', keep the entries that
 *      pass `isUrlReg`, and pass them to downloadFiles.
 *
 * Request and response errors are now logged instead of crashing the process
 * as unhandled 'error' events, and regex captures are read from the match
 * result rather than the deprecated `RegExp.$1` static.
 *
 * @param {string} path - Chapter path appended verbatim to the site origin.
 */
function downloadChapter(path){
    log('//------------------------------------------------------------------//');

    var page = 'http://some.page.com' + path;
    log('正在读取:' + page);

    function onFetchError(err){
        log('download chapter error: ' + page + ' ' + (err && err.message));
    }

    var html = '';
    http.get(page , function(res) {
        res.setEncoding('utf-8');
        res.on('data', function(data) {
            // Accumulate the body; it is only parsed once complete.
            html += data;
        }).on('end', function() {
            var chapterMatch = chapterReg.exec(html);
            if(!chapterMatch){
                console.log('找不到章节');
                return;
            }

            var dir = baseDir + '第' + chapterMatch[1] + '话';
            if (fs.existsSync(dir)) {
                // An existing directory is treated as "already downloaded".
                console.log(dir + ' 已存在!');
                return;
            }
            fs.mkdirSync(dir);
            log('文件存放目录：' + dir);

            // Declared locally so this function does not depend on the
            // module-level pattern of the same purpose.
            var nextReg = new RegExp('qTcms_Pic_nextArr\\s*=\\s*"([/\\w.]+)"');
            var nextMatch = nextReg.exec(html);
            if(nextMatch){
                nextChapterPath = nextMatch[1];
                log('下一个页面地址：' + nextChapterPath);
            }else{
                nextChapterPath = null;
            }

            var urlMatch = urlReg.exec(html);
            if(!urlMatch){
                // Dump the page so the missing picture list can be diagnosed.
                console.log(html);
                return;
            }

            var url64 = base64_decode(urlMatch[1]);
            log('图片数组:\r\n' + url64);

            var picArr = url64.split('$').filter(function(val){
                return isUrlReg.test(val);
            });

            downloadFiles(picArr,dir);
        }).on('error', onFetchError);
    }).on('error', onFetchError);
}

// Entry point: start the crawl at the first chapter. 'someurl' is a
// placeholder; downloadChapter appends the argument directly to the site
// origin, so a real value needs its own leading '/'.
downloadChapter('someurl');

分享到：

HTML History API | 爬取网站的图片

2015-03-05 18:12
浏览 907
评论(0)
分类:Web前端
查看更多

发表评论

您还没有登录，请您登录后再发表评论。

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

爬取网站的图片之一

评论

发表评论

相关推荐

最近访客 更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

爬取网站的图片之一

评论

发表评论

相关推荐

webpack简单打包PC网站前端资源

grunt构建基于seajs的网站实现

fis-amd 的使用与修改

gulp下静态资源的合并、压缩、MD5后缀

爬取网站的图片

cordova3+sencha touch2.x 环境搭建

linux上nodejs安装

nodejs jsdom安装

网站图片抓取

最近访客更多访客>>