论坛首页 编程语言技术论坛

[爬虫源码]和大家分享一下瓜子二手车上的二手车信息爬虫源码

浏览 2936 次
精华帖 (0) :: 良好帖 (0) :: 新手帖 (0) :: 隐藏帖 (0)
作者 正文
   发表时间:2016-06-07  
C++
使用JavaScript编写的爬虫源码,用于爬取瓜子二手车上的二手车信息

源码如下:

// Entry URL for the crawl. The trailing `//@input(...)` annotation is left untouched:
// it looks like a directive consumed by the hosting crawler tool to prompt for this
// value (NOTE(review): confirm against the platform docs before editing it).
var scanUrl = "http://www.guazi.com/hz/buy/";//@input(scanUrl, 入口url, 请输入一个需爬取城市的url,格式为:“http://www.guazi.com/城市名称/buy/”)

// Trim once, up front, so the index arithmetic below and every later use of
// `scanUrl` operate on the same string.
scanUrl = scanUrl.trim();

/**
 * Extracts the city segment from an entry URL shaped like
 * "http://www.guazi.com/<city>/buy/".
 *
 * @param {string} url - Entry URL; surrounding whitespace is ignored.
 * @returns {string} The text between ".com/" and "/buy/", or "" when the URL is
 *     empty or does not contain both markers in that order with a city between them.
 */
function parseCityFromScanUrl(url) {
    var normalized = String(url === null || url === undefined ? "" : url).trim();
    var hostMarker = ".com/";
    var start = normalized.indexOf(hostMarker);
    var end = normalized.indexOf("/buy/");

    if (start === -1 || end === -1) {
        return "";
    }

    start += hostMarker.length;
    // `end <= start` means there is no city segment (e.g. ".com/buy/"); without this
    // guard substring() would silently swap its arguments and return garbage.
    return end > start ? normalized.substring(start, end) : "";
}

// Always a string, so the URL regexes built from it never contain "undefined".
var city = parseCityFromScanUrl(scanUrl);

// Crawl configuration handed to the Crawler constructor further down.
// Built inside an IIFE so the helper table below does not leak into the global scope.
var configs = (function () {
    // [field name, XPath selector] pairs, in the order the fields are declared.
    var selectorTable = [
        ["car_name", "//h1[contains(@class,'dt-titletype')]"],
        ["car_price", "//span[contains(@class,'fc-org pricestype')]"],
        ["car_license", "//li[contains(@class,'one')]/b"],
        ["car_mileage", "//ul[contains(@class,'assort')]/li[2]/b"],
        ["car_gearbox", "//ul[contains(@class,'assort')]/li[3]/b"],
        ["car_emission_standard", "//li[contains(@class,'em-sta detailHoverTips')]/b"],
        ["car_license_location", "//ul[contains(@class,'assort')]/li[5]/b"],
        ["car_owner", "//li[contains(@class,'owner')]/text()[2]"],
        ["car_description", "//*[@id='base']/p"]
    ];

    // Expand each pair into the { name, selector } object shape used by `fields`.
    var fieldList = selectorTable.map(function (entry) {
        return {
            name: entry[0],
            selector: entry[1]
        };
    });

    return {
        domains: ["guazi.com"],
        // The single entry URL configured at the top of the script.
        scanUrls: [scanUrl],
        // Matches "<city>/<word chars>.htm" URLs — presumably the per-car detail pages.
        contentUrlRegexes: ["https?://www\\.guazi\\.com/" + city + "/\\w+\\.htm"],
        // Matches "<city>/buy/" with an optional "o<digits>/" suffix — presumably the
        // paginated listing pages.
        helperUrlRegexes: ["https?://www\\.guazi\\.com/" + city + "/buy/(o\\d+/)?"],
        enableJS: false,
        // NOTE(review): unit is not visible here — presumably milliseconds; confirm.
        interval: 10000,
        fields: fieldList
    };
})();

/**
 * Post-processes the raw value extracted for one field.
 *
 * - "car_price": joins the content of the `//b` node (with any "&yen;" entity
 *   turned into "¥") and the remainder of the fragment with `//b` excluded.
 * - "car_owner": strips surrounding whitespace.
 * - "car_description": removes every empty `<em></em>` pair.
 * - any other field: returned unchanged.
 *
 * @param {string} fieldName - Name of the field being processed (see configs.fields).
 * @param {string} data - Raw extracted value for that field.
 * @param {*} page - Page context supplied by the caller; not used here.
 * @returns {*} The cleaned value, or `data` unchanged when nothing applies.
 */
configs.afterExtractField = function(fieldName, data, page) {
    // A selector that matched nothing would otherwise make the string calls below throw.
    if (data === null || data === undefined) {
        return data;
    }

    if (fieldName === "car_price") {
        // NOTE(review): the published source read replace("¥", "¥"), a no-op —
        // presumably the search term was the HTML entity and got decoded when the
        // post was rendered. Replacing "&yen;" is harmless if that guess is wrong.
        var price = extract(data, "//b").replace(/&yen;/g, "¥");
        var coinUnit = exclude(data, "//b");
        return (price + coinUnit);
    }
    else if (fieldName === "car_owner") {
        return data.trim();
    }
    else if (fieldName === "car_description") {
        // A string pattern only replaces the first match; use a global regex so
        // every empty <em></em> pair is removed.
        return data.replace(/<em><\/em>/g, "");
    }
    return data;
};

// Build a crawler from the `configs` object and start the crawl.
// NOTE(review): `Crawler` is not defined anywhere in this script — presumably it is
// a global injected by the hosting crawler runtime; confirm before running elsewhere.
var crawler = new Crawler(configs);
crawler.start();


代码运行方法点这里:


如何运行爬虫



论坛首页 编程语言技术版

跳转论坛:
Global site tag (gtag.js) - Google Analytics