构建自己的DSL之一 Simple Crawler

fuliang

浏览: 1663245 次
性别:
来自: 北京

最近访客更多访客>>

依然任逍遥

stephenworld

lli

samwalt

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

Ruby
Machine Learning

Ruby DSL Crawler

转载请标明出处：http://fuliang.iteye.com/blog/1122008

经常需要从网上抓取一些需要的内容做成语料，供分类使用。所以需要一个灵活的抓取、抽取程序-自己的DSL来做这件事，这样每次只需要写几行代码就能得到需要的内容。比如我比较希望以下几行代码就能把我的博客的内容给抓下来：


# Example crawl: read a list page of the blog, gather the article links on it,
# then print each article as "title<TAB>content" with all whitespace removed.
crawler = Crawler.new
(1..10).each do |pn|
    article_urls = []
    crawler.fetch("http://fuliang.iteye.com/?page=#{pn}") do |list_page|
        # Each matching anchor carries a site-relative href; prefix the host.
        list_page.css("div.blog_title > h3 > a").each do |link|
            article_urls.push("http://fuliang.iteye.com#{link.attributes['href']}")
        end
    end

    article_urls.each do |article_url|
        crawler.fetch(article_url) do |article|
            # Passing a Hash of field => XPath yields one Hash per record.
            records = article.xpath(:title => '//*[@id="main"]/div/div[2]/h3/a',:content => '//*[@id="blog_content"]')
            records.each do |entry|
                title, content = [:title, :content].map { |field| entry[field].text.gsub(/\s+/,"") }
                printf("%s\t%s\n", title, content)
            end
        end
    end
    # Leaves the loop after the first list page, so pages 2..10 are never requested.
    break
end

我们先创建一个Crawler对象，然后按照我博客列表页的分页特征，得到第pn页的url：
http://fuliang.iteye.com/?page=#{pn}。当然也可能有更复杂的规则，这时可以先构建列表页的url列表，然后遍历。crawler只有一个fetch方法，它把页面fetch下来，然后把得到的页面交给块处理。这个页面可以直接根据xpath、css来得到需要抽取的内容，还可以一次抽取一条记录：只需要向xpath、css方法传递一个从字段到xpath/css的hash，就能得到对应记录的hash。
按照上面的描述，我们先编写一个简单的Crawler，为了防止被封我们使用了几个代理：

# Fetches web pages through a randomly chosen proxy and hands each parsed
# page to the caller's block as a Page.
class Crawler
    def initialize
        @proxies = (1..6).map { |index| "http://l-crwl#{index}:1080" }
    end

    # Downloads +url+, parses it with Nokogiri and yields the resulting Page.
    #
    # The URL is parsed with URI.parse and opened through open-uri's
    # URI#open, so the string is never handed to Kernel#open (which treats a
    # leading "|" as a command to run). The block form closes the downloaded
    # IO as soon as Nokogiri has read it.
    #
    # url - String URL of the page to fetch.
    #
    # Returns whatever the caller's block returns.
    def fetch(url)
        document = URI.parse(url).open(fetch_options) { |io| Nokogiri::HTML(io) }
        yield Page.new(document)
    end

    private

    # Picks one of the configured proxies at random, whatever the list length.
    def rand_proxy
        @proxies.sample
    end

    # Builds the open-uri options for one request.
    #
    # open-uri treats String keys as HTTP request headers and Symbol keys as
    # its own options, so the proxy must be given as :proxy; a "proxy" String
    # key would only add a meaningless request header.
    def fetch_options
        user_agent = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.2) Gecko/20061201 Firefox/2.0.0.2 (Ubuntu-feisty)"

        {
            "User-Agent" => user_agent,
            :proxy => rand_proxy
        }
    end
end

然后我们定义Page类，动态定义了css和xpath方法，并直接代理给Nokogiri
的css、xpath，让它来做事情，我们只需收集一下抽取的结果，一切就ok了：

# Wraps a parsed document and exposes #css and #xpath, each of which accepts
# either a single selector or a Hash of field name => selector.
class Page
    # html - the parsed document; it must respond to #css and #xpath.
    def initialize(html)
        @html = html
    end

    # Defines #css and #xpath with identical behaviour:
    #
    # arg - a String selector, or a Hash mapping field names to selectors.
    #
    # With a String the underlying match result is returned as is. With a Hash
    # an Array of record Hashes is returned (see #extract_records). When a
    # block is given, the result is passed to it and the block's value is
    # returned instead.
    #
    # Raises ArgumentError when arg is neither a String nor a Hash.
    [:css, :xpath].each do |extract_by|
        define_method(extract_by) do |arg, &block|
            result =
                case arg
                when String then @html.send(extract_by, arg)
                when Hash then extract_records(extract_by, arg)
                else raise ArgumentError, 'Argument type must String or Hash type'
                end
            block ? block.call(result) : result
        end
    end

    private

    # Runs one selector per field and combines the matches row by row.
    #
    # The number of records equals the number of matches of the FIRST field.
    # A field with fewer matches contributes nil for the missing rows, and
    # extra matches of later fields are ignored. An empty Hash gives [].
    #
    # Records are built key by key rather than by flattening pairs, so a match
    # that is itself array-like is stored intact.
    def extract_records(extract_by, fields)
        return [] if fields.empty?

        columns = fields.map { |key, selector| [key, @html.send(extract_by, selector).to_a] }
        row_count = columns.first.last.size
        (0...row_count).map do |row|
            columns.each_with_object({}) { |(key, matches), record| record[key] = matches[row] }
        end
    end
end

整个的代码：

#!/usr/bin/env ruby

require 'rubygems'
require 'nokogiri'
require 'open-uri'

# Fetches web pages through a randomly chosen proxy and hands each parsed
# page to the caller's block as a Page.
class Crawler
    def initialize
        @proxies = (1..6).map { |index| "http://l-crwl#{index}:1080" }
    end

    # Downloads +url+, parses it with Nokogiri and yields the resulting Page.
    #
    # The URL is parsed with URI.parse and opened through open-uri's
    # URI#open, so the string is never handed to Kernel#open (which treats a
    # leading "|" as a command to run). The block form closes the downloaded
    # IO as soon as Nokogiri has read it.
    #
    # url - String URL of the page to fetch.
    #
    # Returns whatever the caller's block returns.
    def fetch(url)
        document = URI.parse(url).open(fetch_options) { |io| Nokogiri::HTML(io) }
        yield Page.new(document)
    end

    private

    # Picks one of the configured proxies at random, whatever the list length.
    def rand_proxy
        @proxies.sample
    end

    # Builds the open-uri options for one request.
    #
    # open-uri treats String keys as HTTP request headers and Symbol keys as
    # its own options, so the proxy must be given as :proxy; a "proxy" String
    # key would only add a meaningless request header.
    def fetch_options
        user_agent = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.2) Gecko/20061201 Firefox/2.0.0.2 (Ubuntu-feisty)"

        {
            "User-Agent" => user_agent,
            :proxy => rand_proxy
        }
    end
end

# Wraps a parsed document and exposes #css and #xpath, each of which accepts
# either a single selector or a Hash of field name => selector.
class Page
    # html - the parsed document; it must respond to #css and #xpath.
    def initialize(html)
        @html = html
    end

    # Defines #css and #xpath with identical behaviour:
    #
    # arg - a String selector, or a Hash mapping field names to selectors.
    #
    # With a String the underlying match result is returned as is. With a Hash
    # an Array of record Hashes is returned (see #extract_records). When a
    # block is given, the result is passed to it and the block's value is
    # returned instead.
    #
    # Raises ArgumentError when arg is neither a String nor a Hash.
    [:css, :xpath].each do |extract_by|
        define_method(extract_by) do |arg, &block|
            result =
                case arg
                when String then @html.send(extract_by, arg)
                when Hash then extract_records(extract_by, arg)
                else raise ArgumentError, 'Argument type must String or Hash type'
                end
            block ? block.call(result) : result
        end
    end

    private

    # Runs one selector per field and combines the matches row by row.
    #
    # The number of records equals the number of matches of the FIRST field.
    # A field with fewer matches contributes nil for the missing rows, and
    # extra matches of later fields are ignored. An empty Hash gives [].
    #
    # Records are built key by key rather than by flattening pairs, so a match
    # that is itself array-like is stored intact.
    def extract_records(extract_by, fields)
        return [] if fields.empty?

        columns = fields.map { |key, selector| [key, @html.send(extract_by, selector).to_a] }
        row_count = columns.first.last.size
        (0...row_count).map do |row|
            columns.each_with_object({}) { |(key, matches), record| record[key] = matches[row] }
        end
    end
end

# Script entry point: read a list page of the blog, gather the article links
# on it, then print each article as "title<TAB>content" with all whitespace
# removed.
crawler = Crawler.new
(1..10).each do |pn|
    article_urls = []
    crawler.fetch("http://fuliang.iteye.com/?page=#{pn}") do |list_page|
        # Each matching anchor carries a site-relative href; prefix the host.
        list_page.css("div.blog_title > h3 > a").each do |link|
            article_urls.push("http://fuliang.iteye.com#{link.attributes['href']}")
        end
    end

    article_urls.each do |article_url|
        crawler.fetch(article_url) do |article|
            # Passing a Hash of field => XPath yields one Hash per record.
            records = article.xpath(:title => '//*[@id="main"]/div/div[2]/h3/a',:content => '//*[@id="blog_content"]')
            records.each do |entry|
                title, content = [:title, :content].map { |field| entry[field].text.gsub(/\s+/,"") }
                printf("%s\t%s\n", title, content)
            end
        end
    end
    # Leaves the loop after the first list page, so pages 2..10 are never requested.
    break
end

3
顶

2
踩

分享到：

构建自己的DSL之二抓取文本处理 | paper and book阅读

2011-07-11 22:08
浏览 3047
评论(0)
分类:编程语言
查看更多

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论