- 浏览: 44682 次
- 性别:
- 来自: 杭州
#! /usr/local/bin/ruby
require 'hpricot'
require "open-uri"
require 'net/http'
require 'CompanyLink'
require 'parser/CompanyParser'
require 'parser/CompanyParser2'
require 'export/CsvExport'
require 'export/MysqlExport'

# Crawls company search result pages on search.china.alibaba.com, visits each
# listed company site, parses its profile and contact pages through the parser
# supplied by CompanyLink, and hands the parsed company to an exporter.
#
# An exporter (any object responding to +doExport+) must be supplied through
# +setExporter+ or +exporter=+; without one, companies are parsed but a
# reminder is printed instead of exporting.
class AlibabaWebCatcher
  # Number of results per search page; used both in the search query string
  # and when computing the total page count.
  PAGE_SIZE = 30

  # Allows `catcher.exporter = SomeExport.new` in addition to setExporter.
  attr_writer :exporter

  def initialize
    @exporter = nil
  end

  # Sets the exporter that receives every parsed company.
  #
  # export - object responding to doExport(company), or nil to disable export.
  def setExporter(export = nil)
    @exporter = export
  end

  # Collects a single company from its site address.
  #
  # Fetches the introduce page and the contact page, feeds each successfully
  # fetched page to the company parser, then exports the result. Errors raised
  # while fetching, parsing or exporting are printed and swallowed so that one
  # bad company does not abort a whole list crawl.
  #
  # companySite - company site address understood by CompanyLink.
  def collectByCompanyLink(companySite)
    companyLink = CompanyLink.new(companySite)
    companyHostUri = companyLink.hostUri
    companyIntroduceUri = companyLink.introduceUri
    companyContactUri = companyLink.contactUri
    begin
      puts 'Collect company : ' + companySite
      http = Net::HTTP.new(companyHostUri, 80)

      profileHtml = fetchPage(http, companyHostUri, companyIntroduceUri)
      companyLink.parser.parseProfile(profileHtml) unless profileHtml.nil?

      # The contact page is attempted even when the profile page failed.
      contactHtml = fetchPage(http, companyHostUri, companyContactUri)
      companyLink.parser.parseContact(contactHtml) unless contactHtml.nil?

      if @exporter.nil?
        puts 'Please provider exporter for export function.'
      else
        @exporter.doExport(companyLink.parser.getCompany)
      end
    rescue StandardError => err
      # StandardError rather than Exception so Ctrl-C / exit still stop the crawl.
      puts 'Exception happend caused by : ' + err.to_s
    ensure
      puts 'Collect company : ' + companySite + " finished."
    end
  end

  # Collects every company listed on a search result page and on all the
  # pages reachable by following its "next page" link.
  #
  # companyListUrl - search result URL; the part after "alibaba.com" is used
  #                  as the request path on the search host.
  #
  # Raises ArgumentError when the URL has no path after "alibaba.com";
  # any other StandardError is printed and re-raised.
  def collectByCompanyList(companyListUrl)
    companySearchHostUri = getCompanySearchHost
    http = Net::HTTP.new(companySearchHostUri, 80)
    pageUrl = companyListUrl
    # Iterate instead of recursing so long result lists do not grow the stack.
    until pageUrl.nil?
      # Limit 2: keep the whole path even if "alibaba.com" recurs in the query.
      companySearchUri = pageUrl.split(/alibaba\.com/, 2)[1]
      if companySearchUri.nil?
        raise ArgumentError, "No path after alibaba.com in: #{pageUrl}"
      end

      listHtml = fetchPage(http, companySearchHostUri, companySearchUri)
      break if listHtml.nil?

      collectCompanies(listHtml)
      pageUrl = getNextSearchPage(listHtml)
    end
  rescue StandardError => err
    puts 'Exception happend caused by : ' + err.to_s
    raise
  end

  # Searches companies by keywords and collects every result page sequentially.
  #
  # searchKeywords - keyword string placed in the search query.
  def collectByKeywords(searchKeywords)
    puts 'Searching ... ' + searchKeywords
    collectByCompanyList(getCompanySearchHost + getCompanySearchUri(searchKeywords))
  end

  # Searches companies by keywords and collects the result pages concurrently,
  # one thread per page.
  #
  # The first result page is fetched to learn the total page count and the
  # numbered-page URI pattern (taken from its "next page" link). When there is
  # no next-page link, only the already-fetched first page is collected.
  #
  # searchKeywords - keyword string placed in the search query.
  def collectByKeywordsByThread(searchKeywords)
    puts 'Searching ... ' + searchKeywords
    companySearchHostUri = getCompanySearchHost
    companySearchUri = getCompanySearchUri(searchKeywords)
    firstHtml = fetchPage(Net::HTTP.new(companySearchHostUri, 80),
                          companySearchHostUri, companySearchUri)
    return if firstHtml.nil?

    totalPage = getTotalPage(firstHtml)
    secondPage = getNextSearchPage(firstHtml)
    if secondPage.nil?
      # Single page of results: there is no numbered link to build page URIs from.
      collectCompanies(firstHtml)
      return
    end

    # e.g. ".../company/<keywords>/2.html..." -> "/company/<keywords>/2.html..."
    uriTemplate = '/company' + secondPage.split(/\/company/)[1]

    threads = (1..totalPage).map do |pageNumber|
      # Pass the page number as a thread argument so every thread works on its
      # own copy instead of a loop variable shared by all threads.
      Thread.new(pageNumber) do |pageNo|
        begin
          pageUri = uriTemplate.gsub(/\d+\.html/, pageNo.to_s + '.html')
          pageHtml = fetchPage(Net::HTTP.new(companySearchHostUri, 80),
                               companySearchHostUri, pageUri, ' failed.')
          collectCompanies(pageHtml) unless pageHtml.nil?
        rescue StandardError => err
          puts 'Exception thrown out when got page ' + pageNo.to_s + ' since ' + err.to_s
        end
      end
    end
    threads.each { |t| t.join }
  end

  protected

  # Extracts the company site links from a search result page.
  #
  # companyListHtml - HTML of a search result page.
  #
  # Returns an Array of href values, in document order.
  def parseCompanySites(companyListHtml)
    doc = Hpricot(companyListHtml)
    links = doc.search('//div[@class="content"]/div[@class="info"]/span[@class="m undline"]/a')
    links.map do |item|
      companyHref = item.attributes['href']
      puts '公司链接: ' + companyHref
      companyHref
    end
  end

  private

  # Performs a GET and logs the attempt and its outcome.
  #
  # http         - Net::HTTP instance bound to +host+.
  # host         - host name, used only for log output.
  # path         - request path (may include a query string).
  # failedSuffix - text appended to the log line when the request fails.
  #
  # Returns the response body on HTTP 200, otherwise nil.
  def fetchPage(http, host, path, failedSuffix = ' failed')
    puts 'Opening Page: http://' + host + path
    # Net::HTTP#get returns only the response on Ruby 1.9+; read the body from it.
    resp = http.get(path)
    if resp.is_a?(Net::HTTPOK)
      puts 'Opened Page: http://' + host + path
      resp.body
    else
      puts 'Opening Page: http://' + host + path + failedSuffix
      nil
    end
  end

  # Collects every company linked from one search result page.
  def collectCompanies(companyListHtml)
    parseCompanySites(companyListHtml).each do |companySite|
      collectByCompanyLink(companySite)
    end
  end

  # Returns the host name of the company search service.
  def getCompanySearchHost
    'search.china.alibaba.com'
  end

  # Builds the search request path for the given keywords.
  #
  # NOTE(review): the fixed province value looks like GBK percent-encoding
  # while the keywords are encoded from the string's own bytes - confirm which
  # encoding the search endpoint expects.
  def getCompanySearchUri(keywords)
    '/search/company_search.htm?filt=y&categoryId=0&maxCatId=&isNoRepost=false&descend_order=&show_fid=&cat_field=' \
      "&tradeType=&searchType=&pageSize=#{PAGE_SIZE}&sm=&seType=&townId=0&onlineStatus=all&memberlevel=" \
      '&province=%BD%AD%CB%D5%2C%D5%E3%BD%AD%2C%C9%CF%BA%A3&city=&biztype=&established_year=' \
      '&keywords=' + URI.encode_www_form_component(keywords)
  end

  # Returns the href of the "next page" link on a search result page,
  # or nil when the page has no such link.
  def getNextSearchPage(data)
    nextPageLink = Hpricot(data).at('//div[@class="pages"]/div[@class="list_offer_pages"]/h1/b/a[text()="下一页"]')
    nextPageLink && nextPageLink.attributes['href']
  end

  # Computes the number of result pages from the "total found" counter in the
  # page text, printing the totals. Returns 0 when the counter is not found.
  #
  # NOTE(review): the pattern contains UTF-8 literals; matching assumes +data+
  # is a compatible encoding - confirm against the actual response encoding.
  def getTotalPage(data)
    counter = data.scan(/共找到\s<span class="red sm">(\d+)<\/span>\s条/)[0]
    return 0 if counter.nil?

    totalItems = counter.last
    # Ceiling division: a partially filled last page still counts as a page.
    totalPage = (totalItems.to_i + PAGE_SIZE - 1) / PAGE_SIZE
    puts '共 ' + totalItems + '条 共' + totalPage.to_s + ' 页'
    totalPage
  end
end
#! /usr/local/bin/ruby
require 'test/unit'
require 'AlibabaWebCatcher'
require 'export/CsvExport'
require 'export/MysqlExport'

# Smoke test for AlibabaWebCatcher: runs a keyword crawl end to end with a
# MySQL exporter and passes as long as the crawl returns without raising.
class AlibabaWebCatcherTest < Test::Unit::TestCase
  def setup
    @webCatcher = AlibabaWebCatcher.new
    # Use setExporter: it is the setter AlibabaWebCatcher actually defines.
    # Alternative exporter: @webCatcher.setExporter(CsvExport.new('test.csv'))
    @webCatcher.setExporter(MysqlExport.new)
  end

  def teardown
    @webCatcher = nil
  end

  # Crawls the search results for a fixed keyword; reaching the final
  # assertion means no exception escaped the crawl.
  def test_download
    assert_not_nil(@webCatcher)
    @webCatcher.collectByKeywords('婴幼儿用品')
    assert(true, 'Should go here')
  end
end
2012-04-26 10:05 837#!/usr/bin/perl -w package C ... -
2012-04-26 10:02 1057#!/usr/bin/perl -w package M ... -
2011-06-17 13:45 999ImageUtils.rb #!/usr/local/bi ... -
NCS(Nightly CoSIM Script)
2011-06-14 15:45 865NCS(Nightly CoSIM Script)包括Perl ... -
2011-06-14 15:38 1763Perl解释Properties不太方便,需要自己分析 # ...
用 Ruby 写的一个网络爬虫,用到了正则表达式和哈希表
### 网络爬虫——Python与数据分析 #### 一、网络爬虫概述 网络爬虫,也称为网络蜘蛛或网络机器人,是一种自动化的程序,主要用于在网络上抓取和下载网页内容。这种技术对于搜索引擎至关重要,因为它们依赖于爬虫...
### 网络爬虫爬取Ajax:利用Ruby技术实现 在现代互联网应用中,Ajax(Asynchronous JavaScript and XML)已成为提升用户体验的关键技术之一。它允许网页在不重新加载整个页面的情况下更新部分数据,从而实现了更为...
一个非常基本的网络爬虫 给定一个 url,将对所有链接页面(在同一域和方案上)执行广度优先爬网,并构建每个页面所依赖的静态资产列表。 它需要一个完整的 url,带有方案(http 或 https)。 给定一个子域,它不会...
用Python写网络爬虫-35 Python是一种非常流行的语言,用来写网络爬虫非常合适。本文将详细介绍如何用Python写网络爬虫,以及网络爬虫的应用场景。 首先,什么是网络爬虫?网络爬虫是一个自动提取网页的程序,它为...
#### 一、网络爬虫概述 **定义:** - **网络爬虫**是一种自动从互联网上获取数据的程序,通常用于搜索引擎来下载网页内容。 **功能:** - 作为搜索引擎的基础组件,负责网页的收集工作。 - 可用于构建垂直搜索引擎...