heritrix文档上的一个例子，放这备用

qzxfl008

浏览: 79928 次
性别:
来自: 浙江

最近访客更多访客>>

sgq0085

malie0

mazhongxing_jay

chenjun296

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

爬虫heritrix

package mypackage;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.archive.crawler.datamodel.CandidateURI;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.FetchStatusCodes;
import org.archive.crawler.datamodel.UURI;
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.Frontier;
import org.archive.crawler.framework.FrontierMarker;
import org.archive.crawler.framework.exceptions.FatalConfigurationException;
import org.archive.crawler.framework.exceptions.InvalidFrontierMarkerException;
import org.archive.crawler.settings.ModuleType;


/**
 * A simple Frontier implementation for tutorial purposes.
 *
 * <p>All queued URIs are kept in memory in two FIFO lists: prerequisites
 * (URIs that need immediate scheduling) and ordinary pending URIs. At most
 * one URI is handed out at a time: {@link #next(int)} will not emit another
 * URI until {@link #finished(CrawlURI)} has been called for the previous one.
 *
 * <p>{@link #next(int)}, {@link #schedule(CandidateURI)} and
 * {@link #finished(CrawlURI)} synchronize on this instance. The read-only
 * counter and status methods are not synchronized.
 */
public class MyFrontier extends ModuleType implements Frontier,
        FetchStatusCodes {
    // A list of the discovered URIs that should be crawled.
    List pendingURIs = new ArrayList();
    
    // A list of prerequisites that needs to be met before any other URI is
    // allowed to be crawled, e.g. DNS-lookups
    List prerequisites = new ArrayList();
    
    // Every URI string that has been scheduled, mapped to its CandidateURI,
    // so that each URI is queued only once.
    Map alreadyIncluded = new HashMap();
    
    // Reference to the CrawlController.
    CrawlController controller;

    // Flag to note if a URI is being processed.
    boolean uriInProcess = false;
    
    // top-level stats
    long successCount = 0;
    long failedCount = 0;
    long disregardedCount = 0;
    long totalProcessedBytes = 0;

    /**
     * Creates the frontier module.
     *
     * @param name module name supplied by the caller
     */
    public MyFrontier(String name) {
        // NOTE(review): the name argument is not used; the module is always
        // registered under Frontier.ATTR_NAME - confirm this is intended.
        super(Frontier.ATTR_NAME, "A simple frontier.");
    }

    /**
     * Stores the controller and schedules every seed from the crawl scope.
     *
     * @param controller the controller this frontier belongs to
     * @throws FatalConfigurationException declared by the Frontier contract
     * @throws IOException declared by the Frontier contract
     */
    public void initialize(CrawlController controller)
            throws FatalConfigurationException, IOException {
        this.controller = controller;
        
        // Initialize the pending queue with the seeds
        this.controller.getScope().refreshSeeds();
        List seeds = this.controller.getScope().getSeedlist();
        // Hold the seed list's monitor while iterating over it.
        synchronized(seeds) {
            for (Iterator i = seeds.iterator(); i.hasNext();) {
                UURI u = (UURI) i.next();
                CandidateURI caUri = new CandidateURI(u);
                caUri.setSeed();
                schedule(caUri);
            }
        }
    }

    /**
     * Hands out the next URI to crawl, preferring prerequisites over
     * ordinary pending URIs.
     *
     * <p>If a URI is already in process, or nothing is queued, the call
     * waits on this frontier for up to {@code timeout} milliseconds and then
     * returns {@code null}. The wait ends early when
     * {@link #schedule(CandidateURI)} queues a URI or
     * {@link #finished(CrawlURI)} releases the in-process slot.
     *
     * @param timeout maximum time to wait in milliseconds; passed directly
     *        to {@link Object#wait(long)}, so 0 means wait until notified
     * @return the next URI, or {@code null} if none could be handed out
     * @throws InterruptedException if the thread is interrupted while waiting
     */
    public synchronized CrawlURI next(int timeout) throws InterruptedException {
        if (!uriInProcess && !isEmpty()) {
            uriInProcess = true;
            CrawlURI curi;
            if (!prerequisites.isEmpty()) {
                curi = CrawlURI.from((CandidateURI) prerequisites.remove(0));
            } else {
                curi = CrawlURI.from((CandidateURI) pendingURIs.remove(0));
            }
            curi.setServer(controller.getServerCache().getServerFor(curi));
            return curi;
        } else {
            // Nothing can be handed out right now. Sleep until schedule() or
            // finished() signals a state change, or the timeout elapses.
            wait(timeout);
            return null;
        }
    }

    /**
     * Tells whether both queues are empty.
     *
     * @return {@code true} if there is neither a pending URI nor a
     *         prerequisite waiting to be crawled
     */
    public boolean isEmpty() {
        return pendingURIs.isEmpty() && prerequisites.isEmpty();
    }

    /**
     * Queues a URI unless a URI with the same string has been scheduled
     * before. URIs that need immediate scheduling go to the prerequisite
     * queue; all others go to the pending queue.
     *
     * @param caURI the URI to schedule
     */
    public synchronized void schedule(CandidateURI caURI) {
        // Schedule a uri for crawling if it is not already crawled
        if (!alreadyIncluded.containsKey(caURI.getURIString())) {
            if(caURI.needsImmediateScheduling()) {
                prerequisites.add(caURI);
            } else {
                pendingURIs.add(caURI);
            }
            alreadyIncluded.put(caURI.getURIString(), caURI);
            // Wake threads blocked in next(): without this they would sleep
            // for the whole timeout (forever for a timeout of 0) even though
            // work has just become available.
            notifyAll();
        }
    }

    /**
     * Schedules the URI immediately; this frontier does no batching.
     *
     * @param caURI the URI to schedule
     */
    public void batchSchedule(CandidateURI caURI) {
        schedule(caURI);
    }

    /**
     * Does nothing, because {@link #batchSchedule(CandidateURI)} schedules
     * directly.
     */
    public void batchFlush() {
    }

    /**
     * Records the outcome of a processed URI and releases the in-process
     * slot so that {@link #next(int)} can hand out another URI.
     *
     * <p>Successful URIs add to the success count and the byte total;
     * deferred URIs are put back into the queue; a fixed set of fetch
     * statuses counts as disregarded; everything else counts as failed.
     *
     * @param cURI the URI whose processing has ended
     */
    public synchronized void finished(CrawlURI cURI) {
        uriInProcess = false;
        if (cURI.isSuccess()) {
            successCount++;
            totalProcessedBytes += cURI.getContentSize();
            controller.fireCrawledURISuccessfulEvent(cURI);
            cURI.stripToMinimal();
        } else if (cURI.getFetchStatus() == S_DEFERRED) {
            cURI.processingCleanup();
            // Forget the URI so that schedule() accepts it again.
            alreadyIncluded.remove(cURI.getURIString());
            schedule(cURI);
        } else if (cURI.getFetchStatus() == S_ROBOTS_PRECLUDED
                || cURI.getFetchStatus() == S_OUT_OF_SCOPE
                || cURI.getFetchStatus() == S_BLOCKED_BY_USER
                || cURI.getFetchStatus() == S_TOO_MANY_EMBED_HOPS
                || cURI.getFetchStatus() == S_TOO_MANY_LINK_HOPS
                || cURI.getFetchStatus() == S_DELETED_BY_USER) {
            controller.fireCrawledURIDisregardEvent(cURI);
            disregardedCount++;
            cURI.stripToMinimal();
        } else {
            controller.fireCrawledURIFailureEvent(cURI);
            failedCount++;
            cURI.stripToMinimal();
        }
        // Runs for every outcome (for deferred URIs this is a second call).
        cURI.processingCleanup();
        // The in-process slot is free again: wake threads blocked in next().
        notifyAll();
    }

    /**
     * @return number of distinct URI strings currently tracked as scheduled
     */
    public long discoveredUriCount() {
        return alreadyIncluded.size();
    }

    /**
     * @return number of URIs waiting in the pending and prerequisite queues
     */
    public long queuedUriCount() {
        return pendingURIs.size() + prerequisites.size();
    }

    /**
     * @return sum of the success, failure and disregard counts
     */
    public long finishedUriCount() {
        return successCount + failedCount + disregardedCount;
    }

    /**
     * @return number of URIs finished successfully
     */
    public long successfullyFetchedCount() {
        return successCount;
    }

    /**
     * @return number of URIs finished as failures
     */
    public long failedFetchCount() {
        return failedCount;
    }

    /**
     * @return number of URIs finished as disregarded
     */
    public long disregardedFetchCount() {
        return disregardedCount;
    }

    /**
     * @return sum of the content sizes of all successfully finished URIs
     */
    public long totalBytesWritten() {
        return totalProcessedBytes;
    }

    /**
     * @return a fixed message; this frontier produces no real report
     */
    public String report() {
        return "This frontier does not return a report.";
    }

    /**
     * Not supported by this frontier.
     *
     * @param pathToLog ignored
     * @throws UnsupportedOperationException always
     */
    public void importRecoverLog(String pathToLog) throws IOException {
        throw new UnsupportedOperationException();
    }

    /**
     * Not implemented by this frontier.
     *
     * @return always {@code null}
     */
    public FrontierMarker getInitialMarker(String regexpr,
            boolean inCacheOnly) {
        return null;
    }

    /**
     * Not implemented by this frontier.
     *
     * @return always {@code null}
     */
    public ArrayList getURIsList(FrontierMarker marker, int numberOfMatches,
            boolean verbose) throws InvalidFrontierMarkerException {
        return null;
    }

    /**
     * Not implemented by this frontier; nothing is deleted.
     *
     * @param match ignored
     * @return always 0
     */
    public long deleteURIs(String match) {
        return 0;
    }

}

分享到：

An example processor | Crawl Scope 抓取范围

2011-06-02 18:49
浏览 1492
评论(0)
分类:互联网
查看更多

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

heritrix文档上的一个例子，放这备用

评论

发表评论

相关推荐

最近访客 更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

heritrix文档上的一个例子，放这备用

评论

发表评论

相关推荐

html解析页面中的A标签

Heritrix抓取hexun网上的stock信息

ELF hash算法 java版

An example processor

Crawl Scope 抓取范围

heritrix中ExtractorJS扩展源代码

最近访客更多访客>>