网络爬虫

aguang110

浏览: 887523 次
性别:
来自: 北京

最近访客更多访客>>

tongwawa

ggggoo

lyne15730

gggfff39

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

java

thread HTML Go .net

package com.heaton.bot;
import com.heaton.bot.*;
import java.net.*;

/**
* The SpiderWorker class performs the actual work of
* spidering pages. It is implemented as a thread
* that is created by the spider class.
*
* Copyright 2001-2003 by Jeff Heaton (http://www.jeffheaton.com)
*
* @author Jeff Heaton
* @version 1.2
*/
public class SpiderWorker extends Thread {

  /**
   * The URL that this spider worker
   * should be downloading.
   */
  protected String target;

  /**
   * The owner of this spider worker class,
   * should always be a Spider object.
   * This is the class that this spider
   * worker will send its data to.
   */
  protected Spider owner;

  /**
   * Indicates if the spider is busy or not.
   * true = busy
   * false = idle
   *
   * Declared volatile because the flag is written by this
   * worker thread inside processWorkload() and is exposed to
   * other threads through isBusy().
   */
  protected volatile boolean busy;

  /**
   * A descendant of the HTTP object that
   * this class should be using for HTTP
   * communication. This is usually the
   * HTTPSocket class.
   */
  protected HTTP http;

  /**
   * Constructs a spider worker object.
   *
   * @param owner The owner of this object, usually
   * a Spider object.
   * @param http The HTTP object this worker uses to
   * download its workload.
   */
  public SpiderWorker(Spider owner,HTTP http)
  {
    this.http = http;
    this.owner = owner;
  }

  /**
   * Returns true or false to indicate if
   * the spider is busy or idle.
   *
   * @return true = busy
   * false = idle
   */
  public boolean isBusy()
  {
    return this.busy;
  }

  /**
   * The run method repeatedly asks the owner for a workload
   * and hands each one to processWorkload. The thread ends
   * as soon as the owner returns a null workload.
   */
  public void run()
  {
    for ( ;; ) {
      target = this.owner.getWorkload();
      if ( target==null ) {
        return;
      }
      owner.getSpiderDone().workerBegin();
      try {
        processWorkload();
      } finally {
        // Always pair workerBegin() with workerEnd(), even if an
        // unchecked throwable escapes processWorkload().
        owner.getSpiderDone().workerEnd();
      }
    }
  }

  /**
   * Downloads the current target, reports the page to the
   * owner and, for text content, scans the body for HREF/SRC
   * links and reports each one to the owner as an internal,
   * external or "other" (unparseable) link.
   *
   * Every exit path reports completion through
   * owner.completePage: with error=false when the page was
   * downloaded (whether or not it was parsed), and with
   * error=true when an exception occurred.
   */
  public void processWorkload()
  {
    try {
      busy = true;
      Log.log(Log.LOG_LEVEL_NORMAL,"Spidering " + target );
      http.send(target,null);
      Attribute typeAttribute = http.getServerHeaders().get("Content-Type");

      // if no content-type at all, its PROBABLY not HTML
      if ( typeAttribute==null ) {
        // Still report completion; previously this path returned
        // without ever telling the owner the workload was done.
        owner.completePage(http,false);
        return;
      }

      // every page that has a content type is handed to the owner,
      // but ONLY text type files (namely HTML) are parsed for links
      owner.processPage(http);

      // media types are case-insensitive, so compare ignoring case
      String prefix = "text/";
      if ( !typeAttribute.getValue().regionMatches(true,0,prefix,0,prefix.length()) ) {
        owner.completePage(http,false);
        return;
      }

      // The page's own URL: used to resolve relative links and to
      // tell internal links from external ones. Built once here
      // instead of once (or twice) per link.
      URL base = new URL(target);

      HTMLParser parse = new HTMLParser();
      parse.source = new StringBuffer(http.getBody());

      // find all the links
      while ( !parse.eof() ) {
        char ch = parse.get();
        // the tag is only fetched when get() returns 0
        if ( ch!=0 ) {
          continue;
        }

        HTMLTag tag = parse.getTag();
        Attribute link = tag.get("HREF");
        if ( link==null ) {
          link = tag.get("SRC");
        }
        if ( link==null ) {
          continue;
        }

        URL linkUrl;
        try {
          linkUrl = new URL(base,link.getValue());
        } catch ( MalformedURLException e ) {
          // log the link text itself rather than the Attribute object
          Log.log(Log.LOG_LEVEL_TRACE,
                  "Spider found other link: " + link.getValue() );
          owner.foundOtherLink(link.getValue());
          continue;
        }

        if ( owner.getRemoveQuery() ) {
          linkUrl = URLUtility.stripQuery(linkUrl);
        }
        linkUrl = URLUtility.stripAnhcor(linkUrl);

        if ( linkUrl.getHost().equalsIgnoreCase(base.getHost()) ) {
          Log.log(Log.LOG_LEVEL_NORMAL,
                  "Spider found internal link: " + linkUrl.toString() );
          owner.foundInternalLink(linkUrl.toString());
        } else {
          Log.log(Log.LOG_LEVEL_NORMAL,
                  "Spider found external link: " + linkUrl.toString() );
          owner.foundExternalLink(linkUrl.toString());
        }
      }
      owner.completePage(http,false);
    } catch ( java.io.IOException e ) {
      Log.log(Log.LOG_LEVEL_ERROR,
              "Error loading file("+ target +"): " + e );
      owner.completePage(http,true);
    } catch ( Exception e ) {
      Log.logException(
                      "Exception while processing file("+ target +"): ", e );
      owner.completePage(http,true);
    } finally {
      busy = false;
    }
  }

  /**
   * Returns the HTTP descendant that this
   * object should use for all HTTP communication.
   *
   * @return An HTTP descendant object.
   */
  public HTTP getHTTP()
  {
    return http;
  }
}

分享到：

Java中移位操作符的运算规则 | paint repaint paintComponent update

2010-08-20 23:41
浏览 1340
评论(0)
分类:编程语言
查看更多

发表评论

文章已被作者锁定，不允许评论。

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

网络爬虫

评论

发表评论

相关推荐

最近访客 更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

网络爬虫

评论

发表评论

相关推荐

操作系统的目标和作用

利用（ffmpeg)生成视频缩略图（java)

对Java多线程技术中所有方法的详细解析

java乱码

学习apache commons-io类库中的文件清除器

java 正则表达式 过滤html标签

转---Eclipse中web-inf和meta-inf文件夹的信息

logback与Log4J的区别

性能优化

JAVA的Random类(转)

非阻塞的Socket链接

创建临时文件

面向对象设计的基本原则

proxool

当前Java软件开发中几种认识误区

Java中查看一个方法被调用的层次(Reflection、StackTrace)

反序列化时恢复transient字段

用socket连接服务器直接发送接收邮件

利用JavaMail收/发Gmail邮件(SSL)

Java 反射与内省

最近访客更多访客>>

java 正则表达式过滤html标签