网络爬虫

akululu

浏览: 46345 次
性别:
来自: 北京

最近访客更多访客>>

herman_liu76

Fly872365

xxl11231220

春天好

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

Spider

thread Go

import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URI;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Deque;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * A small multi-threaded web crawler.
 *
 * <p>Worker threads share one URL queue. Each worker repeatedly takes a URL
 * from the queue, downloads the page, extracts the links found on it and puts
 * the links that have not been seen before back on the queue. The crawl ends
 * when the queue is empty and no worker is still processing a page.
 *
 * <p>Thread-safety: the queue, the map of seen URLs and the active-worker
 * counter are all guarded by this object's monitor.
 */
public class Spider implements Runnable {

    private static final Logger logger = Logger.getLogger(Spider.class.getName());

    /** Number of worker threads used by the single-argument constructor. */
    private static final int DEFAULT_THREADS = 10;

    /** Pages whose recorded level is greater than this value are not downloaded. */
    private static final int MAX_LEVEL = 2;

    /** Connect and read timeout for page downloads, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 10000;

    /** Matches the target of an {@code href} attribute holding an absolute http(s) URL. */
    private static final Pattern LINK_PATTERN = Pattern.compile(
            "href\\s*=\\s*[\"'](https?://[^\"'#\\s>]+)", Pattern.CASE_INSENSITIVE);

    /** URLs waiting to be processed. */
    private final Deque<String> urls = new ArrayDeque<>();

    /** URLs already seen, mapped to the level recorded for them. */
    private final Map<String, Integer> indexedURLs = new HashMap<>();

    /** Configured number of worker threads; never modified after construction. */
    private final int threads;

    /**
     * Number of workers that are not currently parked in {@link #dequeueURL()}.
     * Kept separate from {@link #threads} so that {@link #go()} always starts
     * the configured number of workers.
     */
    private int activeWorkers;

    /**
     * Command-line entry point: crawls starting from the URL given as the
     * first argument.
     *
     * @param argv command-line arguments; {@code argv[0]} is the start URL
     * @throws Exception if the crawl is interrupted
     */
    public static void main(String[] argv) throws Exception {
        // Check the length first: indexing an empty array would throw.
        if (argv == null || argv.length == 0 || argv[0] == null) {
            System.out.println("Missing required argument: [Sit URL]");
            return;
        }
        Spider spider = new Spider(argv[0]);
        spider.go();
    }

    /**
     * Creates a crawler with the default number of worker threads.
     *
     * @param strURL the URL the crawl starts from
     * @throws IllegalArgumentException if {@code strURL} is null or empty
     */
    public Spider(String strURL) {
        this(strURL, DEFAULT_THREADS);
    }

    /**
     * Creates a crawler with an explicit number of worker threads.
     *
     * @param strURL  the URL the crawl starts from
     * @param threads number of worker threads, at least 1
     * @throws IllegalArgumentException if {@code strURL} is null or empty, or
     *                                  if {@code threads} is less than 1
     */
    public Spider(String strURL, int threads) {
        if (strURL == null || strURL.isEmpty()) {
            throw new IllegalArgumentException("Missing required argument: -u [start url]");
        }
        if (threads < 1) {
            throw new IllegalArgumentException("Invalid number of threads: " + threads);
        }
        this.threads = threads;
        this.activeWorkers = threads;
        urls.add(strURL);
    }

    /**
     * Starts the worker threads and blocks until all of them have finished.
     *
     * @throws InterruptedException if the calling thread is interrupted while
     *                              waiting for the workers
     */
    public void go() throws InterruptedException {
        long start = System.currentTimeMillis();

        synchronized (this) {
            // Every worker counts as active until it first finds the queue empty.
            activeWorkers = threads;
        }

        List<Thread> workers = new ArrayList<>(threads);
        for (int i = 0; i < threads; i++) {
            Thread worker = new Thread(this, "Spide " + (i + 1));
            workers.add(worker);
            worker.start();
        }
        for (Thread worker : workers) {
            worker.join();
        }

        long elapsed = System.currentTimeMillis() - start;
        logger.info("Crawl finished in " + elapsed + " ms");
    }

    /**
     * Starts the crawl. Kept for callers of the original one-argument form.
     *
     * @param strURL unused; the crawl starts from the URL given to the constructor
     * @throws InterruptedException if the calling thread is interrupted while
     *                              waiting for the workers
     * @deprecated use {@link #go()}
     */
    @Deprecated
    public void go(String strURL) throws InterruptedException {
        go();
    }

    /**
     * Worker loop: processes URLs until {@link #dequeueURL()} reports that the
     * crawl is complete.
     */
    @Override
    public void run() {
        try {
            String url;
            while ((url = dequeueURL()) != null) {
                indexURL(url);
            }
        } catch (InterruptedException e) {
            // Restore the flag so code further up the stack can see the interrupt.
            Thread.currentThread().interrupt();
            logger.log(Level.INFO, "Worker interrupted", e);
        }
    }

    /**
     * Returns the next URL to process, waiting for one if the queue is empty
     * and other workers are still busy.
     *
     * @return the next queued URL, or {@code null} when the queue is empty and
     *         no other worker is active, i.e. the crawl is complete
     * @throws InterruptedException if the thread is interrupted while waiting
     */
    public synchronized String dequeueURL() throws InterruptedException {
        while (true) {
            if (!urls.isEmpty()) {
                return urls.removeFirst();
            }
            // Nothing to do: this worker stops counting as active.
            activeWorkers--;
            if (activeWorkers > 0) {
                // Someone else may still enqueue work; sleep until notified.
                wait();
                activeWorkers++;
            } else {
                // Last active worker and the queue is empty: wake the others
                // so that they can observe the same state and terminate.
                notifyAll();
                return null;
            }
        }
    }

    /**
     * Queues a URL together with its level and wakes any waiting workers.
     * URLs that have already been seen are ignored, as is {@code null}
     * (a queued {@code null} would be mistaken for the end-of-crawl marker).
     *
     * @param url   the URL to queue
     * @param level the level to record for the URL
     */
    public synchronized void enqueueURL(String url, int level) {
        if (url == null || indexedURLs.containsKey(url)) {
            return;
        }
        urls.add(url);
        indexedURLs.put(url, level);
        notifyAll();
    }

    /**
     * Downloads one page and queues the links found on it.
     *
     * @param url the page to process
     */
    private void indexURL(String url) {
        int level = nextLevelFor(url);
        if (level < 0) {
            return; // recorded level is beyond MAX_LEVEL
        }

        String strBody;
        try {
            strBody = loadURL(url);
        } catch (IOException | RuntimeException e) {
            // One unreachable or malformed page must not end the worker.
            logger.log(Level.FINE, "Failed to load " + url, e);
            return;
        }

        for (String link : parseURLs(strBody)) {
            enqueueURL(link, level);
        }
    }

    /**
     * Determines the level to record for links found on {@code url}.
     * A URL with no recorded level is recorded at level 1 and its links also
     * receive level 1; otherwise the links receive the recorded level plus one.
     *
     * @param url the page about to be processed
     * @return the level for the page's links, or -1 if the page's recorded
     *         level exceeds {@link #MAX_LEVEL} and it must be skipped
     */
    private synchronized int nextLevelFor(String url) {
        Integer recorded = indexedURLs.get(url);
        if (recorded == null) {
            indexedURLs.put(url, 1);
            return 1;
        }
        if (recorded > MAX_LEVEL) {
            return -1;
        }
        return recorded + 1;
    }

    /**
     * Downloads the content behind a URL as text.
     *
     * <p>NOTE(review): the original listing called this method without showing
     * it; this is a minimal standard-library implementation. It decodes every
     * page as UTF-8 - confirm whether the response charset should be honoured.
     *
     * @param url absolute URL of the page
     * @return the page content
     * @throws IOException              if the page cannot be read
     * @throws IllegalArgumentException if {@code url} is not a valid absolute URL
     */
    private String loadURL(String url) throws IOException {
        URLConnection connection = URI.create(url).toURL().openConnection();
        connection.setConnectTimeout(TIMEOUT_MILLIS);
        connection.setReadTimeout(TIMEOUT_MILLIS);

        StringBuilder body = new StringBuilder();
        try (Reader reader = new InputStreamReader(
                connection.getInputStream(), StandardCharsets.UTF_8)) {
            char[] buffer = new char[4096];
            int read;
            while ((read = reader.read(buffer)) != -1) {
                body.append(buffer, 0, read);
            }
        }
        return body.toString();
    }

    /**
     * Extracts the absolute http(s) link targets from a page, in document
     * order and without any fragment part.
     *
     * <p>NOTE(review): the original listing called this method without showing
     * it; this is a minimal regex-based implementation that ignores relative
     * links.
     *
     * @param html the page content
     * @return the links found; empty if there are none, never {@code null}
     */
    static String[] parseURLs(String html) {
        List<String> links = new ArrayList<>();
        Matcher matcher = LINK_PATTERN.matcher(html);
        while (matcher.find()) {
            links.add(matcher.group(1));
        }
        return links.toArray(new String[0]);
    }
}

分享到：

URL消重-信息指纹

2009-01-13 15:46
浏览 1605
评论(1)
查看更多

1 楼春天好 2016-06-17

写的不错

分享一个免费好用的云端爬虫开发平台
http://www.shenjianshou.cn/

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

网络爬虫

评论

发表评论

相关推荐

最近访客 更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

网络爬虫

评论

发表评论

相关推荐

URL消重-信息指纹

爬虫/蜘蛛程序的制作

最近访客更多访客>>