浏览 7632 次
精华帖 (0) :: 良好帖 (0) :: 新手帖 (0) :: 隐藏帖 (0)
|
|
---|---|
作者 | 正文 |
发表时间:2008-03-30
以下代码基于lucene-2.3.1,htmlparser-1.6,je-analysis-1.5.3,以及自己修改过的cpdetector-1.0.5; 下载地址分别为 htmlparser:http://sourceforge.net/project/showfiles.php?group_id=24399 je-analysis:http://www.jesoft.cn/je-analysis-1.5.3.jar lucene就不用说了,cpdetector-1.0.5见附件. (应部分网友要求,把所用到的工具打包成一个spider.rar,方便下载测试) spindle的官方站点:http://www.bitmechanic.com/projects/spindle/ 主类SiteCapturer代码如下: package com.huizhi.kanine.util; import java.io.File; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStreamWriter; import java.io.PrintWriter; import java.io.UnsupportedEncodingException; import java.net.HttpURLConnection; import java.net.MalformedURLException; import java.net.SocketException; import java.net.SocketTimeoutException; import java.net.URL; import java.net.UnknownHostException; import java.util.ArrayList; import java.util.HashSet; import jeasy.analysis.MMAnalyzer; import org.apache.log4j.Logger; import org.apache.log4j.PropertyConfigurator; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.document.DateTools; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.search.Hits; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.TermQuery; import org.apache.lucene.store.Directory; import org.apache.lucene.store.LockObtainFailedException; import org.apache.lucene.store.RAMDirectory; import org.htmlparser.Parser; import org.htmlparser.PrototypicalNodeFactory; import org.htmlparser.filters.AndFilter; import org.htmlparser.filters.HasAttributeFilter; import org.htmlparser.filters.NodeClassFilter; import org.htmlparser.tags.BaseHrefTag; import org.htmlparser.tags.FrameTag; import org.htmlparser.tags.LinkTag; import org.htmlparser.tags.MetaTag; import 
org.htmlparser.util.EncodingChangeException; import org.htmlparser.util.NodeIterator; import org.htmlparser.util.NodeList; import org.htmlparser.util.ParserException; /** * @author 张波 * E-mail:kaninebruno@hotmail.com * Created On : 2008-03-30 * Updated On : 2008-04-06 */ public class SiteCapturer implements Runnable { /* 基准(初始)URL */ protected URL baseURL = null; /* 索引文件或抓取页面的存放位置 */ protected String indexDir = null; /** * 待解析的URL地址集合,所有新检测到的链接均存放于此; * 解析时按照先入先出(First-In First-Out)法则线性取出 */ protected ArrayList URLs = new ArrayList(); /* 已索引的URL地址集合,避免链接的重复抓取 */ protected HashSet indexedURLs = new HashSet(); protected Parser parser = new Parser();; /* 程序运行线程数,默认2个线程 */ protected int threads = 2; /* 存储于磁盘的IndexWriter */ protected IndexWriter FSDWriter; /* 存储于内存的IndexWriter */ protected IndexWriter RAMWriter; protected IndexSearcher indexSearcher; protected RAMDirectory ramDirectory = new RAMDirectory(); /* 筛选页面内容的分词器 */ protected Analyzer luceneAnalyzer = new MMAnalyzer(); /* 解析页面时的字符编码 */ protected String charset; /* 基准端口 */ protected int basePort; /* 基准主机 */ protected String baseHost; /* 是否索引,默认true */ protected boolean justIndex = true; /* 是否保存,默认false */ protected boolean justCopy = false; /* 检测索引中是否存在当前URL信息,避免重复抓取 */ protected boolean isRepeatedCheck = false; /* 索引操作的写入线程锁 */ public static final Object indexLock = new Object(); public static Logger logger = Logger .getLogger(SiteCapturer.class.getName()); public SiteCapturer() { PrototypicalNodeFactory factory = new PrototypicalNodeFactory(); factory.registerTag(new LocalLinkTag()); factory.registerTag(new LocalFrameTag()); factory.registerTag(new LocalBaseHrefTag()); parser.setNodeFactory(factory); } /** * 程序入口,在此初始化mPages、IndexWriter * 通过协调各线程间的活动完成website的抓取工作 * 任务完成后将所有的索引片段合并为一个以优化检索 */ public void capture() { URLs.clear(); URLs.add(getBaseURL()); int responseCode = 0; String contentType = ""; PropertyConfigurator.configure("/log4j.properties"); try { HttpURLConnection uc = (HttpURLConnection) 
baseURL.openConnection(); responseCode = uc.getResponseCode(); contentType = uc.getContentType(); } catch (MalformedURLException mue) { logger.error("Invalid URL : " + getBaseURL()); } catch (UnknownHostException uhe) { logger.error("UnknowHost : " + getBaseURL()); } catch (SocketException se) { logger.error("Socket Error : " + se.getMessage() + " " + getBaseURL()); } catch (IOException ie) { logger.error("IOException : " + ie); } if (responseCode == HttpURLConnection.HTTP_OK && contentType.startsWith("text/html")) { charset = ParserUtils.autoDetectCharset(baseURL); basePort = baseURL.getPort(); baseHost = baseURL.getHost(); if (charset.equals("windows-1252")) charset = "GBK"; /* 存放索引文件的位置 */ File indexDirectory = new File(indexDir); /* 标记是否重新建立索引,true为重新建立索引 */ boolean flag = true; if (!indexDirectory.exists()) { /* 如果文件夹不存在则创建 */ indexDirectory.mkdir(); } else if (IndexReader.indexExists(indexDirectory)) { /* 如果已存在索引,则追加索引 */ flag = false; File lockfile = new File(indexDirectory + File.separator + "write.lock"); if (lockfile.exists()) lockfile.delete(); } try { if (justIndex) { FSDWriter = new IndexWriter(indexDirectory, luceneAnalyzer, flag); RAMWriter = new IndexWriter(ramDirectory, luceneAnalyzer, true); if (isRepeatedCheck) { IndexReader indexReader = IndexReader.open(indexDir); indexSearcher = new IndexSearcher(indexReader); } } long start = System.currentTimeMillis(); ArrayList threadList = new ArrayList(); for (int i = 0; i < threads; i++) { Thread t = new Thread(this, "K-9 Spider Thread #" + (i + 1)); t.start(); threadList.add(t); } while (threadList.size() > 0) { Thread child = (Thread) threadList.remove(0); try { child.join(); } catch (InterruptedException ie) { logger.error("InterruptedException : " + ie); } } long elapsed = System.currentTimeMillis() - start; if (justIndex) { RAMWriter.close(); FSDWriter.addIndexes(new Directory[] { ramDirectory }); FSDWriter.optimize(); FSDWriter.close(); } logger.info("Finished in " + (elapsed / 1000) + " seconds"); 
logger.info("The Count of the Links Captured is " + indexedURLs.size()); } catch (CorruptIndexException cie) { logger.error("CorruptIndexException : " + cie); } catch (LockObtainFailedException lofe) { logger.error("LockObtainFailedException : " + lofe); } catch (IOException ie) { logger.error("IOException : " + ie); } } } public void run() { String url; while ((url = dequeueURL()) != null) { if (justIndex) process(url); } threads--; } /** * 判断提取到的链接是否符合解析条件;标准为Port及Host与基准URL相同且类型为text/html或text/plain */ public boolean isToBeCaptured(String url) { boolean flag = false; HttpURLConnection uc = null; int responseCode = 0; String contentType = ""; String host = ""; int port = 0; try { URL source = new URL(url); String protocol = source.getProtocol(); if (protocol != null && protocol.equals("http")) { host = source.getHost(); port = source.getPort(); uc = (HttpURLConnection) source.openConnection(); uc.setConnectTimeout(8000); responseCode = uc.getResponseCode(); contentType = uc.getContentType(); } } catch (MalformedURLException mue) { logger.error("Invalid URL : " + url); } catch (UnknownHostException uhe) { logger.error("UnknowHost : " + url); } catch (SocketException se) { logger.error("Socket Error : " + se.getMessage() + " " + url); } catch (SocketTimeoutException ste) { logger.error("Socket Connection Time Out : " + url); } catch (FileNotFoundException fnfe) { logger.error("broken link " + url + " ignored"); } catch (IOException ie) { logger.error("IOException : " + ie); } if (port == basePort && responseCode == HttpURLConnection.HTTP_OK && host.equals(baseHost) && (contentType.startsWith("text/html") || contentType .startsWith("text/plain"))) flag = true; return flag; } /* 从URL队列mPages里取出单个的URL */ public synchronized String dequeueURL() { while (true) if (URLs.size() > 0) { String url = (String) URLs.remove(0); indexedURLs.add(url); if (isToBeCaptured(url)) { NodeList list; try { int bookmark = URLs.size(); /* 获取页面所有节点 */ parser.setURL(url); try { list = new 
NodeList(); for (NodeIterator e = parser.elements(); e .hasMoreNodes();) list.add(e.nextNode()); } catch (EncodingChangeException ece) { /* 解码出错的异常处理 */ parser.reset(); list = new NodeList(); for (NodeIterator e = parser.elements(); e .hasMoreNodes();) list.add(e.nextNode()); } /* 抓取静态页面 */ if (-1 == url.indexOf("?") && justCopy) copy(url, list); /** * 依据 http://www.robotstxt.org/wc/meta-user.html 处理 * Robots <META> tag */ NodeList robots = list .extractAllNodesThatMatch( new AndFilter(new NodeClassFilter( MetaTag.class), new HasAttributeFilter("name", "robots")), true); if (0 != robots.size()) { MetaTag robot = (MetaTag) robots.elementAt(0); String content = robot.getAttribute("content") .toLowerCase(); if ((-1 != content.indexOf("none")) || (-1 != content.indexOf("nofollow"))) for (int i = bookmark; i < URLs.size(); i++) URLs.remove(i); } } catch (ParserException pe) { logger.error("ParserException : " + pe); } return url; } } else { threads--; if (threads > 0) { try { wait(); threads++; } catch (InterruptedException ie) { logger.error("InterruptedException : " + ie); } } else { notifyAll(); return null; } } } /** * 处理单独的URL地址,解析页面并加入到lucene索引中;通过自动探测页面编码保证抓取工作的顺利执行 */ protected void process(String url) { String result[]; String content = null; String title = null; /* 此项操作较耗性能,故默认不予检测 */ if (isRepeatedCheck) { try { TermQuery query = new TermQuery(new Term("url", url)); Hits hits = indexSearcher.search(query); if (hits.length() > 0) { logger.info("The URL : " + url + " has already been captured"); } else { result = ParserUtils.parseHtml(url, charset); content = result[0]; title = result[1]; } } catch (IOException ie) { logger.error("IOException : " + ie); } } else { result = ParserUtils.parseHtml(url, charset); content = result[0]; title = result[1]; } if (content != null && content.trim().length() > 0) { Document document = new Document(); document.add(new Field("content", content, Field.Store.YES, Field.Index.TOKENIZED, 
Field.TermVector.WITH_POSITIONS_OFFSETS)); document.add(new Field("url", url, Field.Store.YES, Field.Index.UN_TOKENIZED)); document.add(new Field("title", title, Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS)); document.add(new Field("date", DateTools.timeToString(System .currentTimeMillis(), DateTools.Resolution.DAY), Field.Store.YES, Field.Index.UN_TOKENIZED)); synchronized (indexLock) { try { RAMWriter.addDocument(document); /** * 当存放索引的内存使用大于指定值时将其写入硬盘;采用此方法的目的是通过内存缓冲避免频繁的 * IO操作,提高索引创建性能;合并索引时一定要调用被合并一方的IndexWriter的close()方法 */ if (RAMWriter.ramSizeInBytes() > 512 * 1024) { RAMWriter.close(); FSDWriter.addIndexes(new Directory[] { ramDirectory }); RAMWriter = new IndexWriter(ramDirectory, luceneAnalyzer, true); } logger.info("Indexed link : " + url); } catch (CorruptIndexException cie) { logger.error("CorruptIndexException : " + cie); } catch (IOException ie) { logger.error("IOException : " + ie); } } } } /* 将URL链接转换为本地目录的形式 */ protected String makeLocalLink(String link, String current) { String localLink; if (link.equals(getBaseURL())) localLink = "index.html"; else if (link.startsWith(getBaseURL()) && (link.length() > getBaseURL().length())) { localLink = link.substring(getBaseURL().length() + 1); if (-1 == localLink.indexOf(".")) localLink += "/" + "index.html"; } else localLink = link; if ((null != current) && link.startsWith(getBaseURL()) && (current.length() > getBaseURL().length())) { current = current.substring(getBaseURL().length() + 1); int i = 0, j; while (-1 != (j = current.indexOf('/', i))) { localLink = "../" + localLink; i = j + 1; } } return localLink; } /* 将页面按结构层次保存到本地硬盘 */ protected void copy(String url, NodeList list) { File file = new File(indexDir, makeLocalLink(url, "")); File dir = file.getParentFile(); if (!dir.exists()) dir.mkdirs(); else if (!dir.isDirectory()) { dir = new File(dir.getParentFile(), dir.getName() + ".content"); if (!dir.exists()) dir.mkdirs(); file = new File(dir, file.getName()); } 
try { PrintWriter out = new PrintWriter(new OutputStreamWriter( new FileOutputStream(file), charset)); for (int i = 0; i < list.size(); i++) out.print(list.elementAt(i).toHtml()); out.close(); logger.info("Captured link : " + url); } catch (FileNotFoundException fnfe) { logger.error("FileNotFoundException : " + fnfe); } catch (UnsupportedEncodingException uee) { logger.error("UnsupportedEncodingException : " + uee); } } /** * Link tag that rewrites the HREF. * The HREF is changed to a local target if it matches the source. */ class LocalLinkTag extends LinkTag { public void doSemanticAction() { String link = getLink(); if (link.endsWith("/")) link = link.substring(0, link.length() - 1); int pos = link.indexOf("#"); if (pos != -1) link = link.substring(0, pos); /* 将链接加入到处理队列中 */ if (!(indexedURLs.contains(link) || URLs.contains(link))) URLs.add(link); setLink(link); } } /** * Frame tag that rewrites the SRC URLs. The SRC URLs are mapped to local * targets if they match the source. */ class LocalFrameTag extends FrameTag { public void doSemanticAction() { String link = getFrameLocation(); if (link.endsWith("/")) link = link.substring(0, link.length() - 1); int pos = link.indexOf("#"); if (pos != -1) link = link.substring(0, pos); /* 将链接加入到处理队列中 */ if (!(indexedURLs.contains(link) || URLs.contains(link))) URLs.add(link); setFrameLocation(link); } } /** * Base tag that doesn't show. The toHtml() method is overridden to return * an empty string, effectively shutting off the base reference. 
*/ class LocalBaseHrefTag extends BaseHrefTag { public String toHtml() { return (""); } } public static void main(String[] args) { SiteCapturer worker = new SiteCapturer(); if (args.length < 6) { System.out .println("Usage: -u <start url> -d <index dir> -t <threads> [-r] [-c] [-i]"); return; } for (int i = 0; i < args.length; i++) { if (args[i].equals("-u")) worker.setBaseURL(args[++i]); else if (args[i].equals("-d")) worker.setIndexDir(args[++i]); else if (args[i].equals("-t")) worker.setThreads(Integer.parseInt(args[++i])); else if (args[i].equals("-r")) worker.setIsRepeatedCheck(true); else if (args[i].equals("-c")) worker.setJustCopy(true); else if (args[i].equals("-i")) worker.setJustIndex(false); } if (worker.getThreads() < 1) throw new IllegalArgumentException("Invalid number of threads: " + worker.getThreads()); worker.capture(); System.exit(0); } public String getBaseURL() { return baseURL.toString(); } public void setBaseURL(String source) { if (source.endsWith("/")) source = source.substring(0, source.length() - 1); try { baseURL = new URL(source); } catch (MalformedURLException e) { logger.error("Invalid URL : " + getBaseURL()); } } public void setIndexDir(String indexDirectory) { indexDir = indexDirectory; } public int getThreads() { return threads; } public void setThreads(int threadCount) { threads = threadCount; } public void setIsRepeatedCheck(boolean check) { isRepeatedCheck = check; } public void setJustIndex(boolean justIndex) { this.justIndex = justIndex; } public void setJustCopy(boolean justCopy) { this.justCopy = justCopy; } } 工具类ParserUtils代码如下: package com.huizhi.kanine.util; import java.io.BufferedReader; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.UnsupportedEncodingException; import java.net.MalformedURLException; import java.net.SocketException; import java.net.SocketTimeoutException; import java.net.URL; import 
java.net.UnknownHostException; import java.nio.charset.Charset; import org.htmlparser.Parser; import org.htmlparser.util.NodeList; import org.htmlparser.util.ParserException; import org.htmlparser.visitors.HtmlPage; import cpdetector.io.ASCIIDetector; import cpdetector.io.CodepageDetectorProxy; import cpdetector.io.JChardetFacade; import cpdetector.io.ParsingDetector; import cpdetector.io.UnicodeDetector; public class ParserUtils { /* StringBuffer的缓冲区大小 */ public static int TRANSFER_SIZE = 4096; /* 当前平台的行分隔符 */ public static String lineSep = System.getProperty("line.separator"); /* 自动探测页面编码,避免中文乱码的出现 */ public static String autoDetectCharset(URL url) { CodepageDetectorProxy detector = CodepageDetectorProxy.getInstance(); /** * ParsingDetector可用于检查HTML、XML等文件或字符流的编码 * 构造方法中的参数用于指示是否显示探测过程的详细信息 * 为false则不显示 */ detector.add(new ParsingDetector(false)); detector.add(JChardetFacade.getInstance()); detector.add(ASCIIDetector.getInstance()); detector.add(UnicodeDetector.getInstance()); Charset charset = null; try { charset = detector.detectCodepage(url); } catch (MalformedURLException mue) { mue.printStackTrace(); } catch (IOException ie) { ie.printStackTrace(); } if (charset == null) charset = Charset.defaultCharset(); return charset.name(); } /* 按照指定编码解析标准的html页面,为建立索引做准备*/ public static String[] parseHtml(String url, String charset) { String result[] = null; String content = null; try { URL source = new URL(url); InputStream in = source.openStream(); BufferedReader reader = new BufferedReader(new InputStreamReader( in, charset)); String line = new String(); StringBuffer temp = new StringBuffer(TRANSFER_SIZE); while ((line = reader.readLine()) != null) { temp.append(line); temp.append(lineSep); } reader.close(); in.close(); content = temp.toString(); } catch (UnsupportedEncodingException uee) { uee.printStackTrace(); } catch (MalformedURLException mue) { System.err.println("Invalid URL : " + url); } catch (UnknownHostException uhe) { System.err.println("UnknowHost : " + 
url); } catch (SocketException se) { System.err.println("Socket Error : " + se.getMessage() + " " + url); } catch (SocketTimeoutException ste) { System.err.println("Socket Connection Time Out : " + url); } catch (FileNotFoundException fnfe) { System.err.println("broken link " + ((FileNotFoundException) fnfe.getCause()).getMessage() + " ignored"); } catch (IOException ie) { ie.printStackTrace(); } if (content != null) { Parser myParser = Parser.createParser(content, charset); HtmlPage visitor = new HtmlPage(myParser); try { myParser.visitAllNodesWith(visitor); String body = null; String title = "Untitled"; if (visitor.getBody() != null) { NodeList nodelist = visitor.getBody(); body = nodelist.asString().trim(); } if (visitor.getTitle() != null) title = visitor.getTitle(); result = new String[] { body, title }; } catch (ParserException pe) { pe.printStackTrace(); } } return result; } } 程序运行可选择控制台或新建一JSP页面,加入以下代码即可 (另,示例代码中log4j的配置文件须放在项目所在磁盘的根目录下;可在capture() 方法的PropertyConfigurator.configure("/log4j.properties")处自由修改) <%@ page contentType="text/html; charset=UTF-8"%> <%@ page import="com.huizhi.kanine.util.*"%> <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"> <html> <head> <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"> <title>Lucene</title> </head> <body> <% SiteCapturer worker= new SiteCapturer(); worker.setBaseURL("http://www.blabla.cn"); worker.setIndexDir("c:\\luceneIndex"); //worker.setIsRepeatedCheck(true);//可选,检测链接是否和索引重复 //worker.setJustCopy(true);//可选,将链接保存到本地 worker.setThreads(20); worker.capture(); %> </body> </html> 声明:ITeye文章版权属于作者,受法律保护。没有作者书面许可不得转载。
推荐链接
|
|
返回顶楼 | |
发表时间:2008-03-31
楼主辛苦了啊,能否把它的优点和性能测试一起放上来?
|
|
返回顶楼 | |
发表时间:2008-03-31
关于性能,在用spindle的时候即使是只有几十个页面的简单站点都会在抓取完成后进入线程的死锁,我的这个改版不会出现这样的情况,而且加入了很多处理异常的操作,尽可能地保证了页面解析与建立索引的速度及准确性
|
|
返回顶楼 | |
发表时间:2008-04-01
很好,期待很久了。
|
|
返回顶楼 | |
发表时间:2008-04-02
(1)不能抓获https协议的网站;
(2)缺少异常日志; (3)SiteCapturer类过长,不方便作业,不符合“建筑美”。 |
|
返回顶楼 | |
发表时间:2008-04-02
lib.rar我下不下来
|
|
返回顶楼 | |
发表时间:2008-04-06
今天把进一步完善的版本放上来,主要变动是添加了将页面抓取保存到本地的功能,另外加入了日志记录,修正了解析url时一处不妥的地方;欢迎测试!
|
|
返回顶楼 | |
发表时间:2008-04-07
长期关注,并测试。希望楼主不断更新!
|
|
返回顶楼 | |