基于Spindle的增强HTTP Spider
构建于 Lucene 之上的可用 Java 开源 Spider 少之又少,Spindle 长期没有更新且功能不够完善,故而我参考其源代码重新编写了一个可扩展的 WebCrawler。本着开源共享、共同进步的想法发布于此,期冀得到大家的批评指正;有任何意见及建议均可通过 Email 联系我(kaninebruno@hotmail.com)。
package com.huizhi.kanine.util;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.SocketException;
import java.net.SocketTimeoutException;
import java.net.URL;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.HashSet;

import jeasy.analysis.MMAnalyzer;

import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.RAMDirectory;
import org.htmlparser.Parser;
import org.htmlparser.PrototypicalNodeFactory;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.BaseHrefTag;
import org.htmlparser.tags.FrameTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.MetaTag;
import org.htmlparser.util.EncodingChangeException;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

/**
 * Multi-threaded HTTP spider that crawls one web site starting from a base
 * URL and, depending on its settings, adds every captured page to a Lucene
 * index and/or saves a copy of it on the local disk.
 *
 * <p>Link discovery happens inside {@link #dequeueURL()}, which is
 * synchronized on this instance; indexing happens in {@link #process(String)}
 * and serializes index writes on {@link #indexLock}.
 *
 * @author 张波
 * E-mail:kaninebruno@hotmail.com
 * Created On : 2008-03-30
 * Updated On : 2008-04-06
 */
public class SiteCapturer implements Runnable {

    /* Usage text printed by main() when the command line is unusable. */
    private static final String USAGE =
            "Usage: -u <start url> -d <index dir> -t <threads> [-r] [-c] [-i]";

    /* Base (starting) URL. */
    protected URL baseURL = null;

    /* Directory that receives the index files and/or the copied pages. */
    protected String indexDir = null;

    /**
     * Queue of URLs still waiting to be parsed. Every newly detected link is
     * appended here and entries are taken out first-in first-out.
     */
    protected ArrayList URLs = new ArrayList();

    /* URLs that have already been dequeued, so that no link is fetched twice. */
    protected HashSet indexedURLs = new HashSet();

    protected Parser parser = new Parser();

    /* Configured number of worker threads, 2 by default. */
    protected int threads = 2;

    /*
     * Number of workers that are currently not parked in dequeueURL(). Kept
     * separate from "threads" so that a crawl never destroys the configured
     * value. Only touched while holding this instance's monitor (or before
     * the workers are started).
     */
    private int activeWorkers = 0;

    /* IndexWriter backed by the on-disk index. */
    protected IndexWriter FSDWriter;

    /* IndexWriter backed by the in-memory buffer index. */
    protected IndexWriter RAMWriter;

    protected IndexSearcher indexSearcher;

    protected RAMDirectory ramDirectory = new RAMDirectory();

    /* Analyzer used to tokenize page content. */
    protected Analyzer luceneAnalyzer = new MMAnalyzer();

    /* Character encoding used when parsing pages. */
    protected String charset;

    /* Port of the base URL. */
    protected int basePort;

    /* Host of the base URL. */
    protected String baseHost;

    /* Whether pages are indexed, true by default. */
    protected boolean justIndex = true;

    /* Whether pages are saved to disk, false by default. */
    protected boolean justCopy = false;

    /* Whether the existing index is consulted to skip already indexed URLs. */
    protected boolean isRepeatedCheck = false;

    /* Lock that serializes writes to the index. */
    public static final Object indexLock = new Object();

    public static Logger logger = Logger
            .getLogger(SiteCapturer.class.getName());

    /**
     * Creates a capturer whose parser uses the link-collecting tag classes
     * declared in this class.
     */
    public SiteCapturer() {
        PrototypicalNodeFactory factory = new PrototypicalNodeFactory();
        factory.registerTag(new LocalLinkTag());
        factory.registerTag(new LocalFrameTag());
        factory.registerTag(new LocalBaseHrefTag());
        parser.setNodeFactory(factory);
    }

    /**
     * Entry point of a crawl: probes the base URL, prepares the index
     * writers, starts the worker threads, waits for them to finish and
     * finally merges the in-memory index into the on-disk index and
     * optimizes it.
     *
     * <p>Nothing is crawled when no base URL or index directory has been
     * set, or when the base URL does not answer with HTTP 200 and a
     * {@code text/html} content type.
     */
    public void capture() {
        PropertyConfigurator.configure("/log4j.properties");
        if (baseURL == null) {
            logger.error("No base URL has been set, nothing to capture");
            return;
        }
        if (indexDir == null) {
            logger.error("No index directory has been set, nothing to capture");
            return;
        }
        URLs.clear();
        URLs.add(getBaseURL());

        int responseCode = 0;
        String contentType = null;
        HttpURLConnection uc = null;
        try {
            uc = (HttpURLConnection) baseURL.openConnection();
            responseCode = uc.getResponseCode();
            contentType = uc.getContentType();
        } catch (MalformedURLException mue) {
            logger.error("Invalid URL : " + getBaseURL());
        } catch (UnknownHostException uhe) {
            logger.error("UnknowHost : " + getBaseURL());
        } catch (SocketException se) {
            logger.error("Socket Error : " + se.getMessage() + " "
                    + getBaseURL());
        } catch (IOException ie) {
            logger.error("IOException : " + ie);
        } finally {
            /* The probe only needs the headers; release the connection. */
            if (uc != null)
                uc.disconnect();
        }

        /* getContentType() returns null when the header is missing. */
        if (responseCode != HttpURLConnection.HTTP_OK || contentType == null
                || !contentType.startsWith("text/html")) {
            logger.warn("Base URL did not answer with an HTML page : "
                    + getBaseURL());
            return;
        }

        charset = ParserUtils.autoDetectCharset(baseURL);
        basePort = baseURL.getPort();
        baseHost = baseURL.getHost();
        if (charset.equals("windows-1252"))
            charset = "GBK";

        /* Location of the index files. */
        File indexDirectory = new File(indexDir);
        /* true: build a fresh index; false: append to the existing one. */
        boolean createIndex = true;
        if (!indexDirectory.exists()) {
            /* mkdirs() so that a nested, not yet existing path also works. */
            indexDirectory.mkdirs();
        } else if (IndexReader.indexExists(indexDirectory)) {
            createIndex = false;
            /* Remove a write lock left in the index directory. */
            File lockfile = new File(indexDirectory + File.separator
                    + "write.lock");
            if (lockfile.exists())
                lockfile.delete();
        }

        IndexReader indexReader = null;
        boolean interrupted = false;
        try {
            if (justIndex) {
                FSDWriter = new IndexWriter(indexDirectory, luceneAnalyzer,
                        createIndex);
                RAMWriter = new IndexWriter(ramDirectory, luceneAnalyzer, true);
                if (isRepeatedCheck) {
                    indexReader = IndexReader.open(indexDir);
                    indexSearcher = new IndexSearcher(indexReader);
                }
            }
            long start = System.currentTimeMillis();

            /*
             * Snapshot the configured thread count: the workers must not be
             * able to change how many of them get started.
             */
            int workerCount = threads;
            activeWorkers = workerCount;
            ArrayList threadList = new ArrayList();
            for (int i = 0; i < workerCount; i++) {
                Thread t = new Thread(this, "K-9 Spider Thread #" + (i + 1));
                t.start();
                threadList.add(t);
            }
            for (int i = 0; i < threadList.size(); i++) {
                Thread child = (Thread) threadList.get(i);
                try {
                    child.join();
                } catch (InterruptedException ie) {
                    logger.error("InterruptedException : " + ie);
                    interrupted = true;
                }
            }
            long elapsed = System.currentTimeMillis() - start;

            if (justIndex) {
                RAMWriter.close();
                FSDWriter.addIndexes(new Directory[] { ramDirectory });
                FSDWriter.optimize();
                FSDWriter.close();
            }
            logger.info("Finished in " + (elapsed / 1000) + " seconds");
            logger.info("The Count of the Links Captured is "
                    + indexedURLs.size());
        } catch (CorruptIndexException cie) {
            logger.error("CorruptIndexException : " + cie);
        } catch (LockObtainFailedException lofe) {
            logger.error("LockObtainFailedException : " + lofe);
        } catch (IOException ie) {
            logger.error("IOException : " + ie);
        } finally {
            /* Only release the searcher/reader that this call opened. */
            if (indexReader != null) {
                try {
                    if (indexSearcher != null)
                        indexSearcher.close();
                    indexReader.close();
                } catch (IOException ie) {
                    logger.error("IOException : " + ie);
                }
                indexSearcher = null;
            }
            /* Hand the interrupt on to the caller instead of swallowing it. */
            if (interrupted)
                Thread.currentThread().interrupt();
        }
    }

    /**
     * Worker loop: takes URLs from the queue until it is exhausted and
     * indexes each of them when indexing is enabled.
     */
    public void run() {
        String url;
        while ((url = dequeueURL()) != null) {
            if (justIndex)
                process(url);
        }
    }

    /**
     * Decides whether a link qualifies for capturing: it must be an http URL
     * on the same host and port as the base URL, answer with HTTP 200 and
     * have content type text/html or text/plain.
     *
     * @param url the link to check
     * @return true when the link should be captured
     */
    public boolean isToBeCaptured(String url) {
        HttpURLConnection uc = null;
        int responseCode = 0;
        String contentType = null;
        String host = "";
        int port = 0;
        try {
            URL source = new URL(url);
            String protocol = source.getProtocol();
            if (protocol != null && protocol.equals("http")) {
                host = source.getHost();
                port = source.getPort();
                uc = (HttpURLConnection) source.openConnection();
                uc.setConnectTimeout(8000);
                responseCode = uc.getResponseCode();
                contentType = uc.getContentType();
            }
        } catch (MalformedURLException mue) {
            logger.error("Invalid URL : " + url);
        } catch (UnknownHostException uhe) {
            logger.error("UnknowHost : " + url);
        } catch (SocketException se) {
            logger.error("Socket Error : " + se.getMessage() + " " + url);
        } catch (SocketTimeoutException ste) {
            logger.error("Socket Connection Time Out : " + url);
        } catch (FileNotFoundException fnfe) {
            logger.error("broken link " + url + " ignored");
        } catch (IOException ie) {
            logger.error("IOException : " + ie);
        } finally {
            if (uc != null)
                uc.disconnect();
        }
        /* A response without Content-Type header yields a null contentType. */
        return port == basePort
                && responseCode == HttpURLConnection.HTTP_OK
                && host.equals(baseHost)
                && contentType != null
                && (contentType.startsWith("text/html") || contentType
                        .startsWith("text/plain"));
    }

    /**
     * Takes the next capturable URL out of the queue, parses that page so
     * that its links are appended to the queue, and returns the URL.
     *
     * <p>New links are only ever discovered inside this method, so an empty
     * queue means the crawl is over. A worker that finds the queue empty
     * waits until the last active worker arrives, which then wakes everybody
     * up so that all workers terminate.
     *
     * @return the next URL to process, or null when the crawl is finished
     *         (or the calling thread was interrupted while waiting)
     */
    public synchronized String dequeueURL() {
        while (true) {
            if (URLs.size() > 0) {
                String url = (String) URLs.remove(0);
                indexedURLs.add(url);
                if (isToBeCaptured(url)) {
                    harvestLinks(url);
                    return url;
                }
            } else {
                activeWorkers--;
                if (activeWorkers <= 0) {
                    notifyAll();
                    return null;
                }
                try {
                    wait();
                    activeWorkers++;
                } catch (InterruptedException ie) {
                    logger.error("InterruptedException : " + ie);
                    /* Keep the interrupt visible and stop this worker. */
                    Thread.currentThread().interrupt();
                    return null;
                }
            }
        }
    }

    /*
     * Parses the page at url (which queues its links through the Local*Tag
     * classes), optionally copies it to disk and honours its robots META tag.
     * Must be called while holding this instance's monitor.
     */
    private void harvestLinks(String url) {
        try {
            /* Everything queued from here on was discovered on this page. */
            int bookmark = URLs.size();
            parser.setURL(url);
            NodeList list;
            try {
                list = collectNodes();
            } catch (EncodingChangeException ece) {
                /* The page declared another encoding: parse it again. */
                parser.reset();
                list = collectNodes();
            }

            /* Only static pages (no query string) are copied. */
            if (justCopy && -1 == url.indexOf("?"))
                copy(url, list);

            /*
             * Handle the robots <META> tag as described at
             * http://www.robotstxt.org/wc/meta-user.html
             */
            NodeList robots = list.extractAllNodesThatMatch(new AndFilter(
                    new NodeClassFilter(MetaTag.class),
                    new HasAttributeFilter("name", "robots")), true);
            if (0 != robots.size()) {
                MetaTag robot = (MetaTag) robots.elementAt(0);
                String content = robot.getAttribute("content");
                if (content != null) {
                    content = content.toLowerCase();
                    if ((-1 != content.indexOf("none"))
                            || (-1 != content.indexOf("nofollow"))) {
                        /*
                         * Drop every link found on this page. Removing from
                         * the tail keeps the remaining indexes valid.
                         */
                        for (int i = URLs.size() - 1; i >= bookmark; i--)
                            URLs.remove(i);
                    }
                }
            }
        } catch (ParserException pe) {
            logger.error("ParserException : " + pe);
        }
    }

    /* Reads all top-level nodes of the page the parser currently points at. */
    private NodeList collectNodes() throws ParserException {
        NodeList list = new NodeList();
        for (NodeIterator e = parser.elements(); e.hasMoreNodes();)
            list.add(e.nextNode());
        return list;
    }

    /*
     * Strips the fragment and a trailing slash from a link and queues the
     * result unless it is empty, already queued or already dequeued. The
     * fragment goes first so that "a/b/#x" and "a/b" map to the same entry.
     * Returns the normalized link.
     */
    private String normalizeAndEnqueue(String link) {
        int pos = link.indexOf("#");
        if (pos != -1)
            link = link.substring(0, pos);
        if (link.endsWith("/"))
            link = link.substring(0, link.length() - 1);
        if (link.length() > 0
                && !(indexedURLs.contains(link) || URLs.contains(link)))
            URLs.add(link);
        return link;
    }

    /**
     * Fetches and parses a single URL and adds it to the Lucene index.
     * Pages without body text, and pages that could not be fetched or
     * parsed, are skipped.
     *
     * @param url the page to index
     */
    protected void process(String url) {
        /* This lookup is fairly expensive, hence it is off by default. */
        if (isRepeatedCheck && indexSearcher != null) {
            try {
                TermQuery query = new TermQuery(new Term("url", url));
                Hits hits = indexSearcher.search(query);
                if (hits.length() > 0) {
                    logger.info("The URL : " + url
                            + " has already been captured");
                    return;
                }
            } catch (IOException ie) {
                logger.error("IOException : " + ie);
                return;
            }
        }

        /* parseHtml returns null when the page cannot be fetched or parsed. */
        String[] result = ParserUtils.parseHtml(url, charset);
        if (result == null)
            return;
        String content = result[0];
        String title = result[1];
        if (content == null || content.trim().length() == 0)
            return;
        if (title == null)
            title = "Untitled";

        Document document = new Document();
        document.add(new Field("content", content, Field.Store.YES,
                Field.Index.TOKENIZED,
                Field.TermVector.WITH_POSITIONS_OFFSETS));
        document.add(new Field("url", url, Field.Store.YES,
                Field.Index.UN_TOKENIZED));
        document.add(new Field("title", title, Field.Store.YES,
                Field.Index.TOKENIZED,
                Field.TermVector.WITH_POSITIONS_OFFSETS));
        document.add(new Field("date", DateTools.timeToString(System
                .currentTimeMillis(), DateTools.Resolution.DAY),
                Field.Store.YES, Field.Index.UN_TOKENIZED));

        synchronized (indexLock) {
            try {
                RAMWriter.addDocument(document);
                /*
                 * Once the in-memory index grows beyond the threshold it is
                 * merged into the on-disk index; buffering in memory avoids
                 * frequent disk IO. The writer of the index being merged
                 * must be closed before the merge.
                 */
                if (RAMWriter.ramSizeInBytes() > 512 * 1024) {
                    RAMWriter.close();
                    FSDWriter.addIndexes(new Directory[] { ramDirectory });
                    RAMWriter = new IndexWriter(ramDirectory, luceneAnalyzer,
                            true);
                }
                logger.info("Indexed link : " + url);
            } catch (CorruptIndexException cie) {
                logger.error("CorruptIndexException : " + cie);
            } catch (IOException ie) {
                logger.error("IOException : " + ie);
            }
        }
    }

    /**
     * Converts a URL into a path relative to the local copy of the site.
     *
     * @param link the URL to convert
     * @param current URL of the page containing the link, or null; for every
     *        directory level of that page below the base URL a "../" is
     *        prepended to the result
     * @return the local path; links outside the base URL are returned as is
     */
    protected String makeLocalLink(String link, String current) {
        String base = getBaseURL();
        String localLink;
        if (link.equals(base)) {
            localLink = "index.html";
        } else if (link.startsWith(base) && (link.length() > base.length())) {
            localLink = link.substring(base.length() + 1);
            /* No extension: treat it as a directory with an index page. */
            if (-1 == localLink.indexOf("."))
                localLink += "/" + "index.html";
        } else {
            localLink = link;
        }

        if ((null != current) && link.startsWith(base)
                && (current.length() > base.length())) {
            current = current.substring(base.length() + 1);
            int i = 0, j;
            while (-1 != (j = current.indexOf('/', i))) {
                localLink = "../" + localLink;
                i = j + 1;
            }
        }
        return localLink;
    }

    /**
     * Saves a parsed page below the index directory, mirroring the directory
     * structure of the site.
     *
     * @param url URL of the page
     * @param list nodes of the page as produced by the parser
     */
    protected void copy(String url, NodeList list) {
        File file = new File(indexDir, makeLocalLink(url, ""));
        File dir = file.getParentFile();
        if (dir != null) {
            if (!dir.exists()) {
                dir.mkdirs();
            } else if (!dir.isDirectory()) {
                /* A file already occupies the directory name: move aside. */
                dir = new File(dir.getParentFile(), dir.getName() + ".content");
                if (!dir.exists())
                    dir.mkdirs();
                file = new File(dir, file.getName());
            }
        }

        FileOutputStream stream = null;
        PrintWriter out = null;
        try {
            stream = new FileOutputStream(file);
            out = new PrintWriter(new OutputStreamWriter(stream, charset));
            for (int i = 0; i < list.size(); i++)
                out.print(list.elementAt(i).toHtml());
            out.flush();
            logger.info("Captured link : " + url);
        } catch (FileNotFoundException fnfe) {
            logger.error("FileNotFoundException : " + fnfe);
        } catch (UnsupportedEncodingException uee) {
            logger.error("UnsupportedEncodingException : " + uee);
        } finally {
            /* Release the file handle on every path. */
            if (out != null) {
                out.close();
            } else if (stream != null) {
                try {
                    stream.close();
                } catch (IOException ie) {
                    logger.error("IOException : " + ie);
                }
            }
        }
    }

    /**
     * Link tag that normalizes its HREF (fragment and trailing slash removed)
     * and adds it to the queue of URLs to be parsed.
     */
    class LocalLinkTag extends LinkTag {
        public void doSemanticAction() {
            String link = getLink();
            if (link != null)
                setLink(normalizeAndEnqueue(link));
        }
    }

    /**
     * Frame tag that normalizes its SRC (fragment and trailing slash removed)
     * and adds it to the queue of URLs to be parsed.
     */
    class LocalFrameTag extends FrameTag {
        public void doSemanticAction() {
            String link = getFrameLocation();
            if (link != null)
                setFrameLocation(normalizeAndEnqueue(link));
        }
    }

    /**
     * Base tag that doesn't show. The toHtml() method is overridden to return
     * an empty string, effectively shutting off the base reference.
     */
    class LocalBaseHrefTag extends BaseHrefTag {
        public String toHtml() {
            return ("");
        }
    }

    /**
     * Command line entry point.
     *
     * <p>Options: {@code -u <start url>}, {@code -d <index dir>},
     * {@code -t <threads>}, {@code -r} enables the repeated-URL check,
     * {@code -c} enables copying pages to disk, {@code -i} disables indexing.
     *
     * @param args command line arguments
     * @throws IllegalArgumentException when the thread count is below 1 or
     *         not a number
     */
    public static void main(String[] args) {
        SiteCapturer worker = new SiteCapturer();
        if (args.length < 6) {
            System.out.println(USAGE);
            return;
        }
        for (int i = 0; i < args.length; i++) {
            String arg = args[i];
            if (arg.equals("-u") || arg.equals("-d") || arg.equals("-t")) {
                /* These options need a value; do not run past the array. */
                if (i + 1 >= args.length) {
                    System.out.println(USAGE);
                    return;
                }
                String value = args[++i];
                if (arg.equals("-u"))
                    worker.setBaseURL(value);
                else if (arg.equals("-d"))
                    worker.setIndexDir(value);
                else
                    worker.setThreads(Integer.parseInt(value));
            } else if (arg.equals("-r")) {
                worker.setIsRepeatedCheck(true);
            } else if (arg.equals("-c")) {
                worker.setJustCopy(true);
            } else if (arg.equals("-i")) {
                worker.setJustIndex(false);
            }
        }
        if (worker.baseURL == null || worker.indexDir == null) {
            System.out.println(USAGE);
            return;
        }
        if (worker.getThreads() < 1)
            throw new IllegalArgumentException("Invalid number of threads: "
                    + worker.getThreads());
        worker.capture();
        System.exit(0);
    }

    /**
     * @return the base URL as a string, or null when none has been set
     */
    public String getBaseURL() {
        return baseURL == null ? null : baseURL.toString();
    }

    /**
     * Sets the base URL; a trailing slash is removed. When the value is not
     * a valid URL an error is logged and the previous base URL is kept.
     *
     * @param source the starting URL of the crawl
     */
    public void setBaseURL(String source) {
        if (source.endsWith("/"))
            source = source.substring(0, source.length() - 1);
        try {
            baseURL = new URL(source);
        } catch (MalformedURLException e) {
            /* Report the rejected value; baseURL may still be null here. */
            logger.error("Invalid URL : " + source);
        }
    }

    public void setIndexDir(String indexDirectory) {
        indexDir = indexDirectory;
    }

    public int getThreads() {
        return threads;
    }

    public void setThreads(int threadCount) {
        threads = threadCount;
    }

    public void setIsRepeatedCheck(boolean check) {
        isRepeatedCheck = check;
    }

    public void setJustIndex(boolean justIndex) {
        this.justIndex = justIndex;
    }

    public void setJustCopy(boolean justCopy) {
        this.justCopy = justCopy;
    }
}
package com.huizhi.kanine.util;

import java.io.BufferedReader;
import java.io.Closeable;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.SocketException;
import java.net.SocketTimeoutException;
import java.net.URL;
import java.net.URLConnection;
import java.net.UnknownHostException;
import java.nio.charset.Charset;

import org.htmlparser.Parser;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.HtmlPage;

import cpdetector.io.ASCIIDetector;
import cpdetector.io.CodepageDetectorProxy;
import cpdetector.io.JChardetFacade;
import cpdetector.io.ParsingDetector;
import cpdetector.io.UnicodeDetector;

/**
 * Static helpers for detecting the character encoding of a page and for
 * extracting the body text and the title of an HTML page.
 */
public class ParserUtils {

    /* Initial capacity of the StringBuffer that collects a page. */
    public static int TRANSFER_SIZE = 4096;

    /* Line separator of the current platform. */
    public static String lineSep = System.getProperty("line.separator");

    /* Connect timeout in milliseconds, same value SiteCapturer uses. */
    private static final int CONNECT_TIMEOUT_MS = 8000;

    /**
     * Detects the character encoding of the page at the given URL, so that
     * its text is not garbled when decoded.
     *
     * @param url the page to inspect
     * @return the name of the detected charset, or the name of the platform
     *         default charset when detection fails or yields nothing
     */
    public static String autoDetectCharset(URL url) {
        CodepageDetectorProxy detector = CodepageDetectorProxy.getInstance();
        /*
         * ParsingDetector inspects the encoding declared in HTML/XML content;
         * its constructor argument switches verbose output of the detection
         * process on or off (false: off).
         */
        detector.add(new ParsingDetector(false));
        detector.add(JChardetFacade.getInstance());
        detector.add(ASCIIDetector.getInstance());
        detector.add(UnicodeDetector.getInstance());

        Charset charset = null;
        try {
            charset = detector.detectCodepage(url);
        } catch (MalformedURLException mue) {
            mue.printStackTrace();
        } catch (IOException ie) {
            ie.printStackTrace();
        }
        if (charset == null)
            charset = Charset.defaultCharset();
        return charset.name();
    }

    /**
     * Downloads an HTML page, decodes it with the given charset and extracts
     * its text, in preparation for indexing.
     *
     * @param url the page to download
     * @param charset name of the charset used to decode the page; null means
     *        the platform default charset
     * @return a two element array {body text, title}, where the body text is
     *         null when the page has no body and the title defaults to
     *         "Untitled"; null when the page cannot be downloaded or parsed
     */
    public static String[] parseHtml(String url, String charset) {
        if (charset == null)
            charset = Charset.defaultCharset().name();

        String content = null;
        InputStream in = null;
        BufferedReader reader = null;
        try {
            URLConnection connection = new URL(url).openConnection();
            /* Do not let an unreachable host block the caller indefinitely. */
            connection.setConnectTimeout(CONNECT_TIMEOUT_MS);
            in = connection.getInputStream();
            reader = new BufferedReader(new InputStreamReader(in, charset));
            StringBuffer temp = new StringBuffer(TRANSFER_SIZE);
            String line;
            while ((line = reader.readLine()) != null) {
                temp.append(line);
                temp.append(lineSep);
            }
            content = temp.toString();
        } catch (UnsupportedEncodingException uee) {
            uee.printStackTrace();
        } catch (MalformedURLException mue) {
            System.err.println("Invalid URL : " + url);
        } catch (UnknownHostException uhe) {
            System.err.println("UnknowHost : " + url);
        } catch (SocketException se) {
            System.err.println("Socket Error : " + se.getMessage() + " " + url);
        } catch (SocketTimeoutException ste) {
            System.err.println("Socket Connection Time Out : " + url);
        } catch (FileNotFoundException fnfe) {
            /* The exception may have no cause, so use its own message. */
            System.err.println("broken link " + fnfe.getMessage()
                    + " ignored");
        } catch (IOException ie) {
            ie.printStackTrace();
        } finally {
            /* Release the connection's stream on every path. */
            closeQuietly(reader);
            closeQuietly(in);
        }
        if (content == null)
            return null;

        String[] result = null;
        Parser myParser = Parser.createParser(content, charset);
        HtmlPage visitor = new HtmlPage(myParser);
        try {
            myParser.visitAllNodesWith(visitor);
            String body = null;
            String title = "Untitled";
            if (visitor.getBody() != null) {
                NodeList nodelist = visitor.getBody();
                body = nodelist.asString().trim();
            }
            if (visitor.getTitle() != null)
                title = visitor.getTitle();
            result = new String[] { body, title };
        } catch (ParserException pe) {
            pe.printStackTrace();
        }
        return result;
    }

    /* Closes a resource, ignoring null and failures raised while closing. */
    private static void closeQuietly(Closeable resource) {
        if (resource == null)
            return;
        try {
            resource.close();
        } catch (IOException ignored) {
            /* Nothing useful can be done when closing a read stream fails. */
        }
    }
}
<%@ page contentType="text/html; charset=UTF-8"%>
<%@ page import="com.huizhi.kanine.util.*"%>
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<title>Lucene</title>
</head>
<body>
<%
    // Demo: crawl a site with 20 worker threads and index it into c:\luceneIndex.
    // Each statement sits on its own line so that the "//" comments below
    // cannot swallow the calls that follow them.
    SiteCapturer worker = new SiteCapturer();
    worker.setBaseURL("http://www.blabla.cn");
    worker.setIndexDir("c:\\luceneIndex");
    // Optional: check whether a link is already present in the index.
    // worker.setIsRepeatedCheck(true);
    // Optional: also save the captured pages to the local disk.
    // worker.setJustCopy(true);
    worker.setThreads(20);
    worker.capture();
%>
</body>
</html>
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
public class TestQuery {
public static void main(String[] args) throws IOException, ParseException {
Hits hits = null;
String queryString = "茶馆";
Query query = null;
IndexSearcher searcher = new IndexSearcher("C:\\luceneIndex");
Analyzer analyzer = new StandardAnalyzer();
try {
QueryParser qp = new QueryParser("content", analyzer);
query = qp.parse(queryString);
} catch (ParseException e) {
if (searcher != null) {
hits = searcher.search(query);
if (hits.length() > 0) {
System.out.println("找到:" + hits.length() + " 个结果!");
