最近学习Lucene,在别人基础上,做了一个小例子

pengchua

浏览: 153704 次
性别:
来自: 上海

最近访客更多访客>>

songhait

leimingchao

king114963349

2560445422

博主相关

博客

微博

相册

留言

关于我

文章分类

全部博客 (73)

社区版块

存档分类

lucene Java Apache

最近学习 Lucene，在别人的基础上做了一个小例子，以便共同学习！

import java.io.InputStream;

import lia.handlingtypes.framework.DocumentHandlerException;

import org.apache.lucene.document.Document;

import org.apache.lucene.document.Field;

import org.textmining.text.extraction.WordExtractor;

public class DocDocumentHandler implements DocumentHandler {

public Document getDocument(InputStream is) throws Exception {

// TODO Auto-generated method stub

String bodyText = null;

try {

bodyText = new WordExtractor().extractText(is);

}

catch (Exception e) {

throw new DocumentHandlerException(

"Cannot extract text from a Word document", e);

}

if ((bodyText != null) && (bodyText.trim().length() > 0)) {

Document doc = new Document();

doc.add(Field.UnStored("body", bodyText));

return doc;

}

return null;

}

import java.io.InputStream;

import org.apache.lucene.document.Document;

/**
 * Converts the raw content of one file into a Lucene {@link Document}.
 */
public interface DocumentHandler {

    /**
     * Creates a Lucene document from the given stream.
     *
     * @param is stream to read the source content from
     * @return the Lucene document built from the stream
     * @throws Exception if the content cannot be handled
     */
    Document getDocument(InputStream is) throws Exception;
}

import java.io.InputStream;

import org.apache.lucene.document.Document;

import org.apache.lucene.document.Field;

import org.w3c.dom.Element;

import org.w3c.dom.Node;

import org.w3c.dom.NodeList;

import org.w3c.dom.Text;

import org.w3c.tidy.Tidy;

public class HtmlDocumentHandler implements DocumentHandler {

public Document getDocument(InputStream is) throws Exception {

// TODO Auto-generated method stub

Tidy tidy = new Tidy();

tidy.setQuiet(true);

tidy.setShowWarnings(false);

org.w3c.dom.Document root = tidy.parseDOM(is, null);

Element rawDoc = root.getDocumentElement();

Document doc = new Document();

String title = getTitle(rawDoc);

String body = getBody(rawDoc);

if ((title != null) && (!title.equals(""))) {

doc.add(Field.Text("title", title));

}

if ((body != null) && (!body.equals(""))) {

doc.add(Field.Text("body", body));

}

return doc;

}

private String getTitle(Element rawDoc) {

if (rawDoc == null) {

return null;

}

String title = "";

NodeList children = rawDoc.getElementsByTagName("title");

if (children.getLength() > 0) {

Element titleElement = ((Element) children.item(0));

Text text = (Text) titleElement.getFirstChild();

if (text != null) {

title = text.getData();

}

return title;

}

/**

* Gets the body text of the HTML document.

* @rawDoc the DOM Element to extract body Node from

* @return the body text

private String getBody(Element rawDoc) {

if (rawDoc == null) {

return null;

}

String body = "";

NodeList children = rawDoc.getElementsByTagName("body");

if (children.getLength() > 0) {

body = getText(children.item(0));

}

return body;

}

/**

* Extracts text from the DOM node.

* @param node a DOM node

* @return the text value of the node

private String getText(Node node) {

NodeList children = node.getChildNodes();

StringBuffer sb = new StringBuffer();

for (int i = 0; i < children.getLength(); i++) {

Node child = children.item(i);

switch (child.getNodeType()) {

case Node.ELEMENT_NODE:

sb.append(getText(child));

sb.append(" ");

break;

case Node.TEXT_NODE:

sb.append(((Text) child).getData());

break;

}

return sb.toString();

}

import java.io.File;

import java.io.FileInputStream;

import java.io.FileOutputStream;

import java.io.IOException;

import java.io.InputStream;

import lia.handlingtypes.framework.DocumentHandlerException;

import org.apache.lucene.document.Document;

import org.apache.lucene.document.Field;

import org.pdfbox.cos.COSDocument;

import org.pdfbox.encryption.DecryptDocument;

import org.pdfbox.exceptions.CryptographyException;

import org.pdfbox.exceptions.InvalidPasswordException;

import org.pdfbox.pdfparser.PDFParser;

import org.pdfbox.pdmodel.PDDocument;

import org.pdfbox.pdmodel.PDDocumentInformation;

import org.pdfbox.searchengine.lucene.LucenePDFDocument;

import org.pdfbox.util.PDFTextStripper;

public class PdfDocumentHandler implements DocumentHandler {

&nbs

分享到：

Hibernate中引用View的解决方案_来自一网 ...

2007-06-21 16:32
浏览 1331
评论(0)
查看更多

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论