`
conkeyn
  • 浏览: 1524598 次
  • 性别: Icon_minigender_1
  • 来自: 厦门
社区版块
存档分类
最新评论

Lucene 3.x+的Analyzer学习实例

 
阅读更多

参考以下链接地址: http://stackoverflow.com/questions/2638200/how-to-get-a-token-from-a-lucene-tokenstream

 

 

package cn.itcast.lucene.analyzer;

import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;
import org.junit.Test;
import org.wltea.analyzer.lucene.IKAnalyzer;

/**
 * Learning example that shows how to consume the tokens produced by a Lucene 3.x
 * {@link Analyzer} through the attribute-based {@link TokenStream} API.
 *
 * <p>Each token is printed together with its start and end character offsets.
 */
public class AnalyzerTest {

	/** English sample text. */
	String enText = "IndexWriter addDocument's a javadoc.txt";
	// Alternative Chinese sample inputs:
	// String zhText = "我们是中国人";
	// String zhText = "小笑话_总统的房间 Room .txt";
	/** Chinese sample text. */
	String zhText = "一位绅士到旅游胜地的一家饭店要开个房间";

	Analyzer en1 = new StandardAnalyzer(Version.LUCENE_36); // single-character segmentation
	// The no-arg SimpleAnalyzer constructor is deprecated; pass the match version
	// explicitly, consistent with en1.
	Analyzer en2 = new SimpleAnalyzer(Version.LUCENE_36);

	// Analyzer zh1 = new CJKAnalyzer(); // bigram segmentation
	Analyzer zh2 = new IKAnalyzer(); // dictionary-based segmentation

	/**
	 * Runs the selected analyzer over the selected sample text. Uncomment one of the
	 * other calls to compare analyzers.
	 *
	 * @throws Exception if analysis fails
	 */
	@Test
	public void test() throws Exception {
		// analyze(en2, enText);
		// analyze(en1, zhText);

		// analyze(zh1, zhText);
		analyze(zh2, zhText);
	}

	/**
	 * Tokenizes {@code text} with {@code analyzer} and prints every term with its
	 * start and end offsets to standard output.
	 *
	 * <p>API history, for reference:
	 * <ul>
	 *   <li>Lucene 2.9 and earlier iterated with
	 *       {@code for (Token t = new Token(); (t = tokenStream.next(t)) != null;)}.</li>
	 *   <li>3.x, variant 1: loop on {@code incrementToken()} and print the
	 *       {@code CharTermAttribute} via {@code toString()}.</li>
	 *   <li>3.x, variant 2: read {@code OffsetAttribute} together with
	 *       {@code TermAttribute.term()}.</li>
	 *   <li>3.x, variant 3 (used below): read {@code OffsetAttribute} together with
	 *       {@code CharTermAttribute}.</li>
	 * </ul>
	 *
	 * @param analyzer the analyzer to exercise
	 * @param text the text to tokenize
	 * @throws Exception if the token stream cannot be read or closed
	 */
	public void analyze(Analyzer analyzer, String text) throws Exception {
		System.out.println("-------------> 分词器:" + analyzer.getClass());
		TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(text));
		try {
			// addAttribute returns the existing instance when the attribute is already
			// registered and creates it otherwise; getAttribute would throw
			// IllegalArgumentException if the analyzer chain did not register it.
			OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
			CharTermAttribute termAttribute = tokenStream.addAttribute(CharTermAttribute.class);

			// The consumer workflow requires reset() before the first incrementToken().
			tokenStream.reset();
			while (tokenStream.incrementToken()) {
				int startOffset = offsetAttribute.startOffset();
				int endOffset = offsetAttribute.endOffset();
				String term = termAttribute.toString();
				System.out.println("term:" + term + ",startOffset:" + startOffset + ",endOffset:" + endOffset);
			}

			// Required after the last token for every 3.x variant listed above.
			tokenStream.end();
		} finally {
			// Release the stream even if tokenization failed part-way through.
			tokenStream.close();
		}
	}
}
 
分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics