
Apache Solr and Carrot2 integration strategies, part 2

 

To use a custom Chinese tokenizer (e.g. jcseg) with Carrot2 clustering in Solr, follow these steps:

1. Download the Carrot2 source code, generate the Eclipse project files, and import the projects into Eclipse:

#git clone git://github.com/carrot2/carrot2.git

#cd carrot2

#ant -p

#ant eclipse

2. Import jcseg into Eclipse and add it as a project reference of the carrot2-util-text subproject.

3. Modify org.carrot2.text.linguistic.DefaultTokenizerFactory.java so that Chinese text is handled by the jcseg-based adapter:

private static EnumMap<LanguageCode, IFactory<ITokenizer>> createDefaultTokenizers() {
		EnumMap<LanguageCode, IFactory<ITokenizer>> map = Maps
				.newEnumMap(LanguageCode.class);

		// By default, we use our own tokenizer for all languages.
		IFactory<ITokenizer> whitespaceTokenizerFactory = new NewClassInstanceFactory<ITokenizer>(
				ExtendedWhitespaceTokenizer.class);
 		
		IFactory<ITokenizer> chineseTokenizerFactory = new NewClassInstanceFactory<ITokenizer>(
				InokChineseTokenizerAdapter.class);

		for (LanguageCode lc : LanguageCode.values()) {
			map.put(lc, whitespaceTokenizerFactory);
		}

		// Chinese is an exception: map it to the jcseg-based adapter instead
		// of the default whitespace tokenizer.
	
		map.put(LanguageCode.CHINESE_SIMPLIFIED, chineseTokenizerFactory);
.....
}
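
As a quick sanity check after rebuilding, you can ask the modified factory for the Chinese tokenizer and confirm that it returns the jcseg-based adapter. The sketch below is illustrative only: the class name is made up, and it assumes the Carrot2 3.x ITokenizerFactory accessor getTokenizer(LanguageCode); if your Carrot2 version exposes a different method, adjust accordingly.

import org.carrot2.core.LanguageCode;
import org.carrot2.text.analysis.ITokenizer;
import org.carrot2.text.linguistic.DefaultTokenizerFactory;
import org.carrot2.text.linguistic.lucene.InokChineseTokenizerAdapter;

public class TokenizerFactoryCheck {
	public static void main(String[] args) {
		// Ask the factory for the Simplified Chinese tokenizer (assumed accessor).
		ITokenizer tokenizer = new DefaultTokenizerFactory()
				.getTokenizer(LanguageCode.CHINESE_SIMPLIFIED);
		// Should print the adapter class name if the mapping change took effect.
		System.out.println(tokenizer.getClass().getName());
		System.out.println(tokenizer instanceof InokChineseTokenizerAdapter);
	}
}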

 

4. Create a new class, org.carrot2.text.linguistic.lucene.InokChineseTokenizerAdapter.java:

package org.carrot2.text.linguistic.lucene;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.carrot2.text.analysis.ITokenizer;
import org.carrot2.text.util.MutableCharArray;
import org.lionsoul.jcseg.core.ADictionary;
import org.lionsoul.jcseg.core.DictionaryFactory;
import org.lionsoul.jcseg.core.ISegment;
import org.lionsoul.jcseg.core.IWord;
import org.lionsoul.jcseg.core.JcsegException;
import org.lionsoul.jcseg.core.JcsegTaskConfig;
import org.lionsoul.jcseg.core.SegmentFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class InokChineseTokenizerAdapter extends Tokenizer implements
		ITokenizer {
	private final static Logger logger = LoggerFactory
			.getLogger(InokChineseTokenizerAdapter.class);
	private ISegment segmentor;

	private OffsetAttribute offsetAtt;
	private CharTermAttribute termAtt = null;

	private final MutableCharArray tempCharSequence;

	public InokChineseTokenizerAdapter() throws JcsegException, IOException {

		super(new StringReader(""));
		// Initialize jcseg with its default configuration and dictionary.
		JcsegTaskConfig config = new JcsegTaskConfig();
		ADictionary dic = DictionaryFactory.createDefaultDictionary(config);
		this.tempCharSequence = new MutableCharArray(new char[0]);
		segmentor = SegmentFactory.createJcseg(1, new Object[] { config, dic });
		segmentor.reset(input);
		termAtt = addAttribute(CharTermAttribute.class);
		offsetAtt = addAttribute(OffsetAttribute.class);
	}

	@Override
	public void reset(Reader reader) throws IOException {
		super.reset();
		segmentor.reset(reader);
	}

	@Override
	public short nextToken() throws IOException {

		final boolean hasNextToken = incrementToken();

		if (hasNextToken) {
			short flags = 0;
			final char[] image = termAtt.buffer();
			final int length = termAtt.length();
			tempCharSequence.reset(image, 0, length);

			if (length == 1) {
				// Single-character tokens are treated as punctuation,
				// longer tokens as regular terms.
				flags = ITokenizer.TT_PUNCTUATION;
			} else {
				flags = ITokenizer.TT_TERM;
			}
			return flags;
		}

		return ITokenizer.TT_EOF;
	}

	@Override
	public void setTermBuffer(MutableCharArray array) {
		// Expose the current term to Carrot2 through its mutable buffer.
		array.reset(termAtt.buffer(), 0, termAtt.length());
	}

	@Override
	public boolean incrementToken() throws IOException {
		clearAttributes();
		IWord word = segmentor.next();
		if (word != null) {
			termAtt.append(word.getValue());
			termAtt.setLength(word.getLength());
			offsetAtt.setOffset(word.getPosition(),
					word.getPosition() + word.getLength());
			return true;
		} else {
			end();
			return false;
		}
	}

}
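
To try the adapter outside Solr, the following sketch drives it through the same ITokenizer calls that Carrot2's preprocessing pipeline uses (reset, nextToken, setTermBuffer). The demo class and the sample sentence are made up for illustration, and it assumes jcseg's lexicon and jcseg.properties can be located at runtime.

import java.io.StringReader;

import org.carrot2.text.analysis.ITokenizer;
import org.carrot2.text.linguistic.lucene.InokChineseTokenizerAdapter;
import org.carrot2.text.util.MutableCharArray;

public class InokChineseTokenizerAdapterDemo {
	public static void main(String[] args) throws Exception {
		InokChineseTokenizerAdapter tokenizer = new InokChineseTokenizerAdapter();
		// Feed a sample sentence, then pull tokens until TT_EOF.
		tokenizer.reset(new StringReader("我爱北京天安门"));
		MutableCharArray term = new MutableCharArray(new char[0]);
		short tokenType;
		while ((tokenType = tokenizer.nextToken()) != ITokenizer.TT_EOF) {
			tokenizer.setTermBuffer(term);
			System.out.println(term + " -> type " + tokenType);
		}
	}
}

If jcseg segments correctly, each printed line should be a Chinese word rather than a single character, which is what makes the resulting cluster labels readable.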

 

5. Recompile Carrot2 and rebuild its jars:

#cd carrot2

a. Modify build.xml to include the jcseg jars in the relevant patternsets:

 <patternset id="lib.test">
    <include name="core/**/*.jar" />
    <include name="lib/**/*.jar" />
    <include name="lib/jcseg-*.jar" />
    <exclude name="lib/org.slf4j/slf4j-nop*" />
    <include name="applications/carrot2-dcs/**/*.jar" />
    <include name="applications/carrot2-webapp/lib/*.jar" />
    <include name="applications/carrot2-benchmarks/lib/*.jar" />
  </patternset>

 

  <patternset id="lib.core">
    <include name="lib/**/*.jar" />
    <include name="core/carrot2-util-matrix/lib/*.jar" />
    <include name="lib/jcseg-*.jar" />
    <patternset refid="lib.core.excludes" />
  </patternset>

 

  <patternset id="lib.core.mini">
    <include name="lib/**/mahout-*.jar" />
    <include name="lib/jcseg-*.jar" />
    <include name="lib/**/mahout.LICENSE" />
    <include name="lib/**/colt.LICENSE" />
    <include name="lib/**/commons-lang*" />
    <include name="lib/**/guava*" />
    <include name="lib/**/jackson*" />
    <include name="lib/**/lucene-snowball*" />
    <include name="lib/**/lucene.LICENSE" />
    <include name="lib/**/hppc-*.jar" />
    <include name="lib/**/hppc*.LICENSE" />

    <include name="lib/**/slf4j-api*.jar" />
    <include name="lib/**/slf4j-nop*.jar" />
    <include name="lib/**/slf4j.LICENSE" />

    <include name="lib/**/attributes-binder-*.jar" />
  </patternset>

Note the lib/jcseg-*.jar entry added to each patternset.

b. Copy jcseg-analyzer-1.9.5.jar and jcseg-core-1.9.5.jar into carrot2/lib/.

c. Recompile and build the jars:

#ant jar

d. Copy tmp/jar/carrot2-core-3.10.0-SNAPSHOT.jar to solr/WEB-INF/lib/.

Note: you should also copy the jars from contrib/clustering/lib/, the jcseg jars, the lexicon directory, and the jcseg.properties file into solr/WEB-INF/lib/.

 

Warning: the most important setting in solrconfig.xml is the PreprocessingPipeline.tokenizerFactory attribute:

 <str name="PreprocessingPipeline.tokenizerFactory">org.carrot2.text.linguistic.DefaultTokenizerFactory</str>

  <searchComponent name="clustering"
                   enable="true"
                   class="solr.clustering.ClusteringComponent" >
    <lst name="engine">
      <str name="name">lingo</str>
      <str name="carrot.algorithm">org.carrot2.clustering.lingo.LingoClusteringAlgorithm</str>
      <str name="carrot.resourcesDir">clustering/carrot2</str>
      <str name="MultilingualClustering.defaultLanguage">CHINESE_SIMPLIFIED</str>
      <str name="PreprocessingPipeline.tokenizerFactory">org.carrot2.text.linguistic.DefaultTokenizerFactory</str>
    </lst>
</searchComponent>
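
With the component in place, clustering is triggered through a normal search request. The SolrJ (4.x) sketch below is only an illustration: the core URL, the /clustering request handler (which must list the clustering component in its last-components and map carrot.title/carrot.snippet to your fields; neither is shown in the config above), and the result handling are assumptions about your setup.

import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.impl.HttpSolrServer;
import org.apache.solr.client.solrj.response.QueryResponse;

public class ClusteringQueryDemo {
	public static void main(String[] args) throws Exception {
		// Assumed core URL and handler name; adjust to your deployment.
		HttpSolrServer server = new HttpSolrServer("http://localhost:8983/solr/collection1");
		SolrQuery query = new SolrQuery("*:*");
		query.setRequestHandler("/clustering");
		query.set("clustering", true);
		query.set("clustering.results", true);
		query.setRows(100);

		QueryResponse response = server.query(query);
		// Carrot2 clusters come back in a separate "clusters" section of the response.
		System.out.println(response.getResponse().get("clusters"));
		server.shutdown();
	}
}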