`
lxwt909
  • 浏览: 573368 次
  • 性别: Icon_minigender_1
  • 来自: 北京
社区版块
存档分类
最新评论

跟益达学Solr5之拼音分词[改进版]

    博客分类:
  • Solr
阅读更多

      之前一篇介绍过如何自定义实现拼音分词器,不过当初只考虑了全拼这种情况,且有些BUG,趁着抗日胜利70周年阅兵3天假期有时间,又把当初的代码拿起来进行了改进,改进点包括支持全拼,简拼以及全拼+简拼,支持汉字数字是否NGram处理的可配置,支持NGram长度范围的可配置等,特此更新此篇进行分享!如有不妥之处,还望不吝指正!

      废话不多说,直接上代码:

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.pinyin.utils.Constant;
import org.apache.lucene.analysis.pinyin.utils.Pinyin4jUtil;
import org.apache.lucene.analysis.pinyin.utils.StringUtils;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
/**
 * Token filter that emits pinyin forms for terms containing Chinese characters.
 * <p>
 * For every input token the filter optionally emits the original term first (typed
 * {@code numeric_original}, {@code chinese_original} or {@code normal_word}) and then
 * one token per entry of the collection built for that term: pinyin strings when the
 * term has enough Chinese characters, or the term itself when it was typed
 * {@code numeric_original} / {@code normal_word}. Derived tokens are typed
 * {@code short_pinyin} when {@code shortPinyin} is set and {@code pinyin} otherwise.
 * <p>
 * The first token emitted for an input carries the input's position increment; the
 * remaining ones are stacked on it with an increment of 0 (the same trick synonyms use).
 *
 * @author Lanxiaowei
 */
public class PinyinTokenFilter extends TokenFilter {
	/** Whether the original input term is emitted ahead of its derived tokens. */
	private final boolean isOutChinese;
	/** Whether only the short (abbreviated) pinyin form is produced. */
	private final boolean shortPinyin;
	/** Whether both the full and the short pinyin forms are produced. */
	private final boolean pinyinAll;
	/** Minimum count of Chinese characters a term needs before it is converted; never below 1. */
	private int minTermLength;

	/** Copy of the current input term's buffer; {@code null} means "fetch the next input token". */
	private char[] curTermBuffer;
	/** Number of valid chars in {@link #curTermBuffer}. */
	private int curTermLength;
	/** The current input term as a string, captured once when the token is fetched. */
	private String curTerm;
	/** Type of the current input: the upstream type, or the type assigned when the original was emitted. */
	private String curType;
	/** Position increment not yet handed to an emitted token (also collects increments of dropped inputs). */
	private int pendingPosIncr;

	private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
	/** Position increment attribute. */
	private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
	private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
	/** Whether the original term of the current input has already been emitted. */
	private boolean hasCurOut;
	/** Derived terms for the current input. */
	private Collection<String> terms;
	/** Iterator over {@link #terms}; {@code null} until the terms have been built. */
	private Iterator<String> termIte;

	public PinyinTokenFilter(TokenStream input) {
		this(input,Constant.DEFAULT_MIN_TERM_LRNGTH);
	}

	public PinyinTokenFilter(TokenStream input, int minTermLength) {
		this(input, Constant.DEFAULT_SHORT_PINYIN, Constant.DEFAULT_PINYIN_ALL,minTermLength);
	}

	public PinyinTokenFilter(TokenStream input, boolean shortPinyin) {
		this(input, shortPinyin, Constant.DEFAULT_PINYIN_ALL);
	}
	
	public PinyinTokenFilter(TokenStream input, boolean shortPinyin,boolean pinyinAll) {
		this(input, shortPinyin,pinyinAll, Constant.DEFAULT_MIN_TERM_LRNGTH);
	}
	
	public PinyinTokenFilter(TokenStream input, boolean shortPinyin,boolean pinyinAll,int minTermLength) {
		this(input, shortPinyin,pinyinAll,Constant.DEFAULT_OUT_CHINESE, minTermLength);
	}

	/**
	 * Creates the filter.
	 *
	 * @param input         upstream token stream
	 * @param shortPinyin   produce only the short pinyin form (ignored for conversion when {@code pinyinAll} is set)
	 * @param pinyinAll     produce both full and short pinyin forms
	 * @param isOutChinese  emit the original term before the derived tokens
	 * @param minTermLength minimum Chinese character count required for conversion; values below 1 become 1
	 */
	public PinyinTokenFilter(TokenStream input, boolean shortPinyin,boolean pinyinAll,
			boolean isOutChinese,int minTermLength) {
		super(input);
		this.minTermLength = minTermLength;
		if (this.minTermLength < 1) {
			this.minTermLength = 1;
		}
		this.isOutChinese = isOutChinese;
		this.shortPinyin = shortPinyin;
		this.pinyinAll = pinyinAll;
		// Make sure the offset attribute exists; its values are passed through untouched.
		addAttribute(OffsetAttribute.class); 
	}
	
	@Override
	public final boolean incrementToken() throws IOException {
		while (true) {
			// Start of processing, or the previous input token has been fully handled.
			if (this.curTermBuffer == null) {
				if (!this.input.incrementToken()) { 
					// Upstream is exhausted.
					return false; 
				}
				// Snapshot the input now: termAtt/typeAtt are overwritten by the tokens
				// emitted below (and may be rewritten by downstream filters), so they
				// cannot be trusted as the source term on later calls.
				this.curTermBuffer = this.termAtt.buffer().clone();
				this.curTermLength = this.termAtt.length();
				this.curTerm = this.termAtt.toString();
				this.curType = this.typeAtt.type();
				this.pendingPosIncr += this.posIncrAtt.getPositionIncrement();
			}

			// Emit the original term once, before any derived token.
			if (this.isOutChinese && !this.hasCurOut) {
				this.hasCurOut = true; 
				this.termAtt.copyBuffer(this.curTermBuffer, 0, this.curTermLength);
				this.posIncrAtt.setPositionIncrement(takePendingPosIncr());
				this.curType = StringUtils.isNumeric(this.curTerm) ? "numeric_original" : 
					(StringUtils.containsChinese(this.curTerm) ? "chinese_original" : "normal_word");
				this.typeAtt.setType(this.curType);
				return true;
			}

			// Build the derived terms exactly once per input token.
			if (this.termIte == null) {
				this.terms = buildTerms();
				this.termIte = this.terms.iterator();
			}

			if (this.termIte.hasNext()) {
				String pinyin = this.termIte.next();
				this.termAtt.copyBuffer(pinyin.toCharArray(), 0, pinyin.length());
				// The first token for this input keeps the input's position; every
				// further one gets 0 so it is stacked on the same position.
				this.posIncrAtt.setPositionIncrement(takePendingPosIncr());
				this.typeAtt.setType(this.shortPinyin ? "short_pinyin" : "pinyin");
				return true;
			}

			// Nothing (more) to emit for this input: move on to the next one. Any
			// increment still pending is carried over so positions are not lost.
			clearCurrentToken();
		}
	}

	/**
	 * Builds the derived terms for the current input term.
	 *
	 * @return the terms to emit, possibly empty, never {@code null}
	 */
	private Collection<String> buildTerms() {
		Collection<String> result = null;
		if (StringUtils.chineseCharCount(this.curTerm) >= this.minTermLength) {
			if (this.pinyinAll) {
				// Full + short: append the short forms to the collection returned for
				// the full forms, guarding against either helper returning null.
				Collection<String> full = Pinyin4jUtil.getPinyinCollection(this.curTerm);
				Collection<String> abbreviated = Pinyin4jUtil.getPinyinShortCollection(this.curTerm);
				if (full == null) {
					result = abbreviated;
				} else {
					if (abbreviated != null) {
						full.addAll(abbreviated);
					}
					result = full;
				}
			} else {
				// Short or full, one of the two.
				result = this.shortPinyin
						? Pinyin4jUtil.getPinyinShortCollection(this.curTerm)
						: Pinyin4jUtil.getPinyinCollection(this.curTerm);
			}
		} else if ("numeric_original".equals(this.curType) || "normal_word".equals(this.curType)) {
			// Numbers and plain words are echoed once more as a derived token.
			// NOTE(review): when isOutChinese is false the upstream type is used here, so
			// such terms are only echoed if the tokenizer itself assigns one of these two
			// type names; otherwise they are dropped - confirm this is intended.
			result = new ArrayList<String>();
			result.add(this.curTerm);
		}
		return result != null ? result : new ArrayList<String>();
	}

	/** Returns the position increment still owed to the output and marks it as consumed. */
	private int takePendingPosIncr() {
		int increment = this.pendingPosIncr;
		this.pendingPosIncr = 0;
		return increment;
	}

	/** Forgets the per-input state so that the next call fetches a new input token. */
	private void clearCurrentToken() {
		this.curTermBuffer = null;
		this.curTermLength = 0;
		this.curTerm = null;
		this.curType = null;
		this.terms = null;
		this.termIte = null;
		this.hasCurOut = false; 
	}

	/**
	 * Resets the filter for reuse, discarding any half-processed input token so that a
	 * stream abandoned mid-token does not leak state into the next use.
	 */
	@Override
	public void reset() throws IOException {
		super.reset();
		clearCurrentToken();
		this.pendingPosIncr = 0;
	}
}

   

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.CodepointCountFilter;
import org.apache.lucene.analysis.pinyin.utils.Constant;
import org.apache.lucene.analysis.pinyin.utils.StringUtils;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.analysis.util.CharacterUtils;

/**
 * Token filter that splits (pinyin) terms into n-grams of {@code minGram..maxGram}
 * code points.
 * <p>
 * Tokens typed {@code normal_word}, {@code numeric_original} or {@code chinese_original},
 * and - depending on the flags - purely numeric terms or terms containing Chinese, are
 * handed on unchanged. The input is wrapped in a {@link CodepointCountFilter} with
 * {@code minGram} as its lower bound before it reaches this filter.
 * 
 * @author Lanxiaowei
 * 
 */
@SuppressWarnings("unused")
public class PinyinNGramTokenFilter extends TokenFilter {
	/** Smallest gram size, in code points. */
	private final int minGram;
	/** Largest gram size, in code points. */
	private final int maxGram;
	/** Whether terms containing Chinese are n-grammed as well [callers default this to false]. */
	private final boolean nGramChinese;
	/** Whether purely numeric terms are n-grammed as well [callers default this to false]. */
	private final boolean nGramNumber;

	private final CharacterUtils charUtils;
	private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
	private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
	private final TypeAttribute typeAtt;
	private final PositionIncrementAttribute posIncAtt;
	private final PositionLengthAttribute posLenAtt;

	/** Copy of the term being split; {@code null} means "pull the next input token". */
	private char[] curTermBuffer;
	private int curTermLength;
	private int curCodePointCount;
	/** Size of the next gram to emit. */
	private int curGramSize;
	/** Code point index at which the next gram starts. */
	private int curPos;
	private int curPosInc, curPosLen;
	private int tokStart;
	private int tokEnd;
	/** Recorded per token but not read anywhere in this class. */
	private boolean hasIllegalOffsets;

	/**
	 * @param input        upstream token stream
	 * @param minGram      smallest gram size, must be at least 1
	 * @param maxGram      largest gram size, must not be below {@code minGram}
	 * @param nGramChinese whether terms containing Chinese are split too
	 * @param nGramNumber  whether purely numeric terms are split too
	 * @throws IllegalArgumentException if the gram bounds are invalid
	 */
	public PinyinNGramTokenFilter(TokenStream input, int minGram, int maxGram,
			boolean nGramChinese,boolean nGramNumber) {
		super(new CodepointCountFilter(input, minGram, Integer.MAX_VALUE));
		this.charUtils = CharacterUtils.getInstance();
		if (minGram < 1) {
			throw new IllegalArgumentException(
					"minGram must be greater than zero");
		}
		if (minGram > maxGram) {
			throw new IllegalArgumentException(
					"minGram must not be greater than maxGram");
		}
		this.minGram = minGram;
		this.maxGram = maxGram;
		this.nGramChinese = nGramChinese;
		this.nGramNumber = nGramNumber;

		this.typeAtt = addAttribute(TypeAttribute.class);
		this.posIncAtt = addAttribute(PositionIncrementAttribute.class);
		this.posLenAtt = addAttribute(PositionLengthAttribute.class);
	}

	public PinyinNGramTokenFilter(TokenStream input, int minGram, int maxGram,
			boolean nGramChinese) {
		this(input, minGram, maxGram, nGramChinese, Constant.DEFAULT_NGRAM_NUMBER);
	}
	
	public PinyinNGramTokenFilter(TokenStream input, int minGram, int maxGram) {
		this(input, minGram, maxGram, Constant.DEFAULT_NGRAM_CHINESE);
	}
	
	public PinyinNGramTokenFilter(TokenStream input, int minGram) {
		this(input, minGram, Constant.DEFAULT_MAX_GRAM);
	}
	
	public PinyinNGramTokenFilter(TokenStream input) {
		this(input, Constant.DEFAULT_MIN_GRAM);
	}

	@Override
	public final boolean incrementToken() throws IOException {
		for (;;) {
			if (curTermBuffer == null) {
				if (!input.incrementToken()) {
					return false;
				}
				if (isPassThrough()) {
					// Hand the token on exactly as it arrived.
					return true;
				}
				captureCurrentToken();
			}

			// All sizes at the current start are done (or no longer fit): advance the start.
			final boolean doneAtThisStart = curGramSize > maxGram
					|| (curPos + curGramSize) > curCodePointCount;
			if (doneAtThisStart) {
				curPos++;
				curGramSize = minGram;
			}

			if ((curPos + curGramSize) > curCodePointCount) {
				// Even the smallest gram no longer fits: this term is exhausted.
				curTermBuffer = null;
				continue;
			}

			clearAttributes();
			final int from = charUtils.offsetByCodePoints(curTermBuffer,
					0, curTermLength, 0, curPos);
			final int to = charUtils.offsetByCodePoints(curTermBuffer, 0,
					curTermLength, from, curGramSize);
			termAtt.copyBuffer(curTermBuffer, from, to - from);
			// Only the first gram of a term advances the position.
			posIncAtt.setPositionIncrement(curPosInc);
			curPosInc = 0;
			posLenAtt.setPositionLength(curPosLen);
			// Every gram reports the offsets of the whole source token.
			offsetAtt.setOffset(tokStart, tokEnd);
			curGramSize++;
			return true;
		}
	}

	/** Tells whether the token just read from the input is forwarded without splitting. */
	private boolean isPassThrough() {
		final String type = this.typeAtt.type();
		if ("normal_word".equals(type)
				|| "numeric_original".equals(type)
				|| "chinese_original".equals(type)) {
			return true;
		}
		if (!this.nGramNumber && StringUtils.isNumeric(this.termAtt.toString())) {
			return true;
		}
		return !this.nGramChinese && StringUtils.containsChinese(this.termAtt.toString());
	}

	/** Snapshots the token just read so that its grams can be produced on later calls. */
	private void captureCurrentToken() {
		curTermBuffer = termAtt.buffer().clone();
		curTermLength = termAtt.length();
		curCodePointCount = charUtils.codePointCount(termAtt);
		curGramSize = minGram;
		curPos = 0;
		curPosInc = posIncAtt.getPositionIncrement();
		curPosLen = posLenAtt.getPositionLength();
		tokStart = offsetAtt.startOffset();
		tokEnd = offsetAtt.endOffset();

		hasIllegalOffsets = (tokStart + curTermLength) != tokEnd;
	}

	@Override
	public void reset() throws IOException {
		super.reset();
		curTermBuffer = null;
	}
}

   

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.pinyin.utils.Constant;
import org.apache.lucene.analysis.pinyin.utils.StringUtils;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.analysis.util.CharacterUtils;

/**
 * Token filter that emits the leading ("edge") n-grams of (pinyin) terms, from
 * {@code minGram} up to {@code maxGram} code points.
 * <p>
 * Tokens typed {@code normal_word}, {@code numeric_original} or {@code chinese_original},
 * and - depending on the flags - purely numeric terms or terms containing Chinese, are
 * handed on unchanged.
 * 
 * @author Lanxiaowei
 * 
 */
public class PinyinEdgeNGramTokenFilter extends TokenFilter {
	/** Smallest gram size, in code points. */
	private final int minGram;
	/** Largest gram size, in code points. */
	private final int maxGram;
	/** Whether terms containing Chinese are n-grammed as well [callers default this to false]. */
	private final boolean nGramChinese;
	/** Whether purely numeric terms are n-grammed as well [callers default this to false]. */
	private final boolean nGramNumber;
	private final CharacterUtils charUtils;

	/** Copy of the term being split; {@code null} means "pull the next input token". */
	private char[] curTermBuffer;
	private int curTermLength;
	private int curCodePointCount;
	/** Size of the next gram to emit. */
	private int curGramSize;
	private int tokStart;
	private int tokEnd;
	/** Position increment accumulated from captured inputs and not yet emitted. */
	private int savePosIncr;
	private int savePosLen;

	private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
	private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
	private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
	private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
	private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
	
	/**
	 * @param input        upstream token stream
	 * @param minGram      smallest gram size, must be at least 1
	 * @param maxGram      largest gram size, must not be below {@code minGram}
	 * @param nGramChinese whether terms containing Chinese are split too
	 * @param nGramNumber  whether purely numeric terms are split too
	 * @throws IllegalArgumentException if the gram bounds are invalid
	 */
	public PinyinEdgeNGramTokenFilter(TokenStream input, int minGram,
			int maxGram, boolean nGramChinese, boolean nGramNumber) {
		super(input);
		if (minGram < 1) {
			throw new IllegalArgumentException(
					"minGram must be greater than zero");
		}

		if (minGram > maxGram) {
			throw new IllegalArgumentException(
					"minGram must not be greater than maxGram");
		}

		this.charUtils = CharacterUtils.getInstance();
		this.minGram = minGram;
		this.maxGram = maxGram;
		this.nGramChinese = nGramChinese;
		this.nGramNumber = nGramNumber;
	}
	
	public PinyinEdgeNGramTokenFilter(TokenStream input, int minGram,
			int maxGram, boolean nGramChinese) {
		this(input, minGram, maxGram, nGramChinese, Constant.DEFAULT_NGRAM_NUMBER);
	}
	
	public PinyinEdgeNGramTokenFilter(TokenStream input, int minGram,
			int maxGram) {
		this(input, minGram, maxGram, Constant.DEFAULT_NGRAM_CHINESE);
	}
	
	public PinyinEdgeNGramTokenFilter(TokenStream input, int minGram) {
		this(input, minGram, Constant.DEFAULT_MAX_GRAM);
	}
	
	public PinyinEdgeNGramTokenFilter(TokenStream input) {
		this(input, Constant.DEFAULT_MIN_GRAM);
	}

	@Override
	public final boolean incrementToken() throws IOException {
		for (;;) {
			if (curTermBuffer == null) {
				if (!input.incrementToken()) {
					return false;
				}
				if (isPassThrough()) {
					// Hand the token on exactly as it arrived.
					return true;
				}
				captureCurrentToken();
			}

			if (curGramSize > maxGram || curGramSize > curCodePointCount) {
				// No further prefix can be produced from this term.
				curTermBuffer = null;
				continue;
			}

			clearAttributes();
			// Every gram reports the offsets of the whole source token.
			offsetAtt.setOffset(tokStart, tokEnd);
			if (curGramSize != minGram) {
				// Longer prefixes are stacked on the position of the shortest one.
				posIncrAtt.setPositionIncrement(0);
			} else {
				posIncrAtt.setPositionIncrement(savePosIncr);
				savePosIncr = 0;
			}
			posLenAtt.setPositionLength(savePosLen);
			final int prefixChars = charUtils.offsetByCodePoints(
					curTermBuffer, 0, curTermLength, 0, curGramSize);
			termAtt.copyBuffer(curTermBuffer, 0, prefixChars);
			curGramSize++;
			return true;
		}
	}

	/** Tells whether the token just read from the input is forwarded without splitting. */
	private boolean isPassThrough() {
		final String type = this.typeAtt.type();
		if ("normal_word".equals(type)
				|| "numeric_original".equals(type)
				|| "chinese_original".equals(type)) {
			return true;
		}
		if (!this.nGramNumber && StringUtils.isNumeric(this.termAtt.toString())) {
			return true;
		}
		return !this.nGramChinese && StringUtils.containsChinese(this.termAtt.toString());
	}

	/** Snapshots the token just read so that its prefixes can be produced on later calls. */
	private void captureCurrentToken() {
		curTermBuffer = termAtt.buffer().clone();
		curTermLength = termAtt.length();
		curCodePointCount = charUtils.codePointCount(termAtt);
		curGramSize = minGram;
		tokStart = offsetAtt.startOffset();
		tokEnd = offsetAtt.endOffset();
		// Added rather than assigned: a captured term too short for any gram emits
		// nothing, and its increment is carried over to the next emitted gram.
		savePosIncr += posIncrAtt.getPositionIncrement();
		savePosLen = posLenAtt.getPositionLength();
	}

	@Override
	public void reset() throws IOException {
		super.reset();
		curTermBuffer = null;
		savePosIncr = 0;
	}
}

   

package org.apache.lucene.analysis.pinyin.lucene5;

import java.io.BufferedReader;
import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.pinyin.utils.Constant;
import org.wltea.analyzer.lucene.IKTokenizer;
/**
 * Analyzer chaining an {@link IKTokenizer}, a {@code PinyinTokenFilter} and either a
 * {@code PinyinEdgeNGramTokenFilter} or a {@code PinyinNGramTokenFilter}.
 * <p>
 * The pinyin filter is always created with the {@link Constant} defaults; only the
 * n-gram stage and the tokenizer's {@code useSmart} flag are configurable here.
 *
 * @author Lanxiaowei
 */
public class PinyinAnalyzer extends Analyzer {
	/** Smallest gram size handed to the n-gram filter. */
	private final int minGram;
	/** Largest gram size handed to the n-gram filter. */
	private final int maxGram;
	/** Passed to the IKTokenizer constructor as its {@code useSmart} argument. */
	private final boolean useSmart;
	/** Whether terms containing Chinese are n-grammed as well. */
	private final boolean nGramChinese;
	/** Whether purely numeric terms are n-grammed as well. */
	private final boolean nGramNumber;
	/** Whether the edge n-gram filter is used instead of the plain n-gram filter. */
	private final boolean edgesNGram;
	
	public PinyinAnalyzer() {
		this(Constant.DEFAULT_MIN_GRAM, Constant.DEFAULT_MAX_GRAM, Constant.DEFAULT_EDGES_GRAM,
				Constant.DEFAULT_IK_USE_SMART, Constant.DEFAULT_NGRAM_CHINESE, Constant.DEFAULT_NGRAM_NUMBER);
	}
	
	public PinyinAnalyzer(boolean useSmart) {
		this(Constant.DEFAULT_MIN_GRAM, Constant.DEFAULT_MAX_GRAM, Constant.DEFAULT_EDGES_GRAM,
				useSmart, Constant.DEFAULT_NGRAM_CHINESE, Constant.DEFAULT_NGRAM_NUMBER);
	}
	
	public PinyinAnalyzer(int minGram) {
		this(minGram, Constant.DEFAULT_MAX_GRAM, Constant.DEFAULT_EDGES_GRAM,
				Constant.DEFAULT_IK_USE_SMART, Constant.DEFAULT_NGRAM_CHINESE, Constant.DEFAULT_NGRAM_NUMBER);
	}

	public PinyinAnalyzer(int minGram,boolean useSmart) {
		this(minGram, Constant.DEFAULT_MAX_GRAM, Constant.DEFAULT_EDGES_GRAM,
				useSmart, Constant.DEFAULT_NGRAM_CHINESE, Constant.DEFAULT_NGRAM_NUMBER);
	}
	
	public PinyinAnalyzer(int minGram, int maxGram) {
		this(minGram, maxGram, Constant.DEFAULT_EDGES_GRAM,
				Constant.DEFAULT_IK_USE_SMART, Constant.DEFAULT_NGRAM_CHINESE, Constant.DEFAULT_NGRAM_NUMBER);
	}
	
	public PinyinAnalyzer(int minGram, int maxGram,boolean edgesNGram) {
		this(minGram, maxGram, edgesNGram,
				Constant.DEFAULT_IK_USE_SMART, Constant.DEFAULT_NGRAM_CHINESE, Constant.DEFAULT_NGRAM_NUMBER);
	}
	
	public PinyinAnalyzer(int minGram, int maxGram,boolean edgesNGram,boolean useSmart) {
		this(minGram, maxGram, edgesNGram, useSmart,
				Constant.DEFAULT_NGRAM_CHINESE, Constant.DEFAULT_NGRAM_NUMBER);
	}

	public PinyinAnalyzer(int minGram, int maxGram,boolean edgesNGram,boolean useSmart,
			boolean nGramChinese) {
		this(minGram, maxGram, edgesNGram, useSmart, nGramChinese, Constant.DEFAULT_NGRAM_NUMBER);
	}
	
	/**
	 * Creates an analyzer with every option spelled out; all other constructors
	 * fill the missing options with the {@link Constant} defaults.
	 *
	 * @param minGram      smallest gram size
	 * @param maxGram      largest gram size
	 * @param edgesNGram   use the edge n-gram filter instead of the plain n-gram filter
	 * @param useSmart     forwarded to the IKTokenizer
	 * @param nGramChinese whether terms containing Chinese are n-grammed too
	 * @param nGramNumber  whether purely numeric terms are n-grammed too
	 */
	public PinyinAnalyzer(int minGram, int maxGram,boolean edgesNGram,boolean useSmart,
			boolean nGramChinese,boolean nGramNumber) {
		super();
		this.edgesNGram = edgesNGram;
		this.useSmart = useSmart;
		this.minGram = minGram;
		this.maxGram = maxGram;
		this.nGramChinese = nGramChinese;
		this.nGramNumber = nGramNumber;
	}

	@Override
	protected TokenStreamComponents createComponents(String fieldName) {
		// NOTE(review): the tokenizer is seeded with a reader over the field NAME, not
		// over field content. Presumably this only satisfies the IKTokenizer(Reader,
		// boolean) constructor and the real reader is supplied later by the Analyzer -
		// confirm against the IKTokenizer build in use.
		Reader seedReader = new BufferedReader(new StringReader(fieldName));
		Tokenizer source = new IKTokenizer(seedReader, useSmart);

		// Stage 1: Chinese -> pinyin, always with the default settings.
		TokenStream pinyinStream = new PinyinTokenFilter(source,
				Constant.DEFAULT_SHORT_PINYIN, Constant.DEFAULT_PINYIN_ALL,
				Constant.DEFAULT_MIN_TERM_LRNGTH);

		// Stage 2: n-gram the pinyin, edge-only or full depending on configuration.
		TokenStream gramStream;
		if (!edgesNGram) {
			gramStream = new PinyinNGramTokenFilter(pinyinStream, this.minGram,
					this.maxGram, this.nGramChinese, this.nGramNumber);
		} else {
			gramStream = new PinyinEdgeNGramTokenFilter(pinyinStream, this.minGram,
					this.maxGram, this.nGramChinese, this.nGramNumber);
		}
		return new Analyzer.TokenStreamComponents(source, gramStream);
	}
}

   Lucene5中PinyinAnalyzer分词器使用示例代码如下:

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.wltea.analyzer.lucene.IKAnalyzer;

import com.yida.framework.lucene5.pinyin.PinyinAnalyzer;
/**
 * Small console demo: analyzes a sample sentence with {@link PinyinAnalyzer} and
 * prints every token together with its position, offsets and type.
 */
public class AnalyzerTest {
	public static void main(String[] args) throws IOException {
		String s = "京华时报2009年1月23日报道 the this that welcome to beijing 虽然我很丑,但是我很温柔,昨天,受一股来自中西伯利亚的强冷空气影响,本市出现大风降温天气,白天最高气温只有零下7摄氏度,同时伴有6到7级的偏北风。";
		
		// Swap in "new IKAnalyzer()" here to compare against plain IK tokenization.
		Analyzer analyzer = new PinyinAnalyzer();
		try {
			TokenStream tokenStream = analyzer.tokenStream("text", s);
			displayTokens(tokenStream);
		} finally {
			// Release the analyzer even if analysis fails.
			analyzer.close();
		}
	}
	
	/**
	 * Consumes the stream and prints one line per token in the form
	 * {@code [term]:(start-->end):type}, prefixed with {@code position:} whenever the
	 * token advances the position.
	 * <p>
	 * The stream is always closed before this method returns, including on failure.
	 *
	 * @param tokenStream stream to consume; must not have been reset by the caller
	 * @throws IOException if the stream fails while being consumed
	 */
	public static void displayTokens(TokenStream tokenStream) throws IOException {
		OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
		PositionIncrementAttribute positionIncrementAttribute = tokenStream.addAttribute(PositionIncrementAttribute.class);
		CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
		TypeAttribute typeAttribute = tokenStream.addAttribute(TypeAttribute.class);
		
		try {
			tokenStream.reset();
			int position = 0;
			while (tokenStream.incrementToken()) {
				int increment = positionIncrementAttribute.getPositionIncrement();
				if(increment > 0) {
					// Only tokens that open a new position print the position number.
					position = position + increment;
					System.out.print(position + ":");
				}
				int startOffset = offsetAttribute.startOffset();
				int endOffset = offsetAttribute.endOffset();
				String term = charTermAttribute.toString();
				System.out.println("[" + term + "]" + ":(" + startOffset + "-->" + endOffset + "):" + typeAttribute.type());
			}
			tokenStream.end();
		} finally {
			// Without this, an exception mid-stream leaves the stream open.
			tokenStream.close();
		}
	}
}

    

package org.apache.lucene.analysis.pinyin.solr5;

import java.util.Map;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.pinyin.lucene5.PinyinTokenFilter;
import org.apache.lucene.analysis.pinyin.utils.Constant;
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
 * PinyinTokenFilter工厂类
 * @author Lanxiaowei
 *
 */
public class PinyinTokenFilterFactory extends TokenFilterFactory {
	/**是否输出原中文*/
	private boolean outChinese;
	/**是否只转换简拼*/
	private boolean shortPinyin;
	/**是否转换全拼+简拼*/
	private boolean pinyinAll;
	/**中文词组长度过滤,默认超过minTermLength长度的中文才转换拼音*/
	private int minTermLength;

	public PinyinTokenFilterFactory(Map<String, String> args) {
		super(args);
		this.outChinese = getBoolean(args, "outChinese", Constant.DEFAULT_OUT_CHINESE);
		this.shortPinyin = getBoolean(args, "shortPinyin", Constant.DEFAULT_SHORT_PINYIN);
		this.pinyinAll = getBoolean(args, "pinyinAll", Constant.DEFAULT_PINYIN_ALL);
		this.minTermLength = getInt(args, "minTermLength", Constant.DEFAULT_MIN_TERM_LRNGTH);
	}

	public TokenFilter create(TokenStream input) {
		return new PinyinTokenFilter(input, this.shortPinyin,this.outChinese,
				this.minTermLength);
	}

	public boolean isOutChinese() {
		return outChinese;
	}

	public void setOutChinese(boolean outChinese) {
		this.outChinese = outChinese;
	}

	public boolean isShortPinyin() {
		return shortPinyin;
	}

	public void setShortPinyin(boolean shortPinyin) {
		this.shortPinyin = shortPinyin;
	}

	public boolean isPinyinAll() {
		return pinyinAll;
	}

	public void setPinyinAll(boolean pinyinAll) {
		this.pinyinAll = pinyinAll;
	}

	public int getMinTermLength() {
		
		
		return minTermLength;
	}

	public void setMinTermLength(int minTermLength) {
		this.minTermLength = minTermLength;
	}
}

   

import java.util.Map;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.pinyin.lucene5.PinyinEdgeNGramTokenFilter;
import org.apache.lucene.analysis.pinyin.lucene5.PinyinNGramTokenFilter;
import org.apache.lucene.analysis.pinyin.utils.Constant;
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
 * Factory producing either a {@link PinyinEdgeNGramTokenFilter} or a
 * {@link PinyinNGramTokenFilter}, depending on the {@code edgesNGram} argument.
 * <p>
 * Recognized arguments: {@code minGram}, {@code maxGram}, {@code edgesNGram},
 * {@code nGramChinese} and {@code nGramNumber}; each falls back to the matching
 * {@link Constant} default.
 *
 * @author Lanxiaowei
 */
public class PinyinNGramTokenFilterFactory extends TokenFilterFactory {
	/** Smallest gram size. */
	private final int minGram;
	/** Largest gram size. */
	private final int maxGram;
	/** Whether terms containing Chinese are n-grammed as well. */
	private final boolean nGramChinese;
	/** Whether purely numeric terms are n-grammed as well. */
	private final boolean nGramNumber;
	/** Whether the edge n-gram variant is created. */
	private final boolean edgesNGram;

	/**
	 * @param args factory arguments, e.g. as configured on the filter element of a Solr schema
	 */
	public PinyinNGramTokenFilterFactory(Map<String, String> args) {
		super(args);

		// Read in the same order as before; the getters are inherited from the base factory.
		minGram = getInt(args, "minGram", Constant.DEFAULT_MIN_GRAM);
		maxGram = getInt(args, "maxGram", Constant.DEFAULT_MAX_GRAM);
		edgesNGram = getBoolean(args, "edgesNGram", Constant.DEFAULT_EDGES_GRAM);
		nGramChinese = getBoolean(args, "nGramChinese", Constant.DEFAULT_NGRAM_CHINESE);
		nGramNumber = getBoolean(args, "nGramNumber", Constant.DEFAULT_NGRAM_NUMBER);
	}

	/**
	 * @param input upstream token stream
	 * @return the edge n-gram filter when {@code edgesNGram} is set, the plain n-gram filter otherwise
	 */
	public TokenFilter create(TokenStream input) {
		final TokenFilter filter;
		if (!edgesNGram) {
			filter = new PinyinNGramTokenFilter(input, minGram, maxGram,
					nGramChinese, nGramNumber);
		} else {
			filter = new PinyinEdgeNGramTokenFilter(input, minGram, maxGram,
					nGramChinese, nGramNumber);
		}
		return filter;
	}
}

    我已经将它们打包成了两个jar包:lucene-analyzer-pinyin.5.1.0.jar和solr-analyzer-pinyin.5.1.0.jar(这两个jar包我已经上传到最底下的附件里,特此提醒!!!),只需要把这两个jar放入core的lib目录下,如图:

 然后在schema.xml中添加拼音分词的域类型,如图:

 然后如图应用定义好的text_pinyin这个域类型,看图:

 然后你就可以启动你的tomcat部署solr,进行拼音分词测试了:

 如果你看到如图效果,表明拼音分词已经部署成功且测试成功!如果你有任何疑问,请联系我!我的联系方式请查阅我之前的博客,打完收工,谢谢!就此晚安啦!

 

       

    

  • 大小: 106.2 KB
  • 大小: 69.6 KB
  • 大小: 76.2 KB
  • 大小: 69.9 KB
5
1
分享到:
评论
8 楼 yingyong01 2017-12-21  
7 楼 miwula 2017-06-15  
java.lang.NoClassDefFoundError: org/apache/lucene/analysis/util/CharacterUtils
at org.apache.lucene.analysis.pinyin.lucene5.PinyinNGramTokenFilter.<init>(PinyinNGramTokenFilter.java:69)
at org.apache.lucene.analysis.pinyin.solr5.PinyinNGramTokenFilterFactory.create(PinyinNGramTokenFilterFactory.java:33)
at org.apache.lucene.analysis.pinyin.solr5.PinyinNGramTokenFilterFactory.create(PinyinNGramTokenFilterFactory.java:1)
at org.apache.solr.analysis.TokenizerChain.createComponents(TokenizerChain.java:105)
6 楼 liuyuan3 2016-12-28  
就按你发的例子,我很丑里头的很字分词成为了,hen/h/en 这样的话查en123也能找到,这种怎么解决?
5 楼 Jeremy__Pan 2015-12-06  
您好,虽然看完了您写的这些例子。也实践了。但是怎么在 web项目中引入solr呢?是直接调用solrj 实现的API来操作吗?
4 楼 cy87669252 2015-12-02  
等等佛挡杀佛是否水电费都是都是
3 楼 cy87669252 2015-12-02  
                                                    [size=x-small][/size]
2 楼 lxwt909 2015-09-16  
lq881016 写道
报错,我的solr版本4.4,但是之前有装成功,重新装到另外个索引文件,就报错了

org.apache.solr.common.SolrException: Plugin init failure for [schema.xml] fieldType "text_ik": Plugin init failure for [schema.xml] analyzer/tokenizer: class org.apache.lucene.analysis.pinyin.solr5.PinyinTokenFilterFactory
at org.apache.solr.util.plugin.AbstractPluginLoader.load(AbstractPluginLoader.java:177)
at org.apache.solr.schema.IndexSchema.readSchema(IndexSchema.java:467)
at org.apache.solr.schema.IndexSchema.<init>(IndexSchema.java:164)
at org.apache.solr.schema.IndexSchemaFactory.create(IndexSchemaFactory.java:55)
at org.apache.solr.schema.IndexSchemaFactory.buildIndexSchema(IndexSchemaFactory.java:69)
at org.apache.solr.core.CoreContainer.createFromLocal(CoreContainer.java:619)
at org.apache.solr.core.CoreContainer.create(CoreContainer.java:657)
at org.apache.solr.core.CoreContainer$1.call(CoreContainer.java:364)
at org.apache.solr.core.CoreContainer$1.call(CoreContainer.java:356)
at java.util.concurrent.FutureTask$Sync.innerRun(FutureTask.java:334)
at java.util.concurrent.FutureTask.run(FutureTask.java:166)
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:471)
at java.util.concurrent.FutureTask$Sync.innerRun(FutureTask.java:334)
at java.util.concurrent.FutureTask.run(FutureTask.java:166)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1110)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:603)

不保证在Solr4.x下可以正常使用,我使用的是Solr5.1.0。
1 楼 lq881016 2015-09-07  
报错,我的solr版本4.4,但是之前有装成功,重新装到另外个索引文件,就报错了

org.apache.solr.common.SolrException: Plugin init failure for [schema.xml] fieldType "text_ik": Plugin init failure for [schema.xml] analyzer/tokenizer: class org.apache.lucene.analysis.pinyin.solr5.PinyinTokenFilterFactory
at org.apache.solr.util.plugin.AbstractPluginLoader.load(AbstractPluginLoader.java:177)
at org.apache.solr.schema.IndexSchema.readSchema(IndexSchema.java:467)
at org.apache.solr.schema.IndexSchema.<init>(IndexSchema.java:164)
at org.apache.solr.schema.IndexSchemaFactory.create(IndexSchemaFactory.java:55)
at org.apache.solr.schema.IndexSchemaFactory.buildIndexSchema(IndexSchemaFactory.java:69)
at org.apache.solr.core.CoreContainer.createFromLocal(CoreContainer.java:619)
at org.apache.solr.core.CoreContainer.create(CoreContainer.java:657)
at org.apache.solr.core.CoreContainer$1.call(CoreContainer.java:364)
at org.apache.solr.core.CoreContainer$1.call(CoreContainer.java:356)
at java.util.concurrent.FutureTask$Sync.innerRun(FutureTask.java:334)
at java.util.concurrent.FutureTask.run(FutureTask.java:166)
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:471)
at java.util.concurrent.FutureTask$Sync.innerRun(FutureTask.java:334)
at java.util.concurrent.FutureTask.run(FutureTask.java:166)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1110)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:603)

相关推荐

    跟益达学Solr5之拼音分词

    《Solr5拼音分词深度解析》 在深入探讨Solr5的拼音分词之前,首先需要理解什么是Solr。Apache Solr是一款基于Lucene的开源搜索引擎,它提供了全文搜索、命中高亮、 faceted search(面向切面的搜索)、自动完成、...

    跟益达学Solr5之使用IK分词器

    本篇将围绕“跟益达学Solr5之使用IK分词器”这一主题,详细讲解如何在Solr5中集成并运用IK分词器,以及它的工作原理和优化技巧。 首先,让我们了解下什么是分词器。在中文搜索引擎中,由于中文句子没有明显的分隔符...

    跟益达学Solr5之从MySQL数据库导入数据并索引

    《跟益达学Solr5之从MySQL数据库导入数据并索引》这篇文章主要探讨了如何使用Apache Solr 5从MySQL数据库中导入数据并建立索引,以便进行高效的全文搜索。Solr是一款强大的开源搜索服务器,它提供了丰富的查询语言、...

    跟益达学Solr5之使用Ansj分词器

    《跟益达学Solr5之使用Ansj分词器》 在中文信息检索和文本分析领域,分词是至关重要的第一步。Solr,作为一款强大的开源搜索平台,提供了多种分词器供用户选择,其中之一就是Ansj分词器。这篇文章将深入探讨如何在...

    跟益达学Solr5之索引网络上远程文件

    《Solr5索引网络上远程文件详解》 在信息技术领域,搜索引擎的高效与便捷是不可或缺的,Apache Solr作为一款强大的开源搜索平台,被广泛应用于各种数据检索场景。本篇我们将深入探讨如何利用Solr5来索引网络上的...

    跟益达学Solr5之索引文件夹下所有文件

    本教程将基于"跟益达学Solr5之索引文件夹下所有文件"的主题,深入探讨如何在Solr5中对文件夹内的所有文件进行索引。 首先,理解索引的概念至关重要。在信息检索领域,索引是一种数据结构,用于快速查找文档中的特定...

    跟益达学Solr5之增量索引MySQL数据库表数据

    本教程以"跟益达学Solr5之增量索引MySQL数据库表数据"为主题,旨在教授如何利用Solr5来实现对MySQL数据库表数据的增量索引,以便在海量数据中快速检索。 首先,我们需要了解Solr的基本架构。Solr运行在Jetty服务器...

    跟益达学Solr5之使用MMSeg4J分词器

    《Solr5与MMSeg4J分词器深度解析》 在中文信息检索和文本分析领域,分词是至关重要的第一步。Solr,作为一款强大的开源全文搜索引擎,提供了多种分词器供用户选择,其中之一就是MMSeg4J。本篇文章将带你深入学习...

    转自:跟益达学Solr5之玩转post.jar

    《跟益达学Solr5之玩转post.jar》这篇博文主要探讨了如何利用Solr的`post.jar`工具进行数据导入,这是Solr提供的一个非常实用的功能,用于快速将各种格式的数据导入到Solr索引中。在这个过程中,我们不仅会了解`post...

    跟益达学Solr5之使用Tika从PDF中提取数据导入索引

    在本篇博文中,“跟益达学Solr5之使用Tika从PDF中提取数据导入索引”,我们将探讨如何利用Apache Solr 5和Tika这两个强大的开源工具,从PDF文档中抽取数据并将其有效地导入到Solr索引库中。Apache Solr是一款功能...

    跟益达学Solr5之批量索引JSON数据

    《Solr5批量索引JSON数据详解》 在大数据时代,高效检索与分析大量文本信息是企业业务中不可或缺的一部分。Apache Solr,作为一款强大的开源搜索引擎,提供了对JSON等多格式数据的快速索引和查询能力。本篇将深度...

    Solr 权威指南上下卷

    国内较早接触Solr的技术专家之一,长期致力于Solr的技术研究、实践和生产环境部署,是Solr社区的积极参与者和实践者,以让Solr技术能够在中国得到广泛应用不遗余力并乐此不疲。现就职于国美金融,曾就职于各种大大...

    益达新产品男士益达推出市场广告策划书学习教案.pptx

    5. **广告策略**:虽然具体内容未详述,但可以推测文档中可能会讨论针对男士益达产品的广告创意、目标受众定位、媒介选择、广告执行计划等方面,这些都是成功推广新产品的关键步骤。 6. **市场潜力与机会**:男性...

    实益达:首次公开发行股票招股说明书.PDF

    实益达:首次公开发行股票招股说明书.PDF

    从整合营销看益达广告.docx

    在当今的快节奏市场环境中,益达口香糖凭借其巧妙的整合营销策略,成功地在竞争激烈的口香糖市场中脱颖而出。整合营销传播(IMC)是指企业在营销活动中采取一致的沟通策略,以确保所有信息的统一性,从而达到最佳的...

Global site tag (gtag.js) - Google Analytics