十五、简单实现同义词索引 -

youyang_java

浏览: 321536 次
性别:
来自: 重庆

最近访客更多访客>>

morelily

txlong_onz

LonelyMJ

tianxizhong

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

十五、简单实现同义词索引

import java.util.HashMap;
import java.util.Map;

/**
 * In-memory {@link SamewordContext} backed by a small, fixed synonym table.
 *
 * <p>The table is filled once in the constructor and is not modified afterwards.
 */
public class SimpleSamewordContext implements SamewordContext {

    /** Maps a term to the synonyms registered for it. */
    private final Map<String,String[]> maps = new HashMap<String,String[]>();

    /** Creates the context and registers the built-in synonym entries. */
    public SimpleSamewordContext() {
        maps.put("中国",new String[]{"天朝","大陆"});
        maps.put("我",new String[]{"咱","俺"});
    }

    /**
     * Looks up the synonyms registered for a term.
     *
     * @param name the term to look up
     * @return a copy of the synonym array, or {@code null} when the term has no entry
     */
    @Override
    public String[] getSamewords(String name) {
        String[] found = maps.get(name);
        // Return a copy so callers cannot alter the shared table through the returned array.
        return found == null ? null : found.clone();
    }

}

import java.io.IOException;
import java.util.Stack;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.AttributeSource;

/**
 * Token filter that injects synonyms after each token for which the supplied
 * {@link SamewordContext} returns a non-null synonym array.
 *
 * <p>The original token is passed through unchanged. On the following calls to
 * {@link #incrementToken()} each pending synonym is emitted with the original token's
 * captured attribute state and a position increment of 0.
 */
public class MySameTokenFilter extends TokenFilter {
    private final CharTermAttribute cta;
    private final PositionIncrementAttribute pia;
    /** Attribute state of the most recent token that had synonyms. */
    private AttributeSource.State current;
    /** Synonyms still waiting to be emitted for the most recent token. */
    private final Stack<String> sames;
    private final SamewordContext samewordContext;

    /**
     * @param input           the upstream token stream
     * @param samewordContext source of synonyms, queried once per upstream token
     */
    protected MySameTokenFilter(TokenStream input, SamewordContext samewordContext) {
        super(input);
        cta = this.addAttribute(CharTermAttribute.class);
        pia = this.addAttribute(PositionIncrementAttribute.class);
        sames = new Stack<String>();
        this.samewordContext = samewordContext;
    }

    /**
     * Emits the next pending synonym if there is one, otherwise advances the upstream stream.
     *
     * @return {@code false} once the upstream stream is exhausted and no synonyms are pending
     * @throws IOException if the upstream stream fails
     */
    @Override
    public boolean incrementToken() throws IOException {
        if (!sames.isEmpty()) {
            // Pop the next synonym queued for the previous token.
            String str = sames.pop();
            // Restore the attributes captured for the original token, then swap in the synonym text.
            restoreState(current);
            cta.setEmpty();
            cta.append(str);
            // Increment 0 places the synonym at the same position as the original token.
            pia.setPositionIncrement(0);
            return true;
        }

        if (!this.input.incrementToken()) {
            return false;
        }

        if (addSames(cta.toString())) {
            // Synonyms were queued: remember this token's state so they can reuse it.
            current = captureState();
        }
        return true;
    }

    /**
     * Discards pending synonyms and the captured state so that a reused filter does not
     * emit leftovers from a previous stream.
     *
     * @throws IOException if resetting the upstream stream fails
     */
    @Override
    public void reset() throws IOException {
        super.reset();
        sames.clear();
        current = null;
    }

    /**
     * Queues the synonyms of {@code name}, if any.
     *
     * @param name the term text of the current token
     * @return {@code true} if the context returned a non-null synonym array
     */
    private boolean addSames(String name) {
        String[] sws = samewordContext.getSamewords(name);
        if (sws == null) {
            return false;
        }
        for (String str : sws) {
            sames.push(str);
        }
        return true;
    }

}

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;

import com.chenlb.mmseg4j.Dictionary;
import com.chenlb.mmseg4j.MaxWordSeg;
import com.chenlb.mmseg4j.analysis.MMSegTokenizer;

/**
 * Analyzer that tokenizes with an mmseg4j {@link MMSegTokenizer} driven by
 * {@link MaxWordSeg} and wraps the result in a {@link MySameTokenFilter}.
 */
public class MySameAnalyzer extends Analyzer {
    /** Synonym source handed to every filter this analyzer creates. */
    private final SamewordContext samewordContext;

    /**
     * @param swc the synonym source to pass to each {@link MySameTokenFilter}
     */
    public MySameAnalyzer(SamewordContext swc) {
        this.samewordContext = swc;
    }

    /**
     * Builds a new tokenizer-plus-synonym-filter chain over {@code reader}.
     *
     * @param fieldName the field being analyzed (not used by this implementation)
     * @param reader    the text to tokenize
     * @return the synonym filter wrapping the mmseg4j tokenizer
     */
    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
        // No-argument lookup; presumably the default dictionary location. An earlier
        // revision passed an explicit data directory here instead (TODO confirm).
        Dictionary dictionary = Dictionary.getInstance();
        MaxWordSeg segmenter = new MaxWordSeg(dictionary);
        TokenStream tokens = new MMSegTokenizer(segmenter, reader);
        return new MySameTokenFilter(tokens, samewordContext);
    }

}

/**
     * Prints one line per token produced by analyzing {@code str} with {@code a}, in the form
     * {@code positionIncrement:term[startOffset-endOffset]-->type}.
     *
     * <p>Failures are reported with {@code printStackTrace()} and are not rethrown.
     *
     * @param str the text to analyze
     * @param a   the analyzer to run; the text is analyzed under the field name "content"
     */
public static void displayAllTokenInfo(String str,Analyzer a) {
        TokenStream stream = null;
        try {
            stream = a.tokenStream("content",new StringReader(str));
            // Position increment: the distance between consecutive tokens.
            PositionIncrementAttribute pia =
                        stream.addAttribute(PositionIncrementAttribute.class);
            // Start and end offsets of each token.
            OffsetAttribute oa =
                        stream.addAttribute(OffsetAttribute.class);
            // The term text of each token.
            CharTermAttribute cta =
                        stream.addAttribute(CharTermAttribute.class);
            // The token type reported by the tokenizer.
            TypeAttribute ta =
                        stream.addAttribute(TypeAttribute.class);
            // Consumer workflow: reset, iterate, end, close.
            stream.reset();
            while (stream.incrementToken()) {
                System.out.print(pia.getPositionIncrement()+":");
                System.out.print(cta+"["+oa.startOffset()+"-"+oa.endOffset()+"]-->"+ta.type()+"\n");
            }
            stream.end();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (stream != null) {
                try {
                    // Release the stream even when analysis failed part-way.
                    stream.close();
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }
    }

/**
     * Indexes one document through {@link MySameAnalyzer} into an in-memory directory,
     * searches the "content" field for the term "我", prints the stored content of the
     * first hit and then dumps the token stream of the same text.
     *
     * <p>I/O failures are printed and not rethrown; an empty result fails the test.
     */
@Test
    public void test05() {
        IndexReader reader = null;
        IndexSearcher searcher = null;
        try {
            Analyzer a2 = new MySameAnalyzer(new SimpleSamewordContext());
            String txt = "我来自中国重庆江北";
            Directory dir = new RAMDirectory();
            IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(Version.LUCENE_35, a2));
            try {
                Document doc = new Document();
                doc.add(new Field("content", txt, Field.Store.YES, Field.Index.ANALYZED));
                writer.addDocument(doc);
            } finally {
                // Close the writer even if adding the document fails.
                writer.close();
            }
            // Keep a handle on the reader so it can be closed explicitly below.
            reader = IndexReader.open(dir);
            searcher = new IndexSearcher(reader);
            TopDocs tds = searcher.search(new TermQuery(new Term("content", "我")), 10);
            if (tds.scoreDocs.length == 0) {
                throw new AssertionError("expected at least one hit for term \"我\" in field \"content\"");
            }
            Document d = searcher.doc(tds.scoreDocs[0].doc);
            System.out.println(d.get("content"));
            AnalyzerUtils.displayAllTokenInfo(txt, a2);
        } catch (IOException e) {
            // Also covers CorruptIndexException and LockObtainFailedException, both IOException subclasses.
            e.printStackTrace();
        } finally {
            if (searcher != null) {
                try {
                    searcher.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }