`
xiang37
  • 浏览: 431495 次
  • 性别: Icon_minigender_1
  • 来自: 南京
社区版块
存档分类
最新评论

Lucene4.x SmartChineseAnalyzer添加扩展词

 
阅读更多

    之前有一点研究,现在奉上比较完整的代码,可根据项目需要,自行扩展

 

package com.xiva.test.lucene;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;

import org.apache.lucene.analysis.cn.smart.Utility;

/**
 * Small command-line tool that loads a serialized dictionary ({@code coredict.mem}),
 * appends extension words to it and writes the modified dictionary back to disk.
 *
 * <p>The dictionary consists of four tables that are read and written in a fixed order:
 * {@code wordIndexTable}, {@code charIndexTable}, {@code wordItem_charArrayTable} and
 * {@code wordItem_frequencyTable}. Words are grouped by their first character; each group
 * ("row") stores the remaining characters of every word together with a frequency.
 *
 * <p>Typical usage: {@link #load()}, one or more {@link #addExtendWord(String, int)} calls,
 * then save. {@link #main(String[])} runs this sequence with a sample word.
 */
public class ExtendWordDict
{

    /** Maps a slot of {@link #charIndexTable} to a row index of the {@code wordItem_*} tables. */
    private short[] wordIndexTable;

    /** Open-addressed hash table of first characters; a slot holding {@code 0} is empty. */
    private char[] charIndexTable;

    /** Per row: the stored words, each one without its first character (e.g. "四" for "李四"). */
    private char[][][] wordItem_charArrayTable;

    /** Per row: the frequency of each word, parallel to {@link #wordItem_charArrayTable}. */
    private int[][] wordItem_frequencyTable;

    /**
     * Modulus used when probing {@link #charIndexTable}.
     * NOTE(review): presumably equal to the length of the serialized index tables - confirm.
     */
    public static final int PRIME_INDEX_LENGTH = 12071;

    /**
     * Reads the four dictionary tables from a Java-serialized stream, in the same order
     * in which {@link #saveToObj(File)} writes them.
     *
     * <p>The stream is not closed here; the caller owns it.
     * NOTE(review): this uses native Java deserialization - only feed it trusted files.
     *
     * @param serialObjectInputStream stream positioned at the start of the serialized tables
     * @throws IOException if the stream cannot be read
     * @throws ClassNotFoundException if a serialized class cannot be resolved
     */
    private void loadFromObjectInputStream(InputStream serialObjectInputStream) throws IOException, ClassNotFoundException
    {
        ObjectInputStream input = new ObjectInputStream(serialObjectInputStream);
        wordIndexTable = (short[]) input.readObject();
        charIndexTable = (char[]) input.readObject();
        wordItem_charArrayTable = (char[][][]) input.readObject();
        wordItem_frequencyTable = (int[][]) input.readObject();
    }

    /**
     * Primary hash of a character, used as the start position when probing
     * {@link #charIndexTable}.
     *
     * <p>Do not "improve" this function: slot positions in the loaded
     * {@code charIndexTable} are only found if the probe sequence stays exactly the same.
     *
     * @param c character to hash
     * @return 64-bit hash value (may be negative)
     */
    private long hash1(char c)
    {
        final long p = 1099511628211L;
        long hash = 0xcbf29ce484222325L;
        hash = (hash ^ (c & 0x00FF)) * p;
        hash = (hash ^ (c >> 8)) * p;
        hash += hash << 13;
        hash ^= hash >> 7;
        hash += hash << 3;
        hash ^= hash >> 17;
        hash += hash << 5;
        return hash;
    }

    /**
     * Secondary hash of a character, used as the step width when probing
     * {@link #charIndexTable}.
     *
     * <p>Because {@code +} binds tighter than {@code &} and {@code >>}, the two statements
     * below evaluate as {@code (hash * 33 + c) & 0xFF} and {@code (hash * 33 + c) >> 8}.
     * That may not be what the inline comment suggests, but it must stay as it is:
     * changing it would change the probe sequence and break lookups in the loaded table.
     *
     * @param c character to hash
     * @return hash value
     */
    private int hash2(char c)
    {
        int hash = 5381;

        /* hash 33 + c */
        hash = ((hash << 5) + hash) + c & 0x00FF;
        hash = ((hash << 5) + hash) + c >> 8;

        return hash;
    }

    /**
     * Looks up the slot of a character in {@link #charIndexTable} using double hashing.
     *
     * @param c first character of a word
     * @return the slot index, or {@code -1} if the character is not in the table
     */
    private short getWordItemTableIndex(char c)
    {
        int hash1 = (int) (hash1(c) % PRIME_INDEX_LENGTH);
        int hash2 = hash2(c) % PRIME_INDEX_LENGTH;
        // Java's % keeps the sign of the dividend, so fold negative remainders back into range.
        if (hash1 < 0)
        {
            hash1 = PRIME_INDEX_LENGTH + hash1;
        }
        if (hash2 < 0)
        {
            hash2 = PRIME_INDEX_LENGTH + hash2;
        }
        int index = hash1;
        int i = 1;
        // Probe until an empty slot, the wanted character, or the probe limit is reached.
        while (charIndexTable[index] != 0 && charIndexTable[index] != c && i < PRIME_INDEX_LENGTH)
        {
            index = (hash1 + i * hash2) % PRIME_INDEX_LENGTH;
            i++;
        }

        if (i < PRIME_INDEX_LENGTH && charIndexTable[index] == c)
        {
            return (short) index;
        }
        return -1;
    }

    /**
     * Sorts the words of every row into the order defined by
     * {@code Utility.compareArray}, keeping the frequency table aligned.
     */
    private void sortEachItems()
    {
        for (int i = 0; i < wordItem_charArrayTable.length; i++)
        {
            sortItems(i);
        }
    }

    /**
     * Sorts a single row with a simple exchange sort, swapping the word and its frequency
     * together so that both tables stay parallel.
     *
     * @param row index into the {@code wordItem_*} tables
     */
    private void sortItems(int row)
    {
        char[][] items = wordItem_charArrayTable[row];
        if (items == null || items.length <= 1)
        {
            return;
        }
        int[] freqs = wordItem_frequencyTable[row];
        for (int j = 0; j < items.length - 1; j++)
        {
            for (int j2 = j + 1; j2 < items.length; j2++)
            {
                if (Utility.compareArray(items[j], 0, items[j2], 0) > 0)
                {
                    char[] tmpArray = items[j];
                    int tmpFreq = freqs[j];
                    items[j] = items[j2];
                    freqs[j] = freqs[j2];
                    items[j2] = tmpArray;
                    freqs[j2] = tmpFreq;
                }
            }
        }
    }

    /**
     * Adds the sample extension word. Replace or extend this (for example with a loop over
     * a word list file) to add project-specific words.
     */
    private void addExtendWords()
    {
        if (!addExtendWord("李四", 100))
        {
            System.out.println("extend word not added");
        }
    }

    /**
     * Appends one word to the row of its first character and re-sorts the dictionary.
     *
     * <p>The word is only added when its first character is already present in the
     * dictionary and that character's row is non-empty; otherwise nothing changes.
     * Existing frequencies of the row are preserved.
     *
     * @param word the word to add; must contain at least two characters
     * @param frequency frequency to store for the new word
     * @return {@code true} if the word was added, {@code false} if its first character
     *         has no usable row in the dictionary
     * @throws IllegalArgumentException if {@code word} is {@code null} or shorter than two characters
     * @throws IllegalStateException if the dictionary has not been loaded yet
     */
    public boolean addExtendWord(String word, int frequency)
    {
        if (word == null || word.length() < 2)
        {
            throw new IllegalArgumentException("word must contain at least two characters: " + word);
        }
        if (wordIndexTable == null || charIndexTable == null
                || wordItem_charArrayTable == null || wordItem_frequencyTable == null)
        {
            throw new IllegalStateException("dictionary not loaded; call load() first");
        }

        short extendIdx = this.getWordItemTableIndex(word.charAt(0));
        if (extendIdx < 0)
        {
            // First character unknown to the dictionary: indexing with -1 would throw.
            return false;
        }

        int row = wordIndexTable[extendIdx];
        if (row < 0 || row >= wordItem_charArrayTable.length)
        {
            return false;
        }

        char[][] items = wordItem_charArrayTable[row];
        int[] freqs = wordItem_frequencyTable[row];
        if (items == null || items.length == 0 || freqs == null)
        {
            return false;
        }

        System.out.println("start items" + items.length);
        char[][] extendItems = new char[items.length + 1][];
        int[] extendfreqs = new int[items.length + 1];

        // Copy BOTH parallel arrays; dropping the old frequencies would reset them to 0.
        System.arraycopy(items, 0, extendItems, 0, items.length);
        System.arraycopy(freqs, 0, extendfreqs, 0, Math.min(freqs.length, items.length));

        // Rows store words without their first character.
        extendItems[items.length] = word.substring(1).toCharArray();
        extendfreqs[items.length] = frequency;

        wordItem_charArrayTable[row] = extendItems;
        wordItem_frequencyTable[row] = extendfreqs;

        for (int freq : freqs)
        {
            System.out.println(freq);
        }
        this.sortEachItems();
        System.out.println("End");
        return true;
    }

    /**
     * Serializes the four dictionary tables to a file, in the order expected by
     * {@link #loadFromObjectInputStream(InputStream)}.
     *
     * <p>Failures are reported on standard output; "save End" is printed only when
     * everything was written successfully. The file handle is always released.
     *
     * @param serialObj target file; it is overwritten if it exists
     */
    private void saveToObj(File serialObj)
    {
        FileOutputStream fileOut = null;
        ObjectOutputStream output = null;
        try
        {
            fileOut = new FileOutputStream(serialObj);
            output = new ObjectOutputStream(fileOut);
            output.writeObject(wordIndexTable);
            output.writeObject(charIndexTable);
            output.writeObject(wordItem_charArrayTable);
            output.writeObject(wordItem_frequencyTable);
            // Flush explicitly so that a write error surfaces here and not in the quiet close below.
            output.flush();
            System.out.println("save End");
        }
        catch (IOException e)
        {
            System.out.println(e.toString());
        }
        finally
        {
            // Closing the outer stream also closes the file stream; fall back to the
            // file stream if the ObjectOutputStream could not be created.
            if (output != null)
            {
                closeQuietly(output);
            }
            else
            {
                closeQuietly(fileOut);
            }
        }
    }

    /**
     * Closes a resource, ignoring {@code null} and any exception raised by {@code close()}.
     *
     * @param resource resource to close, may be {@code null}
     */
    private static void closeQuietly(java.io.Closeable resource)
    {
        if (resource == null)
        {
            return;
        }
        try
        {
            resource.close();
        }
        catch (IOException ignored)
        {
            // Nothing useful can be done if close() fails at this point.
        }
    }

    /**
     * Loads the dictionary from the class-path resource {@code coredict.mem}, resolved
     * relative to this class's package.
     *
     * @throws IOException if the resource is missing or cannot be read
     * @throws ClassNotFoundException if a serialized class cannot be resolved
     */
    public void load() throws IOException, ClassNotFoundException
    {
        InputStream input = this.getClass().getResourceAsStream("coredict.mem");
        if (input == null)
        {
            throw new IOException("resource coredict.mem not found next to " + this.getClass().getName());
        }
        try
        {
            loadFromObjectInputStream(input);
        }
        finally
        {
            input.close();
        }
    }

    /**
     * Loads the dictionary, adds the extension words and writes the result to
     * {@code coredict.mem} in the current working directory.
     *
     * @param args unused
     */
    public static void main(String[] args)
    {
        File file = new File("coredict.mem");

        ExtendWordDict wordDict = new ExtendWordDict();

        try
        {
            // Load the dictionary.
            wordDict.load();
        }
        catch (Exception e)
        {
            e.printStackTrace();
            // Without a loaded dictionary there is nothing to extend or save.
            return;
        }

        // Add extension words; a loop reading the words from a file could be used here.
        wordDict.addExtendWords();

        // Save the extended dictionary to the file.
        wordDict.saveToObj(file);
    }
}

 

   最后将新生成的 coredict.mem 文件替换掉 lucene-analyzers-smartcn Jar 包中的同名文件(位于 org/apache/lucene/analysis/cn/smart/hhmm/ 目录下)。

  

  后续扩展:修改源码,添加一个扩展的txt文件。

除了扩展词,还有同义词需要研究。当然,停用词(stop words)SmartChineseAnalyzer 已支持。

 

分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics