关于分词算法——双数组Trie树的Java实现

achi217

浏览: 10215 次
性别:
来自: 北京

最近访客更多访客>>

雪山飞狐

Kanepan

博主相关

博客

微博

相册

留言

关于我

文章分类

全部博客 (10)

社区版块

存档分类

2008-03 ( 8)
更多存档...

算法 Java 数据结构 C C++

首先要声明的是，这个代码我参考过一个C++的实现，不过那个实现写得过于烦琐，充斥着模板代码和STL的使用。幸好10年前摸过2年C/C++，否则还真不知道它在干什么。可惜这个代码有一个致命的缺点：字典需要先生成再使用，无法做动态扩展。而如果要动态加入一个新词，性能上的代价是致命的。程序的工作模式是：
1. 通过build()函数，把所有的词生成数据，
2. 然后通过save()函数保存数据。
3. 使用的时候就可以用load()载入数据。

public class DoubelArrayTrie{

    // Node information: the two parallel int arrays that make up the double array.
    private int            baseArray[];
    private int            checkArray[];

    // Records which "begin" offsets insert() has already handed out.
    private boolean        usedArray[];

    // Position from which insert() starts scanning for a free slot.
    private int            nextCheckPos;
    // Highest array index written by insert(), plus one.
    private int            writeSize = 0;

    /**
     * Builds the double array from a list of words.
     *
     * <p>The words are converted to {@code Element}s (through {@code process} when it is
     * given, otherwise by wrapping each array in a {@code GenericElement}), sorted with
     * {@code CharArrayComparator}, and inserted recursively starting from the root.
     *
     * <p>A {@code null} or empty {@code wordList}, or a preprocessor that yields no
     * elements, leaves the current state untouched.
     *
     * @param wordList the dictionary words; may be {@code null}
     * @param process  optional preprocessing step; may be {@code null}
     */
    public void build(List<char[]> wordList, PreProcess process) {
        if (wordList == null) {
            return;
        }
        int size = wordList.size();
        if (size > 0) {
            List<Element> elements = null;
            if (process != null) {
                elements = process.process(wordList);
            } else {
                elements = new ArrayList<Element>(wordList.size());
                for (char[] cs : wordList) {
                    elements.add(new GenericElement(cs));
                }
            }
            if (elements.isEmpty()) {
                // Nothing to index; insert() requires at least one sibling.
                return;
            }
            Collections.sort(elements, new CharArrayComparator<Element>());

            // Start from a clean slate so that a second build() (or a build() after
            // load()) neither copies stale data nor fails while shrinking the arrays.
            baseArray = null;
            checkArray = null;
            usedArray = null;
            writeSize = 0;
            resize(1);
            baseArray[0] = 1;
            nextCheckPos = 0;

            Node root_node = new Node();
            root_node.left = 0;
            // The preprocessor may return a different number of elements than it was
            // given, so the range must come from the list that is actually indexed.
            root_node.right = elements.size();
            root_node.depth = 0;
            List<Node> siblings = createSiblings();
            fetch(elements, root_node, siblings);
            insert(elements, siblings);

            // Leave room for one full character range past the last written slot so
            // that "base + char + 1" computed during a lookup stays inside the arrays.
            // Math.max keeps the arrays at least as large as the word-count based
            // sizing used previously.
            int required = Math.max(size, writeSize) + (1 << 8 * 2) + 1;
            if (required > usedArray.length) {
                resize(required);
            }
        }
    }

    /**
     * Places one group of sibling nodes into the double array and recurses into
     * their children.
     *
     * <p>Scans forward for a {@code begin} offset such that {@code begin + code} is a
     * free slot for every sibling, records {@code begin} in {@code checkArray} for each
     * of those slots, and then stores in {@code baseArray} either the child group's own
     * {@code begin} or, for a node without children, {@code -left - 1}.
     *
     * @param elements the sorted dictionary entries
     * @param siblings the non-empty group of nodes sharing one parent, ordered by code
     * @return the {@code begin} offset chosen for this group
     */
    private int insert(List<Element> elements, List<Node> siblings) {
        int firstCode = siblings.get(0).code;
        int lastCode = siblings.get(siblings.size() - 1).code;

        int begin = 0;
        int nonZeroCount = 0;
        boolean first = false;

        int pos = Math.max(firstCode + 1, nextCheckPos) - 1;
        if (pos >= usedArray.length) {
            resize(pos + 1);
        }
        while (true) {
            pos++;

            if (pos >= usedArray.length) {
                resize(pos + 65535);
            }
            if (checkArray[pos] != 0) {
                nonZeroCount++;
                continue;
            } else if (!first) {
                // Remember the first free slot seen; the next scan can start here.
                nextCheckPos = pos;
                first = true;
            }
            begin = pos - firstCode;

            // t is the highest index this group will touch. It must be strictly less
            // than the array length, so grow when t == length as well.
            int t = begin + lastCode;
            if (t >= usedArray.length) {
                resize(t + 65535);
            }

            if (usedArray[begin]) {
                continue;
            }
            boolean flag = false;
            for (int i = 1; i < siblings.size(); i++) {
                if (checkArray[begin + siblings.get(i).code] != 0) {
                    flag = true;
                    break;
                }
            }
            if (!flag) break;
        }

        // When almost every slot scanned was occupied, skip that region next time.
        if (1.0 * nonZeroCount / (pos - nextCheckPos + 1) >= 0.95) {
            nextCheckPos = pos;
        }
        usedArray[begin] = true;
        writeSize = Math.max(writeSize, begin + lastCode + 1);
        for (Node node : siblings) {
            checkArray[begin + node.code] = begin;
        }

        for (Node node : siblings) {
            List<Node> newSiblings = createSiblings();
            if (fetch(elements, node, newSiblings) == 0) {
                // No children: store the element index as a negative value.
                baseArray[begin + node.code] = -node.left - 1;
            } else {
                int ins = insert(elements, newSiblings);
                baseArray[begin + node.code] = ins;
            }
        }

        return begin;
    }

    /**
     * Creates the empty list that {@code fetch} fills with sibling nodes.
     *
     * @return a new, empty, mutable list
     */
    private List<Node> createSiblings() {
        List<Node> fresh = new ArrayList<Node>();
        return fresh;
    }

    /**
     * Reallocates {@code baseArray}, {@code checkArray} and {@code usedArray} to
     * {@code size} entries, keeping as much of the existing content as fits.
     *
     * <p>Arrays that are still {@code null} are simply allocated. When {@code size} is
     * smaller than the current length the content is truncated instead of failing in
     * {@code System.arraycopy}.
     *
     * @param size the new length of all three arrays
     */
    private void resize(int size) {
        // baseArray
        int[] newBase = new int[size];
        if (baseArray != null) {
            System.arraycopy(baseArray, 0, newBase, 0, Math.min(baseArray.length, size));
        }
        baseArray = newBase;

        // checkArray
        int[] newCheck = new int[size];
        if (checkArray != null) {
            System.arraycopy(checkArray, 0, newCheck, 0, Math.min(checkArray.length, size));
        }
        checkArray = newCheck;

        // usedArray
        boolean[] newUsed = new boolean[size];
        if (usedArray != null) {
            System.arraycopy(usedArray, 0, newUsed, 0, Math.min(usedArray.length, size));
        }
        usedArray = newUsed;
    }

    /**
     * Collects the child nodes of {@code parent} into {@code siblings}.
     *
     * <p>Walks the entries in {@code [parent.left, parent.right)} and starts a new child
     * whenever the code at index {@code parent.depth} changes. The code is the character
     * value plus one, or zero for an entry whose length equals {@code parent.depth}.
     * Each child receives the half-open range of entries it covers.
     *
     * @param words    the sorted dictionary entries
     * @param parent   the node whose children are collected
     * @param siblings output list that receives the children
     * @return the size of {@code siblings} after the scan
     * @throws RuntimeException if the codes are found in descending order
     */
    private int fetch(List<Element> words, Node parent, List<Node> siblings) {
        final int depth = parent.depth;
        int lastCode = 0;
        Node open = null;

        for (int idx = parent.left; idx < parent.right; idx++) {
            char[] chars = words.get(idx).getChars();
            if (chars.length < depth) {
                continue;
            }

            // Zero marks "the word ends here"; otherwise shift the char value by one.
            int code = (chars.length == depth) ? 0 : chars[depth] + 1;
            if (lastCode > code) {
                throw new RuntimeException("Fatal: sort dictionary first.\n");
            }

            if (code != lastCode || siblings.isEmpty()) {
                Node child = new Node();
                child.depth = depth + 1;
                child.code = code;
                child.left = idx;
                if (chars.length == depth + 1) {
                    child.frequence = words.get(idx).getFrequence();
                }
                // The previous child's range ends where this one begins.
                if (open != null) {
                    open.right = idx;
                }
                open = child;
                siblings.add(child);
            }
            lastCode = code;
        }

        // The last child runs to the end of the parent's range.
        if (open != null) {
            open.right = parent.right;
        }
        return siblings.size();
    }

    /**
     * Writes the double array to a file.
     *
     * <p>Layout: one int holding the array length, followed by
     * {@code checkArray[i]} and {@code baseArray[i]} for each index in order.
     *
     * @param file path of the file to write
     * @throws IOException if the file cannot be opened or written
     */
    public void save(String file) throws IOException {
        final int count = checkArray.length;
        DataOutputStream sink = null;
        try {
            sink = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(file)));
            sink.writeInt(count);
            int i = 0;
            while (i < count) {
                sink.writeInt(checkArray[i]);
                sink.writeInt(baseArray[i]);
                i++;
            }
            // Close on the success path as well as in the finally clause below.
            sink.close();
        } finally {
            if (sink != null) {
                sink.close();
            }
        }
    }

    /**
     * Loads the double array from a file written by {@code save}.
     *
     * <p>The file stream is handed to {@code load(InputStream)} unbuffered, because that
     * method already wraps its argument in a 1 MiB buffer; wrapping it here as well only
     * allocated a second buffer.
     *
     * @param fileName path of the file to read
     * @throws IOException if the file cannot be opened or read
     */
    public void load(String fileName) throws IOException {
        InputStream in = new FileInputStream(new File(fileName));
        try {
            load(in);
        } finally {
            in.close();
        }
    }

    /**
     * Loads the double array from a stream in the layout written by {@code save}:
     * an int length followed by interleaved check/base int pairs.
     *
     * <p>The data is read into local arrays and only published to the fields once the
     * whole stream has been read, so a truncated or corrupt stream leaves the previous
     * state intact. This method does not close {@code in}.
     *
     * @param in the stream to read from
     * @throws IOException if reading fails, the stream ends early, or the stored
     *                     length is negative
     */
    public void load(InputStream in) throws IOException {
        DataInputStream is = new DataInputStream(new BufferedInputStream(in, 1024 * 1024));
        int size = is.readInt();
        if (size < 0) {
            throw new IOException("Invalid double-array size in header: " + size);
        }

        int[] check = new int[size];
        int[] base = new int[size];
        for (int i = 0; i < size; i++) {
            check[i] = is.readInt();
            base[i] = is.readInt();
        }

        checkArray = check;
        baseArray = base;
    }

    /**
     * Looks up a whole string.
     *
     * @param key the word to look up
     * @return the value reported by {@code search(char[], int, int)} for the full
     *         character sequence of {@code key}
     */
    public int search(String key) {
        char[] chars = key.toCharArray();
        return search(chars, 0, chars.length);
    }

    /**
     * Exact-match lookup of {@code key[pos .. len)}.
     *
     * <p>NOTE(review): the loop runs while {@code i < len}, so {@code len} acts as an
     * exclusive end index rather than a length, whereas {@code prefixSearchMax} scans to
     * {@code pos + len}. The two agree only for {@code pos == 0}; confirm which meaning
     * callers rely on before unifying them.
     *
     * @param key the characters to look up
     * @param pos index of the first character to consume
     * @param len exclusive end index; {@code 0} means {@code key.length}
     * @return the stored value {@code -base - 1} of the terminal node, or {@code -1}
     *         when the key is not present
     */
    public int search(char key[], int pos, int len) {
        if (len == 0) {
            len = key.length;
        }
        final int limit = checkArray.length;
        int b = baseArray[0];
        int p;
        for (int i = pos; i < len; i++) {
            p = b + key[i] + 1;
            // A character that is not in the dictionary can point past the arrays;
            // treat that as "not found" instead of failing with an index error.
            if (p < 0 || p >= limit || b != checkArray[p]) {
                return -1;
            }
            b = baseArray[p];
        }
        p = b;
        if (p < 0 || p >= limit) {
            return -1;
        }
        int n = baseArray[p];
        if (b == checkArray[p] && n < 0) {
            return -n - 1;
        }
        return -1;
    }



    /**
     * Collects every dictionary word that is a prefix of {@code key} starting at
     * {@code pos}.
     *
     * <p>Before each character is consumed, and once more after the last one, the
     * current state is checked for a terminal entry; each hit becomes a {@code Word}
     * with {@code position = -base - 1}, {@code begin = pos} and the number of characters
     * consumed so far as {@code length}.
     *
     * <p>NOTE(review): the scan stops at {@code i < len}, so {@code len} acts as an
     * exclusive end index here, while {@code prefixSearchMax} scans to {@code pos + len}.
     * Confirm the intended meaning with callers.
     *
     * @param key the text to scan
     * @param pos index of the first character to consume
     * @param len exclusive end index of the scan
     * @return the matches in order of increasing length; empty when there are none
     */
    public List<Word> prefixSearch(char[] key, int pos, int len) {
        List<Word> result = new ArrayList<Word>();
        final int limit = checkArray.length;
        int b = baseArray[0];
        int i = pos;
        while (true) {
            // Terminal check for the state reached so far (bounds-checked so that a
            // state outside the arrays counts as "no match").
            if (b >= 0 && b < limit && checkArray[b] == b && baseArray[b] < 0) {
                Word w = new Word();
                w.position = -baseArray[b] - 1;
                w.begin = pos;
                w.length = i - pos;

                result.add(w);
            }
            if (i >= len) {
                break;
            }
            int p = b + (key[i]) + 1;
            // Characters outside the dictionary may point past the arrays.
            if (p < 0 || p >= limit || b != checkArray[p]) {
                break;
            }
            b = baseArray[p];
            i++;
        }

        return result;
    }

    /**
     * Finds the longest dictionary word that is a prefix of
     * {@code key[pos .. pos + len)}.
     *
     * <p>The same {@code Word} instance is overwritten on every terminal hit, so the
     * returned object describes the last (longest) match: {@code position = -base - 1},
     * {@code begin = pos}, {@code length} = characters consumed.
     *
     * @param key the text to scan
     * @param pos index of the first character to consume
     * @param len maximum number of characters to consume
     * @return the longest match, or {@code null} when no prefix is in the dictionary
     */
    public Word prefixSearchMax(char[] key, int pos, int len) {
        Word w = null;
        final int limit = checkArray.length;
        final int end = pos + len;
        int b = baseArray[0];
        int i = pos;
        while (true) {
            // Terminal check for the state reached so far (bounds-checked so that a
            // state outside the arrays counts as "no match").
            if (b >= 0 && b < limit && checkArray[b] == b && baseArray[b] < 0) {
                if (w == null) {
                    w = new Word();
                }
                w.position = -baseArray[b] - 1;
                w.begin = pos;
                w.length = i - pos;
            }
            if (i >= end) {
                break;
            }
            int p = b + (key[i]) + 1;
            // Characters outside the dictionary may point past the arrays.
            if (p < 0 || p >= limit || b != checkArray[p]) {
                break;
            }
            b = baseArray[p];
            i++;
        }
        return w;
    }

// Comparator for character arrays
/**
 * Orders {@code Element}s by their character arrays: the first differing character
 * decides, and when one array is a prefix of the other the shorter one sorts first.
 *
 * <p>Both arguments are cast to {@code Element}.
 */
public class CharArrayComparator<T> implements Comparator<T> {

    public int compare(T o1, T o2) {
        char[] left = ((Element) o1).getChars();
        char[] right = ((Element) o2).getChars();
        int shared = Math.min(left.length, right.length);
        int idx = 0;
        while (idx < shared) {
            int diff = left[idx] - right[idx];
            if (diff != 0) {
                return diff;
            }
            idx++;
        }
        // Equal over the shared prefix: the shorter array comes first.
        return left.length - right.length;
    }
}

// Hook for custom data processing applied before the trie data is generated.
public interface PreProcess {

    /**
     * Converts the raw word list into the elements that will be indexed.
     *
     * @param lines the raw words
     * @return the elements to build the trie from
     */
    List<Element> process(List<char[]> lines);
}

里面还用到了一些简单的数据结构，当然这些都不是必需的，只是我为了自己的业务需求而实现的特定数据结构。这个版本我已经删除了业务代码，因此可能无法直接通过编译。但是，所有已知的BUG都已经被修正了。

分享到：