生成文本聚类java实现 (3)

abc123456789cba

浏览: 619901 次
性别:
来自: 北京

最近访客更多访客>>

yumo93121

hedehuang

lims813927980

kingtsing

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

java
算法

很多网友看了我关于聚类的研究之后，后来基本上都转去研究 carrot2 了。但由于 carrot2 对中文的处理很不靠谱，所以我参考了网络上的一些资料，现在把所有代码贡献出来。

　代码的思路就是找字或者词出现的频度，并进行打分，最后按照出现次数和重要性，找出重要的语汇。现在贴出来一些可用的代码。

　ClusterBuilder.java

/**
 * Cluster builder (聚类生成器).
 *
 * @author (not given in the original post)
 * @version created 2011-3-8 14:02:36
 */
public class ClusterBuilder {  
    // Class-wide logger; assigned once in the static initializer.
    private static final Log LOG;  
    // Top-level clusters produced by cluster() (or injected through setClusters).
    private List<DocCluster> clusters;  
    // Documents to be clustered.
    private ICTHit[] docs;  
    // Maximum number of cluster levels; cluster() stops after the first level when this is <= 1.
    private int maxLevels;  
    // Per-level clustering options; cluster() uses index 0 for the first level and index 1 for sub-clusters.
    private ClusteringOptions[] options;  
    // When true, createResult() composes cluster titles from the cluster's tags.
    private boolean useTagsAsTitle;  
    // Exclusion string checked against every tag in createLevelClusters().
    private String wordsExcluded;  
    // Lookup table: bit1Table[v] is the number of 1 bits in the 16-bit value v.
    private static short[] bit1Table;  
  
    static {  
        LOG = LogFactory.getLog(ClusterBuilder.class.getName());  
  
        bit1Table = new short[65536];  
  
        for (int n = 0; n < bit1Table.length; n++) {  
            String s = Integer.toBinaryString(n);  
            short m = 0;  
            for (int k = 0; k < s.length(); k++) {  
                if (s.charAt(k) == '1') {  
                    m = (short) (m + 1);  
                }  
            }  
            bit1Table[n] = m;  
        }  
    }  
  
    private static int getValidBitCount(long n) {  
        int i3 = (int) (n % 65536L);  
        n /= 65536L;  
        int i2 = (int) (n % 65536L);  
        n /= 65536L;  
        int i1 = (int) (n % 65536L);  
        n /= 65536L;  
        int i0 = (int) (n % 65536L);  
        return bit1Table[i0] + bit1Table[i1] + bit1Table[i2] + bit1Table[i3];  
    }  
  
    private static int getDocHitCount(long[] hits) {  
        assert (hits != null);  
        if (hits == null)  
            return 0;  
        int n0 = 0;  
        for (int i = 0; i < hits.length; i++) {  
            n0 += getValidBitCount(hits[i]);  
        }  
        return n0;  
    }  
  
    public ClusterBuilder() {  
        for (int n = 0; n < bit1Table.length; n++)  
        {  
            String s = Integer.toBinaryString(n);  
            short m = 0;  
            for (int k = 0; k < s.length(); k++)  
            {  
                if (s.getBytes()[k] == '1')  
                {  
                    m = (short)(m + 1);  
                }  
            }  
            bit1Table[n] = m;  
        }  
    }  
    /** 
     *  
     * @param docsToCluster 要聚类的记录列表 
     * @param exWords 不使用的主题词列表，多个词用西文逗号分隔。这些词将不会作为主题词。 
     * @param maxLevels 最大聚类级数 
     * @param useTagsAsTitle 是否使用主题词作为类别主题词。如果不使用，则根据文档标题自动生成类别主题词。 
     */  
    public ClusterBuilder(ICTHit[] docsToCluster, String exWords, int maxLevels, boolean useTagsAsTitle) {  
        this.useTagsAsTitle = useTagsAsTitle;  
        this.wordsExcluded = exWords;  
        this.maxLevels = maxLevels;  
        this.docs = docsToCluster;  
        this.options = new ClusteringOptions[3];  
        this.options[0] = new ClusteringOptions();  
        this.options[0].setDocMaxTagCount(10);  
        this.options[0].setMinTagRelevance(60);  
        this.options[0].setMinSameDocPercent(80);  
  
        this.options[1] = new ClusteringOptions();  
        this.options[1].setDocMaxTagCount(8);  
        this.options[1].setMinTagRelevance(85);  
        this.options[1].setMinSameDocPercent(70);  
        this.options[1].setTagMinDocCount(2);  
        this.options[1].setMinSameDocs(2);  
  
        this.options[2] = new ClusteringOptions();  
        this.options[2].setDocMaxTagCount(8);  
        this.options[2].setMinTagRelevance(50);  
        this.options[2].setMinSameDocPercent(70);  
        this.options[2].setTagMinDocCount(2);  
        this.options[2].setMinSameDocs(2);  
    }  
    /** 
     * 对Docs记录列表执行聚类，结果存放于Clusters中 
     */  
    public void cluster() {  
        this.clusters = createLevelClusters(docs, 0, options[0]);  
        List subs = null;  
        if (this.maxLevels <= 1) {  
            return;  
        }  
        for (DocCluster dc : this.clusters) {  
            if ((dc.getDocList().length < options[0].getMinDocsToCluster()) || (dc.getTags() == "其他"))  
                continue;  
            subs = createLevelClusters(dc.getDocList(), 1, options[1]);  
            if (subs.size() > 1)  
                dc.setSubclusters(subs);  
        }  
    }  
    /** 
     * 创建一个层级的聚类 
     * @param docs 文档列表 
     * @param level 层级号 
     * @param levelOpt 该层级的聚类选项 
     * @return 
     */  
    private List<DocCluster> createLevelClusters(ICTHit[] docs, int level, ClusteringOptions levelOpt) {  
        TagHitMatrix matrix = new TagHitMatrix(docs.length, levelOpt.getDocMaxTagCount());  
        List clusters = new ArrayList();  
        int i, ValidTagCount;  
        int DocCount = 0;  
        // 扫描文档列表，根据每个文档的主题词列表，初始化主题词文档对照表。  
        for (i = 0; i < docs.length; i++) {  
            ICTHit d = docs[i];  
            int validTagCount = 0;  
            if (d.getTagList() != null) {  
                String[] tagList = d.getTagList();  
                for (int tagIdx = 0; (tagIdx < tagList.length) && (validTagCount < levelOpt.getDocMaxTagCount()); tagIdx++) {  
                    String tag = tagList[tagIdx].trim();  
                     // 主题词长度大于6个字的丢弃  
                    if ((tag.length() <= 0)  
                            || (tag.length() > 20)  
                            || ((this.wordsExcluded.length() != 0) && ((tag.contains(this.wordsExcluded)) || (this.wordsExcluded  
                                    .contains(tag)))))  
                        continue;  
                    matrix.AddDocHit(tag, i);  
                    validTagCount++;  
                }  
            }  
  
        }  
  
        int maxKwDocCount = 0;  
        List entryListToRemove = new ArrayList();  
        String kwWithMaxDocCount = "";  
        LOG.debug("有效关键词：");  
        for (Map.Entry entry : matrix.entrySet()) {  
            // 统计当前主题词的命中文档数，文档数小于预设值，则该主题词将被删除  
            int n = getDocHitCount((long[]) entry.getValue());  
            if (n < levelOpt.getTagMinDocCount()) {  
                entryListToRemove.add((String) entry.getKey());  
            } else {  
                LOG.debug((String) entry.getKey() + "(" + n + "), ");  
  
                DocCount += n;  
            }  
            if (n > maxKwDocCount) {  
                maxKwDocCount = n;  
                kwWithMaxDocCount = (String) entry.getKey();  
            }  
        }  
        LOG.debug("");  
  
        LOG.debug("被忽略的关键词：");  
  
        for (i = 0; i < entryListToRemove.size(); i++) {  
            LOG.debug((String) entryListToRemove.get(i) + ", ");  
            matrix.remove(entryListToRemove.get(i));  
        }  
  
        LOG.debug("");  
  
        LOG.debug(entryListToRemove.size() + "个关键词被忽略。剩余" + matrix.size() + "个关键词。");  
  
        LOG.debug("最大文档数的关键词：" + kwWithMaxDocCount + "，文档数：" + maxKwDocCount + "。");  
  
        double docCountPerTag = matrix.size() > 0 ? DocCount / matrix.size() : 0.0D;  
        LOG.debug("关键词平均文档数：" + docCountPerTag);  
  
        levelOpt.setMinSameDocs((int) (docCountPerTag / (2.0D + level)));  
        if (levelOpt.getMinSameDocs() < 1) {  
            levelOpt.setMinSameDocs(1);  
        }  
  
        while (mergeClusters(matrix, levelOpt) > 0) {  
        }  
        return createResult(matrix, docs, level, levelOpt);  
    }  
  
    private int mergeClusters(TagHitMatrix matrix, ClusteringOptions opt) {  
        if (matrix.size() == 0)  
            return 0;  
        long[] docHitsMerged = (long[]) null;  
        long[] maxDocHitsMerged = (long[]) null;  
        String word1 = "";  
        String word2 = "";  
        String word1ToMerge = "";  
        String word2ToMerge = "";  
        int i,j;  
        int sameDocs = 0;  
        // 初始化一个相关度数组，0到100分，共101项  
        List rankMatrix = new ArrayList();  
        for (i = 0; i < 101; i++) {  
            rankMatrix.add(new ArrayList());  
        }  
        List matrix2List = new ArrayList();  
        matrix2List.addAll(matrix.entrySet());  
        // 将主题词文档映射表中的主题词两两比对  
        for (int i1 = 0; i1 < matrix2List.size() - 1; i1++) {  
            Map.Entry hits1 = (Map.Entry) matrix2List.get(i1);  
            word1 = (String) hits1.getKey();  
            for (int i2 = i1 + 1; i2 < matrix2List.size(); i2++) {  
                Map.Entry hits2 = (Map.Entry) matrix2List.get(i2);  
                word2 = (String) hits2.getKey();  
                Object[] re = getWordsRelevance(mapEntry2TagHitEntry(hits1), mapEntry2TagHitEntry(hits2),  
                        docHitsMerged, sameDocs, opt, matrix.hitsItemCount);  
                // 计算两个词的相关性，获取两词的文档汇总表，以及相同文档数  
                int nRank = ((Integer) re[0]).intValue();  
                docHitsMerged = (long[]) re[1];  
                sameDocs = ((Integer) re[2]).intValue();  
                // 相关度小于预设阈值的忽略  
                if (nRank >= opt.getMinTagRelevance()) {  
                    ((List) rankMatrix.get(nRank)).add(new IdPair(i1, i2));  
                }  
  
            }  
  
        }  
  
        List tagListToRemove = new ArrayList();  
        List entryListMerged = new ArrayList();  
        entryListMerged.add(new TagHitEntry("", null));  
        HashSet idPairTable = new HashSet();  
        TagHitEntry entryToMerge1;  
        while (true) {  
            // 找到最大相关性的两个主题词  
            for (i = 100; (i >= opt.getMinTagRelevance()) && (((List) rankMatrix.get(i)).size() == 0); i--){};  
            if (i < opt.getMinTagRelevance()) {  
                break;  
            }  
            IdPair ip = (IdPair) ((List) rankMatrix.get(i)).get(0);  
            // 合并两个类别  
            ((List) rankMatrix.get(i)).remove(0);  
              
            entryToMerge1 = ip.Id1 >= 0 ? mapEntry2TagHitEntry((Map.Entry) matrix2List.get(ip.Id1))  
                    : (TagHitEntry) entryListMerged.get(-ip.Id1);  
            TagHitEntry entryToMerge2 = ip.Id2 >= 0 ? mapEntry2TagHitEntry((Map.Entry) matrix2List.get(ip.Id2))  
                    : (TagHitEntry) entryListMerged.get(-ip.Id2);  
            word1ToMerge = entryToMerge1.key;  
            word2ToMerge = entryToMerge2.key;  
            assert ((word1ToMerge.length() > 0) && (word2ToMerge.length() > 0));  
  
            String wordsMerged = word1ToMerge + "," + word2ToMerge;  
            long[] lDocs0 = entryToMerge1.value;  
            long[] lDocs1 = entryToMerge2.value;  
            maxDocHitsMerged = new long[matrix.hitsItemCount];  
            for (i = 0; i < lDocs0.length; i++) {  
                lDocs0[i] |= lDocs1[i];// 获取合并的文档集  
            }  
            if (ip.Id1 >= 0)  
                tagListToRemove.add(word1ToMerge);  
            else  
                entryListMerged.set(-ip.Id1, new TagHitEntry("", null));  
            if (ip.Id2 >= 0)  
                tagListToRemove.add(word2ToMerge);  
            else {  
                entryListMerged.set(-ip.Id2, new TagHitEntry("", null));  
            }  
            entryListMerged.add(new TagHitEntry(wordsMerged, maxDocHitsMerged));  
            // 替换与合并主题词有关联的其他相关主题词对的评分  
            int idMerged = -(entryListMerged.size() - 1);  
            int id2 = 0;  
  
            boolean CanDelete = false;  
  
            for (i = 0; i <= 100; i++) {  
                int ListCount = ((List) rankMatrix.get(i)).size();  
                if (ListCount == 0) {  
                    continue;  
                }  
  
                for (j = 0; j < ListCount; j++) {  
                    IdPair p = (IdPair) ((List) rankMatrix.get(i)).get(j);  
                    CanDelete = false;  
                    if ((ip.Id1 == p.Id1) || (ip.Id2 == p.Id1)) {  
                        id2 = p.Id2;  
                        CanDelete = true;  
                    } else if ((ip.Id1 == p.Id2) || (ip.Id2 == p.Id2)) {  
                        id2 = p.Id1;  
                        CanDelete = true;  
                    }  
                    if (!CanDelete)  
                        continue;  
                    if (idMerged == id2) {  
                        continue;  
                    }  
  
                    ((List) rankMatrix.get(i)).remove(j);  
                    j--;  
                    ListCount--;  
  
                    IdPair pairMerged = new IdPair(idMerged, id2);  
                    if (idPairTable.contains(pairMerged)) {  
                        continue;  
                    }  
  
                    TagHitEntry e2 = id2 >= 0 ? mapEntry2TagHitEntry((Map.Entry) matrix2List.get(id2))  
                            : (TagHitEntry) entryListMerged.get(-id2);  
  
                    assert ((e2.key.length() != 0) && (e2.key != wordsMerged));  
  
                    Object[] re = getWordsRelevance(new TagHitEntry(wordsMerged, maxDocHitsMerged), e2, docHitsMerged,  
                            sameDocs, opt, matrix.hitsItemCount);  
                    int rank = ((Integer) re[0]).intValue();  
                    docHitsMerged = (long[]) re[1];  
                    sameDocs = ((Integer) re[2]).intValue();  
  
                    if (rank <= opt.getMinTagRelevance())  
                        continue;  
                    ((List) rankMatrix.get(rank)).add(pairMerged);  
                    idPairTable.add(pairMerged);  
                }  
  
            }  
  
        }  
        // 删除被合并的主题词  
        for (int m =0;m<tagListToRemove.size();m++){  
            matrix.remove(tagListToRemove.get(m));  
        }  
        /** 
        for (String w : tagListToRemove) 
            matrix.remove(w); 
        **/   
        // 添加合并而成的新主题词  
        for (int n=0;n<entryListMerged.size();n++){  
            TagHitEntry e = (TagHitEntry) entryListMerged.get(n);  
            matrix.put(e.getKey(), e.getValue());  
        }  
        /** 
        for (TagHitEntry e : entryListMerged) { 
            if (e.getKey().length() > 0) 
                matrix.put(e.getKey(), e.getValue()); 
        } 
        **/  
        return 0;  
    }  
  
    private int mergeClusters1(TagHitMatrix matrix, ClusteringOptions opt) {  
        if (matrix.size() == 0)  
            return 0;  
        long[] docHitsMerged = (long[]) null;  
        long[] maxDocHitsMerged = (long[]) null;  
        int nMaxRank = 0;  
        String word1 = "";  
        String word2 = "";  
        String word1ToMerge = "";  
        String word2ToMerge = "";  
        int sameDocs = 0;  
  
        List matrix2List = new ArrayList();  
        matrix2List.addAll(matrix.entrySet());  
  
        for (int i1 = 0; i1 < matrix2List.size() - 1; i1++) {  
            TagHitEntry hits1 = mapEntry2TagHitEntry((Map.Entry) matrix2List.get(i1));  
            word1 = hits1.getKey();  
            for (int i2 = i1 + 1; i2 < matrix2List.size(); i2++) {  
                TagHitEntry hits2 = mapEntry2TagHitEntry((Map.Entry) matrix2List.get(i2));  
                word2 = hits2.getKey();  
                Object[] re = getWordsRelevance(hits1, hits2, docHitsMerged, sameDocs, opt, matrix.hitsItemCount);  
                int nRank = ((Integer) re[0]).intValue();  
                docHitsMerged = (long[]) re[1];  
                sameDocs = ((Integer) re[2]).intValue();  
  
                if ((nRank <= nMaxRank) || (nRank <= opt.getMinTagRelevance()))  
                    continue;  
                nMaxRank = nRank;  
                maxDocHitsMerged = docHitsMerged;  
                word1ToMerge = word1;  
                word2ToMerge = word2;  
            }  
  
        }  
  
        if ((word1ToMerge.length() == 0) || (word2ToMerge.length() == 0)) {  
            return 0;  
        }  
  
        String wordsMerged = word1ToMerge + "," + word2ToMerge;  
        if ((nMaxRank > opt.getMinTagRelevance()) && (wordsMerged != "")) {  
            matrix.remove(word1ToMerge);  
            matrix.remove(word2ToMerge);  
            matrix.put(wordsMerged, maxDocHitsMerged);  
            LOG.debug("(" + word1ToMerge + ") - (" + word2ToMerge + ")");  
  
            return 1;  
        }  
  
        return 0;  
    }  
  
    private Object[] getWordsRelevance(TagHitEntry entry1, TagHitEntry entry2, long[] docHitsMerged, int sameDocCount,  
            ClusteringOptions opt, int hitsItemCount) {  
        Object[] re = new Object[3];  
        docHitsMerged = new long[hitsItemCount];  
        sameDocCount = 0;  
  
        String tag1 = entry1.getKey();  
        String tag2 = entry2.getKey();  
        assert (tag2 != tag1);  
  
        long[] lDocs0 = entry1.getValue();  
        long[] lDocs1 = entry2.getValue();  
        int n0 = 0;  
        int n1 = 0;  
        n0 = getDocHitCount(lDocs0);  
        n1 = getDocHitCount(lDocs1);  
        int docCountMin = Math.min(n0, n1);  
        int docCountMax = Math.max(n0, n1);  
        int docCountMerged = 0;  
  
        long sameDocBits = 0L;  
        long diffDocBits = 0L;  
        int diffDocCount = 0;  
        for (int i = 0; i < lDocs0.length; i++) {  
            docHitsMerged[i] = lDocs0[i] | lDocs1[i];// 获取合并的文档集  
            docCountMerged += getValidBitCount(docHitsMerged[i]);  
            diffDocBits = lDocs0[i] ^ lDocs1[i];// 获取不同的文档集  
            diffDocCount += getValidBitCount(diffDocBits);  
            sameDocBits = lDocs0[i] & lDocs1[i];// 获取相同的文档集  
            sameDocCount += getValidBitCount(sameDocBits);  
        }  
  
        boolean IsSubstring = false;  
        // 一个主题词是另一个的子串，则得分较高  
        if ((tag2.contains(tag1)) || (tag1.contains(tag2))) {  
            IsSubstring = true;  
            docCountMin += opt.getTagMinDocCount();  
        }  
  
        if ((sameDocCount == 0) && (!IsSubstring)) {  
            re[0] = Integer.valueOf(0);  
            re[1] = docHitsMerged;  
            re[2] = Integer.valueOf(sameDocCount);  
            return re;  
        }  
  
        if (docCountMin < opt.getTagMinDocCount()) {  
            re[0] = Integer.valueOf(0);  
            re[1] = docHitsMerged;  
            re[2] = Integer.valueOf(sameDocCount);  
            return re;  
        }  
  
        int samePercent = (int) Math.round(sameDocCount * 100.0D / docCountMerged);  
        int samePercentMin = (int) Math.round(sameDocCount * 100.0D / docCountMin);  
        int diffPercent = (int) Math.round(diffDocCount * 100.0D / docCountMerged);  
        LOG.debug("相关性：" + tag1 + "(" + n0 + ")-(" + n1 + ")" + tag2);  
        LOG.debug(", SamePercent=" + samePercent);  
        LOG.debug(", SamePercentMin=" + samePercentMin);  
        LOG.debug(", DiffPercent=" + diffPercent);  
        int nRank;  
        if ((sameDocCount >= opt.getMinSameDocs())  
                && ((docCountMin < 10) || (samePercentMin >= opt.getMinSameDocPercent()))) {  
            nRank = (int) Math.round((samePercentMin + samePercent) * 0.85D - diffPercent * 0.2D);  
        } else {  
            nRank = 0;  
        }  
        if (IsSubstring)  
            nRank += 80;  
        LOG.debug(", Rank=" + nRank);  
  
        re[0] = Integer.valueOf(Math.min(nRank, 100));  
        re[1] = docHitsMerged;  
        re[2] = Integer.valueOf(sameDocCount);  
        return re;  
    }  
  
    private TagHitEntry mapEntry2TagHitEntry(Map.Entry<String, long[]> e) {  
        return new TagHitEntry((String) e.getKey(), (long[]) e.getValue());  
    }  
  
    @SuppressWarnings("unchecked")  
    private List<DocCluster> createResult(TagHitMatrix matrix, ICTHit[] docs, int level, ClusteringOptions opt) {  
        int i,j;  
        Map<String,DocValue> clsIdList = new HashMap();  
        List ClassTitleList = new ArrayList();  
        for (Map.Entry de : matrix.entrySet()) {  
            DocValue dv = new DocValue();  
            clsIdList.put((String) de.getKey(), dv);  
        }  
  
        List<Integer> otherIdList = new ArrayList();  
        TagHitEntry maxTagHitEntry = new TagHitEntry();  
        int clsCount;  
        String tag;  
        // 确定每个文档所属的类别  
        for (i = 0; i < docs.length; i++) {  
            ICTHit d = docs[i];  
            TagHitMatrix.ClusterDocInfo di = matrix.docs[i];  
            assert (docs[i] != null);  
            int maxTagHit = 0;  
            clsCount = 0;  
  
            for (Map.Entry hits : matrix.entrySet()) {  
                int tagHitCount = 0;  
                int score = 0;  
                String clsWordListStr = "," + (String) hits.getKey() + ",";  
                // 那个类别包含当前文档的主题词最多，该文档就属于哪个类别  
                for (j = 0; j < di.TagCount; j++) {  
                    tag = di.TagList[j];  
                    score = j < 3 ? 2 : 1;  
                    assert (tag.length() > 0);  
                    if (!clsWordListStr.contains("," + tag + ","))  
                        continue;  
                    tagHitCount += score;  
                    clsCount++;  
                }  
  
                if (maxTagHit >= tagHitCount)  
                    continue;  
                maxTagHit = tagHitCount;  
                maxTagHitEntry = mapEntry2TagHitEntry(hits);  
            }  
  
            if (maxTagHit > 0) {  
                DocValue dv = (DocValue) clsIdList.get(maxTagHitEntry.getKey());  
                dv.idList.add(Integer.valueOf(i));  
            } else {  
                otherIdList.add(Integer.valueOf(i));  
            }  
  
        }  
        // 生成类别列表  
        List<DocCluster> clusterList = new ArrayList();  
        String[] TagList;  
        Object dc;  
        for (Map.Entry<String,DocValue> kv : clsIdList.entrySet()) {  
            DocValue dv = (DocValue) kv.getValue();  
            if (dv.idList.size() <= 0)  
                continue;  
            if (dv.idList.size() == 1) {  
                otherIdList.add((Integer) dv.idList.get(0));  
            } else {  
                dc = new DocCluster();  
                ((DocCluster) dc).setDocIdList(new String[dv.idList.size()]);  
                ((DocCluster) dc).setDocList(new ICTHit[dv.idList.size()]);  
                for (i = 0; i < dv.idList.size(); i++) {  
                    ((DocCluster) dc).getDocIdList()[i] = docs[((Integer) dv.idList.get(i)).intValue()].getDocId();  
                    ((DocCluster) dc).getDocList()[i] = docs[((Integer) dv.idList.get(i)).intValue()];  
                }  
                ((DocCluster) dc).setLevel(level);  
                ((DocCluster) dc).setTags((String) kv.getKey());  
  
                for (i = 0; (i < clusterList.size())  
                        && (((DocCluster) dc).getDocIdList().length <= ((DocCluster) clusterList.get(i)).getDocIdList().length);) {  
                    i++;  
                }  
                clusterList.add(i, (DocCluster) dc);  
            }  
        }  
        for (i = opt.getMaxClusterCount(); i < clusterList.size();) {  
            DocCluster c = (DocCluster) clusterList.get(i);  
            List idList = ((DocValue) clsIdList.get(c.getTags())).idList;  
            for (dc = idList.iterator(); ((Iterator) dc).hasNext();) {  
                int idx = ((Integer) ((Iterator) dc).next()).intValue();  
                otherIdList.add(Integer.valueOf(idx));  
            }  
            clusterList.remove(i);  
        }  
        int i1;  
        for (i = 0; i < clusterList.size(); i++) {  
            DocCluster dc1 = (DocCluster) clusterList.get(i);  
            String[] tagList = dc1.getTags().split(",");  
            String newTags = "";  
  
            for (j = 0; j < tagList.length; j++) {  
                i1 = dc1.getTags().indexOf(tagList[j]);  
                int i2 = dc1.getTags().lastIndexOf(tagList[j]);  
                if (i1 == i2)  
                    newTags = newTags + tagList[j] + ",";  
            }  
            if ((newTags.trim().length() > 0) && (newTags.endsWith(","))) {  
                newTags = newTags.substring(0, newTags.length() - 1);  
            }  
            dc1.setTags(newTags);  
  
            dc1.setTitle("");  
  
            if (this.useTagsAsTitle) {  
                tagList = dc1.getTags().split(",");  
                for (j = 0; (tagList != null) && (j < tagList.length); j++) {  
                    if ((dc1.getTitle() + tagList[j]).length() > 16)  
                        break;  
                    boolean isSubstr = false;  
                    for (DocCluster c : clusterList) {  
                        if ((c.getTitle().length() <= 0)  
                                || ((!c.getTitle().contains(tagList[j])) && (!tagList[j].contains(c.getTitle()))))  
                            continue;  
                        isSubstr = true;  
                        break;  
                    }  
                    if (!isSubstr)  
                        dc1.setTitle(dc1.getTitle() + tagList[j] + ",");  
                }  
                if ((dc1.getTitle().trim().length() > 0) && (dc1.getTitle().endsWith(","))) {  
                    dc1.setTitle(dc1.getTitle().substring(0, dc1.getTitle().length() - 1));  
                }  
  
            }  
  
            if (dc1.getTitle() != "")  
                continue;  
            dc1.setTitle(dc1.getTags());  
            if (dc1.getTitle().length() <= 16)  
                continue;  
            String s = dc1.getTitle().substring(0, 16);  
            int li = s.lastIndexOf(',');  
            if (li > 0) {  
                dc1.setTitle(s.substring(0, li));  
            }  
  
        }  
  
        if (otherIdList.size() > 0) {  
            DocCluster clusterOther = new DocCluster();  
            clusterOther.setDocIdList(new String[otherIdList.size()]);  
            clusterOther.setDocList(new ICTHit[otherIdList.size()]);  
            clusterOther.setLevel(level);  
            clusterOther.setTitle("其他");  
            clusterOther.setTags("其他");  
            i = 0;  
            for (int k=0;k<otherIdList.size();k++) {  
                int idx = otherIdList.get(k);  
  
                clusterOther.getDocIdList()[i] = docs[idx].getDocId();  
                clusterOther.getDocList()[i] = docs[idx];  
                i++;  
            }  
            clusterList.add(clusterOther);  
        }  
  
        return (List<DocCluster>) clusterList;  
    }  
  
    public List<DocCluster> getClusters() {  
        return this.clusters;  
    }  
  
    public void setClusters(List<DocCluster> clusters) {  
        this.clusters = clusters;  
    }  
  
    public ICTHit[] getDocs() {  
        return this.docs;  
    }  
  
    public void setDocs(ICTHit[] docs) {  
        this.docs = docs;  
    }  
  
    public int getMaxLevels() {  
        return this.maxLevels;  
    }  
  
    public void setMaxLevels(int maxLevels) {  
        this.maxLevels = maxLevels;  
    }  
  
    public ClusteringOptions[] getOptions() {  
        return this.options;  
    }  
  
    public void setOptions(ClusteringOptions[] options) {  
        this.options = options;  
    }  
  
    public boolean isUseTagsAsTitle() {  
        return this.useTagsAsTitle;  
    }  
  
    public void setUseTagsAsTitle(boolean useTagsAsTitle) {  
        this.useTagsAsTitle = useTagsAsTitle;  
    }  
  
    public String getWordsExcluded() {  
        return this.wordsExcluded;  
    }  
  
    public void setWordsExcluded(String wordsExcluded) {  
        this.wordsExcluded = wordsExcluded;  
    }  
  
    private class DocValue {  
        public List<Integer> idList = new ArrayList();  
        public String titleListStr = "";  
  
        private DocValue() {  
        }  
    }  
    /** 
     * 主题词ID对，主题词ID为该主题词在主题词文档映射表中的主键位置。 
    * @author  
    * @version 创建时间：2011-3-9 下午02:52:44 
     */  
    private class IdPair {  
        public int Id1;  
        public int Id2;  
  
        public IdPair(int id1, int id2) {  
            assert (id1 != id2);  
            if (id1 < id2) {  
                this.Id1 = id1;  
                this.Id2 = id2;  
            } else {  
                this.Id1 = id2;  
                this.Id2 = id1;  
            }  
        }  
  
        public int hashCode() {  
            return -1;  
        }  
  
        public boolean equals(Object o) {  
            return (((IdPair) o).Id1 == this.Id1) && (((IdPair) o).Id2 == this.Id2);  
        }  
    }  
  
    public static class TagHitEntry {  
        public String key;  
        public long[] value;  
  
        public TagHitEntry() {  
        }  
  
        public TagHitEntry(String k, long[] v) {  
            this.key = k;  
            this.value = v;  
        }  
  
        public String getKey() {  
            return this.key;  
        }  
  
        public long[] getValue() {  
            return this.value;  
        }  
    }  
}  

ClusteringOptions.java

Java代码  
/**
 * Tunable settings for one clustering level.
 *
 * <p>Every new instance starts with the current values of the public static
 * {@code Def*} fields; each setting can then be changed through its setter.
 *
 * @version created 2011-3-8 10:23:27
 */
public class ClusteringOptions {
    /** Initial value of {@code maxClusterCount}. */
    public static int DefMaxClusterCount = 20;
    /** Initial value of {@code docMaxTagCount}. */
    public static int DefMaxKeywordCount = 6;
    /** Initial value of {@code minTagRelevance}. */
    public static int DefMinWordsRelevance = 10;
    /** Initial value of {@code tagMinDocCount}. */
    public static int DefTagMinDocCount = 3;
    /** Initial value of {@code minSameDocs}. */
    public static int DefIgnoreSameDocs = 2;
    /** Initial value of {@code minSameDocPercent}. */
    public static int DefSameDocPercent = 50;
    /** Initial value of {@code minDocsToCluster}. */
    public static int DefMinDocsToCluster = 8;

    // The initializers run for every new instance, so they pick up whatever the
    // static defaults hold at construction time.
    private int docMaxTagCount = DefMaxKeywordCount;
    private int maxClusterCount = DefMaxClusterCount;
    private int minDocsToCluster = DefMinDocsToCluster;
    private int minSameDocPercent = DefSameDocPercent;
    private int minSameDocs = DefIgnoreSameDocs;
    private int minTagRelevance = DefMinWordsRelevance;
    private int tagMinDocCount = DefTagMinDocCount;

    /** Creates options populated from the static defaults. */
    public ClusteringOptions() {
    }

    /** @return maximum number of tags taken from one document */
    public int getDocMaxTagCount() {
        return docMaxTagCount;
    }

    /** @param docMaxTagCount maximum number of tags taken from one document */
    public void setDocMaxTagCount(int docMaxTagCount) {
        this.docMaxTagCount = docMaxTagCount;
    }

    /** @return maximum number of clusters */
    public int getMaxClusterCount() {
        return maxClusterCount;
    }

    /** @param maxClusterCount maximum number of clusters */
    public void setMaxClusterCount(int maxClusterCount) {
        this.maxClusterCount = maxClusterCount;
    }

    /** @return minimum number of documents a cluster needs to be clustered further */
    public int getMinDocsToCluster() {
        return minDocsToCluster;
    }

    /** @param minDocsToCluster minimum number of documents a cluster needs to be clustered further */
    public void setMinDocsToCluster(int minDocsToCluster) {
        this.minDocsToCluster = minDocsToCluster;
    }

    /** @return minimum percentage of shared documents */
    public int getMinSameDocPercent() {
        return minSameDocPercent;
    }

    /** @param minSameDocPercent minimum percentage of shared documents */
    public void setMinSameDocPercent(int minSameDocPercent) {
        this.minSameDocPercent = minSameDocPercent;
    }

    /** @return minimum number of shared documents */
    public int getMinSameDocs() {
        return minSameDocs;
    }

    /** @param minSameDocs minimum number of shared documents */
    public void setMinSameDocs(int minSameDocs) {
        this.minSameDocs = minSameDocs;
    }

    /** @return minimum relevance two tags need to be merged */
    public int getMinTagRelevance() {
        return minTagRelevance;
    }

    /** @param minTagRelevance minimum relevance two tags need to be merged */
    public void setMinTagRelevance(int minTagRelevance) {
        this.minTagRelevance = minTagRelevance;
    }

    /** @return minimum number of documents a tag must hit */
    public int getTagMinDocCount() {
        return tagMinDocCount;
    }

    /** @param tagMinDocCount minimum number of documents a tag must hit */
    public void setTagMinDocCount(int tagMinDocCount) {
        this.tagMinDocCount = tagMinDocCount;
    }
}

DocCluster.java

Java代码  
/** 
 *  
* @author 
* @version 创建时间：2011-3-8 上午10:23:35 
 */  
public class DocCluster {  
    private String[] docIdList;  
    private ICTHit[] docList;  
    private int level;  
    private List<DocCluster> subclusters;  
    private String tags;  
    private String title;  
  
    public String[] getDocIdList() {  
        return this.docIdList;  
    }  
  
    public void setDocIdList(String[] docIdList) {  
        this.docIdList = docIdList;  
    }  
  
    public ICTHit[] getDocList() {  
        return this.docList;  
    }  
  
    public void setDocList(ICTHit[] docList) {  
        this.docList = docList;  
    }  
  
    public int getLevel() {  
        return level;  
    }  
  
    public void setLevel(int level) {  
        this.level = level;  
    }  
  
    public List<DocCluster> getSubclusters() {  
        return this.subclusters;  
    }  
  
    public void setSubclusters(List<DocCluster> subclusters) {  
        this.subclusters = subclusters;  
    }  
  
    public String getTags() {  
        return this.tags;  
    }  
  
    public void setTags(String tags) {  
        this.tags = tags;  
    }  
  
    public String getTitle() {  
        if (title == null)  
            title = "";  
        return this.title;  
    }  
  
    public void setTitle(String title) {  
        this.title = title;  
    }  
}  

ICTHit.java

Java代码  
/**
 * Serializable data holder for one document: its id, its title and its keyword array.
 * All properties are {@code null} until set; the setters store the given reference as is.
 */
public class ICTHit implements Serializable {
    /** Keyword array. */
    private String[] TagList;
    private String docId;
    private String title;

    /** @return the document id, or {@code null} if never set */
    public String getDocId() {
        return this.docId;
    }

    /** @param docId the document id */
    public void setDocId(String docId) {
        this.docId = docId;
    }

    /** @return the document title, or {@code null} if never set */
    public String getTitle() {
        return this.title;
    }

    /** @param title the document title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the keyword array itself (not a copy), or {@code null} if never set */
    public String[] getTagList() {
        return this.TagList;
    }

    /** @param tagList the keyword array; the reference is stored without copying */
    public void setTagList(String[] tagList) {
        this.TagList = tagList;
    }
}

TagHitMatrix.java

Java代码  
/**
 * Bit matrix recording which documents contain which tag.
 *
 * <p>Each map key is a trimmed tag string. Its value is a {@code long[]} of
 * {@link #hitsItemCount} words in which document position {@code p} is represented by
 * bit {@code p % 62} of word {@code p / 62}. Only the low 62 bits of a word are ever
 * set, so a word is never negative.
 * NOTE(review): the 62-bit packing is presumably required by the bit counting in
 * ClusterBuilder, which splits each word with signed {@code %} and {@code /} - confirm
 * before changing the word width.
 */
public class TagHitMatrix extends LinkedHashMap<String, long[]> {
    private static final long serialVersionUID = -7511464445378974433L;

    /** Number of document positions packed into one {@code long} word. */
    private static final int DOCS_PER_WORD = 62;

    /** Not read or written anywhere in this class; kept because it is public. */
    public static int ii = 0;
    /** Per-document tag lists, indexed by document position. */
    public ClusterDocInfo[] docs;
    /** Length of every {@code long[]} this class stores in the map. */
    public int hitsItemCount;

    /**
     * Creates an empty matrix for a fixed number of documents.
     *
     * @param DocCount    number of document positions; must not be negative
     * @param MaxTagCount capacity of each document's tag list
     * @throws NegativeArraySizeException if either argument is negative
     */
    public TagHitMatrix(int DocCount, int MaxTagCount) {
        // Integer ceiling of DocCount / 62, written so it cannot overflow.
        this.hitsItemCount = DocCount / DOCS_PER_WORD + (DocCount % DOCS_PER_WORD == 0 ? 0 : 1);
        this.docs = new ClusterDocInfo[DocCount];

        for (int i = 0; i < this.docs.length; i++) {
            this.docs[i] = new ClusterDocInfo(MaxTagCount);
        }
    }

    /**
     * Records that the document at {@code Position} carries the tag {@code TagStr}:
     * sets the document's bit in the tag's row (creating the row on first use) and
     * appends the trimmed tag to the document's tag list.
     *
     * <p>All checks run before anything is modified, so a rejected call leaves the
     * matrix exactly as it was.
     *
     * @param TagStr   the tag; surrounding whitespace is trimmed before use
     * @param Position zero-based document position, less than {@code docs.length}
     * @throws NullPointerException           if {@code TagStr} is {@code null}
     * @throws ArrayIndexOutOfBoundsException if {@code Position} is out of range, or the
     *                                        document's tag list is already full
     */
    public void AddDocHit(String TagStr, int Position) {
        TagStr = TagStr.trim();

        if (Position < 0 || Position >= this.docs.length) {
            throw new ArrayIndexOutOfBoundsException(
                    "Position " + Position + " out of range for " + this.docs.length + " documents");
        }
        ClusterDocInfo di = this.docs[Position];
        if (di.TagCount >= di.TagList.length) {
            throw new ArrayIndexOutOfBoundsException(
                    "Document " + Position + " already holds " + di.TagList.length + " tags");
        }

        long[] DocHits = get(TagStr);
        if (DocHits == null) {
            DocHits = new long[this.hitsItemCount];
            put(TagStr, DocHits);
        }
        // Exact integer mask; the shift distance is always in 0..61.
        DocHits[Position / DOCS_PER_WORD] |= 1L << (Position % DOCS_PER_WORD);
        di.TagList[di.TagCount++] = TagStr;
    }

    /** Tags recorded for a single document. */
    class ClusterDocInfo {
        /** Tag storage; only the first {@link #TagCount} entries are filled. */
        public String[] TagList;
        /** Number of tags stored so far. */
        public int TagCount;

        /**
         * @param MaxTagCount capacity of the tag list
         */
        public ClusterDocInfo(int MaxTagCount) {
            this.TagList = new String[MaxTagCount];
            this.TagCount = 0;
        }
    }
}

测试方法：

Java代码  
public void test(ICTHit[] icthits) throws IOException {  
        ClusterBuilder clusterBuilder = new ClusterBuilder();  
        // 设置需要聚类的数据集合，测试中用的null。  
        clusterBuilder.setDocs(icthits);  
        // 设置聚类级别，只使用1级  
        clusterBuilder.setMaxLevels(10);  
        clusterBuilder.setUseTagsAsTitle(true);  
        // 一般将检索词设置为wordsExcluded  
        clusterBuilder.setWordsExcluded("万美元,日本,公司,视频,北京时间,图文,新华网,新浪,消息,通讯,互联网,美国,中国");  
        clusterBuilder  
                .setOptions(new ClusteringOptions[] { new ClusteringOptions(),new ClusteringOptions() });  
  
        // 开始聚类  
        clusterBuilder.cluster();  
        FileWriter fw1 = new FileWriter("c:/today-20110509-cluster.txt ", true);  
        BufferedWriter bw1 = new BufferedWriter(fw1);  
  
        // 打印结果  
        if (clusterBuilder.getClusters() != null) {  
            int i = 0;  
            for (DocCluster docCluster : clusterBuilder.getClusters()) {  
                i++;  
                System.out.println("tag:" + docCluster.getTags() + "("  
                        + docCluster.getDocIdList().length + ")");  
                bw1.write(docCluster.getTags() + "("+ docCluster.getDocIdList().length + ")"+"\r\n ");                
                  
                if (docCluster.getDocList() != null  
                        && docCluster.getDocList().length > 0) {  
                    for (ICTHit co : docCluster.getDocList()) {  
                        System.out.println("     DocID: " + co.getDocId());  
                        bw1.write("标题为: "   + co.getTitle()+",ID为"+co.getDocId()+"\r\n ");    
                        for (int m = 0; m < co.getTagList().length; m++) {                             
                            bw1.write("标题为: "   + co.getTitle()+",ID为"+co.getDocId()+"\r\n ");    
                            System.out.println("     Key Word: "  
                                    + co.getTagList()[m]);  
                        }  
                        System.out.println("");  
                    }  
                    System.out.println("");  
                } else {  
                    bw1.write("      该分类下无数据！"+"\r\n ");      
                }  
                bw1.write("-------------------------------------------------------------------------------\r\n");  
            }  
        }  
        bw1.close();  
        fw1.close();  
    }  

　如上方法可以运行，但只是示例性的，没有用在生产当中。核心方法已经给出，大家可以引用到项目当中，效果比carrot2标准的方法要好很多。

http://heweiya.iteye.com/blog/1704401

分享到：

TF-IDF与余弦相似性的应用（一）：自动提取 ... | 生成文本聚类java实现 (2)

2014-05-25 22:25
浏览 799
评论(0)
分类:行业应用
查看更多

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

生成文本聚类java实现 (3)

评论

发表评论

相关推荐

最近访客 更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

生成文本聚类java实现 (3)

评论

发表评论

相关推荐

hash算法 (hashmap 实现原理)

数据挖掘十大经典算法

京东个性化推荐系统持续优化的奥秘

php 经典的算法题你懂的

并发队列ConcurrentLinkedQueue和阻塞队列LinkedBlockingQueue用法

RabbitMQ （五）主题（Topic）

RabbitMQ （四） 路由选择 (Routing)

RabbitMQ （三） 发布/订阅

RabbitMQ （二）工作队列

RabbitMQ 入门 Helloworld

PHP实现常见查找和排序算法

B树、B-树、B+树、B*树

TF-IDF与余弦相似性的应用（三）：自动摘要

TF-IDF与余弦相似性的应用（二）：找出相似文章

TF-IDF与余弦相似性的应用（一）：自动提取关键词

生成文本聚类java实现 (2)

生成文本聚类java实现 (1)

贝叶斯推断及其互联网应用（二）：过滤垃圾邮件[转]

基于用户投票的排名算法（一）：Delicious和Hacker News[转]

TF-IDF与余弦相似性的应用（一）：自动提取关键词

最近访客更多访客>>

RabbitMQ （四）路由选择 (Routing)

RabbitMQ （三）发布/订阅