Sociolinguistics in the Internet Age: Text Data Mining Based on SNS

This post is reposted from http://www.matrix67.com/blog/archives/5044

A few concepts

Cohesion (凝固度)
We define the cohesion of "电影院" (movie theater) as the smaller of two ratios: p(电影院) / (p(电) · p(影院)) and p(电影院) / (p(电影) · p(院)). Likewise, the cohesion of "的电影" is the smaller of the two quotients p(的电影) / (p(的) · p(电影)) and p(的电影) / (p(的电) · p(影)).
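To make the definition concrete, here is a minimal sketch (not part of the original post) that computes cohesion as the minimum ratio over all binary split points. The prob map, a lookup from a fragment to its empirical probability in the corpus, is a hypothetical stand-in for a real frequency table and is assumed to contain every substring involved:

import java.util.Map;

public class Cohesion {

    // Hypothetical frequency table: fragment -> empirical probability p(fragment).
    private final Map<String, Double> prob;

    public Cohesion(Map<String, Double> prob) {
        this.prob = prob;
    }

    // Cohesion(w) = min over all splits w = left + right of p(w) / (p(left) * p(right)).
    public double cohesion(String w) {
        double min = Double.MAX_VALUE;
        for (int i = 1; i < w.length(); i++) {
            String left = w.substring(0, i);
            String right = w.substring(i);
            double ratio = prob.get(w) / (prob.get(left) * prob.get(right));
            min = Math.min(min, ratio);
        }
        return min;
    }
}

For "电影院" this evaluates exactly the two ratios given above and keeps the smaller one.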
Freedom (自由度)

We define the freedom of a text fragment as the smaller of the information entropy of its left-neighbor characters and the information entropy of its right-neighbor characters.
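The entropy side can be sketched just as briefly, assuming the occurrences of each character immediately to the left (or right) of the fragment have already been counted (the neighborCounts argument is hypothetical):

import java.util.Map;

public class NeighborEntropy {

    // Shannon entropy (in nats) of a neighbor-character count distribution.
    public static double entropy(Map<Character, Integer> neighborCounts) {
        if (neighborCounts.isEmpty()) return 0;
        double total = 0;
        for (int count : neighborCounts.values()) total += count;
        double h = 0;
        for (int count : neighborCounts.values()) {
            double p = count / total;
            h -= p * Math.log(p);
        }
        return h;
    }

    // Freedom of a fragment = Math.min(entropy(leftNeighbors), entropy(rightNeighbors)).
}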
Below is a Java implementation. It works reasonably well on up to roughly 100 MB of text, but runs out of memory beyond that.
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.ResourceBundle;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.LineIterator;

import com.google.common.base.Function;
import com.google.common.base.Predicate;
import com.google.common.collect.Collections2;
import com.google.common.collect.Lists;

public class FindWordsByWordArray {

    private final static ResourceBundle resourceBundle = ResourceBundle.getBundle("finder");

    private Map<String, Word> wordsMap = new HashMap<String, Word>();
    private int wordMaxLen = 5;              // longest candidate word, in "word characters"
    private double allTextLen = 0;           // total length of all parsed fragments
    private double allDomSize = 0;           // number of documents/lines parsed
    private double mutualInformationPunish = 0.5;
    private double leftAndRightEntropyPunish = 1;
    private double wholePunish = 10;

    public FindWordsByWordArray() {
    }

    public FindWordsByWordArray(long num) {
        this.allDomSize = num;
    }

    /** Splits the raw input into clean fragments and appends them to the output file. */
    public static long pretreatment(File input, File output) throws IOException {
        if (output.exists())
            FileUtils.deleteQuietly(output);
        LineIterator list = FileUtils.lineIterator(input, "utf-8");
        List<String> res = new ArrayList<String>();
        long num = 0;
        while (list.hasNext()) {   // fixed: the original for-loop skipped the last line
            String text = list.next();
            num++;
            res.addAll(pretreatment(text));
            if (res.size() > 500000) {
                FileUtils.writeLines(output, res, true);
                System.out.println("write lines 500000.");
                res.clear();
            }
        }
        list.close();
        if (res.size() > 0)
            FileUtils.writeLines(output, res, true);
        System.out.println("pretreatment over.");
        return num;
    }

    /** Lowercases, maps digits to 'N', and turns punctuation/URLs/entities into '#' separators. */
    private static List<String> pretreatment(String... texts) {
        List<String> res = new ArrayList<String>();
        for (String text : texts) {
            text = text.toLowerCase().replaceAll("\\d", "N")
                    // fixed: the original used [a-zA-z], which also matches [ \ ] ^ _ `
                    .replaceAll("(\\p{P}|\\s+|&[a-zA-Z]*;|[a-zA-Z]+://[^\\s]*|~|~|★)", "#")
                    .replace('.', '#')
                    .replace('+', '#')
                    .replace('|', '#')
                    .replace('>', '#');
            for (String some : text.split("#")) {
                if (some.length() < 5)
                    continue;   // fragments shorter than 5 characters are discarded
                res.add(some);
            }
        }
        return res;
    }

    public void parse(boolean needPretreatment, String... texts) {
        if (needPretreatment) {
            allDomSize += texts.length;
            parse(false, pretreatment(texts));
            return;
        }
        for (String text : texts) {
            if (text.matches("^[a-zA-Z]*")) {   // pure-English fragment
                parseEnglish(text);
                allTextLen += 1;
            } else {
                parseChinese(text);
                allTextLen += text.length();
            }
        }
    }

    private void parseEnglish(String text) {
        addEnglishWord(text);
    }

    /** Enumerates every candidate n-gram (2..wordMaxLen) and records its left/right neighbors. */
    private void parseChinese(String text) {
        WordArray wordArray = new WordArray(text);
        String left = null;
        int thisWordMaxLen = wordMaxLen;
        for (int index = 0, textLen = wordArray.wordLen(); index < textLen - 1; index++) {
            for (int i = 2; i <= thisWordMaxLen; i++) {
                int toIndex = index + i;
                if (toIndex > textLen)
                    break;
                String word = wordArray.subWords(index, toIndex);
                addWord(word);
                if (left != null)
                    wordsMap.get(word).leftAdd(left);
                if (toIndex + 1 <= textLen)
                    wordsMap.get(word).rightAdd(wordArray.subWords(toIndex, toIndex + 1));
            }
            left = wordArray.subWords(index, index + 1);
        }
        for (String s : wordArray.getChineseWords()) {
            addWord(s);
        }
        for (String s : wordArray.getEnglishWords()) {
            addEnglishWord(s);
        }
    }

    private void addWord(String word) {
        if (word.length() == 0)
            throw new IllegalArgumentException("word length is 0.");
        if (wordsMap.containsKey(word))
            wordsMap.get(word).getTf().incrementAndGet();
        else
            wordsMap.put(word, new Word(word));
    }

    private void addEnglishWord(String word) {
        addWord(word);
        wordsMap.get(word).setAllEnglish(true);
    }

    public void parse(boolean needPretreatment, Collection<String> texts) {
        parse(needPretreatment, texts.toArray(new String[texts.size()]));
    }

    public List<String> print() {
        return print(getRes());
    }

    public List<String> print(List<Word> words) {
        List<String> res = new ArrayList<String>();
        for (Word word : words) {
            res.add(word.toTab());
        }
        return res;
    }

    /** Keeps candidates whose confidence exceeds 1, sorted by descending frequency. */
    public List<Word> getRes() {
        List<Word> words = new ArrayList<Word>(wordsMap.values());
        words = Lists.newArrayList(Collections2.filter(words, new Predicate<Word>() {
            @Override
            public boolean apply(Word word) {
                return word.getConfidenceLevel() > 1;
            }
        }));
        Collections.sort(words, new Comparator<Word>() {
            @Override
            public int compare(Word word1, Word word2) {
                return word2.tf.get() - word1.tf.get();
            }
        });
        return words;
    }

    class Word {
        private String word;
        private AtomicInteger tf;
        private StringBuilder left;    // distinct characters seen immediately left of this word
        private StringBuilder right;   // distinct characters seen immediately right of this word
        private Double level = null;
        private boolean isAllEnglish = false;

        Word(String word) {
            this.word = word;
            this.tf = new AtomicInteger(1);
        }

        public String getWord() {
            return word;
        }

        public AtomicInteger getTf() {
            return tf;
        }

        public void leftAdd(String str) {
            if (left == null)
                this.left = new StringBuilder(3);
            if (this.left.indexOf(str) < 0)
                this.left.append(str);
        }

        public int getLeftNum() {
            if (left == null)
                return 0;
            return new WordArray(left.toString()).wordLen();
        }

        public void rightAdd(String str) {
            if (right == null)
                this.right = new StringBuilder(3);
            if (this.right.indexOf(str) < 0)
                this.right.append(str);
        }

        public int getRightNum() {
            if (right == null)
                return 0;
            return new WordArray(right.toString()).wordLen();
        }

        public void setAllEnglish(boolean allEnglish) {
            isAllEnglish = allEnglish;
        }

        /**
         * Combines a mutual-information-style cohesion score with the number of distinct
         * left/right neighbors (a cheap stand-in for neighbor entropy) into one confidence value.
         */
        private Double getConfidenceLevel() {
            if (this.level != null)
                return this.level;
            double allDomSize = FindWordsByWordArray.this.allDomSize;
            if (this.getWord().replaceAll("N", "").length() <= 1)
                return 0d;
            if (this.getTf().get() < allDomSize / 90)
                return 0d;
            double value;
            if (!this.isAllEnglish) {
                if (this.getLeftNum() < allDomSize / 190)
                    return 0d;
                if (this.getRightNum() < allDomSize / 190)
                    return 0d;
                if ((this.getRightNum() + this.getLeftNum()) < allDomSize / 90)
                    return 0d;
                // cohesion: the worst (smallest) ratio over all binary splits of the word
                value = Double.MAX_VALUE;
                WordArray wordArray = new WordArray(this.getWord());
                for (int i = 1; i < wordArray.wordLen(); i++) {
                    int leftTf = wordsMap.get(wordArray.subWords(0, i)).getTf().get();
                    int rightTf = wordsMap.get(wordArray.subWords(i)).getTf().get();
                    double normal = leftTf * rightTf / (allTextLen * allTextLen);
                    double reality = this.getTf().get() * 2 / allTextLen;
                    value = reality / normal < value ? reality / normal : value;
                }
                int size = this.getLeftNum() > this.getRightNum()
                        ? this.getRightNum() : this.getLeftNum();
                value = Math.pow(value, mutualInformationPunish)
                        * Math.pow(size, leftAndRightEntropyPunish) / wholePunish;
            } else {
                value = this.getTf().get() * 15 / allDomSize;
            }
            this.level = value;
            return value;
        }

        @Override
        public String toString() {
            return "Word{" + "word='" + word + '\'' + ", tf=" + tf
                    + ", left=" + cutOff(left.toString(), 15)
                    + ", right=" + cutOff(right.toString(), 15) + '}';
        }

        public String toTab() {
            return word + '\t' + tf + '\t' + level + '\t' + getLeftNum() + '\t' + getRightNum();
        }

        private String cutOff(String str, int max) {
            if (str.length() > max)
                str = str.substring(0, max) + "...]";
            return "(" + new WordArray(str).wordLen() + ")" + str;
        }
    }

    public void setWordMaxLen(int wordMaxLen) {
        this.wordMaxLen = wordMaxLen;
    }

    public void setMutualInformationPunish(double mutualInformationPunish) {
        this.mutualInformationPunish = mutualInformationPunish;
    }

    public void setLeftAndRightEntropyPunish(double leftAndRightEntropyPunish) {
        this.leftAndRightEntropyPunish = leftAndRightEntropyPunish;
    }

    public void setWholePunish(double wholePunish) {
        this.wholePunish = wholePunish;
    }

    public static void main(String[] args) throws IOException {
        String inputPath = "e:/xiaoshuo.txt";
        String outputPath = "e:/xiaoshuo_words";
        // String inputPath = "e:/tweet/parse";
        // String outputPath = "e:/tweet/words";
        File inputFile = new File(inputPath);
        if (inputFile.isFile()) {
            File pretreatFile = new File("e:/xiaoshuo_p");
            long domSize = pretreatment(new File(inputPath), pretreatFile);
            System.out.println(domSize);
            FindWordsByWordArray findWords = getFindWords(domSize);
            LineIterator list = FileUtils.lineIterator(pretreatFile, "utf-8");
            int i = 0;
            while (list.hasNext()) {   // fixed: the original for-loop skipped the last line
                findWords.parse(false, list.next());
                if (i++ % 500000 == 0)
                    System.out.print(".");
            }
            list.close();
            FileUtils.writeLines(new File(outputPath), findWords.print());
        } else {
            for (String inputFileName : inputFile.list()) {
                FindWordsByWordArray findWords = getFindWords();
                List<String> list = FileUtils.readLines(new File(inputPath, inputFileName), "utf-8");
                findWords.parse(true, Lists.transform(list, new Function<String, String>() {
                    @Override
                    public String apply(String s) {
                        return s.substring(s.split("\t")[0].length());
                    }
                }));
                String outputFileName = inputFileName + "-words.";
                if (inputFileName.split("\\.").length == 2)
                    outputFileName = inputFileName.split("\\.")[0] + "-words." + inputFileName.split("\\.")[1];
                List<String> printList = findWords.print();
                if (printList.size() > 500)
                    printList = printList.subList(0, 500);
                FileUtils.writeLines(new File(outputPath, outputFileName), printList);
            }
        }
    }

    private static FindWordsByWordArray getFindWords() {
        return getFindWords(0);
    }

    private static FindWordsByWordArray getFindWords(long num) {
        FindWordsByWordArray findWords = new FindWordsByWordArray(num);
        findWords.setWordMaxLen(Integer.parseInt(resourceBundle.getString("word.max.len")));
        findWords.setMutualInformationPunish(
                Double.parseDouble(resourceBundle.getString("mutual.information.punish")));
        findWords.setLeftAndRightEntropyPunish(
                Double.parseDouble(resourceBundle.getString("left.and.right.entropy.punish")));
        findWords.setWholePunish(Double.parseDouble(resourceBundle.getString("whole.punish")));
        return findWords;
    }
}
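A note on how this maps to the two concepts: cohesion is computed essentially as defined above, as the minimum ratio over all binary splits, while the entropy side is approximated by the number of distinct left/right neighbor characters rather than a true entropy. The memory blow-up past roughly 100 MB comes from wordsMap retaining every 2- to 5-gram of the corpus together with its neighbor strings; periodically pruning candidates already below the frequency thresholds would be one way to mitigate this (an untested suggestion, not part of the original code). The helper class WordArray below treats each run of English letters as a single "word character", so mixed Chinese/English text indexes correctly: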
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class WordArray {

    private String someWord;
    // each entry is {startIndex, length} of a run of English letters,
    // which is treated as a single "word character"
    private List<int[]> enIndexAndLen = null;

    public WordArray(String someWord) {
        this.someWord = someWord;
        char[] chars = someWord.toCharArray();
        for (int i = 0, charsLen = chars.length; i < charsLen; i++) {
            // CharUtils.isEnglish is the author's own helper, not Apache Commons CharUtils
            if (CharUtils.isEnglish(chars[i])) {
                int index = i;
                while (++i < charsLen && CharUtils.isEnglish(chars[i]));
                if (enIndexAndLen == null)
                    enIndexAndLen = new ArrayList<int[]>();
                enIndexAndLen.add(new int[]{index, i - index});
            }
        }
    }

    /** Substring in "word character" coordinates: an English run counts as one character. */
    public String subWords(int beginIndex, int endIndex) {
        int realityBeginIndex = beginIndex;
        int realityEndIndex = endIndex;
        if (enIndexAndLen != null) {
            for (int[] intArray : enIndexAndLen) {
                if (intArray[0] < realityBeginIndex)
                    realityBeginIndex += intArray[1] - 1;
                if (intArray[0] < realityEndIndex)
                    realityEndIndex += intArray[1] - 1;
            }
        }
        return someWord.substring(realityBeginIndex, realityEndIndex);
    }

    public String subWords(int beginIndex) {
        return subWords(beginIndex, wordLen());
    }

    /** Length in "word characters": each English run shrinks to length 1. */
    public int wordLen() {
        int len = someWord.length();
        if (enIndexAndLen != null)
            for (int[] intArray : enIndexAndLen)
                len -= (intArray[1] - 1);
        return len;
    }

    public String[] getEnglishWords() {
        if (enIndexAndLen != null) {
            String[] strings = new String[enIndexAndLen.size()];
            int i = 0;
            for (int[] intArray : enIndexAndLen)
                strings[i++] = someWord.substring(intArray[0], intArray[0] + intArray[1]);
            return strings;
        } else {
            return new String[0];
        }
    }

    public List<String> getChineseWords() {
        List<String> strings = new ArrayList<String>();
        for (char c : someWord.toCharArray()) {
            if (CharUtils.isEnglish(c))
                continue;
            strings.add(String.valueOf(c));
        }
        return strings;
    }

    public static void main(String[] args) {
        WordArray wordArray = new WordArray("我爱Style江南的music哈");
        System.out.println(wordArray.subWords(0, 5));
        System.out.println(wordArray.subWords(5, wordArray.wordLen()));
        System.out.println(wordArray.wordLen());
        System.out.println(Arrays.toString(wordArray.getEnglishWords()));
    }
}
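The tuning parameters are read from the ResourceBundle named "finder", i.e. a finder.properties file on the classpath: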
# (int)[2-n default=10] word max len.
word.max.len = 5
# (double)[1.0-0.0 default=0.3] MutualInformation punish.
mutual.information.punish = 0.5
# (double)[1.0-0.0 default=1.0] LeftAndRightEntropy punish.
left.and.right.entropy.punish = 1
# (double)[1-n default=10] WholePunish punish.
whole.punish = 10