`
orange.lpai
  • 浏览: 93872 次
  • 性别: Icon_minigender_1
  • 来自: 上海
社区版块
存档分类
最新评论

互联网时代的社会语言学:基于SNS的文本数据挖掘

阅读更多
互联网时代的社会语言学:基于SNS的文本数据挖掘
本文转载于http://www.matrix67.com/blog/archives/5044

几个概念


凝固度

我们定义“电影院”的凝合程度就是 p(电影院) 与 p(电) · p(影院) 比值和 p(电影院) 与 p(电影) · p(院) 的比值中的较小值,“的电影”的凝合程度则是 p(的电影) 分别除以 p(的) · p(电影) 和 p(的电) · p(影) 所得的商的较小值。

自由度
我们不妨就把一个文本片段的自由运用程度定义为它的左邻字信息熵和右邻字信息熵中的较小值。

java实现,100M文本效果还可以,但大于100M以后内存会溢出


public class FindWordsByWordArray {  
  
    private final static ResourceBundle resourceBundle = ResourceBundle.getBundle("finder");  
  
    private Map<String, Word> wordsMap = new HashMap<String, Word>();  
  
    private int wordMaxLen = 5;  
    private double allTextLen = 0;  
    private double allDomSize = 0;  
    private double mutualInformationPunish = 0.5;  
    private double leftAndRightEntropyPunish = 1;  
    private double wholePunish = 10;  
  
    public FindWordsByWordArray() {  
    }  
  
    public FindWordsByWordArray(long num) {  
        this.allDomSize = num;  
    }  
  
    public static long pretreatment(File input, File output) throws IOException {  
  
        if(output.exists())  
            FileUtils.deleteQuietly(output);  
  
        LineIterator list = FileUtils.lineIterator(input, "utf-8");  
  
        List<String> res = new ArrayList<String>();  
  
        long num = 0;  
        for (String text = list.next(); list.hasNext(); text = list.next()) {  
              
            num++;  
              
            res.addAll(pretreatment(text));  
  
            if(res.size() > 500000) {  
                FileUtils.writeLines(output, res, true);  
                System.out.println("write lines 500000.");  
                res.clear();  
            }  
  
        }  
        list.close();  
  
        if(res.size() > 0)  
            FileUtils.writeLines(output, res, true);  
  
        System.out.println("pretreatment over.");  
        return num;  
    }  
  
    private static List<String> pretreatment(String... texts) {  
        List<String> res = new ArrayList<String>();  
          
        for(String text: texts)  {  
            text = text.toLowerCase().replaceAll("\\d", "N")  
                    .replaceAll("(\\p{P}|\\s+|&[a-zA-Z]*;|[a-zA-z]+://[^\\s]*|~|~|★)", "#")  
                    .replace('.', '#')  
                    .replace('+', '#')  
                    .replace('|', '#')  
                    .replace('>', '#');  
  
            for (String some : text.split("#")) {  
                if (some.length() < 5)  
                    continue;  
                res.add(some);  
            }  
        }  
        return res;  
    }  
  
    public void parse(boolean needPretreatment, String... texts) {  
        if(needPretreatment) {  
            allDomSize += texts.length;  
            parse(false, pretreatment(texts));  
            return;  
        }  
        for (String text : texts) {  
            if (text.matches("^[a-zA-Z]*")) {  
                parseEnglish(text);  
                allTextLen += 1;  
            }else {  
                parseChinese(text);  
                allTextLen += text.length();  
            }  
        }  
    }  
  
    private void parseEnglish(String text) {  
        addEnglishWord(text);  
    }  
  
    private void parseChinese(String text) {  
  
        WordArray wordArray = new WordArray(text);  
        String left = null;  
        int thisWordMaxLen = wordMaxLen;  
  
        for (int index = 0, textLen = wordArray.wordLen(); index < textLen - 1; index++) {  
            for (int i = 2; i <= thisWordMaxLen; i++) {  
                int toIndex = index + i;  
                if (toIndex > textLen)  
                    break;  
                String word = wordArray.subWords(index, toIndex);  
                addWord(word);  
                if (left != null)  
                    wordsMap.get(word).leftAdd(left);  
                if (toIndex + 1 <= textLen)  
                    wordsMap.get(word).rightAdd(wordArray.subWords(toIndex, toIndex + 1));  
            }  
            left = wordArray.subWords(index, index + 1);  
        }  
  
        for (String s : wordArray.getChineseWords()) {  
           addWord(s);  
        }  
        for (String s : wordArray.getEnglishWords()) {  
            addEnglishWord(s);  
        }  
    }  
  
    private void addWord(String word) {  
        if (word.length() == 0)  
            throw new IllegalArgumentException("word length is 0.");  
        if (wordsMap.containsKey(word))  
            wordsMap.get(word).getTf().incrementAndGet();  
        else  
            wordsMap.put(word, new Word(word));  
    }  
  
    private void addEnglishWord(String word) {  
        addWord(word);  
        wordsMap.get(word).setAllEnglish(true);  
    }  
  
    public void parse(boolean needPretreatment, Collection<String> texts) {  
        parse(needPretreatment, texts.toArray(new String[texts.size()]));  
    }  
  
    public List<String> print() {  
        return print(getRes());  
    }  
  
    public List<String> print(List<Word> words) {  
        List<String> res = new ArrayList<String>();  
        for (Word word : words) {  
            res.add(word.toTab());  
        }  
        return res;  
    }  
  
    public List<Word> getRes() {  
        List<Word> words = new ArrayList<Word>(wordsMap.values());  
        words = Lists.newArrayList(Collections2.filter(words, new Predicate<Word>() {  
            @Override  
            public boolean apply(Word word) {  
                return word.getConfidenceLevel() > 1;  
            }  
        }));  
        Collections.sort(words, new Comparator<Word>() {  
            @Override  
            public int compare(Word word1, Word word2) {  
                return word2.tf.get() - word1.tf.get();  
            }  
        });  
        return words;  
    }  
  
    class Word {  
  
        private String word;  
        private AtomicInteger tf;  
        private StringBuilder left;  
        private StringBuilder right;  
        private Double level = null;  
        private boolean isAllEnglish = false;  
  
        Word(String word) {  
            this.word = word;  
            this.tf = new AtomicInteger(1);  
        }  
  
        public String getWord() {  
            return word;  
        }  
  
        public AtomicInteger getTf() {  
            return tf;  
        }  
  
        public void leftAdd(String str) {  
            if(left == null)  
                this.left = new StringBuilder(3);  
            if(this.left.indexOf(str) < 0)  
                this.left.append(str);  
        }  
  
        public int getLeftNum() {  
            if(left == null)  
                return 0;  
            return new WordArray(left.toString()).wordLen();  
        }  
  
        public void rightAdd(String str) {  
            if(right == null)  
                this.right = new StringBuilder(3);  
            if(this.right.indexOf(str) < 0)  
                this.right.append(str);  
        }  
  
        public int getRightNum() {  
            if(right == null)  
                return 0;  
            return new WordArray(right.toString()).wordLen();  
        }  
  
        public void setAllEnglish(boolean allEnglish) {  
            isAllEnglish = allEnglish;  
        }  
  
        private Double getConfidenceLevel() {  
            if (this.level != null)  
                return this.level;  
  
            double allDomSize = FindWordsByWordArray.this.allDomSize;  
  
            if (this.getWord().replaceAll("N","").length() <= 1)  
                return 0d;  
            if (this.getTf().get() < allDomSize / 90)  
                return 0d;  
            double value;  
            if (!this.isAllEnglish) {  
  
                if (this.getLeftNum() < allDomSize / 190)  
                    return 0d;  
                if (this.getRightNum() < allDomSize / 190)  
                    return 0d;  
                if ((this.getRightNum() + this.getLeftNum()) < allDomSize / 90)  
                    return 0d;  
                value = Double.MAX_VALUE;  
  
                WordArray wordArray = new WordArray(this.getWord());  
  
                for (int i = 1; i < wordArray.wordLen(); i++) {  
  
                    int leftTf = wordsMap.get(wordArray.subWords(0, i)).getTf().get();  
  
                    int rightTf = wordsMap.get(wordArray.subWords(i)).getTf().get();  
  
                    double normal = leftTf * rightTf / (allTextLen * allTextLen);  
  
                    double reality = this.getTf().get() * 2 / allTextLen;  
  
                    value = reality / normal < value ? reality / normal : value;  
                }  
  
                int size = this.getLeftNum() > this.getRightNum() ?  
                        this.getRightNum() : this.getLeftNum();  
  
                value = Math.pow(value, mutualInformationPunish) *  
                        Math.pow(size, leftAndRightEntropyPunish)  
                        / wholePunish;  
            } else {  
                value = this.getTf().get() * 15 / allDomSize;  
            }  
            this.level = value;  
            return value;  
        }  
  
        @Override  
        public String toString() {  
            return "Word{" +  
                    "word='" + word + '\'' +  
                    ", tf=" + tf +  
                    ", left=" + cutOff(left.toString(), 15) +  
                    ", right=" + cutOff(right.toString(), 15) +  
                    '}';  
        }  
  
        public String toTab() {  
            return word + '\t' +  
                    tf + '\t' +  
                    level + '\t' +  
                    getLeftNum() + '\t' +  
                    getRightNum();  
        }  
  
        private String cutOff(String str, int max) {  
              if (str.length() > max)  
                str = str.substring(0, max) + "...]";  
            return "(" + new WordArray(str).wordLen() + ")" + str;  
        }  
    }  
  
    public void setWordMaxLen(int wordMaxLen) {  
        this.wordMaxLen = wordMaxLen;  
    }  
  
    public void setMutualInformationPunish(double mutualInformationPunish) {  
        this.mutualInformationPunish = mutualInformationPunish;  
    }  
  
    public void setLeftAndRightEntropyPunish(double leftAndRightEntropyPunish) {  
        this.leftAndRightEntropyPunish = leftAndRightEntropyPunish;  
    }  
  
    public void setWholePunish(double wholePunish) {  
        this.wholePunish = wholePunish;  
    }  
  
    public static void main(String[] args) throws IOException {  
  
        String inputPath = "e:/xiaoshuo.txt";  
        String outputPath = "e:/xiaoshuo_words";  
  
//        String inputPath = "e:/tweet/parse";  
//        String outputPath = "e:/tweet/words";  
  
        File inputFile = new File(inputPath);  
  
        if (inputFile.isFile()) {  
            File pretreatFile = new File("e:/xiaoshuo_p");  
            long domSize = pretreatment(new File(inputPath), pretreatFile);  
            System.out.println(domSize);  
            FindWordsByWordArray findWords = getFindWords(domSize);  
            LineIterator list = FileUtils.lineIterator(pretreatFile, "utf-8");  
            int i = 0;  
            for(String str = list.next(); list.hasNext(); str = list.next()){  
                findWords.parse(false, str);  
                if(i++ % 500000 == 0)  
                    System.out.print(".");  
            }                     
            list.close();  
            FileUtils.writeLines(new File(outputPath), findWords.print());  
        } else {  
            for (String inputFileName : inputFile.list()) {  
                FindWordsByWordArray findWords = getFindWords();  
                List<String> list = FileUtils.readLines(new File(inputPath, inputFileName), "utf-8");  
                findWords.parse(true, Lists.transform(list, new Function<String, String>() {  
                    @Override  
                    public String apply(String s) {  
                        return s.substring(s.split("\t")[0].length());  
                    }  
                }));  
                String outputFileName = inputFileName + "-words.";  
                if (inputFileName.split("\\.").length == 2)  
                    outputFileName = inputFileName.split("\\.")[0] + "-words." +  
                            inputFileName.split("\\.")[1];  
                List<String> printList = findWords.print();  
                if (printList.size() > 500)  
                    printList = printList.subList(0, 500);  
                FileUtils.writeLines(new File(outputPath, outputFileName), printList);  
            }  
        }  
    }  
  
    private static FindWordsByWordArray getFindWords() {  
        return getFindWords(0);  
    }  
  
    private static FindWordsByWordArray getFindWords(long num) {  
        FindWordsByWordArray findWords = new FindWordsByWordArray(num);  
        findWords.setWordMaxLen(Integer.parseInt(resourceBundle.getString("word.max.len")));  
        findWords.setMutualInformationPunish(  
                Double.parseDouble(resourceBundle.getString("mutual.information.punish")));  
        findWords.setLeftAndRightEntropyPunish(  
                Double.parseDouble(resourceBundle.getString("left.and.right.entropy.punish")));  
        findWords.setWholePunish(Double.parseDouble(resourceBundle.getString("whole.punish")));  
        return findWords;  
    }  


public class WordArray {  
  
    private String someWord;  
    private List<int[]> enIndexAndLen = null;  
  
    public WordArray(String someWord) {  
        this.someWord = someWord;  
        char[] chars = someWord.toCharArray();  
        for(int i = 0, charsLen = chars.length; i<charsLen; i++) {  
            if(CharUtils.isEnglish(chars[i])) {  
                int index = i;  
                while (++i < charsLen && CharUtils.isEnglish(chars[i]));  
                if(enIndexAndLen == null)  
                    enIndexAndLen = new ArrayList<int[]>();  
                enIndexAndLen.add(new int[]{index, i - index});  
            }  
        }  
    }  
      
    public String subWords(int beginIndex, int endIndex) {  
        int realityBeginIndex = beginIndex;  
        int realityEndIndex = endIndex;  
        if(enIndexAndLen != null) {  
            for(int[] intArray: enIndexAndLen) {  
                if(intArray[0] < realityBeginIndex) {  
                    realityBeginIndex += intArray[1] -1;  
                }  
                if(intArray[0] < realityEndIndex) {  
                    realityEndIndex += intArray[1] - 1;  
                }  
            }  
        }  
        return someWord.substring(realityBeginIndex, realityEndIndex);  
    }  
  
    public String subWords(int beginIndex) {  
        return subWords(beginIndex, wordLen());  
    }  
  
    public int wordLen() {  
        int len = someWord.length();  
        if(enIndexAndLen != null)  
            for(int[] intArray: enIndexAndLen)  
                len -= (intArray[1] - 1);  
        return len;  
    }  
  
    public String[] getEnglishWords() {  
        if(enIndexAndLen != null) {  
            String[] strings = new String[enIndexAndLen.size()];  
            int i = 0;  
            for(int[] intArray: enIndexAndLen)  
                strings[i++] = someWord.substring(intArray[0], intArray[0]+intArray[1]);  
            return strings;  
        }else{  
            return new String[0];  
        }  
    }  
  
    public List<String> getChineseWords() {  
        List<String> strings = new ArrayList<String>();  
        for (char c : someWord.toCharArray()) {  
            if(CharUtils.isEnglish(c))  
                continue;  
            strings.add(String.valueOf(c));  
        }  
        return strings;  
    }  
  
    public static void main(String[] args) {  
        WordArray wordArray = new WordArray("我爱Style江南的music哈");  
        System.out.println(wordArray.subWords(0, 5));  
        System.out.println(wordArray.subWords(5, wordArray.wordLen()));  
        System.out.println(wordArray.wordLen());  
        System.out.println(Arrays.toString(wordArray.getEnglishWords()));  
    }  
  
}  



#(int)[2-n default=10] word max len.  
word.max.len = 5  
  
#(double)[1.0-0.0 default=0.3] MutualInformation punish.  
mutual.information.punish = 0.5  
  
#(double)[1.0-0.0 default=1.0] LeftAndRightEntropy punish.  
left.and.right.entropy.punish = 1  
  
#(double)[1-n default=10] WholePunish punish.  
whole.punish = 10  
分享到:
评论

相关推荐

    new words Discovery

    所采用的新词发现的算法思想来源于知名博主matrix67的一篇文章《互联网时代的社会语言学:基于SNS的文本数据挖掘》,算法实现改编之《新词发现之爆笑NBA》提供的python代码。 我仅仅是将原来的python代码改写成C++...

    限定域文本语料的短语挖掘综述.pdf

    Matrix67在2012年提出了《互联网络时代的社会语言学:基于SNS的文本数据挖掘》一种基于统计学角度的新词挖掘算法,通过计算凝固度和左右临字信息熵抽取新词,效果灰常不错。 《西游记》抽取结果如下所示:行者,戒...

    阿里研究中心:大数据时代

    - 数据挖掘:关联规则分析、分类、聚类等。 - 模型预测:预测模型、机器学习、建模仿真等。 - **大数据技术**: - 数据采集:ETL工具。 - 数据存取:关系数据库、NoSQL、SQL等。 - 基础架构支持:云存储、分布式文件...

    Python数据分析与应用,实训数据

    在数据挖掘方面,Python的Scikit-learn库是一个强大的机器学习库,包含了多种监督和无监督学习算法,如线性回归、决策树、随机森林、聚类等。这些算法可以用来预测、分类和发现数据中的模式。使用`train_test_split...

    School_District_Analysis:标准化测试数据分析

    它是一个开放源代码的Web应用程序,允许用户创建和共享包含代码、方程、可视化和文本的文档,特别适合数据分析和科学计算。在这个项目中,我们将使用Python编程语言,配合Pandas、NumPy、Matplotlib等库,对数据进行...

Global site tag (gtag.js) - Google Analytics