`

TF-IDF(转)输出到文本

 
阅读更多
import java.io.*;
import java.util.*;

import org.wltea.analyzer.lucene.IKAnalyzer;

public class ReadFiles {

    /**
     * @param args
     */   
    private static ArrayList<String> FileList = new ArrayList<String>(); // the list of file

    //get list of file for the directory, including sub-directory of it
    public static List<String> readDirs(String filepath) throws FileNotFoundException, IOException
    {
        try
        {
            File file = new File(filepath);
            if(!file.isDirectory())
            {
                System.out.println("输入的[]");
                System.out.println("filepath:" + file.getAbsolutePath());
            }
            else
            {
                String[] flist = file.list();
                for(int i = 0; i < flist.length; i++)
                {
                    File newfile = new File(filepath + "\\" + flist[i]);
                    if(!newfile.isDirectory())
                    {
                        FileList.add(newfile.getAbsolutePath());
                    }
                    else if(newfile.isDirectory()) //if file is a directory, call ReadDirs
                    {
                        readDirs(filepath + "\\" + flist[i]);
                    }                   
                }
            }
        }catch(FileNotFoundException e)
        {
            System.out.println(e.getMessage());
        }
        return FileList;
    }
   
    //read file
    public static String readFile(String file) throws FileNotFoundException, IOException
    {
        StringBuffer strSb = new StringBuffer(); //String is constant, StringBuffer can be changed.
        InputStreamReader inStrR = new InputStreamReader(new FileInputStream(file), "gbk"); //byte streams to character streams
        BufferedReader br = new BufferedReader(inStrR);
        String line = br.readLine();
        while(line != null){
            strSb.append(line).append("\r\n");
            line = br.readLine();   
        }
       
        return strSb.toString();
    }
   
    //word segmentation
    public static ArrayList<String> cutWords(String file) throws IOException{
       
        ArrayList<String> words = new ArrayList<String>();
        String text = ReadFiles.readFile(file);
        IKAnalyzer analyzer = new IKAnalyzer();
        words = analyzer.split(text);
       
        return words;
    }
   
    //term frequency in a file, times for each word
    public static HashMap<String, Integer> normalTF(ArrayList<String> cutwords){
        HashMap<String, Integer> resTF = new HashMap<String, Integer>();
       
        for(String word : cutwords){
            if(resTF.get(word) == null){
                resTF.put(word, 1);
                System.out.println(word);
            }
            else{
                resTF.put(word, resTF.get(word) + 1);
                System.out.println(word.toString());
            }
        }
        return resTF;
    }
   
    //term frequency in a file, frequency of each word
    public static HashMap<String, Float> tf(ArrayList<String> cutwords){
        HashMap<String, Float> resTF = new HashMap<String, Float>();
       
        int wordLen = cutwords.size();
        HashMap<String, Integer> intTF = ReadFiles.normalTF(cutwords);
       
        Iterator iter = intTF.entrySet().iterator(); //iterator for that get from TF
        try
        {
        FileWriter writer = new FileWriter("d:\\DF.txt", true);
       
        while(iter.hasNext()){
            Map.Entry entry = (Map.Entry)iter.next();
            resTF.put(entry.getKey().toString(), Float.parseFloat(entry.getValue().toString()) / wordLen);
            System.out.println(entry.getKey().toString() + " = "+  Float.parseFloat(entry.getValue().toString()) / wordLen);
            // 输出到文件
            writer.write(entry.getKey().toString() + " = "+  Float.parseFloat(entry.getValue().toString()) / wordLen+"\r\n");
               
           
        }//end with while
        writer.close();
        }
        catch(Exception ex)
        {
       
        }
        return resTF;
    }
   
    //tf times for file
    public static HashMap<String, HashMap<String, Integer>> normalTFAllFiles(String dirc) throws IOException{
        HashMap<String, HashMap<String, Integer>> allNormalTF = new HashMap<String, HashMap<String,Integer>>();
       
        List<String> filelist = ReadFiles.readDirs(dirc);
        for(String file : filelist){
            HashMap<String, Integer> dict = new HashMap<String, Integer>();
            ArrayList<String> cutwords = ReadFiles.cutWords(file); //get cut word for one file
           
            dict = ReadFiles.normalTF(cutwords);
            allNormalTF.put(file, dict);
        }   
        return allNormalTF;
    }
   
    //tf for all file
    public static HashMap<String,HashMap<String, Float>> tfAllFiles(String dirc) throws IOException{
        HashMap<String, HashMap<String, Float>> allTF = new HashMap<String, HashMap<String, Float>>();
        List<String> filelist = ReadFiles.readDirs(dirc);
       
        for(String file : filelist){
            HashMap<String, Float> dict = new HashMap<String, Float>();
            ArrayList<String> cutwords = ReadFiles.cutWords(file); //get cut words for one file
           
            dict = ReadFiles.tf(cutwords);
            allTF.put(file, dict);
        }
        return allTF;
    }
    public static HashMap<String, Float> idf(HashMap<String,HashMap<String, Float>> all_tf){
        HashMap<String, Float> resIdf = new HashMap<String, Float>();
        HashMap<String, Integer> dict = new HashMap<String, Integer>();
        int docNum = FileList.size();
       
        for(int i = 0; i < docNum; i++){
            HashMap<String, Float> temp = all_tf.get(FileList.get(i));
            Iterator iter = temp.entrySet().iterator();
            while(iter.hasNext()){
                Map.Entry entry = (Map.Entry)iter.next();
                String word = entry.getKey().toString();
                if(dict.get(word) == null){
                    dict.put(word, 1);
                }else {
                    dict.put(word, dict.get(word) + 1);
                }
            }
        }
        System.out.println("IDF for every word is:");
        try
        {
        FileWriter writer = new FileWriter("d:\\IDF.txt", true);
        Iterator iter_dict = dict.entrySet().iterator();
        while(iter_dict.hasNext()){
            Map.Entry entry = (Map.Entry)iter_dict.next();
            float value = (float)Math.log(docNum / Float.parseFloat(entry.getValue().toString()));
            resIdf.put(entry.getKey().toString(), value);
            System.out.println(entry.getKey().toString() + " = " + value);
            writer.write(entry.getKey().toString() + " = " + value+"\r\n");
           
        }
        writer.close();
        }
        catch(Exception ex)
        {
        System.out.println("Error");
        return null;
        }
        return resIdf;
    }
    public static void tf_idf(HashMap<String,HashMap<String, Float>> all_tf,HashMap<String, Float> idfs){
        HashMap<String, HashMap<String, Float>> resTfIdf = new HashMap<String, HashMap<String, Float>>();
           
        int docNum = FileList.size();
        for(int i = 0; i < docNum; i++){
            String filepath = FileList.get(i);
            HashMap<String, Float> tfidf = new HashMap<String, Float>();
            HashMap<String, Float> temp = all_tf.get(filepath);
            Iterator iter = temp.entrySet().iterator();
            while(iter.hasNext()){
                Map.Entry entry = (Map.Entry)iter.next();
                String word = entry.getKey().toString();
                Float value = (float)Float.parseFloat(entry.getValue().toString()) * idfs.get(word);
                tfidf.put(word, value);
            }
            resTfIdf.put(filepath, tfidf);
        }
        System.out.println("TF-IDF for Every file is :");
        DisTfIdf(resTfIdf);
    }
    public static void DisTfIdf(HashMap<String, HashMap<String, Float>> tfidf){
        Iterator iter1 = tfidf.entrySet().iterator();
        try
        {
        FileWriter writer = new FileWriter("d:\\TF-IDF.txt", true);
        String strtemp="";
        while(iter1.hasNext()){
            Map.Entry entrys = (Map.Entry)iter1.next();
            System.out.println("FileName: " + entrys.getKey().toString());
            //writer.write("FileName: " + entrys.getKey().toString());
            System.out.print("{");
            writer.write("{");
            HashMap<String, Float> temp = (HashMap<String, Float>) entrys.getValue();
            Iterator iter2 = temp.entrySet().iterator();
           
           
           
            while(iter2.hasNext()){
                Map.Entry entry = (Map.Entry)iter2.next();
                System.out.print(entry.getKey().toString() + " = " + entry.getValue().toString() + ", ");
                // 输出到文件
                strtemp+=entry.getKey().toString() + " = " + entry.getValue().toString() + ", ";
                //writer.write(entry.getKey().toString() + " = " + entry.getValue().toString() + ", ");
               
            }
            strtemp=strtemp.substring(0, strtemp.length()-2);
            writer.write(strtemp);
            System.out.println("}");
            writer.write("}"+"\r\n");
        }
        writer.close();
       }
        catch(Exception ex)
        {
        System.out.println("error!");
        return;
        }
    }
    public static void main(String[] args) throws IOException {
        // TODO Auto-generated method stub
        String file = "D:/testfiles";

        HashMap<String,HashMap<String, Float>> all_tf = tfAllFiles(file);
        System.out.println();
        HashMap<String, Float> idfs = idf(all_tf);
        System.out.println();
        tf_idf(all_tf, idfs);
       
    }

}
分享到:
评论

相关推荐

    tf-idf.zip_Information Retrival_python IR_python TF-IDF_tf-idf

    TF-IDF(Term Frequency-Inverse Document Frequency)是一种在信息检索和文本挖掘领域广泛使用的统计方法,用于评估一个词在文档中的重要性。它基于两个概念:词频(Term Frequency, TF)和逆文档频率(Inverse ...

    基于特定语料库的TF-IDF的中文关键词提取

    用户可能需要进一步了解代码结构,学习如何加载特定语料库,如何进行预处理,如何计算TF-IDF值,以及如何输出和筛选关键词。这涉及到Python编程、NLP库的使用(如nltk、gensim等),以及数据分析的基本技能。 总之...

    基于Python实现TF-IDF矩阵表示(人工智能实验)【100011921】

    总结,Python中的TF-IDF矩阵表示涉及到理解TF-IDF算法原理,使用`sklearn`库进行数据预处理和转换,并通过文件输出和分析结果来加深对算法的理解。这个实验不仅锻炼了编程技巧,也对信息检索和文本挖掘有了深入的...

    使用python进行朴素贝叶斯的数据分析,使用TF-IDF方法整理数据

    这个类会将文本转换为TF-IDF向量,从而可以输入到机器学习模型中,如朴素贝叶斯分类器。 **使用TF-IDF和朴素贝叶斯进行数据分析步骤** 1. **数据预处理**:清洗文本,去除无关字符,如标点符号、数字等,并进行...

    TF-IDF计算程序

    TF-IDF(Term Frequency-Inverse Document Frequency)是一种在信息检索和文本挖掘中广泛使用的统计方法,用于评估一个词在文档中的重要性。它基于两个主要概念:词频(Term Frequency, TF)和逆文档频率(Inverse ...

    GetFileTimes.rar_IF-IDF_TF_java TF-IDF_tf idf_tf idf java

    在压缩包内的文件中,"GetFileTimes.java"很可能是实现TF-IDF算法的主要源代码文件,可能包括读取文本、计算词频、计算IDF值以及生成输出等功能。而"www.pudn.com.txt"则可能是一个示例文本文件,用于测试代码,这个...

    NLP技术使用TF-IDF将文本数据转换为特征向量表示然后使用支持向量机SVM进行文本分类

    ### NLP技术使用TF-IDF将文本数据转换为特征向量表示然后使用支持向量机SVM进行文本分类 #### 知识点概览 本文将深入探讨如何使用TF-IDF(Term Frequency-Inverse Document Frequency)对文本数据进行特征提取,并...

    TF-IDF.rar_TFIDF 排序_java tfidf_tf-idf_tfidf_tfidf排序

    TF-IDF(Term Frequency-Inverse Document Frequency)是一种在信息检索和文本挖掘领域广泛使用的权重计算方法,用于评估一个词在文档中的重要性。这个概念基于两个原则:词频(Term Frequency,TF)和逆文档频率...

    IF-IDF算法(Python实现)

    **IF-IDF算法** IF-IDF(Informational Frequency-Inverse Document Frequency)是一种在文本挖掘和信息检索领域广泛使用...通过结合TF-IDF和搜索技术,我们可以有效地处理大量文本数据,提升信息检索的准确性和效率。

    Hadoop MapReduce实现tfidf源码

    本篇文章将详细讲解如何利用Hadoop MapReduce实现TF-IDF(Term Frequency-Inverse Document Frequency)算法,这是一种在信息检索和文本挖掘中用于评估一个词在文档中的重要性的统计方法。 首先,我们要理解TF-IDF...

    使用Python和TF-IDF算法进行关键词提取

    在这个例子中,`TfidfVectorizer`首先创建了一个TF-IDF向量化器,然后使用`fit_transform`方法对语料库中的文本进行向量化处理,最后输出每个文档的关键词及其对应的TF-IDF值。 TF-IDF算法在文本挖掘、信息检索、...

    实用的tf-idf代码

    其中,tf-idf(Term Frequency-Inverse Document Frequency)是一种常用的文本特征提取方法,它通过计算词频(tf)和逆文档频率(idf)来为每个单词赋予权重,以此来反映单词对整个文档的重要性。 #### 二、tf-idf...

    基于MapReduce的TF-IDF统计.zip

    标题中的“基于MapReduce的TF-IDF统计”指的是在大数据处理场景中,使用Hadoop的MapReduce框架来计算文本数据的TF-IDF值。TF-IDF(Term Frequency-Inverse Document Frequency)是一种常用的文本特征提取方法,它能...

    TF-IDF:NLP中的TF_IDF的公式,并与Sklearn中的结果进行比较

    TF-IDF(Term Frequency-Inverse Document Frequency)是自然语言处理(NLP)中一个重要的文本表示方法,用于评估一个词在文档集中的重要性。它综合考虑了词频(Term Frequency, TF)和逆文档频率(Inverse Document...

    人工智能-文本分类-基于tf-idf+SVM的长文本分类、基于textCNN的短文本分类

    总结,这个项目涵盖了从传统机器学习方法(tf-idf+SVM)到深度学习方法(textCNN)的文本分类技术,展示了在处理长文本和短文本时的不同策略。长文本分类更多依赖于词汇的统计特性,而短文本分类则利用了深度学习...

    基于 TF-IDF 的文本分类(txtClassify.py)

    tfidf_train()函数用于训练TF-IDF特征提取器,并将特征提取器保存到磁盘。 tfidf_test()函数用于加载保存在磁盘上的TF-IDF特征提取器,并使用它来处理测试数据。 svm_grid()函数用于使用网格搜索法寻找最佳的支持...

    python_tf-idf.rar

    TF-IDF(Term Frequency-Inverse Document Frequency)是一种在信息检索和自然语言处理中广泛使用的文本表示方法。它通过衡量一个词在文档中的频率(Term Frequency)以及在整个文档集合中的逆文档频率(Inverse ...

    python TF-IDF算法实现文本关键词提取

    在我们得到词频(TF)和逆文档频率(IDF)以后,将两个值相乘,即可得到一个词的TF-IDF值,某个词对文章的重要性越高,其TF-IDF值就越大,所以排在最前面的几个词就是文章的关键词。 TF-IDF算法的优点是简单快速,...

    TFIDF算法 java实现

    TF-IDF(Term Frequency-Inverse Document Frequency)是一种常用于信息检索与文本挖掘中的权重计算公式。它通过统计单词在文档中出现的频率以及在整个文集中的频率来评估一个词对于一篇文档的重要程度。 **TF-IDF*...

Global site tag (gtag.js) - Google Analytics