- 浏览: 33221 次
最新评论
import java.io.*;
import java.util.*;
import org.wltea.analyzer.lucene.IKAnalyzer;
/**
 * Small TF-IDF demo: walks a directory tree, segments every file into words
 * with IKAnalyzer, and computes term frequency (TF), inverse document
 * frequency (IDF) and TF-IDF weights.
 *
 * <p>Every stage echoes its results to standard output and appends them to a
 * report file ({@code d:\DF.txt}, {@code d:\IDF.txt}, {@code d:\TF-IDF.txt}).
 * Report files are best-effort: a failure to write one is reported but never
 * changes the values returned to the caller.
 *
 * <p>All state is static and unsynchronized.
 */
public class ReadFiles {
    /** Append-mode report file for per-document term frequencies. */
    private static final String TF_OUTPUT_PATH = "d:\\DF.txt";
    /** Append-mode report file for inverse document frequencies. */
    private static final String IDF_OUTPUT_PATH = "d:\\IDF.txt";
    /** Append-mode report file for the final TF-IDF weights. */
    private static final String TFIDF_OUTPUT_PATH = "d:\\TF-IDF.txt";

    /** Absolute paths of every file discovered so far, in discovery order. */
    private static final ArrayList<String> fileList = new ArrayList<String>();
    /** Same paths as {@link #fileList}; used to reject duplicates in O(1). */
    private static final Set<String> knownFiles = new HashSet<String>();

    /**
     * Collects the files under a directory, including all sub-directories.
     *
     * <p>Results accumulate in a class-wide list across calls; a file that was
     * already collected is not added a second time.
     *
     * @param filepath directory to scan
     * @return the shared, live list of absolute file paths collected so far
     * @throws FileNotFoundException declared for source compatibility with existing callers
     * @throws IOException declared for source compatibility with existing callers
     */
    public static List<String> readDirs(String filepath) throws FileNotFoundException, IOException
    {
        File root = new File(filepath);
        if (!root.isDirectory()) {
            System.out.println("输入的[]");
            System.out.println("filepath:" + root.getAbsolutePath());
            return fileList;
        }
        collectFiles(root);
        return fileList;
    }

    /** Recursively adds every regular file below {@code dir} to the shared list. */
    private static void collectFiles(File dir) {
        String[] names = dir.list();
        if (names == null) {
            // File.list() returns null when the directory cannot be read.
            return;
        }
        for (String name : names) {
            // File(parent, child) picks the platform separator instead of a hard-coded "\\".
            File child = new File(dir, name);
            if (child.isDirectory()) {
                collectFiles(child);
            } else {
                String path = child.getAbsolutePath();
                if (knownFiles.add(path)) {
                    fileList.add(path);
                }
            }
        }
    }

    /**
     * Reads a whole GBK-encoded text file.
     *
     * @param file path of the file to read
     * @return the file content with every line terminated by {@code "\r\n"}
     * @throws FileNotFoundException if the file cannot be opened
     * @throws IOException if reading fails
     */
    public static String readFile(String file) throws FileNotFoundException, IOException
    {
        StringBuilder text = new StringBuilder();
        InputStream in = new FileInputStream(file);
        try {
            BufferedReader reader = new BufferedReader(new InputStreamReader(in, "gbk"));
            String line;
            while ((line = reader.readLine()) != null) {
                text.append(line).append("\r\n");
            }
        } finally {
            // Always release the file handle, even when decoding or reading fails.
            in.close();
        }
        return text.toString();
    }

    /**
     * Segments the content of a file into words.
     *
     * @param file path of the file to segment
     * @return the words of the file, in the order produced by the analyzer
     * @throws IOException if the file cannot be read
     */
    public static ArrayList<String> cutWords(String file) throws IOException{
        String text = ReadFiles.readFile(file);
        IKAnalyzer analyzer = new IKAnalyzer();
        // NOTE(review): split(String) is presumably provided by the IKAnalyzer
        // build this project uses; its exact contract is not visible here - confirm.
        ArrayList<String> words = analyzer.split(text);
        return words;
    }

    /**
     * Counts how many times each word occurs. Every word is echoed to standard
     * output as it is counted.
     *
     * @param cutwords the words of one document
     * @return map from word to its number of occurrences
     */
    public static HashMap<String, Integer> normalTF(ArrayList<String> cutwords){
        HashMap<String, Integer> resTF = new HashMap<String, Integer>();
        for (String word : cutwords) {
            Integer seen = resTF.get(word);
            resTF.put(word, seen == null ? 1 : seen + 1);
            System.out.println(word);
        }
        return resTF;
    }

    /**
     * Computes the relative frequency of each word in one document
     * (occurrences divided by the total number of words).
     *
     * <p>The result is printed and appended to the TF report file. The returned
     * map is complete even when the report file cannot be written.
     *
     * @param cutwords the words of one document
     * @return map from word to its relative frequency
     */
    public static HashMap<String, Float> tf(ArrayList<String> cutwords){
        HashMap<String, Float> resTF = new HashMap<String, Float>();
        int wordLen = cutwords.size();
        HashMap<String, Integer> counts = ReadFiles.normalTF(cutwords);
        StringBuilder report = new StringBuilder();
        for (Map.Entry<String, Integer> entry : counts.entrySet()) {
            float frequency = entry.getValue().floatValue() / wordLen;
            resTF.put(entry.getKey(), frequency);
            String line = entry.getKey() + " = " + frequency;
            System.out.println(line);
            report.append(line).append("\r\n");
        }
        // Written after the computation so an I/O failure cannot truncate the result.
        appendToFile(TF_OUTPUT_PATH, report);
        return resTF;
    }

    /**
     * Computes raw word counts for every file under a directory.
     *
     * @param dirc directory to scan
     * @return map from absolute file path to that file's word counts
     * @throws IOException if a file cannot be read
     */
    public static HashMap<String, HashMap<String, Integer>> normalTFAllFiles(String dirc) throws IOException{
        HashMap<String, HashMap<String, Integer>> allNormalTF = new HashMap<String, HashMap<String,Integer>>();
        for (String file : ReadFiles.readDirs(dirc)) {
            allNormalTF.put(file, ReadFiles.normalTF(ReadFiles.cutWords(file)));
        }
        return allNormalTF;
    }

    /**
     * Computes relative word frequencies for every file under a directory.
     *
     * @param dirc directory to scan
     * @return map from absolute file path to that file's term frequencies
     * @throws IOException if a file cannot be read
     */
    public static HashMap<String,HashMap<String, Float>> tfAllFiles(String dirc) throws IOException{
        HashMap<String, HashMap<String, Float>> allTF = new HashMap<String, HashMap<String, Float>>();
        for (String file : ReadFiles.readDirs(dirc)) {
            allTF.put(file, ReadFiles.tf(ReadFiles.cutWords(file)));
        }
        return allTF;
    }

    /**
     * Computes the inverse document frequency of every word:
     * {@code ln(numberOfDocuments / documentsContainingWord)}.
     *
     * <p>The documents are exactly the entries of {@code all_tf}. The result is
     * printed and appended to the IDF report file; it is returned even when the
     * report file cannot be written.
     *
     * @param all_tf map from document to its term frequencies
     * @return map from word to its IDF value
     */
    public static HashMap<String, Float> idf(HashMap<String,HashMap<String, Float>> all_tf){
        HashMap<String, Float> resIdf = new HashMap<String, Float>();
        HashMap<String, Integer> docFreq = new HashMap<String, Integer>();
        int docNum = all_tf.size();
        for (HashMap<String, Float> termFreqs : all_tf.values()) {
            for (String word : termFreqs.keySet()) {
                Integer seen = docFreq.get(word);
                docFreq.put(word, seen == null ? 1 : seen + 1);
            }
        }
        System.out.println("IDF for every word is:");
        StringBuilder report = new StringBuilder();
        for (Map.Entry<String, Integer> entry : docFreq.entrySet()) {
            float value = (float) Math.log(docNum / entry.getValue().floatValue());
            resIdf.put(entry.getKey(), value);
            String line = entry.getKey() + " = " + value;
            System.out.println(line);
            report.append(line).append("\r\n");
        }
        if (!appendToFile(IDF_OUTPUT_PATH, report)) {
            System.out.println("Error");
        }
        return resIdf;
    }

    /**
     * Multiplies every term frequency by the word's IDF and hands the result
     * to {@link #DisTfIdf(HashMap)} for output.
     *
     * @param all_tf map from document to its term frequencies
     * @param idfs map from word to its IDF value
     * @throws IllegalArgumentException if {@code idfs} is null or lacks a word
     *         that occurs in {@code all_tf}
     */
    public static void tf_idf(HashMap<String,HashMap<String, Float>> all_tf,HashMap<String, Float> idfs){
        if (idfs == null) {
            throw new IllegalArgumentException("idfs must not be null");
        }
        HashMap<String, HashMap<String, Float>> resTfIdf = new HashMap<String, HashMap<String, Float>>();
        for (Map.Entry<String, HashMap<String, Float>> doc : all_tf.entrySet()) {
            HashMap<String, Float> weights = new HashMap<String, Float>();
            for (Map.Entry<String, Float> term : doc.getValue().entrySet()) {
                String word = term.getKey();
                Float idfValue = idfs.get(word);
                if (idfValue == null) {
                    // Fail with a message instead of an anonymous unboxing NPE.
                    throw new IllegalArgumentException(
                            "No IDF value for word \"" + word + "\" in " + doc.getKey());
                }
                weights.put(word, term.getValue() * idfValue);
            }
            resTfIdf.put(doc.getKey(), weights);
        }
        System.out.println("TF-IDF for Every file is :");
        DisTfIdf(resTfIdf);
    }

    /**
     * Prints the TF-IDF weights of every document and appends one
     * {@code {word = weight, ...}} line per document to the TF-IDF report file.
     *
     * @param tfidf map from document to its TF-IDF weights
     */
    public static void DisTfIdf(HashMap<String, HashMap<String, Float>> tfidf){
        StringBuilder report = new StringBuilder();
        for (Map.Entry<String, HashMap<String, Float>> doc : tfidf.entrySet()) {
            System.out.println("FileName: " + doc.getKey());
            System.out.print("{");
            // Fresh buffer per document so one document's terms never leak into the next line.
            StringBuilder line = new StringBuilder();
            for (Map.Entry<String, Float> term : doc.getValue().entrySet()) {
                String pair = term.getKey() + " = " + term.getValue();
                System.out.print(pair + ", ");
                if (line.length() > 0) {
                    line.append(", ");
                }
                line.append(pair);
            }
            System.out.println("}");
            report.append("{").append(line).append("}").append("\r\n");
        }
        if (!appendToFile(TFIDF_OUTPUT_PATH, report)) {
            System.out.println("error!");
        }
    }

    /**
     * Appends {@code content} to the file at {@code path}, always closing the
     * writer. Failures are reported on standard error.
     *
     * @return {@code true} if the content was written and the file closed cleanly
     */
    private static boolean appendToFile(String path, CharSequence content) {
        Writer writer = null;
        boolean ok = true;
        try {
            writer = new FileWriter(path, true);
            writer.write(content.toString());
        } catch (IOException e) {
            ok = false;
            System.err.println("Failed to write " + path + ": " + e);
        } finally {
            if (writer != null) {
                try {
                    writer.close();
                } catch (IOException e) {
                    ok = false;
                    System.err.println("Failed to close " + path + ": " + e);
                }
            }
        }
        return ok;
    }

    /**
     * Runs the full TF / IDF / TF-IDF pipeline.
     *
     * @param args optional; {@code args[0]} is the corpus directory
     *        (defaults to {@code D:/testfiles})
     * @throws IOException if a corpus file cannot be read
     */
    public static void main(String[] args) throws IOException {
        String file = args.length > 0 ? args[0] : "D:/testfiles";
        HashMap<String,HashMap<String, Float>> all_tf = tfAllFiles(file);
        System.out.println();
        HashMap<String, Float> idfs = idf(all_tf);
        System.out.println();
        tf_idf(all_tf, idfs);
    }
}
import java.util.*;
import org.wltea.analyzer.lucene.IKAnalyzer;
/**
 * Small TF-IDF demo: walks a directory tree, segments every file into words
 * with IKAnalyzer, and computes term frequency (TF), inverse document
 * frequency (IDF) and TF-IDF weights.
 *
 * <p>Every stage echoes its results to standard output and appends them to a
 * report file ({@code d:\DF.txt}, {@code d:\IDF.txt}, {@code d:\TF-IDF.txt}).
 * Report files are best-effort: a failure to write one is reported but never
 * changes the values returned to the caller.
 *
 * <p>All state is static and unsynchronized.
 */
public class ReadFiles {
    /** Append-mode report file for per-document term frequencies. */
    private static final String TF_OUTPUT_PATH = "d:\\DF.txt";
    /** Append-mode report file for inverse document frequencies. */
    private static final String IDF_OUTPUT_PATH = "d:\\IDF.txt";
    /** Append-mode report file for the final TF-IDF weights. */
    private static final String TFIDF_OUTPUT_PATH = "d:\\TF-IDF.txt";

    /** Absolute paths of every file discovered so far, in discovery order. */
    private static final ArrayList<String> fileList = new ArrayList<String>();
    /** Same paths as {@link #fileList}; used to reject duplicates in O(1). */
    private static final Set<String> knownFiles = new HashSet<String>();

    /**
     * Collects the files under a directory, including all sub-directories.
     *
     * <p>Results accumulate in a class-wide list across calls; a file that was
     * already collected is not added a second time.
     *
     * @param filepath directory to scan
     * @return the shared, live list of absolute file paths collected so far
     * @throws FileNotFoundException declared for source compatibility with existing callers
     * @throws IOException declared for source compatibility with existing callers
     */
    public static List<String> readDirs(String filepath) throws FileNotFoundException, IOException
    {
        File root = new File(filepath);
        if (!root.isDirectory()) {
            System.out.println("输入的[]");
            System.out.println("filepath:" + root.getAbsolutePath());
            return fileList;
        }
        collectFiles(root);
        return fileList;
    }

    /** Recursively adds every regular file below {@code dir} to the shared list. */
    private static void collectFiles(File dir) {
        String[] names = dir.list();
        if (names == null) {
            // File.list() returns null when the directory cannot be read.
            return;
        }
        for (String name : names) {
            // File(parent, child) picks the platform separator instead of a hard-coded "\\".
            File child = new File(dir, name);
            if (child.isDirectory()) {
                collectFiles(child);
            } else {
                String path = child.getAbsolutePath();
                if (knownFiles.add(path)) {
                    fileList.add(path);
                }
            }
        }
    }

    /**
     * Reads a whole GBK-encoded text file.
     *
     * @param file path of the file to read
     * @return the file content with every line terminated by {@code "\r\n"}
     * @throws FileNotFoundException if the file cannot be opened
     * @throws IOException if reading fails
     */
    public static String readFile(String file) throws FileNotFoundException, IOException
    {
        StringBuilder text = new StringBuilder();
        InputStream in = new FileInputStream(file);
        try {
            BufferedReader reader = new BufferedReader(new InputStreamReader(in, "gbk"));
            String line;
            while ((line = reader.readLine()) != null) {
                text.append(line).append("\r\n");
            }
        } finally {
            // Always release the file handle, even when decoding or reading fails.
            in.close();
        }
        return text.toString();
    }

    /**
     * Segments the content of a file into words.
     *
     * @param file path of the file to segment
     * @return the words of the file, in the order produced by the analyzer
     * @throws IOException if the file cannot be read
     */
    public static ArrayList<String> cutWords(String file) throws IOException{
        String text = ReadFiles.readFile(file);
        IKAnalyzer analyzer = new IKAnalyzer();
        // NOTE(review): split(String) is presumably provided by the IKAnalyzer
        // build this project uses; its exact contract is not visible here - confirm.
        ArrayList<String> words = analyzer.split(text);
        return words;
    }

    /**
     * Counts how many times each word occurs. Every word is echoed to standard
     * output as it is counted.
     *
     * @param cutwords the words of one document
     * @return map from word to its number of occurrences
     */
    public static HashMap<String, Integer> normalTF(ArrayList<String> cutwords){
        HashMap<String, Integer> resTF = new HashMap<String, Integer>();
        for (String word : cutwords) {
            Integer seen = resTF.get(word);
            resTF.put(word, seen == null ? 1 : seen + 1);
            System.out.println(word);
        }
        return resTF;
    }

    /**
     * Computes the relative frequency of each word in one document
     * (occurrences divided by the total number of words).
     *
     * <p>The result is printed and appended to the TF report file. The returned
     * map is complete even when the report file cannot be written.
     *
     * @param cutwords the words of one document
     * @return map from word to its relative frequency
     */
    public static HashMap<String, Float> tf(ArrayList<String> cutwords){
        HashMap<String, Float> resTF = new HashMap<String, Float>();
        int wordLen = cutwords.size();
        HashMap<String, Integer> counts = ReadFiles.normalTF(cutwords);
        StringBuilder report = new StringBuilder();
        for (Map.Entry<String, Integer> entry : counts.entrySet()) {
            float frequency = entry.getValue().floatValue() / wordLen;
            resTF.put(entry.getKey(), frequency);
            String line = entry.getKey() + " = " + frequency;
            System.out.println(line);
            report.append(line).append("\r\n");
        }
        // Written after the computation so an I/O failure cannot truncate the result.
        appendToFile(TF_OUTPUT_PATH, report);
        return resTF;
    }

    /**
     * Computes raw word counts for every file under a directory.
     *
     * @param dirc directory to scan
     * @return map from absolute file path to that file's word counts
     * @throws IOException if a file cannot be read
     */
    public static HashMap<String, HashMap<String, Integer>> normalTFAllFiles(String dirc) throws IOException{
        HashMap<String, HashMap<String, Integer>> allNormalTF = new HashMap<String, HashMap<String,Integer>>();
        for (String file : ReadFiles.readDirs(dirc)) {
            allNormalTF.put(file, ReadFiles.normalTF(ReadFiles.cutWords(file)));
        }
        return allNormalTF;
    }

    /**
     * Computes relative word frequencies for every file under a directory.
     *
     * @param dirc directory to scan
     * @return map from absolute file path to that file's term frequencies
     * @throws IOException if a file cannot be read
     */
    public static HashMap<String,HashMap<String, Float>> tfAllFiles(String dirc) throws IOException{
        HashMap<String, HashMap<String, Float>> allTF = new HashMap<String, HashMap<String, Float>>();
        for (String file : ReadFiles.readDirs(dirc)) {
            allTF.put(file, ReadFiles.tf(ReadFiles.cutWords(file)));
        }
        return allTF;
    }

    /**
     * Computes the inverse document frequency of every word:
     * {@code ln(numberOfDocuments / documentsContainingWord)}.
     *
     * <p>The documents are exactly the entries of {@code all_tf}. The result is
     * printed and appended to the IDF report file; it is returned even when the
     * report file cannot be written.
     *
     * @param all_tf map from document to its term frequencies
     * @return map from word to its IDF value
     */
    public static HashMap<String, Float> idf(HashMap<String,HashMap<String, Float>> all_tf){
        HashMap<String, Float> resIdf = new HashMap<String, Float>();
        HashMap<String, Integer> docFreq = new HashMap<String, Integer>();
        int docNum = all_tf.size();
        for (HashMap<String, Float> termFreqs : all_tf.values()) {
            for (String word : termFreqs.keySet()) {
                Integer seen = docFreq.get(word);
                docFreq.put(word, seen == null ? 1 : seen + 1);
            }
        }
        System.out.println("IDF for every word is:");
        StringBuilder report = new StringBuilder();
        for (Map.Entry<String, Integer> entry : docFreq.entrySet()) {
            float value = (float) Math.log(docNum / entry.getValue().floatValue());
            resIdf.put(entry.getKey(), value);
            String line = entry.getKey() + " = " + value;
            System.out.println(line);
            report.append(line).append("\r\n");
        }
        if (!appendToFile(IDF_OUTPUT_PATH, report)) {
            System.out.println("Error");
        }
        return resIdf;
    }

    /**
     * Multiplies every term frequency by the word's IDF and hands the result
     * to {@link #DisTfIdf(HashMap)} for output.
     *
     * @param all_tf map from document to its term frequencies
     * @param idfs map from word to its IDF value
     * @throws IllegalArgumentException if {@code idfs} is null or lacks a word
     *         that occurs in {@code all_tf}
     */
    public static void tf_idf(HashMap<String,HashMap<String, Float>> all_tf,HashMap<String, Float> idfs){
        if (idfs == null) {
            throw new IllegalArgumentException("idfs must not be null");
        }
        HashMap<String, HashMap<String, Float>> resTfIdf = new HashMap<String, HashMap<String, Float>>();
        for (Map.Entry<String, HashMap<String, Float>> doc : all_tf.entrySet()) {
            HashMap<String, Float> weights = new HashMap<String, Float>();
            for (Map.Entry<String, Float> term : doc.getValue().entrySet()) {
                String word = term.getKey();
                Float idfValue = idfs.get(word);
                if (idfValue == null) {
                    // Fail with a message instead of an anonymous unboxing NPE.
                    throw new IllegalArgumentException(
                            "No IDF value for word \"" + word + "\" in " + doc.getKey());
                }
                weights.put(word, term.getValue() * idfValue);
            }
            resTfIdf.put(doc.getKey(), weights);
        }
        System.out.println("TF-IDF for Every file is :");
        DisTfIdf(resTfIdf);
    }

    /**
     * Prints the TF-IDF weights of every document and appends one
     * {@code {word = weight, ...}} line per document to the TF-IDF report file.
     *
     * @param tfidf map from document to its TF-IDF weights
     */
    public static void DisTfIdf(HashMap<String, HashMap<String, Float>> tfidf){
        StringBuilder report = new StringBuilder();
        for (Map.Entry<String, HashMap<String, Float>> doc : tfidf.entrySet()) {
            System.out.println("FileName: " + doc.getKey());
            System.out.print("{");
            // Fresh buffer per document so one document's terms never leak into the next line.
            StringBuilder line = new StringBuilder();
            for (Map.Entry<String, Float> term : doc.getValue().entrySet()) {
                String pair = term.getKey() + " = " + term.getValue();
                System.out.print(pair + ", ");
                if (line.length() > 0) {
                    line.append(", ");
                }
                line.append(pair);
            }
            System.out.println("}");
            report.append("{").append(line).append("}").append("\r\n");
        }
        if (!appendToFile(TFIDF_OUTPUT_PATH, report)) {
            System.out.println("error!");
        }
    }

    /**
     * Appends {@code content} to the file at {@code path}, always closing the
     * writer. Failures are reported on standard error.
     *
     * @return {@code true} if the content was written and the file closed cleanly
     */
    private static boolean appendToFile(String path, CharSequence content) {
        Writer writer = null;
        boolean ok = true;
        try {
            writer = new FileWriter(path, true);
            writer.write(content.toString());
        } catch (IOException e) {
            ok = false;
            System.err.println("Failed to write " + path + ": " + e);
        } finally {
            if (writer != null) {
                try {
                    writer.close();
                } catch (IOException e) {
                    ok = false;
                    System.err.println("Failed to close " + path + ": " + e);
                }
            }
        }
        return ok;
    }

    /**
     * Runs the full TF / IDF / TF-IDF pipeline.
     *
     * @param args optional; {@code args[0]} is the corpus directory
     *        (defaults to {@code D:/testfiles})
     * @throws IOException if a corpus file cannot be read
     */
    public static void main(String[] args) throws IOException {
        String file = args.length > 0 ? args[0] : "D:/testfiles";
        HashMap<String,HashMap<String, Float>> all_tf = tfAllFiles(file);
        System.out.println();
        HashMap<String, Float> idfs = idf(all_tf);
        System.out.println();
        tf_idf(all_tf, idfs);
    }
}
相关推荐
TF-IDF(Term Frequency-Inverse Document Frequency)是一种在信息检索和文本挖掘领域广泛使用的统计方法,用于评估一个词在文档中的重要性。它基于两个概念:词频(Term Frequency, TF)和逆文档频率(Inverse ...
用户可能需要进一步了解代码结构,学习如何加载特定语料库,如何进行预处理,如何计算TF-IDF值,以及如何输出和筛选关键词。这涉及到Python编程、NLP库的使用(如nltk、gensim等),以及数据分析的基本技能。 总之...
总结,Python中的TF-IDF矩阵表示涉及到理解TF-IDF算法原理,使用`sklearn`库进行数据预处理和转换,并通过文件输出和分析结果来加深对算法的理解。这个实验不仅锻炼了编程技巧,也对信息检索和文本挖掘有了深入的...
这个类会将文本转换为TF-IDF向量,从而可以输入到机器学习模型中,如朴素贝叶斯分类器。 **使用TF-IDF和朴素贝叶斯进行数据分析步骤** 1. **数据预处理**:清洗文本,去除无关字符,如标点符号、数字等,并进行...
TF-IDF(Term Frequency-Inverse Document Frequency)是一种在信息检索和文本挖掘中广泛使用的统计方法,用于评估一个词在文档中的重要性。它基于两个主要概念:词频(Term Frequency, TF)和逆文档频率(Inverse ...
在压缩包内的文件中,"GetFileTimes.java"很可能是实现TF-IDF算法的主要源代码文件,可能包括读取文本、计算词频、计算IDF值以及生成输出等功能。而"www.pudn.com.txt"则可能是一个示例文本文件,用于测试代码,这个...
### NLP技术使用TF-IDF将文本数据转换为特征向量表示然后使用支持向量机SVM进行文本分类 #### 知识点概览 本文将深入探讨如何使用TF-IDF(Term Frequency-Inverse Document Frequency)对文本数据进行特征提取,并...
TF-IDF(Term Frequency-Inverse Document Frequency)是一种在信息检索和文本挖掘领域广泛使用的权重计算方法,用于评估一个词在文档中的重要性。这个概念基于两个原则:词频(Term Frequency,TF)和逆文档频率...
**TF-IDF算法** TF-IDF(Term Frequency-Inverse Document Frequency)是一种在文本挖掘和信息检索领域广泛使用...通过结合TF-IDF和搜索技术,我们可以有效地处理大量文本数据,提升信息检索的准确性和效率。
本篇文章将详细讲解如何利用Hadoop MapReduce实现TF-IDF(Term Frequency-Inverse Document Frequency)算法,这是一种在信息检索和文本挖掘中用于评估一个词在文档中的重要性的统计方法。 首先,我们要理解TF-IDF...
在这个例子中,`TfidfVectorizer`首先创建了一个TF-IDF向量化器,然后使用`fit_transform`方法对语料库中的文本进行向量化处理,最后输出每个文档的关键词及其对应的TF-IDF值。 TF-IDF算法在文本挖掘、信息检索、...
其中,tf-idf(Term Frequency-Inverse Document Frequency)是一种常用的文本特征提取方法,它通过计算词频(tf)和逆文档频率(idf)来为每个单词赋予权重,以此来反映单词对整个文档的重要性。 #### 二、tf-idf...
标题中的“基于MapReduce的TF-IDF统计”指的是在大数据处理场景中,使用Hadoop的MapReduce框架来计算文本数据的TF-IDF值。TF-IDF(Term Frequency-Inverse Document Frequency)是一种常用的文本特征提取方法,它能...
TF-IDF(Term Frequency-Inverse Document Frequency)是自然语言处理(NLP)中一个重要的文本表示方法,用于评估一个词在文档集中的重要性。它综合考虑了词频(Term Frequency, TF)和逆文档频率(Inverse Document...
总结,这个项目涵盖了从传统机器学习方法(tf-idf+SVM)到深度学习方法(textCNN)的文本分类技术,展示了在处理长文本和短文本时的不同策略。长文本分类更多依赖于词汇的统计特性,而短文本分类则利用了深度学习...
tfidf_train()函数用于训练TF-IDF特征提取器,并将特征提取器保存到磁盘。 tfidf_test()函数用于加载保存在磁盘上的TF-IDF特征提取器,并使用它来处理测试数据。 svm_grid()函数用于使用网格搜索法寻找最佳的支持...
TF-IDF(Term Frequency-Inverse Document Frequency)是一种在信息检索和自然语言处理中广泛使用的文本表示方法。它通过衡量一个词在文档中的频率(Term Frequency)以及在整个文档集合中的逆文档频率(Inverse ...
在我们得到词频(TF)和逆文档频率(IDF)以后,将两个值相乘,即可得到一个词的TF-IDF值,某个词对文章的重要性越高,其TF-IDF值就越大,所以排在最前面的几个词就是文章的关键词。 TF-IDF算法的优点是简单快速,...
TF-IDF(Term Frequency-Inverse Document Frequency)是一种常用于信息检索与文本挖掘中的权重计算公式。它通过统计单词在文档中出现的频率以及在整个文集中的频率来评估一个词对于一篇文档的重要程度。 **TF-IDF*...