TF-IDF

ESA
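
The InvertedIndex class below builds an inverted index over a directory of HTML files and weights every posting with the standard tf-idf score, tf-idf(t, d) = tf(t, d) · log(|D| / |{d ∈ D : t ∈ d}|); ESA (Explicit Semantic Analysis) builds its concept vectors on exactly this kind of tf-idf-weighted inverted index. Note that the tfidf() method below uses Math.log, the natural logarithm: with |D| = 10 files and a term that occurs 3 times in a document and appears in 2 of the 10 files, its weight in that document is 3 · ln(10/2) ≈ 4.83.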

package com.sap.research.semantic;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import java.util.Properties;

import org.jdom.Document;
import org.jdom.JDOMException;
import org.jdom.Text;
import org.jdom.xpath.XPath;

import com.sap.research.basicio.reader.DirectoryLister;
import com.sap.research.basicio.reader.FileInputReader;
import com.sap.research.td.Parameters;
import com.sap.research.td.elements.TaggedToken;
import com.sap.research.td.html.JDOMParser;
import com.sap.research.td.html.XPathExtractor;
import com.sap.research.td.preproc.StopWordsFilter;
import com.sap.research.td.preprocessing.nlp.TaggedStemmerTokenizer;
import com.sap.research.td.preprocessing.nlp.Tokenizer;
import com.sap.research.util.Pair;

public class InvertedIndex {

	private static final java.text.SimpleDateFormat sdf = new java.text.SimpleDateFormat(
			"dd.MM.yyyy HH.mm.ss");

	private XPath titlePattern = null;
	private XPath contentPattern = null;
	private HashMap<String, ArrayList<Pair<String, Double>>> invertedIndicesDouble;

	public HashMap<String, ArrayList<Pair<String, Double>>> getInvertedIndicesDouble() {
		return invertedIndicesDouble;
	}

	public void setInvertedIndicesDouble(
			HashMap<String, ArrayList<Pair<String, Double>>> invertedIndicesDouble) {
		this.invertedIndicesDouble = invertedIndicesDouble;
	}

	public InvertedIndex(String propertyFilePath) {
		getParameters(propertyFilePath);
		invertedIndicesDouble = new HashMap<String, ArrayList<Pair<String, Double>>>();
	}

	/**
	 * Tokenizes, POS-filters and stems raw content with the Inxight LinguistX
	 * tagger and returns the kept tokens separated by single spaces.
	 */
	public String tokenizeFile(String content) {

		StringBuilder tokenizedContent = new StringBuilder();
		String workingDir = "nlp/linguistXPlatform/windows-2003-i686-msvc-7.1-32";
		String inxightExec = "nlp/linguistXPlatform/windows-2003-i686-msvc-7.1-32/test_platform.exe";
		// keep only nouns, proper nouns, abbreviations and adjectives
		String posFilter = "(Nn|Prop|Abbr|Adj).*";
		String languageOptions = "-l english";

		Tokenizer tokenizer = new TaggedStemmerTokenizer(workingDir,
				inxightExec, null, posFilter, languageOptions, false, null);
		tokenizer.init(content, "TestDoc");
		for (Pair<String, TaggedToken> p : tokenizer) {
			if (p.first != null) {
				tokenizedContent.append(p.first).append(' ');
			}
		}

		return tokenizedContent.toString();
	}

	public void getParameters(String propertyFilePath) {
		Properties properties = new Properties();
		try {
			properties.load(new FileInputStream(propertyFilePath));
		} catch (FileNotFoundException e) {
			e.printStackTrace();
			System.err.println("Could not find property file!");
			System.exit(1);
		} catch (IOException e) {
			e.printStackTrace();
			System.err.println("Could not read property file!");
			System.exit(1);
		}

		try {
			titlePattern = XPath.newInstance(properties
					.getProperty(Parameters.TITLE_PATTERN));
		} catch (JDOMException e1) {
			e1.printStackTrace();
		}

		try {
			contentPattern = XPath.newInstance(properties
					.getProperty(Parameters.CONTENT_PATTERN));
		} catch (JDOMException e1) {
			e1.printStackTrace();
		}
	}

	public HashMap<String, Pair<String, Integer>> termCount(File file) {
		// HashMap<term, Pair<title, frequency>>
		HashMap<String, Pair<String, Integer>> indicesforDoc = new HashMap<String, Pair<String, Integer>>();

		Pair<String, String> fileContent = extractContent(file);
		String title = fileContent.first;
		String content = fileContent.second;

		HashMap<String, Integer> index = new HashMap<String, Integer>();
		ArrayList<String> plainTextStrings = StopWordsFilter
				.stringFilter(tokenizeFile(content));
		for (String string : plainTextStrings) {
			Integer frequency = index.get(string);
			index.put(string, frequency == null ? 1 : frequency + 1);
		}

		for (Map.Entry<String, Integer> entry : index.entrySet()) {
			indicesforDoc.put(entry.getKey(), new Pair<String, Integer>(title,
					entry.getValue()));
		}

		return indicesforDoc;
	}

	// Extracts (title, content) from one HTML file via the configured XPaths.
	public Pair<String, String> extractContent(File file) {

		String title = "";
		String content = "";

		try {

			String fileContent = new FileInputReader(file).getStringContent();

			XPathExtractor titleExtractor = new XPathExtractor(titlePattern);
			boolean treatUnknownTagsAsContent = false;
			JDOMParser parser = new JDOMParser(treatUnknownTagsAsContent);
			Document d = parser.HTML2JDom(fileContent, true);

			for (Text e : titleExtractor.extract(d)) {
				String normalizedTxt = e.getTextNormalize();
				if (!normalizedTxt.isEmpty()) {
					title = normalizedTxt.trim();
				}
			}

			XPathExtractor contentExtractor = new XPathExtractor(contentPattern);
			for (Text e : contentExtractor.extract(d)) {
				String normalizedTxt = e.getTextNormalize();
				if (!normalizedTxt.isEmpty()) {
					content += " " + normalizedTxt.trim();
				}
			}

		} catch (Exception e) {
			e.printStackTrace();
		}

		return new Pair<String, String>(title, content);
	}

	public void output(
			HashMap<String, ArrayList<Pair<String, Integer>>> invertedIndices) {
		ArrayList<Pair<String, Integer>> pairList = null;
		for (Map.Entry<String, ArrayList<Pair<String, Integer>>> entry : invertedIndices
				.entrySet()) {
			pairList = entry.getValue();
			String pairStr = "";
			for (Pair<String, Integer> pair : pairList) {
				pairStr += "<" + pair.first + ", " + pair.second + ">, ";
			}
			System.out.println(entry.getKey() + " = " + pairStr);
		}
	}

	public void outputDouble() {
		ArrayList<Pair<String, Double>> pairList = null;
		for (Map.Entry<String, ArrayList<Pair<String, Double>>> entry : invertedIndicesDouble
				.entrySet()) {
			pairList = entry.getValue();
			String pairStr = "";
			for (Pair<String, Double> pair : pairList) {
				pairStr += "<" + pair.first + ", " + pair.second + ">, ";
			}
			System.out.println(entry.getKey() + " = " + pairStr);
		}
	}

	// Builds the raw inverted index over all files with the given suffix and
	// converts it to tf-idf weights; returns the number of files indexed.
	public int readFile(String directory, String suffixString, boolean recursive) {
		// HashMap<term, ArrayList<Pair<title, frequency>>>
		HashMap<String, ArrayList<Pair<String, Integer>>> invertedIndices = new HashMap<String, ArrayList<Pair<String, Integer>>>();

		ArrayList<File> files = new ArrayList<File>();
		try {
			files = new DirectoryLister(directory, suffixString, recursive)
					.getContainedFiles();
		} catch (IOException e) {
			e.printStackTrace();
		}

		int numOfFiles = files.size();

		// idf(t, D) = log(|D| / |{d in D : t in d}|)
		HashMap<String, Pair<String, Integer>> tf;
		for (File file : files) {

			tf = termCount(file);

			for (Map.Entry<String, Pair<String, Integer>> entry : tf.entrySet()) {
				ArrayList<Pair<String, Integer>> postings = invertedIndices
						.get(entry.getKey());
				if (postings == null) {
					postings = new ArrayList<Pair<String, Integer>>();
					invertedIndices.put(entry.getKey(), postings);
				}
				postings.add(entry.getValue());
			}
		}
		tfidf(numOfFiles, invertedIndices);
		
		return numOfFiles;
	}

	public void tfidf(int numOfFiles, HashMap<String, ArrayList<Pair<String, Integer>>> invertedIndices) {
		for (Map.Entry<String, ArrayList<Pair<String, Integer>>> entry : invertedIndices
				.entrySet()) {
			ArrayList<Pair<String, Integer>> pairList = entry.getValue();
			ArrayList<Pair<String, Double>> pairListDouble = new ArrayList<Pair<String, Double>>();
			// pairList holds one posting per document containing the term, so
			// pairList.size() is the term's document frequency
			for (Pair<String, Integer> pair : pairList) {
				double tfidf = pair.second
						* Math.log((double) numOfFiles
								/ (double) pairList.size());
				pairListDouble.add(new Pair<String, Double>(pair.first, tfidf));
			}
			invertedIndicesDouble.put(entry.getKey(), pairListDouble);
		}
	}

	// For efficiency, tokenize each file once and cache the tokenized version
	// in a local directory.
	public void tokenizer(File inputFile) {

		String outputFileDir = "test/output/terms/caches";
		Pair<String, String> fileContent = extractContent(inputFile);

		String terms = tokenizeFile(fileContent.second);

		try {
			// mirror the parent directory of the input file inside the cache,
			// so corpora with different directory structures do not collide
			String[] strings = inputFile.toURI().toString().split("/");
			String parentDir = strings[strings.length - 2];
			outputFileDir = outputFileDir + "/" + parentDir;

			// if the cache directory does not exist yet, create it
			if (!new File(outputFileDir).isDirectory()) {
				System.out.print("...creating \"" + outputFileDir
						+ "\" cache directory...");
				new File(outputFileDir).mkdirs();
				System.out.println("successful.");
			}

			String outputFileName = inputFile.getName();
			BufferedWriter out = new BufferedWriter(new FileWriter(
					outputFileDir + "/" + outputFileName));
			// write the tokenized terms, not the raw file content
			out.write(terms);
			out.close();
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	public void addIndices(String dirPath, String fileExtension,
			boolean recursive) {
		System.out.println(sdf.format(new Date())
				+ ": Inverted-indexing started.");
		readFile(dirPath, fileExtension, recursive);
		System.out.println(sdf.format(new Date())
				+ ": Inverted-indexing ended.");
	}

	public static void main(String[] args) {
		String propertyFilePath = "settings/html.properties";
		InvertedIndex index = new InvertedIndex(propertyFilePath);
		index.addIndices("test/input/terms", "html", true);
	}

}
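
TaggedStemmerTokenizer, FileInputReader and the other com.sap.research classes are internal dependencies, so the listing above is not runnable on its own. As a sanity check, here is a minimal, self-contained sketch of the same weighting that tfidf() applies, tf · ln(N / df), over three made-up, pre-tokenized documents (all names and contents are purely illustrative):

import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;

public class TfIdfSketch {

	public static void main(String[] args) {
		// three hypothetical, already tokenized documents
		List<List<String>> docs = Arrays.asList(
				Arrays.asList("inverted", "index", "index"),
				Arrays.asList("index", "weight"),
				Arrays.asList("weight", "weight", "query"));
		int n = docs.size();

		// document frequency: in how many documents each term occurs
		Map<String, Integer> df = new HashMap<String, Integer>();
		for (List<String> doc : docs) {
			for (String term : new HashSet<String>(doc)) {
				Integer seen = df.get(term);
				df.put(term, seen == null ? 1 : seen + 1);
			}
		}

		// per-document term frequency, then tf * ln(n / df) as in tfidf() above
		for (int d = 0; d < n; d++) {
			Map<String, Integer> tf = new HashMap<String, Integer>();
			for (String term : docs.get(d)) {
				Integer seen = tf.get(term);
				tf.put(term, seen == null ? 1 : seen + 1);
			}
			for (Map.Entry<String, Integer> e : tf.entrySet()) {
				double weight = e.getValue()
						* Math.log((double) n / df.get(e.getKey()));
				System.out.printf("doc%d: %s = %.4f%n", d, e.getKey(), weight);
			}
		}
	}
}

For instance, "index" occurs twice in the first document and in two of the three documents, so it is weighted 2 · ln(3/2) ≈ 0.81 there.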
 