TF-IDF

ESA 
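The Java class below builds a TF-IDF-weighted inverted index over an HTML corpus, the same kind of term-to-document index that ESA (Explicit Semantic Analysis) constructs over Wikipedia articles. For a term t in document d from a corpus D, each posting is weighted as

tf-idf(t, d, D) = tf(t, d) * ln(|D| / |{d' in D : t in d'}|)

where tf(t, d) is the raw count of t in d and the denominator is the number of documents containing t. For example, with 100 documents in the corpus, a term that occurs 3 times in a document and appears in 10 documents overall gets weight 3 * ln(100/10) ≈ 6.9. Note that Math.log, used in the code, is the natural logarithm.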

package com.sap.research.semantic;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import java.util.Properties;

import org.jdom.Document;
import org.jdom.JDOMException;
import org.jdom.Text;
import org.jdom.xpath.XPath;

import com.sap.research.basicio.reader.DirectoryLister;
import com.sap.research.basicio.reader.FileInputReader;
import com.sap.research.td.Parameters;
import com.sap.research.td.elements.TaggedToken;
import com.sap.research.td.html.JDOMParser;
import com.sap.research.td.html.XPathExtractor;
import com.sap.research.td.preproc.StopWordsFilter;
import com.sap.research.td.preprocessing.nlp.TaggedStemmerTokenizer;
import com.sap.research.td.preprocessing.nlp.Tokenizer;
import com.sap.research.util.Pair;

public class InvertedIndex {

	private static final java.text.SimpleDateFormat sdf = new java.text.SimpleDateFormat(
			"dd.MM.yyyy HH.mm.ss");

//	private HashMap<String, ArrayList<Pair<String, Integer>>> invertedIndices;
	private XPath titlePattern = null;
	private XPath contentPattern = null;
	private HashMap<String, ArrayList<Pair<String, Double>>> invertedIndicesDouble;

	public HashMap<String, ArrayList<Pair<String, Double>>> getInvertedIndicesDouble() {
		return invertedIndicesDouble;
	}

	public void setInvertedIndicesDouble(
			HashMap<String, ArrayList<Pair<String, Double>>> invertedIndicesDouble) {
		this.invertedIndicesDouble = invertedIndicesDouble;
	}

	public InvertedIndex(String propertyFilePath) {
		getParameters(propertyFilePath);
//		invertedIndices = new HashMap<String, ArrayList<Pair<String, Integer>>>();
		invertedIndicesDouble = new HashMap<String, ArrayList<Pair<String, Double>>>();
	}

	// Tokenize the raw text with the LinguistX tagger/stemmer, keeping only
	// tokens tagged as nouns, proper nouns, abbreviations, or adjectives.
	public String tokenizeFile(String content) {

		StringBuilder tokenizedContent = new StringBuilder();
		Tokenizer tokenizer;
		String workingDir = "nlp/linguistXPlatform/windows-2003-i686-msvc-7.1-32";
		String inxightExec = "nlp/linguistXPlatform/windows-2003-i686-msvc-7.1-32/test_platform.exe";
		String posFilter = "(Nn|Prop|Abbr|Adj).*";
		String languageOptions = "-l english";

		tokenizer = new TaggedStemmerTokenizer(workingDir, inxightExec, null,
				posFilter, languageOptions, false, null);
		tokenizer.init(content, "TestDoc");
		for (Pair<String, TaggedToken> p : tokenizer) {
			if (p.first != null) {
				tokenizedContent.append(p.first).append(' ');
			}
		}

		return tokenizedContent.toString();
	}

	// Load the XPath expressions for title and content extraction from the
	// property file.
	public void getParameters(String propertyFilePath) {
		Properties properties = new Properties();
		try {
			properties.load(new FileInputStream(propertyFilePath));
		} catch (FileNotFoundException e) {
			e.printStackTrace();
			System.err.println("Could not find property file!");
			System.exit(1);
		} catch (IOException e) {
			e.printStackTrace();
			System.err.println("Could not read property file!");
			System.exit(1);
		}

		try {
			titlePattern = XPath.newInstance(properties
					.getProperty(Parameters.TITLE_PATTERN));
		} catch (JDOMException e1) {
			e1.printStackTrace();
		}

		try {
			contentPattern = XPath.newInstance(properties
					.getProperty(Parameters.CONTENT_PATTERN));
		} catch (JDOMException e1) {
			e1.printStackTrace();
		}
	}

	public HashMap<String, Pair<String, Integer>> termCount(File file) {
		// HashMap<term, Pair<title, frequency>>
		HashMap<String, Pair<String, Integer>> indicesforDoc = new HashMap<String, Pair<String, Integer>>();

		Pair<String, String> fileContent = extractContent(file);
		String title = fileContent.first;
		String content = fileContent.second;

		HashMap<String, Integer> index = new HashMap<String, Integer>();
		ArrayList<String> plainTextStrings = StopWordsFilter
				.stringFilter(tokenizeFile(content));
		for (String string : plainTextStrings) {
			Integer frequency = index.get(string);
			index.put(string, frequency == null ? 1 : frequency + 1);
		}

		for (Map.Entry<String, Integer> entry : index.entrySet()) {
			indicesforDoc.put(entry.getKey(), new Pair<String, Integer>(title,
					entry.getValue()));
		}

		return indicesforDoc;
	}

	// Parse the HTML file into a JDOM tree and extract the title and the
	// body text via the XPath expressions loaded from the property file.
	public Pair<String, String> extractContent(File file) {

		String title = "";
		String content = "";

		try {

			String fileContent = new FileInputReader(file).getStringContent();

			XPathExtractor titleExtractor = new XPathExtractor(titlePattern);
			boolean treatUnknownTagsAsContent = false;
			JDOMParser parser = new JDOMParser(treatUnknownTagsAsContent);
			Document d = parser.HTML2JDom(fileContent, true);

			for (Text e : titleExtractor.extract(d)) {
				String normalizedTxt = e.getTextNormalize();
				if (!normalizedTxt.isEmpty()) {
					title = normalizedTxt.trim();
				}
			}

			XPathExtractor contentExtractor = new XPathExtractor(contentPattern);
			for (Text e : contentExtractor.extract(d)) {
				String normalizedTxt = e.getTextNormalize();
				if (!normalizedTxt.isEmpty()) {
					content += " " + normalizedTxt.trim();
				}
			}

		} catch (Exception e) {
			System.err.println(e);
		}

		return new Pair<String, String>(title, content);
	}

	public void output(
			HashMap<String, ArrayList<Pair<String, Integer>>> invertedIndices) {
		ArrayList<Pair<String, Integer>> pairList = null;
		for (Map.Entry<String, ArrayList<Pair<String, Integer>>> entry : invertedIndices
				.entrySet()) {
			pairList = entry.getValue();
			String pairStr = "";
			for (Pair<String, Integer> pair : pairList) {
				pairStr += "<" + pair.first + ", " + pair.second + ">, ";
			}
			System.out.println(entry.getKey() + " = " + pairStr);
		}
	}

	public void outputDouble() {
		ArrayList<Pair<String, Double>> pairList = null;
		for (Map.Entry<String, ArrayList<Pair<String, Double>>> entry : invertedIndicesDouble
				.entrySet()) {
			pairList = entry.getValue();
			String pairStr = "";
			for (Pair<String, Double> pair : pairList) {
				pairStr += "<" + pair.first + ", " + pair.second + ">, ";
			}
			System.out.println(entry.getKey() + " = " + pairStr);
		}
	}

	// Build the raw inverted index over all files with the given suffix:
	// term -> one <title, tf> posting per document containing the term.
	// The counts are then converted to tf-idf weights; returns |D|.
	public int readFile(String directory, String suffixString, Boolean recursive) {
		// HashMap<term, ArrayList<Pair<title, frequency>>>
		HashMap<String, ArrayList<Pair<String, Integer>>> invertedIndices = new HashMap<String, ArrayList<Pair<String, Integer>>>();

		ArrayList<File> files = new ArrayList<File>();
		try {
			files = new DirectoryLister(directory, suffixString, recursive)
					.getContainedFiles();
		} catch (IOException e) {
			e.printStackTrace();
		}

		int numOfFiles = files.size();

		// idf(t, D) = log(|D| / |{d in D : t in d}|)
		HashMap<String, Pair<String, Integer>> tf = null;
		for (File file : files) {

			tf = termCount(file);

			for (Map.Entry<String, Pair<String, Integer>> entry : tf.entrySet()) {
				ArrayList<Pair<String, Integer>> postings = invertedIndices
						.get(entry.getKey());
				if (postings == null) {
					postings = new ArrayList<Pair<String, Integer>>();
					invertedIndices.put(entry.getKey(), postings);
				}
				postings.add(entry.getValue());
			}
		}
		}
		// output(invertedIndices);
		tfidf(numOfFiles, invertedIndices);
		
		return numOfFiles;
	}

	// tf-idf(t, d) = tf(t, d) * ln(numOfFiles / df(t)); the document
	// frequency df(t) equals pairList.size(), since the postings list holds
	// exactly one <title, tf> pair per document containing the term.
	public void tfidf(int numOfFiles, HashMap<String, ArrayList<Pair<String, Integer>>> invertedIndices) {
		for (Map.Entry<String, ArrayList<Pair<String, Integer>>> entry : invertedIndices
				.entrySet()) {
			ArrayList<Pair<String, Integer>> pairList = entry.getValue();
			ArrayList<Pair<String, Double>> pairListDouble = new ArrayList<Pair<String, Double>>();
			for (Pair<String, Integer> pair : pairList) {
				double tfidf = pair.second
						* Math.log((double) numOfFiles
								/ (double) pairList.size());
				pairListDouble.add(new Pair<String, Double>(pair.first, tfidf));
			}
			invertedIndicesDouble.put(entry.getKey(), pairListDouble);
		}
//		 outputDouble();
	}

	// For efficiency, tokenize each file once and cache the tokenized terms
	// in a local directory.
	public void tokenizer(File inputFile) {

		String outputFileDir = "test/output/terms/caches";
		Pair<String, String> fileContent = extractContent(inputFile);

		String terms = tokenizeFile(fileContent.second);

		try {
			// wxc: mirror the directory structure of the input corpus
			String[] strings = inputFile.toURI().toString().split("/");
			outputFileDir = outputFileDir + "/" + strings[strings.length - 2];

			// if the cache directory does not exist yet, create it
			if (!new File(outputFileDir).isDirectory()) {
				System.out.print("...creating \"" + outputFileDir
						+ "\" cache directory...");
				new File(outputFileDir).mkdirs();
				System.out.println("successful.");
			}

			String outputFileName = inputFile.getName();
			BufferedWriter out = new BufferedWriter(new FileWriter(
					outputFileDir + "/" + outputFileName));
			// cache the tokenized terms for later runs
			out.write(terms);
			out.close();
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	public void addIndices(String dirPath, String fileExtension,
			boolean recursive) {
		System.out.println(sdf.format(new Date())
				+ ": Inverted-indexing started.");
		readFile(dirPath, fileExtension, recursive);
		System.out.println(sdf.format(new Date())
				+ ": Inverted-indexing ended.");
	}

	public static void main(String[] args) {
		String propertyFilePath = "settings/html.properties";
		InvertedIndex index = new InvertedIndex(propertyFilePath);
		index.addIndices("test/input/terms", "html", true);
	}

}
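
The listing above depends on SAP-internal libraries (Pair, FileInputReader, DirectoryLister, the LinguistX tokenizer, and the JDOM-based HTML extractors) and will not compile outside that environment. As a self-contained illustration of the same indexing scheme, here is a minimal sketch, assuming plain-text input files and simple punctuation-based tokenization instead of the POS-filtering stemmer, that builds the same term -> <document, tf-idf> postings using only the JDK (11+):

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class MiniTfIdfIndex {

	// term -> list of <document name, tf-idf weight> postings
	private final Map<String, List<Map.Entry<String, Double>>> index = new HashMap<>();

	// Raw term frequencies for one document (hypothetical whitespace and
	// punctuation tokenization; the original uses a POS-filtering stemmer).
	private static Map<String, Integer> termCount(String content) {
		Map<String, Integer> tf = new HashMap<>();
		for (String token : content.toLowerCase().split("\\W+")) {
			if (!token.isEmpty()) {
				tf.merge(token, 1, Integer::sum);
			}
		}
		return tf;
	}

	// Collect raw <document, tf> postings per term, then weight each posting
	// with tf * ln(|D| / df), as tfidf() does above.
	public void build(List<Path> files) throws IOException {
		Map<String, List<Map.Entry<String, Integer>>> raw = new HashMap<>();
		for (Path file : files) {
			for (Map.Entry<String, Integer> e : termCount(Files.readString(file)).entrySet()) {
				raw.computeIfAbsent(e.getKey(), k -> new ArrayList<>())
						.add(Map.entry(file.getFileName().toString(), e.getValue()));
			}
		}
		for (Map.Entry<String, List<Map.Entry<String, Integer>>> e : raw.entrySet()) {
			// df = postings size: one entry per document containing the term
			double idf = Math.log((double) files.size() / e.getValue().size());
			List<Map.Entry<String, Double>> weighted = new ArrayList<>();
			for (Map.Entry<String, Integer> p : e.getValue()) {
				weighted.add(Map.entry(p.getKey(), p.getValue() * idf));
			}
			index.put(e.getKey(), weighted);
		}
	}

	public Map<String, List<Map.Entry<String, Double>>> getIndex() {
		return index;
	}
}

Collecting the corpus with, for example, Files.walk and calling build() yields the same kind of postings that outputDouble() prints above; like the original, the sketch uses raw term counts and the natural logarithm.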
 