LongSentenceFilter Joshua SMT [2] -

sanjewel

浏览: 7244 次
性别:
来自: 重庆

最近访客更多访客>>

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

LongSentenceFilter Joshua SMT [2]

博客分类：

SMT Joshua

Joshua SMT decoder

Note that the first version of LongSentenceFilter was incomplete: even after filtering, the French file could still contain sentences of more than 100 words. This version fixes that problem. Note also that this implementation is not optimal; a better version will follow in the next post.

package util;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.Writer;
import java.util.ArrayList;

/**
 * Filters a sentence-aligned parallel corpus (one sentence per line, line i of
 * the English file paired with line i of the French file) by dropping every
 * sentence pair in which either side has too many words.
 */
public class LongSentenceFilter {

	/** Default maximum number of words a sentence may contain and still be kept. */
	private static final int MAX_WORDS = 100;

	/**
	 * Filters both files with the default limit of 100 words per sentence.
	 *
	 * @param enFile  English input file, one sentence per line
	 * @param frFile  French input file, line-aligned with {@code enFile}
	 * @param oenFile destination for the filtered English sentences
	 * @param ofrFile destination for the filtered French sentences
	 */
	public void filter(File enFile, File frFile, File oenFile, File ofrFile) {
		filter(enFile, frFile, oenFile, ofrFile, MAX_WORDS);
	}

	/**
	 * Reads both files in lock-step and keeps a sentence pair only when neither
	 * side has more than {@code maxWords} whitespace-separated words.
	 *
	 * <p>If the two files differ in length, the unpaired trailing lines of the
	 * longer file are judged on their own length only. If reading fails, the
	 * stack trace is printed and no output file is written, so that existing
	 * output is not replaced by empty or non-parallel content.
	 *
	 * @param enFile   English input file, one sentence per line
	 * @param frFile   French input file, line-aligned with {@code enFile}
	 * @param oenFile  destination for the filtered English sentences
	 * @param ofrFile  destination for the filtered French sentences
	 * @param maxWords maximum number of words a kept sentence may contain
	 */
	public void filter(File enFile, File frFile, File oenFile, File ofrFile,
			int maxWords) {

		// output buffers; only used by this thread, so no StringBuffer needed
		StringBuilder enContent = new StringBuilder();
		StringBuilder frContent = new StringBuilder();
		int lineCount = 0;
		int newLineCount = 0;

		BufferedReader enReader = null;
		BufferedReader frReader = null;
		try {
			enReader = new BufferedReader(new FileReader(enFile));
			frReader = new BufferedReader(new FileReader(frFile));

			String enLine = enReader.readLine();
			String frLine = frReader.readLine();
			while (enLine != null || frLine != null) {
				// a missing side (file already exhausted) never vetoes the other
				boolean keep = !isTooLong(enLine, maxWords)
						&& !isTooLong(frLine, maxWords);

				if (enLine != null) {
					lineCount++;
					if (keep) {
						newLineCount++;
						enContent.append(enLine).append('\n');
					}
					enLine = enReader.readLine();
				}
				if (frLine != null) {
					if (keep) {
						frContent.append(frLine).append('\n');
					}
					frLine = frReader.readLine();
				}
			}
		} catch (IOException e) {
			e.printStackTrace();
			// do not clobber the output files with partial results
			return;
		} finally {
			closeQuietly(enReader);
			closeQuietly(frReader);
		}

		System.out.println("Number of sentences in original document: "
				+ lineCount);
		System.out.println("Number of sentences after filteration: "
				+ newLineCount);

		// write the collected sentences to the output files
		try {
			writeFile(oenFile, enContent);
			writeFile(ofrFile, frContent);
		} catch (IOException e) {
			e.printStackTrace();
		}

	}

	/** Returns true when {@code line} is present and has more than {@code maxWords} words. */
	private static boolean isTooLong(String line, int maxWords) {
		return line != null && countWords(line) > maxWords;
	}

	/**
	 * Counts maximal runs of non-whitespace characters, so repeated, leading or
	 * trailing blanks do not inflate the word count.
	 */
	private static int countWords(String line) {
		int count = 0;
		boolean inWord = false;
		for (int i = 0; i < line.length(); i++) {
			if (Character.isWhitespace(line.charAt(i))) {
				inWord = false;
			} else if (!inWord) {
				inWord = true;
				count++;
			}
		}
		return count;
	}

	/** Writes {@code content} to {@code file}, closing the writer even on failure. */
	private static void writeFile(File file, CharSequence content)
			throws IOException {
		Writer output = new BufferedWriter(new FileWriter(file));
		try {
			output.write(content.toString());
		} finally {
			output.close();
		}
	}

	/** Closes a reader if it was opened; a failure to close is not actionable here. */
	private static void closeQuietly(BufferedReader reader) {
		if (reader == null) {
			return;
		}
		try {
			reader.close();
		} catch (IOException ignored) {
			// all data has already been read (or reading already failed)
		}
	}

	public static void main(String[] args) {

		LongSentenceFilter filter = new LongSentenceFilter();
		// English input, output
		File enFile = new File("test/input/hansard.5.en.tok.lc");
		File oenFile = new File("test/output/hansard.5.en.tok.lc.filtered");
		// French input, output
		File frFile = new File("test/input/hansard.5.fr.tok.lc");
		File ofrFile = new File("test/output/hansard.5.fr.tok.lc.filtered");

		// Note that enFile and frFile must be line-aligned translations of
		// each other!
		filter.filter(enFile, frFile, oenFile, ofrFile);
	}

}

分享到：