Note that the first version of LongSentenceFilter was not complete: even after filtering, the French file could still contain sentences of more than 100 words. This version tackles that problem. Note also that this version is not optimal from an implementation standpoint; an improved version will appear in the next post.
package util;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.Writer;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Set;
public class LongSentenceFilter {
public void filter(File enFile, File frFile, File oenFile, File ofrFile) {
/*go through both English and French files,
* remove sentences of more than 100 words in one file
* and remove corresponding sentences (with same sentence number)
* in the other file
* */
BufferedReader enBufferedReader;
BufferedReader frBufferedReader;
String line = null;
int lineCount = 0;
// record line numbers of those sentences that consist more than 100
// words either in English file or in French file
ArrayList<Integer> longSentenceIndices = new ArrayList<Integer>();
// output stringbuffer
StringBuffer enContent = new StringBuffer();
StringBuffer frContent = new StringBuffer();
try {
// go through English file, find those sentences of more than
// 100 words and keep record of those line numbers in
// _longSentenceIndices
enBufferedReader = new BufferedReader(new FileReader(enFile));
while ((line = enBufferedReader.readLine()) != null) {
String[] words = line.split(" ");
lineCount++;
if (words.length > 100)
longSentenceIndices.add(lineCount);
}
System.out.println("Number of sentences in original document: "
+ lineCount);
// go through French file, keep those sentences, of words less
// or equal to 100 and whose line numbers are not in
// _longSentenceIndices
// at the same time, keep line numbers of sentences of more than
// 100 words in _longSentenceIndices
lineCount = 0;
frBufferedReader = new BufferedReader(new FileReader(frFile));
while ((line = frBufferedReader.readLine()) != null) {
String[] words = line.split(" ");
lineCount++;
if (words.length <= 100
&& !longSentenceIndices.contains(lineCount)) {
frContent.append(line);
frContent.append('\n');
} else {
if (!longSentenceIndices.contains(lineCount))
longSentenceIndices.add(lineCount);
}
}
// go again through English file, keep those sentences, whose line
// number are not in _longSentenceIndices
lineCount = 0;
int newLineCount = 0;
enBufferedReader = new BufferedReader(new FileReader(enFile));
while ((line = enBufferedReader.readLine()) != null) {
lineCount++;
if (!longSentenceIndices.contains(lineCount)) {
newLineCount++;
enContent.append(line);
enContent.append('\n');
}
}
System.out.println("Number of sentences after filteration: "
+ newLineCount);
} catch (FileNotFoundException e2) {
e2.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
// write stringbuffer to output files
Writer output;
try {
output = new BufferedWriter(new FileWriter(oenFile));
output.write(enContent.toString());
output.close();
output = new BufferedWriter(new FileWriter(ofrFile));
output.write(frContent.toString());
output.close();
} catch (IOException e) {
e.printStackTrace();
}
}
public static void main(String[] args) {
LongSentenceFilter filter = new LongSentenceFilter();
// English input, output
File enFile = new File("test/input/hansard.5.en.tok.lc");
File oenFile = new File("test/output/hansard.5.en.tok.lc.filtered");
// French input, output
File frFile = new File("test/input/hansard.5.fr.tok.lc");
File ofrFile = new File("test/output/hansard.5.fr.tok.lc.filtered");
// Note that _enFile and _frFile should be translation of each other!
filter.filter(enFile, frFile, oenFile, ofrFile);
}
}