LongSentenceFilter Joshua SMT [2]

Note that the first version of LongSentenceFilter was not complete: even after filtering, the French file could still contain sentences of more than 100 words. This version fixes that problem. Note also that this version is not optimal from an implementation point of view; a better version will follow in the next post.

package util;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.Writer;
import java.util.ArrayList;

/**
 * Filters over-long sentence pairs out of a line-aligned parallel corpus.
 *
 * <p>Line <i>i</i> of the English file and line <i>i</i> of the French file
 * are treated as one sentence pair. A pair is dropped from both outputs as
 * soon as either side has more words than the allowed maximum, so the two
 * output files stay aligned line by line.
 */
public class LongSentenceFilter {

	/** Maximum number of words per sentence used by the four-argument {@code filter}. */
	public static final int DEFAULT_MAX_WORDS = 100;

	/**
	 * Filters both files with the default limit of {@link #DEFAULT_MAX_WORDS}
	 * words per sentence.
	 *
	 * @param enFile  English input, one sentence per line
	 * @param frFile  French input, one sentence per line, aligned with {@code enFile}
	 * @param oenFile file that receives the kept English sentences
	 * @param ofrFile file that receives the kept French sentences
	 */
	public void filter(File enFile, File frFile, File oenFile, File ofrFile) {
		filter(enFile, frFile, oenFile, ofrFile, DEFAULT_MAX_WORDS);
	}

	/**
	 * Reads both inputs in lock-step and writes every sentence pair in which
	 * neither side has more than {@code maxWords} words.
	 *
	 * <p>If one file has more lines than the other, the surplus lines have no
	 * partner; each of them is kept or dropped on its own length alone.
	 *
	 * <p>I/O errors are reported on standard error and not propagated. When
	 * reading fails, the output files are left untouched instead of being
	 * overwritten with partial, misaligned content.
	 *
	 * @param enFile   English input, one sentence per line
	 * @param frFile   French input, one sentence per line, aligned with {@code enFile}
	 * @param oenFile  file that receives the kept English sentences
	 * @param ofrFile  file that receives the kept French sentences
	 * @param maxWords largest number of words a kept sentence may have
	 * @throws IllegalArgumentException if {@code maxWords} is negative
	 */
	public void filter(File enFile, File frFile, File oenFile, File ofrFile,
			int maxWords) {
		if (maxWords < 0) {
			throw new IllegalArgumentException(
					"maxWords must not be negative: " + maxWords);
		}

		// The kept lines are buffered and written only after both inputs
		// have been read completely, so an output file may safely be the
		// same file as an input.
		StringBuilder enContent = new StringBuilder();
		StringBuilder frContent = new StringBuilder();
		int originalCount = 0;
		int keptCount = 0;

		BufferedReader enReader = null;
		BufferedReader frReader = null;
		try {
			enReader = new BufferedReader(new FileReader(enFile));
			frReader = new BufferedReader(new FileReader(frFile));

			String enLine = enReader.readLine();
			String frLine = frReader.readLine();
			while (enLine != null || frLine != null) {
				// Decide for the pair before advancing either reader; a
				// missing partner (null) never makes the pair too long.
				boolean tooLong =
						(enLine != null && countWords(enLine) > maxWords)
						|| (frLine != null && countWords(frLine) > maxWords);

				if (enLine != null) {
					originalCount++;
					if (!tooLong) {
						keptCount++;
						enContent.append(enLine).append('\n');
					}
					enLine = enReader.readLine();
				}
				if (frLine != null) {
					if (!tooLong) {
						frContent.append(frLine).append('\n');
					}
					frLine = frReader.readLine();
				}
			}
		} catch (IOException e) {
			// Covers FileNotFoundException as well. Do not go on to write
			// the outputs: what has been buffered so far is incomplete.
			e.printStackTrace();
			return;
		} finally {
			closeQuietly(enReader);
			closeQuietly(frReader);
		}

		System.out.println("Number of sentences in original document: "
				+ originalCount);
		System.out.println("Number of sentences after filteration: "
				+ keptCount);

		try {
			writeFile(oenFile, enContent);
			writeFile(ofrFile, frContent);
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	/**
	 * Counts the whitespace-separated tokens in {@code line}. Runs of
	 * whitespace count as a single separator, so repeated spaces, tabs and
	 * leading or trailing blanks do not inflate the count.
	 */
	private static int countWords(String line) {
		int count = 0;
		boolean inWord = false;
		for (int i = 0; i < line.length(); i++) {
			if (Character.isWhitespace(line.charAt(i))) {
				inWord = false;
			} else if (!inWord) {
				inWord = true;
				count++;
			}
		}
		return count;
	}

	/** Writes {@code content} to {@code file}, always closing the writer. */
	private static void writeFile(File file, CharSequence content)
			throws IOException {
		Writer output = new BufferedWriter(new FileWriter(file));
		try {
			output.write(content.toString());
		} finally {
			output.close();
		}
	}

	/** Closes {@code reader} if it was opened; a failure to close is ignored. */
	private static void closeQuietly(BufferedReader reader) {
		if (reader == null) {
			return;
		}
		try {
			reader.close();
		} catch (IOException ignored) {
			// Nothing useful can be done about a failed close of an input.
		}
	}

	/**
	 * Runs the filter on the Hansard sample files under {@code test/input}
	 * and writes the results to {@code test/output}.
	 */
	public static void main(String[] args) {

		LongSentenceFilter filter = new LongSentenceFilter();
		// English input, output
		File enFile = new File("test/input/hansard.5.en.tok.lc");
		File oenFile = new File("test/output/hansard.5.en.tok.lc.filtered");
		// French input, output
		File frFile = new File("test/input/hansard.5.fr.tok.lc");
		File ofrFile = new File("test/output/hansard.5.fr.tok.lc.filtered");

		// Note that enFile and frFile must be translations of each other,
		// aligned line by line.
		filter.filter(enFile, frFile, oenFile, ofrFile);
	}

}
 

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值