Note that the first version of LongSentenceFilter was not complete: even after filtering, the French file could still contain sentences of more than 100 words. This version tackles that problem. Note also that this version is not optimal from an implementation standpoint; an improved version will appear in the next post.
package util;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.Writer;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Set;
public class LongSentenceFilter {
public void filter(File enFile, File frFile, File oenFile, File ofrFile) {
/*go through both English and French files,
* remove sentences of more than 100 words in one file
* and remove corresponding sentences (with same sentence number)
* in the other file
* */
BufferedReader enBufferedReader;
BufferedReader frBufferedReader;
String line = null;
int lineCount = 0;
// record line numbers of those sentences that consist more than 100
// words either in English file or in French file
ArrayList<Integer> longSentenceIndices = new ArrayList<Integer>();
// output stringbuffer
StringBuffer enContent = new StringBuffer();
StringBuffer frContent = new StringBuffer();
try {
// go through English file, find those sentences of more than
// 100 words and keep record of those line numbers in
// _longSentenceIndices
enBufferedReader = new BufferedReader(new FileReader(enFile));
while ((line = enBufferedReader.readLine()) != null) {
String[] words = line.split(" ");
lineCount++;
if (words.length > 100)
longSentenceIndices.add(lineCount);
}
System.out.println("Number of sentences in original document: "
+ lineCount);
// go through French file, keep those sentences, of words less
// or equal to 100 and whose line numbers are not in
// _longSentenceIndices
// at the same time, keep line numbers of sentences of more than
// 100 words in _longSentenceIndices
lineCount = 0;
frBufferedReader = new BufferedReader(new FileReader(frFile));
while ((line = frBufferedReader.readLine()) != null) {
String[] words = line.split(" ");
lineCount++;
if (words.length <= 100
&& !longSentenceIndices.contains(lineCount)) {
frContent.append(line);
frContent.append('\n');
} else {
if (!longSentenceIndices.contains(lineCount))
longSentenceIndices.add(lineCount);
}
}
// go again through English file, keep those sentences, whose line
// number are not in _longSentenceIndices
lineCount = 0;
int newLineCount = 0;
enBufferedReader = new BufferedReader(new FileReader(enFile));
while ((line = enBufferedReader.readLine()) != null) {
lineCount++;
if (!longSentenceIndices.contains(lineCount)) {
newLineCount++;
enContent.append(line);
enContent.append('\n');
}
}
System.out.println("Number of sentences after filteration: "
+ newLineCount);
} catch (FileNotFoundException e2) {
e2.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
// write stringbuffer to output files
Writer output;
try {
output = new BufferedWriter(new FileWriter(oenFile));
output.write(enContent.toString());
output.close();
output = new BufferedWriter(new FileWriter(ofrFile));
output.write(frContent.toString());
output.close();
} catch (IOException e) {
e.printStackTrace();
}
}
public static void main(String[] args) {
LongSentenceFilter filter = new LongSentenceFilter();
// English input, output
File enFile = new File("test/input/hansard.5.en.tok.lc");
File oenFile = new File("test/output/hansard.5.en.tok.lc.filtered");
// French input, output
File frFile = new File("test/input/hansard.5.fr.tok.lc");
File ofrFile = new File("test/output/hansard.5.fr.tok.lc.filtered");
// Note that _enFile and _frFile should be translation of each other!
filter.filter(enFile, frFile, oenFile, ofrFile);
}
}