import java.io.BufferedReader;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.StringTokenizer;
/**
 * Sliding-window reader over a pre-segmented text file.
 *
 * <p>File format: text that has already been split into words, with words
 * separated by single spaces and one paragraph per line. The class is meant
 * for files whose lines are short (e.g. one paragraph per line).
 *
 * <p>Each call to {@link #getNextWords(int)} returns the next window of
 * {@code count} consecutive words, advancing by one word per call. A window
 * never spans two paragraphs; paragraphs shorter than {@code count} words are
 * skipped.
 *
 * <p>Two modes are selected through {@code currentMode}:
 * <ul>
 *   <li>{@code normalMode}: stop when the end of the file is reached;</li>
 *   <li>{@code againMode}: on reaching the end of the file, start over from
 *       the beginning.</li>
 * </ul>
 *
 * <p>Rewinding is done by reopening the file rather than with
 * {@code mark}/{@code reset}, because a mark whose read-ahead limit is the
 * file length forces the reader to buffer the whole file and the
 * {@code int} limit overflows for files larger than 2 GB.
 */
public class WordReader implements Closeable
{
    static final int normalMode = 0;
    static final int againMode = 1;

    /** Either {@code normalMode} or {@code againMode}. */
    int currentMode = normalMode;
    BufferedReader br = null;
    /** Words of the paragraph currently being windowed. */
    ArrayList<String> paraWords = null;
    StringTokenizer tokenizer;
    /** Number of paragraphs (lines) read so far; keeps growing across rewinds. */
    int currentPara = 0;
    /** Index in {@code paraWords} of the first word of the next window. */
    int paraPos = 0;

    /** Source file, kept so the stream can be reopened when rewinding. */
    private final File file;
    /** How many times the file has been rewound to its beginning. */
    private int rewindCount = 0;

    /**
     * Opens the given file for reading as UTF-8.
     *
     * @param fileName path of the segmented text file
     * @throws IOException if the file cannot be opened
     */
    public WordReader(String fileName) throws IOException
    {
        file = new File(fileName);
        br = open();
        paraWords = new ArrayList<String>();
    }

    /** Opens a fresh UTF-8 reader positioned at the start of the file. */
    private BufferedReader open() throws IOException
    {
        return new BufferedReader(new InputStreamReader(new FileInputStream(file), "utf-8"));
    }

    /**
     * Loads the next line into {@code paraWords} and resets {@code paraPos}.
     *
     * @return {@code false} when no further paragraph is available: end of
     *         file in {@code normalMode}, or an empty file in {@code againMode}
     */
    private boolean readPara() throws IOException
    {
        String line = br.readLine();
        if (line == null) // reached the end of the file
        {
            if (currentMode == normalMode)
            {
                return false;
            }
            // Start over from the beginning by reopening the file.
            br.close();
            br = open();
            rewindCount++;
            line = br.readLine();
            if (line == null)
            {
                return false; // empty file: nothing to replay
            }
        }
        paraWords.clear();
        tokenizer = new StringTokenizer(line, " ");
        while (tokenizer.hasMoreTokens())
        {
            paraWords.add(tokenizer.nextToken());
        }
        currentPara++;
        paraPos = 0;
        return true;
    }

    /**
     * Returns the next window of {@code count} consecutive words and advances
     * the window start by one word.
     *
     * @param count number of words per window; must be positive
     * @return the next {@code count} words, or {@code null} when the input is
     *         exhausted ({@code normalMode}) or when a complete pass over the
     *         file finds no paragraph with at least {@code count} words
     *         ({@code againMode})
     * @throws IllegalArgumentException if {@code count} is not positive
     * @throws IOException if reading the file fails
     */
    public String[] getNextWords(int count) throws IOException
    {
        if (count <= 0)
        {
            throw new IllegalArgumentException("count must be positive: " + count);
        }
        final int rewindsAtEntry = rewindCount;
        // A window fits while paraPos + count <= size; written as a
        // subtraction so a huge count cannot overflow.
        while (count > paraWords.size() - paraPos)
        {
            if (!readPara())
            {
                return null;
            }
            // Two rewinds inside one call mean a whole pass over the file
            // produced no usable paragraph; stop instead of looping forever.
            if (rewindCount - rewindsAtEntry > 1)
            {
                return null;
            }
        }
        String[] words = paraWords.subList(paraPos, paraPos + count).toArray(new String[count]);
        paraPos++;
        return words;
    }

    /**
     * Closes the underlying file reader.
     *
     * @throws IOException if closing the reader fails
     */
    @Override
    public void close() throws IOException
    {
        br.close();
    }

    /**
     * Demo: prints the first word of every 5-word window, then the number of
     * paragraphs read. Set {@code currentMode} to {@code againMode} on the
     * reader to loop over the file instead of stopping at its end.
     *
     * @param args optional; {@code args[0]} overrides the default input path
     * @throws IOException if the file cannot be opened or read
     */
    public static void main(String[] args) throws IOException
    {
        String path = args.length > 0
                ? args[0]
                : "/home/linger/sources/ParaModel/electronic_seg.txt";
        WordReader wordReader = new WordReader(path);
        try
        {
            String[] words;
            while ((words = wordReader.getNextWords(5)) != null)
            {
                System.out.println(words[0]);
            }
            System.out.println(wordReader.currentPara);
        }
        finally
        {
            wordReader.close();
        }
    }
}
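// 发现 BufferedReader 也是可以移动流位置的,利用 mark 和 reset。
// 但是如果文件太大,调用 mark 时容易出错。原因:mark(readAheadLimit) 要求
// BufferedReader 最多缓存 readAheadLimit 个字符,以文件长度作为上限相当于把整个
// 文件读进内存;而且文件超过 2GB 时 (int)file.length()+1 会溢出,得到负数
// (抛出 IllegalArgumentException)或过小的值(reset 时报 "Mark invalid")。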