import java.io.ByteArrayOutputStream;
import java.io.Closeable;
import java.io.EOFException;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.util.ArrayList;
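/*
 * File format: pre-tokenized text whose words are separated by whitespace
 * (spaces, tabs, line breaks).
 * Reading stops when the end of the file is reached.
 * Well suited to files that contain one extremely long line, because the
 * buffer holds a batch of words (less than a line) rather than a whole line.
 * Implementation: each step reads a batch of words as one "sentence",
 * scanning the file byte by byte and using whitespace to find where each
 * word starts and ends.
 *
 */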
/**
 * Streams sliding windows of words from a whitespace-separated UTF-8 text file.
 *
 * <p>Words are delimited by space, tab, carriage return and line feed. Each call
 * to {@link #getNextWords(int)} returns the next window of consecutive words and
 * advances the window start by exactly one word. Words are loaded in batches of
 * {@link #senSize}, so memory use stays bounded even when the whole file is a
 * single line. Unconsumed words are carried over between batches, so no window
 * is lost at a batch boundary.
 *
 * <p>The file is read through an internal byte buffer, so the file pointer of
 * {@link #raf} may be ahead of the last word handed out.
 *
 * <p>Instances hold an open file handle; call {@link #close()} when done.
 */
public class WordReader implements Closeable
{
    /** Number of bytes requested from the file per low-level read. */
    private static final int IO_BUFFER_SIZE = 8192;

    /** Underlying file, opened read-only by the constructor. */
    RandomAccessFile raf = null;
    /** Buffered words; the current window starts at index {@link #senPos}. */
    ArrayList<String> sentence = null;
    /** Maximum number of words loaded per refill of {@link #sentence}. */
    int senSize = 1000;
    /** Index in {@link #sentence} of the first word of the next window. */
    int senPos = 0;

    /** Raw bytes most recently read from the file. */
    private final byte[] ioBuffer = new byte[IO_BUFFER_SIZE];
    /** Index of the next unread byte in {@link #ioBuffer}. */
    private int ioPos = 0;
    /** Number of valid bytes currently held in {@link #ioBuffer}. */
    private int ioLimit = 0;
    /** Set once the file has reported end of input. */
    private boolean eof = false;
    /** Scratch space for the bytes of the word being assembled. */
    private final ByteArrayOutputStream wordBytes = new ByteArrayOutputStream();

    /**
     * Opens the given file for reading.
     *
     * @param fileName path of the word file
     * @throws IOException if the file does not exist or cannot be opened
     */
    public WordReader(String fileName) throws IOException
    {
        File file = new File(fileName);
        raf = new RandomAccessFile(file, "r");
        sentence = new ArrayList<String>();
    }

    /**
     * Returns the next window of {@code count} consecutive words and moves the
     * window start forward by one word.
     *
     * @param count number of words per window; must be at least 1
     * @return a new array of {@code count} words, or {@code null} when fewer
     *         than {@code count} words remain in the file
     * @throws IllegalArgumentException if {@code count} is less than 1
     * @throws IOException if reading the file fails
     */
    public String[] getNextWords(int count) throws IOException
    {
        if (count < 1)
        {
            throw new IllegalArgumentException("count must be at least 1: " + count);
        }
        // Keep loading batches until a full window is buffered or the input ends.
        while (sentence.size() - senPos < count)
        {
            if (!readSentence())
            {
                return null;
            }
        }
        String[] words = new String[count];
        for (int i = 0; i < count; i++)
        {
            words[i] = sentence.get(senPos + i);
        }
        senPos++;
        return words;
    }

    /**
     * Releases the underlying file handle.
     *
     * @throws IOException if closing the file fails
     */
    public void close() throws IOException
    {
        raf.close();
    }

    /**
     * Refills the word buffer: drops the words before {@link #senPos}, keeps the
     * unconsumed tail, and appends up to {@link #senSize} further words.
     *
     * @return {@code true} if at least one word was appended, {@code false} if
     *         the end of the file had already been reached
     * @throws IOException if reading the file fails
     */
    private boolean readSentence() throws IOException
    {
        if (senPos > 0)
        {
            sentence.subList(0, senPos).clear();
            senPos = 0;
        }
        // Guard against a non-positive batch size so a refill always makes progress.
        int batch = Math.max(1, senSize);
        boolean appended = false;
        for (int i = 0; i < batch; i++)
        {
            String word = readWord();
            if (word == null)
            {
                break;
            }
            sentence.add(word);
            appended = true;
        }
        return appended;
    }

    /**
     * Reads the next word, skipping any separators in front of it.
     *
     * @return the decoded word, or {@code null} if only separators (or nothing)
     *         remain before the end of the file
     * @throws IOException if reading the file fails
     */
    private String readWord() throws IOException
    {
        int b = nextByte();
        while (b != -1 && isSeparator(b))
        {
            b = nextByte();
        }
        if (b == -1)
        {
            return null;
        }
        wordBytes.reset();
        do
        {
            wordBytes.write(b);
            b = nextByte();
        }
        while (b != -1 && !isSeparator(b));
        // The separator bytes are all ASCII, so they never occur inside a
        // multi-byte UTF-8 sequence; splitting before decoding is safe.
        return wordBytes.toString("utf-8");
    }

    /**
     * Returns the next byte of the file as a value in 0..255, or -1 at end of
     * file, refilling the internal byte buffer when it runs empty.
     */
    private int nextByte() throws IOException
    {
        while (ioPos >= ioLimit)
        {
            if (eof)
            {
                return -1;
            }
            int n = raf.read(ioBuffer, 0, ioBuffer.length);
            if (n < 0)
            {
                eof = true;
                return -1;
            }
            ioPos = 0;
            ioLimit = n;
        }
        return ioBuffer[ioPos++] & 0xFF;
    }

    /** Tells whether the byte is one of the word separators: space, LF, CR or tab. */
    private static boolean isSeparator(int b)
    {
        return b == ' ' || b == '\n' || b == '\r' || b == '\t';
    }

    /**
     * Smoke test: opens a word file and loads its first batch of words.
     *
     * @param args optional; {@code args[0]} overrides the default file path
     * @throws IOException if the file cannot be opened or read
     */
    public static void main(String[] args) throws IOException
    {
        String path = args.length > 0 ? args[0] : "/home/linger/sources/resultbig.txt";
        WordReader wr = new WordReader(path);
        try
        {
            wr.readSentence();
        }
        finally
        {
            wr.close();
        }
    }
}
// Author: linger
// Original article: http://blog.csdn.net/lingerlanlan/article/details/38337483