简单的分词程序

本程序没有复杂的逻辑,只是简单地分解英语单词、分割标点,并把汉语拆成单个字,仅仅为了展示分词过程是怎么实现的。
有难度的分词算法有待进一步研究。
package chapter2;
import java.io.*;
import java.net.*;


/**
 * Minimal tokenizer demo: strips HTML tags from a file and splits the remaining
 * text into space-separated tokens.
 *
 * <p>Runs of ASCII letters/digits form one token, a small set of punctuation
 * marks is replaced by a space, and every other character (for example a CJK
 * ideograph) becomes a token of its own.
 */
public class WebParserFilter {

    /** Default input file read by {@link #ParserFilter()}. */
    private static final String SRC_FILE_PATH = "D:\\HtmlDownload\\htmlsrc.html";

    /** Default output file written by {@link #ParserFilter()}. */
    private static final String DST_FILE_PATH = "D:\\HtmlDownload\\puresrc.txt";

    /** HTML entity treated like whitespace. */
    private static final String NBSP_ENTITY = "&nbsp;";

    /**
     * Program entry point: tokenizes the default input file into the default
     * output file and exits with status 1 if an I/O error occurs.
     *
     * @param args ignored
     * @throws IOException declared for compatibility; I/O errors are handled here
     */
    public static void main(String[] args) throws IOException {
        try {
            ParserFilter();
        } catch (IOException e) {
            System.err.println("下载失败,请检查地址是否正确!");
            System.exit(1);
        }
    }

    /**
     * Tokenizes the default input file, prints the result to standard output and
     * writes it to the default output file, using the platform default charset.
     *
     * @throws IOException if the input cannot be read or the output cannot be written
     */
    public static void ParserFilter() throws IOException {
        java.nio.charset.Charset charset = java.nio.charset.Charset.defaultCharset();
        String tokens = tokenize(readAll(SRC_FILE_PATH, charset));
        System.out.println(tokens);
        writeAll(DST_FILE_PATH, tokens, charset);
    }

    /**
     * Tokenizes {@code srcPath} and writes the token string to {@code dstPath}.
     *
     * @param srcPath path of the HTML/text file to read
     * @param dstPath path of the file to (over)write with the token string
     * @param charset charset used for both reading and writing
     * @return the token string that was written
     * @throws IOException if the input cannot be read or the output cannot be written
     */
    public static String ParserFilter(String srcPath, String dstPath,
            java.nio.charset.Charset charset) throws IOException {
        String tokens = tokenize(readAll(srcPath, charset));
        writeAll(dstPath, tokens, charset);
        return tokens;
    }

    /** Reads the whole file, however large, into a string. */
    private static String readAll(String path, java.nio.charset.Charset charset)
            throws IOException {
        StringBuilder text = new StringBuilder();
        char[] chunk = new char[8192];
        // try-with-resources closes the reader even when read() throws.
        try (Reader reader = new InputStreamReader(new FileInputStream(path), charset)) {
            int read;
            while ((read = reader.read(chunk)) != -1) {
                text.append(chunk, 0, read);
            }
        }
        return text.toString();
    }

    /** Writes {@code text} to {@code path}, replacing any previous content. */
    private static void writeAll(String path, String text, java.nio.charset.Charset charset)
            throws IOException {
        try (Writer writer = new OutputStreamWriter(new FileOutputStream(path), charset)) {
            writer.write(text);
        }
    }

    /**
     * Strips tags and splits the remaining text into space-separated tokens.
     *
     * <p>Everything from {@code '<'} up to the next {@code '>'} is skipped.
     * Whitespace and {@code &nbsp;} are not emitted themselves, but they end the
     * current ASCII word so that "hello world" yields two tokens. Tags do not
     * end a word.
     */
    private static String tokenize(String html) {
        StringBuilder out = new StringBuilder(html.length() * 2);
        boolean inTag = false;
        // Starts as true so that a leading non-word character gets a space in front of it.
        boolean lastWasWordChar = true;
        boolean pendingBreak = false;

        for (int i = 0; i < html.length(); i++) {
            char c = html.charAt(i);

            if (inTag) {
                if (c == '>') {
                    inTag = false;
                }
                continue;
            }
            if (c == '<') {
                inTag = true;
                continue;
            }
            if (isSkippedWhitespace(c)) {
                pendingBreak = true;
                continue;
            }
            // startsWith is bounds-safe, so a '&' near the end of the input is just a character.
            if (html.startsWith(NBSP_ENTITY, i)) {
                i += NBSP_ENTITY.length() - 1;
                pendingBreak = true;
                continue;
            }

            if (isWordChar(c)) {
                if (!lastWasWordChar) {
                    out.append(' ');
                } else if (pendingBreak && out.length() > 0
                        && out.charAt(out.length() - 1) != ' ') {
                    // Whitespace separated two ASCII words: keep them apart.
                    out.append(' ');
                }
                out.append(c);
                lastWasWordChar = true;
            } else if (isSeparatorPunctuation(c)) {
                out.append(' ');
            } else {
                if (lastWasWordChar) {
                    out.append(' ');
                }
                out.append(c).append(' ');
                lastWasWordChar = false;
            }
            pendingBreak = false;
        }
        return out.toString();
    }

    /** Returns true for an ASCII letter or digit. */
    private static boolean isWordChar(char c) {
        return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9');
    }

    /**
     * Returns true for the whitespace characters that are dropped from the text.
     * NOTE(review): only the ASCII space is covered; a full-width or
     * non-breaking space may also have been intended -- confirm.
     */
    private static boolean isSkippedWhitespace(char c) {
        return c == '\n' || c == ' ' || c == '\t' || c == '\r';
    }

    /**
     * Returns true for punctuation that is replaced by a single space.
     * NOTE(review): the ideographic comma (U+3001) is the only non-ASCII mark
     * here; other full-width marks may also have been intended -- confirm.
     */
    private static boolean isSeparatorPunctuation(char c) {
        return c == '\u3001' || c == '"' || c == ':' || c == ';' || c == '.';
    }
}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值