Java 英文分词:根据标点对英文文本进行分词并统计词频

package wordCount;

import java.util.*;

import java.util.Map.*;

import java.util.regex.*;

/**
 * Counts how many times each regular-expression match ("word") occurs in a text.
 *
 * <p>By default a word is a maximal run of ASCII letters, so punctuation, digits and
 * whitespace all act as separators. Matching is case-sensitive: {@code "Hello"} and
 * {@code "hello"} are counted as different words.
 */
public class WordCount {

    /** Regular expression whose matches are counted. */
    private String pattern = "([a-zA-Z]+)";

    /** Result of the most recent {@link #count(String)} call; empty until then. */
    private Map<String, Integer> map = new HashMap<String, Integer>();

    /**
     * Replaces the regular expression used to recognise words.
     *
     * <p>The misspelled name is kept so existing callers keep compiling; new code
     * should prefer {@link #setPattern(String)}.
     *
     * @param p regular expression; it is compiled on the next {@link #count(String)} call
     */
    public void setPatterm(String p) {
        this.pattern = p;
    }

    /**
     * Correctly spelled alias of {@link #setPatterm(String)}.
     *
     * @param p regular expression; it is compiled on the next {@link #count(String)} call
     */
    public void setPattern(String p) {
        this.setPatterm(p);
    }

    /**
     * Returns the word-to-occurrences map built by the last {@link #count(String)} call.
     *
     * @return the live result map; never {@code null}, empty if nothing was counted yet
     */
    public Map<String, Integer> getMap() {
        return this.map;
    }

    /**
     * Scans {@code str} and rebuilds the result map from scratch.
     *
     * @param str text to analyse; {@code null} is treated as empty text
     * @throws java.util.regex.PatternSyntaxException if the configured pattern is invalid
     */
    public void count(String str) {
        Map<String, Integer> counts = new HashMap<String, Integer>();

        if (str != null) {
            Matcher matcher = Pattern.compile(this.pattern).matcher(str);
            while (matcher.find()) {
                String key = matcher.group();
                // One lookup tells us both whether the word is known and its current count.
                Integer seen = counts.get(key);
                counts.put(key, seen == null ? 1 : seen + 1);
            }
        }

        // Publish only after a successful scan so a bad pattern leaves the old result intact.
        this.map = counts;
    }
}

output包

这个包包括了三个类,OutputProcesser,ConsoleOutput,FileOutput,其中OutputProcesser作为基类

OutputProcesser.java

构造器接收一个由WordCount的getMap()返回的map

processInternal()声明为抽象方法,不同的子类实现不同的输出方式

output()作为外部调用的接口,接口会循环map,给processInternal提供entry,processInternal会根据提供的entry进行自己的输出

beforeOutput()在输出循环开始前调用

afterOutput()在输出循环结束后调用。beforeOutput()和afterOutput()这两个方法使用了模板方法设计模式,供子类进行输出前的准备和输出后的收尾操作,例如输出到文件时,在循环开始前打开文件,循环结束之后关闭文件

package output;

import java.util.Iterator;

import java.util.Map;

import java.util.Map.Entry;

/**
 * Base class for emitting the entries of a map one at a time.
 *
 * <p>{@link #output()} drives the process: it calls {@link #beforeOutput(Map)}, hands
 * every entry to {@link #processInternal(Entry)}, and finally calls
 * {@link #afterOutput(Map)}. Subclasses decide where each entry goes and may use the
 * two hooks to acquire and release resources.
 */
public abstract class OutputProcesser {

    /** The map whose entries are emitted; stored as given, not copied. */
    private final Map<?, ?> map;

    /**
     * @param map the map to emit; wildcard-typed so raw and parameterized maps are both accepted
     */
    public OutputProcesser(Map<?, ?> map) {
        this.map = map;
    }

    /**
     * Emits every entry of the map.
     *
     * <p>Nothing is emitted, and {@link #afterOutput(Map)} is not called, when
     * {@link #beforeOutput(Map)} returns {@code false}. Once the preparation hook has
     * succeeded, the clean-up hook always runs, even if emitting an entry throws.
     */
    public void output() {
        if (!this.beforeOutput(this.map)) {
            return;
        }

        try {
            for (Entry<?, ?> entry : this.map.entrySet()) {
                this.processInternal(entry);
            }
        } finally {
            // Guarantees subclasses can release what beforeOutput acquired.
            this.afterOutput(this.map);
        }
    }

    /**
     * Hook invoked once before the first entry is emitted.
     *
     * @param map the map about to be emitted
     * @return {@code true} to proceed, {@code false} to skip the output entirely
     */
    protected boolean beforeOutput(Map<?, ?> map) {
        return true;
    }

    /**
     * Hook invoked once after the last entry was emitted (or after emitting failed).
     * The default implementation does nothing.
     *
     * @param map the map that was emitted
     */
    protected void afterOutput(Map<?, ?> map) {
    }

    /**
     * Emits a single entry.
     *
     * @param entry the entry to emit
     */
    protected abstract void processInternal(Entry<?, ?> entry);
}

ConsoleOutput.java

package output;

import java.util.Map.Entry;

import java.util.*;

/**
 * Output processor that prints every map entry as one line on standard output.
 */
public class ConsoleOutput extends OutputProcesser {

    /**
     * @param map the map whose entries will be printed
     */
    public ConsoleOutput(Map map) {
        super(map);
    }

    /**
     * Prints the formatted entry followed by a line break.
     *
     * @param entry the entry to print
     */
    protected void processInternal(Entry entry) {
        String line = this.logString(entry);
        System.out.println(line);
    }

    /**
     * Formats an entry as {@code "<key> : <value> times"}.
     *
     * @param entry the entry to format
     * @return the formatted line, without a trailing line break
     */
    protected String logString(Entry entry) {
        StringBuilder text = new StringBuilder();
        text.append(entry.getKey());
        text.append(" : ");
        text.append(entry.getValue());
        text.append(" times");
        return text.toString();
    }
}

FileOutput.java

package output;

import java.io.FileWriter;

import java.io.IOException;

import java.io.PrintWriter;

import java.util.Map;

import java.util.Map.Entry;

/**
 * Output processor that writes every map entry as one line of a text file, using the
 * line format inherited from {@code ConsoleOutput}.
 *
 * <p>The file is opened (and truncated) in {@link #beforeOutput(Map)} and closed in
 * {@link #afterOutput(Map)}.
 */
public class FileOutput extends ConsoleOutput {

    /** Path of the file to write. */
    private String filePath = "result.txt";

    /** Open writer while an output run is in progress, otherwise {@code null}. */
    private PrintWriter pw;

    /**
     * Creates a processor that writes to {@code result.txt} in the working directory.
     *
     * @param map the map whose entries will be written
     */
    public FileOutput(Map map) {
        super(map);
    }

    /**
     * Creates a processor that writes to the given file.
     *
     * @param map the map whose entries will be written
     * @param filePath path of the file to create or overwrite
     * @throws IllegalArgumentException if {@code filePath} is {@code null}
     */
    public FileOutput(Map map, String filePath) {
        super(map);
        if (filePath == null) {
            throw new IllegalArgumentException("filePath must not be null");
        }
        this.filePath = filePath;
    }

    /**
     * Opens the target file for writing.
     *
     * @param map the map about to be written (unused here)
     * @return {@code true} if the file was opened; {@code false} if it could not be,
     *         in which case the failure is reported on standard error and the whole
     *         output run is skipped instead of failing later on a missing writer
     */
    protected boolean beforeOutput(Map map) {
        try {
            this.pw = new PrintWriter(new FileWriter(this.filePath));
            return true;
        } catch (IOException e) {
            this.pw = null;
            System.err.println("IOException before process output: " + e.getMessage());
            return false;
        }
    }

    /**
     * Flushes and closes the file. Safe to call when the file was never opened.
     *
     * @param map the map that was written (unused here)
     */
    protected void afterOutput(Map map) {
        if (this.pw == null) {
            return;
        }

        // Closing the PrintWriter also closes the FileWriter it wraps.
        this.pw.close();

        // PrintWriter never throws on write failure; its error flag is the only signal.
        if (this.pw.checkError()) {
            System.err.println("IOException after process output");
        }
        this.pw = null;
    }

    /**
     * Writes the formatted entry as one line of the file.
     *
     * @param entry the entry to write
     */
    protected void processInternal(Entry entry) {
        this.pw.println(this.logString(entry));
    }
}

run包

该包对wordCount进行测试,并调用输出类,同时进行简单的性能测试(使用内存和耗时)。

首先会打开一个文本,并读入内存,将文本交给wordCount进行处理

package run;

import java.io.File;

import java.io.FileInputStream;

import java.util.logging.ConsoleHandler;

import org.omg.SendingContext.RunTime;

import wordCount.WordCount;

import output.*;

/**
 * Command-line driver: reads a text file, counts its words with {@code WordCount},
 * prints a rough memory/time profile of the counting step, and writes the result
 * through a {@code FileOutput}.
 */
public class Run {

    /** Heap size, used heap and the two timestamps (nanoseconds) of the current profile. */
    private long totalMemory = 0, time1 = 0, time2 = 0, memoryUsage = 0;

    /** Starts a measurement: records the current heap size and the start time. */
    public void beginProfile() {
        this.totalMemory = Runtime.getRuntime().totalMemory();
        // nanoTime is monotonic, so the elapsed time is immune to wall-clock adjustments.
        this.time1 = this.time2 = System.nanoTime();
    }

    /**
     * Ends the measurement started by {@link #beginProfile()} and prints the heap
     * currently in use (bytes) and the elapsed time (milliseconds) to standard output.
     */
    public void endProfile() {
        Runtime runtime = Runtime.getRuntime();
        // Sample total and free together: the heap may have been resized since beginProfile().
        this.totalMemory = runtime.totalMemory();
        this.memoryUsage = this.totalMemory - runtime.freeMemory();
        this.time2 = System.nanoTime();

        System.out.println("memory usage:" + this.memoryUsage + " B");
        System.out.println("time usage:" + ((this.time2 - this.time1) / 1000000L) + "ms");
    }

    /**
     * Reads a whole file into a string using the platform default charset.
     *
     * <p>This is best-effort: if the file cannot be read, the problem is reported on
     * standard error and whatever was read so far (possibly the empty string) is returned.
     *
     * @param filePath path of the file to read
     * @return the file content, or a prefix of it / the empty string on failure
     */
    public String readFromFile(String filePath) {
        File file = new File(filePath);
        long length = file.length();
        if (length > Integer.MAX_VALUE) {
            System.err.println("Cannot read " + filePath + ": file is larger than 2 GB");
            return "";
        }

        byte[] content = new byte[(int) length];
        int filled = 0;
        FileInputStream input = null;
        try {
            input = new FileInputStream(file);
            // A single read() may return fewer bytes than requested, so loop until full or EOF.
            while (filled < content.length) {
                int n = input.read(content, filled, content.length - filled);
                if (n < 0) {
                    break;
                }
                filled += n;
            }
        } catch (java.io.IOException e) {
            System.err.println("Cannot read " + filePath + ": " + e.getMessage());
        } finally {
            if (input != null) {
                try {
                    input.close();
                } catch (java.io.IOException ignored) {
                    // Nothing useful can be done about a failed close of a read-only stream.
                }
            }
        }

        return new String(content, 0, filled);
    }

    /**
     * Counts the words of the input file and writes the result to {@code result.txt}.
     *
     * @param args optional; {@code args[0]} is the input file, default {@code messages.txt}
     */
    public static void main(String[] args) {
        String inputPath = (args != null && args.length > 0) ? args[0] : "messages.txt";

        Run run = new Run();
        WordCount wordCount = new WordCount();

        run.beginProfile();
        wordCount.count(run.readFromFile(inputPath));
        run.endProfile();

        OutputProcesser out = new FileOutput(wordCount.getMap());
        out.output();
    }
}

下面是简单的性能测试结果

903beaabfd1c25e5ee65822bdb0775ed.png

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值