这里需需要注意一下:
如果txt文件的编码不是utf-8会是乱码,所以需要设置一下txt的编码。
package com.java.hanzi.utf;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class TxtCount {
/**
* @param args
*/
public static void main(String[] args) {
// TODO Auto-generated method stub
File file = new File("D:\\2012-0414.txt");
try {
//FileInputStream fin = new FileInputStream(file);
//FileReader默认使用的是GBK,查看123.txt文件的编码格式
//FileInputStreamReader(new InputStreamReader(new FileInputStream("path")),"UTF-8")
//
int count = 0;
//FileReader fr = new FileReader(file);
//System.out.println("fr.getEncoding()="+fr.getEncoding());
//BufferedReader bf = new BufferedReader(fr);
BufferedReader br=new BufferedReader(new InputStreamReader(new FileInputStream(file),"UTF-8"));
//System.out.println("fr.getEncoding()="+fr.getEncoding());
String str = null;
while((str=br.readLine())!=null){
count = count + calculator(str);
}
System.out.println("-----"+count);
br.close();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
public static int calculator(String str){
int count = 0;
String regEx = "[\\u4e00-\\u9fa5]";
Pattern p = Pattern.compile(regEx);
Matcher m = p.matcher(str);
while (m.find()) {
for (int i = 0; i <= m.groupCount(); i++) {
count = count + 1;
}
}
System.out.println(str);
System.out.println("共有 " + count + "个 ");
return count;
}
}