一种中文文本的快速分词方法(一)(未完待续)

这是本人梦寐以求的东西,终于搞出来了。这是写智能程序的第一步啊!

下面是中文分词之前的文本清洗(预处理)代码,供大家参考。

package org.zhukovasky.fileutil;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang3.StringUtils;
/**
 * Text-cleaning utilities for Chinese text.
 * <p>
 * Each input line is normalized (full-width and enumerator characters are
 * mapped to ASCII), then whitespace, ASCII letters and digits, and a fixed set
 * of punctuation and symbol characters are removed. Files are read and written
 * as UTF-8.
 *
 * @author zhukovasky
 * @version 1.0
 * @since 2013.12
 * @email zhukovasky@163.com
 */
public class FileProcess {
	/**
	 * Single-character normalization table: key is the character to replace,
	 * value is its ASCII replacement (one or two characters, e.g. "10").
	 * <p>
	 * Every non-ASCII key is written as a Unicode escape so the table survives
	 * re-encoding of this source file; a full-width key that is flattened to
	 * its ASCII twin turns the entry into a no-op, or (for the full-width
	 * backslash) into an unterminated string literal.
	 */
	public final static Map<String,String> mapDemo;

	/**
	 * Characters deleted by {@link #replacePunctuation(String)}: ASCII
	 * punctuation, digits and the space, followed by assorted symbols
	 * (bullets, stars, arrows, corner brackets, curly quotes, ellipsis ...).
	 */
	private static final String DROPPED_CHARS=
			",.<>|[]?\":()-~!'0123456789=; {}%+/"
			+"\u25CF\u2192\u203B\u00B7\u2605\u32A3\u00B6\u222E\u20AC\u2600"
			+"\u0398\u25CB\u2116\u2237\u2642\u2640\u00A7\u300C\u300D\u2502"
			+"\u00D7\u2606\u2019\u2026\u2018";

	/** One or more regex whitespace characters; compiled once. */
	private static final Pattern WHITESPACE=Pattern.compile("\\s+");

	/** A single ASCII letter or digit; compiled once. */
	private static final Pattern ASCII_ALNUM=Pattern.compile("[a-zA-Z0-9]");

	static{
		mapDemo=new HashMap<String,String>();

		// CJK and typographic punctuation -> closest ASCII character.
		mapDemo.put("\u3002", ".");   // ideographic full stop
		mapDemo.put("\u3008", "<");   // left angle bracket
		mapDemo.put("\u3009", ">");   // right angle bracket
		mapDemo.put("\u2016", "|");   // double vertical line
		mapDemo.put("\u300A", "<");   // left double angle bracket
		mapDemo.put("\u300B", ">");   // right double angle bracket
		mapDemo.put("\u3014", "[");   // left tortoise shell bracket
		mapDemo.put("\u3015", "]");   // right tortoise shell bracket
		mapDemo.put("\uFE56", "?");   // small question mark
		mapDemo.put("\u201C", "\"");  // left double quotation mark
		mapDemo.put("\u201D", "\"");  // right double quotation mark
		mapDemo.put("\u3001", ",");   // ideographic comma
		mapDemo.put("\u3010", "[");   // left black lenticular bracket
		mapDemo.put("\u3011", "]");   // right black lenticular bracket
		mapDemo.put("\u2014", "-");   // em dash
		mapDemo.put("\u2035", "'");   // reversed prime

		// Full-width ASCII forms -> ASCII.
		mapDemo.put("\uFF0C", ",");
		mapDemo.put("\uFF1F", "?");
		mapDemo.put("\uFF1A", ":");
		mapDemo.put("\uFF08", "(");
		mapDemo.put("\uFF09", ")");
		mapDemo.put("\uFF5E", "~");
		mapDemo.put("\uFF01", "!");
		mapDemo.put("\uFF3C", "\\");
		mapDemo.put("\uFF0F", "/");
		mapDemo.put("\uFF0E", ".");
		putNumberRun(0xFF10, 0, 10);  // full-width digits 0-9

		// Enumerator glyphs -> the number they denote.
		putNumberRun(0x2460, 1, 9);   // circled digits 1-9
		putNumberRun(0x3220, 1, 10);  // parenthesized ideographs 1-10
		putNumberRun(0x2474, 1, 10);  // parenthesized numbers 1-10
		putNumberRun(0x2488, 1, 10);  // numbers with full stop 1-10
		putNumberRun(0x2160, 1, 10);  // upper-case Roman numerals 1-10
		putNumberRun(0x2170, 1, 10);  // lower-case Roman numerals 1-10
	}

	/**
	 * Registers {@code count} consecutive code points, starting at
	 * {@code firstCodePoint}, as keys mapping to the decimal strings
	 * {@code firstValue}, {@code firstValue + 1}, ...
	 */
	private static void putNumberRun(int firstCodePoint,int firstValue,int count){
		for(int i=0;i<count;i++){
			mapDemo.put(String.valueOf((char)(firstCodePoint+i)), String.valueOf(firstValue+i));
		}
	}

	/**
	 * Replaces every character that has an entry in {@link #mapDemo} with its
	 * mapped value, in a single left-to-right pass.
	 *
	 * @param line text to normalize; must not be null
	 * @return the normalized text
	 */
	private static String replace(String line){
		StringBuilder out=new StringBuilder(line.length());
		for(int i=0;i<line.length();i++){
			char c=line.charAt(i);
			String mapped=mapDemo.get(String.valueOf(c));
			if(mapped!=null){
				out.append(mapped);
			}else{
				out.append(c);
			}
		}
		return out.toString();
	}

	/**
	 * Removes every character listed in {@link #DROPPED_CHARS}.
	 * <p>
	 * The result is built in a separate buffer: deleting from the string that
	 * is being indexed would make the loop step over the character that slides
	 * into the current position, leaving adjacent punctuation behind.
	 *
	 * @param line text to filter; must not be null
	 * @return {@code line} without the dropped characters
	 */
	private static String replacePunctuation(String line){
		StringBuilder kept=new StringBuilder(line.length());
		for(int i=0;i<line.length();i++){
			char c=line.charAt(i);
			if(DROPPED_CHARS.indexOf(c)<0){
				kept.append(c);
			}
		}
		return kept.toString();
	}

	/**
	 * Removes all regex whitespace (space, tab, CR, LF, VT, FF).
	 *
	 * @param line text to filter; may be null
	 * @return the text without whitespace, or "" when {@code line} is null
	 */
	private static String dropBlank(String line){
		if(line==null){
			return "";
		}
		return WHITESPACE.matcher(line).replaceAll("");
	}

	/**
	 * Removes ASCII digits and, despite the name, ASCII letters as well.
	 *
	 * @param line text to filter; may be null
	 * @return the text without [a-zA-Z0-9], or "" when {@code line} is null
	 */
	private static String dropNumber(String line){
		if(line==null){
			return "";
		}
		return ASCII_ALNUM.matcher(line).replaceAll("");
	}

	/**
	 * Runs the full cleaning pipeline on one line: normalize, trim, drop
	 * whitespace, drop ASCII letters/digits, drop punctuation, trim.
	 * The two trims also strip leading/trailing control characters that the
	 * whitespace regex does not match.
	 */
	private static String cleanLine(String line){
		String normalized=replace(line).trim();
		String lettersGone=dropNumber(dropBlank(normalized));
		return replacePunctuation(lettersGone).trim();
	}

	/**
	 * Cleans the text of {@code origin} line by line and writes the result to
	 * {@code newFile}, both as UTF-8.
	 * <p>
	 * Cleaned lines are written back to back with no line separator, so the
	 * output is one continuous run of text. I/O failures (including a missing
	 * input file) are reported on standard error and do not propagate; in that
	 * case the output file may be missing or incomplete.
	 *
	 * @param origin  file to read
	 * @param newFile file to (over)write with the cleaned text
	 * @return {@code newFile}
	 */
	public static File processFile(File origin,File newFile){
		// Reader is opened first so a missing input never creates/truncates the output.
		try(BufferedReader reader=new BufferedReader(
					new InputStreamReader(new FileInputStream(origin),StandardCharsets.UTF_8));
				BufferedWriter writer=new BufferedWriter(
					new OutputStreamWriter(new FileOutputStream(newFile),StandardCharsets.UTF_8))){
			String line;
			while((line=reader.readLine())!=null){
				writer.write(cleanLine(line));
			}
		}catch(IOException e){
			// Best effort, as before: report and still hand back newFile.
			e.printStackTrace();
		}
		return newFile;
	}
}






  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值