【中文分词】亲手开发一款中文分词器——源代码

原创 2015年07月09日 10:01:23

这里提供我开发的中文分词器源代码。代码分为三个部分:

(一)状态矩阵元素对象GNode。    动态规划中,状态矩阵的元素需要记录当前最大概率和前一个最优匹配状态,这个矩阵是一个二维数组,每一个元素就是一个GNode对象。

(二)统计学习使用的嵌套哈希表THash。 用于在统计学习时记录结果的HashMap。这个哈希表有三层嵌套,对输入的数据需要做一些必要的操作;为了简单起见,我对这个哈希表进行了封装,并增加了几个方法,方便程序的编写。

(三)分词器对象CGSegmenter。   其中封装了学习方法,标注器行为等,是分词器的主要功能对象。


注:本文只分享代码,不对代码进行分析解释。如需了解原理和代码分析,请参考博文《【中文分词】亲手开发一款中文分词器——原理》。


源代码如下:

GNode

/**
 * One cell of the dynamic-programming state matrix used by the segmenter.
 * Each cell remembers the best probability reached so far, the tag the cell
 * stands for, and the tag of the predecessor on that best path.
 */
public class GNode {
	/** Best probability found for this cell; a fresh cell starts at 0.0. */
	public Double MaxPos = 0.0;
	/** Tag represented by this cell. */
	public char CurTag;
	/** Tag of the previous cell on the best path into this cell. */
	public char PreTag;

	/** Creates a cell with probability 0.0 and unset (NUL) tags. */
	public GNode() {
		super();
	}
}


THash

/**
 * Nested counting table used during statistical learning.
 *
 * <p>Layout: character -> tag -> next character -> next tag -> count. Each
 * leaf counts how often a character carrying a given tag was followed by a
 * given next character carrying a given next tag. Counts are turned into
 * probabilities only when the table is written out.
 */
public class THash {
	// character -> tag -> next character -> next tag -> occurrence count
	private HashMap<Character, HashMap<Character, HashMap<Character, HashMap<Character, Double>>>> _myHashMap;

	// Count stored the first time a combination is seen.
	private static final Double INITIAL_VALUE = 1.0;
	// Factor applied to each group's total before dividing, so the written
	// probabilities of a group sum to slightly less than 1.
	private static final Double POSSIBILITY_INTERPOLATE_VALUE = 1.02;

	/** Creates an empty table. */
	public THash() {
		this._myHashMap = new HashMap<>();
	}

	/**
	 * Records one observation of (pri_key, pri_tag) followed by
	 * (sec_key, sec_tag), creating intermediate maps on demand.
	 *
	 * @param pri_key the preceding character
	 * @param pri_tag the tag of the preceding character
	 * @param sec_key the following character
	 * @param sec_tag the tag of the following character
	 */
	public void PutValue(char pri_key, char pri_tag, char sec_key, char sec_tag) {
		HashMap<Character, HashMap<Character, HashMap<Character, Double>>> prihash = this._myHashMap
				.get(pri_key);
		if (prihash == null) {
			prihash = new HashMap<>();
			this._myHashMap.put(pri_key, prihash);
		}

		HashMap<Character, HashMap<Character, Double>> seccharhash = prihash
				.get(pri_tag);
		if (seccharhash == null) {
			seccharhash = new HashMap<>();
			prihash.put(pri_tag, seccharhash);
		}

		HashMap<Character, Double> sectaghash = seccharhash.get(sec_key);
		if (sectaghash == null) {
			sectaghash = new HashMap<>();
			seccharhash.put(sec_key, sectaghash);
		}

		Double count = sectaghash.get(sec_tag);
		if (count == null) {
			sectaghash.put(sec_tag, THash.INITIAL_VALUE);
		} else {
			// Double is immutable: the incremented value has to be stored
			// back, otherwise the count never moves past its initial value.
			sectaghash.put(sec_tag, count + 1.0);
		}
	}

	/**
	 * Writes one line per recorded combination to {@code path}. A line is the
	 * four key characters (character, tag, next character, next tag)
	 * immediately followed by the combination's count divided by the
	 * interpolated total of its (character, tag, next character) group.
	 *
	 * @param path   file to create or overwrite
	 * @param format charset name used for the output file
	 */
	public void calculatePossibilityForAllCombinations(String path, String format) {
		// PrintWriter creates or truncates the file itself; try-with-resources
		// closes it even when writing fails part-way.
		try (PrintWriter writer = new PrintWriter(path, format)) {
			for (Character pri_key : this._myHashMap.keySet()) {
				HashMap<Character, HashMap<Character, HashMap<Character, Double>>>
					_pritaghash = this._myHashMap.get(pri_key);
				for (Character pri_tag : _pritaghash.keySet()) {
					HashMap<Character, HashMap<Character, Double>>
						_sechash = _pritaghash.get(pri_tag);
					for (Character sec_key : _sechash.keySet()) {
						HashMap<Character, Double> _sectaghash = _sechash.get(sec_key);
						double total = 0.0;
						for (Double count : _sectaghash.values()) {
							total += count;
						}
						total *= THash.POSSIBILITY_INTERPOLATE_VALUE;
						for (Character sec_tag : _sectaghash.keySet()) {
							StringBuilder sb = new StringBuilder();
							sb.append(pri_key);
							sb.append(pri_tag);
							sb.append(sec_key);
							sb.append(sec_tag);
							sb.append(_sectaghash.get(sec_tag) / total);
							writer.println(sb.toString());
						}
					}
				}
			}
		} catch (IOException e) {
			System.err
					.println("Error in method calculatePossibilityForAllCombinations() : "
							+ e.getMessage());
		}
	}
}

CGSegmenter

public class CGSegmenter {

	// Raw, space-separated training corpus; read by processTrainingMaterial().
	private String TrainingMaterialPath;
	// Tagged corpus (character/tag pairs) written by processTrainingMaterial()
	// and read back by the two learning methods.
	private String TaggedTrainingMaterialPath;
	// Transition-probability file written by the learning step and loaded by
	// initialiseTagHashMap().
	private String FinalTagFilePathForRelation;
	// Single (character + tag) probability file written by learningSingleTag()
	// and loaded by initialiseTagHashMap().
	private String FinalTagFilePathForSingle;
	// Charset name passed to every reader and writer this class opens.
	private String DefaultFileFormat = "UTF-8";

	// Transition counts collected by learningRelationTag().
	private THash _thash;
	// Occurrence counts per "character + tag" key, built by learningSingleTag().
	private HashMap<String, Double> _charHash;

	// Loaded transition table: "char+tag" -> ("next char+tag" -> probability).
	private HashMap<String, HashMap<String, Double>> _tagHashForRelation;
	// Loaded table of "char+tag" -> probability.
	private HashMap<String, Double> _tagHashForSingle;

	// Fallback returned by getPossiblity() when a pair is not in the
	// transition table.
	private Double StrangeCombinationDefaultPossibility = 0.00035;
	// Fallback used by segmentWords() when a "char+tag" key is not in the
	// single-probability table.
	private Double StrangeSingleDefaultPossibility = 0.0005;

	// Output words; segmentSentence() and segmentWords() append to it.
	private List<String> resultlist;
	
	/**
	 * Replaces the fallback probability that getPossiblity() returns for a
	 * pair missing from the transition table.
	 *
	 * @param possibility the new fallback value
	 */
	public void setDefaultStrangeCombinationPossibility(Double possibility) {
		StrangeCombinationDefaultPossibility = possibility;
	}
	
	/**
	 * @return the fallback probability currently used for pairs missing from
	 *         the transition table
	 */
	public Double getStrangeCombinationDefaultPossibility() {
		return this.StrangeCombinationDefaultPossibility;
	}
	
	/**
	 * Replaces the fallback probability used by segmentWords() for a
	 * "character + tag" key missing from the single-probability table.
	 *
	 * @param strangeSingleDefaultPossibility the new fallback value
	 */
	public void setStrangeSingleDefaultPossibility(Double strangeSingleDefaultPossibility) {
		this.StrangeSingleDefaultPossibility = strangeSingleDefaultPossibility;
	}
	
	/**
	 * @return the fallback probability currently used for keys missing from
	 *         the single-probability table
	 */
	public Double getStrangeSingleDefaultPossibility() {
		return this.StrangeSingleDefaultPossibility;
	}

	/**
	 * Builds a segmenter backed by the probability files under {@code tf/}.
	 *
	 * @param NeedInitialiseFinalTagFile when true, the corpus under
	 *        {@code tm/} is pre-processed and learned first, which rewrites
	 *        the probability files before they are loaded
	 */
	public CGSegmenter(boolean NeedInitialiseFinalTagFile) {
		this.resultlist = new LinkedList<String>();
		this.FinalTagFilePathForSingle = "tf/tag-si.txt";
		this.FinalTagFilePathForRelation = "tf/tag-re.txt";

		if (NeedInitialiseFinalTagFile) {
			// Training is optional: tag the raw corpus, then gather statistics.
			this.TaggedTrainingMaterialPath = "tm/msr-out.txt";
			this.TrainingMaterialPath = "tm/msr-in.txt";
			processTrainingMaterial();
			statisticTaggedTrainingMaterial();
		}

		// Always finish by loading the probability tables from disk.
		initialiseTagHashMap();
	}

	// 预处理中文语料库
	/**
	 * Converts the space-separated training corpus into the tagged corpus.
	 * Every input line yields one output line in which each character is
	 * followed by its tag: S for a one-character word, B/E for the first and
	 * last character of a longer word, and M for the characters in between.
	 * Prints a message and does nothing when the corpus file is missing.
	 */
	protected void processTrainingMaterial() {
		File f = new File(this.TrainingMaterialPath);
		if (!f.exists()) {
			System.err.println("未找到中文语料库文件: "
					+ this.TrainingMaterialPath);
			return;
		}
		// try-with-resources closes every stream even when reading or
		// writing fails part-way.
		try (FileInputStream fis = new FileInputStream(f);
				InputStreamReader re = new InputStreamReader(fis,
						this.DefaultFileFormat);
				BufferedReader reader = new BufferedReader(re);
				PrintWriter writer = new PrintWriter(
						this.TaggedTrainingMaterialPath, this.DefaultFileFormat)) {
			System.out.println(new Date().toString() + " 开始预处理中文语料库。");
			String temp;
			while ((temp = reader.readLine()) != null) {
				writer.println(this.tagLine(temp.toCharArray()));
			}
			System.out.println(new Date().toString() + " 完成中文语料库预处理。");
		} catch (IOException e) {
			System.err
					.println("Error in method processTrainingMaterial() : "
							+ e.getMessage());
		}
	}

	// Tags one corpus line. Words are runs of non-space characters; a leading
	// byte-order mark (U+FEFF, 65279) is skipped. Empty lines give an empty
	// result, and the last word is handled whether or not the line ends with
	// a space.
	private String tagLine(char[] chararr) {
		StringBuilder sb = new StringBuilder();
		int j = 0;
		if (chararr.length > 0 && chararr[0] == 65279) {
			j = 1;
		}
		while (j < chararr.length) {
			if (chararr[j] == ' ') {
				j++;
				continue;
			}
			int i = j;
			while (j < chararr.length && chararr[j] != ' ') {
				j++;
			}
			// The word occupies chararr[i .. j-1].
			if (j - i == 1) {
				sb.append(chararr[i]).append('S');
			} else {
				sb.append(chararr[i]).append('B');
				for (int k = i + 1; k < j - 1; k++) {
					sb.append(chararr[k]).append('M');
				}
				sb.append(chararr[j - 1]).append('E');
			}
		}
		return sb.toString();
	}

	// 统计学习
	/**
	 * Runs both learning passes over the tagged corpus: first the standalone
	 * probabilities P(x), then the transition probabilities P(X | Y).
	 */
	protected void statisticTaggedTrainingMaterial() {
		// Standalone state probabilities.
		learningSingleTag();
		// State-transition probabilities.
		learningRelationTag();
	}

	// 学习状态独立概率
	/**
	 * Counts how often each "character + tag" pair occurs in the tagged
	 * corpus and writes every pair, immediately followed by its relative
	 * frequency, to the single-probability file. Prints a message and does
	 * nothing when the tagged corpus is missing.
	 */
	protected void learningSingleTag() {
		File f = new File(this.TaggedTrainingMaterialPath);
		if (!f.exists()) {
			System.err.println("未找到训练语料文件"
					+ this.TaggedTrainingMaterialPath);
			return;
		}
		this._charHash = new HashMap<>();
		double total = 0.0;
		// try-with-resources closes the streams even when an I/O error
		// interrupts the pass.
		try (FileInputStream fis = new FileInputStream(f);
				InputStreamReader re = new InputStreamReader(fis,
						this.DefaultFileFormat);
				BufferedReader reader = new BufferedReader(re)) {
			System.out.println(new Date().toString() + " 开始学习独立概率。");
			String temp;
			while ((temp = reader.readLine()) != null) {
				char[] chararr = temp.toCharArray();
				// Skip a leading byte-order mark (U+FEFF).
				int i = (chararr.length > 0 && chararr[0] == 65279) ? 1 : 0;
				// Consume whole character/tag pairs only; a dangling
				// character without a tag is ignored instead of overrunning
				// the array.
				while (i + 1 < chararr.length) {
					String key = String.valueOf(chararr, i, 2);
					Double seen = this._charHash.get(key);
					if (seen == null) {
						this._charHash.put(key, 1.0);
					} else {
						this._charHash.put(key, seen + 1.0);
					}
					total += 1.0;
					i += 2;
				}
			}

			// PrintWriter creates or truncates the output file itself.
			try (PrintWriter writer_char = new PrintWriter(
					this.FinalTagFilePathForSingle, this.DefaultFileFormat)) {
				for (String key : this._charHash.keySet()) {
					writer_char.print(key);
					writer_char.println(this._charHash.get(key) / total);
				}
			}
			System.out
					.println(new Date().toString() + " 完成独立概率学习。");
		} catch (IOException e) {
			System.err.println("Error in method learningSingleTag() : "
					+ e.getMessage());
		}
	}

	// 学习状态转移概率
	/**
	 * Collects transition counts from the tagged corpus into a fresh THash
	 * and has it write the transition-probability file. The first pair of
	 * every line is recorded as following the artificial start pair
	 * ('~', '~'). Prints a message and does nothing when the tagged corpus is
	 * missing; nothing is written when reading fails.
	 */
	protected void learningRelationTag() {
		File f = new File(this.TaggedTrainingMaterialPath);
		if (!f.exists()) {
			System.err.println("未找到训练语料库"
					+ this.TaggedTrainingMaterialPath);
			return;
		}
		this._thash = new THash();
		// try-with-resources closes the streams even when reading fails.
		try (FileInputStream fis = new FileInputStream(f);
				InputStreamReader re = new InputStreamReader(fis,
						this.DefaultFileFormat);
				BufferedReader reader = new BufferedReader(re)) {
			System.out.println(new Date().toString()
					+ " 开始学习状态转移概率。");
			String temp;
			while ((temp = reader.readLine()) != null) {
				char[] chararr = temp.toCharArray();
				// Step over whole character/tag pairs; stopping at
				// i + 1 < length keeps an odd-length line from overrunning
				// the array.
				for (int i = 0; i + 1 < chararr.length; i += 2) {
					if (i == 0) {
						this._thash.PutValue('~', '~', chararr[i],
								chararr[i + 1]);
					} else {
						this._thash.PutValue(chararr[i - 2],
								chararr[i - 1], chararr[i], chararr[i + 1]);
					}
				}
			}
		} catch (IOException e) {
			System.err.println("Error in method learningRelationTag() : "
					+ e.getMessage());
			return;
		}
		this._thash.calculatePossibilityForAllCombinations(
				this.FinalTagFilePathForRelation,
				this.DefaultFileFormat);
		System.out.println(new Date().toString()
				+ " 完成状态转移概率学习。");
		System.out.println(new Date().toString() + " 训练语料库学习完毕。");
	}

	// 初始化标注器表
	/**
	 * Loads the two probability tables from disk.
	 *
	 * <p>A relation-file line is four key characters (character, tag, next
	 * character, next tag) followed by a decimal number; a single-file line
	 * is two key characters (character, tag) followed by a decimal number.
	 * The number is parsed with {@link Double#parseDouble(String)}, so plain
	 * and scientific notation of any length are both read correctly.
	 *
	 * <p>Both tables are always created, so they are empty rather than null
	 * when a file is missing. Lines too short to hold a key and a value, or
	 * whose value is not a number, are skipped.
	 */
	protected void initialiseTagHashMap() {
		this._tagHashForRelation = new HashMap<>();
		File f = new File(this.FinalTagFilePathForRelation);
		if (!f.exists()) {
			System.out.println("未找到标注器初始化文件"
					+ this.FinalTagFilePathForRelation);
		} else {
			try (FileInputStream fis = new FileInputStream(f);
					InputStreamReader re = new InputStreamReader(fis,
							this.DefaultFileFormat);
					BufferedReader reader = new BufferedReader(re)) {
				String temp;
				while ((temp = reader.readLine()) != null) {
					if (temp.length() <= 4) {
						continue;
					}
					double pos;
					try {
						pos = Double.parseDouble(temp.substring(4));
					} catch (NumberFormatException e) {
						System.err.println("Skipping malformed relation line: "
								+ temp);
						continue;
					}
					String pri_key = temp.substring(0, 2);
					String sec_key = temp.substring(2, 4);
					HashMap<String, Double> _hash = this._tagHashForRelation
							.get(pri_key);
					if (_hash == null) {
						_hash = new HashMap<>();
						this._tagHashForRelation.put(pri_key, _hash);
					}
					_hash.put(sec_key, pos);
				}
			} catch (IOException e) {
				System.out.println("Error in method initialise -> relation");
			}
		}

		this._tagHashForSingle = new HashMap<>();
		f = new File(this.FinalTagFilePathForSingle);
		if (!f.exists()) {
			System.out.println("未找到标注器初始化文件"
					+ this.FinalTagFilePathForSingle);
		} else {
			try (FileInputStream fis = new FileInputStream(f);
					InputStreamReader re = new InputStreamReader(fis,
							this.DefaultFileFormat);
					BufferedReader reader = new BufferedReader(re)) {
				String temp;
				while ((temp = reader.readLine()) != null) {
					if (temp.length() <= 2) {
						continue;
					}
					double pos;
					try {
						pos = Double.parseDouble(temp.substring(2));
					} catch (NumberFormatException e) {
						System.err.println("Skipping malformed single line: "
								+ temp);
						continue;
					}
					this._tagHashForSingle.put(temp.substring(0, 2), pos);
				}
			} catch (IOException e) {
				System.err.println("Error in method initialise -> single");
			}
		}
	}

	/**
	 * Interprets the ASCII digits from {@code position} to the end of
	 * {@code chararr} as d0.d1d2d3..., i.e. the first digit is the units
	 * place and each later digit is one decimal place further right.
	 *
	 * @param chararr  array of ASCII digit characters
	 * @param position index of the digit that becomes the units place
	 * @return the decoded value, or 0.0 when {@code position} is at or past
	 *         the end of the array
	 */
	protected double convertStringtoDouble(char[] chararr, int position) {
		double value = 0.0;
		// Fold from the last digit backwards. The last digit contributes its
		// own value (chararr[k], not chararr[0]), and an empty range simply
		// yields 0.0.
		for (int k = chararr.length - 1; k >= position; k--) {
			value = chararr[k] - 48 + 0.1 * value;
		}
		return value;
	}

	// 拆分句子,分离标点,符号,数字和英文字符等
	/**
	 * Splits a sentence into runs and segments each run. The listed
	 * punctuation marks and spaces are dropped and act as boundaries; a run
	 * of ASCII letters, or a run of other characters below code 127, is
	 * emitted unchanged as a single token; everything else is handed to
	 * segmentWords(). Tokens are appended to the shared result list, which
	 * this method does not clear.
	 *
	 * @param sentence the characters to segment
	 * @return the shared result list
	 */
	protected List<String> segmentSentence(char[] sentence) {
		int i = 0; // start of the pending run not yet segmented
		int j = 0; // scan cursor
		try {
			while (j < sentence.length) {
				char c = sentence[j];

				// Separator: flush the pending run and drop the separator.
				if (c == '。' || c == ',' || c == '?' || c == ':'
						|| c == '!' || c == ' ' || c == '(' || c == ')') {
					if (i < j)
						this.segmentWords(sentence, i, j - 1);
					i = ++j;
					continue;
				}

				boolean letter = isAsciiLetter(c);
				if (letter || c < 127) {
					// Flush the pending run, then take the whole ASCII run
					// as one token.
					if (i < j)
						this.segmentWords(sentence, i, j - 1);
					i = j;
					if (letter) {
						while (j < sentence.length && isAsciiLetter(sentence[j]))
							j++;
					} else {
						while (j < sentence.length && sentence[j] < 127)
							j++;
					}
					this.resultlist.add(new String(sentence, i, j - i));
					i = j;
					continue;
				}

				j++;
			}
			if (i < j)
				this.segmentWords(sentence, i, j - 1);
			return this.resultlist;
		} catch (OutOfMemoryError | ArrayIndexOutOfBoundsException e) {
			// Diagnostics only. The indices may point one past the end, so
			// they are checked to keep the handler itself from throwing.
			System.out.println(sentence.length);
			if (i < sentence.length)
				System.out.println(sentence[i]);
			if (j < sentence.length)
				System.out.println(sentence[j]);
			return this.resultlist;
		}
	}

	// True for 'A'-'Z' (65-90) and 'a'-'z' (97-122).
	private static boolean isAsciiLetter(char c) {
		return (c >= 65 && c <= 90) || (c >= 97 && c <= 122);
	}

	// 分词操作
	/**
	 * Segments sentence[start..end] (both inclusive) by dynamic programming
	 * over the four tags B, M, E and S, and appends the resulting words to
	 * the shared result list.
	 *
	 * @param sentence the full character array
	 * @param start    index of the first character of the run
	 * @param end      index of the last character of the run
	 */
	protected void segmentWords(char[] sentence, int start, int end) {
		final int length = end - start + 1;
		final GNode[][] graph = new GNode[length][4];
		for (GNode[] column : graph) {
			for (int tag = 0; tag < column.length; tag++) {
				column[tag] = new GNode();
			}
		}

		// First column: only B and S can open a run; both are scored against
		// the artificial start pair "~~".
		for (int tag = 0; tag < 4; tag++) {
			GNode cell = graph[0][tag];
			cell.CurTag = getTag(tag);
			if (tag == 0 || tag == 3) {
				String key = new String(new char[] { sentence[start], cell.CurTag });
				cell.MaxPos = this.getPossiblity("~~", key);
			} else {
				cell.MaxPos = 0.0;
			}
		}

		// Remaining columns: for every tag keep the best-scoring legal
		// predecessor. ">=" lets a later predecessor win ties.
		for (int pos = 1; pos < length; pos++) {
			char previousChar = sentence[start + pos - 1];
			char currentChar = sentence[start + pos];
			for (int cur = 0; cur < 4; cur++) {
				GNode cell = graph[pos][cur];
				cell.CurTag = this.getTag(cur);
				String currentKey = new String(new char[] { currentChar, cell.CurTag });
				for (int prev = 0; prev < 4; prev++) {
					GNode before = graph[pos - 1][prev];
					if (!this.checkLogicalCombination(before.CurTag, cell.CurTag)) {
						continue;
					}
					String previousKey = new String(new char[] { previousChar, before.CurTag });
					// transition * single(previous) * best score of previous
					double score = this.getPossiblity(previousKey, currentKey);
					if (this._tagHashForSingle.containsKey(previousKey)) {
						score *= this._tagHashForSingle.get(previousKey);
					} else {
						score *= this.StrangeSingleDefaultPossibility;
					}
					score *= before.MaxPos;
					if (score >= cell.MaxPos) {
						cell.MaxPos = score;
						cell.PreTag = before.CurTag;
					}
				}
			}
		}

		// Pick the best tag of the last column.
		int best = 0;
		double bestScore = 0.0;
		for (int tag = 0; tag < 4; tag++) {
			double score = graph[length - 1][tag].MaxPos;
			if (score >= bestScore) {
				bestScore = score;
				best = tag;
			}
		}

		// Walk the back-pointers, filling character/tag pairs right to left.
		char[] tagged = new char[length * 2];
		for (int pos = length - 1; pos >= 0; pos--) {
			GNode cell = graph[pos][best];
			tagged[2 * pos + 1] = cell.CurTag;
			tagged[2 * pos] = sentence[start + pos];
			best = this.getInt(cell.PreTag);
		}

		// Cut a word after every E or S; whatever is left over at the very
		// end is emitted as a final word.
		StringBuilder word = new StringBuilder();
		for (int k = 0; k < tagged.length; k += 2) {
			word.append(tagged[k]);
			char tag = tagged[k + 1];
			boolean closesWord = (tag == 'E' || tag == 'S');
			boolean lastPair = (k == tagged.length - 2);
			if (closesWord || lastPair) {
				this.resultlist.add(word.toString());
				word = new StringBuilder();
			}
		}
	}

	/**
	 * Maps a tag index to its tag character: 0 -> 'B', 1 -> 'M', 2 -> 'E',
	 * 3 -> 'S'. Any other index also yields 'S'.
	 */
	protected char getTag(int value) {
		final String tags = "BMES";
		if (value >= 0 && value < tags.length()) {
			return tags.charAt(value);
		}
		return 'S';
	}

	/**
	 * Maps a tag character to its index: 'B' -> 0, 'M' -> 1, 'E' -> 2,
	 * 'S' -> 3. Any other character also yields 3.
	 */
	protected int getInt(char tag) {
		int index = "BMES".indexOf(tag);
		return index < 0 ? 3 : index;
	}

	//筛选可能搭配
	/**
	 * Tells whether tag {@code j} may directly follow tag {@code i}: after B
	 * or M only M or E is allowed; after E or S only B or S is allowed.
	 * Any other {@code i} gives false.
	 */
	protected boolean checkLogicalCombination(char i, char j) {
		boolean insideWord = (i == 'B' || i == 'M');
		if (insideWord) {
			return j == 'M' || j == 'E';
		}
		boolean wordFinished = (i == 'E' || i == 'S');
		if (wordFinished) {
			return j == 'B' || j == 'S';
		}
		return false;
	}

	/**
	 * Looks up the transition probability from {@code pri_key} to
	 * {@code sec_key} (both "character + tag" strings).
	 *
	 * @return the stored probability, or the default for unknown
	 *         combinations when the pair is not in the table
	 */
	protected Double getPossiblity(String pri_key, String sec_key) {
		HashMap<String, Double> followers = this._tagHashForRelation.get(pri_key);
		if (followers != null) {
			Double known = followers.get(sec_key);
			if (known != null) {
				return known;
			}
		}
		return this.StrangeCombinationDefaultPossibility;
	}
}







版权声明:本文为博主原创文章,未经博主允许不得转载。

相关文章推荐

Python下的中文分词实现

Python下的中文信息处理的实现(一) 一 安装和测试Python下的中文分词工具 参考http://hi.baidu.com/fooying/item/6ae7a0e26087e8d7eb34c9...

中文分词算法总结

什么是中文分词 众所周知,英文是以 词为单位的,词和词之间是靠空格隔开,而中文是以字为单位,句子中所有的字连起来才能描述一个意思。例如,英文句子I am a student,用中文则为:“我是一...

jieba中文分词源码分析(四)

一、未登录词问题在jieba中文分词的第一节曾提到未登录词问题 中文分词的难点 分词规范,词的定义还不明确 (《统计自然语言处理》宗成庆) 歧义切分问题,交集型切分问题,多义组合型...

自然语言处理之:c++中文分词(附源码)

github地址:https://github.com/jbymy c++中文分词, 分词算法, 有向无环图, hmm,隐性马尔科夫模型

中文分词的python实现-基于FMM算法

正向最大匹配算法(FMM)正向最大匹配算法(FMM)是一种基于词典的分词方法,思想很简单就是从左向右扫描寻找词的最大匹配,比如词典中同时含有“钓鱼”和“钓鱼岛”,那“钓鱼岛属于中国”就会被分词成“钓鱼...

【中文分词】亲手开发一款中文分词器--前言

所谓分词,就是将一个完整的句子按正确的意思将每一个词语提取出来的操作。分词是自然语言处理的第一个关卡,是让计算机处理人类语言的基础。但是计算机是无法识别人类语言中词汇这一概念的,更何况如何去识别理解词...

词性标注的python实现-基于平均感知机算法

平均感知机算法(Averaged Perceptron)感知机算法是非常好的二分类算法,该算法求取一个分离超平面,超平面由w参数化并用来预测,对于一个样本x,感知机算法通过计算y = [w,x]预测样...
内容举报
返回顶部
收藏助手
不良信息举报
您举报文章:深度学习:神经网络中的前向传播和反向传播算法推导
举报原因:
原因补充:

(最多只允许输入30个字)