分词策略
查找算法
对于一个词条多达几十万的庞大词典库,查询时间是衡量分词效率的重要指标,因此必须采用高效的查找算法。
通常来说,对于查找算法,在有序列表中查找比在无序列表中查找更快,分区查找比全局遍历要快。
通过查看ArrayList、LinkedList、HashSet的contains方法的源代码,发现ArrayList和LinkedList采用全局遍历的方式且未利用有序列表的优势,HashSet使用了分区查找,如果hash分布均匀冲突少,则需要遍历的列表就很少甚至不需要。
经过实际测试对比,对一段文字分词使用ArrayList耗时2113ms,而使用HashSet平均耗时23ms左右,几乎相差100倍。所以我使用了HashSet进行查词。
后来查资料发现有TRIE索引树算法可以使查询速度更快。虽然HashSet已经有不错的性能了,但是如果词典越来越大,内存占用越来越多怎么办?如果有一个数据结构,有接近HashSet性能的同时,又能对词典的数据进行压缩以减少内存占用,那就完美了。
首先自己实现一个Trie数据结构来替换HashSet。经过多次测试并改进TrieNode结构后,实现了基于有序数组的二分查找算法,耗时稳定在15ms左右,可以发现用时比HashSet更短,而且内存占用也减少了。
上代码吧
/**
 * Dictionary-based Chinese word segmenter using reverse (right-to-left) maximum matching.
 *
 * <p>The dictionaries are loaded once, in the static initializer, from fixed file paths. The
 * text is first split into sentences by {@link #segPoint(String)} and then cut into words by
 * {@link #seg(String)}.
 */
public class ChineseSegment {
    // Word dictionary. Only add(String) and contains(String) are used on it, so it can be
    // swapped for an ArrayList<String> or for the TrieIndex class of this file when comparing
    // lookup strategies.
    private static final HashSet<String> DIC = new HashSet<>();
    // Sentence delimiters; every character of every entry is treated as a delimiter.
    private static final HashSet<String> POINT = new HashSet<>();
    // Length of the longest dictionary entry; reported at start-up only.
    private static int MAX_LENGTH = 0;
    // Longest candidate word tried by seg(). Deliberately a small fixed window rather than
    // MAX_LENGTH.
    private static final int MAX_WORD_LENGTH = 4;

    // State of the bracket group that is currently open to the right of the scan position.
    private static final int RUN_NONE = 0;
    private static final int RUN_ENGLISH = 1;
    private static final int RUN_NUMBER = 2;

    static {
        try {
            System.out.println("开始初始化词典");
            long start = System.currentTimeMillis();
            int max = 1;
            int count = 0;
            // Read all three files before touching the dictionaries, so that a failed read
            // leaves both sets empty rather than half filled.
            List<String> chineseStopwords = Files.readAllLines(
                    Paths.get("D:/文档/课程/搜索引擎/实验/词典/chinese_stopword.txt"),
                    Charset.forName("utf-8"));
            List<String> dictionaryWords = Files.readAllLines(
                    Paths.get("D:/文档/课程/搜索引擎/实验/词典/dic.txt"),
                    Charset.forName("utf-8"));
            List<String> delimiters = Files.readAllLines(
                    Paths.get("D:/文档/课程/搜索引擎/实验/词典/Stopword.txt"),
                    Charset.forName("utf-8"));

            // De-duplicate first so that the reported count is the number of distinct entries.
            HashSet<String> uniqueWords = new HashSet<String>(chineseStopwords);
            uniqueWords.addAll(dictionaryWords);
            for (String line : uniqueWords) {
                DIC.add(line);
                count++;
                if (line.length() > max) {
                    max = line.length();
                }
            }
            HashSet<String> uniqueDelimiters = new HashSet<String>(delimiters);
            for (String line : uniqueDelimiters) {
                POINT.add(line);
                count++;
                if (line.length() > max) {
                    max = line.length();
                }
            }
            MAX_LENGTH = max;
            System.out.println("完成初始化词典,词数目:" + count);
            System.out.println("最大分词长度:" + MAX_LENGTH);
            long end = System.currentTimeMillis();
            System.out.println("初始化词典结束");
            System.out.println("初始化词典耗时:" + (end - start) + "ms");
        } catch (IOException ex) {
            // Best effort: the class still loads, with empty dictionaries.
            System.err.println("词典装载失败:" + ex.getMessage());
        }
    }

    /**
     * Reads the sample text, segments it and prints the result together with the elapsed time.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        long start = System.currentTimeMillis();
        System.out.println("开始分词......");
        String text = "";
        try {
            List<String> lines = Files.readAllLines(
                    Paths.get("D:/文档/课程/搜索引擎/实验/词典/text.txt"),
                    Charset.forName("gbk"));
            // Lines are joined without a separator; sentence breaks are re-created by segPoint.
            StringBuilder joined = new StringBuilder();
            for (String line : lines) {
                joined.append(line);
            }
            text = joined.toString();
        } catch (IOException e) {
            // Continue with empty text so that the timing output is still printed.
            e.printStackTrace();
        }
        System.out.println("分词内容:" + text);
        // Split into sentences at the delimiters.
        text = segPoint(text);
        // Segment into words.
        List<String> seg = seg(text);
        StringBuilder segmented = new StringBuilder();
        for (String piece : seg) {
            segmented.append(piece);
        }
        long end = System.currentTimeMillis();
        System.out.println("分词结束");
        System.out.println("分词耗时:" + (end - start) + "ms");
        System.out.println("将段落分句后再分词");
        System.out.println(segmented);
    }

    /**
     * Splits the text into sentences by inserting a line break after every delimiter character.
     *
     * <p>Every character of every entry in {@code POINT} counts as a delimiter. Delimiters are
     * kept in the output, including consecutive and trailing ones.
     *
     * @param text the text to split; may be {@code null}
     * @return the text with {@code "\n"} inserted after each delimiter, or {@code text}
     *         unchanged when it is {@code null} or empty or when no delimiter is configured
     */
    public static String segPoint(String text) {
        if (text == null || text.isEmpty()) {
            return text;
        }
        StringBuilder delimiterChars = new StringBuilder();
        for (String point : POINT) {
            delimiterChars.append(point);
        }
        if (delimiterChars.length() == 0) {
            // "[]" is not a valid regular expression; without delimiters nothing is split.
            return text;
        }
        // Inside a character class '-' builds a range and "&&" an intersection, so both are
        // escaped in addition to the general regex metacharacters.
        String regEx = "["
                + SegUtils.escapeExprSpecialWord(delimiterChars.toString())
                        .replace("-", "\\-")
                        .replace("&", "\\&")
                + "]";
        Matcher m = Pattern.compile(regEx).matcher(text);
        // Walk the matches instead of using Pattern.split, which silently drops trailing
        // empty pieces and with them the delimiters at the end of the text.
        StringBuilder out = new StringBuilder(text.length() + 16);
        int copied = 0;
        while (m.find()) {
            out.append(text, copied, m.end()).append("\n");
            copied = m.end();
        }
        out.append(text, copied, text.length());
        return out.toString();
    }

    /**
     * Segments the text by reverse maximum matching against the dictionary.
     *
     * <p>The text is scanned from its end. At each step the longest suffix of the remaining
     * text (at most {@code MAX_WORD_LENGTH} characters) that is a dictionary word, a digit
     * sequence or a letter sequence is cut off; if none matches, a single character is cut.
     * Each word is wrapped in {@code [ ]}; adjacent letter pieces, and adjacent digit pieces,
     * are merged into one bracket group. Line breaks are emitted without brackets.
     *
     * @param text the text to segment; may be {@code null}
     * @return the pieces in reading order; concatenating them yields the annotated text.
     *         Empty for {@code null} or empty input
     */
    public static List<String> seg(String text) {
        List<String> result = new ArrayList<>();
        if (text == null) {
            return result;
        }
        int run = RUN_NONE;
        // Everything at or after 'end' has already been segmented.
        int end = text.length();
        while (end > 0) {
            // Take the longest candidate and look it up in the dictionary.
            String tryWord = text.substring(Math.max(0, end - MAX_WORD_LENGTH), end);
            // No match: drop the first character and retry; a single character is always cut.
            while (tryWord.length() > 1 && !DIC.contains(tryWord)
                    && !SegUtils.isNumber(tryWord) && !SegUtils.isEnglish(tryWord)) {
                tryWord = tryWord.substring(1);
            }

            boolean lineBreak = "\n".equals(tryWord);
            int kind = RUN_NONE;
            if (!lineBreak) {
                if (SegUtils.isEnglish(tryWord)) {
                    kind = RUN_ENGLISH;
                } else if (SegUtils.isNumber(tryWord)) {
                    kind = RUN_NUMBER;
                }
            }

            // Pieces are produced right to left, so a piece that ends a run on its left side
            // carries the '[' that opens the run to its right.
            String piece;
            if (kind != RUN_NONE) {
                if (run == kind) {
                    piece = tryWord;            // continues the group open to the right
                } else if (run == RUN_NONE) {
                    piece = tryWord + "]";      // rightmost piece of a new group
                } else {
                    piece = tryWord + "][";     // new group directly left of another group
                }
            } else if (lineBreak) {
                piece = (run == RUN_NONE) ? tryWord : tryWord + "[";
            } else {
                piece = (run == RUN_NONE) ? "[" + tryWord + "]" : "[" + tryWord + "][";
            }
            run = kind;
            result.add(piece);
            // Remove the piece just cut from the text that is still to be segmented.
            end -= tryWord.length();
        }
        if (run != RUN_NONE) {
            // The text starts with a letter or digit run: open its bracket group.
            int first = result.size() - 1;
            result.set(first, "[" + result.get(first));
        }
        // Pieces were appended right to left; put them into reading order.
        for (int i = 0, j = result.size() - 1; i < j; i++, j--) {
            String tmp = result.get(i);
            result.set(i, result.get(j));
            result.set(j, tmp);
        }
        return result;
    }
}
/**
 * Trie (prefix tree) index used to look up whether a given string is in the dictionary.
 *
 * <p>Every node keeps its children in an array sorted by character, so that a child is found
 * by binary search.
 *
 * @author Administrator
 */
public class TrieIndex {
    // Root of the tree; its own character is a placeholder and is never matched.
    private final TrieNode root = new TrieNode('/');

    /**
     * Tells whether the item was added to this index.
     *
     * @param item the word to look up; leading and trailing whitespace is ignored
     * @return {@code true} if the trimmed item was added before; {@code false} otherwise,
     *         including for {@code null} and blank input
     */
    public boolean contains(String item) {
        if (item == null) {
            return false;
        }
        // Ignore leading and trailing whitespace.
        String word = item.trim();
        if (word.isEmpty()) {
            return false;
        }
        // Walk down from the root, one character per level.
        TrieNode node = root;
        for (int i = 0; i < word.length(); i++) {
            node = node.getChild(word.charAt(i));
            if (node == null) {
                // No node for this prefix.
                return false;
            }
        }
        // A prefix of a longer word is only a hit if it was added as a word itself.
        return node.isTerminal();
    }

    /**
     * Adds every item of the list, see {@link #add(String)}.
     *
     * @param items the words to add; must not be {@code null}
     */
    public void addAll(List<String> items) {
        for (String item : items) {
            add(item);
        }
    }

    /**
     * Adds one word to the index.
     *
     * @param item the word to add; leading and trailing whitespace is removed, and
     *             {@code null} or blank input is ignored
     */
    public void add(String item) {
        if (item == null) {
            return;
        }
        // Ignore leading and trailing whitespace.
        String word = item.trim();
        if (word.isEmpty()) {
            // Nothing to add.
            return;
        }
        // Walk down from the root, creating the missing nodes.
        TrieNode node = root;
        for (int i = 0; i < word.length(); i++) {
            node = node.getChildIfNotExistThenCreate(word.charAt(i));
        }
        // Mark the last node: the path from the root to it spells a complete word.
        node.setTerminal(true);
    }

    /** One character of the tree together with its children, sorted by character. */
    private static class TrieNode {
        // Final because the parent's child array is sorted by this value.
        private final char character;
        private boolean terminal;
        private TrieNode[] children = new TrieNode[0];

        public TrieNode(char character) {
            this.character = character;
        }

        public boolean isTerminal() {
            return terminal;
        }

        public void setTerminal(boolean terminal) {
            this.terminal = terminal;
        }

        public char getCharacter() {
            return character;
        }

        /**
         * @return the children in ascending character order, as a list backed by a copy of
         *         the internal array
         */
        public Collection<TrieNode> getChildren() {
            return Arrays.asList(children.clone());
        }

        /**
         * Finds the child for a character by binary search in the sorted child array.
         *
         * @param character the character to look for
         * @return the child node, or {@code null} if there is none
         */
        public TrieNode getChild(char character) {
            int index = indexOf(character);
            return index >= 0 ? children[index] : null;
        }

        /**
         * Returns the child for a character, creating and inserting it when it is missing.
         *
         * @param character the character to look for
         * @return the existing or the newly created child node
         */
        public TrieNode getChildIfNotExistThenCreate(char character) {
            int index = indexOf(character);
            if (index >= 0) {
                return children[index];
            }
            TrieNode child = new TrieNode(character);
            insertAt(-(index + 1), child);
            return child;
        }

        /**
         * Inserts a child at the position that keeps the child array sorted.
         *
         * @param child the node to insert
         */
        public void addChild(TrieNode child) {
            int index = indexOf(child.character);
            insertAt(index >= 0 ? index : -(index + 1), child);
        }

        /**
         * Binary search over the sorted child array, comparing characters directly.
         *
         * @param character the character to look for
         * @return the index of the matching child, or {@code -(insertionPoint + 1)} if there
         *         is none
         */
        private int indexOf(char character) {
            int low = 0;
            int high = children.length - 1;
            while (low <= high) {
                int mid = (low + high) >>> 1;
                char midCharacter = children[mid].character;
                if (midCharacter < character) {
                    low = mid + 1;
                } else if (midCharacter > character) {
                    high = mid - 1;
                } else {
                    return mid;
                }
            }
            return -(low + 1);
        }

        /**
         * Replaces the child array by a copy that is one longer and holds the new child.
         *
         * @param position index at which the child is placed
         * @param child the node to insert
         */
        private void insertAt(int position, TrieNode child) {
            TrieNode[] grown = new TrieNode[children.length + 1];
            System.arraycopy(children, 0, grown, 0, position);
            grown[position] = child;
            System.arraycopy(children, position, grown, position + 1,
                    children.length - position);
            children = grown;
        }
    }

    /** Prints the whole tree to standard output, one node per line, indented by depth. */
    public void show() {
        show(root, "");
    }

    /**
     * Prints a node and, recursively, its children.
     *
     * @param node the node to print
     * @param indent the prefix printed before the node's character
     */
    private void show(TrieNode node, String indent) {
        if (node.isTerminal()) {
            // "(T)" marks the end of a complete word.
            System.out.println(indent + node.getCharacter() + "(T)");
        } else {
            System.out.println(indent + node.getCharacter());
        }
        for (TrieNode item : node.getChildren()) {
            show(item, indent + "\t");
        }
    }
}
/** String helpers for the segmenter: regex escaping and simple character-class checks. */
public class SegUtils {
    // The backslash must stay first: escaping it later would double the backslashes that
    // were inserted for the other characters.
    private static final String[] REGEX_SPECIAL_CHARS = { "\\", "$", "(", ")", "*", "+", ".",
            "[", "]", "?", "^", "{", "}", "|" };
    // Compiled once; these checks run inside the segmentation loop.
    private static final Pattern ENGLISH = Pattern.compile("[a-zA-Z]+");
    private static final Pattern NUMBER = Pattern.compile("\\d+");
    private static final Pattern CHINESE = Pattern.compile("[\\u4e00-\\u9fa5]+");

    /**
     * Escapes the regex special characters {@code \ $ ( ) * + . [ ] ? ^ { } |} with a
     * backslash.
     *
     * @param keyword the text to escape; may be {@code null}
     * @return the escaped text; {@code null} and the empty string are returned unchanged
     */
    public static String escapeExprSpecialWord(String keyword) {
        if (keyword == null || keyword.isEmpty()) {
            return keyword;
        }
        for (String key : REGEX_SPECIAL_CHARS) {
            if (keyword.contains(key)) {
                keyword = keyword.replace(key, "\\" + key);
            }
        }
        return keyword;
    }

    // GENERAL_PUNCTUATION covers the Chinese quotation marks,
    // CJK_SYMBOLS_AND_PUNCTUATION covers the Chinese full stop,
    // HALFWIDTH_AND_FULLWIDTH_FORMS covers the Chinese (fullwidth) comma.
    /**
     * Tells whether a character is a Chinese character or Chinese punctuation, judged by its
     * Unicode block.
     *
     * @param c the character to test
     * @return {@code true} if the character lies in one of the accepted Unicode blocks
     */
    public static boolean isChinese(char c) {
        Character.UnicodeBlock ub = Character.UnicodeBlock.of(c);
        return ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
                || ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
                || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
                || ub == Character.UnicodeBlock.GENERAL_PUNCTUATION
                || ub == Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION
                || ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS;
    }

    /**
     * Tells whether a string consists only of the letters a-z and A-Z.
     *
     * @param charaString the string to test; must not be {@code null}
     * @return {@code true} if the string is non-empty and contains nothing but ASCII letters
     */
    public static boolean isEnglish(String charaString) {
        // '+' rather than '*': an empty string is not a word, in line with isNumber.
        return ENGLISH.matcher(charaString).matches();
    }

    /**
     * Tells whether a string contains at least one character in the range U+4E00 to U+9FA5.
     *
     * @param str the string to test; must not be {@code null}
     * @return {@code true} if such a character occurs anywhere in the string
     */
    public static boolean isChinese(String str) {
        return CHINESE.matcher(str).find();
    }

    /**
     * Tells whether a string consists only of digits.
     *
     * @param str the string to test; must not be {@code null}
     * @return {@code true} if the string is non-empty and contains nothing but digits
     */
    public static boolean isNumber(String str) {
        return NUMBER.matcher(str).matches();
    }
}
性能分析
程序运行结果如上图,可以看出初始化词典占据了大部分时间,而且使用前缀树Trie后,构造词典的耗时更长。在前面的实验中,通过优化在词典中查找词语的性能,从普通ArrayList的查找一直到Trie索引树的二分查找,性能不断提升,由此可知分词程序的性能提升空间主要在于词典的查找算法。