项目中需要从一堆字符串中提取出人名的拼音,字符串里既有英文字母也有非字母字符。之前的解决方案是写了一大堆正则表达式,对于正常的人名还可以,但是遇到类似 wangaie 这种连写的拼音就无法处理了。我也懒得再去研究那么多的正则表达式,于是重新想了一个方案:如果字符串按拼音分词后,每一段都是合法的拼音,就认为它是人名拼音并提交。这里需要一个拼音字典,在网上找的如下:
a
ai
an
ang
ao
ba
bai
ban
bang
bao
bei
ben
beng
bi
bian
biao
bie
bin
bing
bo
bu
ca
cai
can
cang
cao
ce
ceng
cha
chai
chan
chang
chao
che
chen
cheng
chi
chong
chou
chu
chuai
chuan
chuang
chui
chun
chuo
ci
cong
cou
cu
cuan
cui
cun
cuo
da
dai
dan
dang
dao
de
deng
di
dian
diao
die
ding
diu
dong
dou
du
duan
dui
dun
duo
e
en
er
fa
fan
fang
fei
fen
feng
fo
fou
fu
ga
gai
gan
gang
gao
ge
gei
gen
geng
gong
gou
gu
gua
guai
guan
guang
gui
gun
guo
ha
hai
han
hang
hao
he
hei
hen
heng
hong
hou
hu
hua
huai
huan
huang
hui
hun
huo
ji
jia
jian
jiang
jiao
jie
jin
jing
jiong
jiu
ju
juan
jue
jun
ka
kai
kan
kang
kao
ke
ken
keng
kong
kou
ku
kua
kuai
kuan
kuang
kui
kun
kuo
la
lai
lan
lang
lao
le
lei
leng
li
lia
lian
liang
liao
lie
lin
ling
liu
long
lou
lu
lv
luan
lue
lun
luo
ma
mai
man
mang
mao
me
mei
men
meng
mi
mian
miao
mie
min
ming
miu
mo
mou
mu
na
nai
nan
nang
nao
ne
nei
nen
neng
ni
nian
niang
niao
nie
nin
ning
niu
nong
nu
nv
nuan
nue
nuo
o
ou
pa
pai
pan
pang
pao
pei
pen
peng
pi
pian
piao
pie
pin
ping
po
pu
qi
qia
qian
qiang
qiao
qie
qin
qing
qiong
qiu
qu
quan
que
qun
ran
rang
rao
re
ren
reng
ri
rong
rou
ru
ruan
rui
run
ruo
sa
sai
san
sang
sao
se
sen
seng
sha
shai
shan
shang
shao
she
shen
sheng
shi
shou
shu
shua
shuai
shuan
shuang
shui
shun
shuo
si
song
sou
su
suan
sui
sun
suo
ta
tai
tan
tang
tao
te
teng
ti
tian
tiao
tie
ting
tong
tou
tu
tuan
tui
tun
tuo
wa
wai
wan
wang
wei
wen
weng
wo
wu
xi
xia
xian
xiang
xiao
xie
xin
xing
xiong
xiu
xu
xuan
xue
xun
ya
yan
yang
yao
ye
yi
yin
ying
yo
yong
you
yu
yuan
yue
yun
za
zai
zan
zang
zao
ze
zei
zen
zeng
zha
zhai
zhan
zhang
zhao
zhe
zhen
zheng
zhi
zhong
zhou
zhu
zhua
zhuai
zhuan
zhuang
zhui
zhun
zhuo
zi
zong
zou
zu
zuan
zui
zun
zuo
将上面的拼音列表存成一个文本文件(每行一个拼音),比如 pinyin_dataset.txt,然后在程序里读入即可。
分词可以采用前向最大匹配或者后向最大匹配,我采用了前向最大匹配。具体的算法思想网上有很多介绍:首先需要构建一个词典,并设置一个最大匹配字符数(拼音最长也就 6 个字母);每次从当前位置截取 6 个字符去字典里比较,找到即得到一个划分,否则减少截取的长度继续查找。代码中的 PYSplit(string py_str) 即分词函数:
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Text.RegularExpressions;
namespace OCRConfig
{
/// <summary>
/// Checks whether the alphabetic parts of a string can be segmented entirely
/// into pinyin syllables taken from a dictionary file (one syllable per line).
/// </summary>
public class CPYCheck
{
    private const string BLANK = " ";
    // Matches every character that is not an ASCII letter. The previous pattern
    // "[^A-Z|a-z]" also listed '|' inside the class, so pipes were never replaced.
    private const string NON_CHAR_PATTERN = "[^A-Za-z]";
    private const string DEFAULT_DICT_NAME = "pinyin_dataset.txt";

    // Built once instead of on every NameCheck call.
    private static readonly Regex NonLetterRegex = new Regex(NON_CHAR_PATTERN);
    private static readonly char[] TokenSeparators = { ' ' };

    // All known syllables, lower-cased; a hash set gives O(1) membership tests.
    private readonly HashSet<string> data_set;
    // Length of the longest syllable in the loaded dictionary; bounds the match window.
    private int max_word_len;

    /// <summary>
    /// Loads the pinyin dictionary.
    /// </summary>
    /// <param name="dict_path">
    /// Path of the pinyin dictionary file. When null, <c>pinyin_dataset.txt</c>
    /// (resolved against the current directory) is used.
    /// </param>
    /// <exception cref="FileNotFoundException">The dictionary file does not exist.</exception>
    public CPYCheck(string dict_path = null)
    {
        // Previously a null path always threw because File.Exists(null) is false
        // and the default file name was never consulted.
        if (dict_path == null)
        {
            dict_path = DEFAULT_DICT_NAME;
        }

        if (!File.Exists(dict_path))
        {
            throw new FileNotFoundException(string.Format("the file {0} not find!", dict_path), dict_path);
        }

        data_set = new HashSet<string>(StringComparer.Ordinal);
        max_word_len = 0;
        CreatePYDataSet(dict_path);
    }

    /// <summary>
    /// Reads the dictionary file and fills the syllable set. Blank lines are
    /// skipped and surrounding whitespace is ignored.
    /// </summary>
    /// <param name="dict_path">Path of an existing dictionary file.</param>
    private void CreatePYDataSet(string dict_path)
    {
        // File.ReadLines closes the underlying reader when enumeration ends,
        // so the file handle is no longer leaked.
        foreach (string line in File.ReadLines(dict_path, Encoding.Default))
        {
            // Input is lower-cased before matching, so normalise the entries the same way.
            string syllable = line.Trim().ToLowerInvariant();
            if (syllable.Length == 0)
            {
                continue;
            }

            data_set.Add(syllable);
            if (syllable.Length > max_word_len)
            {
                max_word_len = syllable.Length;
            }
        }
    }

    /// <summary>
    /// Tests whether <paramref name="name_str"/> consists only of pinyin syllables
    /// once every non-letter character is treated as a separator.
    /// </summary>
    /// <param name="name_str">Raw candidate text; may contain non-letter characters.</param>
    /// <returns>
    /// The input with every non-letter replaced by a space and the ends trimmed
    /// (original letter case kept) when each letter run splits completely into
    /// dictionary syllables; otherwise null. Null is also returned for a null
    /// input or when fewer than two characters remain after cleaning.
    /// </returns>
    public string NameCheck(string name_str)
    {
        if (name_str == null)
        {
            return null;
        }

        // Replace all non-letter characters with spaces, then drop leading/trailing spaces.
        string pname = NonLetterRegex.Replace(name_str, BLANK).Trim();

        // A name is assumed to consist of at least two letters.
        if (pname.Length < 2)
        {
            return null;
        }

        // Invariant lower-casing keeps 'I' -> 'i' regardless of the current culture.
        // Empty entries (from consecutive separators) are dropped so they do not
        // cause the whole name to be rejected.
        string[] name_list = pname.ToLowerInvariant()
            .Split(TokenSeparators, StringSplitOptions.RemoveEmptyEntries);

        foreach (string name in name_list)
        {
            if (PYSplit(name) == null)
            {
                return null;
            }
        }

        return pname;
    }

    /// <summary>
    /// Pinyin segmentation: forward maximum matching with fallback to shorter
    /// syllables when the longest match leaves an unsplittable remainder.
    /// </summary>
    /// <param name="py_str">Lower-case run of letters to segment.</param>
    /// <returns>
    /// The syllables in order when the whole string can be covered by dictionary
    /// entries; null when it cannot (or when the input is null or empty).
    /// </returns>
    private List<string> PYSplit(string py_str)
    {
        if (string.IsNullOrEmpty(py_str))
        {
            return null;
        }

        int total = py_str.Length;

        // step[i] is the length of the longest syllable starting at i after which
        // the rest of the string is still fully splittable; 0 means "none".
        int[] step = new int[total + 1];
        bool[] splittable = new bool[total + 1];
        splittable[total] = true; // the empty remainder is trivially splittable

        // Fill from the end so every suffix is evaluated exactly once; this keeps
        // the work linear and needs no shared state between calls.
        for (int start = total - 1; start >= 0; start--)
        {
            int longest = Math.Min(max_word_len, total - start);
            for (int len = longest; len > 0; len--)
            {
                if (splittable[start + len] && data_set.Contains(py_str.Substring(start, len)))
                {
                    splittable[start] = true;
                    step[start] = len;
                    break;
                }
            }
        }

        if (!splittable[0])
        {
            return null;
        }

        List<string> words = new List<string>();
        for (int pos = 0; pos < total; pos += step[pos])
        {
            words.Add(py_str.Substring(pos, step[pos]));
        }

        return words;
    }
}
}