一、词性标注
jieba词性标注结合规则和统计的方法,词典匹配和HMM共同作用
二、命名实体识别
HMM将分词作为字标记来解决,其中有两条独立性假设 1、输出观察值之间相互独立 2、状态转移过程中,当前状态只与前一状态有关
CRF也是一种用来标记和切分序列化数据的统计模型。
两者不同的是:条件随机场是在给定观察的标记序列下,计算整个标记序列的联合概率,而HMM是在给定状态下,定义下一个状态的分布。HMM处理时,每个状态依赖于上一个状态,线性链CRF依赖于当前状态的周围节点状态。
2.1 日期识别
本质上是基于正则表达式的方式
# 进行日期识别
import re
from datetime import datetime,timedelta
from dateutil.parser import parse
import jieba.posseg as psg
# Chinese numeral characters (plus ASCII digits) mapped to their integer values.
UTIL_CN_NUM = {
    '零': 0, '一': 1, '二': 2, '两': 2, '三': 3,
    '四': 4, '五': 5, '六': 6, '七': 7, '八': 8, '九': 9,
    '0': 0, '1': 1, '2': 2, '3': 3, '4': 4,
    '5': 5, '6': 6, '7': 7, '8': 8, '9': 9,
}
# Chinese unit characters mapped to their multipliers.
UTIL_CN_UNIT = {'十': 10, '百': 100, '千': 1000, '万': 10000}
def cn2dig(src):
    """Convert a numeral string (Chinese characters or ASCII digits) to an int.

    Returns None for an empty string or any unrecognized character.
    Examples: '三' -> 3, '十五' -> 15, '二十' -> 20, '15' -> 15.
    """
    if src == '':
        return None
    # FIX: raw string for the regex ("\d" is an invalid escape in a plain
    # string literal and warns on modern Python).
    m = re.match(r"\d+", src)
    if m:
        # Pure ASCII digit prefix: convert directly.
        return int(m.group(0))
    rsl = 0
    unit = 1
    # Scan right-to-left: unit chars update the multiplier,
    # digit chars accumulate digit * current unit.
    for item in src[::-1]:
        if item in UTIL_CN_UNIT:
            unit = UTIL_CN_UNIT[item]
        elif item in UTIL_CN_NUM:
            rsl += UTIL_CN_NUM[item] * unit
        else:
            return None
    # A bare leading unit implies '一', e.g. '十五' -> 15, '十' -> 10.
    if rsl < unit:
        rsl += unit
    return rsl
def year2dig(year):
    """Normalize a year string (Chinese or ASCII digits) to a 4-digit int.

    A 2-digit year is expanded with the current century; returns None if
    the normalized string does not start with digits.
    """
    res = ''
    for item in year:
        if item in UTIL_CN_NUM:
            res += str(UTIL_CN_NUM[item])
        else:
            res += item
    m = re.match(r"\d+", res)
    if m:
        if len(m.group(0)) == 2:
            # BUG FIX: was datetime.datetime.today(), which raises
            # AttributeError under `from datetime import datetime`.
            return int(datetime.today().year / 100) * 100 + int(m.group(0))
        else:
            return int(m.group(0))
    else:
        return None
# Post-process a concatenated time string before parsing.
def check_time_valid(word):
    """Return a cleaned time string, or None if *word* carries no date info.

    A short run of bare digits (<= 6 chars) is rejected; a trailing
    '<号/日><digits>' fragment is normalized to '日' and rechecked.
    """
    m = re.match(r"\d+$", word)
    if m and len(word) <= 6:
        return None
    # FIX: character class was [号|日], which also matched a literal '|';
    # raw strings used throughout for regex correctness.
    cleaned = re.sub(r'[号日]\d+$', '日', word)
    if cleaned != word:
        return check_time_valid(cleaned)
    return cleaned
# Split the date string with a regex; group(i) refers to the i-th capture group.
def parse_datetime(msg):
    """Parse *msg* into a 'YYYY-MM-DD HH:MM:SS' string, or None on failure.

    Tries dateutil's fuzzy parser first; on failure falls back to a regex
    over Chinese/ASCII date components, filling missing fields from today.
    """
    if msg is None or len(msg) == 0:
        return None
    try:
        dt = parse(msg, fuzzy=True)
        return dt.strftime('%Y-%m-%d %H:%M:%S')
    except Exception:
        m = re.match(
            r"([0-9零一二两三四五六七八九十]+年)?([0-9一二两三四五六七八九十]+月)?([0-9一二两三四五六七八九十]+[号日])?([上中下午晚早]+)?([0-9零一二两三四五六七八九十百]+[点:\\.时])?([0-9零一二三四五六七八九十百]+分?)?([0-9零一二三四五六七八九十百]+秒)?", msg)
        # BUG FIX: m.group(0) is never None (an empty match yields ''), so the
        # original returned today's date for arbitrary garbage input.  Require
        # a non-empty overall match instead.
        if m and m.group(0):
            res = {
                "year": m.group(1),
                "month": m.group(2),
                "day": m.group(3),
                "hour": m.group(5) if m.group(5) is not None else '00',
                "minute": m.group(6) if m.group(6) is not None else '00',
                "second": m.group(7) if m.group(7) is not None else '00',
            }
            params = {}
            for name in res:
                if res[name] is not None and len(res[name]) != 0:
                    if name == 'year':
                        tmp = year2dig(res[name][:-1])  # drop trailing '年'
                    elif name == 'minute':
                        # BUG FIX: the '分' suffix is optional in the pattern;
                        # blindly stripping the last char dropped a digit
                        # ('15' -> '1').  Strip only the suffix.
                        tmp = cn2dig(res[name].rstrip('分'))
                    else:
                        tmp = cn2dig(res[name][:-1])  # drop the unit suffix
                    if tmp is not None:
                        params[name] = int(tmp)
            # NOTE(review): replace() raises ValueError for impossible dates
            # (e.g. day=31 in a 30-day month); caller sees the exception.
            target_date = datetime.today().replace(**params)
            is_pm = m.group(4)
            # Shift afternoon/evening hours onto the 24h clock.
            if is_pm is not None:
                if is_pm == u'下午' or is_pm == u'晚上' or is_pm == '中午':
                    hour = target_date.time().hour
                    if hour < 12:
                        target_date = target_date.replace(hour=hour + 12)
            return target_date.strftime('%Y-%m-%d %H:%M:%S')
        else:
            return None
# POS-tag the text and collect the tokens that express a time.
def time_extract(text):
    """Extract time expressions from *text* and normalize them to datetimes."""
    relative_days = {'今天': 0, '明天': 1, '后天': 2}
    candidates = []
    current = ''
    for token, flag in psg.cut(text):
        if token in relative_days:
            # A relative-day word starts a fresh expression; flush the old one.
            if current != '':
                candidates.append(current)
            offset = timedelta(days=relative_days[token])
            current = (datetime.today() + offset).strftime('%Y年%m月%d日')
        elif current != '':
            # Extend the running expression with number/time tokens only.
            if flag in ['m', 't']:
                current += token
            else:
                candidates.append(current)
                current = ''
        elif flag in ['m', 't']:
            current = token
    if current != '':
        candidates.append(current)
    checked = [check_time_valid(w) for w in candidates]
    parsed = [parse_datetime(w) for w in checked if w is not None]
    return [p for p in parsed if p is not None]
# Demo: pull normalized time expressions out of a sample sentence.
sample_text = '我要住到明天下午三点'
print(sample_text, time_extract(sample_text), sep=':')
2.2 地名识别
基于条件随机场进行地名识别
1、确定标签体系
B M E O S
2、语料数据处理
每行一个token,每个token后面跟它的标签,如下所示:
我 O
去 O
北 B
京 E
数据处理的代码:
#coding=utf8
# Convert the tokens of one corpus line into character/tag pairs.
def tag_line(words, mark):
    """Turn '词/pos' tokens into parallel char and BMES/O tag lists.

    Bracketed compounds like '[北京/ns 电台/n]nt' are merged and tagged
    with the tag following ']'.  Only 'ns' (place name) words receive
    B/M/E/S labels; everything else is tagged 'O'.
    """
    chars = []
    tags = []
    temp_word = ''  # buffer accumulating a bracketed compound word

    def emit(token, pos_tag):
        # Append one resolved word's characters and its tag sequence.
        if not token:
            return
        chars.extend(token)
        if pos_tag == 'ns':
            tags.extend(['S'] if len(token) == 1
                        else ['B'] + ['M'] * (len(token) - 2) + ['E'])
        else:
            tags.extend(['O'] * len(token))

    for word in words:
        word = word.strip('\t ')
        if temp_word == '':
            bracket_pos = word.find('[')
            w, h = word.split('/')
            if bracket_pos == -1:
                emit(w, h)
            else:
                # Opening bracket: start buffering the compound.
                temp_word += w[bracket_pos + 1:]
        else:
            bracket_pos = word.find(']')
            w, _ = word.split('/')
            if bracket_pos == -1:
                temp_word += w
            else:
                # Closing bracket: the compound's tag follows ']'.
                emit(temp_word + w, word[bracket_pos + 1:])
                temp_word = ''
    assert temp_word == ''
    return (chars, tags)
# Load the corpus, convert each line, and save train/test splits.
def corpusHandler(corpusPath):
    """Convert the tagged corpus at *corpusPath* into char/tag files.

    Writes train.txt and test.txt next to the corpus file; every 5th
    non-empty line goes to the test split.
    """
    import os
    root = os.path.dirname(corpusPath)
    # BUG FIX: output files now opened with an explicit UTF-8 encoding so the
    # Chinese characters are written correctly regardless of platform default.
    with open(corpusPath, encoding='utf-8') as corpus_f, \
            open(os.path.join(root, 'train.txt'), 'w', encoding='utf-8') as train_f, \
            open(os.path.join(root, 'test.txt'), 'w', encoding='utf-8') as test_f:
        pos = 0
        for line in corpus_f:
            line = line.strip('\r\n\t')
            if line == '':
                continue
            isTest = pos % 5 == 0  # every 5th sentence -> test split
            words = line.split()[1:]  # drop the leading date/ID token
            if len(words) == 0:
                continue
            line_chars, line_tags = tag_line(words, pos)
            saveObj = test_f if isTest else train_f
            for k, v in enumerate(line_chars):
                saveObj.write(v + '\t' + line_tags[k] + '\n')
            saveObj.write('\n')
            pos += 1
3、特征模板设计
CRF的特征函数对应CRF++的特征模板。格式为%x[row,col],用于确定输入数据的一个token,row确定当前token的相对行数,col用于确定列数。
#Unigram
U00:%x[-1,0]
U01:%x[0,0]
U02:%x[1,0]
U03:%x[2,0]
U04:%x[-2,0]
U05:%x[1,0]/%x[2,0]
U06:%x[0,0]/%x[-1,0]/%x[-2,0]
U07:%x[0,0]/%x[1,0]/%x[2,0]
U08:%x[-1,0]/%x[0,0]
U09:%x[0,0]/%x[1,0]
U10:%x[-1,0]/%x[1,0]
#Bigram
B
4、模型的训练和测试
crf_learn crf_test
计算测试集的效果
def f1(path):
    """Print precision/recall/F1 of location tags from a CRF++ output file.

    Each non-empty line of *path* is 'char <gold-tag> <pred-tag>'; a tag in
    {B, M, E, S} marks part of a location.  Prints loc_P/loc_R/loc_F1.
    """
    loc_states = {'B', 'M', 'E', 'S'}
    all_tag = 0          # total tags seen
    loc_tag = 0          # gold location tags
    pred_loc_tag = 0     # predicted location tags
    correct_tag = 0      # tags predicted correctly (any class)
    correct_loc_tag = 0  # location tags predicted correctly
    with open(path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line == '':
                continue
            _, r, p = line.split()
            all_tag += 1
            if r == p:
                correct_tag += 1
                if r in loc_states:
                    correct_loc_tag += 1
            if r in loc_states:
                loc_tag += 1
            if p in loc_states:
                pred_loc_tag += 1
    # BUG FIX: guard the divisions so an empty or location-free file
    # reports zero metrics instead of raising ZeroDivisionError.
    loc_P = correct_loc_tag / pred_loc_tag if pred_loc_tag else 0.0
    loc_R = correct_loc_tag / loc_tag if loc_tag else 0.0
    loc_F1 = (2 * loc_P * loc_R) / (loc_P + loc_R) if loc_P + loc_R else 0.0
    print('loc_P:{0},loc_R:{1},loc_F1:{2}'.format(loc_P, loc_R, loc_F1))
5、模型使用
def load_model(path):
    """Return a CRF++ Tagger for the model at *path*, or None if it is missing."""
    import os, CRFPP
    if not os.path.exists(path):
        return None
    return CRFPP.Tagger('-m {0} -v 3 -n2'.format(path))
def locationNER(text):
    """Tag each character of *text* with the CRF model and return the
    location words assembled from their B/M/E/S labels."""
    tagger = load_model('./model')
    for ch in text:
        tagger.add(ch)
    places = []
    # Run decoding; the tagger's internal state becomes 'parsed'.
    tagger.parse()
    current = ''
    for i in range(tagger.size()):
        for j in range(tagger.xsize()):
            char = tagger.x(i, j)
            label = tagger.y2(i)
            if label == 'B':
                current = char        # start of a multi-char location
            elif label == 'M':
                current += char       # middle of a location
            elif label == 'E':
                current += char       # end: flush the assembled word
                places.append(current)
            elif label == 'S':
                current = char        # single-char location
                places.append(current)
    return places