代码如下:
def split_for_scentence(U, V):
    """Merge POS-tagged words of one sentence into noun phrases ("np").

    Repeatedly scans the "&"-joined tag string for patterns such as
    adjective + "的"(uj) + noun and fuses the matched span of words into a
    single token tagged "np".  (The misspelled name "scentence" is kept
    because external callers depend on it.)

    Args:
        U: list of words (tokens) of one sentence.
        V: list of POS tags, parallel to ``U``.  Mutated in place: the tag
           of every "的" is normalised to "uj" (jieba may emit "u"/"ud").

    Returns:
        (U, V): the merged word list and its tag list.
    """

    def _merge(words, tags, start, end):
        # Fuse words[start:end+1] into one "np" token.  Plain slicing
        # already handles start == 0 and end == last index, so no special
        # boundary branches are needed.
        new_words = words[:start] + ["".join(words[start:end + 1])] + words[end + 1:]
        new_tags = tags[:start] + ["np"] + tags[end + 1:]
        return new_words, new_tags

    # Normalise the tag of "的" so every rule only has to mention "uj".
    for i in range(len(V)):
        if U[i] == "的":
            V[i] = "uj"
    V2STR = "&".join(V) + "&"

    # Noun-phrase tag patterns; every tag is terminated by "&".
    # Fixes vs. the original list: "a&uj&n" and "s&uj&n," had typos
    # (missing "&" / stray ","), and a missing comma at the end of the
    # "nz" row silently concatenated "z&uj&nz&" with "a&uj&ns&",
    # destroying both rules.
    # NOTE(review): the patterns contain no regex metacharacters, so
    # re.search does plain substring matching and may match *inside* a
    # longer tag (e.g. "s&uj&n&" matches within "ns&uj&n&"); the original
    # relied on this, so it is preserved.  Shorter rules appearing before
    # longer ones (e.g. "n&n&" before "n&n&c&n&") also shadow them.
    re_rule = [
        "a&uj&n&", "b&uj&n&", "f&uj&n&", "i&uj&n&", "j&uj&n&", "l&uj&n&", "m&uj&n&", "n&uj&n&", "r&uj&n&", "s&uj&n&", "t&uj&n&", "v&uj&n&", "z&uj&n&",
        "a&uj&nr&", "b&uj&nr&", "f&uj&nr&", "i&uj&nr&", "j&uj&nr&", "l&uj&nr&", "m&uj&nr&", "n&uj&nr&", "r&uj&nr&", "s&uj&nr&", "t&uj&nr&", "v&uj&nr&", "z&uj&nr&",
        "a&uj&nt&", "b&uj&nt&", "f&uj&nt&", "i&uj&nt&", "j&uj&nt&", "l&uj&nt&", "m&uj&nt&", "n&uj&nt&", "r&uj&nt&", "s&uj&nt&", "t&uj&nt&", "v&uj&nt&", "z&uj&nt&",
        "a&uj&nz&", "b&uj&nz&", "f&uj&nz&", "i&uj&nz&", "j&uj&nz&", "l&uj&nz&", "m&uj&nz&", "n&uj&nz&", "r&uj&nz&", "s&uj&nz&", "t&uj&nz&", "v&uj&nz&", "z&uj&nz&",
        "a&uj&ns&", "b&uj&ns&", "f&uj&ns&", "i&uj&ns&", "j&uj&ns&", "l&uj&ns&", "m&uj&ns&", "n&uj&ns&", "r&uj&ns&", "s&uj&ns&", "t&uj&ns&", "v&uj&ns&", "z&uj&ns&",
        "a&uj&np&", "b&uj&np&", "f&uj&np&", "i&uj&np&", "j&uj&np&", "l&uj&np&", "m&uj&np&", "n&uj&np&", "r&uj&np&", "s&uj&np&", "t&uj&np&", "v&uj&np&", "z&uj&np&",
        "np&uj&nr&", "np&uj&nt&", "np&uj&nz&", "np&uj&ns&", "np&uj&np&",
        "r&np&", "t&np&",
        "n&n&", "n&np&", "np&n&", "np&np&", "nt&ns&",
        "n&n&c&n&", "np&np&c&np&", "n&c&n&", "np&c&np&",
        "n&r&", "n&n&",
        "r&m&", "r&n&", "r&r&", "r&np&",
        "n&uj&v&", "a&uj&v&", "r&uj&v&", "np&uj&", "np&r&",
        "a&uj&", "b&uj&", "f&uj&", "n&uj&", "r&uj&", "s&uj&", "t&uj&", "v&uj&", "z&uj&",
        "m&np&", "m&n&", "q&n&",
    ]

    # At most 30 merge passes; each pass applies the first rule (or special
    # case) that matches, then restarts the rule scan on the shrunk lists.
    for _ in range(30):
        for item in re_rule:
            # Special case: noun + "们" (plural marker) forms a noun phrase.
            # Guard pos > 0: the original indexed V[-1] when "们" was first.
            if "们" in U:
                pos = U.index("们")
                if pos > 0 and V[pos - 1] == "n":
                    U, V = _merge(U, V, pos - 1, pos)
                    V2STR = "&".join(V) + "&"
                    break
            # Special case: "所" + verb forms a noun phrase.
            # Guard pos + 1 < len(V): the original raised IndexError when
            # "所" was the last token.
            if "所" in U:
                pos = U.index("所")
                if pos + 1 < len(V) and V[pos + 1] == "v":
                    U, V = _merge(U, V, pos, pos + 1)
                    V2STR = "&".join(V) + "&"
                    break
            # Search once (the original called re.search three times).
            m = re.search(item, V2STR)
            if m is not None:
                # Map character offsets in V2STR back to list indices by
                # counting the "&" separators before the match boundaries.
                start_idx = V2STR[:m.start()].count("&")
                end_idx = V2STR[:m.end()].count("&") - 1
                U, V = _merge(U, V, start_idx, end_idx)
                V2STR = "&".join(V) + "&"
                break
    return U, V
测试代码1:基于jieba
import jieba.posseg as psg
import re
# Demo 1: tag a sentence with jieba, then merge its noun phrases.
txt = "这件事情让我觉得好糗啊"
# Parallel lists: one entry per token (word) and per POS tag.
words = []
tags = []
for word, tag in psg.cut(txt):
    words.append(word)
    tags.append(tag)
U, V = split_for_scentence(words, tags)
print(U)
print(V)
结果1如下:
Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Users\Dell\AppData\Local\Temp\jieba.cache
Loading model cost 0.698 seconds.
Prefix dict has been built successfully.
['这件事情', '让', '我', '觉得', '好', '糗', '啊']
['np', 'v', 'r', 'v', 'a', 'g', 'y']
测试代码2如下:
# Demo 2: hand-built word/tag lists ("的" arrives tagged "u";
# split_for_scentence normalises it to "uj" internally).
U2 = ['我', '爱', '美丽', '的', '中国']
V2 = ['r', 'v', 'ns', 'u', 'n']
# Removed the original's unused V2STR2 = "&".join(V2) + "&" — it was dead
# code; the function builds its own joined tag string.
U, V = split_for_scentence(U2, V2)
print(U)
print(V)
结果2如下:
['我', '爱', '美丽的中国']
['r', 'v', 'np']