基于分词(jieba)的名词性短语识别

代码如下:

# POS tags allowed in front of "的" (retagged "uj") and the noun-like tags
# allowed after it.  Their cross product gives the "<modifier> 的 <noun>" rules.
_UJ_MODIFIER_TAGS = ("a", "b", "f", "i", "j", "l", "m", "n", "r", "s", "t", "v", "z")
_UJ_HEAD_TAGS = ("n", "nr", "nt", "nz", "ns", "np")

# Ordered rule table.  Every rule is a run of POS tags, each tag terminated by
# "&".  Order matters: in each pass the first rule that occurs anywhere in the
# tag string is the one applied.
_NP_RULES = tuple(
    modifier + "&uj&" + head + "&"
    for head in _UJ_HEAD_TAGS
    for modifier in _UJ_MODIFIER_TAGS
) + (
    "np&uj&nr&", "np&uj&nt&", "np&uj&nz&", "np&uj&ns&", "np&uj&np&",
    "r&np&", "t&np&",
    "n&n&", "n&np&", "np&n&", "np&np&", "nt&ns&",
    "n&n&c&n&", "np&np&c&np&", "n&c&n&", "np&c&np&",
    "n&r&",
    "r&m&", "r&n&", "r&r&",
    "n&uj&v&", "a&uj&v&", "r&uj&v&", "np&uj&", "np&r&",
    "a&uj&", "b&uj&", "f&uj&", "n&uj&", "r&uj&", "s&uj&", "t&uj&", "v&uj&", "z&uj&",
    "m&np&", "m&n&", "q&n&",
)


def _find_special_span(words, tags):
    """Return (start, end) of the first lexical special case, or None."""
    # noun + "们": the plural suffix is glued onto the noun before it.
    # Starting at 1 keeps a sentence-initial "们" from looking at tags[-1].
    for i in range(1, len(words)):
        if words[i] == "们" and tags[i - 1] == "n":
            return i - 1, i
    # "所" + verb.  Stopping one short of the end keeps a sentence-final "所"
    # from indexing past the last tag.
    for i in range(len(words) - 1):
        if words[i] == "所" and tags[i + 1] == "v":
            return i, i + 1
    return None


def _find_rule_span(tags):
    """Return (start, end) token indices for the first matching rule, or None."""
    tag_str = "&".join(tags) + "&"
    for rule in _NP_RULES:
        pos = tag_str.find(rule)
        if pos == -1:
            continue
        # Each token contributes exactly one "&", so counting separators
        # converts character offsets into token indices.  Every rule ends with
        # "&", hence the match always ends on a token boundary.
        start = tag_str.count("&", 0, pos)
        end = tag_str.count("&", 0, pos + len(rule)) - 1
        return start, end
    return None


def split_for_scentence(U, V, max_passes=30):
    """Merge adjacent segmented words into noun phrases tagged ``"np"``.

    ``U`` is the list of words of one sentence and ``V`` the parallel list of
    POS tags (the demo code fills them from ``jieba.posseg.cut``).  Every
    "的" is first retagged as ``"uj"``.  Then, repeatedly, one span of
    neighbouring tokens is collapsed into a single token whose text is the
    concatenation of the span and whose tag is ``"np"``.  In each pass the
    span is chosen in this order:

    1. a noun (tag ``"n"``) followed by "们";
    2. "所" followed by a verb (tag ``"v"``);
    3. the first entry of the rule table that occurs in the ``"&"``-joined
       tag string.

    Rules are matched as plain substrings, so the first tag of a rule also
    matches a longer tag that merely ends with it (``"s&uj&n&"`` fires on
    ``ns, uj, n``).  This is kept on purpose for compatibility.

    Args:
        U: words of the sentence.
        V: POS tags, same length as ``U``.
        max_passes: upper bound on the number of merges (default 30).

    Returns:
        A ``(words, tags)`` tuple of new lists.  The input lists are not
        modified.

    Raises:
        ValueError: if ``U`` and ``V`` differ in length.
    """
    if len(U) != len(V):
        raise ValueError("U and V must have the same length")

    # Work on copies so the caller's lists stay untouched.
    words = list(U)
    tags = ["uj" if word == "的" else tag for word, tag in zip(words, V)]

    for _ in range(max_passes):
        span = _find_special_span(words, tags)
        if span is None:
            span = _find_rule_span(tags)
        if span is None:
            # Nothing matched; later passes would see the same state.
            break
        start, end = span
        words[start:end + 1] = ["".join(words[start:end + 1])]
        tags[start:end + 1] = ["np"]

    return words, tags

测试代码1:基于jieba

import jieba.posseg as psg
import re
txt = "这件事情让我觉得好糗啊"
generator_txt = psg.cut(txt)
# Collect the (word, POS flag) pairs once, then split them into two parallel
# lists: U1 holds the words, V1 the matching POS flags.
pairs = [(u, v) for u, v in generator_txt]
U1 = [word for word, _ in pairs]
V1 = [flag for _, flag in pairs]
U, V = split_for_scentence(U1, V1)
print(U)
print(V)

结果1如下:

Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Users\Dell\AppData\Local\Temp\jieba.cache
Loading model cost 0.698 seconds.
Prefix dict has been built successfully.
['这件事情', '让', '我', '觉得', '好', '糗', '啊']
['np', 'v', 'r', 'v', 'a', 'g', 'y']

测试代码2如下:

# Hand-tagged sample sentence: each word paired with its POS tag.
tagged_sample = [('我', 'r'), ('爱', 'v'), ('美丽', 'ns'), ("的", 'u'), ('中国', 'n')]
U2 = [word for word, _ in tagged_sample]
V2 = [tag for _, tag in tagged_sample]
# "&"-joined tag string of the sample; it is not passed to the call below.
V2STR2 = "&".join(V2) + "&"

U, V = split_for_scentence(U2, V2)
print(U)
print(V)

结果2如下:

['我', '爱', '美丽的中国']
['r', 'v', 'np']

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值