环境配置
1. 安装 JDK 并配置环境变量
2. 安装 jpype:pip3 install jpype1
3. 下载 HanLP 相关资源,百度云:https://pan.baidu.com/s/1sw4fDjiLO0PhvYxJ2YMOOw ,提取码:4lm4
代码
# -*- coding: utf-8 -*-
from jpype import *
# Start the JVM with the HanLP jar and the D:\hanlp resource directory on the
# Java classpath. The option is a raw string because the Windows path contains
# backslashes ("\h") that are not valid escape sequences in a normal literal.
startJVM(getDefaultJVMPath(), r"-Djava.class.path=D:\hanlp\hanlp-1.7.8.jar;D:\hanlp", "-Xms1g", "-Xmx1g")
# Traditional Chinese -> Simplified Chinese conversion.
def TraditionalChinese2SimplifiedChinese(sentence_str):
    """Return ``sentence_str`` converted to Simplified Chinese by HanLP."""
    hanlp_cls = JClass('com.hankcs.hanlp.HanLP')
    simplified = hanlp_cls.convertToSimplifiedChinese(sentence_str)
    return simplified
# Word segmentation with named-entity recognition and part-of-speech tagging
# (rough recognition).
def NLP_tokenizer(sentence_str):
    """Return the result of HanLP's ``NLPTokenizer.segment`` for ``sentence_str``."""
    tokenizer_cls = JClass('com.hankcs.hanlp.tokenizer.NLPTokenizer')
    terms = tokenizer_cls.segment(sentence_str)
    return terms
# Place-name recognition; place names are tagged "ns".
def Place_Recognize(sentence_str):
    """Segment ``sentence_str`` with place-name recognition enabled.

    Returns the term list produced by a segmenter created with
    ``enablePlaceRecognize(True)``.
    """
    HanLP = JClass('com.hankcs.hanlp.HanLP')
    segment = HanLP.newSegment().enablePlaceRecognize(True)
    # Use the configured segmenter; HanLP.segment() would ignore the option
    # that was just enabled and run the default segmenter instead.
    return segment.seg(sentence_str)
# Person-name recognition; person names are tagged "nr".
def PersonName_Recognize(sentence_str):
    """Segment ``sentence_str`` with person-name recognition enabled.

    Returns the term list produced by a segmenter created with
    ``enableNameRecognize(True)``.
    """
    HanLP = JClass('com.hankcs.hanlp.HanLP')
    segment = HanLP.newSegment().enableNameRecognize(True)
    # Use the configured segmenter; HanLP.segment() would ignore the option
    # that was just enabled and run the default segmenter instead.
    return segment.seg(sentence_str)
# Organization-name recognition; organization names are tagged "nt".
def Organization_Recognize(sentence_str):
    """Segment ``sentence_str`` with organization-name recognition enabled.

    Returns the term list produced by a segmenter created with
    ``enableOrganizationRecognize(True)``.
    """
    HanLP = JClass('com.hankcs.hanlp.HanLP')
    segment = HanLP.newSegment().enableOrganizationRecognize(True)
    # Use the configured segmenter; HanLP.segment() would ignore the option
    # that was just enabled and run the default segmenter instead.
    return segment.seg(sentence_str)
# Convert a tagging result into a list.
def total_result(function_result_input):
    """Turn a segmentation result into a list of ``word/tag`` strings.

    The argument is stringified and is expected to look like
    ``[word/tag, word/tag, ...]``: the first and last characters (the
    brackets) are dropped and the remainder is split on ``", "``.

    Splitting on comma-plus-space (rather than a bare comma) keeps items free
    of leading whitespace and keeps a token that is itself a comma (``,/w``)
    intact. An empty result yields ``[]``.
    """
    text = str(function_result_input)
    inner = text[1:-1]
    if not inner:
        return []
    return inner.split(', ')
# Time entities.
def time_result(total_result):
    """Return the items of ``total_result`` whose text ends with ``/t``.

    ``total_result`` is a list of ``word/tag`` strings; order is preserved.
    """
    return [item for item in total_result if item.endswith('/t')]
# Type_Recognition selects one of three entity kinds: 'place', 'person' or
# 'organization'. Returns the list of items of that single entity kind.
def single_result(Type_Recognition, total_result):
    """Filter ``total_result`` down to one entity kind.

    Args:
        Type_Recognition: ``'place'`` (tag ``/ns``), ``'person'`` (tag
            ``/nr``) or ``'organization'`` (tag ``/nt``).
        total_result: list of ``word/tag`` strings.

    Returns:
        The items ending with the tag of the requested kind, in order. For an
        unknown ``Type_Recognition`` a message is printed and an empty list is
        returned (previously this path failed on an unassigned variable).
    """
    suffix_by_type = {
        'place': '/ns',
        'person': '/nr',
        'organization': '/nt',
    }
    suffix = suffix_by_type.get(Type_Recognition)
    if suffix is None:
        print('请输入正确的参数:(place,person或organization)')
        return []
    return [item for item in total_result if item.endswith(suffix)]
# Collect the single-entity results into one dictionary.
def dict_result(sentence_str):
    """Extract place, person, organization and time entities from a sentence.

    The sentence is first converted to Simplified Chinese, then segmented once
    per entity kind; each result is reduced to the items carrying that kind's
    tag.

    Returns:
        A dict with keys ``'place'``, ``'person'``, ``'organization'`` and
        ``'time'``, each mapping to a list of ``word/tag`` strings.
    """
    sentence = TraditionalChinese2SimplifiedChinese(sentence_str)
    places = single_result('place', total_result(Place_Recognize(sentence)))
    persons = single_result('person', total_result(PersonName_Recognize(sentence)))
    organizations = single_result(
        'organization', total_result(Organization_Recognize(sentence)))
    times = time_result(total_result(NLP_tokenizer(sentence)))
    # The JVM is intentionally NOT shut down here: a JVM cannot be restarted
    # within the same process, so shutting it down would make every later call
    # to this function fail.
    return {
        'place': places,
        'person': persons,
        'organization': organizations,
        'time': times,
    }
# Test: run the entity extraction on a sample sentence and print the
# resulting dictionary.
test_sentence="2018年武胜县新学乡政府大楼门前锣鼓喧天,6月份蓝翔给宁夏固原市彭阳县红河镇捐赠了挖掘机,中国科学院计算技术研究所的宗成庆教授负责教授自然语言处理课程"
print (dict_result(test_sentence))
配置及修改
1. 在 D 盘创建 hanlp 目录,将下载好的文件解压到该目录中。
2. 修改 hanlp.properties 文件,设置 root=D:/hanlp/(注意路径分隔符是 / 而不是 \),否则会报如下错误:
jpype._jclass.ExceptionInInitializerError: java.lang.ExceptionInInitializerError
上面是使用 jpype 来实现的,后来发现可以直接安装并调用 pyhanlp,简单的实现代码如下:
# -*- coding:utf-8 -*-
from pyhanlp import *
content = "现如今,机器学习和深度学习带动人工智能飞速的发展,并在图片处理、语音识别领域取得巨大成功。"
print(HanLP.segment(content))
content = "马伊琍与文章宣布离婚,华为是背后的赢家。"
print('原句:' + content)
print(HanLP.segment(content))
# Add entries to the custom dictionary.
# insert() overwrites a word that already exists in the dictionary, while
# add() skips a word that already exists.
# In add("文章", "nr 300"), nr is the part-of-speech tag and 300 the word
# frequency; add("交易平台", "nz 1024 n 1") gives one word several tags:
# nz with frequency 1024, or n with frequency 1.
CustomDictionary.add("文章", "nr 300")
CustomDictionary.insert("工程机械", "nz 1024")
CustomDictionary.add("交易平台", "nz 1024 n 1")
print(HanLP.segment(content))
segment = HanLP.newSegment().enableNameRecognize(True)
# Segment with the configured segmenter; HanLP.segment() would ignore the
# name-recognition option enabled on the line above.
print(segment.seg(content))