SpringBoot下使用HanLP包
HanLP 是一个自然语言处理工具包,这里不多做赘述。由于有时需要把 HanLP 提供的资源文件(词库)放在 SpringBoot 的 resources 目录下,所以特地在此记录自己尝试成功的经历。
关于引入依赖和词库文件的解压不多做说明,首先看一下提供的配置文件
文件名hanlp.properties
#2\u5143\u8bed\u6cd5\u8bcd\u5178\u8def\u5f84
BiGramDictionaryPath=data/dictionary/CoreNatureDictionary.ngram.txt
#\u505c\u7528\u8bcd\u8bcd\u5178\u8def\u5f84
CoreStopWordDictionaryPath=data/dictionary/stopwords.txt
#\u540c\u4e49\u8bcd\u8bcd\u5178\u8def\u5f84
CoreSynonymDictionaryDictionaryPath=data/dictionary/synonym/CoreSynonym.txt
#\u4eba\u540d\u8bcd\u5178\u8def\u5f84
PersonDictionaryPath=data/dictionary/person/nr.txt
#\u4eba\u540d\u8bcd\u5178\u8f6c\u79fb\u77e9\u9635\u8def\u5f84
PersonDictionaryTrPath=data/dictionary/person/nr.tr.txt
#\u7e41\u7b80\u8bcd\u5178\u6839\u76ee\u5f55
tcDictionaryRoot=data/dictionary/tc
#\u81ea\u5b9a\u4e49\u8bcd\u5178\u8def\u5f84\uff0c\u7528;\u9694\u5f00\u591a\u4e2a\u81ea\u5b9a\u4e49\u8bcd\u5178\uff0c\u7a7a\u683c\u5f00\u5934\u8868\u793a\u5728\u540c\u4e00\u4e2a\u76ee\u5f55\uff0c\u4f7f\u7528\u201c\u6587\u4ef6\u540d \u8bcd\u6027\u201d\u5f62\u5f0f\u5219\u8868\u793a\u8fd9\u4e2a\u8bcd\u5178\u7684\u8bcd\u6027\u9ed8\u8ba4\u662f\u8be5\u8bcd\u6027\u3002\u4f18\u5148\u7ea7\u9012\u51cf\u3002
#\u53e6\u5916data/dictionary/custom/CustomDictionary.txt\u662f\u4e2a\u9ad8\u8d28\u91cf\u7684\u8bcd\u5e93\uff0c\u8bf7\u4e0d\u8981\u5220\u9664\u3002\u6240\u6709\u8bcd\u5178\u7edf\u4e00\u4f7f\u7528UTF-8\u7f16\u7801\u3002
CustomDictionaryPath=data/dictionary/custom/CustomDictionary.txt; \u73b0\u4ee3\u6c49\u8bed\u8865\u5145\u8bcd\u5e93.txt; \u5168\u56fd\u5730\u540d\u5927\u5168.txt ns; \u4eba\u540d\u8bcd\u5178.txt; \u673a\u6784\u540d\u8bcd\u5178.txt; user_dic.txt; \u4e0a\u6d77\u5730\u540d.txt ns;data/dictionary/person/nrf.txt nrf;
#CRF\u5206\u8bcd\u6a21\u578b\u8def\u5f84
CRFSegmentModelPath=data/model/segment/CRFSegmentModel.txt
#HMM\u5206\u8bcd\u6a21\u578b
HMMSegmentModelPath=data/model/segment/HMMSegmentModel.bin
#\u5206\u8bcd\u7ed3\u679c\u662f\u5426\u5c55\u793a\u8bcd\u6027
ShowTermNature=true
#IO\u9002\u914d\u5668\uff0c\u5b9e\u73b0com.hankcs.hanlp.corpus.io.IIOAdapter\u63a5\u53e3\u4ee5\u5728\u4e0d\u540c\u7684\u5e73\u53f0\uff08Hadoop\u3001Redis\u7b49\uff09\u4e0a\u8fd0\u884cHanLP
#\u9ed8\u8ba4\u7684IO\u9002\u914d\u5668\u5982\u4e0b\uff0c\u8be5\u9002\u914d\u5668\u662f\u57fa\u4e8e\u666e\u901a\u6587\u4ef6\u7cfb\u7edf\u7684\u3002
#IOAdapter=com.hankcs.hanlp.corpus.io.FileIOAdapter
#\u611f\u77e5\u673a\u8bcd\u6cd5\u5206\u6790\u5668
PerceptronCWSModelPath=data/model/perceptron/pku199801/cws.bin
PerceptronPOSModelPath=data/model/perceptron/pku199801/pos.bin
PerceptronNERModelPath=data/model/perceptron/pku199801/ner.bin
#CRF\u8bcd\u6cd5\u5206\u6790\u5668
CRFCWSModelPath=data/model/crf/pku199801/cws.bin
CRFPOSModelPath=data/model/crf/pku199801/pos.bin
CRFNERModelPath=data/model/crf/pku199801/ner.bin
#\u66f4\u591a\u914d\u7f6e\u9879\u8bf7\u53c2\u8003 https://github.com/hankcs/HanLP/blob/master/src/main/java/com/hankcs/hanlp/HanLP.java#L59 \u81ea\u884c\u6dfb\u52a0
然后有一个地方需要注意下
在HanLP类下
// Loads hanlp.properties and resolves every dictionary/model path against the
// configured root directory. (Decompiled from HanLP's Config class, shown here
// for reference.)
// rootPath: fallback root used only when the "root" property is empty/missing.
public static void init(String rootPath) {
Properties p = new Properties();
String prePath;
int i;
int lastSplash;
try {
// Prefer the thread context class loader so the properties file can be
// located inside a SpringBoot fat jar; fall back to this class's loader.
ClassLoader loader = Thread.currentThread().getContextClassLoader();
if (loader == null) {
loader = HanLP.Config.class.getClassLoader();
}
// If an explicit properties path was predefined, read it from disk;
// otherwise load "hanlp.properties" from the classpath. Always UTF-8.
p.load(new InputStreamReader((InputStream)(Predefine.HANLP_PROPERTIES_PATH == null ? loader.getResourceAsStream("hanlp.properties") : new FileInputStream(Predefine.HANLP_PROPERTIES_PATH)), "UTF-8"));
// NOTE: the "root" property takes precedence; the rootPath argument is
// used only when the property is empty or absent.
String root = p.getProperty("root", "");
if (StringUtil.isEmpty(root)) {
root = StringUtil.getString(rootPath);
}
System.out.println(root);
// Normalize backslashes to forward slashes and ensure a trailing "/"
// so root can be blindly prefixed onto every relative path below.
root = root.replaceAll("\\\\", "/");
if (root.length() > 0 && !root.endsWith("/")) {
root = root + "/";
}
// Resolve each dictionary path: property value if present, otherwise
// the field's current (default) value — either way prefixed with root.
CoreDictionaryPath = root + p.getProperty("CoreDictionaryPath", CoreDictionaryPath);
CoreDictionaryTransformMatrixDictionaryPath = root + p.getProperty("CoreDictionaryTransformMatrixDictionaryPath", CoreDictionaryTransformMatrixDictionaryPath);
BiGramDictionaryPath = root + p.getProperty("BiGramDictionaryPath", BiGramDictionaryPath);
CoreStopWordDictionaryPath = root + p.getProperty("CoreStopWordDictionaryPath", CoreStopWordDictionaryPath);
CoreSynonymDictionaryDictionaryPath = root + p.getProperty("CoreSynonymDictionaryDictionaryPath", CoreSynonymDictionaryDictionaryPath);
PersonDictionaryPath = root + p.getProperty("PersonDictionaryPath", PersonDictionaryPath);
PersonDictionaryTrPath = root + p.getProperty("PersonDictionaryTrPath", PersonDictionaryTrPath);
// Custom dictionaries are a ";"-separated list. An entry starting with
// a space reuses the directory of the previous entry (prePath).
String[] pathArray = p.getProperty("CustomDictionaryPath", "data/dictionary/custom/CustomDictionary.txt").split(";");
prePath = root;
for(i = 0; i < pathArray.length; ++i) {
if (pathArray[i].startsWith(" ")) {
// Leading space: same directory as the previous dictionary.
pathArray[i] = prePath + pathArray[i].trim();
} else {
pathArray[i] = root + pathArray[i];
// Remember this entry's directory (up to the last '/') so the
// following space-prefixed entries can inherit it.
lastSplash = pathArray[i].lastIndexOf(47);
if (lastSplash != -1) {
prePath = pathArray[i].substring(0, lastSplash + 1);
}
}
}
CustomDictionaryPath = pathArray;
tcDictionaryRoot = root + p.getProperty("tcDictionaryRoot", tcDictionaryRoot);
if (!tcDictionaryRoot.endsWith("/")) {
tcDictionaryRoot = tcDictionaryRoot + '/';
}
PinyinDictionaryPath = root + p.getProperty("PinyinDictionaryPath", PinyinDictionaryPath);
TranslatedPersonDictionaryPath = root + p.getProperty("TranslatedPersonDictionaryPath", TranslatedPersonDictionaryPath);
JapanesePersonDictionaryPath = root + p.getProperty("JapanesePersonDictionaryPath", JapanesePersonDictionaryPath);
PlaceDictionaryPath = root + p.getProperty("PlaceDictionaryPath", PlaceDictionaryPath);
PlaceDictionaryTrPath = root + p.getProperty("PlaceDictionaryTrPath", PlaceDictionaryTrPath);
OrganizationDictionaryPath = root + p.getProperty("OrganizationDictionaryPath", OrganizationDictionaryPath);
OrganizationDictionaryTrPath = root + p.getProperty("OrganizationDictionaryTrPath", OrganizationDictionaryTrPath);
CharTypePath = root + p.getProperty("CharTypePath", CharTypePath);
CharTablePath = root + p.getProperty("CharTablePath", CharTablePath);
PartOfSpeechTagDictionary = root + p.getProperty("PartOfSpeechTagDictionary", PartOfSpeechTagDictionary);
WordNatureModelPath = root + p.getProperty("WordNatureModelPath", WordNatureModelPath);
MaxEntModelPath = root + p.getProperty("MaxEntModelPath", MaxEntModelPath);
NNParserModelPath = root + p.getProperty("NNParserModelPath", NNParserModelPath);
CRFSegmentModelPath = root + p.getProperty("CRFSegmentModelPath", CRFSegmentModelPath);
CRFDependencyModelPath = root + p.getProperty("CRFDependencyModelPath", CRFDependencyModelPath);
HMMSegmentModelPath = root + p.getProperty("HMMSegmentModelPath", HMMSegmentModelPath);
CRFCWSModelPath = root + p.getProperty("CRFCWSModelPath", CRFCWSModelPath);
CRFPOSModelPath = root + p.getProperty("CRFPOSModelPath", CRFPOSModelPath);
CRFNERModelPath = root + p.getProperty("CRFNERModelPath", CRFNERModelPath);
PerceptronCWSModelPath = root + p.getProperty("PerceptronCWSModelPath", PerceptronCWSModelPath);
PerceptronPOSModelPath = root + p.getProperty("PerceptronPOSModelPath", PerceptronPOSModelPath);
PerceptronNERModelPath = root + p.getProperty("PerceptronNERModelPath", PerceptronNERModelPath);
ShowTermNature = "true".equals(p.getProperty("ShowTermNature", "true"));
Normalization = "true".equals(p.getProperty("Normalization", "false"));
// Optionally instantiate a user-supplied IO adapter (how this article
// redirects all dictionary IO to the classpath) via its no-arg ctor.
String ioAdapterClassName = p.getProperty("IOAdapter");
if (ioAdapterClassName != null) {
try {
Class<?> clazz = Class.forName(ioAdapterClassName);
Constructor<?> ctor = clazz.getConstructor();
Object instance = ctor.newInstance();
if (instance != null) {
IOAdapter = (IIOAdapter)instance;
}
} catch (ClassNotFoundException var10) {
Predefine.logger.warning(String.format("找不到IO适配器类: %s ,请检查第三方插件jar包", ioAdapterClassName));
} catch (NoSuchMethodException var11) {
Predefine.logger.warning(String.format("工厂类[%s]没有默认构造方法,不符合要求", ioAdapterClassName));
} catch (SecurityException var12) {
Predefine.logger.warning(String.format("工厂类[%s]默认构造方法无法访问,不符合要求", ioAdapterClassName));
} catch (Exception var13) {
Predefine.logger.warning(String.format("工厂类[%s]构造失败:%s\n", ioAdapterClassName, TextUtility.exceptionToString(var13)));
}
}
} catch (Exception var14) {
// hanlp.properties could not be loaded: log every classpath directory
// where the file could be placed, then fall back to user.dir defaults.
StringBuilder sbInfo = new StringBuilder("========Tips========\n请将hanlp.properties放在下列目录:\n");
String classPath = (String)System.getProperties().get("java.class.path");
if (classPath != null) {
String[] var8;
lastSplash = (var8 = classPath.split(File.pathSeparator)).length;
for(i = 0; i < lastSplash; ++i) {
prePath = var8[i];
if ((new File(prePath)).isDirectory()) {
sbInfo.append(prePath).append('\n');
}
}
}
sbInfo.append("Web项目则请放到下列目录:\nWebapp/WEB-INF/lib\nWebapp/WEB-INF/classes\nAppserver/lib\nJRE/lib\n");
sbInfo.append("并且编辑root=PARENT/path/to/your/data\n");
sbInfo.append("现在HanLP将尝试从").append(System.getProperties().get("user.dir")).append("读取data……");
Predefine.logger.severe("没有找到hanlp.properties,可能会导致找不到data\n" + sbInfo);
}
}
这个函数接收一个 root 参数,也就是词库文件夹的绝对路径。注意代码会优先读取 properties 中配置的 root,只有当 properties 中的 root 为空时才使用传入的参数;我们把两者都设为空,再配合自定义的 IOAdapter,即可从 resources 下读取词库
HanLP下的Config类用于载入配置路径
public static final class Config {
public static boolean DEBUG = false;
public static String CoreDictionaryPath = "data/dictionary/CoreNatureDictionary.txt";
public static String CoreDictionaryTransformMatrixDictionaryPath = "data/dictionary/CoreNatureDictionary.tr.txt";
public static String[] CustomDictionaryPath = new String[]{"data/dictionary/custom/CustomDictionary.txt"};
public static String BiGramDictionaryPath = "data/dictionary/CoreNatureDictionary.ngram.txt";
public static String CoreStopWordDictionaryPath = "data/dictionary/stopwords.txt";
public static String CoreSynonymDictionaryDictionaryPath = "data/dictionary/synonym/CoreSynonym.txt";
public static String PersonDictionaryPath = "data/dictionary/person/nr.txt";
public static String PersonDictionaryTrPath = "data/dictionary/person/nr.tr.txt";
public static String PlaceDictionaryPath = "data/dictionary/place/ns.txt";
public static String PlaceDictionaryTrPath = "data/dictionary/place/ns.tr.txt";
public static String OrganizationDictionaryPath = "data/dictionary/organization/nt.txt";
public static String OrganizationDictionaryTrPath = "data/dictionary/organization/nt.tr.txt";
public static String tcDictionaryRoot = "data/dictionary/tc/";
public static String PinyinDictionaryPath = "data/dictionary/pinyin/pinyin.txt";
public static String TranslatedPersonDictionaryPath = "data/dictionary/person/nrf.txt";
public static String JapanesePersonDictionaryPath = "data/dictionary/person/nrj.txt";
public static String CharTypePath = "data/dictionary/other/CharType.bin";
public static String CharTablePath = "data/dictionary/other/CharTable.txt";
public static String PartOfSpeechTagDictionary = "data/dictionary/other/TagPKU98.csv";
public static String WordNatureModelPath = "data/model/dependency/WordNature.txt";
public static String MaxEntModelPath = "data/model/dependency/MaxEntModel.txt";
public static String NNParserModelPath = "data/model/dependency/NNParserModel.txt";
public static String CRFSegmentModelPath = "data/model/segment/CRFSegmentModel.txt";
public static String HMMSegmentModelPath = "data/model/segment/HMMSegmentModel.bin";
/** @deprecated */
public static String CRFDependencyModelPath = "data/model/dependency/CRFDependencyModelMini.txt";
public static String CRFCWSModelPath = "data/model/crf/pku199801/cws.bin";
public static String CRFPOSModelPath = "data/model/crf/pku199801/pos.bin";
public static String CRFNERModelPath = "data/model/crf/pku199801/ner.bin";
public static String PerceptronCWSModelPath = "data/model/perceptron/pku199801/cws.bin";
public static String PerceptronPOSModelPath = "data/model/perceptron/pku199801/pos.bin";
public static String PerceptronNERModelPath = "data/model/perceptron/pku199801/ner.bin";
public static boolean ShowTermNature = true;
public static boolean Normalization = false;
public static IIOAdapter IOAdapter;
public Config() {
}
在此了解的基础上,我们需要实现 IIOAdapter 接口并重写其中的两个方法
/**
* @Description: HanLP读取静态资源
* @author: Amethyst
* @date: 2019-11-28 11:12
* @update_by:
* @tags:
*/
/**
 * HanLP IO adapter that reads dictionary/model resources from the application
 * classpath (e.g. src/main/resources) instead of the local file system.
 *
 * <p>Registered via the {@code IOAdapter} key in hanlp.properties.</p>
 */
public class HanLPResourcesAdapter implements IIOAdapter {

    /**
     * Opens a classpath resource for reading.
     *
     * <p>Uses {@link ClassPathResource#getInputStream()} rather than
     * {@code getFile()} so the lookup also works when the application is
     * packaged as an executable jar, where resources are not real files.
     * (The original snippet's {@code IntputStream}/{@code FileIntputStream}
     * were typos and did not compile.)</p>
     *
     * @param path resource path relative to the classpath root
     * @return stream over the resource contents
     * @throws IOException if the resource does not exist or cannot be opened
     */
    @Override
    public InputStream open(String path) throws IOException {
        return new ClassPathResource(path).getInputStream();
    }

    /**
     * Creates an output stream for HanLP to write cache files (e.g. *.bin).
     *
     * <p>NOTE: writing into the classpath only works while resources are
     * exploded on disk (IDE / {@code target/classes}); inside a packed jar
     * {@code getFile()} throws, so cache writing will fail there.</p>
     *
     * @param path resource path relative to the classpath root
     * @return stream writing to the resolved resource file
     * @throws IOException if the resource cannot be resolved to a file
     */
    @Override
    public OutputStream create(String path) throws IOException {
        ClassPathResource resource = new ClassPathResource(path);
        return new FileOutputStream(resource.getFile());
    }
}
在重写方法的基础上我们还需要对properties进行两方面的改动
#添加根路径设置为空
root=
#添加适配器为我们自己适配器的路径
IOAdapter=x.x.HanLPResourcesAdapter
然后我们需要在调用前对HanLP进行一个初始化,调用一下init函数,设置根路径为空
//在载入包时需要调用此函数进行词库的读取初始化
HanLP.Config.init("");