该项目并非开源项目,授权文件无效或过期时初始化会失败,并报如下错误:fail!E:\data\NLPIR\Data\NLPIR.user Not valid license or your license expired! Please feel free to contact pipy_zhang@msn.com!
官网下载压缩包,http://ictclas.nlpir.org/downloads
Java 版运行需要:jna-4.0.0.jar、Data 文件夹下的数据,以及 NLPIR.dll 动态链接库文件(Linux 下是 NLPIR.so;另外有 32 位和 64 位之分,需与 JVM 位数一致)
NLPIRLibrary 接口的详细定义
package com.chipmunk.analyzer;
import java.util.Map;
import com.sun.jna.Library;
import com.sun.jna.Native;
/**
 * JNA binding for the NLPIR native word-segmentation library.
 *
 * <p>Each method declared here maps by name onto a function exported by the
 * native library. The descriptions below are carried over from the comments
 * that accompany the native declarations.
 */
public interface NLPIRLibrary extends Library {

    /**
     * Shared binding instance; initializing this constant loads the native library.
     * {@code NLPIRPath.PATH_LIB} may be an absolute or a relative path, and should
     * contain only the library file name without its extension.
     */
    NLPIRLibrary Instance =
            (NLPIRLibrary) Native.loadLibrary(NLPIRPath.PATH_LIB, NLPIRLibrary.class);

    // ------------------------------------------------------------------
    // Lifecycle
    // ------------------------------------------------------------------

    /**
     * Initializes the library.
     *
     * @param sDataPath initial directory path, where the file Configure.xml and the
     *        Data directory are stored; the native default is 0, meaning the current
     *        working directory
     * @param encoding encoding of the input strings; the native default is GBK_CODE
     *        (GBK), and UTF8_CODE (UTF-8) or BIG5_CODE (BIG5) may be used instead
     * @param sLicenceCode license code, used by some commercial users only; other
     *        users ignore this argument
     * @return 1 on success, 0 on failure
     */
    int NLPIR_Init(String sDataPath, int encoding, String sLicenceCode);

    /**
     * Segments a paragraph. Works properly only if {@code NLPIR_Init} succeeded.
     *
     * @param sSrc the source paragraph
     * @param bPOSTagged whether POS tagging is wanted: 0 for no tags, 1 for tags
     *        (native default: 1)
     * @return the segmented text
     */
    String NLPIR_ParagraphProcess(String sSrc, int bPOSTagged);

    /**
     * Extracts keywords from a text.
     *
     * @param sLine the input text
     * @param nMaxKeyLimit the maximum number of keywords
     * @param bWeightOut whether the keyword weights are included in the output
     * @return the keyword list if execution succeeds; otherwise NULL
     */
    String NLPIR_GetKeyWords(String sLine, int nMaxKeyLimit, boolean bWeightOut);

    /**
     * Extracts keywords from a text file.
     *
     * @param sLine the input text file name (despite the parameter name, this is a
     *        file name, not a line of text)
     * @param nMaxKeyLimit the maximum number of keywords
     * @param bWeightOut whether the keyword weights are included in the output
     * @return the keyword list if execution succeeds; otherwise NULL
     */
    String NLPIR_GetFileKeyWords(String sLine, int nMaxKeyLimit, boolean bWeightOut);

    /**
     * Adds a word to the user dictionary.
     *
     * @param sWord the word to add
     * @return 1 if the word was added; otherwise 0
     */
    int NLPIR_AddUserWord(String sWord); // added by qp 2008.11.10

    /**
     * Deletes a word from the user dictionary.
     *
     * @param sWord the word to delete
     * @return -1 if the word does not exist in the user dictionary; otherwise the
     *         handle of the deleted word
     */
    int NLPIR_DelUsrWord(String sWord); // added by qp 2008.11.10

    /**
     * Returns the last error message.
     *
     * @return the error message text
     */
    String NLPIR_GetLastErrorMsg();

    /**
     * Shuts the library down: frees all resources and destroys all working buffers
     * used by NLPIR.
     */
    void NLPIR_Exit();

    /**
     * Segments a whole file. Works properly only if {@code NLPIR_Init} succeeded.
     *
     * @param utf8File name of the source file to be analyzed
     * @param utf8FileResult name of the file that receives the results
     * @param i whether POS tagging is wanted: 0 for no tags, 1 for tags
     *        (native default: 1)
     * @return per the native comment, the processing speed on success and false
     *         otherwise. NOTE(review): that description does not match a plain
     *         boolean return - confirm the native return type.
     */
    boolean NLPIR_FileProcess(String utf8File, String utf8FileResult, int i);

    // ------------------------------------------------------------------
    // Additional functions
    // ------------------------------------------------------------------

    /**
     * Imports a user-defined dictionary from a text file.
     *
     * @param sFilename text file name of the user dictionary
     * @param bOverwrite true (native default) to overwrite the existing dictionary,
     *        false to add to it
     * @return not documented in the original declaration
     */
    int NLPIR_ImportUserDict(String sFilename, boolean bOverwrite);

    /**
     * Segments a paragraph into a vector of result records.
     *
     * <p>The native result record is described as:
     * <pre>
     * struct result_t {
     *     int  start;           // start position of the word in the input sentence
     *     int  length;          // length of the word
     *     char sPOS[POS_SIZE];  // word type: POS ID, allows fast lookup in the POS table
     *     int  iPOS;            // part of speech
     *     int  word_ID;         // for out-of-vocabulary words, set to -1
     *                           //   (a second value is missing from the original comment)
     *     int  word_type;       // 1: word from the user dictionary; the value for
     *                           //   "not from the user dictionary" is missing from
     *                           //   the original comment
     *     int  weight;          // word weight
     * };
     * </pre>
     *
     * <p>NOTE(review): the original author left a {@code result_t}-typed signature
     * commented out in favour of {@code Map<String, Object>}. A {@code Map} does not
     * look like a type JNA can marshal, and {@code pResultCount} is described as a
     * pointer while being declared as {@code int} - confirm before calling.
     *
     * @param sParagraph the source paragraph
     * @param pResultCount pointer to the result vector size
     * @param bUserDict whether to use the user dictionary
     * @return pointer to the result vector; it is managed by the system, so the
     *         caller must neither allocate nor free it
     */
    Map<String, Object> NLPIR_ParagraphProcessA(
            String sParagraph, int pResultCount, boolean bUserDict);

    /**
     * Counts the words of a paragraph. The output format is customized in the NLPIR
     * configuration.
     *
     * @param sParagraph the source paragraph
     * @return not documented in the original declaration
     */
    int NLPIR_GetParagraphProcessAWordCount(String sParagraph);

    /**
     * Stores segmentation results into a caller-supplied structure.
     *
     * <p>NOTE(review): same {@code Map}-versus-{@code result_t} concern as
     * {@code NLPIR_ParagraphProcessA} - confirm before calling.
     *
     * @param nCount the paragraph word count
     * @param result pointer to the structure that receives the results
     */
    void NLPIR_ParagraphProcessAW(int nCount, Map<String, Object> result);

    /**
     * Saves the user dictionary to disk.
     *
     * @return 1 if the save succeeded; otherwise 0
     */
    int NLPIR_SaveTheUsrDic();

    /**
     * Imports a keyword blacklist (words that are never output as keywords) from a
     * text file.
     *
     * @param sFilename text file name of the blacklist dictionary
     * @return the number of lexical entries imported successfully
     */
    int NLPIR_ImportKeyBlackList(String sFilename);

    /**
     * Extracts new words from a paragraph.
     *
     * @param sLine the input text
     * @param nMaxKeyLimit the maximum number of words
     * @param bWeightOut whether the word weights are included in the output
     * @return not documented in the original declaration
     */
    String NLPIR_GetNewWords(String sLine, int nMaxKeyLimit, boolean bWeightOut);

    /**
     * Extracts new words from a text file.
     *
     * @param sTextFile the input text file name
     * @param nMaxKeyLimit the maximum number of words
     * @param bWeightOut whether the word weights are included in the output
     * @return the word list if execution succeeds; otherwise NULL
     */
    String NLPIR_GetFileNewWords(String sTextFile, int nMaxKeyLimit, boolean bWeightOut);

    /**
     * Extracts a fingerprint from a paragraph.
     *
     * @param sLine the input text
     * @return 0 on failure; otherwise the fingerprint of the content
     */
    long NLPIR_FingerPrint(String sLine);

    /**
     * Selects which POS map will be used.
     *
     * @param nPOSmap identifier of the POS map
     * @return not documented in the original declaration
     */
    int NLPIR_SetPOSmap(int nPOSmap);

    // ------------------------------------------------------------------
    // New-word identification (NWI)
    // ------------------------------------------------------------------

    /**
     * Starts a new-word identification session.
     *
     * @return true on success, false on failure
     */
    boolean NLPIR_NWI_Start();

    /**
     * Adds a file to the new-word identification session. Only effective after
     * {@code NLPIR_NWI_Start()} has been run.
     *
     * @param sFilename name of the file to add
     * @return not documented in the original declaration
     */
    int NLPIR_NWI_AddFile(String sFilename);

    /**
     * Adds an in-memory piece of text to the new-word identification session.
     * Only effective after {@code NLPIR_NWI_Start()} has been run.
     *
     * @param sText the text to add
     * @return not documented in the original declaration
     */
    boolean NLPIR_NWI_AddMem(String sText);

    /**
     * Marks the end of content being added to the new-word identification session.
     * Only effective after {@code NLPIR_NWI_Start()} has been run.
     *
     * @return not documented in the original declaration
     */
    boolean NLPIR_NWI_Complete();

    /**
     * Returns the result of new-word identification. Only effective after
     * {@code NLPIR_NWI_Complete()} has been run.
     *
     * @param bWeightOut whether the weight of each new word is included in the output
     * @return the new-word identification result
     */
    String NLPIR_NWI_GetResult(boolean bWeightOut);

    /**
     * Imports the new-word identification result into the user dictionary. Only
     * effective after {@code NLPIR_NWI_Complete()} has been run. To keep the new
     * words permanently, running {@code NLPIR_SaveTheUsrDic} as well is suggested.
     *
     * @return the number of new-word results
     */
    int NLPIR_NWI_Result2UserDict();

    /*
     * Native (C) usage example for the new-word identification functions:
     *
     *   NLPIR_NWI_Start();                  // start new-word discovery
     *   NLPIR_NWI_AddFile(sInputFile);      // add a training file; may be called repeatedly
     *   NLPIR_NWI_Complete();               // done adding files or training content
     *   const char *pNewWordlist = NLPIR_NWI_GetResult();  // fetch the result
     *   printf("Recognized new words: %s\n", pNewWordlist);
     *   strcpy(sResultFile, sInputFile);
     *   strcat(sResultFile, "_result1.txt");
     *   NLPIR_FileProcess(sInputFile, sResultFile);
     *   NLPIR_NWI_Result2UserDict();        // import the result into the user dictionary
     *   strcpy(sResultFile, sInputFile);
     *   strcat(sResultFile, "_result2.txt");
     *   NLPIR_FileProcess(sInputFile, sResultFile);
     *   NLPIR_Exit();
     */

    // ------------------------------------------------------------------
    // Utilities
    // ------------------------------------------------------------------

    /**
     * Performs the finest-grained segmentation. Intended for cases where the
     * current segmentation unit is too coarse, e.g. "中华人民共和国", and needs to be
     * split further; the finest granularity is at most three Chinese characters.
     * (The split form shown in the original comment is identical to the input,
     * so its separators were presumably lost.)
     *
     * @param sLine the text to refine
     * @return the fine-grained segmentation, or the empty string "" if the input
     *         cannot be split further
     */
    String NLPIR_FinerSegment(String sLine);

    /**
     * Returns the base form of an English word, taking past participles, plural
     * forms and similar inflections into account, e.g. driven -> drive,
     * drives -> drive, drove -> drive.
     *
     * @param sWord the English word
     * @return the base form of the word
     */
    String NLPIR_GetEngWordOrign(String sWord);

    /**
     * Computes word / POS / frequency statistics for a text, sorted by frequency.
     * The result has the form {@code 张华平/nr/10#博士/n/9#分词/n/8}.
     *
     * @param sText the input text
     * @return the word-frequency statistics
     */
    String NLPIR_WordFreqStat(String sText);

    /**
     * Computes word / POS / frequency statistics for a text file, sorted by
     * frequency. The result has the form {@code 张华平/nr/10#博士/n/9#分词/n/8}.
     *
     * @param sFilename full path of the text file
     * @return the word-frequency statistics
     */
    String NLPIR_FileWordFreqStat(String sFilename);
}
Demo
/**
 * Demo: initializes NLPIR, then exercises stemming, user-dictionary additions,
 * word-frequency statistics, keyword extraction, new-word extraction and
 * paragraph segmentation against sample files under E:/temp.
 *
 * <p>The demo stops right after a failed initialization, since the native
 * functions only work properly once {@code NLPIR_Init} has succeeded, and it
 * always releases the native resources once initialization has succeeded.
 *
 * @param args unused
 */
public static void main(String[] args) {
    NLPIRLibrary nlpir = NLPIRLibrary.Instance;
    String argu = NLPIRPath.PATH_DATA;
    // Encoding code passed to NLPIR_Init; presumably 1 selects UTF-8, matching the
    // "UTF-8" used when reading the sample files below - TODO confirm.
    int charset_type = 1;
    int init_flag = nlpir.NLPIR_Init(argu, charset_type, "0");
    if (init_flag != 1) {
        // Anything other than 1 is treated as failure; do not touch the library further.
        String message = nlpir.NLPIR_GetLastErrorMsg();
        System.out.println("init fail!" + message);
        return;
    }
    System.out.println("init success!");

    try {
        String word = nlpir.NLPIR_GetEngWordOrign("wanted");
        System.out.println(word);

        nlpir.NLPIR_AddUserWord("强降雨 n");
        String word2 = nlpir.NLPIR_WordFreqStat("南方多地出现强降雨的同时,高温天气也在南方蔓延。据中国天气网21日消息,在副热带高压控制下,19日开始,华南一带出现高温天气,影响范围逐步扩大。预计,未来10天,高温继续蔓延,江南中南部、华南将出现日最高气温为35-38℃的持续高温晴热天气。");
        System.out.println(word2);

        String keywords = nlpir.NLPIR_GetFileKeyWords("E:/temp/abc.txt", 10, true);
        System.out.println(keywords);

        // Optional new-word identification flow (disabled in this demo):
        // nlpir.NLPIR_NWI_Start();                      // start new-word identification
        // nlpir.NLPIR_NWI_AddFile("E:/temp/def.txt");   // add input files in bulk; NLPIR_NWI_AddFile
        //                                               // or NLPIR_NWI_AddMem may be called repeatedly
        // nlpir.NLPIR_NWI_Complete();                   // done importing files
        // String t = nlpir.NLPIR_NWI_GetResult(false);  // new words found in this batch
        // System.out.println("新词识别结果 " + t);
        // System.out.println("============");

        // Segmentation before the extracted new words are added to the user dictionary.
        printSegmentedFile("E:/temp/def.txt");

        String abc = nlpir.NLPIR_GetFileNewWords("E:/temp/def.txt", 10, false);
        // The native call is documented to return NULL on failure, so guard the split.
        if (abc != null) {
            for (String a : abc.split("#")) {
                if (a.isEmpty()) {
                    // An empty result yields one empty token; do not register " n" as a word.
                    continue;
                }
                nlpir.NLPIR_AddUserWord(a + " n");
                System.out.println(a);
            }
        }
        System.out.println("abc:" + abc);

        nlpir.NLPIR_AddUserWord("暴雨黄色预警 n");

        // Segmentation again, now with the new user-dictionary entries in place.
        printSegmentedFile("E:/temp/def.txt");
    } finally {
        // Free native resources even if one of the demo steps throws.
        nlpir.NLPIR_Exit();
    }
}

/**
 * Reads the given file as UTF-8, segments it without POS tags and prints the
 * result with the separating spaces replaced by commas. Failures are reported
 * and do not abort the demo.
 *
 * @param path path of the text file to segment
 */
private static void printSegmentedFile(String path) {
    try {
        String text = FileUtil.read(new File(path), "UTF-8");
        String segmented = NLPIRLibrary.Instance.NLPIR_ParagraphProcess(text, 0);
        if (segmented == null) {
            System.err.println("segmentation returned no result for " + path + ": "
                    + NLPIRLibrary.Instance.NLPIR_GetLastErrorMsg());
            return;
        }
        // Literal replacement; no regular expression is needed here.
        System.out.println(segmented.replace(" ", ","));
    } catch (Exception e) {
        // FileUtil.read's declared exceptions are not visible here, so keep the broad catch.
        e.printStackTrace();
    }
}