新词提取
demo_extract_word.py
from pyhanlp import *
from tests.test_utility import ensure_data
import os
# Local paths of the demo corpora, each resolved through ensure_data() from a
# data name and the URL of its zip archive.
HLM_PATH = ensure_data("红楼梦.txt", "http://file.hankcs.com/corpus/红楼梦.zip")
XYJ_PATH = ensure_data("西游记.txt", "http://file.hankcs.com/corpus/西游记.zip")
SHZ_PATH = ensure_data("水浒传.txt", "http://file.hankcs.com/corpus/水浒传.zip")
SAN_PATH = ensure_data("三国演义.txt", "http://file.hankcs.com/corpus/三国演义.zip")
# Unlike the four text files above, this entry is listed as a directory by test_weibo().
WEIBO_PATH = ensure_data("weibo-classification", "http://file.hankcs.com/corpus/weibo-classification.zip")
def test_weibo():
    """Run word extraction over every sub-folder of the Weibo corpus.

    For each directory found under ``WEIBO_PATH`` the folder name is printed,
    the UTF-8 text of all files inside it is concatenated, and the result of
    ``HanLP.extractWords(text, 100)`` is printed.
    """
    for folder in os.listdir(WEIBO_PATH):
        folder_path = os.path.join(WEIBO_PATH, folder)
        # Stray regular files next to the folders would make the inner
        # os.listdir() raise, so only directories are processed.
        if not os.path.isdir(folder_path):
            continue
        print(folder)
        chunks = []
        for file_name in os.listdir(folder_path):
            with open(os.path.join(folder_path, file_name), encoding='utf-8') as src:
                chunks.append(src.read())
        # Join once instead of growing a string with += for every file.
        big_text = "".join(chunks)
        word_info_list = HanLP.extractWords(big_text, 100)
        print(word_info_list)
def exact(corpus):
    """Print two word-extraction results for one corpus file.

    The first call asks ``HanLP.extractWords`` for 100 entries and is printed
    under the "热词" label; the second passes an extra ``True`` argument and is
    printed under the "新词" label. A fresh reader is opened for each call.

    :param corpus: path of the corpus file, handed to ``IOUtil.newBufferedReader``
    """
    print("%s 热词" % corpus)
    hot_words = HanLP.extractWords(IOUtil.newBufferedReader(corpus), 100)
    print(hot_words)

    print("%s 新词" % corpus)
    # NOTE(review): the trailing True presumably restricts output to new words,
    # as the printed label suggests — confirm against the HanLP API.
    new_words = HanLP.extractWords(IOUtil.newBufferedReader(corpus), 100, True)
    print(new_words)
if __name__ == '__main__':
    test_weibo()
    # Run the same extraction over each of the four corpus files in turn.
    for corpus_path in (HLM_PATH, XYJ_PATH, SHZ_PATH, SAN_PATH):
        exact(corpus_path)
    # More parameters:
    # word_info_list = HanLP.extractWords(IOUtil.newBufferedReader(HLM_PATH),100,True,4,0.0,.5,100)
    # print(word_info_list)
tests/test_utility.py
import zipfile
import os
from pyhanlp.static import download, remove_file, HANLP_DATA_PATH
def test_data_path():
    """
    Return the test-data directory, creating it if it does not exist yet.

    The directory is the ``test`` folder under ``HANLP_DATA_PATH`` (described
    by the original note as $root/data/test, with the root set by the
    configuration file).

    :return: path of the test-data directory
    """
    data_path = os.path.join(HANLP_DATA_PATH, 'test')
    # makedirs(exist_ok=True) replaces the isdir()/mkdir() pair: it also creates
    # a missing parent directory and does not fail if the directory appears
    # between the check and the creation.
    os.makedirs(data_path, exist_ok=True)
    return data_path
def ensure_data(data_name, data_url):
    """
    Return the local path of a test-data item, downloading it first if absent.

    :param data_name: file or directory name expected inside the test-data directory
    :param data_url: download URL; a URL ending in ``.zip`` is downloaded as an
        archive, extracted into the test-data directory and then removed
    :return: path of ``data_name`` inside the test-data directory
    """
    data_dir = test_data_path()
    target = os.path.join(data_dir, data_name)
    if os.path.exists(target):
        return target

    if not data_url.endswith('.zip'):
        # Plain file: download straight to its final location.
        download(data_url, target)
        return target

    # Archive: download next to the target, unpack, then drop the zip.
    archive_path = target + '.zip'
    download(data_url, archive_path)
    with zipfile.ZipFile(archive_path, "r") as archive:
        archive.extractall(data_dir)
    remove_file(archive_path)
    return target
关键词提取
-
词频统计法
demo_term_freq.py
from pyhanlp import *
from tests.test_utility import ensure_data

# Java classes looked up by their fully qualified names via JClass.
TermFrequency = JClass('com.hankcs.hanlp.corpus.occurrence.TermFrequency')
TermFrequencyCounter = JClass('com.hankcs.hanlp.mining.word.TermFrequencyCounter')

HLM_PATH = ensure_data("红楼梦.txt", "http://file.hankcs.com/corpus/红楼梦.zip")

if __name__ == '__main__':
    counter = TermFrequencyCounter()
    # Read the whole corpus file and add it to the counter in one call.
    with open(HLM_PATH, encoding='utf-8') as src:
        big_text = src.read()
    counter.add(big_text)
    # counter.add("加油加油中国队!")  # first document
    # counter.add("中国观众高呼加油中国")  # second document
    for term_frequency in counter:  # iterate over every term and its frequency
        print("%s=%d" % (term_frequency.getTerm(), term_frequency.getFrequency()))
    print(counter.top(2))  # take the top N
    # Extract keywords by term frequency.
    print(TermFrequencyCounter.getKeywordList("女排夺冠,观众欢呼女排女排女排!", 3))
-
TF-IDF法
from pyhanlp import *

# Java class looked up by its fully qualified name via JClass.
TfIdfCounter = JClass('com.hankcs.hanlp.mining.word.TfIdfCounter')

if __name__ == '__main__':
    counter = TfIdfCounter()
    counter.add("《女排夺冠》", "女排北京奥运会夺冠")  # add several documents
    counter.add("《羽毛器男单》", "北京奥运会的羽毛球男单决赛")
    counter.add("《女排》", "中国队女排夺北京奥运会金牌重返巅峰,观众欢呼女排女排女排!")
    counter.compute()  # all documents have been added
    # Extract keywords for each document from its TF-IDF scores.
    # The loop variable is named doc_id so the builtin id() is not shadowed.
    for doc_id in counter.documents():
        print(doc_id + ":", counter.getKeywordsOf(doc_id, 3).toString())
    # Use the IDF information already in the corpus to extract keywords
    # for a new document that is not part of it.
    print(counter.getKeywords("奥运会反兴奋剂", 2))
-
TextRank法
from pyhanlp import *

# Java class looked up by its fully qualified name via JClass.
TextRankKeyword = JClass("com.hankcs.hanlp.summary.TextRankKeyword")


def demo_keyword(content):
    """Print the result of ``HanLP.extractKeyword`` for *content*, requesting 5 keywords.

    :param content: text to extract keywords from
    """
    keyword_list = HanLP.extractKeyword(content, 5)
    print(keyword_list)


if __name__ == '__main__':
    content = (
        "程序员(英文Programmer)是从事程序开发、维护的专业人员。"
        "一般将程序员分为程序设计人员和程序编码人员,"
        "但两者的界限并不非常清楚,特别是在中国。"
        "软件从业人员分为初级程序员、高级程序员、系统"
        "分析员和项目经理四大类。")
    demo_keyword(content)
短语提取
from pyhanlp import *
def demo_phrase_extrator(text):
    """Print the result of ``HanLP.extractPhrase`` for *text*, requesting 5 phrases.

    :param text: text to extract phrases from
    """
    print(HanLP.extractPhrase(text, 5))
if __name__ == '__main__':
    import doctest

    doctest.testmod(verbose=True)
    # Sample passage assembled from adjacent string literals.
    text = (
        '算法工程师 算法(Algorithm)是一系列解决问题的清晰指令,也就是说,能够对一定规范的输入,'
        '在有限时间内获得所要求的输出。如果一个算法有缺陷,或不适合于某个问题,执行这个算法将不会解决这个问题。'
        '不同的算法可能用不同的时间、空间或效率来完成同样的任务。一个算法的优劣可以用空间复杂度与时间复杂度来衡量。'
        '算法工程师就是利用算法处理事物的人。'
    )
    demo_phrase_extrator(text)
关键句提取
from pyhanlp import *
# Java class looked up by its fully qualified name via JClass; the demo code
# visible below does not reference this name directly.
TextRankSentence = JClass("com.hankcs.hanlp.summary.TextRankSentence")
def demo_summary(document):
    """Print the result of ``HanLP.extractSummary`` for *document*, requesting 3 sentences.

    :param document: text to summarise
    """
    print(HanLP.extractSummary(document, 3))
if __name__ == '__main__':
    import doctest

    doctest.testmod(verbose=True, optionflags=doctest.NORMALIZE_WHITESPACE)
    # Sample news passage assembled from adjacent string literals.
    document = (
        '水利部水资源司司长陈明忠9月29日在国务院新闻办举行的新闻发布会上透露, '
        '根据刚刚完成了水资源管理制度的考核,有部分省接近了红线的指标,有部分省超过红线的指标。'
        '对一些超过红线的地方,陈明忠表示,对一些取用水项目进行区域的限批,严格地进行水资源论证和取水许可的批准。'
    )
    demo_summary(document)