jieba word segmentation
Packages to import
import jieba
import jieba.posseg
import jieba.analyse
Two segmentation modes
jieba.cut()  # precise mode by default; full mode with cut_all=True
jieba.cut_for_search()  # search-engine mode, which further splits long words
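A minimal sketch of both calls; the sample sentence is only an illustration:
import jieba

sentence = '我来到北京清华大学'
print('/'.join(jieba.cut(sentence)))                # precise mode (default)
print('/'.join(jieba.cut(sentence, cut_all=True)))  # full mode
print('/'.join(jieba.cut_for_search(sentence)))     # search-engine mode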
Custom dictionary
jieba.load_userdict('user.txt')
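Each line of the dictionary file contains a word, an optional frequency, and an optional POS tag, separated by spaces. A minimal sketch, assuming a UTF-8 file named user.txt (both the file name and the entries are illustrative):
# user.txt, one entry per line, e.g.:
# 云计算 5 n
# 哈尔滨工业大学 10 nt
import jieba

jieba.load_userdict('user.txt')
print('/'.join(jieba.cut('我在哈尔滨工业大学学习云计算')))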
Adjusting the dictionary
jieba.add_word(word, freq=None, tag=None)  # add a word at runtime
jieba.del_word(word)  # remove a word
jieba.suggest_freq(segment, tune=True)  # adjust a word's frequency so it can (or cannot) be split out
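A minimal sketch of runtime adjustments; the words used here are only illustrations:
import jieba

jieba.add_word('哈工大', tag='nt')            # register a new word
jieba.del_word('哈工大')                      # remove it again
jieba.suggest_freq('台中', tune=True)         # keep '台中' together as one word
jieba.suggest_freq(('中', '将'), tune=True)   # force '中将' to be split into '中' / '将'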
Keyword extraction (based on TF-IDF)
jieba.analyse.extract_tags(sentence, topK=20, withWeight=True, allowPOS=())  # returns (keyword, weight) pairs when withWeight=True
jieba.analyse.set_idf_path(file_name)  # use a custom IDF corpus
jieba.analyse.set_stop_words(file_name)  # use a custom stop-word list
for x, w in jieba.analyse.extract_tags(s, topK=20, withWeight=True, allowPOS=()):
    print('%s %s' % (x, w))
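A self-contained sketch; the sentence and the stop-word file name are illustrative, not part of jieba:
import jieba.analyse

s = '我爱自然语言处理，尤其是中文分词和关键词提取'
# jieba.analyse.set_stop_words('stop_words.txt')  # optional; hypothetical file name
for x, w in jieba.analyse.extract_tags(s, topK=5, withWeight=True):
    print('%s %.4f' % (x, w))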
Keyword extraction with TextRank
for x, w in jieba.analyse.textrank(s, withWeight=True):
    print('%s %s' % (x, w))
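Note that jieba.analyse.textrank() filters candidates by part of speech and defaults to allowPOS=('ns', 'n', 'vn', 'v'), so its results can differ from the TF-IDF extractor even on the same text; the weights are TextRank scores, not TF-IDF values.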
Part-of-speech tagging
words = jieba.posseg.cut('我爱哈尔滨')
for word, flag in words:
    print('%s %s' % (word, flag))
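The flag is an ICTCLAS-style POS tag, e.g. r (pronoun), v (verb), ns (place name); for the sentence above the output should be roughly 我 r, 爱 v, 哈尔滨 ns.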
tokenize: return word positions
result = jieba.tokenize(u'我爱哈尔滨')
for tk in result:
    print('word: %s\t\t start: %d\t\t end: %d' % (tk[0], tk[1], tk[2]))
result = jieba.tokenize(u'我爱哈尔滨', mode='search')  # search mode, similar to full mode
for tk in result:
    print('word: %s\t\t start: %d\t\t end: %d' % (tk[0], tk[1], tk[2]))
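Each tuple is (word, start, end), where start and end are character offsets into the original string; in 'search' mode longer words are additionally yielded as their shorter sub-words, so the spans may overlap.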
A small jieba segmentation example
import jieba

def segword(path, output):
    # read the input file
    with open(path, encoding='utf-8') as fin:
        text = fin.read()
    # segment in precise mode
    seg_list = jieba.cut(text, cut_all=False)
    seg_list2 = '/'.join(seg_list)
    # write out the result
    with open(output, 'w', encoding='utf-8') as fout:
        fout.write(seg_list2)

path = ''
output = ''
segword(path, output)
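segword() reads a UTF-8 text file, segments it in precise mode, and writes the words joined by '/'; fill in path and output with an input file and an output file before calling it.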