# 一个小小的结巴分词使用的实例(涉及了大部分关于结巴的操作)
# (A small example of using jieba segmentation, covering most common operations.)
import jieba
import jieba.analyse as ana#关键词
import jieba.posseg as pos#词性
#打开或创建文件
#class jieba_cut_word():
# test=''
# open_test=open('test.txt','r')
# write_result=open('result.txt','ab')
# def create_txt(tes,res):
# open(tes,'r')
# open(res,'ab')
# def cut_words():
# with open()
#if _name_ == ' _main_ ':
# cut_word=jieba_cut_word()
def get_content(path):
    """Read a text file and return its contents as one string.

    Each line is stripped of surrounding whitespace and the stripped
    lines are concatenated with no separator.

    Args:
        path: path of the file to read; undecodable bytes are ignored.

    Returns:
        The concatenated, stripped lines as a single string ('' for an
        empty file).
    """
    with open(path, 'r', errors='ignore') as f:
        # join at C speed instead of quadratic `+=` accumulation; also
        # fixes the stray leading space the old `content = ' '` produced
        return ''.join(line.strip() for line in f)
def cut_word(content, result):
    """Segment `content` with jieba and append the result to a file.

    The segmented tokens are joined with '/' and written as one line
    (surrounded by newlines) to `result`, which is opened in append
    mode with GBK encoding; unencodable characters are ignored.

    Args:
        content: text to segment.
        result: path of the output file to append to.
    """
    with open(result, 'a', encoding='gbk', errors='ignore') as g:
        tokens = jieba.cut(content)
        g.write('\n')
        g.write('/'.join(tokens))
        g.write('\n')
        # the explicit g.close() was redundant — `with` closes the file
def get_main_word(content, result):
    """Append the top-20 TF-IDF keywords of `content` to `result`.

    The keywords are space-separated on a single line (preceded by a
    newline) and the same line is echoed to stdout. The output file is
    opened in append mode with GBK encoding, ignoring unencodable
    characters.
    """
    with open(result, 'a', encoding='gbk', errors='ignore') as out:
        tags = ana.extract_tags(content, topK=20)
        joined = ' '.join(tags)
        out.write('\n')
        print(joined)
        out.write(joined)
#去除停用词
def stop_word(content, stop_words):
    """Segment `content` with jieba and drop all stop-word tokens.

    Args:
        content: raw text to segment.
        stop_words: container of tokens to discard (a set or list).

    Returns:
        The surviving tokens concatenated into one string.
    """
    segments = jieba.cut(content)
    # join at C speed instead of quadratic `+=`; also fixes the stray
    # leading space the old `content2 = ' '` seed produced
    return ''.join(seg for seg in segments if seg not in stop_words)
def jiazai_stopword(path='stop_word.txt'):
    """Load the stop-word list, one word per line.

    Args:
        path: stop-word file to read; defaults to the original
            hard-coded 'stop_word.txt'. Undecodable bytes are ignored.

    Returns:
        A list of stripped stop words, in file order.
    """
    # `with` closes the handle; the old bare open(...) leaked it
    with open(path, 'r', errors='ignore') as f:
        return [line.strip() for line in f]
#if _name_ == '_main_':
test='test.txt'
result='result.txt'
content=get_content(test)
stop_words=jiazai_stopword()#加载停用词
content2=stop_word(content,stop_words)#去除停用词
jieba.load_userdict('dict1.txt')#加载自定义词典
cut_word(content2,result)
get_main_word(content2,result)
#之后做添加自定义词,去除停用词,提取关键词
# 存到手里好久了,今天把它发出来,哈哈哈