# encoding=utf-8
import jieba
dict_path = r'./data/dict.txt'
jieba.load_userdict(dict_path)
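# jieba's user dictionary format is one entry per line: "word [freq] [tag]",
# with freq and tag optional. Judging from the dict_items(...) output below,
# dict.txt here holds bare words such as 台中 and 台中正确.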
dict_fp = open(dict_path, 'r', encoding='utf-8')
d = {}
for line in dict_fp:
    word = line.strip().split(' ')[0]  # the first field of a user-dict entry is the word itself
    d[word] = len(word)
print(d.items())  # e.g. dict_items([('台中', 2), ('台中正确', 4)])
f = sorted(d.items(), key=lambda x: x[1], reverse=True)  # key is a function; key=len would sort by length
dict_fp.close()
new_dict = open('./data/dict1.txt', 'w', encoding='utf-8')
for word, _ in f:
    new_dict.write(word + '\n')  # write one word per line, longest first
new_dict.close()

# tune jieba's internal frequencies so each word from dict1.txt stays unsplit
for line in open('./data/dict1.txt', 'r', encoding='utf-8'):
    jieba.suggest_freq(line.strip(), tune=True)

if __name__ == '__main__':
    string = '台中正确应该不会被切开'  # "台中正确 should not be split apart"
    words = jieba.cut(string, HMM=False)
    result = ' '.join(words)
    print(result)
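As a quick sanity check in a fresh interpreter, the effect of the tuning can be compared before and after. A minimal sketch, assuming the same ./data/dict.txt as above:

import jieba

s = '台中正确应该不会被切开'
print(' '.join(jieba.cut(s, HMM=False)))   # baseline: 台中正确 may be split apart
jieba.load_userdict('./data/dict.txt')
jieba.suggest_freq('台中正确', tune=True)   # raise its frequency so the whole word wins
print(' '.join(jieba.cut(s, HMM=False)))   # now 台中正确 should stay as one token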
Stanford NLP case
from stanfordcorenlp import StanfordCoreNLP
nlp = StanfordCoreNLP(r'D:\NLP_sourceCode\stanfordcorenlp', lang='zh')
# step 1: start the server
# Run a server using Chinese properties:
# java -Xmx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -serverProperties StanfordCoreNLP-chinese.properties -port 9000 -timeout 15000
# then connect to the running server instead of loading the models locally:
# nlp = StanfordCoreNLP('http://localhost', port=9000)
sentence = '清华大学位于北京。'  # "Tsinghua University is located in Beijing."
print(nlp.word_tokenize(sentence))
print(nlp.pos_tag(sentence))
print(nlp.ner(sentence))
print(nlp.parse(sentence))
print(nlp.dependency_parse(sentence))
nlp.close()
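For the server mode sketched in the comments above, a minimal client might look like the following. This is a sketch, not a verified run: it assumes a CoreNLP server with the Chinese properties is already listening on localhost:9000 (started with the java command shown earlier), and it assumes passing lang='zh' selects the Chinese annotators for each request.

from stanfordcorenlp import StanfordCoreNLP

# connect to an already-running server instead of loading models locally
nlp = StanfordCoreNLP('http://localhost', port=9000, lang='zh')
try:
    sentence = '清华大学位于北京。'
    print(nlp.word_tokenize(sentence))
    print(nlp.pos_tag(sentence))
finally:
    nlp.close()  # release the connection even if an annotator call fails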