立即学习:https://edu.csdn.net/course/play/9460/199585?utm_source=blogtoedu
# utils.py
import jieba

import GrobalParament
def delete_r_n(line: str) -> str:
    """Remove carriage returns and line feeds from a string and trim it.

    Args:
        line: Text to clean, e.g. one raw line read from a file.

    Returns:
        ``line`` with every CR and LF character removed (wherever they
        occur, not only at the end) and with leading/trailing whitespace
        stripped.
    """
    return line.replace("\r", "").replace("\n", "").strip()
def get_stop_words(stop_words_dir):
    """Read the stop-word file and return its entries as a set.

    The file is opened with the encoding given by
    ``GrobalParament.encoding``. Every line is treated as one entry and is
    cleaned with ``delete_r_n`` before being stored.

    Args:
        stop_words_dir: Path of the stop-word file to read.

    Returns:
        A ``set`` containing the cleaned lines of the file. Duplicates
        collapse into a single entry.
    """
    with open(
        stop_words_dir, "r", encoding=GrobalParament.encoding
    ) as f_reader:
        # Build the set directly while streaming the file line by line;
        # callers only use the result for membership tests.
        return {delete_r_n(line) for line in f_reader}
def jieba_cut(content, stop_words):
    """Segment text with ``jieba.cut`` and drop stop words.

    This is the "accurate mode" segmentation helper of this module
    (``jieba.cut`` called with its default arguments).

    Args:
        content: Text to segment. ``None`` and the empty string are
            accepted and produce an empty result.
        stop_words: Container of words to exclude; only membership
            (``in``) is used.

    Returns:
        A list of the segments yielded by ``jieba.cut(content)``, in
        order, without those found in ``stop_words``. An empty list when
        ``content`` is ``None`` or empty.
    """
    # Nothing to segment: skip the tokenizer entirely.
    if not content:
        return []
    return [word for word in jieba.cut(content) if word not in stop_words]
def jieba_cut_for_search(content, stop_words):
    """Segment text with ``jieba.cut_for_search`` and drop stop words.

    This is the "search engine mode" segmentation helper of this module.

    Args:
        content: Text to segment. ``None`` and the empty string are
            accepted and produce an empty result.
        stop_words: Container of words to exclude; only membership
            (``in``) is used.

    Returns:
        A list of the segments yielded by
        ``jieba.cut_for_search(content)``, in order, without those found
        in ``stop_words``. An empty list when ``content`` is ``None`` or
        empty.
    """
    # Nothing to segment: skip the tokenizer entirely.
    if not content:
        return []
    return [
        word
        for word in jieba.cut_for_search(content)
        if word not in stop_words
    ]
if __name__ == "__main__":
    # Demo: load the configured stop words, then segment one sample
    # sentence with both helpers and print the resulting word lists.
    stop_words = get_stop_words(GrobalParament.stop_word_dir)
    content = "我毕业于北京理工大学,现就职于中国科学院计算技术研究所。"

    word_list = jieba_cut(content, stop_words)
    print(word_list)

    word_list = jieba_cut_for_search(content, stop_words)
    print(word_list)