这是我在做本科课题时,从 Python 小白一步步到能够做出计算结果的相关代码记录,代码均能跑起来;部分背景内容可以参考我前面的文章,这一章节主要记录我用到的代码。
结巴(jieba)分词
# -*- coding: utf-8 -*-
import jieba
import jieba.analyse
import jieba.posseg as pseg
import re
# Load a user-defined dictionary ('dict.txt', one term per line) so jieba
# can recognize domain-specific words during segmentation.
jieba.load_userdict('dict.txt')
def stopwordslist(filepath):
    """Load stop words from *filepath* and return them as a list.

    The file is expected to be UTF-8 encoded with one stop word per line;
    surrounding whitespace is stripped from each entry.

    Bug fix: the original ignored *filepath* and always opened the
    hard-coded 'stop_words.txt', and it never closed the file handle.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f]
def seg_sentence(sentence):
    """Segment *sentence* with jieba and drop stop words.

    Returns a single string of the kept tokens, each followed by one
    space (so a non-empty result ends with a trailing space, matching
    the original output format consumed by the writer loop below).
    """
    # Fix: pass the actual stop-word file name ('stop_words.txt', the file
    # the loader really reads) instead of the mismatched 'stopwords.txt'.
    # Convert to a set so per-token membership tests are O(1), and load it
    # before cutting so the jieba generator is consumed exactly once.
    stopwords = set(stopwordslist('stop_words.txt'))
    tokens = jieba.cut(sentence.strip())
    # ''.join avoids the quadratic += string build; the '\t' exclusion is
    # kept from the original.
    return ''.join(word + ' '
                   for word in tokens
                   if word not in stopwords and word != '\t')
# Driver: segment every line of the input corpus and append the result to
# jieba_out.txt, tokens separated by single spaces.
# Context managers guarantee both files are closed even if segmentation
# raises, unlike the original manual open()/close() pair.
# './2019_wenben/0030.txt' is the input corpus path; 'jieba_out.txt' is
# created automatically if it does not exist ('a+' append mode).
with open('./2019_wenben/0030.txt', 'r', encoding='utf-8') as inputs, \
        open('jieba_out.txt', 'a+', encoding='utf-8') as outputs:
    for line in inputs:
        line_seg = seg_sentence(line)  # returns a space-separated string
        outputs.write(line_seg + ' ')