First, import jieba.
jieba can be installed as described on its GitHub page; the installation notes list fully automatic, semi-automatic, and manual methods (the fully automatic route is simply pip install jieba). I installed it the semi-automatic way.
Once jieba is installed, the imports look like this:
import os
import jieba
import jieba.analyse
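As a quick sanity check (not part of the original walkthrough), you can segment a short sentence right after importing; the sample sentence below is arbitrary.
# Quick check that jieba imports and segments correctly; the sentence is only an illustration.
words = jieba.cut("结巴分词是一个中文分词工具")
print(" / ".join(words))  # e.g. 结巴 / 分词 / 是 / 一个 / 中文 / 分词 / 工具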
Word segmentation & keyword extraction, step by step
Saving a file
This step saves the segmented files and the extracted-keyword files; save_path is a full path that includes the file name.
def save_file(save_path, content):
    with open(save_path, "a", encoding='utf-8', errors='ignore') as fp:
        fp.write(content)
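A minimal usage sketch (demo.txt is a hypothetical path); because the file is opened in append mode "a", repeated calls keep adding to the same file.
# Hypothetical usage of save_file; "demo.txt" is only an example path.
save_file("demo.txt", "first line\n")
save_file("demo.txt", "second line\n")  # appended after the first call, since the mode is "a"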
Reading a file
This step lets the program read the contents of the file to be processed; file_path is a full path that includes the file name.
def read_file(file_path):
    with open(file_path, "r", encoding='utf-8', errors='ignore') as fp:
        content = fp.readlines()
    return str(content)
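Note that readlines() returns a list of lines and str() turns that whole list into one string, so the result contains the list's quotes and literal \n escape sequences; this is exactly what the later replace() calls strip out. A small sketch of the effect, assuming a two-line file:
# Assume the file holds the two lines "hello" and "world".
lines = ["hello\n", "world\n"]  # what fp.readlines() would return
print(str(lines))               # prints ['hello\n', 'world\n'] -- here \n is two literal characters, not a newline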
Keyword extraction
This step uses jieba.analyse.extract_tags to pull keywords out of the content and returns them as a stringified themes list.
def extract_theme(content):
    themes = []
    tags = jieba.analyse.extract_tags(content, topK=3, withWeight=True,
                                      allowPOS=['n', 'ns', 'v', 'vn'], withFlag=True)
    for i in tags:
        themes.append(i[0].word)
    return str(themes)
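With withWeight=True each entry of tags is a (keyword, weight) tuple, and with withFlag=True the keyword part is a jieba pair object carrying .word and .flag, which is why the loop reads i[0].word. A small illustrative call, relying on the imports shown above (the input sentence is arbitrary):
# Illustrative call; the sentence is arbitrary sample text.
text = "自然语言处理是人工智能的重要方向"
tags = jieba.analyse.extract_tags(text, topK=3, withWeight=True,
                                  allowPOS=['n', 'ns', 'v', 'vn'], withFlag=True)
for pair_obj, weight in tags:
    print(pair_obj.word, pair_obj.flag, weight)  # keyword, part-of-speech tag, TF-IDF weight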
Word segmentation
The part worth focusing on here is the second for loop. file_content is the data read from the file; replace() is then used to strip the quotes and the literal \n sequences left over from stringifying the readlines() list, so that the real, original content of the file can be processed. content_seg1 and content_seg2 are kept separate because jieba.cut returns a generator that can only be consumed once; using two of them prevents the test_words folder from ending up without segmented files when theme_tag is not None (see the sketch below). The second if is where keywords are extracted from the content segmented by jieba.cut.
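A minimal sketch of why two cut results are needed: jieba.cut returns a one-shot generator, so after the first " ".join() there is nothing left for a second pass.
# jieba.cut returns a generator, which is exhausted after the first join.
seg = jieba.cut("今天天气不错")
first = " ".join(seg)   # e.g. "今天天气 不错"
second = " ".join(seg)  # "" -- the generator is already exhausted
print(repr(first), repr(second))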
def cast_words(origin_path, save_path, theme_tag):
    file_lists = os.listdir(origin_path)
    for dir_1 in file_lists:
        file_path = origin_path + dir_1 + "/"
        seg_path = save_path + dir_1 + "/"
        if not os.path.exists(seg_path):
            os.makedirs(seg_path)
        detail_paths = os.listdir(file_path)
        for detail_path in detail_paths:
            full_path = file_path + detail_path
            file_content = read_file(full_path)
            file_content = file_content.strip()
            file_content = file_content.replace("\'", "")
            file_content = file_content.replace("\\n", "")
            content_seg1 = jieba.cut(file_content)
            content_seg2 = jieba.cut(file_content)
            if theme_tag is not None:
                theme = extract_theme(" ".join(content_seg1))
                save_file(theme_tag + detail_path, theme)
            save_file(seg_path + detail_path, " ".join(content_seg2))
Calling the main routine
The train_segments pass produces the segmented files and saves them under train_words; the test_segments pass produces segmented files in test_words and additionally writes the extracted keywords under theme_tag. A sketch of the directory layout this expects follows the code below.
if __name__ == "__main__":
    train_words_path = 'C:/Users/cnwan/Documents/WangLin/cutwords/train_segments/'
    train_save_path = 'C:/Users/cnwan/Documents/WangLin/cutwords/train_words/'
    cast_words(train_words_path, train_save_path, theme_tag=None)
    train_words_path = 'C:/Users/cnwan/Documents/WangLin/cutwords/test_segments/'
    train_save_path = 'C:/Users/cnwan/Documents/WangLin/cutwords/test_words/'
    theme_tag_path = 'C:/Users/cnwan/Documents/WangLin/cutwords/theme_tag/'
    cast_words(train_words_path, train_save_path, theme_tag=theme_tag_path)
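A sketch of the layout cast_words assumes, with hypothetical category folders and file names (only the cutwords root paths above come from the original):
# Hypothetical input layout: each category folder holds plain-text documents.
#   cutwords/train_segments/sports/0001.txt
#   cutwords/train_segments/finance/0002.txt
# After the first call, the segmented copies mirror that structure:
#   cutwords/train_words/sports/0001.txt   (space-separated tokens)
#   cutwords/train_words/finance/0002.txt
# For the test pass, theme keywords are written flat under theme_tag/
# (that folder must already exist, since makedirs is only called for seg_path):
#   cutwords/theme_tag/0001.txt            (stringified list of the top-3 keywords)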
Capturing a moment of the learning process.
Complete word-segmentation & keyword-extraction code
The complete code follows; together with the inline comments and the print output you can trace how the segmentation and keyword extraction work.
import os
import jieba
import jieba.analyse  # module for keyword extraction

# Segment both the training-set and test-set texts; tag the test-set data with theme keywords.

# Save to file
def save_file(save_path, content):
    with open(save_path, "a", encoding='utf-8', errors='ignore') as fp:
        fp.write(content)
# Read a file
def read_file(file_path):
    with open(file_path, "r", encoding='utf-8', errors='ignore') as fp:
        content = fp.readlines()
        # print(content)
    return str(content)
# Extract the theme keywords for the test set
def extract_theme(content):
    themes = []
    tags = jieba.analyse.extract_tags(content, topK=3, withWeight=True,
                                      allowPOS=['n', 'ns', 'v', 'vn'], withFlag=True)
    for i in tags:
        themes.append(i[0].word)
    return str(themes)
def cast_words(origin_path, save_path, theme_tag):
    '''
    origin_path: path of the original texts
    save_path: path for the segmented texts
    theme_tag: path for the theme-keyword files (None for the training set)
    :return:
    '''
    file_lists = os.listdir(origin_path)  # folders under the original-document path
    print('\n' + 'file_lists:')
    print(file_lists)
    print('\n' + 'origin_path:')
    print(origin_path)
    for dir_1 in file_lists:  # iterate over the category folders
        file_path = origin_path + dir_1 + "/"  # path of the original category folder
        print('\n' + 'dir_1:')
        print(dir_1)
        print('\n' + 'file_path:')
        print(file_path)
        seg_path = save_path + dir_1 + "/"  # path for the segmented files of this category
        print('\n' + 'save_path:')
        print(save_path)
        print('\n' + 'seg_path:')
        print(seg_path)
        if not os.path.exists(seg_path):
            os.makedirs(seg_path)
        detail_paths = os.listdir(file_path)
        print('\n' + 'detail_paths:')
        print(detail_paths)
        for detail_path in detail_paths:  # iterate over the individual files in the folder
            full_path = file_path + detail_path  # path of each document under the original folder
            print('\n' + 'detail_path:')
            print(detail_path)
            print('\n' + 'full_path:')
            print(full_path)
            file_content = read_file(full_path)
            print('\n' + 'file_content:')
            print(file_content)
            file_content = file_content.strip()  # strip leading/trailing whitespace
            print('\n' + 'file_content.strip():')
            print(file_content)
            file_content = file_content.replace("\'", "")  # drop the quotes left by stringifying the list
            print('\n' + 'file_content.replace("\'", ""):')
            print(file_content)
            file_content = file_content.replace("\\n", "")  # drop the literal \n sequences
            print('\n' + 'file_content.replace("\\n", ""):')
            print(file_content)
            content_seg1 = jieba.cut(file_content)  # segment the file content
            content_seg2 = jieba.cut(file_content)  # segment again (generators are single-use)
            # for tip in content_seg:
            #     print('keyword: ' + tip)
            if theme_tag is not None:
                print("File path: {} ".format(theme_tag + detail_path))
                theme = extract_theme(" ".join(content_seg1))  # theme keywords of this article
                # for tip in theme:
                #     print('keyword: ' + tip)
                print("Article theme keywords: {} ".format(theme))
                save_file(theme_tag + detail_path, theme)  # save the article's theme keywords to the tag path
            save_file(seg_path + detail_path, " ".join(content_seg2))  # save the segmented text to the output corpus directory
if __name__ == "__main__":
    # Segment the training set
    train_words_path = 'C:/Users/cnwan/Documents/WangLin/cutwords/train_segments/'
    train_save_path = 'C:/Users/cnwan/Documents/WangLin/cutwords/train_words/'
    cast_words(train_words_path, train_save_path, theme_tag=None)
    # Segment the test set and extract the article theme tags
    train_words_path = 'C:/Users/cnwan/Documents/WangLin/cutwords/test_segments/'
    train_save_path = 'C:/Users/cnwan/Documents/WangLin/cutwords/test_words/'
    theme_tag_path = 'C:/Users/cnwan/Documents/WangLin/cutwords/theme_tag/'  # where the test-set theme tags are stored
    cast_words(train_words_path, train_save_path, theme_tag=theme_tag_path)