Python reads the preprocessed text, the stopword dictionary, and the non-split-word file.
It then segments the text with the jieba library, exports the segmented words to a file, reads that file back, counts keyword frequencies, and, based on the frequency results, extracts keywords of three or more characters and appends them to the non-split-word file (leaving the file's existing content untouched).
import pandas as pd
import jieba
import re
from collections import Counter
# Clean the segmentation results
def clean_words(words):
    # Strip ordinary punctuation from each token first, so punctuation-only
    # tokens are emptied out before the length check below
    words = [re.sub(r'[^\w\s]', '', word) for word in words]
    # Drop single-character tokens and anything emptied by the strip above
    return [word for word in words if len(word) >= 2]
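# Example with hypothetical tokens: clean_words(['自然语言', '处理', '??', '的'])
# returns ['自然语言', '处理']: the punctuation token is emptied by the strip,
# and the single-character token is dropped by the length check.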
# Read the Excel file
excel_path = r"D:\公库\关键词拓扑\文本\搜索结果_参考网.xlsx"
df = pd.read_excel(excel_path)
# Assume the title is in the first column and the summary in the second;
# adjust these indices if the spreadsheet is laid out differently
title_col = df.columns[0]
summary_col = df.columns[1]
# Initialize an empty list to collect all segmented words
all_words = []
# Load the stopword list and the non-split-word list
stopwords = set()
non_split_words = set()
with open(r"D:\公库\关键词拓扑\中文停用词.txt", 'r', encoding='utf-8') as f:
    for line in f:
        stopwords.add(line.strip())
with open(r"D:\公库\关键词拓扑\非拆分词.txt", 'r', encoding='utf-8') as f:
    for line in f:
        non_split_words.add(line.strip())
# Register the non-split words with jieba so they are kept whole
# during segmentation
for word in non_split_words:
    if word:
        jieba.add_word(word)
# Segment the summary text of each row
for summary in df[summary_col]:
    # Skip empty cells
    if pd.isna(summary):
        continue
    # Segment with jieba
    words = jieba.cut(str(summary))
    # Clean the segmentation results
    cleaned_words = clean_words(words)
    # Remove stopwords
    cleaned_words = [word for word in cleaned_words if word not in stopwords]
    # Collect into the overall list
    all_words.extend(cleaned_words)
# Write the segmented words to a text file, one per line
output_path = r"D:\公库\关键词拓扑\分词结果.txt"
with open(output_path, 'w', encoding='utf-8') as f:
    for word in all_words:
        f.write(f"{word}\n")
# Read the segmentation file back and count word frequencies
with open(output_path, 'r', encoding='utf-8') as f:
    words = f.read().splitlines()
word_counts = Counter(words)
# Write the frequency statistics, most frequent first
stats_output_path = r"D:\公库\关键词拓扑\关键词频结果.txt"
with open(stats_output_path, 'w', encoding='utf-8') as f:
    for word, count in word_counts.most_common():
        f.write(f"{word}: {count}\n")
# Extract keywords of three or more characters
three_char_words = [word for word in word_counts if len(word) >= 3]
# Append them to the non-split-word file; existing entries stay untouched,
# and words already in the file are skipped to avoid duplicates
with open(r"D:\公库\关键词拓扑\非拆分词.txt", 'a', encoding='utf-8') as f:
    for word in three_char_words:
        if word not in non_split_words:
            f.write(f"{word}\n")
Finally, if conciseness is the goal, the code can be written far more compactly, as the sketch below shows; with different design priorities, the code logic will look quite different.
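A minimal sketch of that more compact direction, using the same paths and the same second-column assumption as above; note that it deliberately drops the intermediate 分词结果.txt round-trip and counts words in a single pass:

import re
from collections import Counter

import jieba
import pandas as pd

BASE = r"D:\公库\关键词拓扑"

# Load the dictionaries and register the non-split words with jieba
stopwords = {line.strip() for line in open(BASE + r"\中文停用词.txt", encoding='utf-8')}
non_split = {line.strip() for line in open(BASE + r"\非拆分词.txt", encoding='utf-8')}
for w in non_split:
    if w:
        jieba.add_word(w)

# Segment, strip punctuation, filter, and count in one pass
df = pd.read_excel(BASE + r"\文本\搜索结果_参考网.xlsx")
counts = Counter(
    w
    for text in df[df.columns[1]].dropna()
    for w in (re.sub(r'[^\w\s]', '', t) for t in jieba.cut(str(text)))
    if len(w) >= 2 and w not in stopwords
)

# Write the frequency statistics, then append the new 3+ character keywords
with open(BASE + r"\关键词频结果.txt", 'w', encoding='utf-8') as f:
    f.writelines(f"{w}: {c}\n" for w, c in counts.most_common())
with open(BASE + r"\非拆分词.txt", 'a', encoding='utf-8') as f:
    f.writelines(f"{w}\n" for w in counts if len(w) >= 3 and w not in non_split)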