首先安装必要的包(注意:xlrd 2.0 及以上版本仅支持旧版二进制 .xls 格式,不再支持 .xlsx;若需处理 .xlsx 请改用 openpyxl):
pip install xlrd jieba
代码如下:
import xlrd
import jieba
from collections import Counter
def read_xls_and_analyze(file_path):
    """Read every cell of the first sheet of an .xls workbook, segment the
    Chinese text with jieba, print a word-frequency report, and return it.

    Args:
        file_path: Path to the .xls workbook. Note: xlrd >= 2.0 only reads
            the legacy binary .xls format, not .xlsx.

    Returns:
        collections.Counter mapping each segmented word to its frequency
        (also printed to stdout, most common first).
    """
    workbook = xlrd.open_workbook(file_path)
    sheet = workbook.sheet_by_index(0)  # Only the first worksheet is analyzed.

    # Accumulate every segmented word across all string cells.
    all_words = []
    for row_idx in range(sheet.nrows):
        for col_idx in range(sheet.ncols):
            cell_value = sheet.cell_value(row_idx, col_idx)
            # Numeric/date/empty cells are skipped; only strings can hold text.
            if isinstance(cell_value, str):
                # Keep only CJK Unified Ideographs (U+4E00..U+9FFF); ASCII,
                # digits, and punctuation are dropped before segmentation.
                chinese_text = ''.join(
                    char for char in cell_value if '\u4e00' <= char <= '\u9fff'
                )
                if chinese_text:
                    all_words.extend(jieba.lcut(chinese_text))

    word_counts = Counter(all_words)
    for word, count in word_counts.most_common():
        print(f"{word}: {count}")
    # Returning the Counter (previously None) lets callers reuse the result;
    # existing callers that ignore the return value are unaffected.
    return word_counts
# Usage example. Guarded so that importing this module for its function
# does not immediately open a hard-coded file.
if __name__ == "__main__":
    # Raw string so the Windows-path backslashes are taken literally.
    file_path = r'C:\Users\admin\Desktop\gen500+sep500-eval.xls'
    read_xls_and_analyze(file_path)
运行上述脚本后,输出的词频统计结果如下: