Semantic Analysis: Word Cloud and Word Frequency | Python WordCloud

Environment: Jupyter Notebook (Python 3.12.7) + WordCloud

1. Install the required packages:

pip install wordcloud matplotlib jieba

Check whether the installation succeeded:

import jieba
print("安装成功!" if jieba.__version__ else "失败")
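Optionally, verify all three packages at once. A minimal sketch, assuming each package exposes a __version__ attribute (current releases do):

import jieba, wordcloud, matplotlib

# An ImportError here means the corresponding package is not installed
print("jieba:", jieba.__version__)
print("wordcloud:", wordcloud.__version__)
print("matplotlib:", matplotlib.__version__)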

2. Run the code on a sample text:

# -*- coding: utf-8 -*-
import os
import jieba
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from collections import Counter
from matplotlib import font_manager

# ===== 1. Chinese font setup =====
def init_chinese_font():
    """Find a font that can render Chinese and point matplotlib at it."""
    # Try common Chinese fonts for Windows/Mac/Linux; findfont raises ValueError
    # when the family is missing and fallback_to_default is disabled
    for family in ['SimHei', 'Microsoft YaHei', 'WenQuanYi Micro Hei', 'PingFang SC']:
        try:
            font_path = font_manager.findfont(
                font_manager.FontProperties(family=family),
                fallback_to_default=False
            )
            plt.rcParams['font.sans-serif'] = [family]
            plt.rcParams['axes.unicode_minus'] = False
            return font_path
        except ValueError:
            continue
    raise RuntimeError("中文字体配置失败,请安装SimHei字体")

# ===== 2. Sample text =====
text = """
自然语言处理是人工智能领域的重要方向,深度学习技术极大地推动了该领域的发展。
中文分词、词性标注、命名实体识别都是中文信息处理的基础任务。近年来,
基于Transformer的预训练模型如BERT、GPT等在多个NLP任务上取得了突破性进展。
研究者们正在探索更高效的模型架构和训练方法,同时关注模型的可解释性和能耗问题。
"""

# ===== 3. Word segmentation =====
def chinese_text_segment(text):
    """Segment Chinese text with jieba and filter out stopwords."""
    # Load a stopword file if present, otherwise fall back to a small built-in set
    if os.path.exists('stopwords.txt'):
        with open('stopwords.txt', encoding='utf-8') as f:
            stopwords = {line.strip() for line in f if line.strip()}
    else:
        stopwords = {"的", "了", "和", "是", "在", "等", "如", "基于", "个", "都"}

    # Precise-mode segmentation; keep tokens of length >= 2 that are not stopwords
    words = jieba.cut(text)
    return [word for word in words
            if len(word) > 1
            and word not in stopwords
            and not word.isspace()]

# ===== 4. Generate the word cloud =====
try:
    # Configure the font
    font_path = init_chinese_font()
    print(f"当前使用字体:{font_path}")

    # Segment the text
    words = chinese_text_segment(text)
    print("分词结果示例:", words[:10])

    # Build the WordCloud object
    wc = WordCloud(
        font_path=font_path,
        width=1200,
        height=800,
        background_color='white',
        max_words=150,
        collocations=False,
        scale=3  # increase output resolution
    ).generate(" ".join(words))

    # Display and save
    plt.figure(figsize=(15, 10))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis("off")
    plt.title("自然语言处理主题词云", pad=20, fontsize=18)
    plt.savefig('nlp_wordcloud.png', dpi=300, bbox_inches='tight')
    plt.show()

    # Top word frequencies
    word_freq = Counter(words).most_common(15)
    print("\n高频词TOP15:")
    for idx, (word, count) in enumerate(word_freq, 1):
        print(f"{idx:2d}. {word:<6} {count:>3}次")

except Exception as e:
    print(f"执行出错:{str(e)}")
    print("故障排除:")
    print("1. 确保已安装中文字体:https://zhuanlan.zhihu.com/p/499533144")
    print("2. 检查文件权限")
    print("3. 更新依赖库:pip install --upgrade wordcloud jieba matplotlib")

Output:
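All of the scripts in this post look for an optional stopwords.txt in the working directory and fall back to a small built-in set when it is missing. If you prefer a file-based list, a minimal sketch for writing one (the word list is only an illustrative starting point):

# Write a small stopword list to stopwords.txt (UTF-8, one word per line);
# extend it with whatever filler words show up in your own corpus
stopwords = ["的", "了", "和", "是", "在", "等", "如", "基于", "个", "都", "就", "也", "与", "及"]
with open("stopwords.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(stopwords))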

3. Replace the sample text with a specified txt file on the desktop and run the new code below.
(An input.txt file was created on the desktop beforehand.)
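If you do not want to create the file by hand, a minimal sketch for generating it, assuming a standard ~/Desktop folder and reusing the earlier sample sentence as placeholder content:

import os

# Write a small UTF-8 test file named input.txt onto the desktop
desktop = os.path.join(os.path.expanduser("~"), "Desktop")
with open(os.path.join(desktop, "input.txt"), "w", encoding="utf-8") as f:
    f.write("自然语言处理是人工智能领域的重要方向,深度学习技术极大地推动了该领域的发展。")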

# -*- coding: utf-8 -*-
import os
import jieba
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from collections import Counter
from matplotlib import font_manager

# ===== 1. Chinese font setup =====
def init_chinese_font():
    """Find a font that can render Chinese and point matplotlib at it."""
    # Try common Chinese fonts for Windows/Mac/Linux; findfont raises ValueError
    # when the family is missing and fallback_to_default is disabled
    for family in ['SimHei', 'Microsoft YaHei', 'WenQuanYi Micro Hei', 'PingFang SC']:
        try:
            font_path = font_manager.findfont(
                font_manager.FontProperties(family=family),
                fallback_to_default=False
            )
            plt.rcParams['font.sans-serif'] = [family]
            plt.rcParams['axes.unicode_minus'] = False
            return font_path
        except ValueError:
            continue
    raise RuntimeError("中文字体配置失败,请安装SimHei字体")

# ===== 2. File reading =====
def read_desktop_file(filename="input.txt"):
    """Read a text file from the desktop."""
    # Desktop path (cross-platform)
    desktop_path = os.path.join(os.path.expanduser('~'), 'Desktop')
    file_path = os.path.join(desktop_path, filename)

    # Try common encodings in turn
    encodings = ['utf-8', 'gbk', 'gb2312', 'gb18030']
    for encoding in encodings:
        try:
            with open(file_path, 'r', encoding=encoding) as f:
                return f.read()
        except UnicodeDecodeError:
            continue
    raise UnicodeError(f"无法解码文件 {filename},请确保使用UTF-8/GBK编码")

# ===== 3. Word segmentation =====
def chinese_text_segment(text):
    """Segment Chinese text with jieba and filter out stopwords."""
    # Load a stopword file if present, otherwise fall back to a small built-in set
    if os.path.exists('stopwords.txt'):
        with open('stopwords.txt', encoding='utf-8') as f:
            stopwords = {line.strip() for line in f if line.strip()}
    else:
        stopwords = {"的", "了", "和", "是", "在", "等", "如", "基于", "个", "都"}

    # Precise-mode segmentation; keep tokens of length >= 2 that are not stopwords
    words = jieba.cut(text)
    return [word for word in words
            if len(word) > 1
            and word not in stopwords
            and not word.isspace()]

# ===== 4. Generate the word cloud =====
try:
    # Configure the font
    font_path = init_chinese_font()
    print(f"当前使用字体:{font_path}")

    # Read the desktop document (try several common filenames)
    doc_names = ['input.txt', 'text.txt', '文档.txt', 'data.txt']
    for name in doc_names:
        if os.path.exists(os.path.join(os.path.expanduser('~'), 'Desktop', name)):
            text = read_desktop_file(name)
            print(f"成功读取文档:{name}")
            break
    else:
        raise FileNotFoundError("未找到输入文件,请在桌面创建input.txt/text.txt/文档.txt")

    # Segment the text
    words = chinese_text_segment(text)
    print("分词结果示例:", words[:10])

    # Build the WordCloud object
    wc = WordCloud(
        font_path=font_path,
        width=1200,
        height=800,
        background_color='white',
        max_words=150,
        collocations=False,
        scale=3  # increase output resolution
    ).generate(" ".join(words))

    # Display and save
    plt.figure(figsize=(15, 10))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis("off")
    plt.title("文档分析词云", pad=20, fontsize=18)
    output_path = os.path.join(os.path.expanduser('~'), 'Desktop', 'wordcloud.png')
    plt.savefig(output_path, dpi=300, bbox_inches='tight')
    print(f"词云已保存至桌面:{output_path}")
    plt.show()

    # Top word frequencies
    word_freq = Counter(words).most_common(15)
    print("\n高频词TOP15:")
    for idx, (word, count) in enumerate(word_freq, 1):
        print(f"{idx:2d}. {word:<6} {count:>3}次")

except Exception as e:
    print(f"执行出错:{str(e)}")
    print("故障排除:")
    print("1. 请在桌面创建包含中文文本的input.txt/text.txt文件")
    print("2. 确保文件编码为UTF-8或GBK")
    print("3. 中文字体安装指南:https://zhuanlan.zhihu.com/p/499533144")
    print("4. 更新依赖库:pip install --upgrade wordcloud jieba matplotlib")

Output:

4. Replace the single desktop txt file with multiple txt files in a specified folder and run the new code:

# -*- coding: utf-8 -*-
import os
import jieba
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from collections import Counter
from matplotlib import font_manager

# ===== 1. Chinese font setup =====
def init_chinese_font():
    """Find a font that can render Chinese and point matplotlib at it."""
    # Try common Chinese fonts for Windows/Mac/Linux; findfont raises ValueError
    # when the family is missing and fallback_to_default is disabled
    for family in ['SimHei', 'Microsoft YaHei', 'WenQuanYi Micro Hei', 'PingFang SC']:
        try:
            font_path = font_manager.findfont(
                font_manager.FontProperties(family=family),
                fallback_to_default=False
            )
            plt.rcParams['font.sans-serif'] = [family]
            plt.rcParams['axes.unicode_minus'] = False
            return font_path
        except ValueError:
            continue
    raise RuntimeError("中文字体配置失败,请安装SimHei字体")

# ===== 2. Batch file reading =====
def read_txt_files(folder_path):
    """Read and merge every txt file in a given folder."""
    # Validate the path
    if not os.path.exists(folder_path):
        raise FileNotFoundError(f"路径不存在: {folder_path}")
    if not os.path.isdir(folder_path):
        raise NotADirectoryError(f"不是有效文件夹: {folder_path}")

    # Collect all txt files in the folder
    txt_files = [f for f in os.listdir(folder_path) 
                if f.lower().endswith('.txt')]
    
    if not txt_files:
        raise FileNotFoundError(f"文件夹内未找到txt文件: {folder_path}")

    # Merge the contents of all files
    all_text = []
    for filename in txt_files:
        file_path = os.path.join(folder_path, filename)
        print(f"正在读取: {filename}")
        
        # Try common encodings in turn
        encodings = ['utf-8', 'gbk', 'gb2312', 'gb18030']
        for encoding in encodings:
            try:
                with open(file_path, 'r', encoding=encoding) as f:
                    all_text.append(f.read())
                    break
            except UnicodeDecodeError:
                continue
        else:
            print(f"警告: {filename} 解码失败,已跳过")
    
    return '\n'.join(all_text)

# ===== 3. Enhanced segmentation =====
def chinese_text_segment(text):
    """Segment text, filter stopwords, and keep only selected parts of speech."""
    # Load a stopword file if present, otherwise fall back to a small built-in set
    if os.path.exists('stopwords.txt'):
        with open('stopwords.txt', 'r', encoding='utf-8') as f:
            stopwords = {line.strip() for line in f if line.strip()}
    else:
        stopwords = {"的", "了", "和", "是", "在", "等", "如", "个", "都"}

    # POS-based filtering: keep nouns, verbs, adjectives, and English tokens
    import jieba.posseg as pseg

    words = []
    for word, flag in pseg.cut(text):
        if (len(word) > 1 and
                (flag.startswith(('n', 'v', 'a')) or flag == 'eng') and
                word not in stopwords and
                not word.isspace()):
            words.append(word)
    return words

# ===== 4. Main program =====
if __name__ == "__main__":
    try:
        # ==== User configuration ====
        TARGET_FOLDER = r"D:\数据分析\文本资料"  # change this to your own folder path
        
        # Configure the font
        font_path = init_chinese_font()
        print(f"字体配置成功: {font_path}")

        # Read and merge the texts
        combined_text = read_txt_files(TARGET_FOLDER)
        print(f"共读取到{len(combined_text)}个字符")

        # Segment the text
        words = chinese_text_segment(combined_text)
        print(f"有效词汇总数: {len(words)}")
        print("示例词汇:", words[:15])

        # Generate the word cloud
        wc = WordCloud(
            font_path=font_path,
            width=1600,
            height=1200,
            background_color='white',
            max_words=300,
            collocations=False,
            scale=3
        ).generate(" ".join(words))

        # Save the result
        output_path = os.path.join(TARGET_FOLDER, "合并分析_词云.png")
        plt.figure(figsize=(18, 12))
        plt.imshow(wc, interpolation="bilinear")
        plt.axis("off")
        plt.savefig(output_path, dpi=300, bbox_inches="tight")
        print(f"\n词云已保存至: {output_path}")

        # Top word frequencies
        counter = Counter(words).most_common(30)
        print("\n高频词TOP30:")
        for idx, (word, count) in enumerate(counter, 1):
            print(f"{idx:2d}. {word:<8} {count:>4}次")

    except Exception as e:
        print(f"\n错误发生: {str(e)}")
        print("排查建议:")
        print("1. 检查文件夹路径是否正确(建议使用原始字符串格式)")
        print("2. 确保文件夹内有.txt文件")
        print("3. 尝试将文件编码统一转为UTF-8")
        print("4. 安装必要字体: https://zhuanlan.zhihu.com/p/499533144")

Remember to replace the folder path in the code with the one you need:
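Any of the following forms works; the folder names are only placeholders for your own (on Windows the r"" raw-string prefix keeps backslashes from being read as escape sequences):

from pathlib import Path

TARGET_FOLDER = r"D:\数据分析\文本资料"               # raw string, Windows-style backslashes
TARGET_FOLDER = "D:/数据分析/文本资料"                 # forward slashes also work on Windows
TARGET_FOLDER = str(Path.home() / "Desktop" / "语料")  # e.g. a folder on the desktop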

Output:

In general, you can first run the word cloud end to end to see how well the automatic segmentation performs, and then adjust the dictionary for your domain to refine the results.
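"Adjusting the dictionary" here refers to jieba's user dictionary: register domain terms so they are no longer split into fragments. A minimal sketch (the terms and the userdict.txt filename are only examples):

import os
import jieba

# Register individual domain terms so jieba keeps them as single tokens
jieba.add_word("预训练模型")
jieba.add_word("命名实体识别")

# Or maintain a whole custom dictionary file: one entry per line in the form
# "word [frequency] [POS tag]", e.g.  预训练模型 10 n
if os.path.exists("userdict.txt"):
    jieba.load_userdict("userdict.txt")

print(list(jieba.cut("基于Transformer的预训练模型在命名实体识别任务上表现出色")))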
