python中wordcloud如何自定义形状和中文词云呢

最新推荐文章于 2024-06-23 22:01:32 发布

原创最新推荐文章于 2024-06-23 22:01:32 发布 · 436 阅读

0 ·

CC 4.0 BY-SA版权

文章标签：

#python #numpy #开发语言

python 专栏收录该内容

5 篇文章

订阅专栏

该示例展示了如何结合Jieba分词库和自定义鹦鹉图像形状，生成带有中文的词云图。通过处理图像边缘，增强颜色对比，并使用自定义字体，创建出具有鹦鹉轮廓的词云，并排除停用词，提高词云的可读性。

自定义鹦鹉形状词云

中文乱码示例：wordcloud 的默认字体不支持中文，如果不通过 font_path 指定中文字体，中文词语就无法正常显示（出现乱码）。

"""
Image-colored wordcloud with boundary map
=========================================
A slightly more elaborate version of an image-colored wordcloud
that also takes edges in the image into account.
Recreating an image similar to the parrot example.
"""

import os
from PIL import Image

import numpy as np
import matplotlib.pyplot as plt
from scipy.ndimage import gaussian_gradient_magnitude

from wordcloud import WordCloud, ImageColorGenerator

# Resolve the data directory: the folder of this script when __file__ is
# defined, otherwise the current working directory (needed to support running
# the example in a generated IPython notebook, where __file__ does not exist).
d = os.path.dirname(__file__) if "__file__" in locals() else os.getcwd()

# Load the source text for the word cloud. The context manager closes the file
# handle deterministically instead of leaving it to the garbage collector.
with open(os.path.join(d, 'blackboard.md'), encoding="utf-8") as text_file:
    text = text_file.read()

# Load the image. This has been modified in gimp to be brighter and have more saturation.
parrot_color = np.array(Image.open(os.path.join(d, "parrot-by-jose-mari-gimenez2.jpg")))
# Subsample by a factor of 3. Very lossy, but for a wordcloud we don't really care.
parrot_color = parrot_color[::3, ::3]

# Create the mask: white is "masked out".
parrot_mask = parrot_color.copy()
# Pixels whose channels sum to 0 (pure black) are turned white so that they
# are masked out as well.
parrot_mask[parrot_mask.sum(axis=2) == 0] = 255

# Some finesse: we enforce boundaries between colors so they get less washed out.
# For that we do some edge detection in the image: the gradient magnitude is
# computed per color channel (sigma=2) and averaged over the three channels.
edges = np.mean([gaussian_gradient_magnitude(parrot_color[:, :, i] / 255., 2) for i in range(3)], axis=0)
# Strong edges are masked out too, so words do not straddle color boundaries.
parrot_mask[edges > .08] = 255

# Create the wordcloud. A bit sluggish; you can subsample more strongly for quicker rendering.
# relative_scaling=0 means the frequencies in the data are reflected less
# accurately, but it makes a better picture.
wc = WordCloud(max_words=2000, mask=parrot_mask, max_font_size=40, random_state=42, relative_scaling=0)

# Generate the word cloud and draw it with its default colors.
wc.generate(text)
plt.imshow(wc)

# Create the coloring from the image and apply it to the word cloud.
image_colors = ImageColorGenerator(parrot_color)
wc.recolor(color_func=image_colors)
plt.figure(figsize=(10, 10))
plt.imshow(wc, interpolation="bilinear")
# Save the recolored word cloud to disk.
wc.to_file("parrot_new.png")

乱码
乱码显示

自定义形状和中文词云的例子

加入 jieba 分词并通过 font_path 指定中文字体之后，结合上面的自定义形状，生成一个小鹦鹉词云。

"""
自定义形状和中文词云的例子
==========================

这个例子展示了如何从文本和自定义形状生成一个中文词云，使用Jieba进行中文分词和处理。

这个例子的自定义形状是一个鹦鹉的矢量图形。
"""

import jieba
import os
from PIL import Image

import numpy as np
import matplotlib.pyplot as plt
from scipy.ndimage import gaussian_gradient_magnitude

from wordcloud import WordCloud, ImageColorGenerator

# Resolve the data directory: the folder containing this script when
# __file__ is defined, otherwise the current working directory.
d = os.path.dirname(__file__) if "__file__" in locals() else os.getcwd()

def jieba_processing_txt(text):
    """Segment *text* with jieba and drop stop words and short tokens.

    Args:
        text: The raw text to segment.

    Returns:
        A single string containing the kept tokens separated by one space.
        A token is kept when, after stripping surrounding whitespace, it is
        longer than one character and is not listed in the stop-word file.
    """
    # Load the custom dictionary.
    # NOTE(review): this path is resolved against the current working
    # directory, while the stop-word file below is read from `d` -- confirm
    # that the difference is intended.
    jieba.load_userdict('userdict.txt')

    # Precise-mode segmentation; the result is consumed lazily below.
    segments = jieba.cut(text, cut_all=False)

    # One stop word per line. A set makes each membership test O(1) instead
    # of a linear scan of the whole list for every token.
    with open(os.path.join(d, 'stopwords_cn_en.txt'), encoding='utf-8') as f_stop:
        stopwords = set(f_stop.read().splitlines())

    # Iterate the tokens directly (no join/split round trip) and keep the
    # stripped form so the output is cleanly separated by single spaces.
    kept_words = [
        word
        for word in (segment.strip() for segment in segments)
        if len(word) > 1 and word not in stopwords
    ]
    return ' '.join(kept_words)

# Read the text data. The context manager closes the file handle
# deterministically; read() in text mode already returns a str, so no extra
# conversion is needed.
with open(os.path.join(d, 'blackboard.md'), encoding="utf-8") as text_file:
    text = text_file.read()

# Load the image that provides the custom shape and the colors.
parrot_color = np.array(Image.open(os.path.join(d, "parrot-by-jose-mari-gimenez2.jpg")))
# Subsample the image by a factor of 3 in both directions.
parrot_color = parrot_color[::3, ::3]

# Create the mask that restricts where words may be drawn.
parrot_mask = parrot_color.copy()
# White is "masked out": pixels whose channels sum to 0 (pure black) are
# turned white so that no words are placed there.
parrot_mask[parrot_mask.sum(axis=2) == 0] = 255

# Sharpen the boundaries between differently colored regions so the colors do
# not blend together: the gradient magnitude is computed per color channel
# (sigma=2), averaged over the three channels, and strong edges are masked out.
edges = np.mean([gaussian_gradient_magnitude(parrot_color[:, :, i] / 255., 2) for i in range(3)], axis=0)
parrot_mask[edges > .08] = 255

# Create the WordCloud object. font_path points at a font file in the data
# directory that is used to render the words.
wc = WordCloud(
    font_path=os.path.join(d, 'SmileySans-Oblique.otf'),
    max_words=2000,
    mask=parrot_mask,
    max_font_size=40,
    random_state=42,
    relative_scaling=0,
)

# Segment and filter the text, then generate the word cloud from the result.
wc.generate(jieba_processing_txt(text))
plt.imshow(wc)

# Extract the colors from the image and apply them to the word cloud.
image_colors = ImageColorGenerator(parrot_color)
wc.recolor(color_func=image_colors)

# Display the recolored word cloud.
plt.figure(figsize=(10, 10))
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.show()

# Write the word cloud image to a file.
wc.to_file("wordcloud.png")