# Python分词练习 (Python word-segmentation exercise)

import matplotlib.pyplot as plt
import jieba
import jieba.analyse
from wordcloud import WordCloud, ImageColorGenerator
from PIL import Image, ImageSequence
import numpy as np


def cutWord(cutWordsFile, userWordsFile=None, stopWordsFile=None):
    """Segment a UTF-8 text file with jieba and count its keywords.

    Args:
        cutWordsFile: Path of the UTF-8 text file to segment.
        userWordsFile: Optional path of a UTF-8 file holding one custom word
            per line. Every non-blank line is registered with
            ``jieba.add_word`` before segmentation.
        stopWordsFile: Optional path of a UTF-8 file holding one stop word
            per line. When given, stop words and tokens of length <= 1 are
            removed from the segmentation result.

    Returns:
        A two-element list ``[tokens, keyword_counts]``:

        * ``tokens`` - the segmented tokens (filtered when ``stopWordsFile``
          is given).
        * ``keyword_counts`` - a list of ``[word, count]`` pairs, one per
          keyword returned by ``jieba.analyse.extract_tags``, sorted by
          ``count`` in descending order. ``count`` is the number of times
          the keyword occurs in ``tokens``.
    """
    # Local import keeps this function self-contained.
    from collections import Counter

    if userWordsFile is not None:
        with open(userWordsFile, 'r', encoding='utf-8') as f:
            for line in f:
                word = line.strip()
                # Skip blank lines so no empty word is added to the dictionary.
                if word:
                    jieba.add_word(word)

    # Read through a context manager so the handle is always closed.
    with open(cutWordsFile, 'r', encoding='utf-8') as f:
        text = f.read()
    cutWordsList = list(jieba.cut(text, cut_all=False))

    if stopWordsFile is not None:
        with open(stopWordsFile, 'r', encoding='utf-8') as f:
            # A set gives O(1) membership tests in the filter below.
            stopWords = {line.strip('\n') for line in f}
        cutWordsList = [
            word for word in cutWordsList
            if len(word) > 1 and word not in stopWords
        ]

    # NOTE(review): the tokens are concatenated without a separator and then
    # re-analysed, so extract_tags re-segments the text on its own; a keyword
    # it reports may therefore be absent from cutWordsList (count 0).
    sentence = "".join(cutWordsList)
    # topK=None asks for every keyword. The previous topK=-1 was meant as
    # "all" but, being used as a slice bound, drops the last keyword.
    tags = jieba.analyse.extract_tags(sentence, topK=None, withWeight=False)

    # Count every token once instead of calling list.count() per keyword.
    counts = Counter(cutWordsList)
    keywordCounts = [[tag, counts[tag]] for tag in tags]
    # One stable sort after the list is complete; ties keep extract_tags order.
    keywordCounts.sort(key=lambda pair: pair[1], reverse=True)
    return [cutWordsList, keywordCounts]

def plotWordCloud(wordList, topK=100, backColoringPath="bsz.jpg",
                  fontPath=r'C:\Windows\Fonts\simkai.ttf'):
    """Render a word cloud shaped and colored by a background image.

    Args:
        wordList: Sequence of ``[word, count]`` pairs, most important first
            (the second element returned by ``cutWord``).
        topK: Maximum number of words to draw. Only the first ``topK``
            entries of ``wordList`` are used; fewer entries are accepted.
        backColoringPath: Path of the image used both as the cloud mask and
            as the color source.
        fontPath: Path of a font file able to render the words (a CJK font
            for Chinese text).

    Raises:
        Whatever ``WordCloud.generate_from_frequencies`` raises when no
        usable word is supplied (e.g. an empty ``wordList``).
    """
    # Load the mask into an array and release the image file right away.
    with Image.open(backColoringPath) as image:
        graph = np.array(image)

    wc = WordCloud(
        font_path=fontPath,        # font used to draw the words
        background_color="white",  # canvas background color
        max_words=topK,            # upper bound on words drawn
        mask=graph,                # shape of the cloud
        max_font_size=100,         # largest font size
    )

    # Slice instead of indexing range(topK): a list shorter than topK must
    # not raise IndexError. max() keeps a negative topK meaning "no words".
    keyWords = {word: count for word, count in wordList[:max(topK, 0)]}
    wc.generate_from_frequencies(keyWords)

    # Recolor the words with colors taken from the background image.
    image_color = ImageColorGenerator(graph)
    plt.imshow(wc.recolor(color_func=image_color))
    plt.axis("off")  # hide the axes around the picture
    plt.show()

def main():
    """Segment ``bsz.txt``, print its keyword counts and show the word cloud."""
    result = cutWord(
        'bsz.txt',
        userWordsFile='userWordsFile.txt',
        stopWordsFile='stopWordsFile.txt',
    )
    # The second element holds the [word, count] pairs sorted by count.
    keywordCounts = result[1]
    print(keywordCounts)
    plotWordCloud(keywordCounts)
   








# Run the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()


  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值