python信息熵的计算

最新推荐文章于 2023-03-10 00:20:02 发布

潔～

最新推荐文章于 2023-03-10 00:20:02 发布

阅读量4.1k

点赞数 4

分类专栏：文本挖掘　文章标签：python、信息熵

本文链接：https://blog.csdn.net/wyj95anan/article/details/107745341

版权

文本挖掘专栏收录该内容

4 篇文章 1 订阅

订阅专栏

信息熵：衡量文本中词语分布的不确定性。信息熵越大，文本包含的信息越多，与以前的消息相比，措辞越独特。计算公式为 $H(X) = -\sum_{x} p(x)\log_2 p(x)$，其中 $p(x)$ 是词 $x$ 在文档中出现的频率。

参考：
信息熵求解部分：https://www.jianshu.com/p/468e2af86d59

导入包

import numpy as np
import pandas as pd
import nltk
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer

计算信息熵的方法

def calc_ent(x):
    """Return the Shannon entropy (in bits) of the values in ``x``.

    The entropy is ``-sum(p * log2(p))`` over the distinct values of ``x``,
    where ``p`` is the relative frequency of each value.

    Args:
        x: A 1-D NumPy array (or any sequence ``np.asarray`` accepts) of
            comparable values, e.g. the tokens of one document.

    Returns:
        The entropy as a float. An empty input has no distribution to
        measure and yields ``0.0``.
    """
    values = np.asarray(x)
    if values.size == 0:
        return 0.0

    # One pass to count every distinct value instead of re-scanning the
    # whole array once per distinct value.
    _, counts = np.unique(values, return_counts=True)
    probs = counts / values.size

    # Subtracting from 0.0 (rather than negating) keeps the result at +0.0
    # instead of -0.0 when all values are identical.
    return float(0.0 - np.sum(probs * np.log2(probs)))

文本预处理

# Load the input table; the preprocessing loop reads its 'Ideas' column.
urls = pd.read_csv(r"E:\IdeasTest.csv", encoding="utf-8")

# Number of rows (documents) in the input table.
m = len(urls)
print(m)

# Collects one list of processed tokens per document.
texts = []

def removePunctuation(text, chars=None):
    """Strip punctuation characters from ``text`` and normalise it.

    Every character listed in ``chars`` is deleted, then surrounding
    whitespace is trimmed and the result is lower-cased.

    Args:
        text: The string to clean.
        chars: String of characters to delete. When omitted, the
            module-level ``punctuation`` string is used.

    Returns:
        The cleaned, stripped, lower-cased string.
    """
    if chars is None:
        chars = punctuation
    if not chars:
        # An empty character class is not a valid regex; nothing to delete.
        return text.strip().lower()

    # Escape the characters before placing them in a character class:
    # unescaped, a ']' in the list closes the class early and '-' / '^'
    # take on special meaning, so most of the list would never match.
    pattern = '[{}]+'.format(re.escape(chars))
    return re.sub(pattern, '', text).strip().lower()

# Porter stemmer used to reduce English words to their stems.
p_stemmer = PorterStemmer()
# English stop-word list from the NLTK corpus.
en_stop = stopwords.words("english")

# Extra (mostly full-width / CJK) punctuation read by removePunctuation() as a
# module-level global. It never changes, so define it once rather than on
# every row.
punctuation = '.!,;:?"\'"&/()+*=~@#$%^_{}[]|`°...、——【】‘’“”？《》，。·！……：；-<>'

# Translation tables are loop-invariant: build them once.
remove = str.maketrans('', '', string.punctuation)
remove_digits = str.maketrans('', '', string.digits)

# Tokens that consist only of punctuation and must be dropped after
# tokenisation.
noise_tokens = frozenset([
    '...', '°', '—', '_', '？', '。', '，', '》', '《', '”', '“', '；', '：',
    '、', '【', '】', '=', '-', '——', '……', '！', '·', '~', '‘', '’', '/',
    '^', '+', '<', '>', '{', '}', '*', '//', ',', '.', ':', ';', '?', '(',
    ')', '[', ']', '&', '!', '$', '%', '@', '|', '`', '#',
])

# A set gives constant-time stop-word membership tests.
stop_words = set(en_stop)

for csvfile in urls['Ideas']:
    print(csvfile)

    # Lower-case, drop ASCII punctuation, then the extra punctuation, then
    # digits (same order as before, one translate pass per step).
    lower = csvfile.lower()
    without_punctuation = lower.translate(remove)
    without_punctuation1 = removePunctuation(without_punctuation)
    without_punctuation1 = without_punctuation1.translate(remove_digits)

    # Tokenise and discard tokens that are pure punctuation.
    tokens = [
        tok for tok in word_tokenize(without_punctuation1)
        if tok not in noise_tokens
    ]
    print(tokens)

    # Remove English stop words.
    stopped_tokens = [w for w in tokens if w not in stop_words]
    print(stopped_tokens)

    # Stem the remaining tokens.
    stemmed_tokens = [p_stemmer.stem(w) for w in stopped_tokens]
    print(stemmed_tokens)

    # Add this document's tokens to the corpus list.
    texts.append(stemmed_tokens)
print(texts)

文档信息熵计算

# One single-element row per document, in the same order as `texts`.
content = []
for doc_tokens in texts:
    # calc_ent works on a NumPy array, so convert the token list first.
    textnp = np.array(doc_tokens)
    entropy = calc_ent(textnp)
    content.append([entropy])

# Build the table and write the CSV once, after every row is collected,
# instead of rewriting the whole file on each iteration. With no documents
# this now writes a header-only file.
dd = pd.DataFrame(content, columns=['entropy'])
dd.to_csv(r"E:\entropy.csv")