Implementing average sentence length, part-of-speech ratios, keywords, punctuation and word-form statistics for a novel in Python
The requirements are as follows.
Code:
Part-of-speech ratios
import re

import jieba
import numpy as np
from PIL import Image
from wordcloud import WordCloud
import matplotlib.pyplot as plt

def read_file_gbk(filename):
    # Read a GBK-encoded file, then strip the '/C' POS tags
    # and all line breaks and whitespace.
    with open(filename, 'r', encoding='GBK') as f:
        s = f.read()
    s = re.sub('/C', '', s)
    s = re.sub(r'\r|\n|\s', '', s)
    return s
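As a quick sanity check, read_file_gbk can be exercised on its own; 'novel.txt' below is a hypothetical filename standing in for the actual GBK-encoded novel file:

text = read_file_gbk('novel.txt')  # hypothetical filename
print(len(text), text[:50])        # character count and a short preview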
def open_dict(Dict='hahah', path=r''):
    # Load the dictionary file <Dict>.txt (one entry per line) into a list.
    path = path + '%s.txt' % Dict
    words = []
    with open(path, 'r', encoding='utf-8') as dictionary:
        for word in dictionary:
            words.append(word.strip(' ,\n'))
    return words
def judgeodd(num):
    if (num % 2) == 0:
        return 'even'
    else:
        return 'odd'
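judgeodd drives the negation logic further down: an odd number of negation words before a sentiment word flips its polarity, while an even number (a double negation) cancels out. A minimal check:

print(judgeodd(1))  # 'odd'  -> one negation flips the sentiment word's sign
print(judgeodd(2))  # 'even' -> double negation keeps the original polarity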
# Sentiment lexicons, loaded from .txt files in the current directory.
deny_word = open_dict(Dict='否定词', path=r'')
posdict = open_dict(Dict='positive', path=r'')
negdict = open_dict(Dict='negative', path=r'')
degree_word = open_dict(Dict='程度级别词语', path=r'')

# 程度级别词语.txt groups degree adverbs under the section markers
# 'extreme', 'very', 'more', 'ish' and 'last'; slice each group out.
mostdict = degree_word[degree_word.index('extreme') + 1:degree_word.index('very')]
verydict = degree_word[degree_word.index('very') + 1:degree_word.index('more')]
moredict = degree_word[degree_word.index('more') + 1:degree_word.index('ish')]
ishdict = degree_word[degree_word.index('ish') + 1:degree_word.index('last')]
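The slicing above assumes the degree-word file is laid out as labeled sections with one entry per line; a sketch of the expected format follows (the Chinese adverbs are illustrative, only the five English section markers are required by the code):

extreme
极其
最
very
很
非常
more
较
比较
ish
稍稍
last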
def sentiment_score_list(dataset):
    # str.split() does not take a regex pattern, so use re.split to break
    # the text into sentences at 。, ! and ?.
    seg_sentence = re.split('[。!?]', dataset)
    count1 = []
    count2 = []
    for sen in seg_sentence:
        segtmp = jieba.lcut(sen, cut_all=False, HMM=False)  # tokenize the sentence
        i = 0          # position of the word currently being scanned
        a = 0          # position just past the previous sentiment word
        poscount = 0   # raw score of the current positive word
        poscount2 = 0  # score accumulated through negation flips
        poscount3 = 0  # total positive score for the sentence
        negcount = 0
        negcount2 = 0
        negcount3 = 0
        for word in segtmp:
            if word in posdict:
                poscount += 1
                c = 0
                # Scan the words between the previous sentiment word and
                # this one for degree adverbs and negation words.
                for w in segtmp[a:i]:
                    if w in mostdict:
                        poscount *= 4.0
                    elif w in verydict:
                        poscount *= 3.0
                    elif w in moredict:
                        poscount *= 2.0
                    elif w in ishdict:
                        poscount *= 0.5
                    elif w in deny_word:
                        c += 1
                if judgeodd(c) == 'odd':  # odd number of negations: flip the sign
                    poscount *= -1.0
                    poscount2 += poscount
                    poscount = 0
                    poscount3 = poscount + poscount2 + poscount3
                    poscount2 = 0