Implementing average sentence length, part-of-speech ratios, keyword extraction, punctuation statistics, and word-form statistics for a novel in Python
The requirements are as follows.

Code:
Part-of-speech ratio
import re
import jieba
from wordcloud import WordCloud        # kept from the original snippet for later plotting
from PIL import Image
import matplotlib.pyplot as plt

def read_file_gbk(filename):
    # Read a GBK-encoded text file and strip '/C' markers and all whitespace.
    with open(filename, 'r', encoding='GBK') as f:
        s = f.read()
    s = re.sub('/C', '', s)
    s = re.sub(r'\r|\n|\s', '', s)
    return s
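For the average-sentence-length requirement from the title, a minimal sketch might look like the following. It reuses read_file_gbk from above; the sentence delimiter set, the helper name average_sentence_length, and the file name novel.txt are assumptions for illustration, not part of the posted code.

def average_sentence_length(text):
    # Split on common Chinese/Latin end-of-sentence punctuation (assumed delimiter set).
    sentences = [s for s in re.split(r'[。!?!?]', text) if s.strip()]
    if not sentences:
        return 0.0
    # Average number of jieba tokens per sentence.
    total_words = sum(len(jieba.lcut(s)) for s in sentences)
    return total_words / len(sentences)

# Hypothetical usage:
# text = read_file_gbk('novel.txt')
# print('average sentence length:', average_sentence_length(text))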
import numpy as np

def open_dict(Dict='hahah', path=r''):
    # Load a one-word-per-line dictionary file named <Dict>.txt under path.
    path = path + '%s.txt' % Dict
    words = []
    with open(path, 'r', encoding='utf-8') as dictionary:
        for word in dictionary:
            words.append(word.strip(' ,\n'))
    return words
def judgeodd(num):
    if (num % 2) == 0:
        return 'even'
    else:
        return 'odd'
deny_word = open_dict(Dict='否定词', path=r'')
posdict = open_dict(Dict='positive', path=r'')
negdict = open_dict(Dict='negative', path=r'')
degree_word = open_dict(Dict='程度级别词语', path=r'')
# The degree-word file contains section markers ('extreme', 'very', 'more', 'ish', 'last');
# the slices below split it into intensity tiers between those markers.
mostdict = degree_word[degree_word.index('extreme') + 1: degree_word.index('very')]
verydict = degree_word[degree_word.index('very') + 1: degree_word.index('more')]
moredict = degree_word[degree_word.index('more') + 1: degree_word.index('ish')]
ishdict = degree_word[degree_word.index('ish') + 1: degree_word.index('last')]
def sentiment_score_list(dataset):
    # str.split does not take a regex, so split into sentences with re.split.
    seg_sentence = re.split('。|!|?', dataset)
    count1 = []
    count2 = []
    for sen in seg_sentence:
        segtmp = jieba.lcut(sen, cut_all=False, HMM=False)
        i = 0          # index of the current word in the sentence
        a = 0          # index just after the previous sentiment word
        poscount = 0   # score of the current positive word
        poscount2 = 0  # running positive score
        poscount3 = 0  # accumulated positive score for the sentence
        negcount = 0
        negcount2 = 0
        negcount3 = 0
        for word in segtmp:
            if word in posdict:
                poscount += 1
                c = 0
                # Scan the words between the previous sentiment word and this one
                # for degree adverbs and negation words.
                for w in segtmp[a:i]:
                    if w in mostdict:
                        poscount *= 4.0
                    elif w in verydict:
                        poscount *= 3.0
                    elif w in moredict:
                        poscount *= 2.0
                    elif w in ishdict:
                        poscount *= 0.5
                    elif w in deny_word:
                        c += 1
                if judgeodd(c) == 'odd':  # an odd number of negations flips the polarity
                    poscount *= -1.0
                    poscount2 += poscount
                    poscount = 0
                    poscount3 = poscount + poscount2 + poscount3
                    poscount2 = 0
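The snippet above is the dictionary-based scoring part. For the remaining requirements named in the title, part-of-speech ratios, keywords, punctuation counts, and word-form counts, a rough sketch could look like the following. It uses jieba.posseg for POS tagging and jieba.analyse.extract_tags for TF-IDF keywords; the punctuation set, the topk value, the helper names, and the reading of "word-form statistics" as token frequencies are assumptions for illustration.

from collections import Counter
import jieba
import jieba.posseg as pseg
import jieba.analyse

def pos_ratio(text):
    # Share of each part-of-speech tag among all tagged words.
    tags = [w.flag for w in pseg.cut(text)]
    counts = Counter(tags)
    total = sum(counts.values())
    return {tag: n / total for tag, n in counts.items()}

def keywords(text, topk=20):
    # Top-K keywords ranked by TF-IDF weight.
    return jieba.analyse.extract_tags(text, topK=topk, withWeight=True)

def punctuation_counts(text):
    # Count common Chinese punctuation marks; the character set is an assumption.
    puncts = set('，。、；：？！“”‘’（）《》…—')
    return Counter(ch for ch in text if ch in puncts)

def word_form_counts(text):
    # Frequency of each distinct word form (token) after segmentation.
    return Counter(jieba.lcut(text))

# Hypothetical usage, reusing read_file_gbk from above:
# text = read_file_gbk('novel.txt')
# print(pos_ratio(text))
# print(keywords(text))
# print(punctuation_counts(text))
# print(word_form_counts(text).most_common(20))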