先用划分段落的办法把每一个段落存进一个列表里面,再遍历每个段落,求出每个段落的句子个数
具体代码如下:
# -*-coding:utf-8 -*-
"""Print statistics on the number of sentences per paragraph for each essay.

The essays live under ``ESSAY_ROOT`` in sub-folders named ``1`` .. ``5``
(one folder per score).  Every essay is split into paragraphs, the sentences
of each paragraph are counted with a regular expression, and the mean,
population variance and standard deviation of those counts are printed.
"""
import os
import re
import sys

# These packages are not used by the statistics below.  They are kept because
# other parts of the file may rely on them, but made optional so that this
# script still runs on a machine where they are not installed.
try:
    import nltk
    # nltk.download()
    import numpy
    from scipy.stats import pearsonr
except ImportError:
    pass

# Python 2 only: switch the implicit str<->unicode codec to UTF-8.  Python 3
# has neither ``reload`` as a builtin nor ``sys.setdefaultencoding``.
if sys.version_info[0] == 2:
    reload(sys)
    sys.setdefaultencoding('utf8')

ESSAY_ROOT = '/home/zheng/firstproject2/essay/'
SCORE_LEVELS = 5

# A paragraph break is one or more newlines followed by at least two
# whitespace characters (i.e. the next paragraph starts indented).
_PARAGRAPH_BREAK = re.compile(r'[\n]+[\s]{2,}')
# A "sentence" is the shortest run, within one line, that contains at least
# one ASCII letter and ends in '.', '!' or '?'.
_SENTENCE = re.compile(r'(.*?[A-Za-z]+.*?[.!?])')


def split_paragraphs(content):
    """Return the list of paragraphs found in the essay text *content*."""
    return _PARAGRAPH_BREAK.split(content)


def paragraph_sentence_counts(content):
    """Return the number of sentences in each paragraph of *content*.

    The result always has at least one element: text without any paragraph
    break (including the empty string) is treated as a single paragraph.
    """
    return [len(_SENTENCE.findall(paragraph))
            for paragraph in split_paragraphs(content)]


def sentence_count_stats(counts):
    """Return ``(mean, variance, standard_deviation)`` of *counts*.

    The variance is the population variance, computed as E[x^2] - E[x]^2.
    An empty sequence yields ``(0.0, 0.0, 0.0)`` instead of dividing by zero.
    """
    size = len(counts)
    if not size:
        return 0.0, 0.0, 0.0
    mean = float(sum(counts)) / size
    mean_of_squares = float(sum(count ** 2 for count in counts)) / size
    # Clamp at zero so rounding error can never feed a negative number to
    # the square root below.
    variance = max(mean_of_squares - mean ** 2, 0.0)
    return mean, variance, variance ** 0.5


def report_essay_stats(root=ESSAY_ROOT, levels=SCORE_LEVELS):
    """Print the paragraph/sentence statistics for every essay under *root*.

    *root* must contain sub-folders named ``1`` .. ``levels``; every file in
    such a folder is read as one essay.
    """
    for level in range(1, levels + 1):
        folder = os.path.join(root, str(level))
        # Sorted so that the "essay #k" labels are reproducible; the order
        # returned by os.listdir() is arbitrary.
        file_names = sorted(os.listdir(folder))
        print(str(level) + '分下文章')
        for number, file_name in enumerate(file_names, 1):
            with open(os.path.join(folder, file_name), 'r') as essay:
                content = essay.read()
            mean, variance, deviation = sentence_count_stats(
                paragraph_sentence_counts(content))
            print('第' + str(number) + '篇文章的段落句子平均数:' + str(mean))
            print('第' + str(number) + '篇文章的段落句子数的方差为:' + str(variance))
            print('第' + str(number) + '篇文章的段落句子数的标准差为:' + str(deviation))


if __name__ == '__main__':
    report_essay_stats()