Taking the novel 《我的微信连三界》 as an example, let's run some character statistics:
- Count Chinese characters, English letters, punctuation marks, and digits separately; the tallies are kept in the dictionary countchr
- Use jieba's segmentation to split the text into all its words (and single characters), count how often each one occurs, sort in descending order, and save the results (a minimal sketch of this step follows the list)
- Word frequencies go to "jieba lcut.txt", character frequencies to "jieba lcut1.txt"
- The high-frequency words/characters are saved together with countchr in "countchar.txt"
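The heart of steps two and three is just segmentation plus counting. A minimal sketch of that idea, assuming jieba is installed (`sample.txt` is a placeholder path, and it uses `collections.Counter` where the full script below rolls its own dicts):

```python
import jieba
from collections import Counter

with open('sample.txt', encoding='utf-8') as f:  # placeholder path
    tokens = jieba.lcut(f.read())                # split the text into words/characters

words = Counter(t for t in tokens if len(t) > 1)                 # multi-character words
chars = Counter(t for t in tokens if len(t) == 1 and t.strip())  # single characters, skipping whitespace
print(words.most_common(10))  # top 10 words, highest count first
print(chars.most_common(10))
```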
The output:
![output screenshot](https://img-blog.csdnimg.cn/20191102154022284.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3FxXzM4NzAwNTky,size_16,color_FFFFFF,t_70)
![output screenshot](https://img-blog.csdnimg.cn/20191102163315632.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3FxXzM4NzAwNTky,size_16,color_FFFFFF,t_70)
That isn't very pleasant to read, so I tidied it up in Excel.
Because the novel contains a lot of WeChat-chat scenes, paragraphs are short: only 21 characters each on average.
The protagonist's name accounts for some 200,000 characters of the text, which was a small surprise; "的" being the most frequent character was expected.
I tried a dozen or so novels picked at random from the web and noticed a few patterns:
- Punctuation makes up about one fifth of all characters
- The protagonist's name is usually the most frequent word
- The characters "的", "了", "是", and "他" are all used very heavily
- Chapters average 2,000~3,000 characters. At 掌阅's rate of a bit over 1 mao per chapter, that comes to about 5 分 per 1,000 characters. If the author types 60 characters a minute and reliably posts two chapters a day, that is roughly 2 hours of typing per day (spelled out in the snippet below).
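Spelling out that last bullet's arithmetic (the 2,500-character midpoint and the 60-characters-per-minute speed are the assumptions stated above):

```python
chars_per_chapter = 2500   # midpoint of the 2,000~3,000 range
price_per_chapter = 0.12   # yuan; "a bit over 1 mao" per chapter
typing_speed = 60          # characters per minute (assumed)
chapters_per_day = 2

print(price_per_chapter / chars_per_chapter * 1000)         # 0.048 yuan, i.e. roughly 5 分 per 1,000 characters
print(chapters_per_day * chars_per_chapter / typing_speed)  # ~83 minutes; closer to 2 hours as chapters run toward 3,000 characters
```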
The complete code:
```python
import os
import time
import jieba

# Punctuation marks are not stored individually in hsw; they are only counted in total.
pcd = {
    3: '·、【】!¥—~……();‘’:“”《》,。?、',
    4: r''' `~!@#$%^&*()_+-={}|:%<>?[]\;',./×''',
}  # punctuation, keyed to match hs0 below
hs0 = {
    1: 0,  # Chinese characters
    2: 0,  # English letters
    3: 0,  # Chinese punctuation marks
    4: 0,  # English punctuation marks
    5: 0,  # digits
    6: 0,  # lines
    7: 0,  # ratio of Chinese characters to total characters
}  # mimics Word's character count; Chinese text has few spaces, so spaces are not counted
path = input('please input the path of your file: ')
print(os.path.isfile(path))
if not os.path.isfile(path):
    # fall back to the default file
    path = r'C:\Users\QQ\Desktop\ls\py\我的微信连三界 狼烟新书\我的微信连三界 狼烟新书.txt'
    rootpath = r'C:\Users\QQ\Desktop\ls\py\我的微信连三界 狼烟新书'
    print(rootpath)
else:
    rootpath = os.path.dirname(path)
    print(rootpath)

def wdwxlsj():
    # path = rootpath + r'\我的微信连三界 狼烟新书.txt'
    sl = ['林海', '凡间', '地仙', '地府', '天仙', '散仙', '金仙', '天劫', '馨月', '林儿',
          '脸皮', '不好意思', '齐天大圣', '微信', '手机', '太上老君', ]  # words to count explicitly; a profanity search could be added here
    hss1 = {}  # occurrences of each word in sl
    hsw = {}   # occurrences of each single character
    path2 = os.path.split(path)[0] + '\\{}.txt'.format('jieba lcut')
    path3 = rootpath + r'\{}.txt'.format('jieba lcut1')
    print(path2)
    with open(path, 'r') as f, open(path2, 'w') as fs, open(path3, 'w') as fw:
        string = f.read()
        for i in sl:
            hss1[i] = string.count(i)
        f.seek(0)
        lines = f.readlines()
        print(len(lines))
        hsc = {}  # occurrences of each word
        for i in lines:
            # classify raw characters by code point;
            # 19968..40869 is U+4E00..U+9FA5, the CJK Unified Ideographs block
            for j in i:
                if 19968 <= ord(j) <= 40869:
                    hs0[1] += 1
                elif 65 <= ord(j) <= 90 or 97 <= ord(j) <= 122:  # A-Z, a-z
                    hs0[2] += 1
                elif 48 <= ord(j) <= 57:  # 0-9
                    hs0[5] += 1
            # classify jieba tokens
            for j in jieba.lcut(i):
                if j in pcd[3]:
                    hs0[3] += 1
                elif j in pcd[4]:
                    hs0[4] += 1
                elif j == '\n':
                    hs0[6] += 1
                elif len(j) > 1:  # multi-character word
                    if j in hsc:
                        hsc[j] += 1
                    else:
                        hsc[j] = 1
                else:  # single character
                    if j in hsw:
                        hsw[j] += 1
                    else:
                        hsw[j] = 1
        # Tried rewriting the loop above with asyncio; runtime did not improve.
        # jieba segmentation is CPU-bound, so single-threaded coroutines cannot
        # speed it up; multiprocessing would be the usual fix.
        hs0[7] = hs0[1] / len(string)
        hsc = sort(hsc)  # sort by value, descending
        hsw = sort(hsw)
        fs.write(str(hsc))
        fw.write(str(hsw))
    def countchar():
        with open(rootpath + r'\countchar.txt', 'w') as f:
            countchr = {
                '中文': hs0[1],
                'english letter': hs0[2],
                '中文标点符号': hs0[3],
                'english punctuation marks': hs0[4],
                '数字': hs0[5],
                '行数': hs0[6],
                '中文字数占总字符数的比例': hs0[7],
                '总字符数': len(string),
                '平均每个段落字数': hs0[1] // hs0[6],
            }
            for i in countchr:
                print(type(countchr[i]), countchr[i])
                f.write('{}: {}\n'.format(i, unit(countchr[i])))
            for p in [r'\jieba lcut.txt', r'\jieba lcut1.txt']:
                filename, content = hfwords(rootpath + p, 100)
                print(filename, content)
                f.write('\n' + filename + '\n' + content)

    countchar()
    print('以下词在小说《{}》中出现次数:'.format(os.path.split(path)[1][:-4]))
    for i in hss1:
        print('{:<5}: {}次'.format(i, hss1[i]))

def sort(hs):
    # sort a dict by value, descending (ties broken by key, descending)
    l = sorted(zip(hs.values(), hs.keys()), reverse=True)
    return {key: value for value, key in l}

def hfwords(file, num, spacing=5):
    # high-frequency words: read back a saved frequency dict and
    # write/return its top `num` entries, `spacing` entries per line
    with open(file, 'r') as f:
        wf = eval(f.read())  # the file holds str(dict) output
    for i in wf:
        length = len(i)  # length of the first key: >1 means words, 1 means single characters
        break
    if length > 1:
        filename = '《{}》中出现频率最高的{}个词'.format(os.path.basename(os.path.dirname(file)), num)
    else:
        filename = '《{}》中出现频率最高的{}个字'.format(os.path.basename(os.path.dirname(file)), num)
    with open(os.path.dirname(file) + '\\{}.txt'.format(filename), 'w') as f:
        c = 0         # loop counter
        content = ''  # buffer for the text to be written
        for i in wf:
            content += '{}:{}次'.format(i, unit(wf[i]))
            c += 1
            if c % spacing == 0:
                content += '\n'
            else:
                content += ' '
            if c >= num:
                f.write(content)
                return filename, content  # stop after the top `num` entries

def unit(num):
    # attach a suitable unit/format to a number
    if num < 1:
        return '{:.2%}'.format(num)  # ratios become percentages
    elif num < 500:
        return num
    elif num < 5000:
        return '{:.2f}千'.format(num / 1000)   # thousands
    else:
        return '{:.2f}万'.format(num / 10000)  # ten-thousands

def main():
    start = time.perf_counter()
    try:
        wdwxlsj()
    except UnboundLocalError as e:  # e.g. raised by hfwords() if a frequency file is empty
        print(e)
    end = time.perf_counter()
    d = end - start
    print('runtime : {} minutes {:.2f} seconds'.format(int(d // 60), d % 60))


main()
```
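Two spots in the script could be tightened. The hand-rolled `if j in hsc` bookkeeping in `wdwxlsj()` is exactly what `collections.Counter` exists for, and the `eval()` in `hfwords()` can be swapped for `ast.literal_eval()`, which parses a saved dict literal without executing arbitrary code. A sketch, not the original code:

```python
import ast
from collections import Counter

hsc = Counter()                      # replaces the manual if/else bookkeeping
for token in ['微信', '的', '微信']:  # stand-in for the jieba.lcut(i) token stream
    if len(token) > 1:
        hsc[token] += 1              # missing keys default to 0

print(hsc.most_common(2))            # [('微信', 2)]; '的' is a single character, so it was skipped

with open('jieba lcut.txt', encoding='utf-8') as f:
    wf = ast.literal_eval(f.read())  # parses str(dict) output safely, unlike eval()
```

Counter also provides `most_common(n)` for free, which would make the `sort()` helper unnecessary.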
Possible improvements:
- Add a sentence-pattern module: examine sentence structure, paragraph-structure similarity, and how often sentence patterns repeat
- Compute the average characters per chapter, extract chapter titles and numbers, and build a chapter-title management module that normalizes every title's format
- Count where each main character's name appears across chapters, find the chapters where they show up most, and use that to split the novel into parts or volumes
- Turn the statistics into charts (a minimal sketch follows)
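For that last item, a minimal matplotlib sketch (matplotlib, the SimHei font, and the sample frequencies are assumptions for illustration, not output of the script):

```python
import matplotlib.pyplot as plt

# hypothetical top-5 counts, standing in for the real 'jieba lcut1.txt' output
freq = {'的': 68000, '了': 41000, '是': 25000, '他': 24000, '我': 22000}

plt.rcParams['font.sans-serif'] = ['SimHei']  # a font with CJK glyphs, so the labels render
plt.bar(list(freq), list(freq.values()))
plt.title('character frequency (top 5)')
plt.ylabel('occurrences')
plt.show()
```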