目录
词频统计及个性化输出
1. 所需库的安装
由于正常操作安装太慢,所以使用豆瓣的镜像库进行安装
pip3 install jieba -i https://pypi.douban.com/simple
pip3 install wordcloud -i https://pypi.douban.com/simple
pip3 install imageio -i https://pypi.douban.com/simple
2. jieba库小Demo
import jieba
jieba.lcut("中国是一个伟大的国家")
['中国', '是', '一个', '伟大', '的', '国家']
3. CalHamlet 词频统计
def getText():
    """Read hamlet.txt, lowercase it, and blank out punctuation.

    Returns the normalized text, ready to be split on whitespace.
    """
    # 'with' guarantees the file handle is closed (the original leaked it);
    # an explicit encoding makes the read platform-independent.
    with open("hamlet.txt", "r", encoding="utf-8") as f:
        txt = f.read()
    txt = txt.lower()
    # Replace each punctuation character with a space so split() separates words.
    for ch in '!"#$%^&*()+_-=,./:;<>?[\\]{|}`~':
        txt = txt.replace(ch, " ")
    return txt
# Tally word frequencies in Hamlet and print the ten most common words.
hamletTxt = getText()
counts = {}
for token in hamletTxt.split():
    counts[token] = counts.get(token, 0) + 1
# sorted() is stable, so tied counts keep first-seen order — exactly what
# the original list.sort(reverse=True) produced.
ranked = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)
for rank in range(10):
    word, count = ranked[rank]
    print("{0:<10}{1:>5}".format(word, count))
the 1138
and 965
to 754
of 669
you 550
i 542
a 542
my 514
hamlet 462
in 436
4. CalHamlet 词频统计(过滤版)
# Common English stop words to drop from the frequency ranking.
excludes = set(
    "the and of you a i my in to that is "
    "it not his this but with for your me be "
    "as he what him so have will do no we are "
    "all on our by or shall if o good come thou "
    "they now more let from her how at thy".split()
)
def getText():
    """Read ./资源/hamlet.txt, lowercase it, and blank out punctuation.

    Returns the normalized text, ready to be split on whitespace.
    """
    # 'with' guarantees the file handle is closed (the original leaked it);
    # an explicit encoding makes the read platform-independent.
    with open("./资源/hamlet.txt", "r", encoding="utf-8") as f:
        txt = f.read()
    txt = txt.lower()
    # Replace each punctuation character with a space so split() separates words.
    for ch in '!"#$%^&*()+_-=,./:;<>?[\\]{|}`~':
        txt = txt.replace(ch, " ")
    return txt
# Tally word frequencies in Hamlet, drop stop words, print the top ten.
hamletTxt = getText()
words = hamletTxt.split()
counts = {}
for word in words:
    counts[word] = counts.get(word, 0) + 1
# pop() with a default avoids the KeyError that del(counts[word]) raised
# whenever a stop word never occurred in the text.
for word in excludes:
    counts.pop(word, None)
items = sorted(counts.items(), key=lambda x: x[1], reverse=True)
for i in range(10):
    word, count = items[i]
    print("{0:<10}{1:>5}".format(word, count))
hamlet 462
lord 309
king 194
horatio 157
claudius 120
queen 117
polonius 116
laertes 103
gertrude 95
ophelia 86
5.三国演义 人物出场统计
import jieba
# Segment "Romance of the Three Kingdoms" with jieba and print the
# fifteen most frequent multi-character tokens.
txt = open("./资源/三国演义.txt", 'r', encoding='utf-8').read()
counts = {}
for token in jieba.lcut(txt):
    # Single characters are mostly particles/pronouns; skip them.
    if len(token) > 1:
        counts[token] = counts.get(token, 0) + 1
ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
for i in range(15):
    name, freq = ranked[i]
    print("{0:<10}{1:>5}".format(name, freq))
曹操 934
孔明 831
将军 759
却说 647
玄德 570
关公 509
丞相 488
二人 463
不可 435
荆州 420
孔明曰 384
玄德曰 383
不能 383
如此 376
张飞 348
6.三国演义 人物出场统计 过滤版
import jieba
# High-frequency tokens that are not character names.
excludes = {"将军","却说","荆州","二人","不可","不能","如此","商议"}
txt = open("./资源/三国演义.txt", "r", encoding='utf-8').read()
words = jieba.lcut(txt)
counts = {}
for word in words:
    if len(word) == 1:
        continue
    elif word == "诸葛亮" or word == "孔明曰":
        # BUG FIX: the original wrote 'rword == "孔明"' — a comparison,
        # not an assignment — so these aliases were never merged
        # (and the first hit raised NameError on an unbound rword).
        rword = "孔明"
    elif word == "关公" or word == "云长":
        rword = "关羽"
    elif word == "玄德" or word == "玄德曰":
        rword = "刘备"
    elif word == "孟德" or word == "丞相":
        rword = "曹操"
    else:
        rword = word
    counts[rword] = counts.get(rword, 0) + 1
# pop() with a default avoids KeyError if a stop word never occurs.
for word in excludes:
    counts.pop(word, None)
items = sorted(counts.items(), key=lambda x: x[1], reverse=True)
for i in range(5):
    word, count = items[i]
    print("{0:<10}{1:>5}".format(word, count))
曹操 1435
刘备 1228
孔明 839
关羽 779
张飞 348
7.杨辉三角
def NumList_to_StrList(data):
    """Join a sequence of numbers into one space-separated string.

    e.g. [1, 2, 1] -> "1 2 1"; an empty sequence yields "".
    """
    # map(str, ...) + join replaces the original's manual index loop.
    return ' '.join(map(str, data))
def YangHui(n):
    """Print the first n rows of Yang Hui's (Pascal's) triangle, each row
    centered in a field of width 6*n.

    The original printed the first two rows unconditionally, so YangHui(1)
    emitted two rows and YangHui(0) emitted a stray '1'; this version
    prints exactly n rows. Output for n >= 2 is unchanged.
    """
    width = n * 6
    row = [1]
    for _ in range(n):
        print(' '.join(str(v) for v in row).center(width))
        # Each interior entry is the sum of the two entries above it.
        row = [1] + [row[j] + row[j + 1] for j in range(len(row) - 1)] + [1]
YangHui(6)
1
1 1
1 2 1
1 3 3 1
1 4 6 4 1
1 5 10 10 5 1
8.简单的词云小程序
import jieba
import wordcloud
import imageio

# Render a word cloud of "Romance of the Three Kingdoms" shaped by a mask image.
text = open("./资源/三国演义.txt", "r", encoding='utf-8').read()
# WordCloud expects space-separated tokens, so join jieba's segmentation.
segmented = " ".join(jieba.lcut(text))
mask = imageio.imread("./资源/duye.jpg")
# A CJK-capable font is required, otherwise Chinese renders as boxes.
font = r'c:/Windows/Fonts/simfang.ttf'
cloud = wordcloud.WordCloud(
    background_color="white",
    font_path=font,
    width=1000,
    height=1000,
    mask=mask,
).generate(segmented)
cloud.to_file("./资源/test.png")