第六周 组合数据类型 6.1~6.6
6.1 集合类型及操作
>>> A = {"python", 123, ("python",123)}
>>> A
{'python', 123, ('python', 123)}
>>> B = set("pypy123")
>>> B
{'3', '1', 'p', 'y', '2'}
>>> C = {"python",123,123,"python"}
>>> C
{'python', 123}
6.3实例9:基本统计值计算
# CalStatisticsV1.py
def getNum(): # 获取用户不定长度的输入
nums = []
iNumStr = input("请输入数字(回车退出): ")
while iNumStr != "":
nums.append(eval(iNumStr))
iNumStr = input("请输入数字(回车退出): ")
return nums
def mean(numbers): # 计算平均值
s = 0.0
for num in numbers:
s = s + num
return s / len(numbers)
def dev(numbers, mean): # 计算方差
sdev = 0.0
for num in numbers:
sdev = sdev + (num - mean) ** 2
return pow(sdev / (len(numbers) - 1), 0.5)
def median(numbers): # 计算中位数
sorted(numbers)
size = len(numbers)
if size % 2 == 0:
med = (numbers[size // 2 - 1] + numbers[size // 2]) / 2
else:
med = numbers[size // 2]
return med
n = getNum() # 主体函数
m = mean(n)
print("平均值:{},方差:{:.2},中位数:{}.".format(m, dev(n, m), median(n)))
6.5模块5:jieba库的使用
import jieba
s = "中国是一个伟大的国家"
print(jieba.lcut(s))
print(jieba.lcut(s, cut_all=True))
print(jieba.lcut_for_search(s))
print(jieba.add_word(w))
['中国', '是', '一个', '伟大', '的', '国家']
['中国', '国是', '一个', '伟大', '的', '国家']
['中国', '是', '一个', '伟大', '的', '国家']
None
6.6实例10:文本词频统计
Hamlet词频统计(含Hamlet原文文本)
https://python123.io/resources/pye/hamlet.txt
# CalHamletV1.py
def getText():
txt = open("hamlet.txt", "r").read()
txt = txt.lower()
for ch in '!"#$%&()*+,-./:;<=>?@[\\]^_‘{|}~':
txt = txt.replace(ch, " ") # 将文本中特殊字符替换为空格
return txt
hamletTxt = getText()
words = hamletTxt.split()
counts = {}
for word in words:
counts[word] = counts.get(word, 0) + 1
items = list(counts.items())
items.sort(key=lambda x: x[1], reverse=True)
for i in range(10):
word, count = items[i]
print("{0:<10}{1:>5}".format(word, count))
the 1138
and 965
to 754
of 669
you 550
i 542
a 542
my 514
hamlet 462
in 436
《三国演义》人物出场统计(上)(含《三国演义》原文文本)
https://python123.io/resources/pye/threekingdoms.txt
# CalThreeKingdomsV1.py
import jieba
txt = open("threekingdoms.txt", "r", encoding='utf-8').read()
words = jieba.lcut(txt)
counts = {}
for word in words:
if len(word) == 1:
continue
else:
counts[word] = counts.get(word, 0) + 1
items = list(counts.items())
items.sort(key=lambda x: x[1], reverse=True)
for i in range(15):
word, count = items[i]
print("{0:<10}{1:>5}".format(word, count))
曹操 953
孔明 836
将军 772
却说 656
玄德 585
关公 510
丞相 491
二人 469
不可 440
荆州 425
玄德曰 390
孔明曰 390
不能 384
如此 378
张飞 358
《三国演义》人物出场统计(下)(含《三国演义》原文文本)
# CalThreeKingdomsV2.py
import jieba
excludes = {"将军", "却说", "荆州", "二人", "不可", "不能", "如此"}
txt = open("threekingdoms.txt", "r", encoding='utf-8').read()
words = jieba.lcut(txt)
counts = {}
for word in words:
if len(word) == 1:
continue
elif word == "诸葛亮" or word == "孔明曰":
rword = "孔明"
elif word == "关公" or word == "云长":
rword = "关羽"
elif word == "玄德" or word == "玄德曰":
rword = "刘备"
elif word == "孟德" or word == "丞相":
rword = "曹操"
else:
rword = word
counts[rword] = counts.get(rword, 0) + 1
for word in excludes:
del counts[word]
items = list(counts.items())
items.sort(key=lambda x: x[1], reverse=True)
for i in range(10):
word, count = items[i]
print("{0:<10}{1:>5}".format(word, count))
曹操 1451
孔明 1383
刘备 1252
关羽 784
张飞 358
商议 344
如何 338
主公 331
军士 317
吕布 300