# 英文文本字母跳转概率统计.py -- letter-transition (adjacent letter pair) probability statistics for English text.
# Sample input text: hamlet.txt
import csv
def getText():
    """Read ``hamlet.txt`` and return its text lower-cased with punctuation blanked.

    The file is opened relative to the current working directory with the
    platform default text encoding.  Every character listed in
    ``punctuation`` is replaced by a single space, so words stay separated
    and a later ``str.split()`` yields clean tokens.

    Returns:
        str: the normalized file contents.

    Raises:
        OSError: if ``hamlet.txt`` cannot be opened or read.
    """
    # The backslash is written as ``\\`` so the literal contains no invalid
    # escape sequence; the set of characters itself is unchanged.
    punctuation = '!"#$%()&*+-,:;<>={}[]\\/_.~`“”\'‘’?'
    # One translation pass instead of one ``replace`` call per character.
    blank_out = str.maketrans(punctuation, " " * len(punctuation))
    # The context manager closes the file handle even if reading fails.
    with open("hamlet.txt", "r") as source:
        txt = source.read()
    return txt.lower().translate(blank_out)
# Load the normalized text and split it on whitespace into words.
hamletTxt = getText()
words = hamletTxt.split()
counts = {}

# Tally every pair of adjacent characters inside a word under the key
# "x->y"; pairs never span a word boundary.
counts_ch = {}
for token in words:
    for current, following in zip(token, token[1:]):
        transition = "{}->{}".format(current, following)
        counts_ch[transition] = counts_ch.get(transition, 0) + 1

# Total number of pairs observed over the whole text.
amount = sum(counts_ch.values())

# Share of each pair among all observed pairs (count / total), kept in the
# order in which the pairs were first seen.
PR_counts = {pair: tally / amount for pair, tally in counts_ch.items()}
# print(amount)
# for i in range (10):
# word,count = items[i]
# print("{0:<10}{1:>5}".format(word,count))
# print(counts_ch)
# Order the transitions from most to least frequent.  The sort is stable, so
# transitions with equal values keep the order in which PR_counts holds them.
items_ch = sorted(PR_counts.items(), key=lambda entry: entry[1], reverse=True)

# Print one line per transition: 1-based rank, transition label, value.
Rank = 0
for Rank, (Answer_ch, PR_count) in enumerate(items_ch, start=1):
    print("{0:<3}:{1:<10}{2:>5}".format(Rank, Answer_ch, PR_count))
# print(PR_counts)
# Build the 26 x 26 table for the lowercase letters a-z (character codes
# 97..122): the row is the first letter of a pair, the column the second.
# Pairs that are absent from PR_counts are stored as 0.
matrix_ch = [
    [
        PR_counts.get("{}->{}".format(chr(97 + first), chr(97 + second)), 0)
        for second in range(26)
    ]
    for first in range(26)
]
# print(matrix_ch)
# Lay the table out for export as 27 rows of 27 cells.  The first row is a
# header: a blank corner cell followed by the letters a-z.
Answer_matrix = [[" "] + [chr(97 + offset) for offset in range(26)]]

# Each following row starts with its own letter label and then carries the
# 26 values of the matching matrix_ch row.
for index in range(26):
    labelled_row = [chr(97 + index)]
    labelled_row.extend(matrix_ch[index][col] for col in range(26))
    Answer_matrix.append(labelled_row)
# Write the labelled matrix to CSV, one list per row.  The ``with`` block
# guarantees the output file is flushed and closed once writing finishes;
# ``newline=''`` leaves line-ending handling to the csv module.
with open("字母跳转概率输出矩阵.csv", 'w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerows(Answer_matrix)