Python字典使用--词频统计的GUI实现

最新推荐文章于 2024-08-30 17:05:27 发布

i_chaoren

最新推荐文章于 2024-08-30 17:05:27 发布

阅读量8.2k

点赞数 2

分类专栏： Python

本文链接：https://blog.csdn.net/i_chaoren/article/details/55683323

版权

Python 专栏收录该内容

11 篇文章 9 订阅

订阅专栏

字典是针对非序列集合而提供的一种数据类型，字典中的数据是无序排列的。

字典的操作

为字典增加一项

dict[key] = value

students = {"Z004":"John","T002":"Peter"}

students
Out[23]: {'T002': 'Peter', 'Z004': 'John'}

students["S007"] = "Susan"

students
Out[25]: {'S007': 'Susan', 'T002': 'Peter', 'Z004': 'John'}

访问字典中的值

dict[key] 返回key对应的值value

dict.get(key,default)--返回字典中key对应的值，若未找到key，则返回default值，default值可不写

删除字典中的一项

del dict[key]

字典的遍历

遍历字典的键key

for key in dict.keys():print(key)

遍历字典的值value

for value in dict.values():print(value)

遍历字典的项

for item in dict.items():print(item)

是否一个键在字典中

注：值不能判断

in 或者 not in

删除字典项目

dict.clear()--删除字典中的所有项目

dict.pop(key)--删除并返回字典中key对应的值

直接赋值、浅拷贝、深拷贝

直接赋值：其实就是对象的引用（别名）。

浅拷贝(copy)：拷贝父对象，不会拷贝对象的内部的子对象。

深拷贝(deepcopy)： copy 模块的 deepcopy 方法，完全拷贝了父对象及其子对象。

字典浅拷贝实例：

>>>a = {1: [1,2,3]}
>>> b = a.copy()
>>> a, b
({1: [1, 2, 3]}, {1: [1, 2, 3]})
>>> a[1].append(4)
>>> a, b
({1: [1, 2, 3, 4]}, {1: [1, 2, 3, 4]})

深度拷贝需要引入 copy 模块：

>>>import copy
>>> c = copy.deepcopy(a)
>>> a, c
({1: [1, 2, 3, 4]}, {1: [1, 2, 3, 4]})
>>> a[1].append(5)
>>> a, c
({1: [1, 2, 3, 4, 5]}, {1: [1, 2, 3, 4]})

http://www.runoob.com/w3cnote/python-understanding-dict-copy-shallow-or-deep.html

示例：词频统计

第一步：输入文章

第二步：建立用于词频计算的空字典

第三步：对文本的每一行计算词频，如果文章长度一般，则不需用一次读一行，一次便可读完。

第四步：从字典中获取数据对到列表中

第五步：对列表中的数据对交换位置，并从大到小进行排序

第六步：输出结果

下图所示为程序输出结果及输出的统计结果

汉字的词频统计、排除特定词集合的程序后续更新...

普通版本

def getText():  
    txt=open('hamlet.txt','r').read()  
    txt=txt.lower()  
    for ch in "~@#$%^&*()_-+=<>?/,.:;{}[]|\'""":  
        txt=txt.replace(ch,' ')  
    return txt  
  
hamletTxt=getText()  
words=hamletTxt.split()  
counts={}  
sumcount = 0
for word in words:  
    counts[word]=counts.get(word,0)+1
    sumcount = sumcount + 1
items=list(counts.items())  
items.sort(key=lambda x:x[1],reverse=True)  
for i in range(10):  
    word,count=items[i]  
    print('{0:<10}{1:>5}'.format(word,count))  
  
 #将统计结果写入文本文件中  
outfile = open('词频统计结果.txt', "w")  
lines = []    
lines.append('单词种类：'+str(len(items))+'\n')  
lines.append('单词总数：'+str(sumcount)+'\n')  
lines.append('词频排序如下:\n')  
lines.append('word\tcounts\n')  
  
s= ''  
for i in range(len(items)):  
    s = '\t'.join([str(items[i][0]), str(items[i][1])])  
    s += '\n'    
    lines.append(s)  
print('\n统计完成！\n')  
outfile.writelines(lines)  
outfile.close()

排除特定词库

#排除词库
excludes = ['the','and','to','of','i','a','in','it','that','is',
            'you','my','with','not','his','this','but','for',
            'me','s','he','be','as','so','him','your']
def getText():    
    txt=open('hamlet.txt','r').read()    
    txt=txt.lower()    
    for ch in "~@#$%^&*()_-+=<>?/,.:;{}[]|\'""":    
        txt=txt.replace(ch,' ')       
    return txt    
    
hamletTxt=getText()    
words=hamletTxt.split()    
counts={}    
sumcount = 0  
for word in words:    
    counts[word]=counts.get(word,0)+1  
    sumcount = sumcount + 1 

counts_ex = counts.copy()    
for key in counts.keys():
    if key in excludes:
        counts_ex.pop(key)
items=list(counts_ex.items())    
items.sort(key=lambda x:x[1],reverse=True)    
for i in range(10):    
    word,count=items[i]    
    print('{0:<10}{1:>5}'.format(word,count))    
    
 #将统计结果写入文本文件中    
outfile = open('词频统计结果.txt', "w")    
lines = []      
lines.append('单词种类：'+str(len(items))+'\n')    
lines.append('单词总数：'+str(sumcount)+'\n')    
lines.append('词频排序如下:\n')    
lines.append('word\tcounts\n')    
    
s= ''    
for i in range(len(items)):    
    s = '\t'.join([str(items[i][0]), str(items[i][1])])    
    s += '\n'      
    lines.append(s)    
print('\n统计完成！\n')    
outfile.writelines(lines)    
outfile.close()

GUI版本

import tkinter as tk
from tkinter import ttk
from tkinter import scrolledtext
from tkinter import filedialog
from tkinter import messagebox as mBox

#获取原文内容
def getText(DIR):
    txt=open(DIR,'r').read()
    return txt
    txt.close()

#打开文件
def __opendir():
    srcText.delete('1.0', tk.END) # 先删除所有
        
    # 打开文件夹对话框
    fname = filedialog.askopenfilename(filetypes=( ("Text file", "*.txt*"),("HTML files", "*.html;*.htm")))
    entryvar.set(fname) # 设置变量entryvar，等同于设置部件Entry
        
    if not fname:
        mBox.showwarning('警告', message='未选择文件夹！')  # 弹出消息提示框

    #显示需要统计的文本
    Txt=getText(fname)
    srcText.insert(tk.END, Txt)
            
    srcText.update()
    
#手动输入文件名时回车键触发      
def srcEnter(event=None):
    
    fname=DirEntry.get()
    if not fname:
        mBox.showwarning('警告', message='请选择文件！')  # 弹出消息提示框
        
    Txt=getText(fname)
    srcText.insert(tk.END, Txt)
            
    srcText.update()

#词频统计
def wordFrequence():
    fname=DirEntry.get()
    if not fname:
        mBox.showwarning('警告', message='请选择文件！')  # 弹出消息提示框

    txt=getText(fname)
    
    #对原文进行小写，标点符号转换处理
    txt=txt.lower()
    for ch in '!"#$%&*()+,.-;:<=>?@[]\^_{}|`':
        txt=txt.replace(ch,' ')

    #词频统计
    words=txt.split()
    counts={} #用空字典存储统计结果
    for word in words:
        counts[word]=counts.get(word,0)+1

    #词频排序
    items=list(counts.items())
    items.sort(key=lambda x:x[1],reverse=True)

    #输出排序结果
    num=0
    for i in range(len(counts)):
        word,count=items[i]
        num=i*count+num


    dstText.insert(tk.END, '单词种类：')
    dstText.insert(tk.END, str(len(items)))
    dstText.insert(tk.END, '\n')
    dstText.insert(tk.END, '单词总数：')
    dstText.insert(tk.END, str(num))
    dstText.insert(tk.END, '\n')
    dstText.insert(tk.END, '词频排序如下:\n')
    dstText.insert(tk.END, '#word:\t\t#counts:\n')

    for i in range(len(counts)):
        word,count=items[i]
        dstText.insert(tk.END, word)
        dstText.insert(tk.END, '\t\t')
        dstText.insert(tk.END, count)
        dstText.insert(tk.END, '\n')

def savefile():
    # 打开文件夹对话框
    dirname = filedialog.askdirectory() 
    outvar.set(dirname) # 设置变量entryvar，等同于设置部件Entry
        
    if not dirname:
        mBox.showwarning('警告', message='请选择保存位置！')  # 弹出消息提示框

    fname=dirname+'\词频统计结果.txt'
    outfile = open(fname, "w")
    outfile.writelines(dstText.get(1.0,tk.END))
    outfile.close()
    mBox.showinfo('词频统计', '统计结果保存成功！')

def dstEnter(event=None):
    dirname=outvar.get()
    if not dirname:
        mBox.showwarning('警告', message='请选择保存位置！')  # 弹出消息提示框
    fname=dirname+'\词频统计结果.txt'
    outfile = open(fname, "w")
    outfile.writelines(dstText.get(1.0,tk.END))
    outfile.close()
    mBox.showinfo('词频统计', '统计结果保存成功！')
    
# Create instance
win = tk.Tk()   

# Add a title       
win.title("词频统计GUI")

# Disable resizing the GUI
win.resizable(0,0)


#---------------窗口控件介绍------------------#

#打开文件对话框
SelDirButton = ttk.Button(win, command=__opendir, text='选择文件目录：')
SelDirButton.grid(row=0, column=0,sticky=tk.W,pady=3,padx=3)

#文件的目录显示    
entryvar = tk.StringVar() 
DirEntry=ttk.Entry(win, width=30,textvariable=entryvar)
DirEntry.grid(row=1, column=0,sticky=tk.W,pady=3,padx=3)
DirEntry.bind('<Return>', func=srcEnter)

#文件内容的显示
srcText = scrolledtext.ScrolledText(win,width=30,height=30)#内容输出框
srcText.grid(row=2, column=0,columnspan=1,sticky=tk.W,pady=3,padx=3)

#词频统计按钮
CalcuButton = ttk.Button(win, command=wordFrequence, text='词频统计')
CalcuButton.grid(row=0, column=1,sticky=tk.W,pady=3,padx=3)

#统计结果显示
dstText = scrolledtext.ScrolledText(win,width=30,height=30)#内容输出框
dstText.grid(row=2, column=1,columnspan=2,sticky=tk.W,pady=3,padx=3)

#保存文件按钮
SavefileButton = ttk.Button(win, command=savefile, text='统计结果保存到：')
SavefileButton.grid(row=0, column=2,sticky=tk.W,pady=3,padx=3)

#保存文件目录
outvar = tk.StringVar() 
saveEntry=ttk.Entry(win, width=30,textvariable=outvar)
saveEntry.grid(row=1, column=1,columnspan=2,sticky=tk.W,pady=3,padx=3)
saveEntry.bind('<Return>', func=dstEnter)

     
#======================
# Start GUI
#======================
win.mainloop()