作业四：词频统计-基本功能

最新推荐文章于 2022-06-09 16:22:40 发布

weixin_30642267

最新推荐文章于 2022-06-09 16:22:40 发布

阅读量192

点赞数

文章标签： python 开发工具 git

原文链接：http://www.cnblogs.com/xffych/p/9824844.html

版权

一、基本信息

　　1、本次作业的地址：https://edu.cnblogs.com/campus/ntu/Embedded_Application/homework/2088

　 2、项目Git地址：https://gitee.com/ntucs/PairProg/tree/SE045_060

3.开发环境：Pycharm2017、Python3.7

4.结对成员:1613072045徐泽辉、1613072060杨春华

二、项目分析

2.1 程序运行模块（方法、函数）介绍

　　①任务一：读取文件，统计有效行数

def process_file(dst):
    """Read a text file into memory and count its lines.

    Args:
        dst: Path of the text file to read.

    Returns:
        A ``(bvffer, lines)`` tuple with the whole file content and the number
        of lines in it, or ``None`` when the file cannot be opened or read
        (a message is printed in that case).
    """
    try:
        file = open(dst, 'r')
    except IOError as e:
        print(e)
        return None
    try:
        # The context manager closes the handle even when reading fails.
        with file:
            bvffer = file.read()
    except (OSError, ValueError):
        # ValueError covers UnicodeDecodeError raised while decoding the text.
        print("Read File Error!")
        return None
    # Text mode translates every line ending to '\n', so the line count is the
    # number of newlines plus one for a final line without a terminator.
    lines = bvffer.count('\n')
    if bvffer and not bvffer.endswith('\n'):
        lines += 1
    return bvffer, lines

　②任务一：使用正则表达式统计词频，存放入字典，统计单词总数

def process_buffer(bvffer):
    """Count word frequencies in a text buffer.

    The text is lower-cased, punctuation marks are replaced by spaces and the
    result is split on whitespace. Only tokens that start with at least four
    ASCII letters are counted in the frequency table.

    Args:
        bvffer: The text to analyse; may be empty or ``None``.

    Returns:
        A ``(word_freq, total)`` tuple: ``word_freq`` maps each counted word
        to its number of occurrences and ``total`` is the number of
        whitespace-separated tokens (counted or not). An empty buffer yields
        ``({}, 0)``.
    """
    if not bvffer:
        return {}, 0
    # Each mark must be replaced on its own; a single multi-character string
    # would only match that exact character sequence in the text.
    punctuation = ",.;!?？"
    table = str.maketrans(punctuation, ' ' * len(punctuation))
    # split() without arguments also breaks on newlines/tabs and never yields
    # empty tokens, so the total is not inflated by repeated spaces.
    words = bvffer.lower().translate(table).split()
    # At least four leading letters, followed by any word characters.
    regex_word = re.compile(r"^[a-z]{4}(\w)*")
    word_freq = {}
    for word in words:
        if regex_word.match(word):
            word_freq[word] = word_freq.get(word, 0) + 1
    return word_freq, len(words)

　③任务一：按照单词的频数排序，返回前十的单词组

def output_result(word_freq):
    """Print and return the ten most frequent entries of a frequency table.

    Args:
        word_freq: Mapping from a word (or phrase) to its occurrence count;
            may be empty or ``None``.

    Returns:
        A list of up to ten ``(key, count)`` tuples ordered by descending
        count. An empty or missing table yields an empty list.
    """
    if not word_freq:
        # Nothing to rank; return an empty result instead of failing.
        return []
    top_items = sorted(word_freq.items(), key=lambda v: v[1], reverse=True)[:10]
    for key, count in top_items:
        print('<' + str(key) + '>:' + str(count))
    return top_items

④任务一：保存结果到文件（result.txt）

def save_result(lines, words, items, path="C:\\Users\\YCH19981203\\result.txt"):
    """Write the line count, word count and top entries to a result file.

    Args:
        lines: Number of lines in the analysed text (``str`` or ``int``).
        words: Number of words in the analysed text (``str`` or ``int``).
        items: Iterable of ``(key, count)`` pairs to list, one per line.
        path: Destination file; defaults to the original hard-coded location.

    Raises:
        OSError: If the destination file cannot be opened for writing.
    """
    # Mode "w" creates the file when it is missing and truncates it otherwise,
    # so no separate "create" fallback is needed.
    with open(path, "w") as result:
        result.write("lines:" + str(lines) + "\n")
        result.write("words:" + str(words) + "\n")
        for item in items:
            result.write('<' + str(item[0]) + '>:' + str(item[1]) + '\n')
    # The message text is fixed and does not reflect a custom path.
    print('写入result.txt已完成')

⑤任务一：主函数

def main(dst='C:\\Users\\YCH19981203\\Gone_with_the_wind.txt'):
    """Run the word-frequency and phrase-frequency analysis on one text file.

    Args:
        dst: Path of the text file to analyse; defaults to the original
            hard-coded location.
    """
    loaded = process_file(dst)
    if loaded is None:
        # process_file returns None when the file could not be read; stop
        # here instead of failing on tuple unpacking.
        return
    bvffer, lines = loaded
    word_freq, words = process_buffer(bvffer)
    items = output_result(word_freq)
    # save_result is given the counts as strings.
    save_result(str(lines), str(words), items)
    Phrase_freq2 = process_Phrase2(bvffer)  # two-word phrase table
    # NOTE(review): process_Phrase3 is not defined in this section; presumably
    # it is the three-word counterpart defined elsewhere - confirm.
    Phrase_freq3 = process_Phrase3(bvffer)
    output_result(Phrase_freq2)
    output_result(Phrase_freq3)

⑥任务二：停词表，这里没有用nltk工具库，建立了一个停词表文件，新的 process_buffer(bvffer)如下

def process_buffer(bvffer, stop_path="C:\\Users\\YCH19981203\\stopwords.txt"):
    """Count word frequencies in a text buffer, skipping stop words.

    The text is lower-cased, punctuation marks are replaced by spaces and the
    result is split on whitespace. Tokens listed in the stop-word file are
    ignored; of the rest, only tokens that start with at least four ASCII
    letters are counted.

    Args:
        bvffer: The text to analyse; may be empty or ``None``.
        stop_path: Path of the stop-word file; defaults to the original
            hard-coded location. Entries are assumed to be separated by
            whitespace and/or commas - TODO confirm against the real file.

    Returns:
        A ``(word_freq, total)`` tuple: ``word_freq`` maps each counted word
        to its number of occurrences and ``total`` is the number of
        whitespace-separated tokens (counted or not). An empty buffer yields
        ``({}, 0)`` without opening the stop-word file.

    Raises:
        OSError: If the stop-word file cannot be opened.
    """
    if not bvffer:
        return {}, 0
    # Each mark must be replaced on its own; a single multi-character string
    # would only match that exact character sequence in the text.
    punctuation = ",.;!?？"
    table = str.maketrans(punctuation, ' ' * len(punctuation))
    # split() without arguments also breaks on newlines/tabs and never yields
    # empty tokens.
    words = bvffer.lower().translate(table).split()
    with open(stop_path, "r") as stop_file:
        stop_text = stop_file.read().lower()
    # A set gives exact-word matching; testing against the raw file text would
    # be a substring test and wrongly drop e.g. "here" because of "there".
    stop_words = {entry for entry in re.split(r"[\s,]+", stop_text) if entry}
    # At least four leading letters, followed by any word characters.
    regex_word = re.compile(r"^[a-z]{4}(\w)*")
    word_freq = {}
    for word in words:
        if word in stop_words:
            continue
        if regex_word.match(word):
            word_freq[word] = word_freq.get(word, 0) + 1
    return word_freq, len(words)

⑦任务二：统计两个单词词组

def process_Phrase2(bvffer):
    """Count two-word phrases formed by adjacent words of a text.

    Two neighbouring words form a phrase only when no punctuation mark stands
    between them, i.e. the first word does not end with one and the second
    does not start with one. Punctuation before the first word and after the
    second word is removed from the recorded phrase.

    Args:
        bvffer: The text to analyse; may be empty or ``None``.

    Returns:
        A dict mapping each ``"first second"`` phrase to its occurrence count.
    """
    if not bvffer:
        return {}
    punctuation = '’“‘!;,.?”'
    words = bvffer.strip().split()
    Phrase_freq = {}
    for first, second in zip(words, words[1:]):
        # Punctuation between the two words: they are not a phrase.
        if first[-1] in punctuation or second[0] in punctuation:
            continue
        # Clean local copies only. Stripping in the word list itself would
        # hide a trailing mark from the next pair and let a phrase span it.
        phrase = first.lstrip(punctuation) + ' ' + second.rstrip(punctuation)
        Phrase_freq[phrase] = Phrase_freq.get(phrase, 0) + 1
    return Phrase_freq

⑧任务二：函数分析模块

if __name__ == "__main__":
    import cProfile
    import pstats

    # Profile one complete run of main() and dump the raw data to a file.
    profile_path = "result.out"
    cProfile.run("main()", filename=profile_path)

    stats = pstats.Stats(profile_path)

    # First report: ten entries, ordered by number of calls.
    stats.sort_stats('calls')
    stats.print_stats(10)

    # Second report: directory prefixes removed, ordered by cumulative time
    # and then by function name, again limited to ten entries.
    stats.strip_dirs()
    stats.sort_stats("cumulative", "name")
    stats.print_stats(10)

    # Finally list the functions called from process_buffer().
    stats.print_callees("process_buffer")