最大匹配分词
任务:
使用最大匹配算法、字典文件(corpus.dict),对语料(corpus.sentence)进行分词,将分词的结果输出到文件corpus.out中;对比corpus.answer和corpus.out,给出算法的P/R/F指标
- 确定文件路径
# NOTE: the original `path = print(os.getcwd())` assigned None -- print() has no
# return value.  Assign the working directory itself.
path = os.getcwd()

# -- read the corpus sentences --
with open(os.path.join(path, 'data', 'corpus.sentence.txt'), 'r', encoding='utf-8-sig') as sentence:
    # One sentence per line.  rstrip('\n') instead of line[:-1]: slicing off the
    # last character corrupts the final line when the file has no trailing newline.
    lst = [line.rstrip('\n') for line in sentence]

# -- read the dictionary --
with open(os.path.join(path, 'data', 'corpus.dict.txt'), 'r', encoding='utf-8-sig') as d:
    # A set gives O(1) membership tests in the matching loop (a list is O(n) per probe).
    dic = {line.rstrip('\n') for line in d}

# -- core: forward maximum matching --
for item in lst:                            # each sentence of the corpus
    start = 0                               # left edge of the current window
    ans = []                                # tokens produced for this sentence
    while start < len(item):
        # Window capped at the maximum word length (10 here); min() keeps the
        # right edge inside the sentence, so no index can go out of range.
        end = min(start + 10, len(item))
        # Shrink the window until it matches a dictionary word, or until only
        # a single character remains.
        while end > start + 1 and item[start:end] not in dic:
            end -= 1
        # A single character not in the dictionary is emitted as its own token.
        ans.append(item[start:end])
        start = end
完整代码:
import os
from collections import Counter


def _read_lines(file_path):
    """Read a UTF-8(-sig) text file; return its lines without trailing newlines.

    Uses rstrip('\\n') instead of the original line[:-1], which dropped the
    last real character when the file did not end with a newline.
    """
    with open(file_path, 'r', encoding='utf-8-sig') as fh:
        return [line.rstrip('\n') for line in fh]


def max_match_segment(sentence, dictionary, max_len=10):
    """Segment *sentence* by forward maximum matching against *dictionary*.

    The window starts at *max_len* characters and shrinks until it matches a
    dictionary entry or a single character remains; characters covered by no
    dictionary word are emitted as single-character tokens.  Returns the list
    of tokens (empty list for an empty sentence).
    """
    tokens = []
    start = 0
    n = len(sentence)
    while start < n:
        # min() keeps the right edge inside the sentence -- no index can
        # go out of range, which the original guarded with if/else.
        end = min(start + max_len, n)
        while end > start + 1 and sentence[start:end] not in dictionary:
            end -= 1
        tokens.append(sentence[start:end])
        start = end
    return tokens


def evaluate(segmented, gold):
    """Score *segmented* against the *gold* segmentation.

    A predicted word counts as correct when it occurs anywhere in the gold
    segmentation of the same sentence -- the original scoring scheme,
    preserved here (it is not position-aligned).  Returns
    (precision, recall, f1, right, out_total, gold_total, wrong) where
    *wrong* is a Counter of mis-segmented words.
    """
    out_total = sum(len(s) for s in segmented)      # words produced
    gold_total = sum(len(s) for s in gold)          # words expected
    right = 0
    wrong = Counter()                               # replaces the hand-rolled dict
    for pred_sent, gold_sent in zip(segmented, gold):
        gold_words = set(gold_sent)                 # O(1) membership per word
        for word in pred_sent:
            if word in gold_words:
                right += 1
            else:
                wrong[word] += 1
    # Guard against empty corpora; the original divided unconditionally.
    precision = right / out_total if out_total else 0.0
    recall = right / gold_total if gold_total else 0.0
    f1 = (2 * precision * recall / (precision + recall)
          if precision + recall else 0.0)
    return precision, recall, f1, right, out_total, gold_total, wrong


def main():
    """Segment the corpus, write corpus.out.txt, and report P/R/F metrics."""
    path = os.path.dirname(os.path.abspath(__file__))   # folder containing this script
    # os.path.join is portable; the original hard-coded '\\' separators.
    data = os.path.join(path, 'data')

    sentences = _read_lines(os.path.join(data, 'corpus.sentence.txt'))
    dict_lines = _read_lines(os.path.join(data, 'corpus.dict.txt'))

    # The first dictionary line carries the maximum word length after a tab
    # (presumably "<header>\t<max length>" -- confirm against the data file).
    # The original parsed it into `num` but then hard-coded 10; use the parsed
    # value, falling back to 10 when the header is absent or malformed.
    try:
        max_len = int(dict_lines[0].split('\t')[1])
    except (IndexError, ValueError):
        max_len = 10

    # A set makes the inner-loop membership test O(1) instead of a list scan.
    dictionary = set(dict_lines)

    segmented = [max_match_segment(s, dictionary, max_len) for s in sentences]
    with open(os.path.join(data, 'corpus.out.txt'), 'w', encoding='utf-8-sig') as out:
        out.writelines(' '.join(tokens) + '\n' for tokens in segmented)

    gold = [line.split() for line in _read_lines(os.path.join(data, 'corpus.answer.txt'))]
    precision, recall, f1, right, out_total, gold_total, wrong = evaluate(segmented, gold)

    with open(os.path.join(data, 'corpus.评价结果.txt'), 'w', encoding='utf-8-sig') as judge:
        judge.write('正确率Precision = ' + str(right) + '/' + str(out_total) + ' = '
                    + '{0:.2f}'.format(precision * 100) + '%' + '\n')
        judge.write('召回率Recall = ' + str(right) + '/' + str(gold_total) + ' = '
                    + '{0:.2f}'.format(recall * 100) + '%' + '\n')
        judge.write('F值 = ' + '{0:.2f}'.format(f1 * 100) + '%' + '\n')
        judge.write('\n')
        judge.write("错误分词及其出现次数为:\n")
        # most_common() yields words sorted by descending count, matching the
        # original sorted(..., key=lambda ob: -ob[1]).
        for word, count in wrong.most_common():
            judge.write(word + '\t' + str(count) + '\n')


if __name__ == '__main__':
    main()
注:此实现将字典存为列表、逐项线性匹配,因此效率较低;但在已知词典对语料覆盖较好的前提下,最大匹配分词的准确率较高。