使用python的信息检索作业(2)

最大匹配分词

任务:
使用最大匹配算法、字典文件(corpus.dict),对语料(corpus.sentence)进行分词,将分词的结果输出到文件corpus.out中;对比corpus.answer和corpus.out,给出算法的P/R/F指标

  • 确定文件路径
import os

# Directory the script is launched from; the data files are looked up under it.
# print() returns None, so the directory must be stored before it is echoed.
path = os.getcwd()
print(path)
  • 拿取语料库内容
# Load the corpus: one sentence per line ('utf-8-sig' also drops a leading BOM).
with open(path + '\\data\\corpus.sentence.txt', 'r', encoding='utf-8-sig') as sentence:
    # Remove only the line terminator. Slicing with [:-1] would cut a real
    # character off the last line when the file does not end with a newline.
    lst = [line.rstrip('\n') for line in sentence]
  • 拿取字典内容
# Load the dictionary: one entry per line ('utf-8-sig' also drops a leading BOM).
with open(path + '\\data\\corpus.dict.txt', 'r', encoding='utf-8-sig') as d:
    # Remove only the line terminator. Slicing with [:-1] would corrupt the
    # last entry when the file does not end with a newline.
    dic = [line.rstrip('\n') for line in d]
  • 分词核心部分
max_len = 10  # longest candidate word, in characters
words = set(dic)  # a set gives O(1) lookups; scanning the list was O(len(dic)) per probe
for item in lst:  # segment every sentence of the corpus
    ans = []  # tokens found in the current sentence
    start = 0  # index of the first character that is not segmented yet
    # An empty sentence never enters the loop, so it yields no tokens at all
    # (the index-juggling version emitted a bogus '' token for it).
    while start < len(item):
        # Forward maximum matching: try the longest window first and shrink it
        # one character at a time; the window never runs past the sentence end.
        for size in range(min(max_len, len(item) - start), 0, -1):
            res = item[start:start + size]
            # A single character is always accepted so the scan cannot stall.
            if size == 1 or res in words:
                break
        ans.append(res)
        start += size

完整代码:

import os
from collections import Counter

DEFAULT_MAX_WORD_LEN = 10  # window size used when the dictionary has no usable header
FILE_ENCODING = 'utf-8-sig'  # drops a leading BOM on read, writes one on write


def read_lines(file_path):
    """Return every line of a text file with its line terminator removed.

    Only the trailing newline is stripped, so the last line stays intact even
    when the file does not end with a newline.
    """
    with open(file_path, 'r', encoding=FILE_ENCODING) as handle:
        return [line.rstrip('\n') for line in handle]


def max_word_length(dict_lines, default=DEFAULT_MAX_WORD_LEN):
    """Return the matching window size announced by the dictionary header.

    The second tab-separated field of the first dictionary line is read as an
    integer. ``default`` is returned when that field is missing, is not a
    number, or is smaller than 1.
    """
    try:
        length = int(dict_lines[0].split('\t')[1])
    except (IndexError, ValueError):
        return default
    return length if length >= 1 else default


def segment(sentence, words, max_len=DEFAULT_MAX_WORD_LEN):
    """Split ``sentence`` into tokens with forward maximum matching.

    Args:
        sentence: Text to segment.
        words: Container of known words; pass a ``set`` for O(1) lookups.
        max_len: Longest candidate, in characters, tried at each position.

    Returns:
        The list of tokens in reading order. An empty sentence gives ``[]``.

    Raises:
        ValueError: If ``max_len`` is smaller than 1.
    """
    if max_len < 1:
        raise ValueError('max_len must be at least 1')
    tokens = []
    position = 0
    total = len(sentence)
    while position < total:
        # Try the longest window first and shrink it one character at a time;
        # the window is clipped so it never runs past the sentence end.
        for size in range(min(max_len, total - position), 0, -1):
            piece = sentence[position:position + size]
            # A single character is always accepted so the scan cannot stall.
            if size == 1 or piece in words:
                break
        tokens.append(piece)
        position += size
    return tokens


def evaluate(predicted, gold):
    """Compare segmented sentences with the reference segmentation.

    Words are matched per sentence as a multiset, ignoring position: each
    reference word can validate at most one output word, so the number of
    correct words never exceeds either total. Output sentences that have no
    reference sentence count as entirely wrong.

    Args:
        predicted: One token list per sentence produced by the segmenter.
        gold: One token list per sentence from the answer file.

    Returns:
        ``(right, out_total, gold_total, wrong)`` where ``wrong`` is a
        ``Counter`` mapping each unmatched output word to its frequency.
    """
    right = 0
    wrong = Counter()
    for index, tokens in enumerate(predicted):
        reference = gold[index] if index < len(gold) else []
        produced = Counter(tokens)
        expected = Counter(reference)
        right += sum((produced & expected).values())
        wrong.update(produced - expected)
    out_total = sum(len(tokens) for tokens in predicted)
    gold_total = sum(len(tokens) for tokens in gold)
    return right, out_total, gold_total, wrong


def prf(right, out_total, gold_total):
    """Return ``(precision, recall, f_value)``; a zero denominator yields 0.0."""
    precision = right / out_total if out_total else 0.0
    recall = right / gold_total if gold_total else 0.0
    if precision + recall == 0:
        return precision, recall, 0.0
    return precision, recall, precision * recall * 2 / (precision + recall)


def write_report(file_path, right, out_total, gold_total, wrong):
    """Write the P/R/F figures and the wrong words, most frequent first."""
    precision, recall, f_value = prf(right, out_total, gold_total)
    with open(file_path, 'w', encoding=FILE_ENCODING) as judge:
        judge.write('正确率Precision = {0}/{1} = {2:.2f}%\n'.format(
            right, out_total, precision * 100))
        judge.write('召回率Recall = {0}/{1} = {2:.2f}%\n'.format(
            right, gold_total, recall * 100))
        judge.write('F值 = {0:.2f}%\n'.format(f_value * 100))
        judge.write('\n')
        judge.write("错误分词及其出现次数为:\n")
        # most_common() sorts by descending count and keeps first-seen order on ties.
        for word, times in wrong.most_common():
            judge.write(word + '\t' + str(times) + '\n')


def main():
    """Segment the corpus, write corpus.out.txt and the evaluation report."""
    # The data folder sits next to this script; os.path.join keeps the paths portable.
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')

    sentences = read_lines(os.path.join(data_dir, 'corpus.sentence.txt'))
    dict_lines = read_lines(os.path.join(data_dir, 'corpus.dict.txt'))
    # Each answer line holds the reference tokens separated by whitespace.
    gold = [line.split() for line in read_lines(os.path.join(data_dir, 'corpus.answer.txt'))]

    words = set(dict_lines)  # built once; list membership was a linear scan per probe
    max_len = max_word_length(dict_lines)
    predicted = [segment(text, words, max_len) for text in sentences]

    with open(os.path.join(data_dir, 'corpus.out.txt'), 'w', encoding=FILE_ENCODING) as out:
        out.writelines(' '.join(tokens) + '\n' for tokens in predicted)

    right, out_total, gold_total, wrong = evaluate(predicted, gold)
    write_report(os.path.join(data_dir, 'corpus.评价结果.txt'),
                 right, out_total, gold_total, wrong)


if __name__ == '__main__':
    main()

此方法效率较低,但在词典已知的情况下分词准确率较高。

发布了5 篇原创文章 · 获赞 2 · 访问量 126
App 阅读领勋章
微信扫码 下载APP
阅读全文

没有更多推荐了,返回首页

©️2019 CSDN 皮肤主题: 1024 设计师: 上身试试

分享到微信朋友圈

×

扫一扫,手机浏览