中文自动摘要技术

最新推荐文章于 2024-10-05 08:57:27 发布
Ytong22
最新推荐文章于 2024-10-05 08:57:27 发布
阅读量145
点赞数
文章标签： python
本文链接：https://blog.csdn.net/m0_61991112/article/details/128854054
版权
def bf_match(s, p):
    """Find every occurrence of pattern ``p`` in text ``s`` by brute force.

    Args:
        s: Text to search.
        p: Pattern (query term) to look for.

    Returns:
        A tuple ``(matches, count)``. ``matches`` is a list of
        ``[start, length]`` pairs, one per occurrence, in ascending start
        order (overlapping occurrences are all reported); ``count`` is
        ``len(matches)``. When ``p`` is longer than ``s`` the result is
        ``([], 0)``. An empty ``p`` matches at every index from 0 to
        ``len(s)`` inclusive, each with length 0.
    """
    plen = len(p)
    # range() is empty when the pattern is longer than the text, so that case
    # yields an empty result instead of falling off the end and returning None.
    line = [
        [index, plen]                       # start offset and pattern length
        for index in range(len(s) - plen + 1)
        if s[index:index + plen] == p
    ]
    return line, len(line)
def get_shingle(s, k):
    """Cut one candidate window out of ``s`` for every recorded keyword hit.

    Reads the module-level ``dic``: each of its values is walked as a list of
    entries, and the first item of every entry is used as a start offset
    into ``s``.

    Args:
        s: Text the windows are sliced from.
        k: Window length in characters.

    Returns:
        A list with one substring per entry, in ``dic`` iteration order.
        Duplicates are kept.
    """
    # A slice that runs past the end of ``s`` simply stops there, so a hit
    # close to the tail produces a window shorter than ``k``.
    return [
        s[hit[0]: hit[0] + k]
        for hits in dic.values()
        for hit in hits
    ]
def change_color(s, keys):
    """Wrap every occurrence of the query terms in ANSI bright-green codes.

    All terms are highlighted in a single left-to-right pass, so the escape
    sequences inserted for one term are never rescanned for another (a term
    such as ``m`` or ``9`` would otherwise corrupt them). Where several
    terms match at the same position, the longest one wins.

    Args:
        s: Window text; converted with ``str()`` before highlighting.
        keys: Iterable of query terms. Empty terms are ignored.

    Returns:
        The highlighted string, or ``s`` unchanged when ``keys`` holds no
        non-empty term.
    """
    import re  # local import: the file's import section is not visible from here

    # Longest first, so regex alternation prefers "abc" over its prefix "ab".
    terms = sorted({key for key in keys if key}, key=len, reverse=True)
    if not terms:
        return s

    pattern = re.compile("|".join(re.escape(term) for term in terms))
    # A callable replacement keeps backslashes in the match literal.
    return pattern.sub(lambda hit: "\033[92m" + hit.group(0) + "\033[0m", str(s))


def modify(str):
    """Shift the chosen window backwards so it starts right after punctuation.

    Relies on the module-level ``s1`` (source text) and ``k`` (window
    length), and on ``bf_match`` to locate the window inside ``s1``.

    Args:
        str: The selected window. Its first occurrence in ``s1`` is taken as
            the window position.

    Returns:
        * ``str`` unchanged when it sits at offset 0, when the tail check
          below holds, or when no suitable punctuation is found within ``k``
          characters before it;
        * ``s1[0:k]`` when stepping back runs past the start of the text;
        * otherwise the ``k``-character slice that begins right after the
          nearest preceding ``，``/``。``/``！`` whose slice is longer than
          25 characters.
    """
    hits, _ = bf_match(s1, str)
    origin = hits[0][0]         # start offset of the first occurrence in s1

    # NOTE(review): the second test compares against len(s1) - 1; a window
    # truncated at the end of the text would satisfy origin + k >= len(s1)
    # instead. Confirm which condition is intended.
    if origin == 0 or origin + k == len(s1) - 1:
        return str

    for step in range(1, k + 1):            # step back at most k characters
        probe = origin - step
        if probe < 0:
            # Ran off the front of the text: use the first k characters.
            return s1[0: k]
        if s1[probe: probe + k].startswith(('，', '。', '！')):
            shifted = s1[probe + 1: probe + k + 1]
            if len(shifted) > 25:           # accept only windows longer than 25 chars
                return shifted
    return str
def get_top():
    """Pick the candidate window with the highest vote score.

    Reads the module-level ``sh`` (candidate windows), ``tt1`` (query terms)
    and ``dic`` (term -> list of hit entries). A window's score is the sum,
    over every term it contains, of the term's occurrence count in that
    window multiplied by the second item of the term's first ``dic`` entry.

    Returns:
        The top-scoring window. When several windows share the top score,
        the one containing the most distinct terms wins; if that is also
        tied, the earliest such window in ``sh`` order is returned.
    """
    votes = {}
    for window in sh:
        votes[window] = sum(
            dic[term][0][1] * window.count(term)
            for term in tt1
            if term in window
        )

    # Stable descending sort: windows with equal scores keep their sh order.
    ranking = sorted(votes.items(), key=lambda pair: pair[1], reverse=True)
    top_score = ranking[0][1]
    leaders = [window for window, score in ranking if score == top_score]

    # Tie-break on distinct-term coverage; max() keeps the first of equal bests.
    return max(
        leaders,
        key=lambda window: sum(1 for term in tt1 if term in window),
    )
def change_k():
    """Prompt until the user enters a usable window length and store it in ``k``.

    A value is accepted when it is a positive integer, does not exceed the
    module-level ``len_file``, and is not smaller than ``get_keylen()``.
    Non-numeric input is rejected with a message instead of raising
    ``ValueError``. The global ``k`` is only assigned once a value has passed
    every check, so a rejected entry never leaks into the rest of the program.

    Returns:
        The accepted window length (also stored in the global ``k``).
    """
    global k

    while 1:
        try:
            candidate = int(input('\n请输入合适的切片长度:如30或25： '))
        except ValueError:
            print('切片长度必须为整数')
            continue
        if candidate <= 0:
            print('切片长度不能为负')
        elif candidate > len_file:
            print('切片长度不能超过文本长度，当前文本长度：', len_file, '，输入值：', candidate)
        elif candidate < get_keylen():
            print('切片长度不能小于搜索词长度，当前切片长度：', candidate, '查询词:', keywords, '，搜索词长度：', get_keylen())
        else:
            break
    k = candidate
    return k
def get_keylen():
    """Return the length of the longest current query term.

    Reads the module-level ``keywords``. When it is a list of terms, the
    result is the character length of the longest one (``0`` for an empty
    list), not the number of terms. A plain string is measured directly.
    """
    if isinstance(keywords, str):
        return len(keywords)
    return max((len(word) for word in keywords), default=0)
def check():
    """Handle the ``qt`` (quit) and ``cg`` (change window length) commands.

    Works on the module-level ``keywords`` list. While it contains a command:
    ``qt`` ends the check immediately; ``cg`` calls ``change_k()`` and then
    asks for a new query, which replaces ``keywords`` and is checked again.

    Returns:
        ``1`` when the user asked to quit, ``0`` otherwise.
    """
    global keywords

    while 'qt' in keywords or 'cg' in keywords:
        if 'qt' in keywords:        # quit takes precedence over cg
            return 1

        # Only 'cg' is present here: change the window length, then re-prompt.
        print('修改切片长度', end=' ')
        change_k()
        raw = input('请重新输入查询关键词:')
        # Strip spaces and quotation marks before splitting the query.
        for unwanted in (' ', '“', "'", '”', "'", "‘", "’"):
            raw = raw.replace(unwanted, '')
        # Split on the full-width comma; set() removes duplicate terms.
        keywords = list(set(raw.split('，')))

    return 0
if __name__ == '__main__':
    # Load the source document with ASCII spaces removed. The context manager
    # closes the file handle as soon as the text has been read.
    with open("文件资源.txt", "r", encoding='utf-8') as source_file:
        s1 = source_file.read().replace(' ', '')
    len_file = len(s1)      # document length in characters
    print('文本文档：', s1, '\n文本长度：', len_file)
    k = 0                   # 0 means no window length chosen yet; once set it is reused until changed
    counter = 0             # number of queries started so far
    while 1:
        print('当前查询次数:', counter)
        counter += 1
        print('当前切片长度:', k)
        keywords = input('输入查询关键词，输入qt退出):')
        # Strip spaces and quotation marks from the raw query.
        # NOTE(review): "'" is removed twice; one of the two calls may have
        # been meant for the ASCII double quote. Confirm.
        keywords = keywords.replace(' ', '').replace('“', '').replace("'", '').replace('”', '').\
            replace("'", '').replace("‘", '').replace("’", '')
        keywords = list(set(keywords.split('，')))  # split on the full-width comma; set() removes duplicates
        if k == 0 and 'cg' in keywords:
            print('切片未生成，无法进行修改操作')
            continue
        if check() == 1:
            print('\n再见')
            break
        if k == 0:      # first query: ask for the initial window length
            k = change_k()

        # Drop empty terms (blank input or a trailing comma): an empty pattern
        # matches at every offset and would produce one window per character.
        tt1 = [word for word in keywords if word]
        counts = 0                      # total hits over all terms; 0 means nothing was found
        dic = {}                        # term -> list of [position, length] hits in the document
        for i in tt1:
            dic[i], count = bf_match(s1, i)
            counts += count
        if counts == 0:
            print('该查询在文章中不存在\n\n')
        else:
            sh = get_shingle(s1, k)      # candidate windows, one per hit
            for i, s in enumerate(sh):   # show every candidate with its terms highlighted
                s = change_color(s, tt1)
                print('第%d个窗口：' % (i + 1), s)

            top_sh = get_top()                  # best-scoring window
            top_sh = modify(top_sh)             # shift it to a punctuation boundary
            top_sh = change_color(top_sh, tt1)  # highlight the terms in the final window
            print('自动摘要：\n', top_sh, '\n')