Implementing Cluster-Based Text Summarization


Implementation steps:

1. Split the document into sentences and strip punctuation.
2. Compute an embedding for each sentence with a pre-trained ALBERT model (mean pooling over the token embeddings).
3. Cluster the sentence embeddings with k-means.
4. For each cluster, keep the sentence closest to the cluster centroid.
5. Output the kept sentences in their original document order as the summary.

I. File directory

[Figure: project file directory]

II. Clustering-based summary (main.py)

import re
import torch
from transformers import BertTokenizer    # tokenizer for Chinese text
from transformers import AlbertModel      # pre-trained ALBERT model used to produce embeddings
from nltk.cluster import KMeansClusterer  # k-means clustering
from scipy.spatial import distance_matrix # pairwise distance computation
import nltk
import pandas as pd
content = """
(article text goes here)
"""
title = 'Summary'


# ********** Split into sentences and strip punctuation ********** #
def split_document(para):
    # Split on Chinese sentence/clause delimiters and newlines
    line_split = re.split(r'[。!;?,,]|\n', para.strip())
    # Remove any remaining punctuation
    _seg_sents = [re.sub(r'[^\w\s]', '', sent) for sent in line_split]
    # Drop empty segments
    _seg_sents = [sent for sent in _seg_sents if sent != '']
    return _seg_sents

# sentences=['新冠肺炎疫情暴发以来', '频繁出现的无症状感染者病例', '再次引起恐慌', '近日'...]
sentences = split_document(content)


# ********** Compute sentence vectors ********** #
# Mean pooling: take the attention mask into account so padding tokens do not distort the average
def mean_pooling(model_output, attention_mask):
    # Token embeddings for all sentences: [batch, seq_len+2, hidden_dim], e.g. [2, 15, 312]
    token_embeddings = model_output[0]
    # Expand the attention mask: [batch, seq_len+2] -> [batch, seq_len+2, hidden_dim]
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    # Sum the unmasked token embeddings and divide by the number of real (unmasked) tokens
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
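
# Toy illustration (assumed numbers, not taken from the post): with token_embeddings of
# shape [1, 3, 2] and attention_mask = [[1, 1, 0]], the padded third token is zeroed out,
# so the result is the average of the first two token vectors only.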

# Download / load the pre-trained tokenizer and model
tokenizer = BertTokenizer.from_pretrained('clue/albert_chinese_tiny')
model = AlbertModel.from_pretrained('clue/albert_chinese_tiny')

# Compute an embedding for each sentence
def _get_sentence_embeddings(sentences):
    # Tokenize the sentences. For the two example sentences the tokenizer returns three tensors:
    # 'input_ids' (with [CLS]/[SEP] added): tensor([[101, 3173, 1094, 5511, 4142, 4554, 2658, 3274, 1355, 809, 3341, 102, 0, 0, 0],
    #                                               [101, 7574, 5246, 1139, 4385, 4638, 3187, 4568, 4307, 2697, 3381, 5442, 4567, 891, 102]])
    # 'token_type_ids': all zeros (single-segment input)
    # 'attention_mask': 1 for real tokens, 0 for padding
    encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
    # Compute token embeddings.
    # torch.no_grad(): every tensor created inside this block has requires_grad=False (no gradients at inference).
    # model_output contains two tensors:
    #   last_hidden_state: [batch, seq_len+2, hidden_dim] = [2, 15, 312]
    #   pooler_output:     [batch, hidden_dim]            = [2, 312]
    with torch.no_grad():
        model_output = model(**encoded_input)
    # Mean pooling with the attention mask -> [batch, hidden_dim]
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
    return sentence_embeddings

# Sentence embeddings: [batch, hidden_dim]
sentence_embeddings = _get_sentence_embeddings(sentences)
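# Optional, illustrative sketch: for long documents the padded batch can get large, so the
# sentences could also be embedded in smaller batches, e.g. with a batch size of 32:
#     sentence_embeddings = torch.cat(
#         [_get_sentence_embeddings(sentences[i:i + 32])
#          for i in range(0, len(sentences), 32)], dim=0)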


# ********** Cluster the sentence vectors directly ********** #
NUM_CLUSTERS = 10  # number of clusters (= number of summary sentences)
iterations = 25    # number of k-means runs (repeats); the best clustering is kept
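# Illustrative safeguard (assumption): k-means needs at least as many points as clusters,
# so the cluster count could be capped by the number of sentences:
#     NUM_CLUSTERS = min(NUM_CLUSTERS, len(sentences))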
X = sentence_embeddings.numpy()
# k-means clustering with cosine distance
kclusterer = KMeansClusterer(NUM_CLUSTERS, distance=nltk.cluster.util.cosine_distance,
                             repeats=iterations, avoid_empty_clusters=True)
# assigned_clusters = [6, 6, 4, 8, 6, 8, 6, 2, ...]
assigned_clusters = kclusterer.cluster(X, assign_clusters=True)  # cluster id for every sentence


# ********** Compute each sentence's distance to its cluster centroid ********** #
# data (example rows):
#       sentence                       embedding          cluster   centroid              distance_from_centroid
# 0     新冠肺炎疫情暴发以来            [-0.2, 0.3, ...]      2       [-0.17, 0.20, ...]    3.476364
# 1     频繁出现的无症状感染者病例      [-0.2, 0.1, ...]      9       [-0.19, -0.16, ...]   3.096487
data = pd.DataFrame(sentences)
data.columns = ['sentence']
data['embedding'] = sentence_embeddings.numpy().tolist()  # .tolist() converts the array to nested lists
# Cluster id for each sentence (10 clusters)
data['cluster'] = pd.Series(assigned_clusters, index=data.index)
# Centroid vector of each cluster (mean of the cluster's sentence embeddings)
data['centroid'] = data['cluster'].apply(lambda x: kclusterer.means()[x])
# Euclidean distance between a sentence embedding and its cluster centroid
def distance_from_centroid(row):
    return distance_matrix([row['embedding']], [row['centroid'].tolist()])[0][0]
data['distance_from_centroid'] = data.apply(distance_from_centroid, axis=1)
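# Note: the clustering above uses cosine distance, while distance_matrix measures Euclidean
# distance. An illustrative alternative is to rank sentences by cosine distance to the
# centroid as well:
#     data['distance_from_centroid'] = data.apply(
#         lambda row: nltk.cluster.util.cosine_distance(row['embedding'], row['centroid']),
#         axis=1)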


# ********** Build the summary ********** #
# 1. Sort all sentences by their distance to the centroid (ascending)
# 2. Group by cluster and keep the closest sentence of each cluster (head(1))
# 3. sort_index() restores the original document order of the kept sentences
summary = data.sort_values('distance_from_centroid', ascending=True).groupby('cluster').head(1).sort_index()['sentence'].tolist()
print(summary)
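
summary is a list of the selected sentences, already in their original document order. A minimal follow-up sketch (assuming Chinese full stops as separators) to print them as one paragraph:

print('。'.join(summary) + '。')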

Experimental results

The resulting summary contains 10 sentences, one per cluster.
