基于聚类的文本摘要实现
实现步骤:
一、文件目录
二、聚类摘要(main.py)
import re
import torch
from transformers import BertTokenizer#中文分词器
from transformers import AlbertModel#Albert预训练模型获得embedding
from nltk.cluster import KMeansClusterer#k均值聚类
from scipy.spatial import distance_matrix#距离计算模块distance
import nltk
import pandas as pd
# Placeholder for the document to summarise: replace the text between the
# triple quotes with the article body.
content: str = """
内容
"""
# Title label for the summary; it is not referenced by the visible code below.
title: str = "摘要"
# ********** Sentence splitting and punctuation cleanup ********** #
# Sentence delimiters: the original set (| 。 ! ; ? , and newline) plus the
# full-width forms of ! ; ? , (U+FF01, U+FF1B, U+FF1F, U+FF0C) that Chinese
# text normally uses. The pipe is kept so existing inputs split as before.
_SENTENCE_DELIMITERS = re.compile(r'[|。!;?,\uff01\uff1b\uff1f\uff0c\n]')
# Anything that is neither a word character nor whitespace.
_NON_WORD_CHARS = re.compile(r'[^\w\s]')


def split_document(para):
    """Split a paragraph into short, punctuation-free segments.

    The text is stripped, cut at every sentence delimiter (ASCII or
    full-width), and each piece has its remaining punctuation removed.
    Pieces that end up empty are dropped.

    Args:
        para: The raw text to split.

    Returns:
        A list of non-empty segments in their original order.
    """
    pieces = _SENTENCE_DELIMITERS.split(para.strip())
    cleaned = (_NON_WORD_CHARS.sub('', piece) for piece in pieces)
    return [sent for sent in cleaned if sent]
# Example result: sentences=['新冠肺炎疫情暴发以来', '频繁出现的无症状感染者病例', '再次引起恐慌', '近日'...]
sentences = split_document(content)
# ********** Sentence vector computation ********** #
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over the real (non-padding) tokens.

    Args:
        model_output: Indexable model output whose first element holds the
            token embeddings, shaped [batch, seq_len, hidden_dim]
            (e.g. [2, 15, 312] in the original notes).
        attention_mask: Tensor shaped [batch, seq_len]; non-zero marks the
            positions that take part in the average.

    Returns:
        A tensor shaped [batch, hidden_dim] with one pooled vector per row.
    """
    token_embeddings = model_output[0]
    # Broadcast the mask from [batch, seq_len] to the embedding shape so that
    # padded positions contribute zero to the sum.
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    summed = torch.sum(token_embeddings * input_mask_expanded, 1)
    # The clamp keeps the division finite when a row has no unmasked tokens.
    token_counts = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return summed / token_counts
# Load the pretrained tokenizer and encoder (the original note describes this
# step as downloading the model). Both come from the same checkpoint name.
_PRETRAINED_NAME = 'clue/albert_chinese_tiny'
tokenizer = BertTokenizer.from_pretrained(_PRETRAINED_NAME)
model = AlbertModel.from_pretrained(_PRETRAINED_NAME)
# Sentence embeddings
def _get_sentence_embeddings(sentences):
    """Encode sentences into one mean-pooled vector each.

    The sentences are tokenized as a single padded and truncated batch, run
    through the module-level ``model`` without gradient tracking, and the
    token embeddings are averaged by ``mean_pooling`` using the attention
    mask.

    Example from the original notes, for the two sentences
    ['新冠肺炎疫情暴发以来', '频繁出现的无症状感染者病例']:
        input_ids         -> [2, 15]; 101/102 wrap each sentence, 0 is padding
        token_type_ids    -> [2, 15]; all zeros
        attention_mask    -> [2, 15]; 1 for real tokens, 0 for padding
        last_hidden_state -> [2, 15, 312]
        pooler_output     -> [2, 312]

    Args:
        sentences: List of sentence strings.

    Returns:
        Tensor shaped [batch, hidden_dim], one row per sentence.
    """
    batch = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
    # Inference only: tensors computed under no_grad have requires_grad=False.
    with torch.no_grad():
        outputs = model(**batch)
    return mean_pooling(outputs, batch['attention_mask'])
# Sentence embeddings for every sentence: [bs, hidden_dim]
sentence_embeddings = _get_sentence_embeddings(sentences)
# ********** Cluster the sentence vectors directly ********** #
NUM_CLUSTERS = 10  # upper bound on the number of clusters
iterations = 25  # passed as `repeats`: number of randomised k-means trials
X = sentence_embeddings.numpy()
# k-means draws its initial means by sampling the input vectors, so it cannot
# be asked for more clusters than there are sentences; cap the count so short
# documents do not raise.
num_clusters = min(NUM_CLUSTERS, len(X))
kclusterer = KMeansClusterer(
    num_clusters,
    distance=nltk.cluster.util.cosine_distance,
    repeats=iterations,
    avoid_empty_clusters=True,
)
# One cluster index per sentence, e.g. assigned_clusters = [6, 6, 4, 8, 6, 8, 6, 2, ...]
assigned_clusters = kclusterer.cluster(X, assign_clusters=True)
# ********** Distance from each sentence to its centroid ********** #
# Layout of `data` (example rows from the original notes):
#    sentence                     embedding          cluster  centroid             distance_from_centroid
# 0  新冠肺炎疫情暴发以来          [-0.2, 0.3, ...]   2        [-0.17, 0.20, ...]   3.476364
# 1  频繁出现的无症状感染者病例    [-0.2, 0.1, ...]   9        [-0.19, -0.16, ...]  3.096487
data = pd.DataFrame(sentences, columns=['sentence'])
# Store each embedding as a plain Python list (one list per row).
data['embedding'] = sentence_embeddings.numpy().tolist()
# Cluster index assigned to each sentence.
data['cluster'] = pd.Series(assigned_clusters, index=data.index)
# Centroid vector of the cluster each sentence belongs to; the means are
# fetched once instead of once per row.
centroids = kclusterer.means()
data['centroid'] = data['cluster'].apply(lambda cluster_id: centroids[cluster_id])
# Distance between a sentence embedding and its cluster centroid
def distance_from_centroid(row):
    """Return the distance between ``row['embedding']`` and ``row['centroid']``.

    Both vectors are wrapped as single-row matrices and passed to
    ``scipy.spatial.distance_matrix`` with its default settings (Minkowski
    p=2, i.e. Euclidean); the only entry of the 1x1 result is returned.

    NOTE(review): the clusters are built with cosine distance elsewhere in
    this file, while this ranking is Euclidean — confirm that is intended.

    Args:
        row: Mapping-like row with an ``'embedding'`` sequence and a
            ``'centroid'`` object that provides ``.tolist()``.

    Returns:
        The distance as a scalar.
    """
    embedding_matrix = [row['embedding']]
    centroid_matrix = [row['centroid'].tolist()]
    pairwise = distance_matrix(embedding_matrix, centroid_matrix)
    return pairwise[0][0]
data['distance_from_centroid'] = data.apply(distance_from_centroid, axis=1)
# ********** Build the summary ********** #
# 1. Rank every sentence by its distance to its centroid, closest first.
# 2. Keep the first (closest) row of each cluster.
# 3. Restore the original document order and take the sentence text.
ranked = data.sort_values('distance_from_centroid', ascending=True)
closest_per_cluster = ranked.groupby('cluster').head(1)
summary = closest_per_cluster.sort_index()['sentence'].tolist()
print(summary)
实验结果
摘要最多10句:每个簇选取距离质心最近的1句,簇数由 NUM_CLUSTERS=10 决定。