import pandas as pd
from sklearn.cluster import KMeans
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
def embedding(sentences, batch_size=64):
    """Encode sentences into L2-normalized sentence embeddings.

    Loads the ``sentence-transformers/all-MiniLM-L6-v2`` tokenizer and model,
    runs the sentences through the model in batches, mean-pools the token
    embeddings of each sentence (ignoring padding) and L2-normalizes the
    result.

    Args:
        sentences: A single string or an iterable of strings to encode.
        batch_size: Maximum number of sentences per forward pass. Bounds peak
            memory use for large inputs.

    Returns:
        A 2-D ``torch.Tensor`` with one unit-length row per sentence, located
        on the CUDA device when one is available and on the CPU otherwise.
        An empty input yields a tensor with zero rows.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    # Accept a bare string as a one-sentence batch; slicing it below would
    # otherwise split it into characters.
    if isinstance(sentences, str):
        sentences = [sentences]
    else:
        sentences = list(sentences)

    model_name = 'sentence-transformers/all-MiniLM-L6-v2'
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).to(device)
    model.eval()

    pooled_batches = []
    with torch.no_grad():
        for start in range(0, len(sentences), batch_size):
            batch = sentences[start:start + batch_size]
            encoded_input = tokenizer(
                batch, padding=True, truncation=True, return_tensors='pt'
            ).to(device)
            # First element of the model output holds all token embeddings.
            token_embeddings = model(**encoded_input)[0]
            # Mean-pool over real tokens only: the attention mask zeroes out
            # padding positions. The published usage for this checkpoint
            # applies mean pooling rather than taking the [CLS] vector.
            mask = encoded_input['attention_mask'].unsqueeze(-1).to(token_embeddings.dtype)
            summed = (token_embeddings * mask).sum(dim=1)
            token_counts = mask.sum(dim=1).clamp(min=1e-9)
            pooled_batches.append(F.normalize(summed / token_counts, p=2, dim=1))

    if not pooled_batches:
        # No sentences: return a well-formed empty matrix instead of failing
        # inside the tokenizer.
        return torch.empty((0, model.config.hidden_size), device=device)
    return torch.cat(pooled_batches, dim=0)
def _render_cluster_html(cluster_result):
    """Render clustered ``(question, type)`` pairs as one HTML document string.

    Args:
        cluster_result: Mapping of zero-based cluster index to a list of
            ``(question, type)`` string pairs.

    Returns:
        The complete HTML page as a single string.
    """
    import html  # local import keeps this block self-contained

    parts = ['<html><head><meta charset="utf-8"></head><body>']
    for index, cluster in cluster_result.items():
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # summary line is stable between runs (a set has no defined order).
        distinct_types = dict.fromkeys(typ for _, typ in cluster)
        summary = f"Cluster {index + 1}: {len(cluster)} questions"
        summary += f"<br>Summary: {', '.join(html.escape(typ) for typ in distinct_types)}"
        parts.append(f"<h2>{summary}</h2>")
        parts.append("<ul>")
        # Cell contents come from the spreadsheet, so escape them before
        # embedding them in markup.
        parts.extend(
            f"<li>{html.escape(typ)}: {html.escape(question)}</li>"
            for question, typ in cluster
        )
        parts.append("</ul>")
    parts.append("</body></html>")
    return "".join(parts)


def cluster_and_summarize(file_path, cluster_num, output_path="cluster_result.html",
                          random_state=None):
    """Cluster question descriptions from an Excel sheet and write an HTML report.

    The sheet is read without a header row; its first column is treated as the
    question description and its second as the question type. Descriptions are
    embedded with ``embedding``, grouped with KMeans, and the groups are
    written to ``output_path`` as an HTML page listing each cluster's size,
    its distinct types and its questions.

    Args:
        file_path: Path of the Excel file to read.
        cluster_num: Number of KMeans clusters to form.
        output_path: Where to write the HTML report.
        random_state: Optional seed forwarded to KMeans for reproducible
            clustering.

    Raises:
        ValueError: If the sheet contains no non-empty description.
    """
    # Read the Excel file.
    df = pd.read_excel(file_path, header=None, names=['description', 'type'])
    # Rows without a description cannot be embedded; drop them. Coerce the
    # remaining cells to text so numeric or missing cells do not break the
    # tokenizer or the string joins in the report.
    df = df.dropna(subset=['description'])
    questions = df['description'].astype(str).tolist()
    types = df['type'].fillna('').astype(str).tolist()
    if not questions:
        raise ValueError(f"No question descriptions found in {file_path!r}")

    # Embed the question descriptions.
    X = embedding(questions)

    # Cluster with KMeans. n_init is pinned so the result does not depend on
    # the default of whichever scikit-learn version is installed.
    kmeans = KMeans(n_clusters=cluster_num, n_init=10, random_state=random_state)
    kmeans.fit(X.detach().cpu().numpy())

    # Group every question under the cluster it was assigned to.
    cluster_result = {index: [] for index in range(cluster_num)}
    for label, question, typ in zip(kmeans.labels_, questions, types):
        cluster_result[int(label)].append((question, typ))

    # Save the report; the explicit encoding matches the charset declared in
    # the page and keeps non-ASCII text intact on every platform.
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(_render_cluster_html(cluster_result))
    print(f"Clustering completed and results saved to {output_path}")
# Entry point: cluster the questions in the Excel file into 5 groups and
# write the summary report.
cluster_and_summarize(file_path='your_excel_file.xlsx', cluster_num=5)