【无标题】12

import html

import pandas as pd
import torch
import torch.nn.functional as F
from sklearn.cluster import KMeans
from transformers import AutoTokenizer, AutoModel

def embedding(sentences, batch_size=64):
    """Encode sentences into L2-normalized sentence embeddings.

    Loads ``sentence-transformers/all-MiniLM-L6-v2`` through the plain
    ``transformers`` API, mean-pools the token embeddings over the
    non-padding positions (weighted by the attention mask) and L2-normalizes
    each row. Mean pooling is the pooling shown on the model card for this
    checkpoint; taking only the first token does not match it.

    Args:
        sentences: A single string or an iterable of strings to encode.
        batch_size: Number of sentences per forward pass. Bounds peak memory
            when the input is large.

    Returns:
        A 2-D tensor with one row per sentence, located on ``cuda:0`` when
        CUDA is available and on the CPU otherwise.

    Raises:
        ValueError: If ``sentences`` is empty or ``batch_size`` is not
            positive.
    """
    # A bare string would otherwise be sliced character by character below.
    if isinstance(sentences, str):
        sentences = [sentences]
    else:
        sentences = list(sentences)

    # Validate before the (slow) model load so bad calls fail fast.
    if not sentences:
        raise ValueError("embedding() requires at least one sentence")
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    model_name = 'sentence-transformers/all-MiniLM-L6-v2'
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).to(device)
    model.eval()

    pooled_batches = []
    with torch.no_grad():
        for start in range(0, len(sentences), batch_size):
            batch = sentences[start:start + batch_size]
            encoded_input = tokenizer(
                batch, padding=True, truncation=True, return_tensors='pt'
            ).to(device)
            # First element of the model output holds the per-token embeddings.
            token_embeddings = model(**encoded_input)[0]
            # Zero out padding positions so they do not dilute the average.
            mask = encoded_input['attention_mask'].unsqueeze(-1).to(token_embeddings.dtype)
            summed = (token_embeddings * mask).sum(dim=1)
            counts = mask.sum(dim=1).clamp(min=1e-9)
            pooled_batches.append(summed / counts)

    return F.normalize(torch.cat(pooled_batches, dim=0), p=2, dim=1)

def cluster_and_summarize(file_path, cluster_num, output_path="cluster_result.html",
                          random_state=None):
    """Cluster the question descriptions of an Excel sheet and write an HTML report.

    The sheet is read without a header row and its columns are labelled
    ``description`` and ``type``. Descriptions are embedded with
    ``embedding``, grouped with KMeans, and every cluster is written to the
    report as a heading (size plus the distinct types it contains) followed
    by a list of its questions.

    Args:
        file_path: Path of the Excel file to read.
        cluster_num: Number of KMeans clusters.
        output_path: Where the HTML report is written.
        random_state: Forwarded to ``KMeans``; pass an int for reproducible
            cluster assignments.

    Raises:
        ValueError: If the sheet contains no non-empty description.
    """
    # Read the Excel file.
    df = pd.read_excel(file_path, header=None, names=['description', 'type'])

    # Empty cells are read as NaN floats, which are neither valid text for
    # the tokenizer nor joinable as strings; drop rows without a description
    # and blank out missing types.
    df = df.dropna(subset=['description'])
    if df.empty:
        raise ValueError(f"no question descriptions found in {file_path!r}")
    questions = df['description'].astype(str).tolist()
    types = df['type'].fillna('').astype(str).tolist()

    # Embed the question descriptions.
    X = embedding(questions)

    # Cluster with KMeans; hand scikit-learn a NumPy array, not a tensor.
    kmeans = KMeans(n_clusters=cluster_num, random_state=random_state)
    kmeans.fit(X.cpu().numpy())
    labels = kmeans.labels_

    # Group every (question, type) pair under its cluster label.
    clusters = [[] for _ in range(cluster_num)]
    for label, question, typ in zip(labels, questions, types):
        clusters[label].append((question, typ))

    # Build the HTML report. Spreadsheet text is escaped so that characters
    # such as "<" or "&" cannot break or inject markup.
    parts = ['<html><head><meta charset="utf-8"></head><body>']
    for index, cluster in enumerate(clusters, start=1):
        # Sorted so the summary line is stable from run to run.
        distinct_types = sorted({typ for _, typ in cluster if typ})

        summary = f"Cluster {index}: {len(cluster)} questions"
        summary += f"<br>Summary: {html.escape(', '.join(distinct_types))}"
        parts.append(f"<h2>{summary}</h2>")
        parts.append("<ul>")
        parts.extend(
            f"<li>{html.escape(typ)}: {html.escape(question)}</li>"
            for question, typ in cluster
        )
        parts.append("</ul>")
    parts.append("</body></html>")

    # Write as UTF-8 explicitly: the platform default encoding may not be
    # able to represent non-ASCII question text.
    with open(output_path, "w", encoding="utf-8") as f:
        f.write("".join(parts))

    print(f"Clustering completed and results saved to {output_path}")

# Run the clustering and summary only when executed as a script, so that
# importing this module does not trigger file I/O and model loading.
if __name__ == "__main__":
    cluster_and_summarize('your_excel_file.xlsx', 5)

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值