# 问题: 使用 DBSCAN 聚类后,在"计算轮廓系数,评估聚类结果"这一步总是报错,不知道原因。报错信息如下:
#     raise ValueError(
# ValueError: Number of labels is 1. Valid values are 2 to n_samples - 1 (inclusive)
# k-means分类
def kmeans_sample():
    """Cluster three sample news records with k-means and export them.

    The summary column is turned into TF-IDF vectors, split into two
    clusters with a fixed random seed, and the labelled table is written
    to ``clustered_data.xlsx`` (openpyxl engine). A confirmation line is
    printed once the file has been written. Returns ``None``.
    """
    import pandas as pd
    from sklearn.cluster import KMeans  # k-means clustering
    from sklearn.feature_extraction.text import TfidfVectorizer  # TF-IDF vectorization
    from sklearn.metrics import silhouette_score  # used only by the disabled scoring step

    # Sample data: one row per news item.
    records = pd.DataFrame({
        '新闻发布时间': ['2024-01-01', '2024-01-02', '2024-01-03'],
        '网站': ['site1', 'site2', 'site3'],
        '主体': ['主体A', '主体B', '主体C'],
        '内容摘要': ['摘要1', '摘要2', '摘要3'],
        '甲方名': ['甲方1', '甲方2', '甲方3'],
        '乙方名': ['乙方1', '乙方2', '乙方3'],
        '协议内容': ['协议1', '协议2', '协议3'],
    })

    # Vectorize the summary column with TF-IDF.
    features = TfidfVectorizer(stop_words='english').fit_transform(records['内容摘要'])

    # Number of clusters; adjust as needed.
    n_clusters = 2
    model = KMeans(n_clusters=n_clusters, random_state=42)
    labels = model.fit_predict(features)

    # Attach the cluster label of every row to the table.
    records['cluster'] = labels

    # Silhouette scoring is left disabled, as in the original:
    # score = silhouette_score(features, labels)
    # print(f'轮廓系数:{score}')

    # Save the labelled table to a new Excel file.
    output_filename = 'clustered_data.xlsx'
    records.to_excel(output_filename, index=False, engine='openpyxl')
    print(f'聚类结果已保存到 {output_filename}')
# dbscan分类
def dbscan_sample():
    """Cluster sample news summaries with DBSCAN, score them, and export.

    Steps:
      1. Build a small sample table and vectorize the summary column
         with TF-IDF.
      2. Run DBSCAN (``eps=0.5``, ``min_samples=2``) and store the label
         of every row in a ``cluster`` column (``-1`` marks noise).
      3. Print the silhouette score when it is defined, otherwise print
         why it cannot be computed.
      4. Write the labelled table to ``clustered_data_dbscan.xlsx``.

    Returns ``None``.

    Why the score is guarded: ``silhouette_score`` raises
    ``ValueError: Number of labels is 1 ...`` unless the number of
    distinct labels is between 2 and ``n_samples - 1``. DBSCAN does not
    take a cluster count, so it can legitimately return a single label,
    e.g. all ``-1`` when no point has ``min_samples`` neighbours within
    ``eps``. That is what happens with the sample data below: every
    summary is a different single token, so the L2-normalized TF-IDF rows
    are mutually orthogonal and farther apart than ``eps``.
    """
    import pandas as pd
    from sklearn.feature_extraction.text import TfidfVectorizer  # TF-IDF vectorization
    from sklearn.cluster import DBSCAN  # density-based clustering
    from sklearn.metrics import silhouette_score  # clustering quality metric

    # Sample data: one row per news item.
    data = {
        '新闻发布时间': ['2024-01-01', '2024-01-02', '2024-01-03'],
        '网站': ['site1', 'site2', 'site3'],
        '主体': ['主体A', '主体B', '主体C'],
        '内容摘要': ['摘要1', '摘要2', '摘要3'],
        '甲方名': ['甲方1', '甲方2', '甲方3'],
        '乙方名': ['乙方1', '乙方2', '乙方3'],
        '协议内容': ['协议1', '协议2', '协议3'],
    }
    df = pd.DataFrame(data)

    # Vectorize the summary column with TF-IDF.
    # NOTE(review): the English stop-word list does not filter Chinese
    # tokens, and the default tokenizer does not segment Chinese text;
    # real summaries probably need a dedicated tokenizer - confirm.
    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf_vectorizer.fit_transform(df['内容摘要'])

    # DBSCAN discovers the number of clusters itself; tune eps and
    # min_samples to the data.
    dbscan = DBSCAN(eps=0.5, min_samples=2)
    clusters = dbscan.fit_predict(tfidf_matrix)

    # Attach the cluster label of every row to the table.
    df['cluster'] = clusters

    # Evaluate with the silhouette score, ignoring noise points (-1) so
    # that noise is not scored as if it were a cluster of its own.
    kept_rows = (clusters != -1).nonzero()[0]
    kept_labels = clusters[kept_rows]
    n_found = len(set(kept_labels.tolist()))
    n_kept = len(kept_rows)
    if 2 <= n_found <= n_kept - 1:
        score = silhouette_score(tfidf_matrix[kept_rows], kept_labels)
        print(f'轮廓系数:{score}')
    else:
        # The score is undefined here; report instead of raising.
        print(
            f'无法计算轮廓系数: 有效簇数为 {n_found}, 非噪声样本数为 {n_kept} '
            f'(需要 2 <= 簇数 <= 样本数 - 1), 请调整 eps / min_samples'
        )

    # Save the labelled table to a new Excel file.
    output_filename = 'clustered_data_dbscan.xlsx'
    with pd.ExcelWriter(output_filename, engine='openpyxl') as writer:
        df.to_excel(writer, index=False)
    print(f'聚类结果已保存到 {output_filename}')