20200221_2_国家非物质文化遗产聚类分析

这个需求主要是k-means聚类算法,再加一个优化k-means++算法,这些直接调库就可以

from sklearn import metrics
import warnings
warnings.filterwarnings("ignore")  # NOTE(review): suppresses ALL warnings -- hides real problems too
# %matplotlib inline  # IPython magic: invalid syntax in plain Python, kept as a comment
import matplotlib.pyplot as plt
import pandas as pd
import jieba

# Read the raw dataset (one row per heritage item).
test=pd.read_excel('data/国家非文化遗产1.xlsx')
# Force the text column to str so jieba never receives NaN/float values.
test["内容"]=test["内容"].astype('str')

def cuttxt(x):
    """Tokenize with jieba and space-join; no cleaning, so sentiment words survive."""
    return " ".join(jieba.lcut(x))

# Tokenized text used as input for the vectorizer below.
test["clean"] = test["内容"].apply(cuttxt)
# Load the stop-word list: one word per line, UTF-8 encoded.
stpwrdpath ="data/停用词.txt"
# Text-mode open with an explicit encoding instead of 'rb' + manual decode.
with open(stpwrdpath, encoding='utf-8') as fp:
    stopword = fp.read()
# One stop word per line -> list accepted by CountVectorizer(stop_words=...).
stpwrdlst = stopword.splitlines()
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

# Term-frequency matrix: element [i][j] is the count of word j in document i.
vectorizer=CountVectorizer(stop_words=stpwrdlst)
counts = vectorizer.fit_transform(test["clean"])
# Apply TF-IDF weighting on top of the raw counts.
transformer=TfidfTransformer()
tfidf=transformer.fit_transform(counts)
# Dense array: element [i][j] is the tf-idf weight of word j in document i.
weight=tfidf.toarray()
# Vocabulary list; get_feature_names() was removed in scikit-learn 1.2.
try:
    word = vectorizer.get_feature_names_out()
except AttributeError:  # scikit-learn < 1.0 fallback
    word = vectorizer.get_feature_names()
from sklearn.cluster import KMeans
from pylab import mpl

# Configure a CJK-capable font BEFORE plotting: the Chinese title otherwise
# produces "Glyph ... missing from current font" warnings (visible in the log).
mpl.rcParams['font.sans-serif'] = ['FangSong']
mpl.rcParams['axes.unicode_minus'] = False

# Elbow method: within-cluster sum of squares (inertia) for k = 1..10.
wcss = []
for i in range(1,11):
    kmeans = KMeans(n_clusters = i, random_state = 42)
    kmeans.fit(weight)
    wcss.append(kmeans.inertia_)
plt.plot(range(1, 11), wcss)
plt.title('K-means算法')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')  # was missing; mirrors the k-means++ elbow plot below
plt.show()
D:\sofewore\anaconda\lib\site-packages\matplotlib\backends\backend_agg.py:211: RuntimeWarning: Glyph 31639 missing from current font.
  font.set_text(s, 0.0, flags=flags)
D:\sofewore\anaconda\lib\site-packages\matplotlib\backends\backend_agg.py:211: RuntimeWarning: Glyph 27861 missing from current font.
  font.set_text(s, 0.0, flags=flags)
D:\sofewore\anaconda\lib\site-packages\matplotlib\backends\backend_agg.py:180: RuntimeWarning: Glyph 31639 missing from current font.
  font.set_text(s, 0, flags=flags)
D:\sofewore\anaconda\lib\site-packages\matplotlib\backends\backend_agg.py:180: RuntimeWarning: Glyph 27861 missing from current font.
  font.set_text(s, 0, flags=flags)

(图:K-means 肘部法则曲线,原文此处插入图片)

# Final model: k = 2, chosen from the elbow plot above.
kmeans = KMeans(n_clusters = 2, random_state = 42)
# fit_predict returns one integer cluster label per document.
y=kmeans.fit_predict(weight)
# Wrap the labels in a one-column frame for counting and plotting.
data={'类别':y}
test1=pd.DataFrame(data)
# test1.head()
类别
01
11
21
31
41
# Count documents per cluster label.
housetype=test1["类别"].value_counts()

from pylab import mpl
mpl.rcParams['font.sans-serif'] = ['FangSong']  # CJK-capable default font
mpl.rcParams['axes.unicode_minus'] = False      # render '-' correctly when saving

# One-panel figure for the label-frequency bar chart.
fig, ax = plt.subplots(1,1,dpi=100)
# Top 10 labels (only 2 exist here). The original x='housetype', y='size'
# kwargs were leftovers from another tutorial and are ignored by a Series
# bar plot, so they are removed.
housetype.head(10).plot(kind='bar',title='类别数量分布',ax=ax)
plt.legend(['数量'])
plt.show()

(图:聚类类别数量分布柱状图,原文此处插入图片)

# Elbow curve again, now with explicit k-means++ seeding.
# NOTE: init='k-means++' is scikit-learn's default, so with the same
# random_state this matches the first elbow run.
from sklearn.cluster import KMeans

wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=42)
    kmeans.fit(weight)
    wcss.append(kmeans.inertia_)

plt.plot(range(1, 11), wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

(图:k-means++ 肘部法则曲线,原文此处插入图片)

# Final k-means++ model with k = 2 (same choice as the plain k-means run).
kmeans = KMeans(n_clusters = 2, init = 'k-means++', random_state = 42)
# One integer cluster label per document.
y=kmeans.fit_predict(weight)
# Single-column frame of labels, reused for the counts and the bar chart.
data={'类别':y}
test1=pd.DataFrame(data)
# test1.head()
类别
01
11
21
31
41
test1["类别"].value_counts()  # notebook-style display of cluster sizes (output shown below)
1    2692
0     458
Name: 类别, dtype: int64
# Count documents per cluster label (k-means++ run).
housetype=test1["类别"].value_counts()

from pylab import mpl
mpl.rcParams['font.sans-serif'] = ['FangSong']  # CJK-capable default font
mpl.rcParams['axes.unicode_minus'] = False      # render '-' correctly when saving

# One-panel figure for the label-frequency bar chart.
fig, ax = plt.subplots(1,1,dpi=100)
# Top 10 labels (only 2 exist). The original x='housetype', y='size' kwargs
# are ignored by a Series bar plot (tutorial leftovers) and were removed.
housetype.head(10).plot(kind='bar',title='类别数量分布',ax=ax)
plt.legend(['数量'])
plt.show()

(图:k-means++ 聚类类别数量分布柱状图,原文此处插入图片)

# Visualise the two clusters in the first two TF-IDF dimensions.
# NOTE(review): the original title/axis labels ("Clusters of customers",
# "Annual Income (k$)", "Spending Score (1-100)") were copy-pasted from a
# customer-segmentation tutorial and did not describe this data; the
# commented-out leftover code has been removed.
plt.scatter(weight[y == 0, 0], weight[y == 0, 1], s = 100, c = 'red', label = '0')
plt.scatter(weight[y == 1, 0], weight[y == 1, 1], s = 100, c = 'blue', label = '1')
plt.title('Clusters of documents')
plt.xlabel('TF-IDF feature 0')
plt.ylabel('TF-IDF feature 1')
plt.legend()
plt.show()
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值