# ch06 词云 (word clouds)
#6.2.2
import jieba
from collections import Counter
# Read the whole corpus; the context manager closes the file handle as soon
# as the text is in memory (the bare open(...).read() left it open).
with open(r"F:\code\数据及相关的资料\pachong.txt", encoding='utf-8') as corpus_file:
    content = corpus_file.read()

# Keep only tokens of two or more characters, then print the ten most
# frequent ones.
con_words = [x for x in jieba.cut(content) if len(x) >= 2]
print(Counter(con_words).most_common(10))

# Segment a sample sentence after loading the user dictionary, and print the
# tokens joined by commas.
txt = '欧阳建国是创新办主任依然是欢聚时代云公司云计算方面的专家'
jieba.load_userdict(r'F:\code\数据及相关的资料\user_dict.txt')
print(','.join(jieba.cut(txt)))
# Sample output: 欧阳建国,是,创新办,主任,依然,是,欢聚时代,云,公司,云计算,方面,的,专家
#6.3 文本词云图
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Read the corpus line by line; iterating the file object yields one list
# entry per line. The context manager closes the handle afterwards.
with open(r"F:\code\数据及相关的资料\pachong.txt", encoding='utf-8') as text:
    mylist = list(text)

# Segment every line with jieba and join the tokens with spaces, producing
# the space-separated text that is passed to WordCloud.generate().
word_list = [" ".join(jieba.cut(sentence)) for sentence in mylist]
new_text = ' '.join(word_list)

# Build the cloud on a black background using the 'simhei.ttf' font.
wordcloud = WordCloud(
    font_path='simhei.ttf',
    background_color="black",
).generate(new_text)

# Show the cloud without axes.
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
#6.4 根据轮廓词云图的制作
#根据文本生成词云
import jieba
from PIL import Image
from wordcloud import WordCloud, STOPWORDS
from imageio import imread
import matplotlib.pyplot as plt
# Read the corpus line by line; iterating the file object yields one list
# entry per line. The context manager closes the handle afterwards.
with open(r"F:\code\数据及相关的资料\pachong.txt", encoding='utf-8') as content:
    mylist = list(content)

# Segment every line with jieba and join the tokens with spaces.
word_list = [" ".join(jieba.cut(sentence)) for sentence in mylist]
new_text = ' '.join(word_list)

# Image passed to WordCloud as the mask.
# NOTE(review): the mask file is named 'apchong.png' while the corpus is
# 'pachong.txt' -- confirm the file name is not a typo.
pac_mask = imread("F:\\code\\数据及相关的资料\\apchong.png")

# Build the masked cloud from the segmented text (white background, at most
# 2000 words).
wc = WordCloud(
    font_path='simhei.ttf',
    background_color="white",
    max_words=2000,
    mask=pac_mask,
).generate(new_text)

# Show the cloud without axes.
plt.imshow(wc)
plt.axis("off")
plt.show()
#根据词频生成词云
import jieba
from PIL import Image
from wordcloud import WordCloud, STOPWORDS
from imageio import imread
from collections import Counter
import matplotlib.pyplot as plt
# Read the corpus line by line; iterating the file object yields one list
# entry per line. The context manager closes the handle afterwards.
with open(r"F:\code\数据及相关的资料\pachong.txt", encoding='utf-8') as content:
    mylist = list(content)

# Segment every line with jieba and join the tokens with spaces.
word_list = [" ".join(jieba.cut(sentence)) for sentence in mylist]
new_text = ' '.join(word_list)

# Count tokens of two or more characters; the result is a plain
# {word: count} dict ordered from most to least frequent.
con_words = [x for x in jieba.cut(new_text) if len(x) >= 2]
frequencies = dict(Counter(con_words).most_common())

# Image passed to WordCloud as the mask.
# NOTE(review): the mask file is named 'apchong.png' while the corpus is
# 'pachong.txt' -- confirm the file name is not a typo.
pac_mask = imread("F:\\code\\数据及相关的资料\\apchong.png")

# Build the cloud from the word counts. The previous code computed
# `frequencies` but then called generate(new_text), so the counts were never
# used and this section reproduced the text-based cloud.
wc = WordCloud(
    font_path='simhei.ttf',
    background_color="white",
    max_words=2000,
    mask=pac_mask,
).generate_from_frequencies(frequencies)

# Show the cloud without axes.
plt.imshow(wc)
plt.axis("off")
plt.show()
"""------实验内容1.从seaborn库中获取鸢尾花数据集(iris)------"""
import seaborn as sns #导入seaborn库
# Fetch the iris dataset through seaborn and print it.
iris_data = sns.load_dataset('iris')
print(iris_data)
"""------实验内容2.计算每个特征的均值、中位数、中列数、方差、以及五数概括------"""
"""---方法1:通过np计算相关数值---"""
import numpy as np #导入numpy库
# For each of the first four columns of iris_data (the four features), print
# the mean, median, midrange, variance and five-number summary.
# The loop body is indented here; without indentation the `for` statement is
# an IndentationError.
for str_feature in iris_data.columns[:4]:
    print(str_feature)
    feature_values = iris_data[str_feature].values

    feature_min = np.min(feature_values)
    feature_max = np.max(feature_values)
    feature_mean = np.mean(feature_values)
    feature_median = np.median(feature_values)
    # Midrange: the average of the smallest and largest value.
    feature_midrange = (feature_min + feature_max)/2
    # np.var defaults to ddof=0, i.e. the population variance.
    feature_var = np.var(feature_values)
    feature_Q1 = np.percentile(feature_values, 25)
    feature_Q3 = np.percentile(feature_values, 75)

    # Five-number summary: minimum, first quartile, median, third quartile,
    # maximum.
    feature_five_numbers = [feature_min, feature_Q1, feature_median, feature_Q3, feature_max]

    print('特征 %s:均值=%.2f, 中位数=%.2f, 中列数=%.2f,方差=%.2f,'%(str_feature,feature_mean,feature_median,feature_midrange,feature_var))
    print('五数概括= %s\n'%str(feature_five_numbers))
"""---方法2:通过dataframe的describe功能---"""
# data_describe = iris_data.describe()
# for str_feature in iris_data.columns[:4]:
# print(str_feature)
# feature_describe = iris_data.describe()[str_feature]
# print('特征%s:均值=%.2f, 中位数=%.2f, 中列数=%.2f,方差=%.2f,'%(str_feature,feature_describe['mean'],feature_describe['50%'],(feature_describe['min'] + feature_describe['max'])/2,feature_describe['std']**2))
# print('五数概括= %s\n'%str(feature_five_numbers))
# print (help(np.percentile))
"""------实验3.画出每个特征的箱线图------"""
"""---方法1:使用boxplot()---"""
import matplotlib.pyplot as plt #导入matplotlib库
# Draw one box plot per feature, side by side in a single 1x4 figure.
data = iris_data.values
for indx in range(0, 4):
    # Subplot positions are 1-based: 1x4 grid, slot indx + 1 (same as the
    # three-digit form 141 + indx).
    plt.subplot(1, 4, indx + 1)
    # iris_data also holds the string 'species' column, so `.values` is an
    # object array; cast the feature column to float before plotting.
    plt.boxplot(x=data[:, indx].astype(float))
    plt.ylabel('value')
    plt.xlabel(iris_data.columns.values[indx])
# Show once, after all four subplots exist, so they appear in one figure.
plt.show()
"""---方法2:使用boxplot()的by功能---"""
# import matplotlib.pyplot as plt
#
# iris_data.boxplot(column = 'sepal_length', by = 'species')
# plt.xlabel("Species")
# plt.ylabel("Sepal Length")
# plt.title("")
# plt.show()
"""---方法3:建立特征数值矩阵---"""
# list_speal_length = [list(iris_data[iris_data['species'] == 'setosa']['sepal_length'].values)
# ,list(iris_data[iris_data['species'] == 'versicolor']['sepal_length'].values)
# ,list(iris_data[iris_data['species'] == 'virginica']['sepal_length'].values)]
# plt.boxplot(list_speal_length)
# plt.xticks([1,2,3], ["setosa",'versicolor','virginica'])
# plt.xlabel("Species")
# plt.ylabel("Sepal Length")
# plt.show()
"""------实验内容4. 画出特征sepal_length和petal_length散点图------"""
# Scatter sepal_length against petal_length, marking the classes 'setosa',
# 'versicolor' and 'virginica' in red, green and blue respectively.
setosa_rows = iris_data[iris_data['species'] == 'setosa']
versicolor_rows = iris_data[iris_data['species'] == 'versicolor']
virginica_rows = iris_data[iris_data['species'] == 'virginica']

class1_feature1 = setosa_rows['sepal_length'].values
class1_feature3 = setosa_rows['petal_length'].values
class2_feature1 = versicolor_rows['sepal_length'].values
class2_feature3 = versicolor_rows['petal_length'].values
class3_feature1 = virginica_rows['sepal_length'].values
class3_feature3 = virginica_rows['petal_length'].values

# Plot the classes in the same order as before: setosa, versicolor, virginica.
for sepal_values, petal_values, colour in (
    (class1_feature1, class1_feature3, 'r'),
    (class2_feature1, class2_feature3, 'g'),
    (class3_feature1, class3_feature3, 'b'),
):
    plt.scatter(sepal_values, petal_values, c=colour)
plt.show()
"""------实验内容5. 计算第1类setosa和第3类virginica特征平均值的欧式距离和曼哈顿距离------"""
# Euclidean and Manhattan distance between the per-feature means of the
# 'setosa' and 'virginica' classes.
#
# The means are taken over the first four columns only (the features). The
# previous np.mean(<DataFrame>) call also covered the string 'species'
# column, which cannot be averaged.
feature_columns = iris_data.columns[:4]

data_setosa = iris_data[iris_data['species'] == 'setosa']
# Per-feature means, rounded to two decimals.
val_mean_class1 = np.round(data_setosa[feature_columns].mean(), 2)

data_virginica = iris_data[iris_data['species'] == 'virginica']
val_mean_class3 = np.round(data_virginica[feature_columns].mean(), 2)

# Difference of the two mean vectors as a plain float array.
mean_difference = (val_mean_class1 - val_mean_class3).to_numpy(dtype=float)
dis_eu = np.linalg.norm(mean_difference, ord=2)   # L2 norm: Euclidean distance
dis_man = np.linalg.norm(mean_difference, ord=1)  # L1 norm: Manhattan distance
print("欧式距离:", dis_eu)
print("曼哈顿距离:", dis_man)