kmeans聚类
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
#data= pd.read_excel(r"C:\Users\11982\Desktop\聚类.xlsx", usecols=[1, 2]) #
data=[
[0.662926761,100],
[0.79773726,100],
[0.802320207,60],
[0.872275896,60],
[0.761221676,80],
[0.773353782,100],
[0.797992117,100],
[0.782069623,100],
[0.788350727,100],
[0.790734294,80],
[0.612311172,60],
[0.709983776,80],
[0.777329562,100],
[0.265481755,60],
[0.782580006,100],
[0.782693916,100],
[0.767480709,100],
[0.773139689,100],
[0.768696679,100],
[0.77090429,80],
[0.775311097,80],
[0.770738196,100],
[0.769252381,80],
[0.774494932,100],
[0.771968875,60],
[0.769280484,100],
[0.767510088,100],
[0.774274057,80],
[0.773116532,80],
[0.772376583,100],
[0.772672911,80],
[0.768124975,80],
[0.771352894,80],
[0.780186026,80],
[0.768723999,80],
[0.77183411,80],
[0.772863576,60],
[0.772572453,60],
[0.772215203,60],
[0.521591749,100],
[0.65076882,80],
[0.769602484,60],
[0.769643967,60],
[0.769731438,60],
[0.77280591,100],
[0.769911033,60],
[0.771485438,60],
[0.780361136,80],
[0.769699451,60],
[0.771315925,100],
[0.769971951,60],
[0.768289309,60],
[0.76911563,80],
[0.770884,80],
[0.770624857,100],
[0.7650869,80],
[0.770341227,80],
[0.769707428,80],
[0.770007663,80],
[0.770018548,100],
[0.770648347,80],
[0.690994033,80],
[0.770593756,80],
[0.769299853,60],
[0.525127568,60],
[0.76923322,80],
[0.7684301,80],
[0.764146217,60],
[0.769542,60],
[0.51956417,80],
[0.769261447,60],
[0.770123585,80],
[0.76952227,60],
[0.265245046,60],
[0.771006522,80],
[0.768790991,60],
[0.769759738,100],
[0.515996397,80],
[0.769945241,100],
[0.770920878,80],
[0.516312207,60],
[0.769825525,100],
[0.768877553,100],
[0.769848137,60],
[0.769861442,100],
[0.730780191,60],
[0.517739666,80],
[0.770035063,60],
[0.517454483,80],
[0.896496999,60],
[0.518935233,80],
[0.769835963,80],
[0.770364039,60],
[0.517313111,60],
[0.76991748,80],
[0.5183545,60]
]
# Fit k-means with 5 clusters on the two-column samples held in `data`.
estimator = KMeans(n_clusters=5)
res = estimator.fit_predict(data)
lable_pred = estimator.labels_            # cluster index assigned to each sample
centroids = estimator.cluster_centers_    # coordinates of the cluster centres
inertia = estimator.inertia_              # within-cluster sum of squared distances

# Report the labels, the centres and the inertia, one after another.
print(lable_pred, centroids, inertia, sep="\n")

# Spare hex colours kept for runs with more than five clusters.
colors1 = '#00CED1'  # point colour (dark turquoise)
colors2 = '#DC143C'
colors3 = '#8E6B23'  # ochre / sienna
colors4 = '#4D4DFF'  # neon blue
colors5 = '#800000'  # maroon
colors6 = '#FFA500'  # orange
colors7 = '#008B8B'  # dark cyan
colors8 = '#8F8FBD'  # light steel blue

# Colour used for each cluster index 0..4 (n_clusters=5 guarantees this range).
cluster_colors = ('red', 'black', 'blue', 'green', 'yellow')

# Draw every sample as its own scatter point, coloured by its cluster.
for (px, py), label in zip(data, lable_pred):
    plt.scatter(px, py, color=cluster_colors[int(label)])

# Only the tick positions are fixed; axis limits are left to matplotlib.
plt.xticks(np.arange(0, 1, 0.1))
plt.yticks(np.arange(50, 110, 5))
plt.show()
多项式拟合
import numpy as np
import matplotlib.pyplot as plt
# Scatter coordinates to fit: x positions ...
x = np.array([
    0.04, 0.0425, 0.0465, 0.0505, 0.0545, 0.0585, 0.0625, 0.0665,
    0.0705, 0.0745, 0.0785, 0.0825, 0.0865, 0.0905, 0.0945, 0.0985,
    0.1025, 0.1065, 0.1105, 0.1145, 0.1185, 0.1225, 0.1265, 0.1305,
    0.1345, 0.1385, 0.1425, 0.1465, 0.15,
])
# ... and the observed value at each x position.
num = [
    0, 0.066799583, 0.13505206, 0.20658008, 0.276812293, 0.302883401,
    0.370215852, 0.406296668, 0.458295295, 0.508718692, 0.544408837,
    0.548493958, 0.588765696, 0.625764576, 0.635605146, 0.673527424,
    0.696925431, 0.705315993, 0.742936326, 0.776400729, 0.762022595,
    0.791503697, 0.814998933, 0.822297861, 0.835301602, 0.845747745,
    0.842070844, 0.868159536, 0.885864919,
]
y = np.array(num)

# Least-squares fit of a degree-3 polynomial; f1 holds its coefficients.
f1 = np.polyfit(x, y, 3)
print('f1 is :\n', f1)

# Wrap the coefficients in a callable polynomial object.
p1 = np.poly1d(f1)
print('p1 is :\n', p1)

# Fitted y values at the sample positions (same result as calling p1(x)).
yvals = np.polyval(f1, x)
print('yvals is :\n', yvals)

# Plot the raw samples as squares and the fitted curve as a red line.
plot1 = plt.plot(x, y, 's', label='original values')
plot2 = plt.plot(x, yvals, 'r', label='polyfit values')
plt.xlabel('x')
plt.ylabel('y')
plt.legend(loc=4)  # loc=4 places the legend in the lower-right corner
plt.title('B')
plt.show()
图表
折线图(单)
import pyecharts.options as opts
from pyecharts.charts import Line
# Category labels for the x axis and the single series plotted against them.
x_data = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
y_data = [820, 932, 901, 934, 1290, 1330, 1320]

# Build the chart step by step instead of through one fluent chain.
line_chart = Line()

# Global options: tooltip hidden, category x axis, value y axis with
# tick marks and split lines shown.
line_chart.set_global_opts(
    tooltip_opts=opts.TooltipOpts(is_show=False),
    xaxis_opts=opts.AxisOpts(type_="category"),
    yaxis_opts=opts.AxisOpts(
        type_="value",
        axistick_opts=opts.AxisTickOpts(is_show=True),
        splitline_opts=opts.SplitLineOpts(is_show=True),
    ),
)

line_chart.add_xaxis(xaxis_data=x_data)

# One series drawn with hollow-circle markers and no value labels.
line_chart.add_yaxis(
    series_name="注释",
    y_axis=y_data,
    symbol="emptyCircle",
    is_symbol_show=True,
    label_opts=opts.LabelOpts(is_show=False),
)

# Write the chart to an HTML file.
line_chart.render("basic_line_chart.html")
折线图(多)
import pyecharts.options as opts
from pyecharts.charts import Line
# Weekday labels for the x axis.
x_data = ["周一", "周二", "周三", "周四", "周五", "周六", "周日"]
y_data = [820, 932, 901, 934, 1290, 1330, 1320]

# (series name, values) pairs, listed in the order they are added to the chart.
stacked_series = [
    ("邮件营销", [120, 132, 101, 134, 90, 230, 210]),
    ("联盟广告", [220, 182, 191, 234, 290, 330, 310]),
    ("视频广告", [150, 232, 201, 154, 190, 330, 410]),
    ("直接访问", [320, 332, 301, 334, 390, 330, 320]),
    ("搜索引擎", [820, 932, 901, 934, 1290, 1330, 1320]),
]

stacked_chart = Line()
stacked_chart.add_xaxis(xaxis_data=x_data)

# Every series shares the same stack key and hides its value labels.
for series_name, series_values in stacked_series:
    stacked_chart.add_yaxis(
        series_name=series_name,
        stack="总量",
        y_axis=series_values,
        label_opts=opts.LabelOpts(is_show=False),
    )

# Title, axis-triggered tooltip, value y axis with ticks and split lines,
# and a category x axis with boundary_gap disabled.
stacked_chart.set_global_opts(
    title_opts=opts.TitleOpts(title="折线图堆叠"),
    tooltip_opts=opts.TooltipOpts(trigger="axis"),
    yaxis_opts=opts.AxisOpts(
        type_="value",
        axistick_opts=opts.AxisTickOpts(is_show=True),
        splitline_opts=opts.SplitLineOpts(is_show=True),
    ),
    xaxis_opts=opts.AxisOpts(type_="category", boundary_gap=False),
)

# Write the chart to an HTML file.
stacked_chart.render("stacked_line_chart.html")
散点图
from pyecharts import options as opts
from pyecharts.charts import EffectScatter
# Category labels (x axis) and the value plotted for each of them (y axis).
data1 = ['郑州','发生','大','实时']
data2=[3148.8,4750.08,5896.84,4828.69]

# Assemble the effect-scatter chart one call at a time.
effect_chart = EffectScatter()
effect_chart.add_xaxis(data1)
effect_chart.add_yaxis("", data2)  # the series has an empty name
effect_chart.set_global_opts(title_opts=opts.TitleOpts(title="Scatter"))

# c receives whatever render() returns after writing the HTML file.
c = effect_chart.render("effectscatter_base.html")
import pyecharts.options as opts
from pyecharts.charts import Scatter
# (x, y) sample points.
data = [
    [10.0, 8.04],
    [8.0, 6.95],
    [13.0, 7.58],
    [9.0, 8.81],
    [11.0, 8.33],
    [14.0, 9.96],
    [6.0, 7.24],
    [4.0, 4.26],
    [12.0, 10.84],
    [7.0, 4.82],
    [5.0, 5.68],
]

# Sort the points in place by their x value, then split them into columns.
data.sort(key=lambda point: point[0])
x_data, y_data = map(list, zip(*data))

# Chart canvas of 800px by 500px.
scatter_chart = Scatter(init_opts=opts.InitOpts(width="800px", height="500px"))
scatter_chart.add_xaxis(xaxis_data=x_data)

# One series with 20px symbols and no value labels.
scatter_chart.add_yaxis(
    series_name="注释",
    y_axis=y_data,
    symbol_size=20,
    label_opts=opts.LabelOpts(is_show=False),
)
scatter_chart.set_series_opts()

# Both axes are numeric with split lines; the tooltip is shown.
scatter_chart.set_global_opts(
    xaxis_opts=opts.AxisOpts(
        type_="value", splitline_opts=opts.SplitLineOpts(is_show=True)
    ),
    yaxis_opts=opts.AxisOpts(
        type_="value",
        axistick_opts=opts.AxisTickOpts(is_show=True),
        splitline_opts=opts.SplitLineOpts(is_show=True),
    ),
    tooltip_opts=opts.TooltipOpts(is_show=True),
)

# Write the chart to an HTML file.
scatter_chart.render("basic_scatter_chart.html")
箱线图
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
# Global matplotlib font settings.
# NOTE(review): presumably chosen so the Chinese labels render -- confirm the
# font is installed on the target machine.
font = dict(family='MicroSoft YaHei', weight='bold', size='12')
matplotlib.rc("font", **font)

# Twelve readings per area; each key becomes one box in the plot.
data = {
    '宿舍区': [
        7236.02, 4662.25, 8786.23, 8783.97, 9194.18, 8970.89,
        8768.35, 9495.18, 8842.41, 16186.7, 16582.08, 16207.68,
    ],
    '食堂区': [
        1303.37, 789.72, 1348.3, 1568.75, 1834.25, 1816.25,
        2165.11, 2358.45, 2782.31, 5098.3, 4994.48, 4955.08,
    ],
    '教学楼区': [
        209.34, 91.79, 356.41, 487.11, 547.18, 535.02,
        1119.9, 979.58, 870.42, 1216.12, 938.34, 1032.72,
    ],
}
df = pd.DataFrame(data)

# One box per column, with a light dashed grid behind it.
df.plot(kind="box", title="各月水量箱线图")
plt.grid(linestyle="--", alpha=0.3)
plt.show()
柱状图
from pyecharts import options as opts
from pyecharts.charts import Bar
# Quarter labels for the x axis and one value list per series.
x=['第一季度','第二季度','第三季度','第四季度']
data1=[2454.86, 4330.32, 4315.04, 4132.87]
data2=[2348.24, 3344.08, 3454.78, 1056.26]
data3=[2475.19, 2000, 3212,3221.06 ]

bar_chart = Bar()
bar_chart.add_xaxis(x)

# Add the three series in their original order.
for series_label, series_values in (
    ("XXX第一宿舍", data1),
    ("XXX第二宿舍", data2),
    ("XXX第三宿舍", data3),
):
    bar_chart.add_yaxis(series_label, series_values)

# Only the axis names are configured; no chart title is set.
bar_chart.set_global_opts(
    yaxis_opts=opts.AxisOpts(name="用水量"),
    xaxis_opts=opts.AxisOpts(name="季度"),
)

# c receives whatever render() returns after writing the HTML file.
c = bar_chart.render("bar_xyaxis_name.html")
填充图形指定区域
import numpy as np
import matplotlib.pyplot as plt
# Sample data: a sine wave over [0, 4*pi) in steps of 0.01.
x = np.arange(0.0, 4.0*np.pi, 0.01)
y = np.sin(x)

# Draw the sine curve and a horizontal baseline at y = 0.
plt.plot(x, y)
plt.plot((x.min(), x.max()), (0, 0))

# Axis labels.
plt.xlabel('x')
plt.ylabel('y')

# Fill between the curve and zero where 2.3 < x < 4.3 or x > 10.
purple_region = ((x > 2.3) & (x < 4.3)) | (x > 10)
plt.fill_between(x, y, where=purple_region, facecolor='purple')

# fill_between can be called repeatedly; add a second region for 7 < x < 8.
green_region = (x > 7) & (x < 8)
plt.fill_between(x, y, where=green_region, facecolor='green')

plt.show()
import matplotlib.pyplot as plt
import numpy as np
# 100 evenly spaced samples of sin(x)^2 + cos(x) on [0.05, 5].
x = np.linspace(0.05, 5, 100)
sin_x = np.sin(x)
y = sin_x**2 + np.cos(x)

# Thin magenta dash-dot curve with a legend entry.
plt.plot(x, y, label="example", color="magenta", linewidth=1, linestyle="-.")
plt.legend()

# Draw solid cyan grid lines.
plt.grid(color="cyan", linestyle="-")
plt.show()
当不知道随机变量的概率模型服从哪个分布时,可用均匀分布
- 误差、命中率、身高体重等服从正态分布
应用举例
某食品加工厂主要生产即食产品,一般当天生产的产品必须当天售出,否则就会出现不能保质或变质的情况,造成一定的经济损失;如果市场需求量大而生产量不足,则也会影响工厂的销售收入。该产品的单位成本为1.5元,单位产品售价为4元。工厂为了避免产品滞销、存货过多而造成经济损失,提出了如何制定合理的生产与库存数量的方案问题,使工厂能有尽可能多的收益。经初步考虑,拟从以下两种生产与库存方案中选出一个较好的方案:
方案(1):按前一天的销售量作为当天的生产库存量。
方案(2):按前两天的平均销售量作为当天的生产库存量。
解题思路
利用蒙特卡罗方法随机模拟市场对该产品需求量,统计计算出按照两种不同方案T天后工厂的经济值,比较不同方案经济效益的大小,选出一个较好的方案
import numpy as np
import random
def revenue(day, S1, S21, S22, demand_mean=1500, demand_std=30,
            price=4.2, cost=2):
    """Simulate the cumulative profit of two production plans (Monte Carlo).

    Plan 1 produces as much as it sold the previous day.  Plan 2 produces
    the mean of what it sold on the previous two days.  Both plans face the
    same random daily demand, drawn from a normal distribution.

    Args:
        day: Loop bound; the simulation runs for ``day - 1`` days
            (k = 1 .. day - 1), so ``day <= 1`` returns ``(0, 0)``.
        S1: Plan 1 sales volume of the previous day.
        S21: Plan 2 sales volume of the previous day.
        S22: Plan 2 sales volume of two days ago.
        demand_mean: Mean of the daily demand.
        demand_std: Standard deviation of the daily demand.
        price: Revenue per unit sold.
        cost: Cost per unit produced.

    Returns:
        A tuple ``(all1, all2)`` with the total profit of plan 1 and plan 2.

    NOTE(review): the accompanying problem statement quotes a unit cost of
    1.5 and a unit price of 4, while the code has always used 2 and 4.2.
    The defaults keep the code's values -- confirm which pair is intended.
    """
    all1 = 0  # total profit of plan 1
    all2 = 0  # total profit of plan 2
    k = 1
    while k < day:
        produce1 = S1                # plan 1 production for today
        produce2 = (S21 + S22) / 2   # plan 2 production for today

        # Daily demand ~ N(1500, 30^2).  numpy takes the standard deviation
        # as `scale`, so pass 30; the previous `30^2` was a bitwise XOR
        # (== 28), not a power.
        need = np.random.normal(demand_mean, demand_std)

        # Sales are capped by whichever is smaller: demand or production.
        buy1 = produce1 if need > produce1 else need
        buy2 = produce2 if need > produce2 else need

        # Daily profit = revenue from units sold - cost of units produced.
        all1 += price * buy1 - cost * produce1
        all2 += price * buy2 - cost * produce2
        k += 1

        # Roll the sales history forward.  S22 must take the old S21 before
        # S21 is overwritten, hence the simultaneous assignment.
        S1 = buy1
        S21, S22 = buy2, S21
    return all1, all2
# Run 10 trials, each with random starting sales volumes, and print the
# (plan 1, plan 2) totals that revenue() returns for a `day` argument of 10.
for i in range(10):
    # x seeds plan 1; y and z seed plan 2 (z is drawn from [y, 300]).
    x, y = random.randint(200, 300), random.randint(200, 300)
    z = random.randint(y, 300)
    print(revenue(10, x, y, z))
由此观之
气泡图
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm #字体管理器
def DrawBubble(read_name):
    """Draw a bubble chart from a CSV file and show it.

    Args:
        read_name: Source passed to ``pandas.read_csv``.  The CSV must
            provide the columns ``alla`` (x values), ``account`` (y values)
            and ``number`` (bubble size and colour).
    """
    sns.set(style="whitegrid")  # seaborn plot style
    fp = pd.read_csv(read_name)
    x = fp["alla"]      # x-axis data
    y = fp["account"]   # y-axis data
    z = fp["number"]    # drives both bubble size and bubble colour

    fig, ax = plt.subplots(figsize=(12, 10))

    # Bubble area: shift z so its minimum maps to 0.1, then scale by 1000
    # so differences in value are visible as differences in size.
    sizes = (z - np.min(z) + 0.1) * 1000

    # The colormap is passed by name: plt.cm.get_cmap() was removed in
    # matplotlib 3.9, whereas a name string works on old and new versions.
    bubble = ax.scatter(x, y, s=sizes, c=z, cmap='RdYlBu',
                        linewidth=0.5, alpha=0.5)
    ax.grid()
    fig.colorbar(bubble)
    ax.set_xlabel('x', fontsize=15)  # x-axis label
    ax.set_ylabel('y', fontsize=15)  # y-axis label
    plt.show()
if __name__ == '__main__':
    # Raw string: the Windows path contains backslashes that are not valid
    # escape sequences (a SyntaxWarning on Python 3.12+).  The runtime value
    # is unchanged.
    DrawBubble(r"A:\pythonProject\可视化\聚类中心值.csv")  # bubble chart