# Machine learning: analysis of online (internet session) time
import numpy as np
import sklearn.cluster as skc #聚类模块
from sklearn import metrics #计算距离模块
import matplotlib.pyplot as plt
# Load one record per MAC address from the comma-separated log file.
# mac2id maps each MAC address to the index of its record in onlinetimes.
mac2id = {}
# onlinetimes holds one (start hour, online duration) tuple per distinct MAC.
onlinetimes = []
# `with` guarantees the file is closed even if a line fails to parse.
with open('TestData.txt', encoding='utf-8') as f:
    for line in f:
        # Skip blank lines (e.g. a trailing empty line at the end of the file).
        if not line.strip():
            continue
        fields = line.split(',')  # split once and reuse for every column
        mac = fields[2]  # MAC address
        onlinetime = int(fields[6])  # online duration
        # Hour at which the session started, e.g. 22 taken from
        # "2014-07-20 22:44:18.540000000".
        starttime = int(fields[4].split(' ')[1].split(':')[0])
        if mac not in mac2id:
            # First record for this MAC: remember where it lives in the list.
            mac2id[mac] = len(onlinetimes)
            onlinetimes.append((starttime, onlinetime))
        else:
            # Repeated MAC: overwrite with the latest record. It must be a
            # plain tuple like every other entry; wrapping it in a list would
            # make the sequence ragged and break the later array conversion.
            onlinetimes[mac2id[mac]] = (starttime, onlinetime)
# Cluster the session start hours with DBSCAN and print a short report.
# Stack the per-MAC records into an (n, 2) array: column 0 is the start
# hour, column 1 is the online duration.
real_X = np.array(onlinetimes).reshape((-1, 2))
# Cluster on the start hour only; the 0:1 slice keeps X two-dimensional.
X = real_X[:, 0:1]
db = skc.DBSCAN(eps=0.01, min_samples=20).fit(X)
labels = db.labels_  # cluster label of every sample; -1 marks noise
print('Labels:')
print(labels)
# Fraction of samples that DBSCAN labelled as noise.
raito = np.count_nonzero(labels == -1) / len(labels)
print('Noise raito:', format(raito, '.2%'))
unique_labels = set(labels)
# The noise label is not a cluster, so it is excluded from the count.
n_clusters_ = len(unique_labels) - (1 if -1 in labels else 0)
print('Estimated number of clusters: %d' % n_clusters_)
# silhouette_score only accepts between 2 and n_samples - 1 distinct labels
# and raises ValueError otherwise, so report "undefined" instead of crashing.
if 2 <= len(unique_labels) <= len(labels) - 1:
    print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels))
else:
    print("Silhouette Coefficient: undefined (%d distinct label(s))"
          % len(unique_labels))
for i in range(n_clusters_):
    print('Cluster ', i, ':')
    # Flatten the (k, 1) slice into a plain list of start hours.
    print(list(X[labels == i].flatten()))
# Histogram of the start hours using 24 bins.
# NOTE(review): no plt.show()/savefig is visible in this section -- confirm
# the figure is displayed or saved further down.
plt.hist(X, 24)