5. Silhouette coefficient
- A good clustering is dense inside and sparse outside: samples within one cluster should be packed tightly together, while samples in different clusters should be far apart.
  Items to cluster: TV set, leather jacket, refrigerator, down jacket
  Good!
    A: TV set, refrigerator
    B: leather jacket, down jacket
  Bad!
    A: TV set, down jacket
    B: refrigerator, leather jacket
For a given sample in the sample space, compute a, the average distance from that sample to the other samples in its own cluster, and b, the average distance from that sample to all samples in the nearest other cluster. The sample's silhouette coefficient is (b - a) / max(a, b). Taking the arithmetic mean of the silhouette coefficients of all samples in the sample space gives s, a performance score for the clustering as a whole.
-1 <-------------- 0 --------------> 1
worst    overlapping clusters     best
sm.silhouette_score(input set, predicted cluster labels,
    sample_size=number of samples, metric=distance metric) -> mean silhouette coefficient
distance metric: 'euclidean', the Euclidean distance
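To make the (b - a) / max(a, b) formula concrete, here is a minimal sketch (an addition to these notes, with a made-up 1-D dataset) that computes the coefficient of one sample by hand and checks it against sklearn's silhouette_samples:
Sketch: silhouette_check.py (hypothetical file)
# Hand-compute the silhouette coefficient of one sample of a toy dataset.
import numpy as np
import sklearn.metrics as sm
x = np.array([[1.0], [1.5], [2.0], [8.0], [8.5]])  # toy 1-D samples
labels = np.array([0, 0, 0, 1, 1])                 # two hand-assigned clusters
i = 0                              # examine sample x[0]
same = x[labels == labels[i]]      # its own cluster (includes itself)
other = x[labels != labels[i]]     # the only other cluster, hence the nearest
# a: average distance to the other members of its own cluster
a = np.abs(same - x[i]).sum() / (len(same) - 1)
# b: average distance to all samples of the nearest other cluster
b = np.abs(other - x[i]).mean()
print((b - a) / max(a, b))                  # 0.896..., hand-computed
print(sm.silhouette_samples(x, labels)[i])  # sklearn returns the same value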
Code: score.py
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import numpy as np
import sklearn.cluster as sc
import sklearn.metrics as sm
import matplotlib.pyplot as mp
x = []
with open('../../data/multiple3.txt', 'r') as f:
    for line in f.readlines():
        data = [float(substr) for substr in line.split(',')]
        x.append(data)
x = np.array(x)
# K-means clusterer
model = sc.KMeans(n_clusters=4)
model.fit(x)
centers = model.cluster_centers_
# background grid used to visualize the partition of the plane
l, r, h = x[:, 0].min() - 1, x[:, 0].max() + 1, 0.005
b, t, v = x[:, 1].min() - 1, x[:, 1].max() + 1, 0.005
grid_x = np.meshgrid(np.arange(l, r, h), np.arange(b, t, v))
flat_x = np.c_[grid_x[0].ravel(), grid_x[1].ravel()]
flat_y = model.predict(flat_x)
grid_y = flat_y.reshape(grid_x[0].shape)
pred_y = model.predict(x)
# print the mean silhouette coefficient
print(sm.silhouette_score(
    x, pred_y, sample_size=len(x), metric='euclidean'))
mp.figure('K-Means Cluster', facecolor='lightgray')
mp.title('K-Means Cluster', fontsize=20)
mp.xlabel('x', fontsize=14)
mp.ylabel('y', fontsize=14)
mp.tick_params(labelsize=10)
mp.pcolormesh(grid_x[0], grid_x[1], grid_y, cmap='gray')
mp.scatter(x[:, 0], x[:, 1], c=pred_y, cmap='brg', s=80)
mp.scatter(centers[:, 0], centers[:, 1], marker='+', c='gold',
           s=1000, linewidth=1)
mp.show()
6. DBSCAN (density-based spatial clustering of applications with noise)
- "A friend of my friend is also my friend."
  Pick an arbitrary sample from the sample space and draw a circle of a pre-specified radius around it; every sample falling inside the circle is assigned to the same cluster as that sample. Then draw circles around each newly captured sample and repeat, growing the set of captured samples until no new sample joins; this yields one cluster. Repeat the whole procedure on the remaining samples until the sample space is exhausted.
- The pre-specified radius affects the final clustering; the silhouette coefficient can be used to pick a good value.
- Based on how clusters form, samples can be divided into three categories (a sketch of the process follows this list):
  Peripheral samples: gathered into a cluster by other samples, but unable to bring in any new samples themselves.
  Isolated samples: if a group captures fewer samples than the specified minimum, it is not treated as a cluster, and its samples are called isolated samples instead.
  Core samples: all samples other than peripheral and isolated samples.
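The library call in dbscan.py below hides the circle-growing loop, so here is a minimal sketch of the process itself, written from the description above (simplified: unlike sklearn's DBSCAN it also grows circles from peripheral samples and rescans small groups; the dataset, radius, and min_samples are illustrative):
Sketch: naive_dbscan.py (hypothetical file)
import numpy as np

def naive_dbscan(x, radius, min_samples):
    labels = np.full(len(x), -1)   # -1 = unassigned / isolated
    cluster = 0
    for i in range(len(x)):
        if labels[i] != -1:
            continue               # already captured by some cluster
        # grow a cluster outward from sample i
        members, frontier = {i}, [i]
        while frontier:
            j = frontier.pop()
            # circle of the given radius around sample j
            dists = np.sqrt(((x - x[j]) ** 2).sum(axis=1))
            for k in np.where(dists <= radius)[0]:
                if k not in members:
                    members.add(k)
                    frontier.append(k)
        if len(members) >= min_samples:   # too small a group -> isolated
            labels[list(members)] = cluster
            cluster += 1
    return labels

x = np.array([[0, 0], [0.5, 0], [1, 0],
              [5, 5], [5.4, 5], [5.8, 5], [9, 9]])
print(naive_dbscan(x, radius=0.6, min_samples=3))  # [0 0 0 1 1 1 -1]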
Code: dbscan.py
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import numpy as np
import sklearn.cluster as sc
import sklearn.metrics as sm
import matplotlib.pyplot as mp
x = []
with open('../../data/perf.txt', 'r') as f:
    for line in f.readlines():
        data = [float(substr) for substr in line.split(',')]
        x.append(data)
x = np.array(x)
# try 10 radii and keep the one with the best silhouette score
epsilons, scores, models = np.linspace(0.3, 1.2, 10), [], []
for epsilon in epsilons:
    # DBSCAN clusterer
    model = sc.DBSCAN(eps=epsilon, min_samples=5)
    model.fit(x)
    score = sm.silhouette_score(
        x, model.labels_, sample_size=len(x), metric='euclidean')
    scores.append(score)
    models.append(model)
scores = np.array(scores)
best_index = scores.argmax()
best_epsilon = epsilons[best_index]
print(best_epsilon)
best_score = scores[best_index]
print(best_score)
best_model = models[best_index]
pred_y = best_model.fit_predict(x)
# core / peripheral / isolated (offset) sample masks
core_mask = np.zeros(len(x), dtype=bool)
core_mask[best_model.core_sample_indices_] = True
offset_mask = best_model.labels_ == -1
periphery_mask = ~(core_mask | offset_mask)
mp.figure('DBSCAN Cluster', facecolor='lightgray')
mp.title('DBSCAN Cluster', fontsize=20)
mp.xlabel('x', fontsize=14)
mp.ylabel('y', fontsize=14)
mp.tick_params(labelsize=10)
labels = set(pred_y)
cs = mp.get_cmap('brg', len(labels))(range(len(labels)))
mp.scatter(x[core_mask][:, 0], x[core_mask][:, 1],
           c=cs[pred_y[core_mask]], s=80, label='Core')
mp.scatter(x[periphery_mask][:, 0], x[periphery_mask][:, 1],
           edgecolor=cs[pred_y[periphery_mask]], facecolor='none',
           s=80, label='Periphery')
mp.scatter(x[offset_mask][:, 0], x[offset_mask][:, 1],
           c=cs[pred_y[offset_mask]], marker='x', s=80, label='Offset')
mp.legend()
mp.show()
XIV. Recommendation Engines
1. Euclidean distance score
   Euclidean distance score = 1 / (1 + Euclidean distance)
   As the Euclidean distance -> 0, the score -> 1; as the distance -> oo, the score -> 0.
   The score therefore lies in (0, 1]: near 0 means dissimilar, near 1 means similar.
   Pairwise score matrix (each x is the score between two users):
        a  b  c  ...
     a  x  x  x  ...
     b  x  x  x  ...
     c  x  x  x  ...
     ...
Code: es.py
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import json
import numpy as np
with open('../../data/ratings.json', 'r') as f:
    ratings = json.loads(f.read())
users, scmat = list(ratings.keys()), []
for user1 in users:
    scrow = []
    for user2 in users:
        # movies rated by both users
        movies = set()
        for movie in ratings[user1]:
            if movie in ratings[user2]:
                movies.add(movie)
        if len(movies) == 0:
            score = 0
        else:
            x, y = [], []
            for movie in movies:
                x.append(ratings[user1][movie])
                y.append(ratings[user2][movie])
            x = np.array(x)
            y = np.array(y)
            # Euclidean distance score over the shared movies
            score = 1 / (1 + np.sqrt(((x - y) ** 2).sum()))
        scrow.append(score)
    scmat.append(scrow)
users = np.array(users)
scmat = np.array(scmat)
# print the user-by-user score matrix
for scrow in scmat:
    print(' '.join('{:>5.2f}'.format(score) for score in scrow))
2. Pearson correlation score
   Correlation matrix of two users' ratings:
       /       1        correlation \
       \  correlation        1      /
   correlation coefficient = covariance / product of the standard deviations
   -1 <-------------------- 0 --------------------> 1
   negatively correlated | uncorrelated | positively correlated
   Unlike the Euclidean score, the Pearson score is insensitive to rating scale: a user who rates in the 3-5 range and one who rates in the 0-2 range can still be strongly correlated if their preferences rise and fall together.
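As a quick sanity check (an addition to these notes, with made-up ratings), the covariance-over-standard-deviations definition can be verified against np.corrcoef; the two users below rate on different scales yet are perfectly correlated:
Sketch: pearson_check.py (hypothetical file)
import numpy as np
x = np.array([5.0, 4.5, 3.0, 4.0])  # ratings in the 3-5 range
y = np.array([2.0, 1.5, 0.0, 1.0])  # same taste, shifted to the 0-2 range
# covariance / product of the standard deviations
cov = ((x - x.mean()) * (y - y.mean())).mean()
print(cov / (x.std() * y.std()))    # 1.0: perfectly positively correlated
print(np.corrcoef(x, y)[0, 1])      # np.corrcoef agrees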
Code: ps.py
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import json
import numpy as np
with open('../../data/ratings.json', 'r') as f:
    ratings = json.loads(f.read())
users, scmat = list(ratings.keys()), []
for user1 in users:
    scrow = []
    for user2 in users:
        # movies rated by both users
        movies = set()
        for movie in ratings[user1]:
            if movie in ratings[user2]:
                movies.add(movie)
        if len(movies) == 0:
            score = 0
        else:
            x, y = [], []
            for movie in movies:
                x.append(ratings[user1][movie])
                y.append(ratings[user2][movie])
            x = np.array(x)
            y = np.array(y)
            # Pearson correlation score over the shared movies
            score = np.corrcoef(x, y)[0, 1]
        scrow.append(score)
    scmat.append(scrow)
users = np.array(users)
scmat = np.array(scmat)
# print the user-by-user score matrix
for scrow in scmat:
    print(' '.join('{:>5.2f}'.format(score) for score in scrow))
3. Listing each user's similar users in descending order of similarity
Code: sim.py
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import json
import numpy as np
with open('../../data/ratings.json', 'r') as f:
    ratings = json.loads(f.read())
users, scmat = list(ratings.keys()), []
for user1 in users:
    scrow = []
    for user2 in users:
        movies = set()
        for movie in ratings[user1]:
            if movie in ratings[user2]:
                movies.add(movie)
        if len(movies) == 0:
            score = 0
        else:
            x, y = [], []
            for movie in movies:
                x.append(ratings[user1][movie])
                y.append(ratings[user2][movie])
            x = np.array(x)
            y = np.array(y)
            score = np.corrcoef(x, y)[0, 1]
        scrow.append(score)
    scmat.append(scrow)
users = np.array(users)
scmat = np.array(scmat)
for i, user in enumerate(users):
    # other users, most similar first, excluding the user himself
    sorted_indices = scmat[i].argsort()[::-1]
    sorted_indices = sorted_indices[sorted_indices != i]
    similar_users = users[sorted_indices]
    similar_scores = scmat[i, sorted_indices]
    print(user, similar_users, similar_scores, sep='\n')
4. Generating a recommendation list
   Recommendation score for a movie the user has not rated:
   - consider only similar users with a Pearson score > 0;
   - take how high those users rated the movie;
   - weight each rating by the rater's similarity:
     recommendation score = sum(similarity x rating) / sum(similarity)
Code: rcm.py
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import json
import numpy as np
with open('../../data/ratings.json', 'r') as f:
    ratings = json.loads(f.read())
users, scmat = list(ratings.keys()), []
for user1 in users:
    scrow = []
    for user2 in users:
        movies = set()
        for movie in ratings[user1]:
            if movie in ratings[user2]:
                movies.add(movie)
        if len(movies) == 0:
            score = 0
        else:
            x, y = [], []
            for movie in movies:
                x.append(ratings[user1][movie])
                y.append(ratings[user2][movie])
            x = np.array(x)
            y = np.array(y)
            score = np.corrcoef(x, y)[0, 1]
        scrow.append(score)
    scmat.append(scrow)
users = np.array(users)
scmat = np.array(scmat)
for i, user in enumerate(users):
    # other users, most similar first, excluding the user himself
    sorted_indices = scmat[i].argsort()[::-1]
    sorted_indices = sorted_indices[sorted_indices != i]
    similar_users = users[sorted_indices]
    similar_scores = scmat[i, sorted_indices]
    # keep only positively correlated users
    positive_mask = similar_scores > 0
    similar_users = similar_users[positive_mask]
    similar_scores = similar_scores[positive_mask]
    # similarity-weighted rating sums for each movie the user has not seen
    score_sums, weight_sums = {}, {}
    for similar_user, similar_score in zip(
            similar_users, similar_scores):
        for movie, score in ratings[similar_user].items():
            if movie not in ratings[user].keys():
                if movie not in score_sums.keys():
                    score_sums[movie] = 0
                score_sums[movie] += score * similar_score
                if movie not in weight_sums.keys():
                    weight_sums[movie] = 0
                weight_sums[movie] += similar_score
    # recommendation score = weighted average rating
    movie_ranks = {}
    for movie, score_sum in score_sums.items():
        movie_ranks[movie] = score_sum / weight_sums[movie]
    # unseen movies, highest recommendation score first
    sorted_indices = np.array(list(
        movie_ranks.values())).argsort()[::-1]
    recomms = np.array(list(
        movie_ranks.keys()))[sorted_indices]
    print(user, recomms, sep='\n')
XV. Natural Language Processing
A dialogue loop between a human and a machine (a "robot girlfriend" in the original notes):
  human speech --[speech recognition]--> text --[natural language understanding]--> semantics
  --> [business logic] --[natural language generation]--> text --[speech synthesis]--> machine speech,
  and the loop repeats.
NLTK: the Natural Language Toolkit
1. Tokenization
import nltk.tokenize as tk
tk.sent_tokenize(text) -> list of sentences
tk.word_tokenize(text) -> list of words       \
tokenizer = tk.WordPunctTokenizer()            > split slightly differently
tokenizer.tokenize(text) -> list of words     /
(e.g. word_tokenize splits "Let's" into Let + 's, while WordPunctTokenizer splits on every punctuation run, giving Let + ' + s)
Code: tkn.py
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import nltk.tokenize as tk
# requires the tokenizer models: nltk.download('punkt')
doc = "Are you curious about tokenization? " \
      "Let's see how it works! " \
      "We need to analyze a couple of sentences " \
      "with punctuations to see it in action."
print(doc)
tokens = tk.sent_tokenize(doc)
for i, token in enumerate(tokens):
    print("%2d" % (i + 1), token)
print('-' * 15)
tokens = tk.word_tokenize(doc)
for i, token in enumerate(tokens):
    print("%2d" % (i + 1), token)
print('-' * 15)
tokenizer = tk.WordPunctTokenizer()
tokens = tokenizer.tokenize(doc)
for i, token in enumerate(tokens):
    print("%2d" % (i + 1), token)
2. Stemming
import nltk.stem.porter as pt
import nltk.stem.lancaster as lc
import nltk.stem.snowball as sb
pt.PorterStemmer() -> Porter stemmer, relatively lenient
lc.LancasterStemmer() -> Lancaster stemmer, relatively aggressive
sb.SnowballStemmer(language) -> Snowball stemmer, somewhere in between
stemmer.stem(word) -> stem
Code: stm.py
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import nltk.stem.porter as pt
import nltk.stem.lancaster as lc
import nltk.stem.snowball as sb
words = ['table', 'probably', 'wolves', 'playing',
         'is', 'dog', 'the', 'beaches', 'grounded',
         'dreamt', 'envision']
pt_stemmer = pt.PorterStemmer()
lc_stemmer = lc.LancasterStemmer()
sb_stemmer = sb.SnowballStemmer('english')
# compare the three stemmers side by side
for word in words:
    pt_stem = pt_stemmer.stem(word)
    lc_stem = lc_stemmer.stem(word)
    sb_stem = sb_stemmer.stem(word)
    print('%8s %8s %8s %8s' % (
        word, pt_stem, lc_stem, sb_stem))
3. Lemmatization
nouns: plural -> singular
verbs: participle -> base form
Code: lmm.py
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import nltk.stem as ns
words = ['table', 'probably', 'wolves', 'playing',
         'is', 'dog', 'the', 'beaches', 'grounded',
         'dreamt', 'envision']
# requires the WordNet corpus: nltk.download('wordnet')
lemmatizer = ns.WordNetLemmatizer()
for word in words:
    n_lemma = lemmatizer.lemmatize(word, pos='n')  # treat as noun
    v_lemma = lemmatizer.lemmatize(word, pos='v')  # treat as verb
    print('%8s %8s %8s' % (word, n_lemma, v_lemma))
4. Bag of words
The brown dog is running. The black dog is in the black room. Running in the room is forbidden.
1 The brown dog is running
2 The black dog is in the black room
3 Running in the room is forbidden
     the brown dog is running black in room forbidden
  1   1    1    1   1    1      0   0   0      0
  2   2    0    1   1    0      2   1   1      0
  3   1    0    0   1    1      0   1   1      1
("the" appears twice in sentence 2. CountVectorizer below orders its columns alphabetically; this table follows order of first appearance.)
Code: bow.py
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import nltk.tokenize as tk
import sklearn.feature_extraction.text as ft
doc = 'The brown dog is running. ' \
      'The black dog is in the black room. ' \
      'Running in the room is forbidden.'
print(doc)
sentences = tk.sent_tokenize(doc)
print(sentences)
cv = ft.CountVectorizer()
bow = cv.fit_transform(sentences).toarray()
print(bow)
# get_feature_names() was removed in scikit-learn 1.2;
# use get_feature_names_out() on current versions
words = cv.get_feature_names_out()
print(words)
5. Term frequency (TF)
e.g. for the word 'apple':
  document 1: 5 occurrences among 8 words  -> tf = 5/8
  document 2: 10 occurrences among 100 words -> tf = 10/100
Term frequency is the bag-of-words matrix normalized row by row.
Code: tf.py
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import nltk.tokenize as tk
import sklearn.feature_extraction.text as ft
import sklearn.preprocessing as sp
doc = 'The brown dog is running. ' \
      'The black dog is in the black room. ' \
      'Running in the room is forbidden.'
print(doc)
sentences = tk.sent_tokenize(doc)
print(sentences)
cv = ft.CountVectorizer()
bow = cv.fit_transform(sentences).toarray()
print(bow)
words = cv.get_feature_names_out()  # get_feature_names() removed in sklearn 1.2
print(words)
# L1-normalize each row: counts become term frequencies
tf = sp.normalize(bow, norm='l1')
print(tf)
6. Document frequency (DF)
   number of samples containing a given word / total number of samples
7. Inverse document frequency (IDF)
   total number of samples / number of samples containing a given word
8. Term frequency-inverse document frequency (TF-IDF)
Multiply each element of the term-frequency matrix by the corresponding word's inverse document frequency. The larger the resulting value, the more that word contributes to the sample's meaning, and a learning model is built on these per-word contributions (see the sketch below for the formula sklearn actually uses).
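Note that sklearn's TfidfTransformer does not use the raw ratio above: by default it computes a smoothed logarithmic idf, ln((1 + n) / (1 + df)) + 1, multiplies the raw counts by it, and L2-normalizes each row. A small sketch (an addition to these notes, with a made-up bag-of-words matrix) reproduces the transformer's output by hand:
Sketch: tfidf_check.py (hypothetical file)
import numpy as np
import sklearn.feature_extraction.text as ft
import sklearn.preprocessing as sp
bow = np.array([[1, 0, 1, 1],   # made-up bag-of-words matrix:
                [2, 1, 0, 1],   # 3 documents x 4 words
                [0, 1, 0, 1]])
n = bow.shape[0]                # total number of documents
df = (bow > 0).sum(axis=0)      # documents containing each word
idf = np.log((1 + n) / (1 + df)) + 1        # smoothed idf (sklearn's default)
tfidf = sp.normalize(bow * idf, norm='l2')  # L2-normalize each row
print(tfidf)
print(ft.TfidfTransformer().fit_transform(bow).toarray())  # matches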
Code: tfidf.py
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import nltk.tokenize as tk
import sklearn.feature_extraction.text as ft
doc = 'The brown dog is running. ' \
      'The black dog is in the black room. ' \
      'Running in the room is forbidden.'
print(doc)
sentences = tk.sent_tokenize(doc)
print(sentences)
cv = ft.CountVectorizer()
bow = cv.fit_transform(sentences).toarray()
print(bow)
words = cv.get_feature_names_out()  # get_feature_names() removed in sklearn 1.2
print(words)
# turn the count matrix into a tf-idf matrix
tt = ft.TfidfTransformer()
tfidf = tt.fit_transform(bow).toarray()
print(tfidf)
9. Text classification (topic identification)
Code: doc.py
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import sklearn.datasets as sd
import sklearn.feature_extraction.text as ft
import sklearn.naive_bayes as nb
train = sd.load_files(
    '../../data/20news', encoding='latin1',
    shuffle=True, random_state=7)
train_data = train.data
train_y = train.target
categories = train.target_names
cv = ft.CountVectorizer()
train_bow = cv.fit_transform(train_data)
tt = ft.TfidfTransformer()
train_x = tt.fit_transform(train_bow)
# multinomial naive Bayes works directly on word-count / tf-idf features
model = nb.MultinomialNB()
model.fit(train_x, train_y)
test_data = [
    'The curveballs of right handed pitchers tend to curve to the left',
    'Caesar cipher is an ancient form of encryption',
    'This two-wheeler is really good on slippery roads']
# reuse the fitted vectorizer and transformer on the test sentences
test_bow = cv.transform(test_data)
test_x = tt.transform(test_bow)
pred_test_y = model.predict(test_x)
for sentence, index in zip(test_data, pred_test_y):
    print(sentence, '->', categories[index])
MultinomialNB consumes one count (or tf-idf) vector per sample, e.g. for a 6-word vocabulary:
  word:     1 2 3 4 5 6
  sample A: 2 3 0 0 1 4
  sample B: 0 4 1 1 2 2
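Three hand-written test sentences are a thin evaluation; a quick cross-validation on the training matrix gives a more honest score (a sketch added to these notes, assuming the same 20news directory; cv=5 and the scoring choice are arbitrary):
Sketch: cv_check.py (hypothetical file)
import sklearn.datasets as sd
import sklearn.feature_extraction.text as ft
import sklearn.model_selection as ms
import sklearn.naive_bayes as nb
train = sd.load_files('../../data/20news', encoding='latin1',
                      shuffle=True, random_state=7)
tfidf = ft.TfidfTransformer().fit_transform(
    ft.CountVectorizer().fit_transform(train.data))
# 5-fold cross-validated weighted F1 score of MultinomialNB
scores = ms.cross_val_score(nb.MultinomialNB(), tfidf, train.target,
                            cv=5, scoring='f1_weighted')
print(scores.mean())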
10. Gender identification
Code: gndr.py
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import random
import numpy as np
import nltk.corpus as nc
import nltk.classify as cf
# requires the names corpus: nltk.download('names')
male_names = nc.names.words('male.txt')
female_names = nc.names.words('female.txt')
models, acs = [], []
# try the last 1..5 letters of each name as the feature
for n_letters in range(1, 6):
    data = []
    for male_name in male_names:
        feature = {'feature': male_name[-n_letters:].lower()}
        data.append((feature, 'male'))
    for female_name in female_names:
        feature = {'feature': female_name[-n_letters:].lower()}
        data.append((feature, 'female'))
    random.seed(7)
    random.shuffle(data)
    # first half for training, second half for testing
    train_data = data[:int(len(data) / 2)]
    test_data = data[int(len(data) / 2):]
    model = cf.NaiveBayesClassifier.train(train_data)
    ac = cf.accuracy(model, test_data)
    models.append(model)
    acs.append(ac)
# keep the suffix length with the best test accuracy
best_index = np.array(acs).argmax()
best_letters = best_index + 1
best_model = models[best_index]
best_ac = acs[best_index]
print(best_letters, best_ac)
names, genders = [
    'Leonardo', 'Amy', 'Sam', 'Tom', 'Katherine',
    'Taylor', 'Susanne'], []
for name in names:
    feature = {'feature': name[-best_letters:].lower()}
    gender = best_model.classify(feature)
    genders.append(gender)
for name, gender in zip(names, genders):
    print(name, '->', gender)
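To see which suffixes the classifier actually relies on, NLTK's NaiveBayesClassifier can list its strongest features; one line appended to gndr.py does it (the count 10 is arbitrary):
# appended to gndr.py: list the 10 most discriminative suffix features
best_model.show_most_informative_features(10)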