1.什么是数据挖掘
数据挖掘关注数据中的模式发现,人们善于构建模型并进行预测,数据挖掘能够扩展这种能力,让我们能处理大量信息。
2.什么是协同过滤
它是推荐系统中一种经典的算法,该方法通过基于其他用户进行推荐,例如购物网站中当你买了一件商品后,会有提示:买了该商品的顾客同时也购买了如下商品…等等。
2.1 如何寻找相似用户?
采用的评价相似指标有如下几个:
- 曼哈顿距离(Manhattan Distance)
在二维情况下,每个用户的坐标表示为点 $(x, y)$,他们之间的曼哈顿距离可以用如下公式计算:
$$|x_{1}-x_{2}| + |y_{1}-y_{2}|$$
- 欧式距离(勾股定理)(Euclidean Distance)
$$\sqrt{(x_{1}-x_{2})^{2}+(y_{1}-y_{2})^{2}}$$
- 闵式距离(Minkowski Distance)
又叫做闵可夫斯基距离,是欧氏空间中的一种测度,被看做是欧氏距离的一种推广,欧氏距离是闵可夫斯基距离的一种特殊情况。
$$d(x,y) = \left(\sum_{k=1}^{n} |x_{k}-y_{k}|^{r}\right)^{\frac{1}{r}}$$
其中:
- r=1时,上述公式就是曼哈顿距离
- r=2时,上述公式就是欧式距离
- r=∞时,上述公式就是上确界距离,等同于切比雪夫距离
- r越大,某一维上的较大差异对最终值的影响也越大
# 代码实现
# parameter
# input: rating1, rating2 -- both are dictionaries mapping item -> rating
# return:distance
from math import sqrt
def Minkowski(rating1, rating2, r):
    """Return the Minkowski distance of order r between two rating dicts.

    Only keys present in both dictionaries contribute to the distance.
    r=1 gives the Manhattan distance and r=2 the Euclidean distance.

    rating1, rating2: dictionaries mapping item -> numeric rating
    r: order of the distance (must be non-zero)

    Returns 0 when the two dictionaries share no keys.
    """
    total = 0
    has_common = False
    for key in rating1:
        if key in rating2:
            # Accumulate |x_k - y_k|^r; the r-th root is applied once to the
            # whole sum below, not to each individual term.
            total += pow(abs(rating1[key] - rating2[key]), r)
            has_common = True
    if not has_common:
        return 0
    return pow(total, 1.0 / r)
- 皮尔逊相关系数
在数据挖掘领域中,存在"分数贬值"现象(比如不同用户的评级范围可能不同,比如各国之间的货币存在贬值现象),解决这个问题的一个方法是使用皮尔逊相关系数(Pearson Correlation Coefficient),其取值区间为[-1,1],1表示完全一致,-1表示完全不一致,利用皮尔逊相关系数可以寻找某个感兴趣用户的最相似用户。公式如下:
$$r=\frac{\sum_{i=1}^{n} x_{i}y_{i} - \frac{\sum_{i=1}^{n} x_{i}\sum_{i=1}^{n} y_{i}}{n}}{\sqrt{\sum_{i=1}^{n} x_{i}^{2} - \frac{\left(\sum_{i=1}^{n} x_{i}\right)^{2}}{n}}\,\sqrt{\sum_{i=1}^{n} y_{i}^{2} - \frac{\left(\sum_{i=1}^{n} y_{i}\right)^{2}}{n}}}$$
def pearson(rating1, rating2):
    """Return the Pearson correlation coefficient of two rating dicts.

    Uses the single-pass computational formula over the keys that appear in
    both dictionaries.

    rating1, rating2: dictionaries mapping item -> numeric rating

    Returns 0 when the dictionaries share no keys or when either set of
    common ratings has zero variance (the coefficient is undefined there).
    """
    sum_xy = 0
    sum_x = 0
    sum_y = 0
    sum_x2 = 0
    sum_y2 = 0
    n = 0
    for key in rating1:
        if key in rating2:
            n += 1
            x = rating1[key]
            y = rating2[key]
            sum_xy += x * y
            sum_x += x
            sum_y += y
            sum_x2 += pow(x, 2)
            sum_y2 += pow(y, 2)
    # No co-rated items: avoid dividing by n == 0 below.
    if n == 0:
        return 0
    # Floating-point rounding can push these terms a hair below zero when
    # all ratings are equal; clamp so sqrt() never sees a negative value.
    var_x = max(sum_x2 - pow(sum_x, 2) / n, 0.0)
    var_y = max(sum_y2 - pow(sum_y, 2) / n, 0.0)
    denominator = sqrt(var_x) * sqrt(var_y)
    if denominator == 0:
        return 0
    return (sum_xy - (sum_x * sum_y) / n) / denominator
- 余弦相似度
文本挖掘常常会用到,而且广泛应用于协同过滤中,其取值区间为[-1,1],1表示完全一致,-1表示完全不一致,公式如下:
$$\cos(x,y)=\frac{x \cdot y}{\|x\| \times \|y\|}$$
2.2 相似度的选择
- 如果数据存在分数贬值,则采用皮尔逊相关系数
- 如果数据稀疏,采用余弦相似度
- 如果数据稠密,且几乎所有属性都没有零值,则考虑用曼哈顿距离或欧氏距离
3 基于协同过滤的一个简单推荐系统
import codecs
from math import sqrt
users = {"Angelica": {"Blues Traveler": 3.5, "Broken Bells": 2.0,
"Norah Jones": 4.5, "Phoenix": 5.0,
"Slightly Stoopid": 1.5,
"The Strokes": 2.5, "Vampire Weekend": 2.0},
"Bill":{"Blues Traveler": 2.0, "Broken Bells": 3.5,
"Deadmau5": 4.0, "Phoenix": 2.0,
"Slightly Stoopid": 3.5, "Vampire Weekend": 3.0},
"Chan": {"Blues Traveler": 5.0, "Broken Bells": 1.0,
"Deadmau5": 1.0, "Norah Jones": 3.0, "Phoenix": 5,
"Slightly Stoopid": 1.0},
"Dan": {"Blues Traveler": 3.0, "Broken Bells": 4.0,
"Deadmau5": 4.5, "Phoenix": 3.0,
"Slightly Stoopid": 4.5, "The Strokes": 4.0,
"Vampire Weekend": 2.0},
"Hailey": {"Broken Bells": 4.0, "Deadmau5": 1.0,
"Norah Jones": 4.0, "The Strokes": 4.0,
"Vampire Weekend": 1.0},
"Jordyn": {"Broken Bells": 4.5, "Deadmau5": 4.0,
"Norah Jones": 5.0, "Phoenix": 5.0,
"Slightly Stoopid": 4.5, "The Strokes": 4.0,
"Vampire Weekend": 4.0},
"Sam": {"Blues Traveler": 5.0, "Broken Bells": 2.0,
"Norah Jones": 3.0, "Phoenix": 5.0,
"Slightly Stoopid": 4.0, "The Strokes": 5.0},
"Veronica": {"Blues Traveler": 3.0, "Norah Jones": 5.0,
"Phoenix": 4.0, "Slightly Stoopid": 2.5,
"The Strokes": 3.0}
}
class recommender:
    """User-based collaborative-filtering recommender.

    Ratings are held in ``self.data`` as ``{user: {item: rating}}``.
    Recommendations for a user are a similarity-weighted blend of the
    ratings given by that user's k most similar users to items the user
    has not rated.
    """

    def __init__(self, data, k=1, metric='pearson', n=5):
        """Initialize the recommender.

        data: if it is a dict, it is used as the ratings table
            ({user: {item: rating}}); for any other type the ratings table
            starts empty and can be filled with loadBookDB()
        k: the k value for k nearest neighbors
        metric: name of the similarity measure; only 'pearson' is implemented
        n: the maximum number of recommendations to make

        Raises ValueError if metric is not supported.
        """
        self.k = k
        self.n = n
        self.username2id = {}
        self.userid2name = {}
        self.productid2name = {}
        # Keep the metric name alongside the bound similarity function.
        self.metric = metric
        if metric == 'pearson':
            self.fn = self.pearson
        else:
            # Fail here rather than with an AttributeError on self.fn the
            # first time a recommendation is requested.
            raise ValueError("unsupported metric: %r" % (metric,))
        # Always define self.data so later lookups never hit a missing
        # attribute; dict subclasses are accepted as well.
        self.data = data if isinstance(data, dict) else {}

    def convertProductID2name(self, id):
        """Given a product id, return the product name (or the id itself
        when no name is known for it)."""
        return self.productid2name.get(id, id)

    def userRatings(self, id, n):
        """Print the n top ratings for the user with the given id."""
        # Fall back to the raw id when no display name has been loaded.
        print("Ratings for " + str(self.userid2name.get(id, id)))
        ratings = self.data[id]
        print(len(ratings))
        ratings = [(self.convertProductID2name(k), v)
                   for (k, v) in ratings.items()]
        # Highest rating first.
        ratings.sort(key=lambda artistTuple: artistTuple[1], reverse=True)
        for rating in ratings[:n]:
            # NOTE(review): %i prints only the integer part of the rating.
            print("%s\t%i" % (rating[0], rating[1]))

    def loadBookDB(self, path=''):
        """Load the BX book dataset.

        path is the prefix (directory) where the BX files are located; it is
        concatenated directly with the file names. Replaces self.data and
        fills self.productid2name, self.userid2name and self.username2id.
        Prints the total number of lines read from the three files.
        """
        self.data = {}
        i = 0
        #
        # First load book ratings into self.data
        #
        with codecs.open(path + "BX-Book-Ratings.csv", 'r', 'utf8') as f:
            for line in f:
                i += 1
                # separate line into fields
                fields = line.split(';')
                user = fields[0].strip('"')
                book = fields[1].strip('"')
                rating = int(fields[2].strip().strip('"'))
                self.data.setdefault(user, {})[book] = rating
        #
        # Now load books into self.productid2name
        # Books contains isbn, title, and author among other fields
        #
        with codecs.open(path + "BX-Books.csv", 'r', 'utf8') as f:
            for line in f:
                i += 1
                # separate line into fields
                fields = line.split(';')
                isbn = fields[0].strip('"')
                title = fields[1].strip('"')
                author = fields[2].strip().strip('"')
                self.productid2name[isbn] = title + ' by ' + author
        #
        # Now load user info into both self.userid2name and
        # self.username2id
        #
        with codecs.open(path + "BX-Users.csv", 'r', 'utf8') as f:
            for line in f:
                i += 1
                # separate line into fields
                fields = line.split(';')
                userid = fields[0].strip('"')
                location = fields[1].strip('"')
                # NOTE(review): a line with exactly three ';'-separated
                # fields does not satisfy this test, so its age is treated
                # as 'NULL' -- confirm against the BX-Users.csv layout.
                if len(fields) > 3:
                    age = fields[2].strip().strip('"')
                else:
                    age = 'NULL'
                if age != 'NULL':
                    value = location + ' (age: ' + age + ')'
                else:
                    value = location
                self.userid2name[userid] = value
                self.username2id[location] = userid
        print(i)

    def pearson(self, rating1, rating2):
        """Return the Pearson correlation coefficient of two rating dicts.

        Only keys present in both dictionaries are used. Returns 0 when
        there are no common keys or when either set of common ratings has
        zero variance.
        """
        sum_xy = 0
        sum_x = 0
        sum_y = 0
        sum_x2 = 0
        sum_y2 = 0
        n = 0
        for key in rating1:
            if key in rating2:
                n += 1
                x = rating1[key]
                y = rating2[key]
                sum_xy += x * y
                sum_x += x
                sum_y += y
                sum_x2 += pow(x, 2)
                sum_y2 += pow(y, 2)
        if n == 0:
            return 0
        # Floating-point rounding can push these terms a hair below zero
        # when all ratings are equal; clamp so sqrt() never sees a negative.
        var_x = max(sum_x2 - pow(sum_x, 2) / n, 0.0)
        var_y = max(sum_y2 - pow(sum_y, 2) / n, 0.0)
        denominator = sqrt(var_x) * sqrt(var_y)
        if denominator == 0:
            return 0
        return (sum_xy - (sum_x * sum_y) / n) / denominator

    def computeNearestNeighbor(self, username):
        """Return a list of (user, similarity) tuples for every other user,
        sorted so the most similar user to username comes first."""
        target = self.data[username]
        distances = [(instance, self.fn(target, self.data[instance]))
                     for instance in self.data
                     if instance != username]
        # self.fn is a similarity (larger means closer), hence descending.
        distances.sort(key=lambda artistTuple: artistTuple[1], reverse=True)
        return distances

    def recommend(self, user):
        """Return a list of up to self.n (item name, score) recommendations
        for user, best first.

        Each of the k nearest neighbors contributes its ratings for items
        the user has not rated, weighted by its share of the neighbors'
        total similarity. Returns an empty list when there are no neighbors
        or their similarities sum to zero (no meaningful weights exist).
        """
        # first get list of users ordered by nearness
        nearest = self.computeNearestNeighbor(user)
        # now get the ratings for the user
        userRatings = self.data[user]
        # Use at most k neighbors; there may be fewer than k other users.
        neighbors = nearest[:max(self.k, 0)]
        totalDistance = 0.0
        for _, similarity in neighbors:
            totalDistance += similarity
        if totalDistance == 0:
            return []
        recommendations = {}
        for name, similarity in neighbors:
            # this neighbor's slice of the pie
            weight = similarity / totalDistance
            # accumulate ratings for items the neighbor rated that the
            # user didn't
            for artist, rating in self.data[name].items():
                if artist in userRatings:
                    continue
                recommendations[artist] = (recommendations.get(artist, 0)
                                           + rating * weight)
        # now make list from dictionary
        ranked = [(self.convertProductID2name(k), v)
                  for (k, v) in recommendations.items()]
        # finally sort and return the first n items
        ranked.sort(key=lambda artistTuple: artistTuple[1], reverse=True)
        return ranked[:self.n]