First, create the path for reading the data. It depends on how you launched PySpark:
global Path
if sc.master[0:5] == 'local':
    # running locally: read from the local file system
    Path = 'file:/home/swt/pythonwork/PythonProject/'
else:
    # running on a cluster: read from HDFS
    Path = "hdfs://ubuntu:9000/user/swt/"
I started PySpark locally:
sc.master
'local[*]'
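Since the branch above picks a path silently, it can help to print which one was actually chosen; a quick check (the HDFS URL is simply the one assumed above):

# Sketch: confirm the launch mode and the path that will be used for reading
print(sc.master, "->", Path)
# local[*] -> file:/home/swt/pythonwork/PythonProject/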
Now read the data:
rawUserData = sc.textFile(Path+"data/u.data")
rawUserData.count()
100000
rawUserData.first()
'196\t242\t3\t881250949'
rawRatings = rawUserData.map(lambda line: line.split("\t")[:3])
rawRatings.take(5)
[['196', '242', '3'],
 ['186', '302', '3'],
 ['22', '377', '1'],
 ['244', '51', '2'],
 ['166', '346', '1']]
ratingsRDD = rawRatings.map(lambda x: (x[0], x[1], x[2]))
ratingsRDD.take(5)
[('196', '242', '3'),
 ('186', '302', '3'),
 ('22', '377', '1'),
 ('244', '51', '2'),
 ('166', '346', '1')]
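The string tuples above are what the rest of this walkthrough uses, and ALS.train accepts them here, as the training step below shows. If you prefer explicit numeric types, the same module also provides a Rating class; a minimal sketch of that alternative (typedRatingsRDD is just an illustrative name):

from pyspark.mllib.recommendation import Rating

# Sketch: same data as ratingsRDD, but parsed to numeric types up front.
# Rating is a namedtuple of (user, product, rating).
typedRatingsRDD = rawRatings.map(lambda x: Rating(int(x[0]), int(x[1]), float(x[2])))
typedRatingsRDD.take(2)
# e.g. [Rating(user=196, product=242, rating=3.0), Rating(user=186, product=302, rating=3.0)]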
# Compute some statistics on the data
numRatings = ratingsRDD.count()
numRatings
100000
numUsers = ratingsRDD.map(lambda x: x[0]).distinct().count()
numUsers
943
numMovies = ratingsRDD.map(lambda x: x[1]).distinct().count()
numMovies
1682
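Beyond these three counts, a quick look at how the rating values are distributed can also be useful; countByValue gives a small histogram (a sketch, not part of the original walkthrough):

# Sketch: number of ratings for each rating value ('1' to '5')
ratingCounts = ratingsRDD.map(lambda x: x[2]).countByValue()
for value, count in sorted(ratingCounts.items()):
    print(value, count)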
# Train on the prepared data; ALS returns a model (a MatrixFactorizationModel)
from pyspark.mllib.recommendation import ALS
model = ALS.train(ratingsRDD, 10, 10, 0.01)
print(model)
<pyspark.mllib.recommendation.MatrixFactorizationModel object at 0x7fb87179b908>
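Here ALS.train(ratings, rank, iterations, lambda_) was called with rank=10, 10 iterations, and regularization 0.01. To get a rough feel for the fit, one common check (not in the original walkthrough, and it only measures error on the training data) is the mean squared error between predicted and actual ratings:

# Sketch: training-set MSE of the factorization (uses ratingsRDD and model from above)
usersProducts = ratingsRDD.map(lambda x: (int(x[0]), int(x[1])))
predictions = model.predictAll(usersProducts).map(lambda r: ((r.user, r.product), r.rating))
ratesAndPreds = ratingsRDD.map(lambda x: ((int(x[0]), int(x[1])), float(x[2]))).join(predictions)
MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1]) ** 2).mean()
print("Mean Squared Error =", MSE)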
# Recommend 5 movies (with their predicted ratings) for the user with id 100
model.recommendProducts(100,5)
[Rating(user=100, product=1160, rating