分为两个部分:
1. 训练模型 RecommendTrain.py
2. 推荐 Recommend.py
将模型保存起来,可以更方便地调用
模型的保存和加载:
model.save(sc, Path + "ALSmodel")
model = MatrixFactorizationModel.load(sc, Path+"ALSmodel")
RecommendTrain.py
目的:训练模型,并把模型保存到hdfs
# -*- coding: UTF-8 -*-
from pyspark.mllib.recommendation import ALS # ALS
from pyspark import SparkConf, SparkContext # 为了创建sc
def SetLogger( sc ):
    """Suppress Spark's noisy default console output by raising log4j levels to ERROR."""
    log4j = sc._jvm.org.apache.log4j
    for logger_name in ("org", "akka"):
        log4j.LogManager.getLogger(logger_name).setLevel(log4j.Level.ERROR)
    log4j.LogManager.getRootLogger().setLevel(log4j.Level.ERROR)
def SetPath(sc):
    """Set the global Path to the local filesystem or HDFS dataset directory,
    depending on whether the Spark master is a local one."""
    global Path
    if sc.master.startswith("local"):
        Path = "file:/root/PycharmProjects/dataset/ml-100k/"
    else:
        Path = "hdfs://hadoop01:9000/test/ml-100k/"
def CreateSparkContext():
    """Create and return a quiet SparkContext for the training job."""
    # The app name is what shows up in the Spark / YARN web UIs.
    conf = (SparkConf()
            .setAppName("RecommendTrain")
            .set("spark.ui.showConsoleProgress", "false"))
    sc = SparkContext(conf=conf)
    print("master=" + sc.master)
    SetLogger(sc)
    SetPath(sc)
    return sc
def PrepareData(sc):
    """Load u.data and return an RDD of [user, movie, rating] string triples."""
    # ---- 1. read the raw ratings file ----
    print("开始读取用户评分数据...")
    raw_lines = sc.textFile(Path + "u.data")
    # u.data is tab-separated: user id, movie id, rating, timestamp — keep the first three.
    triples = raw_lines.map(lambda row: row.split("\t")[:3])
    # ---- 2. report basic dataset statistics ----
    total = triples.count()
    users = triples.map(lambda t: t[0]).distinct().count()
    movies = triples.map(lambda t: t[1]).distinct().count()
    print("共计:ratings: " + str(total) + " User:" + str(users) + " Movie:" + str(movies))
    return triples
def SaveModel(sc, als_model=None):
    """Persist the trained ALS model to Path + "ALSmodel".

    Args:
        sc: the active SparkContext.
        als_model: model to save; defaults to the module-level ``model``
            global so existing ``SaveModel(sc)`` call sites keep working.

    Fixes: the original depended only on the implicit global and its bare
    ``except Exception`` assumed any failure meant "model already exists",
    silently discarding the real error.
    """
    # Fall back to the global built in the __main__ section for compatibility.
    m = als_model if als_model is not None else model
    try:
        m.save(sc, Path + "ALSmodel")
        print("已存储 Model 在ALSmodel")
    except Exception as e:
        # Most common cause is that the target directory already exists,
        # but surface the actual exception instead of swallowing it.
        print("Model已经存在,请先删除再存储.")
        print("save failed: " + repr(e))
if __name__ == "__main__":
    # 1. Spark setup
    sc = CreateSparkContext()
    print("==========数据准备阶段===========")
    # 2. Load the (user, movie, rating) RDD
    ratingsRDD = PrepareData(sc)
    print("==========训练阶段===============")
    print("开始ALS训练,参数rank=5,iterations=20, lambda=0.1")
    # 3. Train ALS: rank=5, 20 iterations, regularization lambda=0.1
    model = ALS.train(ratingsRDD, 5, 20, 0.1)
    print("========== 存储Model ============")
    # 4. Persist the model (SaveModel reads the `model` global)
    SaveModel(sc)
yarn运行:
菜单栏“Tools”->“External Tools”,点击运行“spark submit”
在hdfs上看看模型有没有保存
hdfs web:http://192.168.80.139:9870
也可在终端:hadoop fs -ls /test/ml-100k/
可能遇到的错误:stackoverflow
ALS算法迭代20次,爆栈错误:
spark在迭代计算的过程中(迭代次数太多),函数调用层级过多,所需的栈空间急剧上升,线程的栈满了,最终爆栈。
解决方法:
1. 减少迭代次数
ALS收敛很快(15次以内),实测把20次改为10次程序可以正常运行
2. checkpoint()
sc.setCheckpointDir("hdfs://hadoop01:9000/checkpoint")
把路径加到checkpoint,迭代30次都不会爆栈。
Recommend.py
# -*- coding: UTF-8 -*-
import sys
from pyspark import SparkConf, SparkContext
from pyspark.mllib.recommendation import MatrixFactorizationModel
def CreateSparkContext():
    """Create and return a quiet SparkContext for the recommendation job."""
    # The app name is what shows up in the Spark / YARN web UIs.
    conf = (SparkConf()
            .setAppName("Recommend")
            .set("spark.ui.showConsoleProgress", "false"))
    sc = SparkContext(conf=conf)
    print("master=" + sc.master)
    SetLogger(sc)
    SetPath(sc)
    return sc
def SetPath(sc):
    """Set the global Path to the local filesystem or HDFS dataset directory,
    depending on whether the Spark master is a local one."""
    global Path
    if sc.master.startswith("local"):
        Path = "file:/root/PycharmProjects/dataset/ml-100k/"
    else:
        Path = "hdfs://hadoop01:9000/test/ml-100k/"
def SetLogger(sc):
    """Cut down Spark's console chatter by forcing log4j loggers to ERROR."""
    log4j = sc._jvm.org.apache.log4j
    for logger_name in ("org", "akka"):
        log4j.LogManager.getLogger(logger_name).setLevel(log4j.Level.ERROR)
    log4j.LogManager.getRootLogger().setLevel(log4j.Level.ERROR)
def PrepareData(sc):
    """Read u.item and return a {movie_id(float): title} lookup dictionary."""
    print("开始读取电影ID与名称字典...")
    item_lines = sc.textFile(Path + "u.item")
    # u.item is pipe-separated: movie id, title, release date, ...
    id_title_pairs = item_lines.map(lambda line: line.split("|")) \
                               .map(lambda fields: (float(fields[0]), fields[1]))
    return id_title_pairs.collectAsMap()
# Print the top-10 movie recommendations for a single user.
def RecommendMovies(model, movieTitle, inputUserID):
    """Print the ten highest-scored movies for inputUserID.

    model.recommendProducts returns Rating namedtuples of
    (user, product, rating); titles are looked up in movieTitle.
    """
    top_ten = model.recommendProducts(inputUserID, 10)
    print("针对用户id" + str(inputUserID) + "推荐下列电影:")
    for user, product, rating in top_ten:
        print("针对用户id {0} 推荐电影{1} 推荐评分 {2}".format(
            user, movieTitle[product], rating))
# Load a previously trained ALS model from Path + "ALSmodel".
def loadModel(sc):
    """Return the persisted MatrixFactorizationModel, or None if loading fails.

    Fixes: the original returned ``model`` even when loading failed, so the
    except branch was followed by an UnboundLocalError at ``return model``
    instead of the intended friendly message.
    """
    model = None
    try:
        model = MatrixFactorizationModel.load(sc, Path + "ALSmodel")
        print("载入ALSModel模型")
    except Exception:
        print("找不到ALSModel模型,请先训练")
    return model
# Run the recommendation for the demo user (id 5).
def Recommend(model):
    """Delegate to RecommendMovies for user id 5 using the module-level movieTitle."""
    demo_user_id = 5
    RecommendMovies(model, movieTitle, demo_user_id)
if __name__ == "__main__":
    # 1. Spark setup
    sc = CreateSparkContext()
    print("==========数据准备===============")
    # 2. Build the movie id -> title dictionary
    movieTitle = PrepareData(sc)
    print("==========载入模型===============")
    # 3. Load the trained ALS model from storage
    model = loadModel(sc)
    print("==========进行推荐===============")
    # 4. Recommend movies for the demo user
    Recommend(model)
yarn运行:
菜单栏“Tools”->“External Tools”,点击运行“spark submit”
遇到了各种问题,提交了很多次