基于Spark的电影推荐系统
项目架构
组件版本及配置
名称 | 版本 |
---|---|
Hadoop | 2.8.5 |
Hive | 2.1.0 |
Spark | 1.6.3 |
Kafka | 2.10-0.8.2.1 |
MariaDB(Mysql) | 5.5.64 |
Scala | 2.10.6 |
Java | 1.8.0_25 |
Zookeeper | 3.4.12 |
启动服务
zookeeper 三个节点都执行
zkServer.sh start
hdfs
start-dfs.sh
yarn
start-yarn.sh
spark
sh $SPARK_HOME/sbin/start-all.sh
hive
hive --service metastore
kafka
./bin/kafka-server-start.sh config/server.properties &   # 打印日志启动
./bin/kafka-server-start.sh -daemon config/server.properties   # 不打印日志启动(-daemon 已在后台运行,无需再加 &)
消费消息
kafka-console-consumer.sh --zookeeper bigboss1:2181 --from-beginning --topic kafka1024
数据清洗
去除/平滑脏数据
spark-submit --class main.scala.com.hopu.wash.movies /opt/testfiles/sparkhive.jar
分表
-- Training set: sample roughly 60% of ratings into ratx.
-- TABLESAMPLE(n PERCENT) is approximate, so the split is not an exact 60/40.
create table ratx as select * from ratings tablesample(60 percent);
-- Test set: ratc3 = ratings minus ratx (the remaining ~40%), built as an anti-join.
create table ratc3 like ratings;
-- Select ratings.* only: a bare * after the join would also emit ratx's columns,
-- which no longer matches the schema of ratc3 (created "like ratings").
insert overwrite table ratc3 select ratings.* from ratings left join ratx on ratx.userid=ratings.userid and ratx.movieid=ratings.movieid and ratx.timestamp=ratings.timestamp where ratx.timestamp is null;
给新用户推荐热门评分高的电影
select movieid,avg(rating) avgrat,count(*) cnt from ratings group by movieid having cnt>=1000 order by avgrat desc limit 50;
训练模型
spark-submit --class main.scala.com.hopu.myals.train /opt/testfiles/sparkhive.jar
离线推荐
spark-submit --jars /opt/testfiles/mariadb-java-client-2.2.1.jar --driver-class-path /opt/testfiles/mariadb-java-client-2.2.1.jar --conf spark.executor.extraClassPath=/opt/testfiles/mariadb-java-client-2.2.1.jar --class main.scala.com.hopu.myjdbc.PredictToMysql /opt/testfiles/sparkhive.jar
实时推荐
生产者
spark-submit --class main.scala.com.hopu.kfk.MyProducer --jars kafka-clients-0.8.2.1.jar /opt/testfiles/sparkhive.jar
实时推荐/消费者
spark-submit --class main.scala.com.hopu.kfk.Show --jars kafka-clients-0.8.2.1.jar,kafka_2.10-0.8.2.1.jar,spark-streaming-kafka_2.10-1.6.3.jar,metrics-core-2.2.0.jar,zkclient-0.10.jar --executor-memory 1g /opt/testfiles/sparkhive.jar
给新用户推荐
hive->热门好评电影Top50->mysql
select movieid,avg(rating) avgrat,count(*) cnt from ratings group by movieid having cnt>=1000 order by avgrat desc limit 50;
判断用户是否是新用户,是新用户就推荐热门好评电影Top50其中5部
// For every user id yielded by validusersIter, print five recommended movies.
// Users with no rows in `ratings` get five random rows from the `top50` table;
// all other users get the model's top-5 products, which are also appended to
// the JDBC table identified by url/table/properties.
while (validusersIter.hasNext) {
  val userid = validusersIter.next
  // count(*) is a BIGINT column, so it must be read with getLong;
  // getInt(0) throws ClassCastException (java.lang.Long cannot be cast to Integer).
  val ratingCount = hc.sql(s"select count(*) from ratings where userid=${userid}").first().getLong(0)
  println("推荐如下电影 :")
  if (ratingCount == 0L) {
    // New user: pick exactly 5 random rows. tablesample(10 percent) keeps each
    // row with probability 0.1, so it could return anywhere from 0 to 50 rows.
    val results = hc.sql("select * from top50 order by rand() limit 5")
    results.show()
  } else {
    val recresult = model.recommendProducts(userid, 5)
    recresult.foreach(r => println("#####" + r.product + "#####"))
    // Persist all of this user's recommendations with a single JDBC write
    // instead of building one DataFrame and opening one write per row.
    if (recresult.nonEmpty) {
      val thedf = recresult
        .map(r => (userid, r.product, r.rating))
        .toSeq
        .toDF("userid", "movieid", "rating")
      thedf.write.mode("Append").jdbc(url, table, properties)
    }
  }
}