大数据推荐系统算法(1)大数据框架介绍
大数据推荐系统算法(2) lambda架构
大数据推荐系统算法(3) 用户画像
大数据推荐系统(4)推荐算法
大数据推荐系统(5)Mahout
大数据推荐系统(6)Spark
大数据推荐系统(7)推荐系统与Lambda架构
大数据推荐系统(8)分布式数据收集和存储
大数据推荐系统(9)实战
架构:
一、公共部分(离线和在线都可使用)
1.参数设置 redis的参数 kafka的参数
package com.dylan.recom.common;
/**
 * Connection settings shared by the offline and the realtime parts of the recommender.
 */
public final class Constants {
    /** Host name of the Redis server. */
    public static final String REDIS_SERVER = "master";

    /** Host name of the Kafka broker. */
    public static final String KAFKA_SERVER = "master";

    /** Kafka broker address in {@code host:port} form. */
    public static final String KAFKA_ADDR = KAFKA_SERVER + ":9092";

    /** Kafka topic created for the realtime part. */
    public static final String KAFKA_TOPICS = "recom";

    // Constants holder only; never instantiated.
    private Constants() {
    }
}
2.对接口的实现(初始化物品和相似度)
package com.dylan.recom.common;
/**
* Created by dylan
*/
public class ItemSimilarity implements Comparable<ItemSimilarity> {
private long id; //itemID
private Double s; //similarity
public ItemSimilarity() {
this.id = -1;
this.s = 0d;
}
public ItemSimilarity(long itemId, Double similarity) {
this.id = itemId;
this.s = similarity;
}
public long getId() {
return id;
}
public void setId(long itemId) {
this.id = itemId;
}
public Double getS(){
return s;
}
public void setS(Double similarity) {
this.s = similarity;
}
public boolean equals(Object obj) {
if (!(obj instanceof ItemSimilarity))
return false;
if (obj == this)
return true;
// TODO: double number should not compare directly
return this.id == ((ItemSimilarity) obj).id && this.s == ((ItemSimilarity) obj).s;
}
public int hashCode(){
return (int)(id + s);
}
@Override
public int compareTo(ItemSimilarity obj) {
if(this.s > obj.s) {
return 1;
} else if(this.s < obj.s) {
return -1;
}
return 0;
}
@Override
public String toString() {
return "id:" + id + ",similarity:" + s;
}
}
3.连接redis
1初始化 redis , 2 获取实例,3 释放redis。
package com.dylan.recom.common;
import redis.clients.jedis.Jedis;
import redis.clients.jedis.JedisPool;
import redis.clients.jedis.JedisPoolConfig;
/**
 * Static holder of one shared {@link JedisPool}, created when the class is loaded.
 *
 * <p>Callers borrow a connection with {@link #getJedis()} and must hand it back with
 * {@link #returnResource(Jedis)} when done.
 */
public final class RedisUtil {
    // Redis server host
    private static final String ADDR = Constants.REDIS_SERVER;
    // Redis port
    private static final int PORT = 6379;
    // Access password.
    // NOTE(review): never passed to the pool, so connections are unauthenticated — confirm
    // whether the server requires AUTH before wiring it in.
    private static final String AUTH = "admin";
    // Maximum number of connections the pool may hand out at once.
    private static final int MAX_ACTIVE = 1024;
    // Maximum number of idle connections kept in the pool.
    private static final int MAX_IDLE = 200;
    // Maximum time in milliseconds a borrower waits for a free connection.
    private static final int MAX_WAIT = 10000;
    // Timeout in milliseconds handed to the JedisPool constructor.
    private static final int TIMEOUT = 10000;
    // Validate a connection before it is handed to a borrower.
    private static final boolean TEST_ON_BORROW = true;

    private static JedisPool jedisPool = null;

    /*
     * Initialise the connection pool. A failure is printed and leaves the pool null,
     * in which case getJedis() returns null.
     */
    static {
        try {
            JedisPoolConfig config = new JedisPoolConfig();
            // MAX_ACTIVE and MAX_WAIT were declared but never applied before, which left
            // the pool at the library defaults instead of the configured limits.
            config.setMaxTotal(MAX_ACTIVE);
            config.setMaxIdle(MAX_IDLE);
            config.setMaxWaitMillis(MAX_WAIT);
            config.setTestOnBorrow(TEST_ON_BORROW);
            jedisPool = new JedisPool(config, ADDR, PORT, TIMEOUT);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    private RedisUtil() {
    }

    /**
     * Borrows a connection from the pool.
     *
     * @return a connection, or {@code null} when the pool was not initialised or the
     *         borrow failed (the failure is printed)
     */
    public synchronized static Jedis getJedis() {
        if (jedisPool == null) {
            return null;
        }
        try {
            return jedisPool.getResource();
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Releases a connection obtained from {@link #getJedis()}.
     *
     * @param jedis the connection to release; {@code null} is ignored
     */
    public static void returnResource(final Jedis jedis) {
        if (jedis != null) {
            // close() hands a pooled connection back to its pool and does not touch the
            // jedisPool field, so it cannot fail with an NPE when pool creation failed.
            jedis.close();
        }
    }
}
二、离线部分
创建推荐模型,将产生的相关表写入 redis 中
1、在 Intellij IDE 中,在“MahoutRecommendation”工程,创建 “SimilarityTablesGenerator”类来实现
2、运行程序并验证确保数据写入成功
1.datamodel 本地的GroupLensDataModel 从本地读取文件,拷贝source目录来处理。
如果没指定直接读,如果指定了转化需要格式
package com.dylan.recom.offline;
import com.google.common.io.Files;
import com.google.common.io.InputSupplier;
import com.google.common.io.Resources;
import org.apache.commons.io.Charsets;
import org.apache.mahout.cf.taste.impl.model.file.FileDataModel;
import org.apache.mahout.common.iterator.FileLineIterable;
import java.io.*;
import java.net.URL;
import java.util.regex.Pattern;
/**
 * {@link FileDataModel} backed by a ratings file that uses {@code "::"} as field separator.
 *
 * <p>The input is rewritten into a comma-separated file named {@code ratings.txt} in the
 * JVM temp directory, and that file is what the parent class reads.
 */
public final class GroupLensDataModel extends FileDataModel {
    private static final String COLON_DELIMTER = "::";
    private static final Pattern COLON_DELIMITER_PATTERN = Pattern.compile(COLON_DELIMTER);

    /**
     * Loads the ratings from the {@code /../resources/ratings.dat} resource.
     *
     * @throws IOException if the resource cannot be copied or converted
     */
    public GroupLensDataModel() throws IOException {
        this(readResourceToTempFile("/../resources/ratings.dat"));
    }

    /**
     * @param ratingsFile GroupLens ratings.dat file in its native format
     * @throws IOException if an error occurs while reading or writing files
     */
    public GroupLensDataModel(File ratingsFile) throws IOException {
        super(convertGLFile(ratingsFile));
    }

    /**
     * Converts {@code originalFile} line by line: everything from the last {@code "::"}
     * onwards is dropped and the remaining {@code "::"} separators become commas.
     *
     * @return the converted file in the temp directory
     * @throws IOException if a line has no {@code "::"} or an I/O error occurs; the
     *                     partially written result is deleted in that case
     */
    private static File convertGLFile(File originalFile) throws IOException {
        File resultFile = new File(new File(System.getProperty("java.io.tmpdir")), "ratings.txt");
        if (resultFile.exists()) {
            resultFile.delete();
        }
        // try-with-resources closes (and thereby flushes) the writer before the file is
        // returned or deleted; it used to be left open, which could truncate the output.
        try (Writer writer = new OutputStreamWriter(new FileOutputStream(resultFile), Charsets.UTF_8)) {
            for (String line : new FileLineIterable(originalFile, false)) {
                int lastDelimiterStart = line.lastIndexOf(COLON_DELIMTER);
                if (lastDelimiterStart < 0) {
                    throw new IOException("Unexpected input format on line: " + line);
                }
                String subLine = line.substring(0, lastDelimiterStart);
                String convertedLine = COLON_DELIMITER_PATTERN.matcher(subLine).replaceAll(",");
                writer.write(convertedLine);
                writer.write('\n');
            }
        } catch (IOException ioe) {
            resultFile.delete();
            throw ioe;
        }
        return resultFile;
    }

    /**
     * Copies a resource into a temp file that is deleted on JVM exit.
     *
     * <p>The resource is looked up relative to this class first; if that lookup fails,
     * the file {@code "src/main/java" + resourceName} is used instead.
     *
     * @param resourceName resource path
     * @return the temp file holding a copy of the resource
     * @throws IOException if the copy fails
     */
    public static File readResourceToTempFile(String resourceName) throws IOException {
        InputSupplier<? extends InputStream> inSupplier;
        try {
            URL resourceURL = Resources.getResource(GroupLensDataModel.class, resourceName);
            inSupplier = Resources.newInputStreamSupplier(resourceURL);
        } catch (IllegalArgumentException iae) {
            File resourceFile = new File("src/main/java" + resourceName);
            inSupplier = Files.newInputStreamSupplier(resourceFile);
        }
        File tempFile = File.createTempFile("taste", null);
        tempFile.deleteOnExit();
        Files.copy(inSupplier, tempFile);
        return tempFile;
    }

    @Override
    public String toString() {
        return "GroupLensDataModel";
    }
}
2.HDFSDataModel 把HDFS数据读出来保存到本地。
package com.zkpk.reco.offline;
import org.apache.commons.io.Charsets;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.mahout.cf.taste.impl.model.GenericDataModel;
import org.apache.mahout.cf.taste.impl.model.file.FileDataModel;
import java.io.*;
import java.util.regex.Pattern;
/**
* Created by Dylan
*/
/**
 * {@link FileDataModel} whose ratings are read from a Hadoop file system path.
 *
 * <p>The remote file is converted into a comma-separated file named {@code ratings.txt}
 * in the JVM temp directory, and that local file is what the parent class reads.
 */
public class HDFSDataModel extends FileDataModel {
    private static final String COLON_DELIMTER = "::";

    /**
     * @param conf    Hadoop configuration used to resolve the file system
     * @param pathStr path of the ratings file
     * @throws IOException if the file cannot be read or converted
     */
    public HDFSDataModel(Configuration conf, String pathStr) throws IOException {
        this(conf, new Path(pathStr));
    }

    /**
     * @param conf Hadoop configuration used to resolve the file system
     * @param path path of the ratings file
     * @throws IOException if the file cannot be read or converted
     */
    public HDFSDataModel(Configuration conf, Path path) throws IOException {
        super(storeHdfsFileToLocal(conf, path, COLON_DELIMTER));
    }

    /**
     * Copies {@code path} to a local file, dropping everything from the last
     * {@code delimiter} of each line onwards and turning the remaining delimiters into
     * commas.
     *
     * @return the converted local file
     * @throws IOException if a line contains no delimiter or an I/O error occurs; the
     *                     partially written result is deleted in that case. Failures used
     *                     to be printed and an incomplete file handed to the parent class.
     */
    private static File storeHdfsFileToLocal(Configuration conf, Path path, String delimiter)
            throws IOException {
        File resultFile = new File(new File(System.getProperty("java.io.tmpdir")), "ratings.txt");
        if (resultFile.exists()) {
            resultFile.delete();
        }
        FileSystem fs = path.getFileSystem(conf);
        // Both streams are closed (and the writer flushed) before the file is returned.
        try (BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(path), Charsets.UTF_8));
             Writer writer = new OutputStreamWriter(new FileOutputStream(resultFile), Charsets.UTF_8)) {
            String line;
            while ((line = br.readLine()) != null) {
                int lastDelimiterStart = line.lastIndexOf(delimiter);
                if (lastDelimiterStart < 0) {
                    throw new IOException("Unexpected input format on line: " + line);
                }
                String subLine = line.substring(0, lastDelimiterStart);
                // Literal (non-regex) replacement, so any delimiter string is safe here.
                writer.write(subLine.replace(delimiter, ","));
                writer.write('\n');
            }
        } catch (IOException e) {
            resultFile.delete();
            throw e;
        }
        return resultFile;
    }
}
3.基于Item的相似度的计算。得到user-item(给用户推荐的电影)的表 和item-item的相似度表,并插入到redis中
package com.dylan.recom.offline;
import org.apache.mahout.cf.taste.impl.recommender.GenericItemBasedRecommender;
import org.apache.mahout.cf.taste.impl.similarity.LogLikelihoodSimilarity;
import org.apache.mahout.cf.taste.impl.similarity.precompute.MultithreadedBatchItemSimilarities;
import org.apache.mahout.cf.taste.model.DataModel;
import org.apache.mahout.cf.taste.recommender.ItemBasedRecommender;
import org.apache.mahout.cf.taste.similarity.precompute.BatchItemSimilarities;
import java.io.File;
/**
* generate item-item similarity table & user-item table, and insert into redis
*
*/
/**
 * Offline entry point: builds the user-item table and the item-item similarity table
 * and stores both in redis.
 */
public final class SimilarityTablesGenerator {

    private SimilarityTablesGenerator() {
    }

    /**
     * Runs the offline job. The command line is ignored: the data model is created with
     * the no-argument {@link GroupLensDataModel} constructor.
     */
    public static void main(String[] args) throws Exception {
        DataModel model = new GroupLensDataModel();

        // The user-item table is written in the background while similarities are computed.
        UserItemSimilarityTableRedisWriter userItemWriter = new UserItemSimilarityTableRedisWriter(model);
        userItemWriter.storeToRedis();

        LogLikelihoodSimilarity similarity = new LogLikelihoodSimilarity(model);
        ItemBasedRecommender itemRecommender = new GenericItemBasedRecommender(model, similarity);
        BatchItemSimilarities batchJob = new MultithreadedBatchItemSimilarities(itemRecommender, 5);

        int processors = Runtime.getRuntime().availableProcessors();
        ItemsSimilarityTableRedisWriter itemItemWriter = new ItemsSimilarityTableRedisWriter();
        int computed = batchJob.computeItemSimilarities(processors, 1, itemItemWriter);

        System.out.println("Computed " + computed + " similarities for " + model.getNumItems() + " items "
                + "and saved them to redis");

        // Do not leave before the background user-item writer has finished as well.
        userItemWriter.waitUtilDone();
    }
}
4.将基于用户的推荐结果写入redis
package com.dylan.recom.offline;
import com.alibaba.fastjson.JSON;
import com.dylan.recom.common.RedisUtil;
import org.apache.mahout.cf.taste.common.TasteException;
import org.apache.mahout.cf.taste.impl.common.FastIDSet;
import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
import org.apache.mahout.cf.taste.model.DataModel;
import redis.clients.jedis.Jedis;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.Executors;
/**
* Created by Dylan
*/
/**
 * Writes, for every user of a {@link DataModel}, the ids of that user's items to redis
 * under the key {@code "UI:" + userID} as a JSON array.
 */
public class UserItemSimilarityTableRedisWriter {
    private final DataModel dataModel;
    private final Jedis jedis;
    private final CountDownLatch latch = new CountDownLatch(1);

    /**
     * @param dataModel source of the users and their items
     */
    public UserItemSimilarityTableRedisWriter(DataModel dataModel) {
        this.dataModel = dataModel;
        this.jedis = RedisUtil.getJedis();
    }

    /**
     * Starts the export on a background thread and returns immediately; use
     * {@link #waitUtilDone()} to block until it has finished.
     */
    public void storeToRedis() {
        final java.util.concurrent.ExecutorService executor = Executors.newSingleThreadExecutor();
        executor.submit(
                new Runnable() {
                    @Override
                    public void run() {
                        try {
                            process();
                        } catch (RuntimeException e) {
                            // Otherwise the failure would vanish inside the discarded Future.
                            e.printStackTrace();
                        } finally {
                            // Always release waiters, even when the export failed.
                            latch.countDown();
                        }
                    }
                }
        );
        // Lets the submitted task finish, then ends the worker thread so that it does
        // not keep the JVM alive after main() returns.
        executor.shutdown();
    }

    /** Iterates over all users and stores one redis entry per user. */
    private void process() {
        try {
            LongPrimitiveIterator iterator = dataModel.getUserIDs();
            while (iterator.hasNext()) {
                long userID = iterator.nextLong();
                FastIDSet iDSet = dataModel.getItemIDsFromUser(userID);
                String key = "UI:" + userID;
                String value = JSON.toJSONString(iDSet.toArray());
                jedis.set(key, value);
                System.out.println("Stored User:" + key);
            }
        } catch (TasteException te) {
            te.printStackTrace();
        }
    }

    /**
     * Blocks until the background export started by {@link #storeToRedis()} has ended,
     * successfully or not.
     *
     * @throws InterruptedException if the waiting thread is interrupted
     */
    public void waitUtilDone() throws InterruptedException {
        latch.await();
    }
}
5.基于item的推荐写入redis
package com.dylan.recom.offline;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import com.alibaba.fastjson.JSON;
import com.dylan.recom.common.ItemSimilarity;
import com.dylan.recom.common.RedisUtil;
import org.apache.mahout.cf.taste.similarity.precompute.SimilarItem;
import org.apache.mahout.cf.taste.similarity.precompute.SimilarItems;
import org.apache.mahout.cf.taste.similarity.precompute.SimilarItemsWriter;
import redis.clients.jedis.Jedis;
// generate item-item similarity table
/**
 * {@link SimilarItemsWriter} that stores each item's similar items in redis under the
 * key {@code "II:" + itemID} as a JSON array of {@link ItemSimilarity}.
 */
public class ItemsSimilarityTableRedisWriter implements SimilarItemsWriter {
    private long itemCounter = 0;
    private Jedis jedis = null;

    /** Obtains the redis connection used by the following {@code add} calls. */
    @Override
    public void open() throws IOException {
        jedis = RedisUtil.getJedis();
    }

    /**
     * Serialises the similar items of one item and stores them; progress is printed
     * after every 100th stored item.
     */
    @Override
    public void add(SimilarItems similarItems) throws IOException {
        final int size = similarItems.numSimilarItems();
        final ItemSimilarity[] converted = new ItemSimilarity[size];

        int index = 0;
        for (SimilarItem similar : similarItems.getSimilarItems()) {
            converted[index++] = new ItemSimilarity(similar.getItemID(), similar.getSimilarity());
        }

        final String redisKey = "II:" + similarItems.getItemID();
        final String json = JSON.toJSONString(converted);
        jedis.set(redisKey, json);

        itemCounter += 1;
        if (itemCounter % 100 != 0) {
            return;
        }
        System.out.println("Store " + redisKey + " to redis, total:" + itemCounter);
    }

    /** Closes the redis connection obtained in {@link #open()}. */
    @Override
    public void close() throws IOException {
        jedis.close();
    }
}
三、服务层
1、启动 RESTful 服务 通过 RESTful 服务,给定的 userid,获取为其推荐的商品列表 2、验证结果 示例:获取给用户 123 的推荐列表 http://localhost:9999/ws/v1/recom/123
1.启动server
package com.dylan.recom.webservice;
import org.eclipse.jetty.server.Server;
import org.eclipse.jetty.servlet.ServletContextHandler;
import org.eclipse.jetty.servlet.ServletHolder;
import org.glassfish.jersey.servlet.ServletContainer;
/**
* Created by dylan
*/
/**
 * Embedded Jetty server on port 9999 that serves the Jersey resources found in the
 * {@code com.dylan.recom.webservice} package.
 */
public class RecoServer {
    private Server webServer = null;

    /**
     * Configures the servlet context, starts the server and blocks in {@code join()}.
     * Exceptions are printed; the server is destroyed when this method leaves.
     */
    public void start() {
        final ServletContextHandler rootContext =
                new ServletContextHandler(ServletContextHandler.NO_SESSIONS);
        rootContext.setContextPath("/");

        webServer = new Server(9999);
        webServer.setHandler(rootContext);

        final ServletHolder jersey = rootContext.addServlet(ServletContainer.class, "/*");
        jersey.setInitOrder(0);
        // Package that Jersey scans for REST resource classes.
        jersey.setInitParameter(
                "jersey.config.server.provider.packages", "com.dylan.recom.webservice");

        try {
            System.out.println("Web Server started ......");
            webServer.start();
            webServer.join();
        } catch (Exception failure) {
            failure.printStackTrace();
        } finally {
            webServer.destroy();
        }
    }

    /** Stops the server if {@link #start()} has created one. */
    public void stop() throws Exception {
        if (webServer == null) {
            return;
        }
        webServer.stop();
    }

    public static void main(String[] args) throws Exception {
        new RecoServer().start();
    }
}
2.用户访问目录和访问目录时候看到什么。
从redis获取数据,给一个Userid可得到他看过的items,并放入 RecommendedItems 。再从redis中item-item的表,找到与user看过的items相似的items,取前10个,放入RecommendedItems。最后呈现出来。
package com.dylan.recom.webservice;
/**
* Created by dylan
*/
import com.alibaba.fastjson.JSON;
import com.dylan.recom.common.ItemSimilarity;
import com.dylan.recom.common.RedisUtil;
import redis.clients.jedis.Jedis;
import javax.ws.rs.*;
import javax.ws.rs.core.MediaType;
import java.util.*;
@Path("/ws/v1/recom") #推荐列表的路径
public class ItemBasedRecoResult {
Jedis jedis = null;
public ItemBasedRecoResult() {
jedis = RedisUtil.getJedis();
}
@GET
@Path("/{userid}")
@Produces(MediaType.APPLICATION_JSON)
public RecommendedItems getRecoItems(@PathParam("userid") String userid) {
RecommendedItems recommendedItems = new RecommendedItems();
// Stage 1: get user's items
String key = String.format("UI:%s", userid);
String value = jedis.get(key);
if(value == null || value.length() <= 0) {
return recommendedItems;
}
List<Long> userItems = JSON.parseArray(value, Long.class);
Set<Long> userItemsSet = new TreeSet<Long>(userItems);
// Stage 2: get similar items to the user's items
List<String> userItemStrs = new ArrayList<>();
for(Long item: userItems) {
userItemStrs.add("II:" + item);
}
List<String> similarItems = jedis.mget(userItemStrs.toArray(new String[userItemStrs.size()]));
Set<ItemSimilarity> similarItemsSet = new TreeSet<>();
for(String item: similarItems) {
List<ItemSimilarity> result = JSON.parseArray(item, ItemSimilarity.class);
similarItemsSet.addAll(result);
}
List<Long> recommendedItemIDs = new ArrayList<>();
for(ItemSimilarity item: similarItemsSet) {
if(!userItemsSet.contains(item.getId())) {
recommendedItemIDs.add((item.getId()));
}
if(recommendedItemIDs.size() >= 10)
break;
}
recommendedItems.setItems(recommendedItemIDs.toArray(new Long[0]));
return recommendedItems;
}
}
3.存放指定Item
package com.dylan.recom.webservice;
/**
* Created by dylan
*/
import javax.xml.bind.annotation.XmlElement;
import javax.xml.bind.annotation.XmlRootElement;
/**
 * Response body that carries the recommended item ids.
 */
@XmlRootElement
public class RecommendedItems {
    // Stays null until setItems is called.
    private Long[] items;

    /** @return the recommended item ids, or {@code null} if none were set */
    public Long[] getItems() {
        return this.items;
    }

    /** @param items the recommended item ids */
    public void setItems(Long[] items) {
        this.items = items;
    }
}
四、在线
1、在 kafka 中创建名为“recom”的主题
2、向 Kafka 中写入数据 实现“KafkaProducer”类
3、产生实时推荐结果,并写入 redis 实现“RealtimeRecommender”类
4、验证 redis 中是否写入数据
1.在kafka中 创建recom的主题
2.存放用户点击类型 UId 和 Iid
package com.dylan.recom.realtime;
/**
* Created by dylan
*/
/**
 * A click event: the id of the user and the id of the item that was clicked.
 */
public class NewClickEvent {
    private long userId;
    private long itemId;

    /** Creates an event with both ids set to {@code -1}. */
    public NewClickEvent() {
        this(-1L, -1L);
    }

    /**
     * @param userId id of the user
     * @param itemId id of the item
     */
    public NewClickEvent(long userId, long itemId) {
        setUserId(userId);
        setItemId(itemId);
    }

    public long getUserId() {
        return this.userId;
    }

    public final void setUserId(long userId) {
        this.userId = userId;
    }

    public long getItemId() {
        return this.itemId;
    }

    public final void setItemId(long itemId) {
        this.itemId = itemId;
    }
}
3.数据写入到kafka的主题中
package com.dylan.recom.realtime;
/**
* Created by dylan
*/
import com.alibaba.fastjson.JSON;
import com.dylan.recom.common.Constants;
import org.apache.log4j.Logger;
import java.util.Properties;
import kafka.javaapi.producer.Producer;
import kafka.producer.KeyedMessage;
import kafka.producer.ProducerConfig;
/**
 * Sends a fixed set of sample {@link NewClickEvent}s, serialised as JSON strings, to a
 * Kafka topic.
 */
public class KafkaProducer implements Runnable {
    private static final Logger LOGGER = Logger.getLogger(KafkaProducer.class);

    private final String topic;

    /**
     * @param topic Kafka topic the events are sent to
     */
    public KafkaProducer(String topic) {
        this.topic = topic;
    }

    // Sample events sent by run().
    static NewClickEvent[] newClickEvents = new NewClickEvent[]{
            new NewClickEvent(1000000L, 123L),
            new NewClickEvent(1000001L, 111L),
            new NewClickEvent(1000002L, 500L),
            new NewClickEvent(1000003L, 278L),
            new NewClickEvent(1000004L, 681L),
    };

    /**
     * Creates a producer, sends every sample event and closes the producer again.
     * Failures are logged and printed, not rethrown.
     */
    public void run() {
        final Properties settings = new Properties();
        settings.put("metadata.broker.list", Constants.KAFKA_ADDR);
        settings.put("serializer.class", "kafka.serializer.StringEncoder");
        settings.put("producer.type", "async");
        final ProducerConfig producerConfig = new ProducerConfig(settings);

        Producer<Integer, String> kafka = null;
        try {
            System.out.println("Producing messages");
            kafka = new Producer<>(producerConfig);
            for (int i = 0; i < newClickEvents.length; i++) {
                final String payload = JSON.toJSONString(newClickEvents[i]);
                final KeyedMessage<Integer, String> message =
                        new KeyedMessage<Integer, String>(this.topic, payload);
                kafka.send(message);
                System.out.println("Sending messages:" + payload);
            }
            System.out.println("Done sending messages");
        } catch (Exception failure) {
            LOGGER.fatal("Error while producing messages", failure);
            LOGGER.trace(null, failure);
            System.err.println("Error while producing messages:" + failure);
        } finally {
            if (kafka != null) {
                kafka.close();
            }
        }
    }

    public static void main(String[] args) throws Exception {
        final Thread sender = new Thread(new KafkaProducer(Constants.KAFKA_TOPICS));
        sender.start();
    }
}
4.通过spark-streaming 实现实时推荐(基于redis的离线结果,在线发生访问行为,从redis找到与行为相关的item)
定义两个线程
读进来的数据进行拆分,变为集合(读取kafka的消息,数据和主题)
处理消息,从redis中读取数据,知道给User推荐什么item。(获取userid,itemid,找到用户现在的电影对应的相似的电影,再把相似的电影推送出来)。
最后写入redis。
package com.dylan.recom.realtime
import com.alibaba.fastjson.JSON
import com.dylan.recom.common.{Constants, RedisUtil}
import kafka.serializer.StringDecoder
import org.apache.spark.streaming._
import org.apache.spark.streaming.kafka._
import org.apache.spark.SparkConf
/**
* Created by dylan
*/
/**
 * Spark Streaming job: reads click events (JSON) from Kafka and, for each event, copies
 * the redis entry "II:<itemId>" to "RUI:<userId>".
 */
object RealtimeRecommender {
  def main(args: Array[String]): Unit = {
    val brokers = Constants.KAFKA_ADDR
    val topics = Constants.KAFKA_TOPICS

    // Create context with 2 second batch interval
    val sparkConf = new SparkConf().setMaster("local[2]").setAppName("RealtimeRecommender")
    val ssc = new StreamingContext(sparkConf, Seconds(2))

    // Create direct kafka stream with brokers and topics
    val topicsSet = topics.split(",").toSet
    val kafkaParams = Map[String, String](
      "metadata.broker.list" -> brokers,
      "auto.offset.reset" -> "smallest")
    val messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
      ssc, kafkaParams, topicsSet)

    val clickEvents = messages.map(_._2).map { event =>
      JSON.parseObject(event, classOf[NewClickEvent])
    }

    // foreachRDD/foreachPartition is an output action that visits every event. The
    // earlier mapPartitions { iter.map {...} }.print() was lazy and only evaluated the
    // few elements print() asks for, so most events of a batch were never processed.
    clickEvents.foreachRDD { rdd =>
      rdd.foreachPartition { events =>
        val jedis = RedisUtil.getJedis
        if (jedis == null) {
          System.err.println("No redis connection available, skipping partition")
        } else {
          try {
            events.foreach { event =>
              println("NewClickEvent" + event)
              val userId = event.getUserId
              val itemId = event.getItemId
              val value = jedis.get("II:" + itemId)
              // Nothing stored for this item: there is no value to write for the user.
              if (value != null) {
                jedis.set("RUI:" + userId, value)
                println("Recommend to user:" + userId + ", items:" + value)
              }
            }
          } finally {
            // One connection per partition, always handed back to the pool.
            RedisUtil.returnResource(jedis)
          }
        }
      }
    }

    // Start the computation
    ssc.start()
    ssc.awaitTermination()
  }
}