spark kafka java api_java实现spark streaming与kafka集成进行流式计算

最新推荐文章于 2023-06-15 19:14:11 发布

斯托克弗

最新推荐文章于 2023-06-15 19:14:11 发布

阅读量379

点赞数

文章标签： spark kafka java api

本文链接：https://blog.csdn.net/weixin_28797919/article/details/114226376

版权

java实现spark streaming与kafka集成进行流式计算

2017/6/26补充：接手了搜索系统，这半年有了很多新的心得，懒改这篇粗鄙之文，大家看综合看这篇新博文来理解下面的粗鄙代码吧，http://blog.csdn.net/yujishi2/article/details/73849237。

背景：网上关于spark streaming的文章还是比较多的，可是大多数用scala实现，因我们的电商实时推荐项目以java为主，就踩了些坑，写了java版的实现，代码比较意识流，轻喷，欢迎讨论。

流程：spark streaming从kafka读用户实时点击数据，过滤数据后从redis读商品相似度矩阵，从db读user历史行为，实时计算兴趣度，并将结果写入redis一份，供api层读取展示，写入hdfs一份供离线计算准确率召回率。

补充：据了解，大型实时推荐系统里面，协同过滤一般用作生成候选集，计算兴趣读会被ctr等策略的 rerank代替，在calculateinterest中调用在线rerank服务排序。

12／13补充：召回不变，目前采用ctr预估加上规则排序，后续上ltr。

废话少说，上代码：

public class Main {

static final String ZK_QUORUM = "*.*.*.*:2181,*.*.*.*:2181,*.*.*.*:2181/kafka";

static final String GROUP = "test-consumer-group";

static final String TOPICSS = "user_trace";

static final String NUM_THREAD = "64";

public static void main(String[] args) {

SparkConf sparkConf = new SparkConf().setAppName("main.java.computingCenter");

// Create the context with 2 seconds batch size

//每两秒读取一次kafka

JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, new Duration(2000));

int numThreads = Integer.parseInt(NUM_THREAD);

Map topicMap = new HashMap();

String[] topics = TOPICSS.split(",");

for (String topic: topics) {

topicMap.put(topic, numThreads);

}

JavaPairReceiverInputDStream messages =

KafkaUtils.createStream(jssc, ZK_QUORUM, GROUP, topicMap);

JavaDStream lines = messages.map(new Function, String>() {

public String call(Tuple2 tuple2) {

return tuple2._2();

}

});

JavaDStream words = lines.flatMap(new FlatMapFunction() {

public Iterable call(String lines) {

//kafka数据格式："{\"Topic\":\"user_trace\",\"PartitionKey\":\"0\",\"TimeStamp\":1471524044018,\"Data\":\"0=163670589171371918%3A196846178238302087\",\"LogId\":\"0\",\"ContentType\":\"application/x-www-form-urlencoded\"}";

List arr = new ArrayList();

for (String s : lines.split(" ")) {

Map j = JSON.parseObject(s);

String s1 = "";

String s2 = "";

try {

s1 = URLDecoder.decode(j.get("Data").toString(), "UTF-8");

s2 = s1.split("=")[1];

} catch (UnsupportedEncodingException e) {

e.printStackTrace();

}

arr.add(s2);

}

return arr;

}

});

JavaPairDStream goodsSimilarityLists = words.filter(new Function() {

@Override

public Boolean call(String s) throws Exception {

//过滤非法的数据

if (s.split(":").length == 2) {

return true;

}

return false;

}

}).mapPartitionsToPair(new PairFlatMapFunction, String, String>() {

//此处分partition对每个pair进行处理

@Override

public Iterable> call(Iterator s) throws Exception {

ArrayList> result = new ArrayList>();

while (s.hasNext()) {

String x = s.next();

String userId = x.split(":")[0];

String goodsId = x.split(":")[1];

System.out.println(x);

LinkedHashMap recommendMap = null;

try {

//此service从redis读数据,进行实时兴趣度计算,推荐结果写入redis,供api层使用

CalculateInterestService calculateInterestService = new CalculateInterestService();

try {

recommendMap = calculateInterestService.calculateInterest(userId, goodsId);

} catch (Exception e) {

e.printStackTrace();

}

String text = "";

int count = 0;

for (Map.Entry entry : recommendMap.entrySet()) {

text = text + entry.getKey();

if (count == recommendMap.size() - 1) {

break;

}

count = count + 1;

text = text + "{/c}";

}

text = System.currentTimeMillis() + ":" + text;

result.add(new Tuple2(userId, text));

} catch (Exception e) {

e.printStackTrace();

}

return result;

}

});

goodsSimilarityLists.foreachRDD(new Function, Void>() {

@Override

public Void call(JavaPairRDD rdd) throws Exception {

//打印rdd，调试方便

System.out.println(rdd.collect());

return null;

}

});

JavaPairDStream goodsSimilarityListsText = goodsSimilarityLists.mapToPair(new PairFunction, Text, Text>(){

@Override

public Tuple2 call(Tuple2 ori) throws Exception {

//此处要将tuple2转化为org.apache.hadoop.io.Text格式，使用saveAsHadoopFiles方法写入hdfs

return new Tuple2(new Text(ori._1), new Text(ori._2));

}

});

//写入hdfs

goodsSimilarityListsText.saveAsHadoopFiles("/user/hadoop/recommend_list/rl", "123", Text.class, Text.class, SequenceFileOutputFormat.class);

jssc.start();

jssc.awaitTermination();

}

public class CalculateInterestService {

private String dictKey = "greate_item_sim_2.0";

private String recommendTable = "great_recommend_table_2.0";

static final String HIGO_BASE_URL = "jdbc:mysql://*.*.*.*:3212/*";

static final String HIGO_BASE_USER = "*";

static final String HIGO_BASE_PASS = "*";

public LinkedHashMap calculateInterest(String userId, String traceGoodsId) {

LinkedHashMap sortedMap = new LinkedHashMap();

String[] simGoods = RedisHelper.getInstance().hget(dictKey, traceGoodsId).split(",");

//用户的历史记录,应该存action:goodsId:timestamp格式,要重构,bi写入单独的数据表中

HashMap userTrace = null;

try {

userTrace = getUserTrace(userId);

} catch (ClassNotFoundException e) {

e.printStackTrace();

return sortedMap;

}

HashMap recommendMap = new HashMap();

String[] simGoodsIds = new String[simGoods.length];

for (int i = 0; i < simGoods.length; i++) {

simGoodsIds[i] = simGoods[i].split(":")[0];

}

List pSimGoodsIds = RedisHelper.getInstance().hmget(dictKey, simGoodsIds);

HashMap predictSimGoodsIds = new HashMap();

for (int i = 0; i < simGoodsIds.length; i++) {

predictSimGoodsIds.put(Long.parseLong(simGoodsIds[i]), pSimGoodsIds.get(i));

}

for (String item : simGoods) {

//need optimised

Double totalSum = 0.0;

Double sum = 0.0;

Long originGoodsId = Long.parseLong(item.split(":")[0]);

for (String predictGoods : predictSimGoodsIds.get(originGoodsId).split(",")) {

Long goodsId = Long.parseLong(predictGoods.split(":")[0].toString());

Double sim = Double.valueOf(predictGoods.split(":")[1].toString());

totalSum = totalSum + sim;

Double score = 0.0;

if (!userTrace.containsKey(goodsId)) {

//TODO 用户评分矩阵过于稀疏,需要svd补充评分,暂时无评分score为默认0.1

userTrace.put(goodsId, "default");

}

String action = userTrace.get(goodsId);

if (action.equals("click")) {

score = 0.2;

} else if (action.equals("favorate")) {

} else if (action.equals("add_cart")) {

score = 0.6;

} else if (action.equals("order")) {

score = 0.8;

} else if (action.equals("default")) {

score = 0.1;

}

//相似度词典应存 goodsid:sim格式,要重构

sum = sum + score * sim;

}

Double predictResult = sum / totalSum;

recommendMap.put(originGoodsId, predictResult);

}

//sort recommend list

List> list = new ArrayList>(recommendMap.entrySet());

Collections.sort(list, new Comparator>() {

@Override

public int compare(Map.Entry o1, Map.Entry o2) {

return o2.getValue().compareTo(o1.getValue());

}

});

Map.Entry tmpEntry = null;

Iterator> iter = list.iterator();

while (iter.hasNext()) {

tmpEntry = iter.next();

sortedMap.put(tmpEntry.getKey(), tmpEntry.getValue());

}

writeRecommendListToRedis(userId, sortedMap);

return sortedMap;

}

private HashMap getUserTrace(String userId) throws ClassNotFoundException {

//SQLContext sqlContext = new org.apache.spark.sql.SQLContext(sc);

Class.forName("com.mysql.jdbc.Driver");

PreparedStatement stmt = null;

Connection conn = null;

UserTrace userTrace = new UserTrace();

try {

conn = DriverManager.getConnection(HIGO_BASE_URL, HIGO_BASE_USER, HIGO_BASE_PASS);

String sql = "select * from t_pandora_goods_record where account_id=" + userId;

stmt = (PreparedStatement)conn.prepareStatement(sql);

ResultSet rs = stmt.executeQuery();

while(rs.next()) {

userTrace.setId(Long.parseLong(rs.getString(1)));

userTrace.setAccountId(Long.parseLong(rs.getString(2)));

userTrace.setGoodsIds(rs.getString(3));

userTrace.setMtime(rs.getString(4));

}

stmt.close();

conn.close();

} catch (Exception e) {

e.printStackTrace();

}

String[] goodsActionTimestamp = userTrace.getGoodsIds().split(",");

HashMap hm = new HashMap();

for (String ac : goodsActionTimestamp) {

Long goodsId = Long.parseLong(ac.split(":")[0]);

//String action = ac.split(":")[1];

//String timestamp = ac.split(":")[2];

//hack 下一步要bi把用户历史行为写入表中, action:goodsId:timestamp格式, timestamp后期将参与权重计算

String action = "click";

hm.put(goodsId, action);

}

return hm;

}

private void writeRecommendListToRedis(String userId, LinkedHashMap sortedMap) {

String recommendList = "";

int count = 0;

for (Map.Entry entry : sortedMap.entrySet()) {

recommendList = recommendList + entry.getKey();

if (count == sortedMap.size() - 1) {

break;

}

count = count + 1;

recommendList = recommendList + ",";

}

RedisHelper.getInstance().hset(recommendTable, userId, recommendList);

}

斯托克弗

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
spark kafka java api_java实现spark streaming与kafka集成进行流式计算

java实现spark streaming与kafka集成进行流式计算2017/6/26补充：接手了搜索系统，这半年有了很多新的心得，懒改这篇粗鄙之文，大家看综合看这篇新博文来理解下面的粗鄙代码吧，http://blog.csdn.net/yujishi2/article/details/73849237。背景：网上关于spark streaming的文章还是比较多的，可是大多数用scala实现，...
复制链接

扫一扫