1、项目简介
所谓的团购标签生成,就是从消费者对商品的评价中提取关键词标签,统计每个标签的累积出现次数,并按数量降序排序后显示,类似于淘宝、美团等电商平台常见的"大家印象"功能。
2、业务介绍
1、从复杂的json数据格式中提取出评论标签项
2、统计每个评论标签项的数量
3、对统计出的评论标签按降序排序
4、回显标签
3、功能实现
3.1 提取评论标签项
ReviewTags.java
package cn.ctgu.taggen;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
/**
 * Utility for pulling review tags out of the raw review JSON produced by the
 * group-buy site. The JSON carries an "extInfoList" array; the entry whose
 * "title" is "contentTags" holds the human-readable tag strings in "values".
 */
public class ReviewTags {

    /**
     * Extracts all "contentTags" values from one review's JSON payload.
     *
     * @param jsonStr raw review JSON (may be null or empty)
     * @return the tags joined with "," (e.g. {@code "回头客,上菜快"}), or ""
     *         when the input is null/empty, unparseable, or carries no tags
     */
    public static String extractTags(String jsonStr) {
        // Guard up front so fastjson never sees null/empty input.
        if (jsonStr == null || jsonStr.isEmpty()) {
            return "";
        }
        JSONObject object = JSON.parseObject(jsonStr); // parse JSON text into an object
        if (object == null || !object.containsKey("extInfoList")) {
            return "";
        }
        JSONArray array = object.getJSONArray("extInfoList");
        if (array == null) {
            return "";
        }
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < array.size(); i++) {
            JSONObject obj = array.getJSONObject(i);
            // Constant-first equals: containsKey("title") can pass while the
            // value is JSON null, which would NPE with obj.getString(...).equals(...).
            if (obj != null && "contentTags".equals(obj.getString("title")) && obj.containsKey("values")) {
                JSONArray arr = obj.getJSONArray("values");
                if (arr == null) {
                    continue;
                }
                for (int j = 0; j < arr.size(); j++) {
                    // Key the separator on sb.length() rather than a per-entry
                    // flag: with several contentTags entries, the old per-entry
                    // flag glued entries together without a comma ("a,bc,d").
                    if (sb.length() > 0) {
                        sb.append(",");
                    }
                    sb.append(arr.getString(j));
                }
            }
        }
        return sb.toString();
    }

    /**
     * Smoke test: prints "回头客,上菜快,环境优雅,性价比高,菜品不错" for the sample
     * payload, then "" for both the empty-string and null inputs.
     */
    public static void main(String[] args) {
        String s = "{\"reviewPics\":[{\"picId\":2405538806,\"url\":\"http://p0.where.net/shaitu/7c10019c62947d01ded80cc698c77c90217708.jpg\",\"status\":1},{\"picId\":2405442602,\"url\":\"http://p0.meituan.net/shaitu/d41ef06f5d16d5d3cbc871765ff93130270451.jpg\",\"status\":1}],\"extInfoList\":[{\"title\":\"contentTags\",\"values\":[\"回头客\",\"上菜快\",\"环境优雅\",\"性价比高\",\"菜品不错\"],\"desc\":\"\",\"defineType\":0},{\"title\":\"tagIds\",\"values\":[\"493\",\"232\",\"24\",\"300\",\"1\"],\"desc\":\"\",\"defineType\":0}],\"expenseList\":null,\"reviewIndexes\":[1,2],\"scoreList\":null}";
        System.out.println(extractTags(s));
        System.out.println(extractTags(""));
        System.out.println(extractTags(null));
    }
}
3.2 Java版实现标签统计排序处理
Tuple2Comparator.java
package cn.ctgu.taggen;
import scala.Tuple2;
import java.util.Comparator;
/**
 * Orders (tag, count) tuples by count, descending, for top-N tag selection.
 *
 * <p>Used as the comparator of a {@code TreeSet} in TagGeneratorJava. A
 * TreeSet treats {@code compare(a,b)==0} as "duplicate element", so this
 * comparator tie-breaks on the tag text — otherwise distinct tags that happen
 * to share a count would be silently dropped from the set.
 */
public class Tuple2Comparator implements Comparator<Tuple2<String, Integer>> {
    @Override
    public int compare(Tuple2<String, Integer> o1, Tuple2<String, Integer> o2) {
        // Integer.compare avoids the overflow that plain subtraction
        // (o2._2() - o1._2()) risks for large/negative counts.
        int byCount = Integer.compare(o2._2(), o1._2());
        return byCount != 0 ? byCount : o1._1().compareTo(o2._1());
    }
}
TagGeneratorJava.java
package cn.ctgu.taggen;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.TreeSet;
/**
 * Java/Spark job that generates "impression tags" for group-buy merchants:
 * each input line is "merchantId TAB reviewJson"; the job extracts the
 * contentTags from every review, counts each (merchant, tag) pair, and prints
 * the top-10 tags per merchant, highest count first.
 *
 * NOTE(review): the input path is hard-coded to a local Windows file — should
 * become a program argument before this runs anywhere else.
 */
public class TagGeneratorJava {
public static void main(String[] args) {
SparkConf conf=new SparkConf();
conf.setAppName("Gennerator");
conf.setMaster("local[4]");
JavaSparkContext sc=new JavaSparkContext(conf);
JavaRDD<String>rdd1=sc.textFile("F:\\徐培成——spark\\线路三\\3-项目-团购网站标签生成\\团购网站标签生成\\temptags.txt");
//Split each line on TAB into [merchantId, reviewJson].
JavaRDD<String[]>rdd2=rdd1.map(new Function<String, String[]>() {
public String[] call(String v1) throws Exception {
return v1.split("\t");
}
});
//Keep only well-formed lines that split into exactly two fields.
JavaRDD<String[]>rdd3=rdd2.filter(new Function<String[], Boolean>() {
public Boolean call(String[] v2) throws Exception {
return v2.length==2;
}
});
//Parse the JSON field into a comma-joined tag string:
//e.g. 12345 -> "味道好,上菜快". JavaPairRDD wraps (key, value) tuples.
JavaPairRDD<String,String>rdd4=rdd3.mapToPair(new PairFunction<String[], String, String>() {
public Tuple2<String, String> call(String[] v3) throws Exception {
return new Tuple2<String, String>(v3[0],ReviewTags.extractTags(v3[1]));
}
});
//Drop merchants whose review yielded no tags (empty extraction result).
JavaPairRDD<String,String>rdd5=rdd4.filter(new Function<Tuple2<String, String>, Boolean>() {
public Boolean call(Tuple2<String, String> v5) throws Exception {
return v5._2().length()>0;
}
});
//Split the comma-joined tag string into an array per merchant.
JavaPairRDD<String,String[]>rdd6=rdd5.mapToPair(new PairFunction<Tuple2<String, String>, String, String[]>() {
public Tuple2<String, String[]> call(Tuple2<String, String> v6) throws Exception {
return new Tuple2<String, String[]>(v6._1(),v6._2().split(","));
}
});
//Flatten: one (merchantId, tag) pair per tag, e.g. 12345->味道好, 12345->上菜快.
JavaPairRDD<String,String>rdd7=rdd6.flatMapValues(new Function<String[], Iterable<String>>() {
public Iterable<String> call(String[] v7) throws Exception {
List<String> list=new ArrayList<String>();
for (String s:v7){
list.add(s);
}
return list;
}
});
//Mark each (merchantId, tag) pair with 1, ready for counting.
JavaPairRDD<Tuple2<String,String>,Integer>rdd8=rdd7.mapToPair(new PairFunction<Tuple2<String, String>, Tuple2<String, String>, Integer>() {
public Tuple2<Tuple2<String, String>, Integer> call(Tuple2<String, String> v8) throws Exception {
return new Tuple2<Tuple2<String, String>, Integer>(v8,1);
}
});
//Sum the 1s per (merchantId, tag): e.g. (12345,味道好)->30, (12345,上菜快)->80.
JavaPairRDD<Tuple2<String,String>,Integer>rdd9=rdd8.reduceByKey(new Function2<Integer, Integer, Integer>() {
public Integer call(Integer v1, Integer v2) throws Exception {
return v1+v2;
}
});
//Re-key by merchantId: 12345 -> (味道好,30), 12345 -> (上菜快,80).
JavaPairRDD<String,Tuple2<String,Integer>>rdd10=rdd9.mapToPair(new PairFunction<Tuple2<Tuple2<String, String>, Integer>, String, Tuple2<String, Integer>>() {
public Tuple2<String, Tuple2<String, Integer>> call(Tuple2<Tuple2<String, String>, Integer> t) throws Exception {
return new Tuple2<String, Tuple2<String, Integer>>(t._1()._1(),new Tuple2<String, Integer>(t._1()._2(),t._2()));
}
});
//Wrap each value in a singleton list so the next reduceByKey can concatenate them.
JavaPairRDD<String,List<Tuple2<String,Integer>>>rdd11=rdd10.mapToPair(new PairFunction<Tuple2<String, Tuple2<String, Integer>>, String, List<Tuple2<String, Integer>>>() {
public Tuple2<String, List<Tuple2<String, Integer>>> call(Tuple2<String, Tuple2<String, Integer>> t) throws Exception {
List<Tuple2<String,Integer>>list=new ArrayList<Tuple2<String, Integer>>();
list.add(t._2());
return new Tuple2<String, List<Tuple2<String, Integer>>>(t._1(),list);
}
});
//Concatenate the per-tag lists per merchant.
//NOTE(review): mutates v1 in place — accepted here because each singleton
//list was created in rdd11, but worth confirming against Spark's contract.
JavaPairRDD<String,List<Tuple2<String,Integer>>>rdd12=rdd11.reduceByKey(new Function2<List<Tuple2<String, Integer>>, List<Tuple2<String, Integer>>, List<Tuple2<String, Integer>>>() {
public List<Tuple2<String, Integer>> call(List<Tuple2<String, Integer>> v1, List<Tuple2<String, Integer>> v2) throws Exception {
v1.addAll(v2);
return v1;
}
});
//Result shape now: 12345 -> [
// (味道好,30),
// (上菜快,80)
// ]
//Sort each merchant's tags by count (descending) and keep the top 10 as a string.
JavaPairRDD<String,String>rdd13=rdd12.mapToPair(new PairFunction<Tuple2<String, List<Tuple2<String, Integer>>>, String, String>() {
public Tuple2<String, String> call(Tuple2<String, List<Tuple2<String, Integer>>> t) throws Exception {
//TreeSet sorts via the supplied Comparator.
//NOTE(review): TreeSet treats compare()==0 as a duplicate, so distinct
//tags sharing a count may be dropped unless the comparator tie-breaks.
TreeSet<Tuple2<String, Integer>> ts = new TreeSet<Tuple2<String, Integer>>(new Tuple2Comparator());
ts.addAll(t._2());
Iterator<Tuple2<String, Integer>> it = ts.iterator() ;
int index = 0 ;
String str = "" ;
//Take the first 10 entries ("tag:count" joined by commas).
while(it.hasNext()){
if(index > 9){
break ;
}
Tuple2<String,Integer> t0 = it.next();
str = str + t0._1() + ":" + t0._2() + "," ;
index ++ ;
}
str = str.substring(0,str.length() - 1) ;//strip the trailing ","; safe because every merchant here has >=1 tag (rdd5 filter)
return new Tuple2<String, String>(t._1(),str) ;
}
});
List<Tuple2<String,String>>data=rdd13.collect();
for (Tuple2<String,String>tt:data){
System.out.println(tt._1()+"==>"+tt._2());
}
}
}
3.3 Scala实现标签生成排序
import cn.ctgu.taggen.ReviewTags
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Scala/Spark version of the tag generator: reads "merchantId TAB reviewJson"
 * lines, counts each (merchant, tag) pair, and writes the top-10 tags per
 * merchant (descending by count) as "merchantId TAB tag:count,..." text files.
 *
 * NOTE(review): input and output paths are hard-coded — should be arguments.
 */
object TagGenerator{
def main(args: Array[String]): Unit = {
val conf=new SparkConf()
conf.setAppName("TagGenerator by ***")
conf.setMaster("local[4]")
val sc=new SparkContext(conf)
val poi_tags=sc.textFile("F:\\徐培成——spark\\线路三\\3-项目-团购网站标签生成\\团购网站标签生成\\temptags.txt")
// Split each line on TAB and keep only well-formed two-field records.
val poi_taglist=poi_tags.map(e=>e.split("\t")).filter(e=>e.length==2)
// Map to (merchantId, tagString) tuples, e.g. 77287793 -> "音响效果好,干净卫生,服务热情".
.map(e=>e(0)->ReviewTags.extractTags(e(1)))
// Drop merchants whose extraction produced no tags.
.filter(e=>e._2.length>0)
// Split the tag string into an array: 77287793 -> [音响效果好, 干净卫生, 服务热情].
.map(e=>e._1->e._2.split(","))
// Flatten: one pair per tag — 77287793->音响效果好, 77287793->干净卫生, 77287793->服务热情.
.flatMapValues(e=>e)
// Mark each (merchantId, tag) pair with 1: ((77287793,音响效果好))->1, ...
.map(e=>(e._1,e._2)->1)
// Sum per (merchantId, tag): (77287793,音响效果好)->340.
.reduceByKey(_+_)
// Tuples don't concatenate; wrap each in a List so the next reduce can:
// 77287793 -> List((音响效果好,340)).
.map(e=>e._1._1->List((e._1._2,e._2)))
// Concatenate the lists per merchant: 77287793 -> List((音响效果好,340),(干净卫生,400),...).
.reduceByKey(_ ::: _)
// Sort each merchant's list by count; sortBy is ascending, so reverse for
// descending order, then keep the top 10.
.map(e=>e._1->e._2.sortBy(_._2).reverse.take(10)
// Render each (tag,count) tuple as "tag:count" and join with commas:
// 77287793 -> List((音响效果好,540),(干净卫生,400)) ===>
// 77287793 -> "音响效果好:540,干净卫生:400"
// NOTE(review): a._2.toString on an Int yields the bare number, so the
// output is "tag:count" as intended — but confirm against a sample run.
.map(a=>a._1+":"+a._2.toString).mkString(","))
poi_taglist.map(e=>e._1+"\t"+e._2).saveAsTextFile("file:///F:\\comp\\res.txt")
}
}
pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>cn.ctgu</groupId>
<artifactId>JsonLogProcessModel</artifactId>
<version>1.0-SNAPSHOT</version>
<packaging>jar</packaging>
<dependencies>
<dependency>
<!-- NOTE(review): fastjson 1.2.24 has a known deserialization RCE
     (CVE-2017-18349); upgrade to a patched release when possible. -->
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.24</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>2.1.0</version>
</dependency>
</dependencies>
</project>
4、技术及难点总结
1、对数据的清洗
2、复杂标签数据的提取中采用了fastjson技术
3、标签统计、排序过程中主要涉及了复杂的spark RDD一些算子和transformation操作
5、该过程包含以下算子的操作:RDD Map、Filter、mapToPair、flatMapValues、ReduceByKey等操作。
6、缓解了数据倾斜问题(具体手段见下文难点说明)。
难点:数据比较复杂,清洗过程相对麻烦;RDD排序操作是采用TreeSet集合,实现Comparator接口来达到排序的要求;由于某个热门商品的评论量过大,导致了数据倾斜问题,通过对key增加一个随机数来解决数据倾斜,让不同的key分配到不同的partitions上,然后对每个partitions上的数据做一次聚合,从而达到缓解数据倾斜问题。