Data Frame
目标:按商家编号分组(同一编号即同一商家),取每个商家出现次数最多的前三个评论标签。输入文件每行的格式为「商家编号 + 制表符 + JSON」,示例如下:
77287793
{
"reviewPics": null,
"extInfoList": [
{
"title": "contentTags",
"values": [
"高大上",
"环境优雅",
"价格实惠",
"交通便利"
],
"desc": "",
"defineType": 0
},
{
"title": "tagIds",
"values": [
"616",
"24",
"373",
"278"
],
"desc": "",
"defineType": 0
}
],
"expenseList": null,
"reviewIndexes": [
2
],
"scoreList": null
}
Scala
package com.spark.json
import com.alibaba.fastjson.JSON
import org.apache.spark.{SparkConf, SparkContext}
/**
* Created by wqh on 2017/9/12.
*/
/**
 * Spark job that reads tab-separated `businessId<TAB>json` lines, counts how often each
 * review tag occurs per business, and prints the three most frequent tags per business
 * followed by the elapsed wall-clock time in milliseconds.
 *
 * The entry point is an explicit `main` instead of `extends App`: the Spark documentation
 * warns that subclasses of `scala.App` may not work correctly, because the object's fields
 * are initialised lazily through `DelayedInit`.
 */
object Test {

  /** Location of the tab-separated input file. */
  private val InputPath = "/Users/wqh/Desktop/data/temptags.txt"

  /** Number of top-ranked tags kept per business. */
  private val TopN = 3

  /**
   * Parses one input line into `(businessId, commaSeparatedTags)`.
   *
   * The tags are the `values` array of the first element of `extInfoList`. Whenever any
   * part of that path is missing (no JSON column, JSON `null`, missing or empty
   * `extInfoList`, null first element, missing `values`), the tag string is empty.
   * Malformed JSON still propagates the parser's exception, as before.
   *
   * @param line one raw line of the input file
   * @return the business id (empty when the line has no columns at all) and its tags
   *         joined with `,`
   */
  def parseJon(line: String): (String, String) = {
    val columns = line.split("\t")
    // String.split drops trailing empty strings, so a line made only of tabs yields an
    // empty array; indexing it with (0) would throw.
    val businessId = columns.headOption.getOrElse("")
    val jsonColumn = if (columns.length > 1) Some(columns(1)) else None

    val tags = for {
      json    <- jsonColumn
      root    <- Option(JSON.parseObject(json)) // null for empty / "null" input
      extInfo <- Option(root.getJSONArray("extInfoList")) if extInfo.size() > 0
      first   <- Option(extInfo.getJSONObject(0))
      values  <- Option(first.getJSONArray("values"))
    } yield values.toArray().mkString(",")

    (businessId, tags.getOrElse(""))
  }

  /**
   * Runs the job in local mode with four worker threads.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val start = System.currentTimeMillis
    val conf = new SparkConf().setAppName("TestsortBykey").setMaster("local[4]")
    val sc = new SparkContext(conf)
    try {
      val topTagsPerBusiness = sc
        .textFile(InputPath, 2)
        .map(parseJon)
        // Drop businesses for which no tags could be extracted.
        .filter { case (_, tags) => tags != null && tags.nonEmpty }
        // One record per (businessId, tag).
        .flatMapValues(_.split(","))
        .map(idAndTag => (idAndTag, 1))
        // Occurrences of each tag within each business.
        .reduceByKey(_ + _)
        // Re-key by business id only and gather all (tag, count) pairs of a business.
        .map { case ((id, tag), count) => (id, (tag, count) :: Nil) }
        .reduceByKey(_ ++ _)
        // Highest counts first; sortBy is stable, so ties keep their gathered order.
        .mapValues(tagCounts => tagCounts.sortBy { case (_, count) => -count }.take(TopN))

      topTagsPerBusiness.collect().foreach(println)
      println(System.currentTimeMillis - start)
    } finally {
      // Release the context even when the job fails.
      sc.stop()
    }
  }
}
Java
package com.it18zhang.spark;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
/**
 * Spark job that reads tab-separated {@code businessId<TAB>json} lines, counts how often
 * each review tag occurs per business, and prints the three most frequent tags per
 * business followed by the elapsed wall-clock time in milliseconds.
 */
public class TempTagGenJava {

    /** Location of the tab-separated input file. */
    private static final String INPUT_PATH = "/Users/wqh/Desktop/data/temptags.txt";

    /** Number of top-ranked tags kept per business. */
    private static final int TOP_N = 3;

    /**
     * Runs the job in local mode with four worker threads.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        long start = System.currentTimeMillis();
        SparkConf conf = new SparkConf();
        conf.setAppName("tagGen");
        conf.setMaster("local[4]");
        JavaSparkContext sc = new JavaSparkContext(conf);
        try {
            // Load the input file.
            JavaRDD<String> lines = sc.textFile(INPUT_PATH);

            // Parse every line into (businessId, "tag1,tag2,...").
            JavaRDD<Tuple2<String, String>> parsed = lines.map(TempTagGenJava::parseJson);

            // Drop businesses for which no tags could be extracted.
            JavaRDD<Tuple2<String, String>> withTags =
                    parsed.filter(t -> t._2() != null && !t._2().isEmpty());

            // Flatten to one record per (businessId, tag).
            JavaPairRDD<String, String> idAndTag = withTags.flatMapToPair(t -> {
                List<Tuple2<String, String>> pairs = new ArrayList<>();
                for (String tag : t._2().split(",")) {
                    pairs.add(new Tuple2<>(t._1(), tag));
                }
                return pairs.iterator();
            });

            // Attach an initial count of 1 to every (businessId, tag).
            JavaPairRDD<Tuple2<String, String>, Integer> ones =
                    idAndTag.mapToPair(t -> new Tuple2<>(t, 1));

            // Sum the occurrences of each tag within each business.
            JavaPairRDD<Tuple2<String, String>, Integer> counts =
                    ones.reduceByKey((a, b) -> a + b);

            // Re-key by business id only, wrapping each (tag, count) in a one-element list.
            JavaPairRDD<String, List<Tuple2<String, Integer>>> perBusiness = counts.mapToPair(t -> {
                List<Tuple2<String, Integer>> single = new ArrayList<>();
                single.add(new Tuple2<>(t._1()._2(), t._2()));
                return new Tuple2<>(t._1()._1(), single);
            });

            // Gather all (tag, count) pairs of a business. A fresh list is built so the
            // reduce function never mutates one of its inputs.
            JavaPairRDD<String, List<Tuple2<String, Integer>>> gathered =
                    perBusiness.reduceByKey((left, right) -> {
                        List<Tuple2<String, Integer>> merged =
                                new ArrayList<>(left.size() + right.size());
                        merged.addAll(left);
                        merged.addAll(right);
                        return merged;
                    });

            // Sort by count and keep the most frequent tags.
            JavaPairRDD<String, List<Tuple2<String, Integer>>> top =
                    gathered.mapToPair(t -> new Tuple2<>(t._1(), topTags(t._2())));

            for (Tuple2<String, List<Tuple2<String, Integer>>> entry : top.collect()) {
                System.out.println(entry);
            }
            System.out.println(System.currentTimeMillis() - start);
        } finally {
            // Release the context even when the job fails.
            sc.stop();
        }
    }

    /**
     * Returns the {@link #TOP_N} entries with the highest counts, highest first.
     * The input list is left untouched; ties keep their input order because
     * {@link List#sort} is stable.
     *
     * @param tagCounts (tag, count) pairs of one business
     * @return a new serializable list holding at most {@link #TOP_N} entries
     */
    private static List<Tuple2<String, Integer>> topTags(List<Tuple2<String, Integer>> tagCounts) {
        List<Tuple2<String, Integer>> sorted = new ArrayList<>(tagCounts);
        // Integer.compare avoids the overflow that a subtraction-based comparator can hit.
        sorted.sort((a, b) -> Integer.compare(b._2(), a._2()));
        // subList returns a non-serializable view, hence the copy.
        return new ArrayList<>(sorted.subList(0, Math.min(TOP_N, sorted.size())));
    }

    /**
     * Parses one input line into a (businessId, commaSeparatedTags) tuple.
     *
     * The tags are the {@code values} array of the first element of {@code extInfoList}.
     * Whenever any part of that path is missing, the tag string is empty. Malformed JSON
     * still propagates the parser's exception, as before.
     *
     * @param line one raw line of the input file
     * @return the business id (empty when the line has no columns at all) and its tags
     *         joined with {@code ,}
     */
    public static Tuple2<String, String> parseJson(String line) {
        String[] arr = line.split("\t");
        // String.split drops trailing empty strings, so a line made only of tabs yields
        // an empty array; reading arr[0] unconditionally would throw.
        String busiNum = arr.length > 0 ? arr[0] : "";
        if (arr.length < 2) {
            return new Tuple2<>(busiNum, "");
        }
        JSONObject jo = JSON.parseObject(arr[1]);
        if (jo == null) { // empty or literal "null" JSON column
            return new Tuple2<>(busiNum, "");
        }
        JSONArray jarr = jo.getJSONArray("extInfoList");
        if (jarr == null || jarr.isEmpty()) {
            return new Tuple2<>(busiNum, "");
        }
        JSONObject jo2 = jarr.getJSONObject(0);
        if (jo2 == null) {
            return new Tuple2<>(busiNum, "");
        }
        JSONArray commArr = jo2.getJSONArray("values");
        if (commArr == null || commArr.isEmpty()) {
            return new Tuple2<>(busiNum, "");
        }
        StringBuilder cont = new StringBuilder();
        for (int i = 0; i < commArr.size(); i++) {
            if (i > 0) {
                cont.append(',');
            }
            cont.append(commArr.get(i));
        }
        return new Tuple2<>(busiNum, cont.toString());
    }
}
Result
(83644298,List((体验好,1), (性价比高,1), (服务热情,1)))
(82317795,List((味道差,1)))
(77705462,List((服务热情,3), (羊肉,2), (价格实惠,2)))
(85766086,List((价格实惠,2), (服务热情,2), (味道赞,2)))
(74145782,List((服务热情,18), (味道赞,14), (上菜快,13)))
(71039150,List((团建,1), (价格实惠,1), (朋友聚会,1)))
(70611801,List((干净卫生,4), (回头客,3), (味道赞,2)))
(88902676,List((2,2)))
(73963176,List((味道赞,15), (价格实惠,12), (分量足,11)))
(84270191,List((价格实惠,2), (服务热情,2), (性价比高,2)))
(89223651,List((环境优雅,8), (服务热情,8), (技师专业,7)))
(82016443,List((分量足,3), (味道赞,2), (主食赞,2)))
(77287793,List((干净卫生,29), (环境优雅,26), (音响效果好,26)))
(79197522,List((服务热情,2), (价格实惠,1), (放松舒服,1)))
(83084036,List((干净卫生,1), (价格实惠,1)))
(73879078,List((饮品赞,3), (回头客,2), (味道赞,2)))
(88284865,List((价格实惠,1), (价格高,1), (性价比低,1)))
(83073343,List((干净卫生,17), (味道赞,16), (环境优雅,15)))
(76114040,List((2,3), (5,1), (性价比高,1)))
(86913510,List((午餐,1), (分量适中,1)))
(88496862,List((回头客,5), (味道赞,4), (服务热情,4)))
(78477325,List((味道赞,8), (回头客,7), (干净卫生,5)))
(83981222,List((性价比高,4), (干净卫生,3), (价格实惠,3)))
(82705919,List((回头客,3), (干净卫生,3), (1,2)))
(87994574,List((无推销,12), (价格实惠,8), (服务热情,7)))
(77373671,List((菜品差,1), (服务热情,1), (干净卫生,1)))
(75144086,List((8239,60), (服务热情,38), (8241,31)))
(85648235,List((味道赞,17), (服务热情,15), (干净卫生,13)))
(73607905,List((菜品不错,16), (回头客,15), (干净卫生,15)))
(76893145,List((服务热情,10), (环境优雅,7), (高大上,5)))
(78824187,List((价格实惠,13), (回头客,11), (分量足,10)))
8658
17/09/12 16:10:31 INFO SparkContext: Invoking stop() from shutdown hook