Spark JSON TempTags Sample

Data Frame

——> 目标:对同一编号(即同一商家)的评论标签计数,取出现次数最多的前三个标签

77287793
{
    "reviewPics": null, 
    "extInfoList": [
        {
            "title": "contentTags", 
            "values": [
                "高大上", 
                "环境优雅", 
                "价格实惠", 
                "交通便利"
            ], 
            "desc": "", 
            "defineType": 0
        }, 
        {
            "title": "tagIds", 
            "values": [
                "616", 
                "24", 
                "373", 
                "278"
            ], 
            "desc": "", 
            "defineType": 0
        }
    ], 
    "expenseList": null, 
    "reviewIndexes": [
        2
    ], 
    "scoreList": null
}

json data

Scala

package com.spark.json

import com.alibaba.fastjson.JSON
import org.apache.spark.{SparkConf, SparkContext}
/**
  * Created by wqh on 2017/9/12.
  */
/**
  * Counts review tags per business and prints the three most frequent tags
  * of every business, followed by the elapsed wall-clock time in milliseconds.
  *
  * Each input line is expected to look like `<businessId>\t<json>`; the tags
  * are read from the `values` array of the first `extInfoList` entry.
  */
object Test extends App {

    val start = System.currentTimeMillis
    val conf = new SparkConf()
    conf.setAppName("TestsortBykey").setMaster("local[4]")
    val sc = new SparkContext(conf)
    val rdd1 = sc.textFile("/Users/wqh/Desktop/data/temptags.txt", 2)

    /**
      * Parses one input line into `(businessId, commaJoinedTags)`.
      *
      * The second element is the empty string whenever the line has no JSON
      * column, the JSON has no usable `extInfoList`, or the first entry has no
      * non-empty `values` array. A line that splits into zero columns (for
      * example a line made only of tabs) yields `("", "")` instead of throwing.
      *
      * NOTE(review): only the first `extInfoList` entry is inspected, whatever
      * its `title` is — confirm that this entry is always the tag list.
      *
      * @param line raw text line, tab separated
      * @return business id and the tags joined with ","
      */
    def parseJon(line: String): (String, String) = {
        val arr = line.split("\t")
        // split() drops trailing empty strings, so the array can be empty.
        val busiNum = if (arr.nonEmpty) arr(0) else ""

        val tags: Option[String] =
            if (arr.length > 1) {
                for {
                    root   <- Option(JSON.parseObject(arr(1)))
                    extras <- Option(root.getJSONArray("extInfoList")) if extras.size() > 0
                    first  <- Option(extras.getJSONObject(0))
                    values <- Option(first.getJSONArray("values")) if values.size() > 0
                } yield values.toArray().mkString(",")
            } else None

        (busiNum, tags.getOrElse(""))
    }

    // (businessId, "tag1,tag2,...")
    val rdd2 = rdd1.map(parseJon(_))
    // drop lines that carried no tags
    val rdd3 = rdd2.filter(t => t._2 != null && !t._2.equals(""))
    // (businessId, tag)
    val rdd4 = rdd3.flatMapValues(_.split(","))
    // ((businessId, tag), 1)
    val rdd5 = rdd4.map(t => (t, 1))
    // ((businessId, tag), count)
    val rdd6 = rdd5.reduceByKey(_ + _)
    // (businessId, List((tag, count)))
    val rdd7 = rdd6.map(t => (t._1._1, (t._1._2, t._2) :: Nil))
    // (businessId, all (tag, count) pairs of that business)
    val rdd8 = rdd7.reduceByKey(_ ++ _)
    // keep the three pairs with the highest count
    val rdd9 = rdd8.map(t => {
        (t._1, t._2.sortBy(-_._2).take(3))
    })

    rdd9.collect().foreach(println)
    println(System.currentTimeMillis - start)

    // Release the context explicitly instead of relying on the JVM shutdown hook.
    sc.stop()
}

Java

package com.it18zhang.spark;

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;

/**
 * Counts review tags per business and prints, for every business, the tags
 * with the highest number of occurrences (at most {@link #TOP_N}).
 *
 * <p>Each input line is expected to look like {@code <businessId>\t<json>};
 * the tags are read from the {@code values} array of the first
 * {@code extInfoList} entry of the JSON document.
 */
public class TempTagGenJava {

    /** Input file used when no path is passed on the command line. */
    private static final String DEFAULT_INPUT = "/Users/wqh/Desktop/data/temptags.txt";

    /** Number of most frequent tags kept per business. */
    private static final int TOP_N = 3;

    /** Separator used between tags in the intermediate string form. */
    private static final String TAG_SEPARATOR = ",";

    /**
     * Runs the job locally and prints one {@code (businessId, tags)} tuple per
     * line, followed by the elapsed time in milliseconds.
     *
     * @param args optional; {@code args[0]} overrides the default input path
     */
    public static void main(String[] args) {
        long start = System.currentTimeMillis();
        String input = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT;

        SparkConf conf = new SparkConf();
        conf.setAppName("tagGen");
        conf.setMaster("local[4]");

        // JavaSparkContext is Closeable: close it deterministically instead of
        // leaving the cleanup to the JVM shutdown hook.
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            // Load the file.
            JavaRDD<String> rdd1 = sc.textFile(input);

            // line -> (businessId, "tag1,tag2,...")
            JavaRDD<Tuple2<String, String>> rdd2 = rdd1.map(TempTagGenJava::parseJson);

            // Drop lines that carried no tags.
            JavaRDD<Tuple2<String, String>> rdd3 =
                    rdd2.filter(t -> t._2() != null && !t._2().isEmpty());

            // Flatten: (businessId, "a,b") -> (businessId, "a"), (businessId, "b")
            JavaPairRDD<String, String> rdd4 = rdd3.flatMapToPair(t -> {
                List<Tuple2<String, String>> pairs = new ArrayList<Tuple2<String, String>>();
                for (String tag : t._2().split(TAG_SEPARATOR)) {
                    pairs.add(new Tuple2<String, String>(t._1(), tag));
                }
                return pairs.iterator();
            });

            // ((businessId, tag), 1)
            JavaPairRDD<Tuple2<String, String>, Integer> rdd5 =
                    rdd4.mapToPair(t -> new Tuple2<Tuple2<String, String>, Integer>(t, 1));

            // Aggregate: ((businessId, tag), count)
            JavaPairRDD<Tuple2<String, String>, Integer> rdd6 =
                    rdd5.reduceByKey((v1, v2) -> v1 + v2);

            // Re-key by business: (businessId, [(tag, count)])
            JavaPairRDD<String, List<Tuple2<String, Integer>>> rdd7 = rdd6.mapToPair(t -> {
                List<Tuple2<String, Integer>> single = new ArrayList<Tuple2<String, Integer>>(1);
                single.add(new Tuple2<String, Integer>(t._1()._2(), t._2()));
                return new Tuple2<String, List<Tuple2<String, Integer>>>(t._1()._1(), single);
            });

            // Collect all (tag, count) pairs of a business into one list.
            // A fresh list is returned so that neither argument is mutated.
            JavaPairRDD<String, List<Tuple2<String, Integer>>> rdd8 = rdd7.reduceByKey((v1, v2) -> {
                List<Tuple2<String, Integer>> merged =
                        new ArrayList<Tuple2<String, Integer>>(v1.size() + v2.size());
                merged.addAll(v1);
                merged.addAll(v2);
                return merged;
            });

            // Sort by count (descending) and keep the first TOP_N entries.
            JavaPairRDD<String, List<Tuple2<String, Integer>>> rdd9 = rdd8.mapToPair(
                    t -> new Tuple2<String, List<Tuple2<String, Integer>>>(t._1(), topN(t._2(), TOP_N)));

            for (Tuple2<String, List<Tuple2<String, Integer>>> entry : rdd9.collect()) {
                System.out.println(entry);
            }
            System.out.println(System.currentTimeMillis() - start);
        }
    }

    /**
     * Returns the {@code n} entries with the highest count, highest first.
     * Entries with equal counts keep their relative input order (stable sort).
     *
     * @param counts (tag, count) pairs; not modified
     * @param n      maximum number of entries to return
     * @return a new list holding at most {@code n} entries
     */
    private static List<Tuple2<String, Integer>> topN(List<Tuple2<String, Integer>> counts, int n) {
        List<Tuple2<String, Integer>> sorted = new ArrayList<Tuple2<String, Integer>>(counts);
        // Integer.compare avoids the overflow that "o2 - o1" can produce.
        sorted.sort((a, b) -> Integer.compare(b._2(), a._2()));
        int limit = Math.min(n, sorted.size());
        return new ArrayList<Tuple2<String, Integer>>(sorted.subList(0, limit));
    }

    /**
     * Parses one input line and returns a key/value tuple.
     *
     * <p>The key is the first tab-separated column. The value is the content of
     * the {@code values} array of the first {@code extInfoList} entry, joined
     * with ",", or the empty string when the line has no JSON column or the
     * JSON has no such non-empty array. A line that splits into zero columns
     * (for example a line made only of tabs) yields {@code ("", "")}.
     *
     * <p>NOTE(review): only the first {@code extInfoList} entry is inspected,
     * whatever its {@code title} is — confirm that this entry is always the
     * tag list.
     *
     * @param line raw text line, tab separated
     * @return tuple of business id and comma-joined tags
     */
    public static Tuple2<String, String> parseJson(String line) {
        String[] arr = line.split("\t");
        // split() drops trailing empty strings, so the array can be empty.
        String busiNum = arr.length > 0 ? arr[0] : "";
        StringBuilder cont = new StringBuilder();

        if (arr.length > 1) {
            JSONObject jo = JSON.parseObject(arr[1]);
            JSONArray jarr = jo == null ? null : jo.getJSONArray("extInfoList");
            if (jarr != null && jarr.size() > 0) {
                JSONObject jo2 = jarr.getJSONObject(0);
                JSONArray commArr = jo2 == null ? null : jo2.getJSONArray("values");
                if (commArr != null) {
                    for (int i = 0; i < commArr.size(); i++) {
                        if (i > 0) {
                            cont.append(TAG_SEPARATOR);
                        }
                        cont.append(commArr.get(i));
                    }
                }
            }
        }
        return new Tuple2<String, String>(busiNum, cont.toString());
    }
}

Result

(83644298,List((体验好,1), (性价比高,1), (服务热情,1)))
(82317795,List((味道差,1)))
(77705462,List((服务热情,3), (羊肉,2), (价格实惠,2)))
(85766086,List((价格实惠,2), (服务热情,2), (味道赞,2)))
(74145782,List((服务热情,18), (味道赞,14), (上菜快,13)))
(71039150,List((团建,1), (价格实惠,1), (朋友聚会,1)))
(70611801,List((干净卫生,4), (回头客,3), (味道赞,2)))
(88902676,List((2,2)))
(73963176,List((味道赞,15), (价格实惠,12), (分量足,11)))
(84270191,List((价格实惠,2), (服务热情,2), (性价比高,2)))
(89223651,List((环境优雅,8), (服务热情,8), (技师专业,7)))
(82016443,List((分量足,3), (味道赞,2), (主食赞,2)))
(77287793,List((干净卫生,29), (环境优雅,26), (音响效果好,26)))
(79197522,List((服务热情,2), (价格实惠,1), (放松舒服,1)))
(83084036,List((干净卫生,1), (价格实惠,1)))
(73879078,List((饮品赞,3), (回头客,2), (味道赞,2)))
(88284865,List((价格实惠,1), (价格高,1), (性价比低,1)))
(83073343,List((干净卫生,17), (味道赞,16), (环境优雅,15)))
(76114040,List((2,3), (5,1), (性价比高,1)))
(86913510,List((午餐,1), (分量适中,1)))
(88496862,List((回头客,5), (味道赞,4), (服务热情,4)))
(78477325,List((味道赞,8), (回头客,7), (干净卫生,5)))
(83981222,List((性价比高,4), (干净卫生,3), (价格实惠,3)))
(82705919,List((回头客,3), (干净卫生,3), (1,2)))
(87994574,List((无推销,12), (价格实惠,8), (服务热情,7)))
(77373671,List((菜品差,1), (服务热情,1), (干净卫生,1)))
(75144086,List((8239,60), (服务热情,38), (8241,31)))
(85648235,List((味道赞,17), (服务热情,15), (干净卫生,13)))
(73607905,List((菜品不错,16), (回头客,15), (干净卫生,15)))
(76893145,List((服务热情,10), (环境优雅,7), (高大上,5)))
(78824187,List((价格实惠,13), (回头客,11), (分量足,10)))
8658
17/09/12 16:10:31 INFO SparkContext: Invoking stop() from shutdown hook
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值