[1.1.0] Writing the SparkUtils Utility Class

Scenario

Extract the Spark code shared by all business sub-modules into a single utility class. It mainly covers:

  • Mocking the business-table data of the production Hive warehouse - for local development and testing
    The data a Spark application processes is typically large and lives in the Hive warehouse. This project involves two Hive tables: the user visit action table user_visit_action and the user info table user_info. To test the application code locally, we have to write code that generates mock data for both tables.
  • Creating the SQLContext and related objects
    By changing a few parameters in the configuration file, the same Application switches automatically between local test mode and the production cluster mode; a sketch of what that configuration lookup might look like follows this list.
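SparkUtils below depends on ConfigurationManager and Constants, whose sources are not part of this post. A minimal sketch of what they might look like, assuming a my.properties file on the classpath; the file name and key values here are my guesses for illustration, not the project's actual ones:

package cool.pengych.sparker.conf;

import java.io.InputStream;
import java.util.Properties;

/**
 * Hypothetical sketch: loads the configuration once from the classpath
 */
public class ConfigurationManager
{
    private static final Properties prop = new Properties();

    static
    {
        try (InputStream in = ConfigurationManager.class
                .getClassLoader().getResourceAsStream("my.properties"))
        {
            prop.load(in);
        }
        catch (Exception e)
        {
            throw new ExceptionInInitializerError(e);
        }
    }

    public static Boolean getBoolean(String key)
    {
        // a missing key defaults to false, i.e. cluster mode
        return Boolean.valueOf(prop.getProperty(key, "false"));
    }
}

package cool.pengych.sparker.constant;

public interface Constants
{
    String LOCAL_DEPLOY = "spark.local";   // assumed property key
    String PARAM_START_DATE = "startDate"; // assumed task-parameter keys
    String PARAM_END_DATE = "endDate";
}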

代码

  • SparkUtils.java
package cool.pengych.sparker.util;

import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.hive.HiveContext;
import com.alibaba.fastjson.JSONObject;
import cool.pengych.sparker.conf.ConfigurationManager;
import cool.pengych.sparker.constant.Constants;

/**
 * Code shared by all business modules of the Spark application
 * @author pengyucheng
 */
public class SparkUtils
{
    private static final boolean IS_LOCAL = ConfigurationManager.getBoolean(Constants.LOCAL_DEPLOY);
    /**
     * Deploy mode: local or cluster
     */
    public static void setMaster(SparkConf conf)
    {
        if(IS_LOCAL)
        {
            conf.setMaster("local");
        }
    }

    /**
     * Local mode uses a plain SQLContext; production uses a HiveContext
     * @param sc SparkContext
     * @return SQLContext
     */
    public static SQLContext getSQLContext(SparkContext sc)
    {
        if(IS_LOCAL)
        {
            return new SQLContext(sc);
        }
        else
        {
            return new HiveContext(sc);
        }
    }

    /**
     * Mock the Hive warehouse data; local development and testing only
     * @param sc
     * @param sqlContext
     */
    public static void mockData(JavaSparkContext sc, SQLContext sqlContext)
    {
        if(IS_LOCAL)
        {
            LocalDataGenerator.mockData(sc, sqlContext);
        }
    }

    /**
     * Fetch the user action data within the given date range
     * @param sqlContext
     * @param taskParam task parameters holding the date range
     * @return JavaRDD<Row>
     */
    public static JavaRDD<Row> getActionRDDByRange(SQLContext sqlContext, JSONObject taskParam)
    {
        String startDate = taskParam.getString(Constants.PARAM_START_DATE);
        String endDate = taskParam.getString(Constants.PARAM_END_DATE);
        // date is a string column, so the literals must be single-quoted
        String sql = "select * from user_visit_action "
                + "where date >= '" + startDate + "' and date <= '" + endDate + "'";
        return sqlContext.sql(sql).toJavaRDD();
    }
}
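For reference, here is a hypothetical driver showing the intended call order of these helpers; the class name, application name and task-parameter JSON are made up for illustration (in the real project the parameters come from a task table):

package cool.pengych.sparker.session;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;
import com.alibaba.fastjson.JSONObject;
import cool.pengych.sparker.util.SparkUtils;

/**
 * Hypothetical driver wiring the SparkUtils helpers together
 */
public class SparkUtilsDemo
{
    public static void main(String[] args)
    {
        SparkConf conf = new SparkConf().setAppName("SparkUtilsDemo");
        SparkUtils.setMaster(conf); // sets "local" only when the config says so

        JavaSparkContext jsc = new JavaSparkContext(conf);
        SQLContext sqlContext = SparkUtils.getSQLContext(jsc.sc());

        SparkUtils.mockData(jsc, sqlContext); // no-op in cluster mode

        // assumed task parameters, hard-coded here for the demo
        JSONObject taskParam = JSONObject.parseObject(
                "{\"startDate\":\"2016-06-28\",\"endDate\":\"2016-06-28\"}");
        JavaRDD<Row> actionRDD = SparkUtils.getActionRDDByRange(sqlContext, taskParam);
        System.out.println("action rows: " + actionRDD.count());

        jsc.stop();
    }
}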
  • LocalDataGenerator.java
package cool.pengych.sparker.util;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.List;
import java.util.UUID;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;
import java.util.Random;

/**
 * User action types:
 * search, click, order and pay
 * @author pengyucheng
 */
enum ActionType
{
    SEARCH, CLICK, ORDER, PAY;
}

/**
 * Generates the mock data used when testing the Spark application locally
 * @author pengyucheng
 */
public class LocalDataGenerator
{
    /**
     * Mock the production Hive table data; local testing only
     * @param jsc
     * @param sqlContext
     */
    public static void mockData(JavaSparkContext jsc, SQLContext sqlContext)
    {
        mockUserInfo(jsc,sqlContext);
        mockUserVisitAction(jsc, sqlContext);
    }

    /**
     * Mock the data of the Hive table user_info
     * @param jsc
     * @param sqlContext
     */
    public static void mockUserInfo(JavaSparkContext jsc, SQLContext sqlContext)
    {
        /*
         * 1. Build a List<Row> and declare the type of each column:
         *    this turns the raw records into structured data
         */
        List<Row> rows = new ArrayList<Row>();
        Row row = null;

        long userId = 0;
        String username = null;
        String name = null;
        int age = 0;
        String professional = null;
        String city = null;
        String[] sexStrs = new String[]{"man","woman"};
        String sex = null;
        Random random = new Random(666);
        for (int i = 0; i < 100; i++)
        {
            userId = random.nextInt(100);
            username = "username"+userId;
            name = "name"+userId;
            age = random.nextInt(100);
            professional = "professional"+userId;
            city = "city"+userId;
            sex = sexStrs[random.nextInt(2)];
            row = RowFactory.create(userId,username,name,age,professional,city,sex);
            rows.add(row);
        }
        JavaRDD<Row> rowRDD = jsc.parallelize(rows);
        // the schema must cover every field of the Row, including sex
        StructType st = DataTypes.createStructType(Arrays.asList(
                DataTypes.createStructField("user_id", DataTypes.LongType, false),
                DataTypes.createStructField("username", DataTypes.StringType, true),
                DataTypes.createStructField("name", DataTypes.StringType, true),
                DataTypes.createStructField("age", DataTypes.IntegerType, true),
                DataTypes.createStructField("professional", DataTypes.StringType, true),
                DataTypes.createStructField("city", DataTypes.StringType, true),
                DataTypes.createStructField("sex", DataTypes.StringType, true)
                ));

        /*
         * 2. Convert the rowRDD into a DataFrame
         */
        DataFrame df = sqlContext.createDataFrame(rowRDD, st);
        df.printSchema();

        /*
         * 3. Register the in-memory data as a temporary table
         */
        df.registerTempTable("user_info");
    }

    /**
     * Mock the data of the Hive table user_visit_action
     * @param jsc
     * @param sqlContext
     */
    public static void mockUserVisitAction(JavaSparkContext jsc, SQLContext sqlContext)
    {
        List<Row> rows = new ArrayList<Row>();
        Random random = new Random();
        String date = DateUtils.formatDate(new Date());
        String[] searchKeyWords = new String[]{"钢琴","吉他","hadoop","spark"};
        for (int i = 0; i < 100; i++)
        {
            long userId = random.nextInt(100);
            for (int j = 0; j < 10; j++) 
            {
                String sessionId = UUID.randomUUID().toString();
                for (int k = 0; k < 10; k++) 
                {
                    String searchKeyWord = null;
                    Long clickCategoryId = null;
                    Long clickProductId = null;
                    String orderCategoryIds = null; 
                    String orderProductIds = null;
                    String payCategoryIds = null;
                    String payProductIds = null;

                    long pageid = random.nextInt(10);  
                    String actionTime = date + " " + random.nextInt(24)+":"+random.nextInt(60)+":"+random.nextInt(60);
                    ActionType actionType = ActionType.values()[random.nextInt(ActionType.values().length)];
                    switch (actionType)
                    {
                    case SEARCH:
                        searchKeyWord = searchKeyWords[random.nextInt(searchKeyWords.length)];
                        break;
                    case CLICK:
                        clickCategoryId = Long.valueOf(random.nextInt(100));
                        clickProductId = Long.valueOf(random.nextInt(100));
                        break;
                    case ORDER:
                        orderCategoryIds = getRandomStringArrs();  
                        orderProductIds = getRandomStringArrs();
                        break;
                    case PAY:
                        payCategoryIds = getRandomStringArrs();  
                        payProductIds = getRandomStringArrs();
                        break;
                    default:
                        break;
                    }
                    Row row = RowFactory.create(date, userId, sessionId, 
                            pageid, actionTime, searchKeyWord,
                            clickCategoryId, clickProductId,
                            orderCategoryIds, orderProductIds,
                            payCategoryIds, payProductIds);
                    rows.add(row);
                }
            }
        }
        StructType type = DataTypes.createStructType(Arrays.asList(
                DataTypes.createStructField("date", DataTypes.StringType, false),
                DataTypes.createStructField("user_id", DataTypes.LongType, true),
                DataTypes.createStructField("session_id", DataTypes.StringType, true),
                DataTypes.createStructField("page_id", DataTypes.LongType, true),
                DataTypes.createStructField("action_time", DataTypes.StringType, true),
                DataTypes.createStructField("search_keyword", DataTypes.StringType, true),
                DataTypes.createStructField("click_category_id", DataTypes.LongType, true),
                DataTypes.createStructField("click_product_id", DataTypes.LongType, true),
                DataTypes.createStructField("order_category_ids", DataTypes.StringType, true),
                DataTypes.createStructField("order_product_ids", DataTypes.StringType, true),
                DataTypes.createStructField("pay_category_ids", DataTypes.StringType, true),
                DataTypes.createStructField("pay_product_ids", DataTypes.StringType, true)));

        DataFrame df = sqlContext.createDataFrame(jsc.parallelize(rows), type);
        df.registerTempTable("user_visit_action");

        /************** for testing only **************/
        List<Row> rows2 = df.toJavaRDD().take(1);
        for (Row row : rows2) 
        {
            System.out.println(row);
        }
    }

    /**
     * Generate a random number of ids,
     * e.g. the category ids of an order
     * @return the ids as a comma-separated string
     */
    private static String getRandomStringArrs()
    {
        Random random = new Random();
        StringBuilder sb = new StringBuilder();
        int ids = random.nextInt(7); // may be 0, which yields an empty string
        for (int i = 0; i < ids; i++)
        {
            sb.append(random.nextInt(100));
            if (i < ids - 1)
            {
                sb.append(",");
            }
        }
        return sb.toString();
    }
}
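Note that mockUserVisitAction calls DateUtils.formatDate, a project helper whose source is not included in this post. A minimal sketch consistent with its use here, assuming a yyyy-MM-dd pattern (inferred from the sample output below):

package cool.pengych.sparker.util;

import java.text.SimpleDateFormat;
import java.util.Date;

/**
 * Hypothetical sketch of the DateUtils helper referenced above
 */
public class DateUtils
{
    private static final SimpleDateFormat DATE_FORMAT = new SimpleDateFormat("yyyy-MM-dd");

    public static String formatDate(Date date)
    {
        // SimpleDateFormat is not thread-safe; synchronize to stay safe
        synchronized (DATE_FORMAT)
        {
            return DATE_FORMAT.format(date);
        }
    }
}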
Output from running LocalDataGenerator (the ids columns hold comma-separated strings, so a printed row shows more commas than there are columns):
16/06/28 00:05:04 INFO BlockManagerMaster: Registered BlockManager
root
 |-- user_id: long (nullable = false)
 |-- username: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- professional: string (nullable = true)
 |-- city: string (nullable = true)
 |-- sex: string (nullable = true)

16/06/28 00:05:08 INFO SparkContext: Starting job: take at LocalDataGenerator.java:193
16/06/28 00:05:09 INFO DAGScheduler: Job 0 finished: take at LocalDataGenerator.java:193, took 1.128053 s
[2016-06-28,60,e689c129-2bac-4fc0-82f4-2c17bee21066,0,2016-06-28 1:44:29,null,null,null,,65,66,40,12,43,74,null,null]
16/06/28 00:05:09 INFO SparkContext: Invoking stop() from shutdown hook

Summary

00:07 Good night, Spark
