目录
本篇文章记录“各区域热门商品统计”模块中关联城市信息的实现,以及将RDD转换为DataFrame后注册为临时表的过程。
代码
AreaTop3productSpark.java
/** *生成点击商品信息基础临时表 * @param sqlContext * @param cityid2clickActionRDD 商品点击信息 * @param cityid2cityInfoRDD 城市信息 */ private static void generateTempClickProductBasicTable( SQLContext sqlContext, JavaPairRDD<Long,Row> cityid2clickActionRDD, JavaPairRDD<Long,Row> cityid2cityInfoRDD ){ //执行join操作,进行点击行为数据和城市数据进行关联 JavaPairRDD<Long,Tuple2<Row,Row>> joinedRDD = cityid2clickActionRDD.join(cityid2cityInfoRDD); //将上面的JavaPairRDD转换为一个JavaRDD<Row> 才能将RDD转换为Dataset JavaRDD<Row> mappedRDD = joinedRDD.map( new Function<Tuple2<Long, Tuple2<Row, Row>>, Row>() { private static final long serialVersionUID = 1L; @Override public Row call(Tuple2<Long, Tuple2<Row, Row>> tuple) throws Exception { long cityid = tuple._1; Row clickAction = tuple._2._1; Row cityInfo = tuple._2._2; long productid= clickAction.getLong(1); String cityName = cityInfo.getString(1); String area = cityInfo.getString(2); return RowFactory.create(cityid,cityName,area,productid); } }); //基于JavaRDD<Row>的格式,就可以将其转化为Dataset List<StructField> structFields = new ArrayList<StructField>(); structFields.add(DataTypes.createStructField("city_id", DataTypes.LongType,true)); structFields.add(DataTypes.createStructField("city_name", DataTypes.StringType,true)); structFields.add(DataTypes.createStructField("area", DataTypes.StringType,true)); structFields.add(DataTypes.createStructField("product_id", DataTypes.LongType,true)); StructType schema = DataTypes.createStructType(structFields); Dataset ds = sqlContext.createDataFrame(mappedRDD,schema); //将Dataset中的数据注册为临时表 ds.registerTempTable("tmp_clk_prod_basic"); }