Java Data Statistics
Built on Spark 2.1.2; covers Dataset usage and Spark Streaming data statistics.
Project repository: https://github.com/baifanwudi/big-data-analysis
Code Examples
SparkSQL demo: read JSON log files and write them into a Hive table
package com.adups.offline.hive.log;
import com.adups.base.AbstractSparkSql;
import com.adups.config.FlumePath;
import com.adups.util.DateUtil;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
public class OtaAppLog extends AbstractSparkSql {

    private Logger logger = LoggerFactory.getLogger(OtaAppLog.class);

    /**
     * Reads the day's JSON logs with an explicit schema, de-duplicates them,
     * and overwrites the matching Hive partition.
     */
    @Override
    public void executeProgram(String pt, String path, SparkSession spark) throws IOException {

        int partitionNum = 4;
        String ptWithPre = DateUtil.pathPtWithPre(pt);
        String appLogPath = FlumePath.APP_LOG_PATH + ptWithPre;
        if (!existsPath(appLogPath)) {
            return;
        }

        Dataset<Row> otaAppLog = spark.read().schema(produceSchema()).json(appLogPath).distinct().repartition(partitionNum);
        otaAppLog.createOrReplaceTempView("OtaAppLog");
        beforePartition(spark);

        String sql = "insert overwrite table ota_app_log partition(pt='" + pt + "') " +
                "select mid,ip,version,deviceId,productId,continentEn,continentZh,countryEn,countryZh,provinceEn,provinceZh,cityEn,cityZh," +
                "networktype,lac,cid,mcc,mnc,rxlev,num,goType,createTime,dataType from OtaAppLog";
        logger.warn("executing sql is :" + sql);
        spark.sql(sql);
    }

    /**
     * Builds the explicit schema for the JSON input so Spark does not have to infer column types.
     */
    public StructType produceSchema() {
        List<StructField> inputFields = new ArrayList<>();
        String splitSeq = ",";
        String stringType = "mid,ip,version,continentEn,continentZh,countryEn,countryZh,provinceEn,provinceZh," +
                "cityEn,cityZh,networktype,deviceId,lac,cid,mcc,mnc,rxlev,dataType";
        String timeType = "createTime";
        String longType = "productId";
        String integerType = "num,goType";
        for (String stringTmp : stringType.split(splitSeq)) {
            inputFields.add(DataTypes.createStructField(stringTmp, DataTypes.StringType, true));
        }
        inputFields.add(DataTypes.createStructField(timeType, DataTypes.TimestampType, false));
        for (String integerTmp : integerType.split(splitSeq)) {
            inputFields.add(DataTypes.createStructField(integerTmp, DataTypes.IntegerType, true));
        }
        for (String longTmp : longType.split(splitSeq)) {
            inputFields.add(DataTypes.createStructField(longTmp, DataTypes.LongType, false));
        }
        return DataTypes.createStructType(inputFields);
    }

    public static void main(String[] args) throws Exception {
        String pt = DateUtil.producePtOrYesterday(args);
        OtaAppLog otaAppLog = new OtaAppLog();
        otaAppLog.runAll(pt);
    }
}
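The DateUtil helpers referenced above are not shown in this snippet. Below is a minimal sketch of what they might look like, assuming pathPtWithPre turns a date such as 2017-10-11 into the partition path suffix /pt=2017-10-11 and producePtOrYesterday falls back to yesterday's date when no argument is passed; the real implementation in the repository may differ.

// Hypothetical sketch of the DateUtil helpers used by the jobs above.
package com.adups.util;

import java.time.LocalDate;
import java.time.format.DateTimeFormatter;

public class DateUtil {

    private static final DateTimeFormatter FMT = DateTimeFormatter.ofPattern("yyyy-MM-dd");

    /** Assumed to turn "2017-10-11" into the partition path suffix "/pt=2017-10-11". */
    public static String pathPtWithPre(String pt) {
        return "/pt=" + pt;
    }

    /** Assumed to return today's date as "yyyy-MM-dd". */
    public static String nowPtDay() {
        return LocalDate.now().format(FMT);
    }

    /** Assumed to take args[0] as the pt date, falling back to yesterday when absent. */
    public static String producePtOrYesterday(String[] args) {
        if (args != null && args.length > 0 && args[0] != null && !args[0].isEmpty()) {
            return args[0];
        }
        return LocalDate.now().minusDays(1).format(FMT);
    }
}

The AbstractSparkSql base class that OtaAppLog extends follows below.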
package com.adups.base;
import com.adups.config.HiveConfig;
import org.apache.hadoop.fs.Path;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
/**
* @author allen
* Created by allen on 04/08/2017.
*/
public abstract class AbstractSparkSql extends AbstractFileSystem {

    private Logger logger = LoggerFactory.getLogger(this.getClass());

    /**
     * Runs the Spark computation for one partition of data.
     *
     * @param pt    partition date in the format pt=2017-10-11
     * @param path  HDFS source path (may be null)
     * @param spark the active SparkSession
     * @throws IOException
     */
    public abstract void executeProgram(String pt, String path, SparkSession spark) throws IOException;

    public boolean existsPath(String... pathList) throws IOException {
        for (String path : pathList) {
            if (!fileSystem.exists(new Path(path))) {
                logger.error("the path:" + path + " does not exist");
                return false;
            } else {
                logger.warn("executing the path is : " + path);
            }
        }
        return true;
    }

    public void runAll(String pt, String path, Boolean isHiveSupport) throws IOException {
        if (path != null && !existsPath(path)) {
            logger.error("the src path does not exist:" + path);
            return;
        }
        executeSpark(pt, path, isHiveSupport);
    }

    /**
     * No path check; Hive support is enabled by default.
     */
    public void runAll(String pt) throws IOException {
        runAll(pt, null, true);
    }

    public void runAll(String pt, String path) throws IOException {
        runAll(pt, path, true);
    }

    public void runAll(String pt, Boolean isHiveSupport) throws IOException {
        runAll(pt, null, isHiveSupport);
    }

    private void executeSpark(String pt, String path, Boolean isHiveSupport) throws IOException {
        SparkSession spark;
        String appName = this.getClass().getSimpleName();
        if (isHiveSupport) {
            spark = SparkSession.builder().appName(appName).enableHiveSupport().getOrCreate();
            logger.info("spark enable hive, begin to execute the program");
            useDataBase(spark);
        } else {
            spark = SparkSession.builder().appName(appName).getOrCreate();
            logger.info("spark begin to execute the program");
        }
        executeProgram(pt, path, spark);
        logger.info("spark has finished the program");
    }

    private void useDataBase(SparkSession spark) {
        logger.info("before the sql : " + HiveConfig.SQL_DATABASE);
        spark.sql(HiveConfig.SQL_DATABASE);
    }

    public void beforePartition(SparkSession spark) {
        spark.sql(HiveConfig.HIVE_PARTITION);
    }
}
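AbstractSparkSql extends AbstractFileSystem, which supplies the fileSystem handle used by existsPath. A minimal sketch, assuming the base class simply initializes a Hadoop FileSystem from the default cluster configuration; the actual class in the repository may do more.

// Hypothetical sketch of the AbstractFileSystem base class; only the
// fileSystem field is required by existsPath above, the rest is assumed.
package com.adups.base;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import java.io.IOException;

public abstract class AbstractFileSystem {

    /** Shared HDFS handle, assumed to be built from the default Hadoop configuration. */
    protected FileSystem fileSystem;

    public AbstractFileSystem() {
        try {
            fileSystem = FileSystem.get(new Configuration());
        } catch (IOException e) {
            throw new RuntimeException("failed to initialize the Hadoop FileSystem", e);
        }
    }
}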
Spark demo: aggregate device statistics and write the results to MySQL
package com.adups.online.flume;
import com.adups.base.AbstractSparkSql;
import com.adups.bean.out.DeviceArea;
import com.adups.config.OnlineOfflinePath;
import com.adups.common.ReadTable;
import com.adups.common.sql.flume.DeviceAreaOnlineSave;
import com.adups.util.CommonUtil;
import com.adups.util.DateUtil;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.functions;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.collection.Seq;
import static org.apache.spark.sql.functions.*;
import java.io.IOException;
/**
* @author allen
* Created by allen on 03/08/2017.
*/
public class DeviceAreaOnline extends AbstractSparkSql {

    private Logger logger = LoggerFactory.getLogger(this.getClass());

    @Override
    public void executeProgram(String pt, String path, SparkSession spark) throws IOException {

        String prePath = DateUtil.pathPtWithPre(pt);
        String nowPt = DateUtil.nowPtDay();
        String beginTime = nowPt + " 00:00:00";
        String endTime = nowPt + " 23:59:59";

        String deviceTotal = OnlineOfflinePath.OFFLINE_DEVICE_NEW_TOTAL_PATH + prePath;
        String deviceAreaTotal = OnlineOfflinePath.OFFLINE_DEVICE_AREA_NEW_TOTAL_PATH + prePath;
        String originAreaPath = OnlineOfflinePath.ONLINE_DEVICE_AREA_NEW_TOTAL_PATH;
        if (!existsPath(deviceTotal, deviceAreaTotal)) {
            return;
        }

        // Devices registered today, read from MySQL via a pushed-down sub-query.
        String where = "(select product_id as productId,device_id as deviceId,country_zh as country,province_zh as province from iot_register.device_info " +
                "where create_time between '" + beginTime + "' and '" + endTime + "' ) as device_time_filter";
        Dataset<Row> todayDevice = new ReadTable().loadTable(spark, where).coalesce(1);

        // Overall totals: yesterday's cumulative counts plus today's distinct new devices.
        Dataset<Row> yesterdayStats = spark.read().parquet(deviceTotal).select("productId", "totalNum");
        Dataset<Row> totalIncrement = todayDevice.groupBy("productId").agg(functions.countDistinct("deviceId").as("newNum"));

        Seq<String> seq = CommonUtil.columnNames("productId");
        Seq<String> naFillZero = CommonUtil.columnNames("newNum,totalNum");

        Dataset<Row> result = yesterdayStats.join(totalIncrement, seq, "outer").na().fill(0, naFillZero)
                .select(col("productId"), col("newNum"), col("newNum").plus(col("totalNum")).as("totalNum"))
                .withColumn("pt", lit(nowPt)).coalesce(1);

        // Per-area totals, grouped by product, country and province.
        Dataset<Row> yesterdayAreaStatistics = spark.read().parquet(deviceAreaTotal).select("productId", "country", "province", "totalNum").toDF();
        Dataset<Row> areaIncrement = todayDevice.groupBy("productId", "country", "province").agg(functions.countDistinct("deviceId").as("newNum"));

        seq = CommonUtil.columnNames("productId,country,province");
        Dataset<Row> areaResult = yesterdayAreaStatistics.join(areaIncrement, seq, "outer").na().fill(0, naFillZero)
                .select(col("productId"), col("country"), col("province"), col("newNum"),
                        col("newNum").plus(col("totalNum")).as("totalNum")).withColumn("pt", lit(nowPt)).coalesce(1);

        // Only the rows that changed since the last run are pushed to MySQL.
        Dataset<DeviceArea> deltaArea;
        if (existsPath(originAreaPath)) {
            try {
                Dataset<Row> originBase = spark.read().parquet(originAreaPath);
                deltaArea = areaResult.except(originBase).coalesce(1).as(new DeviceArea().produceBeanEncoder());
            } catch (Exception e) {
                logger.error(e.getMessage());
                deltaArea = areaResult.as(new DeviceArea().produceBeanEncoder());
            }
        } else {
            deltaArea = areaResult.as(new DeviceArea().produceBeanEncoder());
        }

        try {
            insertToMysql(deltaArea);
        } catch (Exception e) {
            logger.error(e.getMessage());
        }

        // Persist the new snapshots so the next run can diff against them.
        areaResult.write().mode("overwrite").format("parquet").save(originAreaPath);
        result.write().mode("overwrite").format("parquet").save(OnlineOfflinePath.ONLINE_DEVICE_NEW_TOTAL_PATH);
    }

    public void insertToMysql(Dataset<DeviceArea> dataSet) {
        dataSet.foreachPartition(data -> {
            String sql = "insert into stats_device_area(product_id,country,province,new_num,total_num,pt) " +
                    "values (?,?,?,?,?,?) on duplicate key update new_num=?,total_num=?";
            new DeviceAreaOnlineSave().putDataBatch(data, sql);
        });
    }

    public static void main(String[] args) throws IOException {
        String pt = DateUtil.producePtOrYesterday(args);
        DeviceAreaOnline deviceAreaOnline = new DeviceAreaOnline();
        deviceAreaOnline.runAll(pt, false);
    }
}
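DeviceAreaOnlineSave is referenced above but not shown. Below is a minimal sketch of what putDataBatch might do, assuming a plain JDBC batch upsert, placeholder connection settings, and DeviceArea getters named after the columns used in the SQL; the real class in the repository may differ.

// Hypothetical sketch of the per-partition MySQL writer used by insertToMysql.
package com.adups.common.sql.flume;

import com.adups.bean.out.DeviceArea;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.util.Iterator;

public class DeviceAreaOnlineSave {

    // Placeholder connection settings; the real values presumably come from configuration.
    private static final String URL = "jdbc:mysql://localhost:3306/stats";
    private static final String USER = "user";
    private static final String PASSWORD = "password";

    /** Binds every row of the partition to the upsert statement and executes one batch. */
    public void putDataBatch(Iterator<DeviceArea> rows, String sql) throws Exception {
        try (Connection conn = DriverManager.getConnection(URL, USER, PASSWORD);
             PreparedStatement ps = conn.prepareStatement(sql)) {
            while (rows.hasNext()) {
                // Getter names and numeric types are assumed from the column names in the SQL above.
                DeviceArea area = rows.next();
                ps.setLong(1, area.getProductId());
                ps.setString(2, area.getCountry());
                ps.setString(3, area.getProvince());
                ps.setLong(4, area.getNewNum());
                ps.setLong(5, area.getTotalNum());
                ps.setString(6, area.getPt());
                // Duplicate-key branch: refresh the counters.
                ps.setLong(7, area.getNewNum());
                ps.setLong(8, area.getTotalNum());
                ps.addBatch();
            }
            ps.executeBatch();
        }
    }
}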