应工作要求,需要将 HDFS 中的日志(logs)文件转换成 Parquet 文件;这些日志文件是追加写入、按天存储的。
由于对 Spark 只是粗略了解,故设计为:程序启动时先执行一次转换,此后每天 3 点定时执行一次。
源码:https://github.com/zjf92/spark-1.4.0-TimerTask
package com.spark.parquet;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Timer;
import java.util.TimerTask;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;
public class ParquetJob {
private static JavaSparkContext sc;
priva