I. Project Overview
This project does offline analysis: the logs are analysed to produce aggregated statistics, which are finally presented as charts.
Steps: 1. A shell script uploads the logs to HDFS.
2. An MR jar cleans the data and keeps only the valuable fields.
3. The data is loaded into a Hive external table; the total upstream traffic per IP is written to a separate table.
4. The final data is exported to MySQL with Sqoop.
5. A PHP page displays the result as a bar chart.
6. All of the above steps run under Azkaban; Spark can replace steps 2, 3 and 4.
II. Shell log-collection script: upload to HDFS
1. Requirements
Clickstream logs are produced every day in a designated folder on the business application servers and need to be uploaded to the data warehouse (Hadoop HDFS) in near real time.
2. Approach
1. A shell script iterates over the files in the log folder; every file whose name matches the log naming pattern is moved to a staging ("to upload") folder.
2. A list file records the paths of the files waiting to be uploaded; the script iterates over those paths, uploads each file, and then renames the list file to mark it as processed.
3. The script
#!/bin/bash
#set java env
export JAVA_HOME=/root/apps/jdk1.8.0_131/
export JRE_HOME=${JAVA_HOME}/jre
export CLASSPATH=.:${JAVA_HOME}/lib:${JRE_HOME}/lib
export PATH=${JAVA_HOME}/bin:$PATH
#set hadoop env
export HADOOP_HOME=/root/apps/hadoop-2.7.2/
export PATH=${HADOOP_HOME}/bin:${HADOOP_HOME}/sbin:$PATH
#directory where the raw log files are produced
log_src_dir=/root/data/logs/
#staging directory for files waiting to be uploaded
log_toupload_dir=/root/data/logs/toupload/
#print environment info
echo "envs: hadoop_home: $HADOOP_HOME"
day_01=`date -d'-1 day' +%Y-%m-%d`
syear=`date --date=$day_01 +%Y`
smonth=`date --date=$day_01 +%m`
sday=`date --date=$day_01 +%d`
#echo $day_01
#echo $syear
#echo $smonth
#echo $sday
#root HDFS path for the uploaded log files
hdfs_root_dir=/data/clickLog/$syear/$smonth/$sday
hadoop fs -mkdir -p $hdfs_root_dir
#scan the log source directory for files that need to be uploaded
echo "log_src_dir:"$log_src_dir
ls $log_src_dir | while read fileName
do
if [[ "$fileName" == access.log.* ]]; then
# if [ "access.log" = "$fileName" ];then
date=`date +%Y_%m_%d_%H_%M_%S`
#move the file to the staging directory and rename it
#log what is being moved
echo "moving $log_src_dir$fileName to $log_toupload_dir"xxxxx_click_log_$fileName"$date"
mv $log_src_dir$fileName $log_toupload_dir"xxxxx_click_log_$fileName"$date
#append the staged file's path to a list file named willDoing.<timestamp>
echo $log_toupload_dir"xxxxx_click_log_$fileName"$date >> $log_toupload_dir"willDoing."$date
fi
done
#find the willDoing list files: names containing 'will' but not yet marked _COPY_ or _DONE_
ls $log_toupload_dir | grep will |grep -v "_COPY_" | grep -v "_DONE_" | while read line
do
#log the list file being processed
echo "toupload is in file:"$line
#rename the list file to ..._COPY_ to mark it as in progress
mv $log_toupload_dir$line $log_toupload_dir$line"_COPY_"
#read the _COPY_ list line by line; each entry is the path of one staged file to upload
cat $log_toupload_dir$line"_COPY_" | while read path
do
#upload one file to hdfs
echo "putting...$path to hdfs path.....$hdfs_root_dir"
hadoop fs -put $path $hdfs_root_dir
done
#mark the whole list as done
mv $log_toupload_dir$line"_COPY_" $log_toupload_dir$line"_DONE_"
done
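The script above is driven by Azkaban later (section VI); if you want to run it on a schedule on its own, a crontab entry such as the following works (the install path and time are assumptions):
#upload yesterday's logs every day at 01:00 (script path is an assumption)
0 1 * * * /bin/bash /root/shell/upload.sh >> /root/shell/upload.log 2>&1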
III. Data cleaning with MapReduce: keep only the valuable data
pom dependencies (note: if the MapReduce classes do not resolve with hadoop-common alone, you may also need hadoop-client or hadoop-mapreduce-client-core):
<dependencies>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>2.7.3</version>
</dependency>
<dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
<version>1.2.17</version>
</dependency>
<dependency>
<groupId>jdk.tools</groupId>
<artifactId>jdk.tools</artifactId>
<version>1.8</version>
<scope>system</scope>
<systemPath>D:/java/lib/tools.jar</systemPath>
<!-- or: <systemPath>${JAVA_HOME}/lib/tools.jar</systemPath> -->
</dependency>
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-jdbc</artifactId>
<version>2.1.0</version>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.33</version>
</dependency>
</dependencies>
The mapper class splits each log line and keeps only the valuable fields:
package com.xin;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
* Created by xinBa.
* User: 辛聪明
* Date: 2020/3/26
*/
public class AccessLogMapper extends Mapper<LongWritable,Text,Text, NullWritable> {
Text text = new Text();
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] strings = value.toString().split(" ");
//filter out dirty data: a line with fewer than 11 fields is malformed, so skip this record
if(strings.length < 11){
return;
}
String ip = strings[0];
String date = AnalysisNginxTool.nginxDateStmpToDate(strings[3]);
String url = strings[6];
String upFlow = strings[9];
text.set(ip+","+date+","+url+","+upFlow);
context.write(text,NullWritable.get());
}
}
Driver class AccessLogDriver.java
package com.xin;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class AccessLogDriver {
public static void main(String[] args) throws Exception {
DateToNUM.initMap();
Configuration conf = new Configuration();
if(args.length != 2){
args = new String[2];
args[0] = "hdfs://hdp-1:9000/data/clickLog/20200326";
args[1] = "hdfs://hdp-1:9000/data/hive/test" ;
}
Job job = Job.getInstance(conf); // create the job from the configuration
job.setJarByClass(AccessLogDriver.class);
job.setMapperClass(AccessLogMapper.class); // set the Mapper class
// no Reducer: this is a map-only job
job.setNumReduceTasks(0);
job.setMapOutputKeyClass(Text.class);// key class of the map output
job.setMapOutputValueClass(NullWritable.class);// value class of the map output
FileInputFormat.addInputPath(job, new Path(args[0])); // input path
FileOutputFormat.setOutputPath(job, new Path(args[1]));// output path (must not exist yet)
System.exit(job.waitForCompletion(true) ? 0 : 1); // run the job and exit with its status
}
}
Utility class AnalysisNginxTool.java
package com.xin;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class AnalysisNginxTool {
private static Logger logger = LoggerFactory.getLogger(AnalysisNginxTool.class);
//convert the nginx timestamp into an all-numeric date string (yyyy/MM/dd)
public static String nginxDateStmpToDate(String date) {
String res = "";
try {
SimpleDateFormat df = new SimpleDateFormat("[dd/MM/yyyy:HH:mm:ss");
String datetmp = date.split(" ")[0].toUpperCase();
String mtmp = datetmp.split("/")[1];
//populate the month-abbreviation -> number map
DateToNUM.initMap();
//replace the month abbreviation with its number
datetmp = datetmp.replaceAll(mtmp, (String) DateToNUM.map.get(mtmp));
System.out.println("date is: "+datetmp);
//reformat the parsed date as yyyy/MM/dd
Date d = df.parse(datetmp);
SimpleDateFormat sdf = new SimpleDateFormat("yyyy/MM/dd");
res = sdf.format(d);
} catch (ParseException e) {
logger.error("error:" + date, e);
System.out.println("error:" + date+" msg"+e.getMessage());
}
return res;
}
// convert the nginx timestamp into epoch milliseconds
public static long nginxDateStmpToDateTime(String date) {
long l = 0;
try {
SimpleDateFormat df = new SimpleDateFormat("[dd/MM/yyyy:HH:mm:ss");
String datetmp = date.split(" ")[0].toUpperCase();
String mtmp = datetmp.split("/")[1];
datetmp = datetmp.replaceAll(mtmp, (String) DateToNUM.map.get(mtmp));
Date d = df.parse(datetmp);
l = d.getTime();
} catch (ParseException e) {
logger.error("error:" + date, e);
System.out.println("error:" + date+" msg"+e.getMessage());
}
return l;
}
}
Utility class DateToNUM.java
package com.xin;
import java.util.HashMap;
public class DateToNUM
{
public static HashMap map = new HashMap();
public static void initMap()
{
map.put("JAN", "01");
map.put("FEB", "02");
map.put("MAR", "03");
map.put("APR", "04");
map.put("MAY", "05");
map.put("JUN", "06");
map.put("JUL", "07");
map.put("AUG", "08");
map.put("SEPT", "09");
map.put("OCT", "10");
map.put("NOV", "11");
map.put("DEC", "12");
}
}
Notes: 1. After packaging the jar and uploading it to the Linux machine, run it with hadoop jar followed by the main class:
hadoop jar cliclog-1.0-SNAPSHOT.jar com.xin.AccessLogDriver /data/clickLog/20200326/xxxxx_click_log_access.log.1232020_03_27_13_46_10 /data/hive/test2
2. If you get the error "no main manifest attribute in cliclog-1.0-SNAPSHOT.jar", no main class was specified; alternatively, add a line to MANIFEST.MF:
Main-Class: com.xin.AccessLogDriver    // note: there is a space after the colon
3. The jar itself lives on the Linux filesystem, not on HDFS; the output path (where the cleaned files are written) must be a directory that does not exist yet.
4. The jar processes files on HDFS (the hdfs://hdp-1:9000 prefix can be omitted from the paths), not local or Linux files.
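Before loading into Hive, you can verify the cleaned output directly on HDFS (a sketch using the output path from the example command above; map-only jobs write their output as part-m-* files):
hadoop fs -ls /data/hive/test2
hadoop fs -cat /data/hive/test2/part-m-00000 | head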
IV. Load into the Hive warehouse, then export to MySQL
1. Create the table in Hive
create external table mydb.access(ip string,day string,url string,upflow string) row format delimited fields terminated by ',';
2. Load the data into the external table access
load data inpath '/data/hive/test/' into table mydb.access;
Note: after loading, the source files are moved under /user/hive/warehouse/mydb.db/access.
3. Create the upstream-traffic table and populate it
create external table mydb.upflow (ip string,sum string) row format delimited fields terminated by ',';
insert into mydb.upflow select ip,sum(upflow) as sum from mydb.access group by ip order by sum desc;
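As a quick sanity check before exporting, the aggregated table can be inspected from the shell (a sketch, assuming the hive CLI is on the PATH):
hive -e "select * from mydb.upflow limit 10;"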
4. Export the data to MySQL with Sqoop (the target table must already exist in MySQL; see the sketch after the command)
bin/sqoop export \
--connect jdbc:mysql://localhost:3306/sqoop \
--username root \
--password 123456 \
--table upflow \
--export-dir /user/hive/warehouse/mydb.db/upflow \
--input-fields-terminated-by ',' \
--m 1
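A minimal sketch of creating the target table before the export (the varchar column types are an assumption chosen to match the string columns of the Hive table):
mysql -uroot -p123456 -e "
create database if not exists sqoop;
create table if not exists sqoop.upflow(
  ip  varchar(64),
  sum varchar(64)
);"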
V. Displaying the database data with PHP and ECharts
1. Install phpStudy, place the ECharts JS file under the WWW folder, and write the PHP page
<!DOCTYPE html>
<?php
$dbms='mysql';  //database type
$host='hdp-1';  //database host
$dbName='sqoop';  //database name
$user='root';  //username
$pass='123456';  //password
$dsn="$dbms:host=$host;dbname=$dbName";
$dbh = new PDO($dsn, $user, $pass); //initialise a PDO connection
/* query the upflow table and collect the x/y arrays */
foreach ($dbh->query('SELECT * from upflow') as $row) {
$x[]=$row['ip'];
$y[]=$row['sum'];
// print_r($row); // uncomment to inspect each row's values
}
//encode the query results as JSON for the page
$json_x=json_encode($x);
$json_y=json_encode($y);
?>
<html>
<head>
<meta charset="utf-8">
<!-- include the ECharts file -->
<script src="echarts.simple.min.js"></script>
</head>
<body>
<!-- a DOM container with a defined width and height for ECharts -->
<div id="main" style="width: 600px;height:400px;"></div>
<script type="text/javascript">
// x/y arrays generated by PHP above
var x=<?php echo $json_x?>;
var y=<?php echo $json_y?>;
// initialise the echarts instance on the prepared DOM node
var myChart = echarts.init(document.getElementById('main'));
// chart options and data
var option = {
title : {
text: 'Test data',
subtext: 'purely fictional'
},
tooltip : {
trigger: 'axis'
},
legend: {
data:['Evaporation']
},
toolbox: {
show : true,
feature : {
mark : {show: true},
dataView : {show: true, readOnly: false},
magicType : {show: true, type: ['line', 'bar']},
restore : {show: true},
saveAsImage : {show: true}
}
},
calculable : true,
xAxis : [
{
type : 'category',
data : x
}
],
yAxis : [
{
type : 'value'
}
],
series : [
{
name:'Evaporation',
type:'bar',
data:y,
markPoint : {
data : [
{type : 'max', name: 'Max'},
{type : 'min', name: 'Min'}
]
},
markLine : {
data : [
{type : 'average', name: 'Average'}
]
}
},
]
};
// render the chart with the options and data above
myChart.setOption(option);
</script>
</body>
</html>
Open localhost in the browser and navigate to the corresponding PHP file.
VI. Workflow scheduling with Azkaban
1. upload.job runs the script that uploads the logs to HDFS (upload.sh is the collection script from section II)
# upload.job
type=command
command=bash upload.sh
2. clean.job runs the MR jar to clean the data
# clean.job
type=command
dependencies=upload
command=bash clean.sh
clean.sh
#!/bin/bash
#set java env
export JAVA_HOME=/root/apps/jdk1.8.0_131/
export JRE_HOME=${JAVA_HOME}/jre
export CLASSPATH=.:${JAVA_HOME}/lib:${JRE_HOME}/lib
export PATH=${JAVA_HOME}/bin:$PATH
#set hadoop env
export HADOOP_HOME=/root/apps/hadoop-2.7.2/
export PATH=${HADOOP_HOME}/bin:${HADOOP_HOME}/sbin:$PATH
day_01=`date -d'-1 day' +%Y-%m-%d`
syear=`date --date=$day_01 +%Y`
smonth=`date --date=$day_01 +%m`
sday=`date --date=$day_01 +%d`
#echo $day_01
#echo $syear
#echo $smonth
#echo $sday
log_hdfs_dir=/data/clickLog/$syear/$smonth/$sday
#echo $log_hdfs_dir
click_log_clean=com.xin.AccessLogDriver
clean_dir=/data/cleaup/$syear/$smonth/$sday
echo "hadoop jar /home/centos/hivedemo/hiveaad.jar $click_log_clean $log_hdfs_dir $clean_dir"
hadoop fs -rm -r -f $clean_dir
hadoop jar /root/data/logs/cliclog-1.0-SNAPSHOT.jar $click_log_clean $log_hdfs_dir $clean_dir
3. hivesql.job loads the cleaned data into the Hive warehouse
# hivesql.job
type=command
dependencies=clean
command=bash hivesql.sh
hivesql.sh
#!/bin/bash
export JAVA_HOME=/root/apps/jdk1.8.0_131/
export JRE_HOME=${JAVA_HOME}/jre
export CLASSPATH=.:${JAVA_HOME}/lib:${JRE_HOME}/lib
export PATH=${JAVA_HOME}/bin:$PATH
#set hadoop env
export HADOOP_HOME=/root/apps/hadoop-2.7.2/
export PATH=${HADOOP_HOME}/bin:${HADOOP_HOME}/sbin:$PATH
export HIVE_HOME=/root/apps/hive-2.1.1
export PATH=${HIVE_HOME}/bin:$PATH
day_01=`date -d'-1 day' +%Y-%m-%d`
syear=`date --date=$day_01 +%Y`
smonth=`date --date=$day_01 +%m`
sday=`date --date=$day_01 +%d`
#echo $day_01
#echo $syear
#echo $smonth
#echo $sday
clean_dir=/data/cleaup/$syear/$smonth/$sday
HQL_origin="load data inpath '$clean_dir' into table mydb.access"
#HQL_origin="create external table db2.access(ip string,day string,url string,upflow string) row format delimited fields terminated by ',' location '$clean_dir'"
#echo $HQL_origin
hive -e "$HQL_origin"
4. ip.job runs the Hive query that builds the new upflow table
# ip.job
type=command
dependencies=hivesql
command=bash ip.sh
ip.sh
#!/bin/bash
export JAVA_HOME=/root/apps/jdk1.8.0_131/
export JRE_HOME=${JAVA_HOME}/jre
export CLASSPATH=.:${JAVA_HOME}/lib:${JRE_HOME}/lib
export PATH=${JAVA_HOME}/bin:$PATH
#set hadoop env
export HADOOP_HOME=/root/apps/hadoop-2.7.2/
export PATH=${HADOOP_HOME}/bin:${HADOOP_HOME}/sbin:$PATH
export HIVE_HOME=/root/apps/hive-2.1.1
export PATH=${HIVE_HOME}/bin:$PATH
HQL_origin="insert into mydb.upflow select ip,sum(upflow) as sum from mydb.access group by ip order by sum desc "
#echo $HQL_origin
hive -e "$HQL_origin"
5. mysql.job exports the Hive data to MySQL with Sqoop
# mysql.job
type=command
dependencies=ip
command=bash mysql.sh
mysql.sh
#!/bin/bash
export JAVA_HOME=/root/apps/jdk1.8.0_131/
export JRE_HOME=${JAVA_HOME}/jre
export CLASSPATH=.:${JAVA_HOME}/lib:${JRE_HOME}/lib
export PATH=${JAVA_HOME}/bin:$PATH
#set hadoop env
export HADOOP_HOME=/root/apps/hadoop-2.7.2/
export PATH=${HADOOP_HOME}/bin:${HADOOP_HOME}/sbin:$PATH
export HIVE_HOME=/root/apps/hive-2.1.1
export PATH=${HIVE_HOME}/bin:$PATH
export SQOOP_HOME=/root/apps/sqoop-1.4.7
export PATH=${SQOOP_HOME}/bin:$PATH
sqoop export --connect \
jdbc:mysql://localhost:3306/sqoop \
--username root --password 123456 --table upflow --export-dir \
/user/hive/warehouse/mydb.db/upflow --input-fields-terminated-by ',' \
--m 2
6. Package everything into a single job.zip and submit it to Azkaban (see the command sketch below)
Note: if the jobs fail with azkaban.jobExecutor.utils.process.ProcessFailureException, convert the script files to Unix line endings.
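Assuming all the .job and .sh files sit in one directory, a minimal packaging sketch (dos2unix addresses the line-ending issue above):
dos2unix *.sh
zip job.zip *.job *.sh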
7. The PHP page then displays the ECharts chart.
VII. Migrating smoothly from Hadoop and Hive to Spark
Spark is memory-based and fast, and it removes the extra Hive processing and data-loading steps. The project can therefore be simplified to: a script (or Flume) pushes the data to HDFS, and a Spark jar processes the HDFS data and writes the results directly to MySQL.
1. Start HDFS, Spark and ZooKeeper (Spark is started with spark/sbin/start-all.sh); a start-up sketch follows.
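A sketch of the start-up commands (the Hadoop path matches the scripts above; the ZooKeeper and Spark install paths are assumptions, adjust them to your layout):
/root/apps/hadoop-2.7.2/sbin/start-dfs.sh
/root/apps/zookeeper/bin/zkServer.sh start
/root/apps/spark/sbin/start-all.sh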
2. Write the Spark processing program in the IDE
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>2.3.1</version>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.33</version>
</dependency>
</dependencies>
<build>
<pluginManagement>
<plugins>
<!-- plugin for compiling Scala -->
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>3.2.2</version>
</plugin>
<!-- plugin for compiling Java -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.5.1</version>
</plugin>
</plugins>
</pluginManagement>
<plugins>
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<executions>
<execution>
<id>scala-compile-first</id>
<phase>process-resources</phase>
<goals>
<goal>add-source</goal>
<goal>compile</goal>
</goals>
</execution>
<execution>
<id>scala-test-compile</id>
<phase>process-test-resources</phase>
<goals>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<executions>
<execution>
<phase>compile</phase>
<goals>
<goal>compile</goal>
</goals>
</execution>
</executions>
</plugin>
<!-- plugin for building the shaded jar -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>2.4.2</version>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
package com.xin.mysql
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
/**
* Created by xinBa.
* User: 辛聪明
* Date: 2020/3/29
*/
object ClickLog {
def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf().setAppName("sparkJob2")
val sc = new SparkContext(conf)
// val text: RDD[String] = sc.textFile("hdfs:hdp-1:9000/data/clickLog/2020/03/27xxxxx_click_log_access.log.12020_03_28_21_58_44")
val text: RDD[String] = sc.textFile(args(0))
val ipRdd: RDD[(String, Float)] = text.map(x => {
val strings: Array[String] = x.split(" ")
val ip = strings(0);
// val date = AnalysisNginxTool.nginxDateStmpToDate(strings(3));
// val url = strings(6);
val upFlow = strings(9).toFloat;
(ip, upFlow)
}).reduceByKey((_ + _))
ipRdd.foreachPartition(insertData)
sc.stop()
}
def insertData(iterator: Iterator[(String, Float)]): Unit = {
// one JDBC connection and one reusable prepared statement per partition
Class.forName("com.mysql.jdbc.Driver").newInstance()
val conn = java.sql.DriverManager.getConnection("jdbc:mysql://hdp-1:3306/sqoop", "root", "123456")
val ps = conn.prepareStatement("insert into upflow(ip,sum) values (?,?)")
iterator.foreach(data => {
ps.setString(1, data._1)
ps.setFloat(2, data._2)
ps.executeUpdate()
})
ps.close()
conn.close()
}
}
Alternatively, the same job can be implemented with Spark SQL:
package com.xin.spark
import java.util.Properties
import com.xin.mrlog.AnalysisNginxTool
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.{SparkConf, SparkContext}
//case class describing one cleaned log record
case class clickLog2(ip: String,date:String,url: String,upflow:Int)
object clickLogJobToJDBC {
def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf()
// .setMaster("local[*]")
.setAppName("clickLogJobToJDBC")
val sc = new SparkContext(conf)
val spark: SparkSession = SparkSession.builder().getOrCreate()
// val lines: RDD[String] = sc.textFile(AnalysisNginxTool.yesterday())
val lines: RDD[String] = sc.textFile("hdfs://hdp-1/data/clickLog/2020/03/27")
//data cleaning
val clean: RDD[clickLog2] = lines.map(_.split(" "))
.map(x => clickLog2(
x(0).toString,
AnalysisNginxTool.nginxDateStmpToDate(x(3)),
x(6).toString,
x(9).toInt
))
//import implicits so the RDD can be converted to a DataFrame
import spark.implicits._
//register a temp view and run the SQL aggregation
val df: DataFrame = clean.toDF()
df.createTempView("access")
val resDf: DataFrame = spark.sql("select ip,sum(upflow) as sum from access group by ip order by sum")
//write the result to the database
val prop = new Properties()
prop.put("user","root")
prop.put("password","123456")
prop.put("driver","com.mysql.jdbc.Driver")
resDf.write
.mode("overwrite")//覆盖
.jdbc("jdbc:mysql://hdp-1:3306/sqoop?useUnicode=true&characterEncoding=utf-8&useJDBCCompliantTimezoneShift=true&useLegacyDatetimeCode=false&serverTimezone=UTC","upflow",prop)
// resDf.show()
println("运行成功")
sc.stop()
spark.stop()
}
}
Run it on the cluster by submitting the jar with spark-submit (for the Spark SQL variant, change --class to com.xin.spark.clickLogJobToJDBC):
spark-submit --master spark://hdp-1:7077 --class com.xin.mysql.ClickLog /root/spark/sparkdemo-1.0-SNAPSHOT-shaded.jar hdfs://hdp-1:9000/data/clickLog/2020/03/27
For Azkaban scheduling, upload.job and upload.sh stay the same; spark.job is shown below, and spark.sh wraps the spark-submit command above (a sketch of spark.sh follows the job file).
#spark.job
type=command
dependencies=upload
command=bash spark.sh
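A minimal sketch of spark.sh, wrapping the spark-submit command above and deriving yesterday's date the same way clean.sh does (the Spark install path is an assumption):
#!/bin/bash
#derive yesterday's date and the matching HDFS input path
day_01=`date -d'-1 day' +%Y-%m-%d`
syear=`date --date=$day_01 +%Y`
smonth=`date --date=$day_01 +%m`
sday=`date --date=$day_01 +%d`
log_hdfs_dir=hdfs://hdp-1:9000/data/clickLog/$syear/$smonth/$sday
#submit the Spark job against yesterday's logs
/root/apps/spark/bin/spark-submit --master spark://hdp-1:7077 \
  --class com.xin.mysql.ClickLog \
  /root/spark/sparkdemo-1.0-SNAPSHOT-shaded.jar $log_hdfs_dir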