本文主要介绍如何使用maven构建spark应用,同样可以用于其他cmdline的java应用。
项目结构
/
/conf/ 配置文件
/libs/ 依赖包
/bin/ 启动命令脚本
/logs/ log
/data/ 数据
/src/ 源代码
/pom.xml
/assembly.xml
项目pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven POM for packaging a Spark application as a zip of thin jars
     (entry jar + dependency jars side by side in libs/), launched via spark-submit. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>x.x.x</groupId>
  <artifactId>x</artifactId>
  <version>1.0.0</version>

  <properties>
    <maven.test.skip>false</maven.test.skip>
    <scala.version>2.11.7</scala.version>
    <scala.prefix>2.11</scala.prefix>
    <spark.version>2.0.2</spark.version>
  </properties>

  <dependencies>
    <!-- NOTE(review): spark-core is compile scope while spark-sql/graphx/streaming are
         provided. With spark-submit the cluster normally provides all Spark modules;
         compile scope here means spark-core and its transitive deps get packaged into
         libs/ by the assembly. Confirm this asymmetry is intentional. -->
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-core_${scala.prefix}</artifactId>
      <version>${spark.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-sql_${scala.prefix}</artifactId>
      <version>${spark.version}</version>
      <scope>provided</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-graphx_${scala.prefix}</artifactId>
      <version>${spark.version}</version>
      <scope>provided</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-streaming_${scala.prefix}</artifactId>
      <version>${spark.version}</version>
      <scope>provided</scope>
    </dependency>
  </dependencies>

  <build>
    <sourceDirectory>src/main/java</sourceDirectory>
    <plugins>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <!-- Pin the plugin version for reproducible builds (Maven warns otherwise). -->
        <version>3.1</version>
        <configuration>
          <source>1.8</source>
          <target>1.8</target>
          <encoding>UTF-8</encoding>
          <compilerArguments>
            <extdirs>${project.basedir}/lib</extdirs>
          </compilerArguments>
        </configuration>
      </plugin>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-jar-plugin</artifactId>
        <version>2.4</version>
        <configuration>
          <archive>
            <addMavenDescriptor>false</addMavenDescriptor>
            <manifest>
              <!-- addClasspath=true makes MANIFEST.MF reference the dependency jars.
                   The entry jar and its dependencies live in the same directory, so
                   classpathPrefix must stay empty; setting it (e.g. libs/) breaks
                   startup with ClassNotFoundException. -->
              <addClasspath>true</addClasspath>
              <!-- classpathPrefix controls the relative path used between the entry
                   jar and the jars it references; keep it empty on purpose. -->
              <!--<classpathPrefix>libs/</classpathPrefix>-->
              <classpathPrefix></classpathPrefix>
            </manifest>
          </archive>
          <excludes>
            <exclude>${project.basedir}/xml/*</exclude>
          </excludes>
        </configuration>
      </plugin>
      <!-- Required for compiling Scala sources and mixed Java/Scala projects. -->
      <plugin>
        <groupId>org.scala-tools</groupId>
        <artifactId>maven-scala-plugin</artifactId>
        <version>2.15.1</version>
        <executions>
          <execution>
            <id>scala-compile-first</id>
            <phase>process-resources</phase>
            <goals>
              <!--<goal>add-source</goal>-->
              <goal>compile</goal>
            </goals>
          </execution>
          <execution>
            <goals>
              <goal>compile</goal>
            </goals>
          </execution>
        </executions>
        <configuration>
          <scalaVersion>${scala.version}</scalaVersion>
        </configuration>
      </plugin>
      <plugin>
        <groupId>org.codehaus.mojo</groupId>
        <artifactId>build-helper-maven-plugin</artifactId>
        <version>1.8</version>
        <executions>
          <execution>
            <id>add-source</id>
            <phase>generate-sources</phase>
            <goals>
              <goal>add-source</goal>
            </goals>
            <configuration>
              <sources>
                <source>src/main/java</source>
                <source>src/main/scala</source>
              </sources>
            </configuration>
          </execution>
        </executions>
      </plugin>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-resources-plugin</artifactId>
        <version>2.2</version>
        <configuration>
          <encoding>UTF-8</encoding>
        </configuration>
      </plugin>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-assembly-plugin</artifactId>
        <version>2.4</version>
        <configuration>
          <appendAssemblyId>false</appendAssemblyId>
          <descriptors>
            <descriptor>${project.basedir}/assembly.xml</descriptor>
          </descriptors>
          <archive>
            <addMavenDescriptor>false</addMavenDescriptor>
            <manifest>
              <!-- Same rule as maven-jar-plugin above: addClasspath=true, and
                   classpathPrefix must stay empty or startup fails to find classes. -->
              <addClasspath>true</addClasspath>
              <classpathPrefix></classpathPrefix>
            </manifest>
          </archive>
        </configuration>
        <executions>
          <execution>
            <id>make-assembly</id>
            <phase>package</phase>
            <goals>
              <goal>single</goal>
              <!--<goal>assembly</goal>-->
            </goals>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>

  <!-- Aliyun mirror: the old http://maven.aliyun.com/nexus/... endpoint is deprecated;
       use the HTTPS repository URL so artifact downloads are not served over plain HTTP. -->
  <repositories>
    <repository>
      <id>aliyun</id>
      <url>https://maven.aliyun.com/repository/public</url>
      <snapshots>
        <enabled>true</enabled>
      </snapshots>
    </repository>
  </repositories>
  <pluginRepositories>
    <pluginRepository>
      <id>aliyun</id>
      <url>https://maven.aliyun.com/repository/public</url>
      <snapshots>
        <enabled>false</enabled>
      </snapshots>
    </pluginRepository>
  </pluginRepositories>
</project>
打包assembly.xml
<!-- Assembly descriptor: builds a zip laid out as
     <artifactId>/{bin,conf,data,libs,logs} with all jars (project + dependencies)
     side by side under libs/, matching the empty classpathPrefix in the POM. -->
<assembly>
  <id>bin</id>
  <formats>
    <format>zip</format>
  </formats>
  <includeBaseDirectory>false</includeBaseDirectory>
  <dependencySets>
    <dependencySet>
      <!-- Copy runtime dependency jars into <artifactId>/libs. Third-party jars not in
           a remote repository should be installed into the local Maven repo first so
           they are picked up here. -->
      <outputDirectory>/${project.artifactId}/libs</outputDirectory>
      <useProjectArtifact>false</useProjectArtifact>
      <!-- Keep each dependency as its own jar; do not build a fat jar. -->
      <unpack>false</unpack>
    </dependencySet>
  </dependencySets>
  <fileSets>
    <!-- The jar built by this project also goes into <artifactId>/libs. -->
    <fileSet>
      <directory>${project.build.directory}</directory>
      <outputDirectory>/${project.artifactId}/libs</outputDirectory>
      <includes>
        <!--<include>${project.artifactId}</include>-->
        <include>*.jar</include>
      </includes>
    </fileSet>
    <!-- Launch scripts from the project root go into <artifactId>/bin.
         (*.sh already matches start.sh, so no separate include is needed.) -->
    <fileSet>
      <directory>${project.basedir}</directory>
      <outputDirectory>/${project.artifactId}/bin</outputDirectory>
      <includes>
        <include>*.sh</include>
      </includes>
    </fileSet>
    <!-- Ship the configuration, bundled libs, logs, data, and bin directories
         under the corresponding <artifactId>/ subdirectories. -->
    <fileSet>
      <directory>${project.basedir}/conf</directory>
      <outputDirectory>/${project.artifactId}/conf</outputDirectory>
    </fileSet>
    <fileSet>
      <directory>${project.basedir}/libs</directory>
      <outputDirectory>/${project.artifactId}/libs</outputDirectory>
    </fileSet>
    <fileSet>
      <directory>${project.basedir}/logs</directory>
      <outputDirectory>/${project.artifactId}/logs</outputDirectory>
    </fileSet>
    <fileSet>
      <directory>${project.basedir}/data</directory>
      <outputDirectory>/${project.artifactId}/data</outputDirectory>
    </fileSet>
    <fileSet>
      <directory>${project.basedir}/bin</directory>
      <outputDirectory>/${project.artifactId}/bin</outputDirectory>
    </fileSet>
  </fileSets>
</assembly>
启动脚本
#!/usr/bin/env bash
# Launch the Spark job from the unpacked distribution directory.
# Resolve the project root (the parent of this script's bin/ directory) and cd there
# so the relative libs/ paths below resolve.
DIR=$(cd "$(dirname "$0")"; cd ..; pwd)
cd "$DIR" || exit 1

# This layout (dependency jars shipped under libs/) only works with client deploy mode:
# the driver runs locally where libs/ exists.
#
# NOTE(review): --total-executor-cores only applies to standalone/Mesos masters and is
# ignored on YARN; on YARN use --num-executors / --executor-cores instead — confirm
# the target cluster before changing it.
#
# Fixes vs the original:
#   * every continuation line now has a space before the trailing backslash
#     (previously "yarn\" + newline glued into "yarn--deploy-mode", and the inline
#     comment after "client\" became part of the argument);
#   * ls output is newline-separated, so the jar list is joined with paste -sd,
#     (tr ' ' ',' never matched anything).
spark-submit \
  --master yarn \
  --deploy-mode client \
  --name IncreaseDataBuild \
  --executor-memory 16G \
  --total-executor-cores 120 \
  --class xxx.xx.xx.XX \
  --conf "spark.hadoop.mapreduce.input.fileinputformat.split.minsize=107374182" \
  --jars "$(ls libs/*.jar | grep -v "xxx-xx" | paste -sd, -)" \
  ./libs/xxx-xx-1.0.0.jar