maven配置
<!-- Project dependencies. Hadoop ecosystem versions should match the version of the CDH cluster. -->
<!-- NOTE(review): hadoop-assemblies (2.5.1) and hive-exec (2.3.2) are not CDH builds, unlike the
     2.5.0-cdh5.2.0 / 0.98.6-cdh5.2.0 artifacts below. Confirm this mismatch is intended. -->
<dependency>
  <groupId>org.apache.hadoop</groupId>
  <artifactId>hadoop-assemblies</artifactId>
  <version>2.5.1</version>
</dependency>

<!-- JUnit is the unit-testing framework for Java; it is only needed on the test classpath. -->
<dependency>
  <groupId>junit</groupId>
  <artifactId>junit</artifactId>
  <version>4.10</version>
  <scope>test</scope>
</dependency>

<dependency>
  <groupId>org.apache.hadoop</groupId>
  <artifactId>hadoop-common</artifactId>
  <version>2.5.0-cdh5.2.0</version>
</dependency>

<dependency>
  <groupId>org.apache.hadoop</groupId>
  <artifactId>hadoop-hdfs</artifactId>
  <version>2.5.0-cdh5.2.0</version>
</dependency>

<!-- NOTE(review): this artifact looks like a Maven plugin rather than a library; verify it is needed as a dependency. -->
<dependency>
  <groupId>org.apache.hadoop</groupId>
  <artifactId>hadoop-maven-plugins</artifactId>
  <version>2.5.0-cdh5.2.0</version>
</dependency>

<dependency>
  <groupId>org.apache.hbase</groupId>
  <artifactId>hbase-client</artifactId>
  <version>0.98.6-cdh5.2.0</version>
</dependency>

<dependency>
  <groupId>org.apache.hive</groupId>
  <artifactId>hive-exec</artifactId>
  <version>2.3.2</version>
</dependency>
代码开发
/**
 * Hive UDF that counts how many times a regular expression matches inside a string.
 *
 * <p>Matches are counted with successive {@link Matcher#find()} calls on the source text.
 * The most recently compiled pattern is cached so that the regex is not recompiled for
 * every row when the same pattern string is passed repeatedly.
 *
 * <p>NOTE(review): the cache is plain mutable instance state; this assumes each UDF
 * instance is invoked from a single thread. Confirm against the execution engine in use.
 */
public class Regexp_Count extends UDF{
    /** Empty Text instance used to detect an empty source string. */
    private final Text srcText = new Text();

    /** Regex string that {@link #lastPattern} was compiled from; null until the first compile. */
    private String lastRegex;

    /** Compiled form of {@link #lastRegex}, reused while the regex string stays the same. */
    private Pattern lastPattern;

    /**
     * Counts the matches of {@code pattern} in {@code source_char}.
     *
     * @param source_char text to search; may be null
     * @param pattern     regular expression in java.util.regex syntax; may be null
     * @return the number of matches found, or 0 when {@code source_char} is null or empty
     *         or {@code pattern} is null
     * @throws java.util.regex.PatternSyntaxException if {@code pattern} is not a valid regex
     */
    public int evaluate(Text source_char,Text pattern ) {
        if (source_char == null || pattern == null || source_char.equals(srcText)) {
            return 0;
        }

        String regex = pattern.toString();
        if (lastPattern == null || !regex.equals(lastRegex)) {
            // Compile first and record the regex afterwards, so a syntax error
            // leaves the previously cached pattern/regex pair consistent.
            lastPattern = Pattern.compile(regex);
            lastRegex = regex;
        }

        Matcher matcher = lastPattern.matcher(source_char.toString());
        int count = 0;
        while (matcher.find()) {
            count++;
        }
        return count;
    }
}
maven打包相关配置
cdh依赖下载
<!-- Maven repository from which the CDH dependencies are downloaded. -->
<repositories>
  <repository>
    <id>cloudera</id>
    <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
    <!-- Release artifacts are resolved from this repository; snapshots are not. -->
    <releases>
      <enabled>true</enabled>
    </releases>
    <snapshots>
      <enabled>false</enabled>
    </snapshots>
  </repository>
</repositories>
maven settings.xml 文件中
<!-- ",!cloudera" must be appended to mirrorOf: it excludes the repository whose id is "cloudera"
     from this mirror, so the CDH dependencies are fetched from the Cloudera repository itself. -->
<!-- The mirror is addressed over HTTPS: artifacts fetched over plain HTTP can be tampered with
     in transit, and Maven 3.8.1+ blocks external HTTP repositories by default. -->
<mirror>
  <id>nexus</id>
  <mirrorOf>*,!cloudera</mirrorOf>
  <name>aliyun MAVEN</name>
  <url>https://maven.aliyun.com/nexus/content/groups/public</url>
</mirror>
maven build只打项目源码和部分依赖
<!-- With this plugin only the project's own sources are packaged, and the json-lib
     dependency is copied into the lib directory under the compiled classes. -->
<!-- NOTE(review): the maven-dependency-plugin-version property must be defined elsewhere in the POM. -->
<plugin>
  <groupId>org.apache.maven.plugins</groupId>
  <artifactId>maven-dependency-plugin</artifactId>
  <version>${maven-dependency-plugin-version}</version>
  <executions>
    <execution>
      <id>copy</id>
      <phase>test</phase>
      <goals>
        <goal>copy</goal>
      </goals>
      <configuration>
        <!-- The single artifact to copy: json-lib 2.4 with the jdk15 classifier. -->
        <artifactItems>
          <artifactItem>
            <groupId>net.sf.json-lib</groupId>
            <artifactId>json-lib</artifactId>
            <version>2.4</version>
            <type>jar</type>
            <classifier>jdk15</classifier>
          </artifactItem>
        </artifactItems>
        <outputDirectory>${project.build.directory}/classes/lib</outputDirectory>
        <excludeTransitive>false</excludeTransitive>
        <stripVersion>true</stripVersion>
      </configuration>
    </execution>
  </executions>
</plugin>
maven build将项目依赖打入项目源码中
<!-- Builds an uber-jar during the package phase by shading the project's dependencies into it. -->
<plugin>
  <groupId>org.apache.maven.plugins</groupId>
  <artifactId>maven-shade-plugin</artifactId>
  <version>3.2.2</version>
  <executions>
    <execution>
      <phase>package</phase>
      <goals>
        <goal>shade</goal>
      </goals>
      <configuration>
        <createDependencyReducedPom>true</createDependencyReducedPom>

        <!-- Automatically exclude all unused classes so that the uber-jar is minimized. -->
        <minimizeJar>true</minimizeJar>

        <!-- Specify the classifier suffix of the shaded jar (currently disabled). -->
        <!--<shadedArtifactAttached>true</shadedArtifactAttached>
        <shadedClassifierName>shade</shadedClassifierName> -->

        <transformers>
          <!-- Sets the main class in the manifest of the shaded jar. -->
          <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
            <mainClass>com.ntep.App</mainClass>
          </transformer>
          <!-- Appending transformers for the two Spring META-INF resources named below. -->
          <transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
            <resource>META-INF/spring.handlers</resource>
          </transformer>
          <transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
            <resource>META-INF/spring.schemas</resource>
          </transformer>
        </transformers>

        <!-- Include/exclude some of the jars this project depends on. -->
        <artifactSet>
          <!--<excludes>
          <exclude>junit:junit</exclude>
          </excludes>-->
        </artifactSet>

        <!-- Include/exclude classes or resources inside a particular dependency jar. -->
        <filters>
          <!-- <filter>
          <artifact>junit:junit</artifact>
          <includes>
          <include>junit/framework/**</include>
          <include>org/junit/**</include>
          </includes>
          <excludes>
          <exclude>org/junit/experimental/**</exclude>
          <exclude>org/junit/runners/**</exclude>
          </excludes>
          </filter>-->
          <filter>
            <artifact>log4j:log4j</artifact>
            <includes>
              <include>**</include>
            </includes>
          </filter>
          <filter>
            <artifact>commons-logging:*</artifact>
            <includes>
              <include>**</include>
            </includes>
          </filter>
          <!-- Exclude the signature files of every dependency from the shaded jar. -->
          <filter>
            <artifact>*:*</artifact>
            <excludes>
              <exclude>META-INF/*.SF</exclude>
              <exclude>META-INF/*.DSA</exclude>
              <exclude>META-INF/*.RSA</exclude>
            </excludes>
          </filter>
        </filters>
      </configuration>
    </execution>
  </executions>
</plugin>
maven build将项目所有依赖单独copy到项目外的lib下
<!-- Copies the project's dependencies into the lib directory under the build directory
     during the prepare-package phase. -->
<plugin>
  <groupId>org.apache.maven.plugins</groupId>
  <artifactId>maven-dependency-plugin</artifactId>
  <executions>
    <execution>
      <id>copy-dependencies</id>
      <phase>prepare-package</phase>
      <goals>
        <goal>copy-dependencies</goal>
      </goals>
      <configuration>
        <outputDirectory>${project.build.directory}/lib</outputDirectory>
        <overWriteReleases>false</overWriteReleases>
        <overWriteSnapshots>false</overWriteSnapshots>
        <overWriteIfNewer>true</overWriteIfNewer>
      </configuration>
    </execution>
  </executions>
</plugin>
<!-- Configures the jar manifest: adds a classpath whose entries are prefixed with lib/
     and declares the main class. -->
<!-- NOTE(review): theMainClass is presumably a placeholder for the fully qualified main class; confirm. -->
<plugin>
  <groupId>org.apache.maven.plugins</groupId>
  <artifactId>maven-jar-plugin</artifactId>
  <configuration>
    <archive>
      <manifest>
        <addClasspath>true</addClasspath>
        <classpathPrefix>lib/</classpathPrefix>
        <mainClass>theMainClass</mainClass>
      </manifest>
    </archive>
  </configuration>
</plugin>
将配置文件打入项目
<!-- Begin: package resource files into the project. Intended for configuration files kept in a
     folder that was created in the project but is not on the build path; folders on the build
     path are already loaded into the jar automatically. -->
<resources>
  <resource>
    <directory>temp</directory>
    <includes>
      <include>**/**</include>
    </includes>
  </resource>
</resources>
hive自定义函数的依赖在集群中没有,加载方式
通过函数在hive中报错,定位到缺失的依赖包
临时函数解决办法
通过add jar 方式将缺失依赖加载进 hive
创建临时函数时指定自己项目中的 UDF 实现类,例如:CREATE TEMPORARY FUNCTION regexp_count AS '<包名>.Regexp_Count';