POM
<properties>
    <scala.version>2.11</scala.version>
    <spark.version>2.2.0</spark.version>
    <!-- compile for local runs, provided for cluster builds (see Notes) -->
    <scope>compile</scope>
</properties>
<dependencies>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-sql_${scala.version}</artifactId>
        <version>${spark.version}</version>
        <exclusions>
            <exclusion>
                <artifactId>janino</artifactId>
                <groupId>org.codehaus.janino</groupId>
            </exclusion>
        </exclusions>
        <scope>${scope}</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-core_${scala.version}</artifactId>
        <version>${spark.version}</version>
        <scope>${scope}</scope>
    </dependency>
    <dependency>
        <groupId>com.alibaba</groupId>
        <artifactId>fastjson</artifactId>
        <version>1.2.60</version>
    </dependency>
    <!-- janino is excluded from spark-sql above and pinned here so that a
         single version ends up on the classpath -->
    <dependency>
        <groupId>org.codehaus.janino</groupId>
        <artifactId>janino</artifactId>
        <version>3.0.8</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hbase</groupId>
        <artifactId>hbase-server</artifactId>
        <version>1.3.1</version>
        <exclusions>
            <exclusion>
                <groupId>org.apache.hadoop</groupId>
                <artifactId>hadoop-client</artifactId>
            </exclusion>
        </exclusions>
    </dependency>
    <dependency>
        <groupId>org.apache.hbase</groupId>
        <artifactId>hbase-client</artifactId>
        <version>1.3.1</version>
    </dependency>
</dependencies>
<build>
    <finalName>SparkPro</finalName>
    <plugins>
        <!-- compiles the Scala sources -->
        <plugin>
            <groupId>net.alchim31.maven</groupId>
            <artifactId>scala-maven-plugin</artifactId>
            <version>3.2.2</version>
            <executions>
                <execution>
                    <goals>
                        <goal>compile</goal>
                        <goal>testCompile</goal>
                    </goals>
                </execution>
            </executions>
        </plugin>
        <!-- builds a single fat jar during the package phase -->
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-assembly-plugin</artifactId>
            <version>3.0.0</version>
            <configuration>
                <archive>
                    <manifest>
                        <mainClass>com.wisers.spark.PubLisErrTask</mainClass>
                    </manifest>
                </archive>
                <descriptorRefs>
                    <descriptorRef>jar-with-dependencies</descriptorRef>
                </descriptorRefs>
            </configuration>
            <executions>
                <execution>
                    <id>make-assembly</id>
                    <phase>package</phase>
                    <goals>
                        <goal>single</goal>
                    </goals>
                </execution>
            </executions>
        </plugin>
    </plugins>
</build>
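After mvn package, the assembly plugin produces a single fat jar, target/SparkPro-jar-with-dependencies.jar. As a rough sketch of how it might be submitted to a cluster (the --class value here points at the HtmlInsert example below rather than the mainClass baked into the manifest; the YARN master and the input path are assumptions):

spark-submit --master yarn --deploy-mode cluster --class com.wisers.spark.hbase.HtmlInsert SparkPro-jar-with-dependencies.jar /path/to/input.data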
Example
package com.wisers.spark.hbase

import com.alibaba.fastjson.JSON
import com.wisers.spark.utils.{HBaseUtil, IsNullUtil}
import org.apache.commons.codec.digest.DigestUtils
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object HtmlInsert {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("HtmlInsert").setMaster("local[2]")
    val context = new SparkContext(conf)
    context.setLogLevel("WARN")

    // Fall back to a local sample file when no input path is given.
    val userDir = System.getProperty("user.dir")
    val inputFile = if (IsNullUtil.isEmpty(args)) {
      userDir + "\\data\\input\\20200712000249list-hbase-little.data"
    } else {
      args(0)
    }

    // One write timestamp shared by every row of this batch.
    val hbaseDts = String.valueOf(System.currentTimeMillis())

    // Parse each JSON line into a Data record.
    val fileRDD = context.textFile(inputFile).map(x => {
      val obj = JSON.parseObject(x)
      val pubcode = obj.getString("pubcode")
      val url = obj.getString("url")
      val listing = obj.getString("listing")
      val content = obj.getString("content")
      val dts = obj.getString("dts")
      Data(pubcode, url, listing, content, dts, hbaseDts)
    })

    // Build one Put per record, keyed by the MD5 of the URL; the optional
    // fields are only written when present.
    val hbaseRDD: RDD[(ImmutableBytesWritable, Put)] = fileRDD.map(data => {
      val put = new Put(Bytes.toBytes(DigestUtils.md5Hex(data.url)))
      put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("pubcode"), Bytes.toBytes(data.pubcode))
      put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("url"), Bytes.toBytes(data.url))
      if (!IsNullUtil.isEmpty(data.listing)) put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("listing"), Bytes.toBytes(data.listing))
      if (!IsNullUtil.isEmpty(data.content)) put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("content"), Bytes.toBytes(data.content))
      put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("dts"), Bytes.toBytes(data.dts))
      put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("hbaseDts"), Bytes.toBytes(data.hbaseDts))
      (new ImmutableBytesWritable, put)
    })

    // Write the whole RDD to HBase through the mapred TableOutputFormat.
    hbaseRDD.saveAsHadoopDataset(HBaseUtil.getJobConf())
    context.stop()
  }

  case class Data(pubcode: String, url: String, listing: String, content: String, dts: String, hbaseDts: String)
}
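IsNullUtil is not included in the original listing. A minimal sketch of what it could look like, inferred from the two call sites above (one on the args array, one on a String field):

package com.wisers.spark.utils

object IsNullUtil {
  // True for a null or blank string.
  def isEmpty(s: String): Boolean = s == null || s.trim.isEmpty

  // True for a null or zero-length array (covers the no-args case in main).
  def isEmpty(arr: Array[String]): Boolean = arr == null || arr.isEmpty
}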
package com.wisers.spark.utils

import java.util.Properties

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapred.TableOutputFormat
import org.apache.hadoop.mapred.JobConf

object HBaseUtil {

  def getJobConf(): JobConf = {
    // Read the ZooKeeper connection settings from param.properties on the classpath.
    val inputStream = HBaseUtil.getClass.getClassLoader.getResourceAsStream("param.properties")
    val props = new Properties()
    props.load(inputStream)
    inputStream.close()

    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.quorum", props.getProperty("hbase.zookeeper.quorum"))
    conf.set("hbase.zookeeper.property.clientPort", props.getProperty("hbase.zookeeper.property.clientPort"))

    // The old mapred TableOutputFormat writes (ImmutableBytesWritable, Put) pairs.
    val jobConf = new JobConf(conf)
    jobConf.setOutputFormat(classOf[TableOutputFormat])
    jobConf.set(TableOutputFormat.OUTPUT_TABLE, "tb_html")
    jobConf.setOutputKeyClass(classOf[ImmutableBytesWritable])
    jobConf.setOutputValueClass(classOf[Put])
    jobConf
  }
}
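getJobConf() expects a param.properties file on the classpath (typically under src/main/resources). Its contents are not shown in the original; with placeholder hosts it would look something like:

hbase.zookeeper.quorum=node1,node2,node3
hbase.zookeeper.property.clientPort=2181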
Notes
- The data I read is JSON that uses zip compression and Base64 encoding, which is why the fastjson dependency is included; if you don't need it, it can be left out.
- The zip-compression and Base64-encoding utility classes are omitted here and can be ignored.
- When running locally, use compile as the scope for the Spark dependencies; when running on a cluster it can be provided, since the cluster already has a Spark environment (see the command sketch below).
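Because the scope is parameterized as the Maven property ${scope}, one way to build for the cluster without editing the POM is to override it on the command line (this relies on standard Maven property interpolation; a build profile would work equally well):

mvn clean package -Dscope=provided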