Importing CSV Files into ClickHouse with Spark
Since data-import requirements come up regularly in our work, I developed a small import tool with Spark SQL.
Environment
Local development environment: Windows 10, IDEA 2019.3, Scala 2.11.12, Spark 2.4.0
POM file
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <repositories>
        <!-- Aliyun mirror -->
        <repository>
            <id>maven-ali</id>
            <url>http://maven.aliyun.com/nexus/content/groups/public/</url>
            <releases>
                <enabled>true</enabled>
            </releases>
            <snapshots>
                <enabled>true</enabled>
                <updatePolicy>always</updatePolicy>
                <checksumPolicy>fail</checksumPolicy>
            </snapshots>
        </repository>
        <repository>
            <id>cloudera</id>
            <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
        </repository>
        <repository>
            <id>cloudera.public.repo</id>
            <url>https://repository.cloudera.com/artifactory/public</url>
        </repository>
    </repositories>

    <groupId>org.example</groupId>
    <artifactId>UploadFileScala</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <log4j.version>1.2.17</log4j.version>
        <slf4j.version>1.7.22</slf4j.version>
        <casbah.version>3.1.1</casbah.version>
        <redis.version>2.9.0</redis.version>
        <spark.version>2.4.0</spark.version>
        <jblas.version>1.2.1</jblas.version>
        <pg.version>42.2.5</pg.version>
        <scala.version>2.11.12</scala.version>
    </properties>

    <dependencies>
        <!-- ClickHouse JDBC driver; exclude its Jackson artifacts to avoid
             conflicts with the Jackson version shipped with Spark -->
        <dependency>
            <groupId>ru.yandex.clickhouse</groupId>
            <artifactId>clickhouse-jdbc</artifactId>
            <version>0.2</version>
            <exclusions>
                <exclusion>
                    <groupId>com.fasterxml.jackson.core</groupId>
                    <artifactId>*</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <!-- Unified logging -->
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>jcl-over-slf4j</artifactId>
            <version>${slf4j.version}</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
            <version>${slf4j.version}</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>${slf4j.version}</version>
        </dependency>
        <dependency>
            <groupId>log4j</groupId>
            <artifactId>log4j</artifactId>
            <version>${log4j.version}</version>
        </dependency>
        <!-- Spark -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-mllib_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}</version>
        </dependency>
        <dependency>
            <groupId>org.postgresql</groupId>
            <artifactId>postgresql</artifactId>
            <version>${pg.version}</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/com.databricks/spark-csv
             (the artifact suffix must match the project's Scala version, hence _2.11) -->
        <dependency>
            <groupId>com.databricks</groupId>
            <artifactId>spark-csv_2.11</artifactId>
            <version>1.5.0</version>
        </dependency>
        <!-- Pin lz4 to version 1.3.0 -->
        <dependency>
            <groupId>net.jpountz.lz4</groupId>
            <artifactId>lz4</artifactId>
            <version>1.3.0</version>
        </dependency>
    </dependencies>

    <!-- Maven build and packaging -->
    <build>
        <resources>
            <resource>
                <directory>src/main/scala</directory>
                <includes>
                    <include>**/*.properties</include>
                    <include>**/*.xml</include>
                </includes>
                <filtering>true</filtering>
            </resource>
            <resource>
                <directory>src/main/resources</directory>
            </resource>
            <resource>
                <directory>src/main/resources/lib</directory>
                <targetPath>BOOT-INF/lib/</targetPath>
                <includes>
                    <include>**/*.jar</include>
                </includes>
            </resource>
        </resources>
        <!-- Scala sources to compile -->
        <sourceDirectory>src/main/scala</sourceDirectory>
        <plugins>
            <!-- Scala compiler plugin -->
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <version>3.2.2</version>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
                <configuration>
                    <scalaVersion>${scala.version}</scalaVersion>
                    <args>
                        <arg>-target:jvm-1.8</arg>
                    </args>
                    <jvmArgs>
                        <!--<jvmArg>-Xms256m</jvmArg>
                        <jvmArg>-Xmx1024m</jvmArg>-->
                        <jvmArg>-Xss4096k</jvmArg>
                    </jvmArgs>
                </configuration>
            </plugin>
            <!-- Build a fat jar with all dependencies -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>3.0.0</version>
                <configuration>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
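For orientation, given the resource settings above and the --class com.leyou.bi.uploadFile.localhostToCsv used in the submit command below, the project layout looks roughly like this (file names are illustrative):

UploadFileScala/
├── pom.xml
└── src/main/
    ├── scala/com/leyou/bi/uploadFile/
    │   ├── DBUtils.scala          (DBUtils and DBConfig)
    │   ├── UserDetail.scala
    │   └── localhostToCsv.scala
    └── resources/
        ├── dev-db.properties
        └── prod-db.properties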
Creating a temporary table in ClickHouse
The uploaded CSV holds temporary machine-learning data, so it goes into a temporary table:
CREATE DATABASE temp;
CREATE TABLE temp.user_Detail_20200416 (
     user_id Int32
    ,sale_amount Float32
    ,trans_count Int32
    ,offline_count Int32
    ,online_count Int32
    ,shopping_count Int32
    ,tuihuo_count Int32
    ,tuihuo_lv Float32
    ,apru Float32
    ,create_day Int32
    ,is_gravida Int32
    ,is_dobule_source Int32
    ,baby_day Int32
    ,active_code Int32
) ENGINE = MergeTree() ORDER BY (user_id) SETTINGS index_granularity = 8192;
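Before loading anything, it is worth double-checking in clickhouse-client that the Int32/Float32 columns line up with the field types of the Scala case class used later:

DESCRIBE TABLE temp.user_Detail_20200416;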
Database connection utility class (Scala)
DBUtils is the main utility object:
import java.util.Properties

object DBUtils {
  val DEV = "dev"
  val PROD = "prod"

  private def getDBProperties(mode: String): Properties = {
    if (mode == null) {
      throw new IllegalArgumentException("A valid 'mode' argument is required")
    }
    val resource = mode match {
      case DEV  => "dev-db.properties"
      case PROD => "prod-db.properties"
      case _    => throw new IllegalArgumentException(s"Unknown mode: $mode")
    }
    // Load the environment-specific config file from the classpath
    val dbProperties = new Properties()
    dbProperties.load(getClass.getClassLoader.getResourceAsStream(resource))
    dbProperties
  }

  def getDBConfig(mode: String): DBConfig = {
    val properties = getDBProperties(mode)
    val url = properties.getProperty("bi.bigdata.spark.ml.gp.url")
    val driver = properties.getProperty("bi.bigdata.spark.ml.gp.driver")
    val username = properties.getProperty("bi.bigdata.spark.ml.gp.username")
    val password = properties.getProperty("bi.bigdata.spark.ml.gp.password")
    DBConfig(url, driver, username, password)
  }

  // Builds a Properties object that can be passed directly to Spark's JDBC reader/writer
  def sparkDBProp(mode: String): Properties = {
    val config = getDBConfig(mode)
    val properties = new Properties()
    properties.setProperty("user", config.username)
    properties.setProperty("password", config.password)
    properties.setProperty("url", config.url)
    properties.setProperty("driver", config.driver)
    properties
  }
}
DBConfig is the configuration case class for the database connection utility:
case class DBConfig(url: String, driver: String, username: String, password: String)
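For reference, a minimal dev-db.properties on the classpath might look like the sketch below. The key names are fixed by DBUtils; the values shown are placeholders, not real connection details:

# hypothetical dev-db.properties -- all values are placeholders
bi.bigdata.spark.ml.gp.url=jdbc:clickhouse://cdh2:8123/temp
bi.bigdata.spark.ml.gp.driver=ru.yandex.clickhouse.ClickHouseDriver
bi.bigdata.spark.ml.gp.username=default
bi.bigdata.spark.ml.gp.password=

A caller then obtains ready-to-use JDBC properties with DBUtils.sparkDBProp(DBUtils.DEV).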
Reading the CSV and importing it into ClickHouse (the key part)
localhostToCsv is the main entry point:
import java.util.Properties

import org.apache.spark.SparkConf
import org.apache.spark.sql.catalyst.ScalaReflection
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{SaveMode, SparkSession}

object localhostToCsv {
  def main(args: Array[String]): Unit = {
    // Spark configuration; SparkSession is initialized from a SparkConf object
    val sparkConf = new SparkConf()
    sparkConf.set("spark.testing.memory", "2147480000")
    // Build the SparkSession; master("local[*]") is for local testing and should be
    // removed (or made configurable) when submitting with --master yarn
    val sess = SparkSession.builder()
      .appName("SQLTest")
      .master("local[*]")
      .config(sparkConf)
      .getOrCreate()
    // HDFS path of the CSV file
    val csvPath = "/leyou/temp/result1.csv"
    val csvDf = sess.read
      .option("delimiter", ",")      // field delimiter, comma by default
      .option("header", "true")      // treat the first line as the header, not data
      .option("quote", "'")          // quote character, double quote by default
      .option("nullValue", "\\N")    // string that represents a null value
      .option("inferSchema", "true") // infer column types (overridden by the explicit schema below)
      .schema(ScalaReflection.schemaFor[UserDetail].dataType.asInstanceOf[StructType]) // column types from the case class
      .csv(csvPath)
    //csvDf.show()        // print the data, first 20 rows only
    //csvDf.printSchema() // print column names and types
    // Register a Spark temporary view over the CSV data
    csvDf.createTempView("csvView")
    // Query the temporary view with Spark SQL
    val frame = sess.sql("select user_id,sale_amount,trans_count,offline_count,online_count,shopping_count,tuihuo_count,tuihuo_lv,apru,create_day,is_gravida,is_dobule_source,baby_day,active_code from csvView")
    // JDBC connection properties for ClickHouse
    val connProperties = new Properties
    connProperties.setProperty("driver", "ru.yandex.clickhouse.ClickHouseDriver")
    connProperties.setProperty("user", "default")
    frame.write.mode(SaveMode.Append).option("batchsize", "100000")
      .jdbc("jdbc:clickhouse://cdh2:8123/", "temp.user_Detail_20200416", connProperties)
    // Stop the SparkSession
    sess.stop()
  }
}
UserDetail maps the CSV columns and fixes their field types:
case class UserDetail(
  user_id: Int,
  sale_amount: Double,
  trans_count: Int,
  offline_count: Int,
  online_count: Int,
  shopping_count: Int,
  tuihuo_count: Int,
  tuihuo_lv: Double,
  apru: Double,
  create_day: Int,
  is_gravida: Int,
  is_dobule_source: Int,
  baby_day: Int,
  active_code: Int
)
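To check what schema the case class actually produces, here is a small sketch using the same ScalaReflection call as in localhostToCsv:

import org.apache.spark.sql.catalyst.ScalaReflection
import org.apache.spark.sql.types.StructType

// Int fields map to IntegerType and Double fields to DoubleType
val schema = ScalaReflection.schemaFor[UserDetail].dataType.asInstanceOf[StructType]
schema.printTreeString()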
Since this job runs on YARN, the CSV file must first be uploaded to HDFS.
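For example, with the HDFS path used in the code above (assuming result1.csv sits in the current local directory):

hdfs dfs -mkdir -p /leyou/temp
hdfs dfs -put result1.csv /leyou/temp/result1.csv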
Submit command for the cluster
spark-submit \
  --class com.leyou.bi.uploadFile.localhostToCsv \
  --master yarn \
  --deploy-mode cluster \
  --executor-memory 4G \
  --num-executors 4 \
  --driver-memory 4G \
  --conf spark.default.parallelism=1000 \
  --conf spark.memory.fraction=0.75 \
  --conf spark.memory.storageFraction=0.5 \
  --conf spark.network.timeout=10000000 \
  /opt/apps/spark/jars/UploadFileCsvScala20200416-0.0.1.jar
Once the job is submitted, open the YARN web UI, find the application, and check that it completed successfully.
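If the application failed, its aggregated logs can be pulled by application ID (the ID below is a placeholder taken from the YARN UI):

yarn logs -applicationId application_1587000000000_0001

And once it succeeds, a quick row count in clickhouse-client confirms the data actually arrived:

SELECT count() FROM temp.user_Detail_20200416;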