Spark:使用 Scala 将 CSV 文件转换为 Parquet
csv文件转换Parquet
pom依赖
<mirror>
<id>nexus-aliyun</id>
<mirrorOf>central</mirrorOf>
<name>Nexus aliyun</name>
<url>https://maven.aliyun.com/nexus/content/groups/public</url>
</mirror>
pom文件:
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.harvey</groupId>
<artifactId>scala_parquet</artifactId>
<version>1.0-SNAPSHOT</version>
<dependencies>
<!-- Spark core engine; _2.11 suffix means the project must be built with Scala 2.11.x. -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>2.3.0</version>
</dependency>
<!-- NOTE(review): spark-csv is the pre-2.0 external CSV reader. Spark 2.x has CSV
     support built into spark-sql, and the code below uses spark.read.csv(...),
     so this dependency is likely unnecessary — confirm and consider removing. -->
<dependency>
<groupId>com.databricks</groupId>
<artifactId>spark-csv_2.11</artifactId>
<version>1.5.0</version>
</dependency>
<!-- Provides SparkSession, DataFrame, and the built-in CSV/Parquet data sources. -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>2.3.0</version>
</dependency>
<!-- NOTE(review): univocity-parsers is already a transitive dependency of spark-sql;
     pinning a different version here risks a conflict — verify it is really needed. -->
<dependency>
<groupId>com.univocity</groupId>
<artifactId>univocity-parsers</artifactId>
<version>2.5.9</version>
</dependency>
</dependencies>
</project>
scala文件
object TestDemo {

  /**
   * Reads a CSV file with an explicit schema and rewrites it as Parquet.
   *
   * Fixes over the original:
   *  - Dropped `.format("com.databricks.spark.csv")`: the `.csv(path)` shortcut
   *    always uses Spark's built-in CSV source, so that call was silently ignored.
   *  - The SparkSession is now stopped in a `finally` block instead of leaking.
   */
  def main(args: Array[String]): Unit = {
    import org.apache.spark.sql.SparkSession
    import org.apache.spark.sql.types._

    // Explicit schema avoids a schema-inference pass over the file and
    // pins the column types (contact_num parsed as Long, not String).
    val schema = StructType(
      Array(
        StructField("primary_key", StringType, true),
        StructField("user_name", StringType, true),
        StructField("contact_num", LongType, true)
      )
    )

    val spark = SparkSession.builder()
      .appName("ParquetDemo")
      .master("local")
      .getOrCreate()

    try {
      val df = spark.read
        .schema(schema)
        .option("header", true)      // first line holds column names, not data
        .option("delimiter", ",")    // comma is the default; kept explicit for clarity
        .csv("E:/test/old/data1.csv")

      //df.show()
      df.write.parquet("E:/test/new/parquet")
    } finally {
      spark.stop() // release the local Spark context and its resources
    }
  }
}