Official documentation: http://carbondata.apache.org/documentation.html
You can download a jar built for the matching Hadoop and Spark versions from the official site, or compile the version you need yourself.
Versions tested here:
hadoop 2.7.2
spark 2.1.0
1. First, verify that it works in spark-shell
//Start spark-shell
bin/spark-shell --jars /opt/carbon/apache-carbondata-1.5.1-bin-spark2.1.0-hadoop2.7.2.jar
//Test code
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.CarbonSession._
val carbon = SparkSession.builder().config(sc.getConf).getOrCreateCarbonSession("hdfs://sdw-12:9000/carbon/data/store")
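Per the quick-start, getOrCreateCarbonSession should also accept an optional second argument giving the metastore location; a minimal sketch, where the local metastore path below is only a placeholder for illustration:
// Same store path as above; "/opt/carbon/metastore" is an assumed local metastore path, adjust to your environment
val carbonWithMeta = SparkSession.builder()
  .config(sc.getConf)
  .getOrCreateCarbonSession("hdfs://sdw-12:9000/carbon/data/store", "/opt/carbon/metastore")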
//Create a table
carbon.sql(
s"""
| CREATE TABLE IF NOT EXISTS test_table(
| id string,
| name string,
| city string,
| age Int)
| STORED AS carbondata
""".stripMargin)
//Load data
carbon.sql("LOAD DATA INPATH '/Users/zhangwan/Downloads/carbon/sample.csv' INTO TABLE test_table")
//Query the data
carbon.sql("SELECT * FROM test_table").show()
//Group-by query
carbon.sql(
s"""
| SELECT city, avg(age), sum(age)
| FROM test_table
| GROUP BY city
""".stripMargin).show()
2. Spark on YARN
Note: in YARN mode the CSV file must first be uploaded to HDFS, otherwise the load fails with a file-not-found error.
#Start the CarbonThriftServer
#The trailing HDFS path is the CarbonData store location, passed as the first application argument
./spark-submit --master yarn --deploy-mode client \
  --conf spark.sql.hive.thriftServer.singleSession=true \
  --class org.apache.carbondata.spark.thriftserver.CarbonThriftServer \
  /opt/carbon/apache-carbondata-1.5.2-bin-spark2.3.2.3.1.0.0-78-hadoop3.1.1.3.1.0.0-78.jar \
  hdfs://192.168.3.103:8020/carbon.store
#Upload the CSV file to HDFS
hadoop fs -put sample.csv /
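Optionally confirm the upload before loading:
hadoop fs -ls /sample.csv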
#Start beeline
./bin/beeline -u jdbc:hive2://192.168.3.105:10016 -n root
#Create a table
CREATE TABLE IF NOT EXISTS test_table(
id string,
name string,
city string,
age Int)
STORED AS carbondata;
#Load data
LOAD DATA INPATH '/sample.csv' INTO TABLE test_table;
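The same queries as in the spark-shell section can then be run from beeline to verify the load; a quick sketch:
#Query the data
SELECT * FROM test_table;
#Group-by query
SELECT city, avg(age), sum(age) FROM test_table GROUP BY city;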