1.加载分区表数据
-- Sync the metastore with partition directories already present on HDFS.
MSCK REPAIR TABLE test_table;  -- fixed typo: was "teat_table"
2. 合并分区文件
2.1 执行sql
-- Merge small files of a table (or of one partition, if the optional
-- PARTITION clause is given). Supported for ORC/RCFile storage formats.
ALTER TABLE table_name [PARTITION (partition_key = 'partition_value')] CONCATENATE;
2.2 参数设置
-- Control MapReduce input split sizing (bytes); setting min == max forces
-- ~100 MB splits, which also drives how files get merged by the job.
set mapreduce.input.fileinputformat.split.maxsize=100000000;
set mapreduce.input.fileinputformat.split.minsize=100000000;
-- Per-node / per-rack minimum split sizes used by CombineFileInputFormat.
set mapreduce.input.fileinputformat.split.minsize.per.node=100000000;
set mapreduce.input.fileinputformat.split.minsize.per.rack=100000000;
3. 创建Es外表
-- External table backed by Elasticsearch via the ES-Hadoop storage handler.
-- NOTE: Hive's CREATE TABLE grammar does not allow ROW FORMAT SERDE together
-- with STORED BY (the storage handler already provides EsSerDe), so only
-- STORED BY is declared here; SERDEPROPERTIES follow it directly.
CREATE EXTERNAL TABLE `test.test_table` (
    id string
)
STORED BY 'org.elasticsearch.hadoop.hive.EsStorageHandler'
WITH SERDEPROPERTIES (
    'serialization.format' = '1')
TBLPROPERTIES (
    'es.index.auto.create' = 'false',              -- fail if the target index does not exist
    'es.nodes' = 'xxxx:9200,xxxx:9200,xxxx:9200',  -- ES cluster node list
    'es.read.metadata' = 'true',
    'es.mapping.id' = 'id',                        -- document _id taken from the id column
    'es.mapping.date.rich' = 'true',
    'es.write.operation' = 'upsert',
    'es.date.format' = 'epoch_second',
    'es.resource' = 'test_table/_doc');            -- target index/type
4. 创建alihbase外表
-- Connection settings for Alibaba Cloud HBase (custom connection impl with
-- username/password auth).
set hbase.zookeeper.quorum=xxxx;
set hbase.client.username=xxx;
set hbase.client.password=xxxx;
set hbase.client.connection.impl = org.apache.hadoop.hbase.client.AliHBaseUEClusterConnection;
-- Map an existing HBase table into Hive. EXTERNAL is required here (the
-- section is about external tables): a managed HBaseStorageHandler table
-- would attempt to create the HBase table, and dropping it would delete
-- the HBase data.
CREATE EXTERNAL TABLE test.test_table
(
    first string,
    second string
)
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
-- :key maps the HBase row key; remaining entries map column family "cf".
WITH SERDEPROPERTIES ("hbase.columns.mapping" = ":key,cf:first,cf:second")
TBLPROPERTIES ("hbase.table.name" = "xxx:xxxx", "hbase.mapred.output.outputtable" = "xxx:xxxx");
5. 本地文件导入数据到表
-- Recreate the table and load a tab-delimited local file into it.
drop table if exists test.test_table;  -- IF EXISTS keeps the script re-runnable
create table test.test_table (
    id bigint,
    name string
)
row format delimited fields terminated by '\t'
STORED AS TEXTFILE;
-- LOCAL reads from the client filesystem; the file is copied into the table location.
load data local inpath '/tmp/text.txt' into table test.test_table;
6. 修改列名
alter table tablename change column column_origin column_new int comment 'column_name' after severity; -- int 为修改后列的类型;AFTER 可以把该列放到指定列的后面,或者使用 FIRST 放到第一位
将表 tablename 中的列 column_origin 修改成 column_new,同时指定修改后列的类型;comment 是这个列的注释。
例1:alter table emp change column age uage double comment 'column age' after id;
7. 增加列
-- Append columns to an existing table. Hive has no "long" type — the correct
-- 64-bit integer type is bigint; the trailing ";" was also missing.
alter table tablename add columns(column1 string comment 'xxxx', column2 bigint comment 'yyyy');
例:alter table emp add columns(age int);
8. 常用配置
-- Common job settings. The original descriptions were bare text after the
-- ";" (a syntax error if pasted into a CLI); they are now proper -- comments.
set spark.app.name=merge;                            -- application name shown in logs and the web UI
set spark.dynamicAllocation.maxExecutors=10;         -- upper bound on executor count
set hive.exec.max.dynamic.partitions=10000;          -- max dynamic partitions overall (default 1000)
set hive.exec.max.dynamic.partitions.pernode=10000;  -- max dynamic partitions per mapper/reducer node (default 100)
set hive.exec.dynamic.partition=true;                -- allow dynamic partitioning in DML/DDL (default false)
set hive.exec.dynamic.partition.mode=nonstrict;      -- default strict: strict requires at least one static partition
set spark.dynamicAllocation.enabled=false;           -- toggle dynamic resource allocation
set spark.executor.instances=5;                      -- initial/static number of executors
set spark.executor.memory=10g;                       -- memory per executor (e.g. 2g, 8g)
set spark.executor.cores=4;                          -- CPU cores available to each executor
例:
-- Example configuration set for a merge-style job.
set spark.executor.instances=5;
set spark.executor.memory=6g;
set spark.executor.cores=4;
-- Disable map-join auto conversion; enable unrestricted dynamic partitioning.
set hive.auto.convert.join = false;
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.max.dynamic.partitions.pernode=10000;
set hive.exec.max.dynamic.partitions=10000;
set hive.exec.max.created.files=10000;
-- Drop all previously registered jars, then add the ES-Hadoop connector from HDFS.
delete jars;
add jar hdfs://xxxx:8020/lib/elasticsearch-hadoop-7.0.0.jar;
9. 删除外表
-- Flip the table from external to managed so that DROP TABLE also removes
-- the underlying data. NOTE: some Hive versions compare this property value
-- case-sensitively, so use uppercase 'FALSE' (was 'False').
ALTER TABLE test.test_table SET TBLPROPERTIES('EXTERNAL'='FALSE');
drop table test.test_table;