spark-sql建表语句限制_SparkSql常用语句

最新推荐文章于 2023-12-20 19:13:53 发布

bh lin

最新推荐文章于 2023-12-20 19:13:53 发布

阅读量1.4k

点赞数 1

文章标签： spark-sql建表语句限制

本文链接：https://blog.csdn.net/weixin_35671110/article/details/111987363

版权

本文介绍了如何使用Spark SQL进行建表操作，包括创建带有分区的表，并展示了数据的插入、查看、删除等操作。重点强调了建表时的注意事项，如字段间的分隔符、最后一个字段后的逗号问题，以及分区管理中的动态和静态分区操作。

摘要由CSDN通过智能技术生成

-连接sparksql：

cd /home/mr/spark/bin
./beeline
!connect jdbc:hive2://hostname:port
--切换数据库

use databaseName;

-- Create the table:
create table tab_test (
    name string,
    age int,
    num1 double,
    num2 bigint,
    msg varchar(80) -- no ',' is allowed after the last column
)
partitioned by (p_age int, p_name string) -- partition columns
row format delimited fields terminated by ',' -- fields in the data files are separated by commas
stored as textfile location '/tab/test/tab_test'; -- storage path; a trailing '/' is also accepted, i.e. '/tab/test/tab_test/'

-- stored as orc;
-- For an ORC table, pushing data files in by hand does not work
-- (txt / csv files; no header row, no trailing ',' at the end of a line,
-- saved as Unix UTF-8 without BOM).
-- Load the data through a temporary textfile table instead; when inserting,
-- make sure the column order matches.

-- Insert into a specific partition, appending.
-- Avoid 'select *': when the table's columns change, what * expands to changes too.
insert into table tab_test_orc partition(p_age=10,p_name='lucy')
select name, age, num1, num2, msg
from tab_test_temp;

-- Insert into a specific partition, overwriting.
insert overwrite table tab_test_orc partition(p_age=10,p_name='lucy')
select name, age, num1, num2, msg
from tab_test_temp;

查看表字段、结构：select * from tab_test; --分区字段也会被选出来

+-------+------+-------+-------+------+--------+---------+

| name | age | num1 | num2 | msg | p_age | p_name |

+-------+------+-------+-------+------+--------+---------+

desc tab_test;
0: jdbc:hive2://vmax32:18000> desc tab_test;
+--------------------------+--------------+----------+

| col_name | data_type | comment |

+--------------------------+--------------+----------+

| name | string | NULL |

| age | int | NULL |

| num1 | double | NULL |

| num2 | bigint | NULL |

| msg | varchar(80) | NULL |

| p_age | int | NULL |

| p_name | string | NULL |

| # Partition Information | | |

| # col_name | data_type | comment |

| p_age | int | NULL |

| p_name | string | NULL |

+--------------------------+--------------+----------+

desc formatted tab_test; --更详细地查看表结构;hdfs保存位置

+------------------------------------------------------------------------------------+

| result |

+------------------------------------------------------------------------------------+

| # col_name data_type comment |

| |

| name string |

| age int |

| num1 double |

| num2 bigint |

| msg varchar(80) |

| |

| # Partition Information |

| # col_name data_type comment |

| |

| p_age int |

| p_name string |

| |

| # Detailed Table Information |

| Database: zxvmax |

| Owner: mr |

| CreateTime: Fri Aug 12 11:02:35 CST 2016 |

| LastAccessTime: UNKNOWN |

| Protect Mode: None |

| Retention: 0 |

| Location: hdfs://vmax53:9000/tab/test/tab_test |

| Table Type: MANAGED_TABLE |

| Table Parameters: |

| transient_lastDdlTime 1470970955 |

| |

| # Storage Information |

| SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe |

| InputFormat: org.apache.hadoop.mapred.TextInputFormat |

| OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat |

| Compressed: No |

| Num Buckets: -1 |

| Bucket Columns: [] |

| Sort Columns: [] |

| Storage Desc Params: |

| field.delim , |

| serialization.format , |

+------------------------------------------------------------------------------------+

37 rows selected (0.12 seconds)
--删表：

drop table if exists tab_test; -- the table's partitions are dropped along with it

-- Delete the data in the table:

truncate table tab_test; -- after this runs, the partitions still exist

truncate table tab_test partition(p_age=10,p_name='Tom'); -- original note: "delete a given partition". NOTE(review): truncate presumably clears the partition's data rather than dropping the partition itself; confirm

-- Add partitions:
-- More robust form (does not fail when the partition already exists):
alter table tab_test add if not exists partition(p_age=11,p_name='Tom');
alter table tab_test add partition(p_age=10,p_name='Tom');
-- Every partition column must be specified, not just p_age or p_name; otherwise:
-- org.apache.spark.sql.execution.QueryExecutionException: doesn't contain all (2) partition columns

-- List the partitions:
show partitions tab_test;

0: jdbc:hive2://vmax32:18000> show partitions tab_test;
+----------------------+
| result |
+----------------------+
| p_age=10/p_name=Tom |
+----------------------+
1 row selected (0.083 seconds)
--删除分区

alter table tab_test drop if exists partition(p_age=10); -- when dropping partitions, a partial partition spec is allowed

alter table tab_test drop partition(p_name='Tom'); -- running only this statement also removes the p_age=10 partition: show partitions returns nothing, and hdfs dfs -ls no longer shows the p_age=10 partition

alter table tab_test add partition(p_age=10,p_name='cat'); -- when only p_name='Tom' is dropped, the p_name='cat' partition still exists and show partitions still lists it

--动态分区；动态分区匹配最后选出的字段；只与字段顺序有关系，与名字无关；同时存在静态和动态分区，动态分区必须在静态分区之后

insert into tabletab_test(p_age,p_name)selectname,

age,

num1,

num2,

msg,

ageas pppp_age, --取不取别名都可以;分区需要出现在select出来的字段的最后位置，为了匹配。