bin/sqoop list-databases \
--connect jdbc:mysql://master:3306 \
--username root \
--password 123456    # test the MySQL connection
bin/sqoop import \
--connect jdbc:mysql://master:3306/test \
--username root \
--password 123456 \
--table my_user    # import data from MySQL into HDFS
bin/sqoop list-tables --connect \
jdbc:oracle:thin:@192.23.0.53:1521/PDBORCL \
--username hljcreditc_user --password hljcredit_pwd \
--driver oracle.jdbc.driver.OracleDriver    # connecting to the Oracle database works
# When connecting to a database on another machine, check that the firewall is off and that the Oracle JDBC driver jar has been copied into the lib directory; for Oracle 12c the service name is separated from host:port by "/" rather than ":"; specifying the driver class by hand is optional.
When importing Oracle data into a Hive table with Sqoop, drop the --driver oracle.jdbc.driver.OracleDriver option, otherwise the job fails with: manager.SqlManager: Error executing statement: java.sql.SQLSyntaxErrorException: ORA-00933: SQL command not properly ended
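A minimal sketch of that Oracle-to-Hive import without the --driver option (connection details and source table are the ones used above; the target Hive database/table names are assumptions):
bin/sqoop import \
--connect jdbc:oracle:thin:@192.23.0.53:1521/PDBORCL \
--username hljcreditc_user \
--password hljcredit_pwd \
--table T_B_CORE_AJ_HMD \
--hive-import \
--hive-database default \
--hive-table t_b_core_aj_hmd \
--num-mappers 1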
The tables in the database can be listed like this:
bin/sqoop list-tables --connect \
jdbc:oracle:thin:@192.23.0.53:1521/PDBORCL \
--username hljcreditc_user --password hljcredit_pwd    # check how many tables there are
bin/sqoop import -D sqoop.hbase.add.row.key=true \
-Dorg.apache.sqoop.splitter.allow_text_splitter=true \
--connect jdbc:oracle:thin:@192.23.0.53:1521/PDBORCL \
--username hljcreditc_user \
--password hljcredit_pwd \
--table T_B_CORE_AJ_HMD \
--columns ID,ENTNAME \
--hbase-table T_B_CORE_AJ_HMD \
--column-family info \
--hbase-row-key ID -m 1    # import from Oracle into HBase (not yet tested successfully)
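If the import still fails, one thing to check is whether the target HBase table exists; it can be created up front in the HBase shell, or Sqoop can be asked to create it with the standard --hbase-create-table flag. A hedged sketch, using the same table and column family as above:
# create the table and the 'info' column family in the HBase shell first
echo "create 'T_B_CORE_AJ_HMD', 'info'" | hbase shell
# or add this flag to the import command above so Sqoop creates the table itself
#   --hbase-create-table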
--------------------------------------------------------------------------------------------------------------------------
Hadoop 2.x
HDFS
YARN
MapReduce
divide and conquer
divide: map
Zookeeper
Hive
==============================
For massive log-type data:
* hdfs
* mr , hive - hql
Big data collaboration frameworks
Question 1
hdfs
Where do the files come from? How does the data get into hdfs? The volume is massive.
In practice the data comes from two kinds of sources:
* RDBMS (Oracle, MySQL, DB2, ...) -> Sqoop (SQL to Hadoop)
* files (Apache / Nginx log data) -> Flume (real-time data collection)
Question 2
An internet company easily has at least a thousand analysis jobs running over the data.
How are these jobs scheduled?
When does each one run, how long between runs, how often?
Some business analyses need many jobs working together, with dependencies between them: a workflow?
Oozie
Question 3
The Hadoop 2.x ecosystem has about eight important frameworks.
Monitoring?
A unified web UI to manage and monitor the frameworks:
Hue
========================================================================
HiveQL
analyze the data
store the result set
* as files on hdfs
* in a hive table
| Sqoop
export it to an RDBMS
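A minimal sketch of that chain, assuming the user_hive Hive table and MySQL connection details used later in these notes plus a placeholder MySQL result table day_stats and a placeholder result directory: write the HiveQL result set to an HDFS directory, then export that directory with Sqoop.
bin/hive -e "insert overwrite directory '/user/beifeng/result/day_stats'
  row format delimited fields terminated by ','
  select id, account from default.user_hive ;"
bin/sqoop export \
--connect jdbc:mysql://hadoop-senior.ibeifeng.com:3306/test \
--username root \
--password 123456 \
--table day_stats \
--export-dir /user/beifeng/result/day_stats \
--input-fields-terminated-by ',' \
--num-mappers 1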
Sqoop
Sqoop simply wraps the common data import/export MapReduce jobs; you pass it parameters on the command line and it runs the corresponding MapReduce job for you.
Map tasks only (the work is split up and runs in parallel batches)
CLI
bin/sqoop import ..........
==================================
Take Hadoop as the subject and the RDBMS as the object:
sqoop import
moving RDBMS data into Hadoop is an import
sqoop export
moving Hadoop data into the RDBMS is an export
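A minimal pair showing the direction convention (connection details as in the first example above; the HDFS path in the export line is an assumed location for previously imported data):
bin/sqoop import --connect jdbc:mysql://master:3306/test --username root --password 123456 --table my_user    # RDBMS -> HDFS
bin/sqoop export --connect jdbc:mysql://master:3306/test --username root --password 123456 --table my_user --export-dir /user/beifeng/my_user    # HDFS -> RDBMS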
Sqoop depends on Hadoop:
* one side of the transfer is data stored on hdfs
* the underlying data transfer is implemented with MapReduce / YARN
so Sqoop has to be built against the Hadoop version in use
===============================================================
CDH
http://archive.cloudera.com/cdh5/
The CDH 5.3.x releases (cdh5.3.6 here) are very stable and easy to use; dependency and compatibility issues between the individual components are already taken care of.
* hadoop-2.5.0-cdh5.3.6.tar.gz
* hive-0.13.1-cdh5.3.6.tar.gz
* zookeeper-3.4.5-cdh5.3.6.tar.gz
* sqoop-1.4.5-cdh5.3.6.tar.gz
Download location:
http://archive.cloudera.com/cdh5/cdh/5/
===============================================================
Homework
Install and test a pseudo-distributed Hadoop 2.x and Hive environment based on cdh5.3.6.
Write it up as a Word document with screenshots of the whole process and send it to me.
Due: before midnight on October 6.
# mkdir /opt/cdh-5.3.6
# chown -R beifeng:beifeng /opt/cdh-5.3.6/
==================================
rdbms/mysql
* jdbc url \ username \ password \ tablename
conversion |
import / export
hdfs
* path
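Those pieces map one-to-one onto the Sqoop flags used in the rest of these notes; a generic template (angle-bracket values are placeholders):
# jdbc url / username / password / tablename on the RDBMS side, path on the HDFS side
bin/sqoop import \
--connect jdbc:mysql://<host>:3306/<database> \
--username <user> \
--password <password> \
--table <tablename> \
--target-dir <hdfs_path> \
--num-mappers 1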
=================================================================
Using MySQL as the example RDBMS, copy the JDBC driver jar into the $SQOOP_HOME/lib directory:
cp /opt/softwares/mysql-libs/mysql-connector-java-5.1.27/mysql-connector-java-5.1.27-bin.jar /opt/cdh-5.3.6/sqoop-1.4.5-cdh5.3.6/lib/
bin/sqoop list-databases \
--connect jdbc:mysql://hadoop-senior.ibeifeng.com:3306 \
--username root \
--password 123456
>>>>>>>>>>>>>>>>>>>
CREATE TABLE `my_user` (
`id` tinyint(4) NOT NULL AUTO_INCREMENT,
`account` varchar(255) DEFAULT NULL,
`passwd` varchar(255) DEFAULT NULL,
PRIMARY KEY (`id`)
);
INSERT INTO `my_user` VALUES ('1', 'admin', 'admin');
INSERT INTO `my_user` VALUES ('2', 'pu', '12345');
INSERT INTO `my_user` VALUES ('3', 'system', 'system');
INSERT INTO `my_user` VALUES ('4', 'zxh', 'zxh');
INSERT INTO `my_user` VALUES ('5', 'test', 'test');
INSERT INTO `my_user` VALUES ('6', 'pudong', 'pudong');
INSERT INTO `my_user` VALUES ('7', 'qiqi', 'qiqi');
bin/sqoop import \
--connect jdbc:mysql://hadoop-senior.ibeifeng.com:3306/test \
--username root \
--password 123456 \
--table my_user
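Without --target-dir, Sqoop writes into the running user's HDFS home directory, in a subdirectory named after the table; the output of the import above can be checked with (assuming the beifeng user):
bin/hdfs dfs -ls /user/beifeng/my_user
bin/hdfs dfs -text /user/beifeng/my_user/part-m-*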
bin/sqoop import \
--connect jdbc:mysql://hadoop-senior.ibeifeng.com:3306/test \
--username root \
--password 123456 \
--table my_user \
--target-dir /user/beifeng/sqoop/imp_my_user \
--num-mappers 1
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
Under the hood Sqoop is implemented with MapReduce; an import only runs Map tasks.
Data storage file formats
* textfile
* orcfile
* parquet
>>>>>>>>>>>>>>import hdfs : parquet>>>>>>>>>>>>>>>>>>
bin/sqoop import \
--connect jdbc:mysql://hadoop-senior.ibeifeng.com:3306/test \
--username root \
--password 123456 \
--table my_user \
--target-dir /user/beifeng/sqoop/imp_my_user_parquet \
--fields-terminated-by ',' \
--num-mappers 1 \
--as-parquetfile
>>>>>>>>>>>>>>>>>hive : create table hive_user>>>>>>>>>>>>>>>>>>
drop table if exists default.hive_user_orc ;
create table default.hive_user_orc(
id int,
username string,
password string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED AS parquet ;
load data inpath '/user/beifeng/sqoop/imp_my_user_parquet' into table default.hive_user_orc ;
Why are all the loaded values NULL??
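The NULLs are most likely a column-name mismatch: the Parquet files written by Sqoop carry the source column names (id, account, passwd), while the table above declares id, username, password, and Hive resolves Parquet columns by name. A hedged fix, assuming that mismatch is the cause (hive_user_parquet is a new table name introduced only for this sketch; ROW FORMAT DELIMITED is dropped because it does not apply to Parquet):
drop table if exists default.hive_user_parquet ;
create table default.hive_user_parquet(
id int,
account string,
passwd string
)
STORED AS parquet ;
load data inpath '/user/beifeng/sqoop/imp_my_user_parquet' into table default.hive_user_parquet ;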
>>>>>>>>>>>>>>>>>import hdfs : columns >>>>>>>>>>>>>>>>>>
bin/sqoop import \
--connect jdbc:mysql://hadoop-senior.ibeifeng.com:3306/test \
--username root \
--password 123456 \
--table my_user \
--target-dir /user/beifeng/sqoop/imp_my_user_column \
--num-mappers 1 \
--columns id,account
* In real projects the data to be processed needs some initial cleansing and filtering:
* keep or drop certain fields
* filter conditions
* joins (see the free-form query sketch after the corrected command below)
bin/sqoop import \
--connect jdbc:mysql://hadoop-senior.ibeifeng.com:3306/test \
--username root \
--password 123456 \
--query 'select id, account from my_user' \
--target-dir /user/beifeng/sqoop/imp_my_user_query \
--num-mappers 1
>>>>>>>>>>>>>> this fails:
15/09/01 07:10:39 ERROR tool.ImportTool: Encountered IOException running import job: java.io.IOException: Query [select id, account from my_user] must contain '$CONDITIONS' in WHERE clause.
at org.apache.sqoop.manager.ConnManager.getColumnTypes(ConnManager.java:300)
at org.apache.sqoop.orm.ClassWriter.getColumnTypes(ClassWriter.java:1833)
at org.apache.sqoop.orm.ClassWriter.generate(ClassWriter.java:1645)
at org.apache.sqoop.tool.CodeGenTool.generateORM(CodeGenTool.java:96)
bin/sqoop import \
--connect jdbc:mysql://hadoop-senior.ibeifeng.com:3306/test \
--username root \
--password 123456 \
--query 'select id, account from my_user where $CONDITIONS' \
--target-dir /user/beifeng/sqoop/imp_my_user_query \
--num-mappers 1
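Filter conditions and joins from the cleansing list above go into the same free-form query; a hedged sketch (my_order is a hypothetical second table used only for illustration):
bin/sqoop import \
--connect jdbc:mysql://hadoop-senior.ibeifeng.com:3306/test \
--username root \
--password 123456 \
--query 'select u.id, u.account, o.amount from my_user u join my_order o on u.id = o.user_id where u.id > 3 and $CONDITIONS' \
--split-by u.id \
--target-dir /user/beifeng/sqoop/imp_my_user_join \
--num-mappers 1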
>>>>>>>>>>>>>>>>>import hdfs : compress >>>>>>>>>>>>>>>>>>
bin/sqoop import \
--connect jdbc:mysql://hadoop-senior.ibeifeng.com:3306/test \
--username root \
--password 123456 \
--table my_user \
--target-dir /user/beifeng/sqoop/imp_my_sannpy \
--delete-target-dir \
--num-mappers 1 \
--compress \
--compression-codec org.apache.hadoop.io.compress.SnappyCodec \
--fields-terminated-by '\t'
drop table if exists default.hive_user_snappy ;
create table default.hive_user_snappy(
id int,
username string,
password string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' ;
load data inpath '/user/beifeng/sqoop/imp_my_sannpy' into table default.hive_user_snappy ;    -- load the files Sqoop wrote to HDFS straight into Hive
mysql table
->
hdfs (snappy)
->
hive create table
->
load data into table
->
query
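The final query step can be run straight from the command line against the table loaded above:
bin/hive -e "select * from default.hive_user_snappy ;"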
>>>>>>>>>>>>>>>>>import hdfs increment>>>>>>>>>>>>>>>>>>
Incremental data import
It relies on a unique, increasing identifier; typically the table has a column for this, such as an insert timestamp createtime.
* query
where createtime >= 20150924000000000 and createtime < 20150925000000000
* sqoop
Incremental import arguments:
--check-column <column> Source column to check for incremental
change
--incremental <import-type> Define an incremental import of type
'append' or 'lastmodified'
--last-value <value> Last imported value in the incremental
check column
bin/sqoop import \
--connect jdbc:mysql://hadoop-senior.ibeifeng.com:3306/test \
--username root \
--password 123456 \
--table my_user \
--target-dir /user/beifeng/sqoop/imp_my_incr \
--num-mappers 1 \
--incremental append \
--check-column id \
--last-value 4
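For a time-based column such as the createtime field mentioned above, the lastmodified mode applies instead of append; a hedged sketch (my_user has no such column, so the my_log table and createtime column here are only illustrative):
bin/sqoop import \
--connect jdbc:mysql://hadoop-senior.ibeifeng.com:3306/test \
--username root \
--password 123456 \
--table my_log \
--target-dir /user/beifeng/sqoop/imp_my_lastmod \
--num-mappers 1 \
--incremental lastmodified \
--check-column createtime \
--last-value '2015-09-24 00:00:00'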
>>>>>>>>>>>>>>>>>import hdfs direct >>>>>>>>>>>>>>>>>>
bin/sqoop import \
--connect jdbc:mysql://hadoop-senior.ibeifeng.com:3306/test \
--username root \
--password 123456 \
--table my_user \
--target-dir /user/beifeng/sqoop/imp_my_incr \
--num-mappers 1 \
--delete-target-dir \
--direct
===================export data to an RDBMS=====================================
hive table
* table
query the data over JDBC via hiveserver2
* hdfs file
export -> mysql/oracle/db2/
>>>>>>>>>>>>>>>>>export mysql table >>>>>>>>>>>>>>>>>>
touch /opt/datas/user.txt
vi /opt/datas/user.txt
12,beifeng,beifeng
13,xuanyun,xuanyu
bin/hdfs dfs -mkdir -p /user/beifeng/sqoop/exp/user/
bin/hdfs dfs -put /opt/datas/user.txt /user/beifeng/sqoop/exp/user/
bin/sqoop export \
--connect jdbc:mysql://hadoop-senior.ibeifeng.com:3306/test \
--username root \
--password 123456 \
--table my_user \
--export-dir /user/beifeng/sqoop/exp/user/ \
--num-mappers 1
===================import/export with Hive====================================
Hive data is stored on hdfs
schema
table location / file
>>>>>>>>>>>>>>>>>import hive table >>>>>>>>>>>>>>>>>>
use default ;
drop table if exists user_hive ;
create table user_hive(
id int,
account string,
password string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' ;
bin/sqoop import \
--connect jdbc:mysql://hadoop-senior.ibeifeng.com:3306/test \
--username root \
--password 123456 \
--table my_user \
--fields-terminated-by '\t' \
--delete-target-dir \
--num-mappers 1 \
--hive-import \
--hive-database default \
--hive-table user_hive
>>>>>>>>>>>>>>>>>export mysql table >>>>>>>>>>>>>>>>>>
CREATE TABLE `my_user2` (
`id` tinyint(4) NOT NULL AUTO_INCREMENT,
`account` varchar(255) DEFAULT NULL,
`passwd` varchar(255) DEFAULT NULL,
PRIMARY KEY (`id`)
);
bin/sqoop export \
--connect jdbc:mysql://hadoop-senior.ibeifeng.com:3306/test \
--username root \
--password 123456 \
--table my_user2 \
--export-dir /user/hive/warehouse/user_hive \
--num-mappers 1 \
--input-fields-terminated-by '\t'
=======================================================
shell scripts
## step 1
load data ...
## step 2
bin/hive -f xxxx
## step 3
bin/sqoop --options-file /opt/datas/sqoop-import-hdfs.txt
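The options file lists the tool name and then one option or value per line, with # for comment lines; a hedged sketch of what /opt/datas/sqoop-import-hdfs.txt might contain (values mirror the imports above):
# sqoop-import-hdfs.txt
import
--connect
jdbc:mysql://hadoop-senior.ibeifeng.com:3306/test
--username
root
--password
123456
--table
my_user
--target-dir
/user/beifeng/sqoop/imp_my_user
--num-mappers
1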