以前数据抽取都是kettle ETL,测试一下sqoop
1. CDH sqoop2环境搭建
添加服务,选择sqoop2,然后下一步,遇到错误修改错误。
2. 脚本命令查看
http://sqoop.apache.org/docs/1.99.5/Sqoop5MinutesDemo.html
2.1 查看连接器
show connector
CDH5.5支持四种类型
JDBC,kite,hdfs,kafka;其中kafka只支持接收
2.2 查看连接
show link
2.3 查看任务
show job
2.4 创建连接
sqoop:000> create link -c 1
这里的序号是connector编号,然后根据connector的类型填写不同的参数
2.5 创建任务
sqoop:000> create job -f 1 -t 2
从一个link(源数据源)到另一个link(目的数据源),编号是link编号,link名字不能重复
2.6 启动任务
sqoop:000> start job -j 1
启动所选的任务,编号是任务编号
3. Java代码
3.1 实例从mysql到hdfs
将student表抽到hdfs
packagecom.bocom;
importorg.apache.sqoop.client.SqoopClient;
importorg.apache.sqoop.model.MConnector;
importorg.apache.sqoop.model.MDriverConfig;
importorg.apache.sqoop.model.MFromConfig;
importorg.apache.sqoop.model.MJob;
importorg.apache.sqoop.model.MLink;
importorg.apache.sqoop.model.MLinkConfig;
importorg.apache.sqoop.model.MSubmission;
importorg.apache.sqoop.model.MToConfig;
importorg.apache.sqoop.submission.counter.Counter;
importorg.apache.sqoop.submission.counter.CounterGroup;
importorg.apache.sqoop.submission.counter.Counters;
importorg.apache.sqoop.validation.Status;
publicclass MysqlToHDFS {
public static void main(String[] args) {
sqoopTransfer();
}
public static void sqoopTransfer() {
//初始化
String url ="http://server4:12000/sqoop/";
SqoopClient client = newSqoopClient(url);
client.setServerUrl(url);
int num=55;
//创建一个源链接 JDBC
long fromConnectorId = 1;
MLink fromLink =client.createLink(fromConnectorId);
fromLink.setName("JDBC"+num);
// fromLink.setCreationUser("liuwei");
MLinkConfig fromLinkConfig =fromLink.getConnectorLinkConfig();
fromLinkConfig.getStringInput("linkConfig.connectionString").setValue("jdbc:mysql://192.168.3.142/mysql");
fromLinkConfig.getStringInput("linkConfig.jdbcDriver").setValue("com.mysql.jdbc.Driver");
fromLinkConfig.getStringInput("linkConfig.username").setValue("root");
fromLinkConfig.getStringInput("linkConfig.password").setValue("1234");
Status fromStatus =client.saveLink(fromLink);
System.out.println(fromStatus);
if(fromStatus.canProceed()) {
System.out.println("创建JDBC Link成功,ID为:" + fromLink.getPersistenceId());
} else {
System.out.println("创建JDBC Link失败");
}
//创建一个目的地链接HDFS
long toConnectorId = 3;
MLink toLink =client.createLink(toConnectorId);
toLink.setName("HDFS"+num);
toLink.setCreationUser("admln");
MLinkConfig toLinkConfig =toLink.getConnectorLinkConfig();
System.out.println(toLinkConfig);
toLinkConfig.getStringInput("linkConfig.uri").setValue("hdfs://server4:8020/");
Status toStatus =client.saveLink(toLink);
if(toStatus.canProceed()) {
System.out.println("创建HDFS Link成功,ID为:" + toLink.getPersistenceId());
} else {
System.out.println("创建HDFS Link失败");
}
//创建一个任务
long fromLinkId =fromLink.getPersistenceId();
long toLinkId =toLink.getPersistenceId();
MJob job = client.createJob(fromLinkId,toLinkId);
job.setName("MySQL to HDFSjob"+num);
job.setCreationUser("hadoop");
//设置源链接任务配置信息
MFromConfig fromJobConfig = job.getFromJobConfig();
System.out.println("****"+fromJobConfig);
fromJobConfig.getStringInput("fromJobConfig.schemaName").setValue("mysql");
fromJobConfig.getStringInput("fromJobConfig.tableName").setValue("student");
fromJobConfig.getStringInput("fromJobConfig.partitionColumn").setValue("id");
// fromJobConfig.getStringInput("fromJobConfig.columns").setValue("id,name");
MToConfig toJobConfig =job.getToJobConfig();
System.out.println("===="+toJobConfig);
// toJobConfig.getEnumInput("toJobConfig.outputFormat").setValue("TEXT_FILE");
toJobConfig.getEnumInput("toJobConfig.outputFormat").setValue("TEXT_FILE");
toJobConfig.getEnumInput("toJobConfig.compression").setValue("NONE");
toJobConfig.getStringInput("toJobConfig.outputDirectory").setValue("/sqoop/student");
MDriverConfig driverConfig =job.getDriverConfig();
driverConfig.getIntegerInput("throttlingConfig.numExtractors").setValue(3);
Status status = client.saveJob(job);
System.out.println("===="+status);
if(status.canProceed()) {
System.out.println("JOB创建成功,ID为:"+ job.getPersistenceId());
} else {
System.out.println("JOB创建失败。");
}
//启动任务
long jobId = job.getPersistenceId();
// long jobId = 4;
MSubmission submission =client.startJob(jobId);
System.out.println("JOB提交状态为 : " + submission.getStatus());
while(submission.getStatus().isRunning()&& submission.getProgress() != -1) {
System.out.println("进度 : " + String.format("%.2f%%", submission.getProgress() * 100));
//三秒报告一次进度
try {
Thread.sleep(3000);
} catch (InterruptedException e) {
e.printStackTrace();
}
}
System.out.println("JOB执行结束... ...");
System.out.println("Hadoop任务ID为:" + submission.getExternalJobId());
Counters counters =submission.getCounters();
if(counters != null) {
System.out.println("计数器:");
for(CounterGroup group : counters) {
System.out.print("\t");
System.out.println(group.getName());
for(Counter counter : group) {
System.out.print("\t\t");
System.out.print(counter.getName());
System.out.print(": ");
System.out.println(counter.getValue());
}
}
}
if(submission.getError().getErrorSummary() != null) {
System.out.println("JOB执行异常,异常信息为 : " +submission.getError());
}
System.out.println("HDFS通过sqoop传输数据到MySQL统计执行完毕");
}
}
3.2 实例从hdfs到mysql
插入到student_test表
select count(*) from student_test
共一百万数据
packagecom.bocom;
importorg.apache.sqoop.client.SqoopClient;
importorg.apache.sqoop.model.MDriverConfig;
importorg.apache.sqoop.model.MFromConfig;
importorg.apache.sqoop.model.MJob;
importorg.apache.sqoop.model.MLink;
importorg.apache.sqoop.model.MLinkConfig;
importorg.apache.sqoop.model.MSubmission;
importorg.apache.sqoop.model.MToConfig;
importorg.apache.sqoop.submission.counter.Counter;
importorg.apache.sqoop.submission.counter.CounterGroup;
importorg.apache.sqoop.submission.counter.Counters;
importorg.apache.sqoop.validation.Status;
publicclass HDFSToMysql {
public static void main(String[] args) {
sqoopTransfer();
}
public static void sqoopTransfer() {
//初始化
String url ="http://server4:12000/sqoop/";
SqoopClient client = newSqoopClient(url);
int num=205;
//创建一个源链接 HDFS
long fromConnectorId = 3;
MLink fromLink =client.createLink(fromConnectorId);
fromLink.setName("HDFSconnector"+num);
fromLink.setCreationUser("admln");
MLinkConfig fromLinkConfig =fromLink.getConnectorLinkConfig();
fromLinkConfig.getStringInput("linkConfig.uri").setValue("hdfs://server4:8020/");
Status fromStatus =client.saveLink(fromLink);
if(fromStatus.canProceed()) {
System.out.println("创建HDFS Link成功,ID为:" + fromLink.getPersistenceId());
} else {
System.out.println("创建HDFS Link失败");
}
//创建一个目的地链接 JDBC
long toConnectorId = 1;
MLink toLink = client.createLink(toConnectorId);
toLink.setName("JDBCconnector"+num);
toLink.setCreationUser("admln");
MLinkConfig toLinkConfig =toLink.getConnectorLinkConfig();
toLinkConfig.getStringInput("linkConfig.connectionString").setValue("jdbc:mysql://192.168.3.142/mysql");
toLinkConfig.getStringInput("linkConfig.jdbcDriver").setValue("com.mysql.jdbc.Driver");
toLinkConfig.getStringInput("linkConfig.username").setValue("root");
toLinkConfig.getStringInput("linkConfig.password").setValue("1234");
Status toStatus =client.saveLink(toLink);
if(toStatus.canProceed()) {
System.out.println("创建JDBC Link成功,ID为:" + toLink.getPersistenceId());
} else {
System.out.println("创建JDBC Link失败");
}
//创建一个任务
long fromLinkId =fromLink.getPersistenceId();
long toLinkId =toLink.getPersistenceId();
MJob job = client.createJob(fromLinkId,toLinkId);
job.setName("HDFS to MySQLjob"+num);
job.setCreationUser("admln");
//设置源链接任务配置信息
MFromConfig fromJobConfig =job.getFromJobConfig();
fromJobConfig.getStringInput("fromJobConfig.inputDirectory").setValue("/sqoop/student/");
//创建目的地链接任务配置信息
MToConfig toJobConfig =job.getToJobConfig();
toJobConfig.getStringInput("toJobConfig.schemaName").setValue("mysql");
toJobConfig.getStringInput("toJobConfig.tableName").setValue("student_test");
//toJobConfig.getStringInput("fromJobConfig.partitionColumn").setValue("id");
// set the driver config values
// MDriverConfig driverConfig =job.getDriverConfig();
// driverConfig.getIntegerInput("throttlingConfig.numExtractors").setValue(3);//这句还没弄明白
Status status = client.saveJob(job);
if(status.canProceed()) {
System.out.println("JOB创建成功,ID为:"+ job.getPersistenceId());
} else {
System.out.println("JOB创建失败。");
}
//启动任务
long jobId = job.getPersistenceId();
MSubmission submission =client.startJob(jobId);
System.out.println("JOB提交状态为 : " + submission.getStatus());
while(submission.getStatus().isRunning() &&submission.getProgress() != -1) {
System.out.println("进度 : " + String.format("%.2f%%", submission.getProgress() * 100));
//三秒报告一次进度
try {
Thread.sleep(3000);
} catch (InterruptedException e) {
e.printStackTrace();
}
}
System.out.println("JOB执行结束... ...");
System.out.println("Hadoop任务ID为:" + submission.getExternalJobId());
Counters counters =submission.getCounters();
if(counters != null) {
System.out.println("计数器:");
for(CounterGroup group : counters) {
System.out.print("\t");
System.out.println(group.getName());
for(Counter counter : group) {
System.out.print("\t\t");
System.out.print(counter.getName());
System.out.print(": ");
System.out.println(counter.getValue());
}
}
}
if(submission.getError().getErrorSummary() != null) {
System.out.println("JOB执行异常,异常信息为 : " +submission.getError());
}
System.out.println("HDFS通过sqoop传输数据到MySQL统计执行完毕");
}
}