阿里cdp介绍
- 阿里的cdp我们理解为是一个ETL工具。
- Pipeline是资源隔离的单元,Pipeline之间的管道资源以及底层的引擎等资源都是相互隔离的,一个Pipeline的运行任务不会影响其他Pipeline的任务运行。
同时提供一套抽象化的数据抽取插件(称之为Reader)、数据写入插件(称之为Writer),
cdp技术与原理
- 用户使用CDP Job启动API,向CDP服务端发起调用,提交一个离线数据同步Job。
- CDP收到Job API请求后,将负责做必要的安全和权限校验,待校验通过后,CDP会下发相应的Job到执行集群节点启动离线数据同步任务。
- Job启动后,根据用户提供源端(Reader)、目的端(Writer)的配置信息,加载并初始化相关插件,连接两端数据源,开始数据同步工作。
- Job运行过程中,将随心跳向CDP汇报当前进度、速度、数据量等关键运行指标,用户根据Job的状态API实时获取该Job运行状态,直至Job运行结束(成功或者失败)。
ODPS
集成介绍
1.我将展示我们怎么样把CDP数据上传,odps运算,cdp数据下载集成起来。
2.该工具采用ExecutorService线程来监控每一个cdp的job运行状态。
3.采用future来阻塞等待所有上传数据job的完成。
4.把所有脚本按任务组jobGroup,任务类型jobType进行分组,再利用QuartzSchedule来驱动执行。
5.例如每天增量运算,按upload,odps,download的jobType编制在一个groupDailyAdd组,每天定时运行,如下面的图:
任务管理表:
DROP TABLE IF EXISTS cdp_job_config;
CREATE TABLE `cdp_job_config` (
`id` bigint(20) NOT NULL AUTO_INCREMENT COMMENT '自增主键 : 自增主键',
`name` varchar(100) NOT NULL COMMENT 'job名称',
`code` varchar(60) DEFAULT NULL COMMENT 'job的code',
`job_type` int(11) NOT NULL COMMENT '1:upload;2:odps;3:download',
`project` varchar(60) DEFAULT NULL COMMENT 'job的工程',
`job_group` varchar(40) NOT NULL COMMENT 'jobGroup的code',
`sort` int(11) NOT NULL DEFAULT '0' COMMENT '排序 : 排序',
`job_content` text NOT NULL COMMENT 'job任务脚本内容',
`status_flag` tinyint(4) NOT NULL DEFAULT '1' COMMENT '状态标识 : 状态标识 1:启用 2:禁用',
`interrupt_flag` tinyint(4) NOT NULL DEFAULT '1' COMMENT '中断标识 : 中断标识 1:启用 2:禁用',
`creator_name` varchar(32) DEFAULT NULL COMMENT '创建者名称 : 创建者名称',
`creator_id` bigint(20) DEFAULT NULL COMMENT '创建者id : 创建者id',
`updator_name` varchar(32) DEFAULT NULL COMMENT '最后修改者姓名 : 最后修改者姓名',
`updator_id` bigint(20) DEFAULT NULL COMMENT '更新者id : 更新者id',
`server_create_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '服务器创建时间 : 服务器创建时间',
`server_update_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '服务器更新时间 : 服务器更新时间',
PRIMARY KEY (`id`),
UNIQUE KEY `idx_code` (`code`),
KEY `idx_job_group_type` (`job_group`,`code`,`job_type`)
) ENGINE=InnoDB AUTO_INCREMENT=12 DEFAULT CHARSET=utf8 COMMENT='cdp,odps任务job配置';
-- NOTE(review): the trailing "CREATE UNIQUE INDEX idx_code ..." statement was removed:
-- the unique key idx_code is already declared inline above, so re-creating it would
-- fail with "Duplicate key name 'idx_code'". DROP TABLE now uses IF EXISTS so the
-- script is re-runnable on a fresh schema.
cdp的job的任务代码
package com.calm.cdp.service.impl;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Callable;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import javax.annotation.Resource;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.collections.MapUtils;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import com.alibaba.cdp.sdk.model.Job;
import com.alibaba.cdp.sdk.model.JobStatus;
import com.alibaba.cdp.sdk.model.Pipeline;
import com.alibaba.cdp.sdk.model.Session;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import com.calm.b.common.concurrent.ExecutorServiceUtil;
import com.calm.b.common.quartz.QuartzLog;
import com.calm.cdp.base.CdpConfigUtils;
import com.calm.cdp.base.ConfigParser;
import com.calm.cdp.dao.CdpJobManagerMapper;
import com.calm.cdp.entity.CdpJobConfig;
import com.calm.cdp.service.CdpJobConfigService;
import com.calm.cdp.service.CdpOperationService;
@Service("cdpOperationService")
public class CdpOperationServiceImpl implements CdpOperationService {

    private static final Logger logger = LoggerFactory.getLogger(CdpOperationServiceImpl.class);

    /** Interval in milliseconds between CDP job status polls. */
    private static final long STATUS_POLL_INTERVAL_MS = 10000L;

    @Resource
    private CdpJobManagerMapper cdpJobManagerMapper;
    @Autowired
    private CdpJobConfigService cdpJobConfigService;

    /**
     * Loads a CDP job definition from the given JSON config file and runs it
     * synchronously. Any failure is logged and swallowed (Quartz entry point).
     *
     * @param configjsonPath path of the job JSON config file
     * @param pramaMap       placeholder values substituted into the config
     */
    @QuartzLog(taskCode = "callCdpTask", taskDesc = "callCdpTask")
    public void runOrderLogJob(String configjsonPath, Map<String, Object> pramaMap) {
        long startTime = System.currentTimeMillis();
        try {
            JSONObject content = ConfigParser.getJsonJobConfig(configjsonPath, pramaMap);
            createJob(content, pramaMap);
        } catch (Exception e) {
            logger.error("", e);
        }
        logger.info("task {} execute time:{}", configjsonPath, (System.currentTimeMillis() - startTime));
    }

    /**
     * Submits a CDP job asynchronously to the shared thread pool.
     *
     * @param configjsonPath path of the job JSON config file
     * @param pramaMap       placeholder values substituted into the config
     * @return a future whose map carries the Boolean key "success"
     */
    public Future<Map<String, Object>> executeJob(String configjsonPath, Map<String, Object> pramaMap) {
        // Submit the job to the shared executor.
        JSONObject content = ConfigParser.getJsonJobConfig(configjsonPath, pramaMap);
        return ExecutorServiceUtil.submitInThreadPool(new CdpTask(content, pramaMap));
    }

    /**
     * Callable that runs one CDP job; never propagates an exception, the
     * outcome is reported through the "success" entry of the returned map.
     */
    class CdpTask implements Callable<Map<String, Object>> {

        private final Map<String, Object> pramaMap;
        private final JSONObject jobCntent;

        public CdpTask(JSONObject jobCntent, Map<String, Object> pramaMap) {
            this.jobCntent = jobCntent;
            this.pramaMap = pramaMap;
        }

        @Override
        public Map<String, Object> call() throws Exception {
            Map<String, Object> result = new HashMap<String, Object>();
            boolean success = false;
            try {
                createJob(jobCntent, pramaMap);
                success = true;
            } catch (Exception e) {
                String error = jobCntent.getString("traceId") + "run occur error:";
                logger.error(error, e);
            }
            result.put("success", success);
            return result;
        }
    }

    /**
     * Creates and runs one CDP job synchronously: opens a session, resolves
     * (or creates) the pipeline, starts the job and polls its status every
     * {@link #STATUS_POLL_INTERVAL_MS} until the job is no longer alive.
     *
     * @param jobCntent job definition JSON (must contain a "traceId" field)
     * @param pramaMap  placeholder values merged with the global properties
     * @throws RuntimeException if the job JSON is null or has no traceId
     */
    public void createJob(JSONObject jobCntent, Map<String, Object> pramaMap) {
        if (MapUtils.isEmpty(pramaMap)) {
            pramaMap = new HashMap<String, Object>();
        }
        if (jobCntent == null) {
            throw new RuntimeException("Not enter empty job json task");
        }
        long startTime = System.currentTimeMillis();
        pramaMap.putAll(CdpConfigUtils.getPropertiesMap());
        String jobContent = ConfigParser.format(jobCntent.toJSONString(), pramaMap);
        logger.info("Run job json script>>>>>>>>:{}", jobContent);
        // Log in to the CDP service endpoint.
        Session session =
            new Session(CdpConfigUtils.CDP_API_URL, CdpConfigUtils.CDP_ACCESSID, CdpConfigUtils.CDP_ACCESSKEY);
        // Reuse the pipeline when it already exists, otherwise create it.
        Pipeline pipeline = session.getPipeline(CdpConfigUtils.CDP_PIPELINE);
        if (pipeline == null) {
            pipeline = new Pipeline();
            pipeline.setName(CdpConfigUtils.CDP_PIPELINE);
            pipeline.setDescription(CdpConfigUtils.CDP_PIPELINE);
            pipeline = session.createPipeline(pipeline);
        }
        String tradeId = jobCntent.getString("traceId");
        if (StringUtils.isBlank(tradeId)) {
            throw new RuntimeException("System not find job tradeId,please check config,");
        }
        Job job = new Job();
        job.setTraceId(tradeId); // caller-defined id used to trace the job
        job.setContext(jobContent); // content depends on the sync mode in use
        final Job starter = pipeline.start(job);
        JobStatus status;
        do {
            try {
                Thread.sleep(STATUS_POLL_INTERVAL_MS);
            } catch (InterruptedException e) {
                // BUGFIX: the original swallowed the interrupt and kept looping,
                // which busy-spins once the thread's interrupt flag is set.
                // Restore the flag and stop waiting; starter.stop() below halts the job.
                Thread.currentThread().interrupt();
                logger.error("createJob error:", e);
                break;
            }
            status = starter.status();
            logger.info("workJob [ {} ] getResult: {}", tradeId, status);
        } while (status.isJobAlive());
        starter.stop();
        logger.info("task {} execute time:{} second", tradeId, (System.currentTimeMillis() - startTime) / 1000);
    }

    @Override
    public void runOrderLogJob(String jobGroup, Integer jobType, Map<String, Object> pramaMap) {
        List<CdpJobConfig> cdpJobConfigs = this.cdpJobConfigService.findJobConfig(jobGroup, jobType, null);
        runCdpJob(pramaMap, cdpJobConfigs);
    }

    public void runCdpJob(String jobCode, Integer jobType, Map<String, Object> pramaMap) {
        List<CdpJobConfig> cdpJobConfigs = this.cdpJobConfigService.findJobConfig(null, jobType, jobCode);
        runCdpJob(pramaMap, cdpJobConfigs);
    }

    /**
     * Submits every config as an async {@link CdpTask}, then waits for all of
     * them (bounded by CDP_WAIT_TIMEOUT minutes when > 0). Throws as soon as
     * one task reports failure so the surrounding job group stops.
     */
    private void runCdpJob(Map<String, Object> pramaMap, List<CdpJobConfig> cdpJobConfigs) {
        if (CollectionUtils.isEmpty(cdpJobConfigs)) {
            return;
        }
        List<Future<Map<String, Object>>> futures = new ArrayList<Future<Map<String, Object>>>();
        for (CdpJobConfig cdpJobConfig : cdpJobConfigs) {
            logger.info("start add config{}", cdpJobConfig);
            String jobContent = cdpJobConfig.getJobContent();
            JSONObject job = JSON.parseObject(jobContent);
            futures.add(ExecutorServiceUtil.submitInThreadPool(new CdpTask(job, pramaMap)));
        }
        for (Future<Map<String, Object>> future : futures) {
            Map<String, Object> result = null;
            try {
                if (CdpConfigUtils.CDP_WAIT_TIMEOUT < 1) {
                    // No timeout configured: wait indefinitely.
                    result = future.get();
                } else {
                    result = future.get(CdpConfigUtils.CDP_WAIT_TIMEOUT, TimeUnit.MINUTES);
                }
            } catch (InterruptedException e) {
                // BUGFIX: restore the interrupt flag instead of swallowing it.
                Thread.currentThread().interrupt();
                logger.error("InterruptedException :", e);
                continue;
            } catch (Exception e) {
                logger.error("InterruptedException :", e);
                continue;
            }
            if (result != null && StringUtils.equals(String.valueOf(result.get("success")), "false")) {
                throw new RuntimeException("Run taks have error,stop process.please check error log");
            }
        }
    }
}
odps调用代码
package com.calm.cdp.service.impl;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Callable;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import javax.annotation.Resource;
import org.apache.commons.collections.CollectionUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import com.aliyun.odps.Instance;
import com.aliyun.odps.Odps;
import com.aliyun.odps.OdpsException;
import com.aliyun.odps.account.Account;
import com.aliyun.odps.account.AliyunAccount;
import com.aliyun.odps.data.Record;
import com.aliyun.odps.task.SQLTask;
import com.calm.b.common.concurrent.ExecutorServiceUtil;
import com.calm.cdp.base.CdpConfigUtils;
import com.calm.cdp.base.ConfigParser;
import com.calm.cdp.dao.CdpJobManagerMapper;
import com.calm.cdp.entity.CdpJobConfig;
import com.calm.cdp.service.CdpJobConfigService;
import com.calm.cdp.service.OdpsSqlTaskExecute;
/**
 * ODPS handler: runs SQL scripts against Aliyun ODPS (MaxCompute), either
 * synchronously or as asynchronous tasks in the shared thread pool.
 *
 * @author my
 * @Date 2016年6月24日 上午10:13:01
 */
@Service("odpsSqlTaskExecute")
public class OdpsSqlTaskExecuteImpl implements OdpsSqlTaskExecute {

    private static final Logger logger = LoggerFactory.getLogger(OdpsSqlTaskExecuteImpl.class);

    @Resource
    private CdpJobManagerMapper cdpJobManagerMapper;
    @Autowired
    private CdpJobConfigService cdpJobConfigService;

    /**
     * Runs the given SQL statements sequentially against the configured ODPS
     * project. A null/empty list is a no-op.
     *
     * @param sqlTasks SQL statements to run, in order
     * @throws OdpsException if any statement fails
     */
    @Override
    public void execute(List<String> sqlTasks) throws OdpsException {
        Account account = new AliyunAccount(CdpConfigUtils.CDP_ACCESSID, CdpConfigUtils.CDP_ACCESSKEY);
        Odps odps = new Odps(account);
        odps.setEndpoint(CdpConfigUtils.ODPS_END_POINT);
        odps.setDefaultProject(CdpConfigUtils.ODPS_PROJECT_NAME);
        if (!CollectionUtils.isEmpty(sqlTasks)) {
            for (String sql : sqlTasks) {
                doSqlTask(odps, sql);
            }
        }
    }

    /** Runs one SQL statement, waits for success and logs the result records. */
    private void doSqlTask(Odps odps, String sql) throws OdpsException {
        long startTime = System.currentTimeMillis();
        logger.info("start execute sql>>>:{}", sql);
        Instance i = SQLTask.run(odps, sql);
        i.waitForSuccess();
        List<Record> records = SQLTask.getResult(i);
        logger.info("total execute time {}ms", (System.currentTimeMillis() - startTime));
        for (Record r : records) {
            // Pass the value directly so a null first column does not NPE.
            logger.info("executeResult>>>:{}", r.get(0));
        }
    }

    /**
     * Loads SQL statements from each file path and runs them. A failure in one
     * file is logged and does not stop the remaining files.
     */
    @Override
    public void executeTask(List<String> sqFilelTasks) {
        // TODO(review): original note indicated multi-threaded execution is planned here.
        for (String sqlFilePath : sqFilelTasks) {
            try {
                List<String> sqlStatment = ConfigParser.loadSql(sqlFilePath);
                execute(sqlStatment);
            } catch (Exception e) {
                logger.error("execut result:", e);
            }
        }
    }

    /** Runs the ODPS job(s) configured under the given job code asynchronously. */
    @Override
    public void executeJobCode(String jobCode) throws OdpsException {
        List<CdpJobConfig> cdpJobConfigs =
            this.cdpJobConfigService.findJobConfig(null, CdpJobConfig.JOB_TYPE_ODPS, jobCode);
        runAsyncTask(cdpJobConfigs);
    }

    /** Runs every ODPS job configured for the given job group asynchronously. */
    public void executeTask(String jobGroup) {
        List<CdpJobConfig> cdpJobConfigs =
            this.cdpJobConfigService.findJobConfig(jobGroup, CdpJobConfig.JOB_TYPE_ODPS, null);
        if (CollectionUtils.isEmpty(cdpJobConfigs)) {
            return;
        }
        runAsyncTask(cdpJobConfigs);
    }

    /**
     * Submits every config's SQL script as an async {@link OdpsTask} and waits
     * for all of them (bounded by CDP_WAIT_TIMEOUT minutes when > 0). Throws as
     * soon as one task reports failure, consistent with CdpOperationServiceImpl.
     */
    private void runAsyncTask(List<CdpJobConfig> cdpJobConfigs) {
        List<Future<Map<String, Object>>> futures = new ArrayList<Future<Map<String, Object>>>();
        for (CdpJobConfig cdpJobConfig : cdpJobConfigs) {
            String jobContent = cdpJobConfig.getJobContent();
            List<String> sqls = ConfigParser.loadSqlByStr(jobContent);
            futures.add(ExecutorServiceUtil.submitInThreadPool(new OdpsTask(sqls)));
        }
        for (Future<Map<String, Object>> future : futures) {
            Map<String, Object> result = null;
            try {
                if (CdpConfigUtils.CDP_WAIT_TIMEOUT < 1) {
                    result = future.get();
                } else {
                    // BUGFIX: the timeout branch used to discard the returned map.
                    result = future.get(CdpConfigUtils.CDP_WAIT_TIMEOUT, TimeUnit.MINUTES);
                }
            } catch (InterruptedException e) {
                // BUGFIX: restore the interrupt flag instead of swallowing it.
                Thread.currentThread().interrupt();
                logger.error("", e);
                continue;
            } catch (Exception e) {
                logger.error("", e);
                continue;
            }
            // BUGFIX: the result was previously never inspected; fail fast like
            // CdpOperationServiceImpl.runCdpJob does.
            if (result != null && Boolean.FALSE.equals(result.get("success"))) {
                throw new RuntimeException("Run taks have error,stop process.please check error log");
            }
        }
    }

    /**
     * Callable that runs one SQL script; never propagates an exception, the
     * outcome is reported through the "success" entry of the returned map.
     */
    class OdpsTask implements Callable<Map<String, Object>> {

        private final List<String> sqlStatment;

        OdpsTask(List<String> sqlStatment) {
            this.sqlStatment = sqlStatment;
        }

        @Override
        public Map<String, Object> call() throws Exception {
            Map<String, Object> result = new HashMap<String, Object>();
            boolean success = false;
            try {
                execute(sqlStatment);
                // BUGFIX: was "success = false", so a successful run was
                // always reported as a failure.
                success = true;
            } catch (Exception e) {
                logger.error("", e);
            }
            result.put("success", success);
            return result;
        }
    }
}
过程集成
采用一个抽象class,把数据upload,calculate,download 三步过程集成起来,完成一个大数据运算。
/**
 * Template for one full big-data run: upload source data via CDP, calculate on
 * ODPS, then download the results via CDP. Subclasses may override the
 * post-phase hooks to add custom work between phases.
 */
public abstract class AbstractJob {

    protected static final Logger logger = LoggerFactory.getLogger(AbstractJob.class);

    @Autowired
    protected CdpOperationService cdpOperationService;
    @Autowired
    protected OdpsSqlTaskExecute odpsSqlTaskExecute;

    /**
     * Runs the three phases (upload -&gt; odps -&gt; download) for one job group,
     * invoking the matching post-phase hook after each phase.
     *
     * @param jobGroupEnumCode code of the job group whose configs are executed
     * @param pramaMap         placeholder values passed to each CDP job
     */
    public void executeJob(String jobGroupEnumCode, Map<String, Object> pramaMap) {
        long startTime = System.currentTimeMillis();
        logger.info(StringUtils.center("start execute upload jobType ", 60, "=="));
        cdpOperationService.runOrderLogJob(jobGroupEnumCode, CdpJobConfig.JOB_TYPE_UPLOAD, pramaMap);
        postUploadJobType(jobGroupEnumCode, pramaMap);
        logger.info(StringUtils.center("start execute odps jobType", 60, "=="));
        odpsSqlTaskExecute.executeTask(jobGroupEnumCode);
        postOdpsJobType(jobGroupEnumCode, pramaMap);
        logger.info(StringUtils.center("start execute download jobType", 60, "=="));
        cdpOperationService.runOrderLogJob(jobGroupEnumCode, CdpJobConfig.JOB_TYPE_DOWNLOAD, pramaMap);
        postDownloadJobType(jobGroupEnumCode, pramaMap);
        logger.info(StringUtils.center("end execute cdp,odps", 60, "=="));
        long endTime = System.currentTimeMillis();
        logger.info("execute total time : {} second==============", (endTime - startTime) / 1000);
    }

    /**
     * Hook invoked after the upload phase. No-op by default.
     *
     * @param jobGroupEnumCode code of the job group being executed
     * @param pramaMap         placeholder values for the CDP jobs
     */
    protected void postUploadJobType(String jobGroupEnumCode, Map<String, Object> pramaMap) {
    }

    /**
     * Hook invoked after the ODPS calculation phase. No-op by default.
     *
     * @param jobGroupEnumCode code of the job group being executed
     * @param pramaMap         placeholder values for the CDP jobs
     */
    protected void postOdpsJobType(String jobGroupEnumCode, Map<String, Object> pramaMap) {
    }

    /**
     * Hook invoked after the download phase. No-op by default.
     *
     * @param jobGroupEnumCode code of the job group being executed
     * @param pramaMap         placeholder values for the CDP jobs
     */
    protected void postDownloadJobType(String jobGroupEnumCode, Map<String, Object> pramaMap) {
    }
}