首先创建你自己所需要的变量
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import java.util.List;
/**
 * @ClassName: DataX
 * @Description: Parameter object carrying everything needed to build one DataX
 * extraction job: source connection details, the source table/file, and the
 * target storage table. Lombok generates getters/setters plus the no-arg and
 * all-args constructors (field order below is therefore part of the API).
 * @Author wjw
 * @Date 2024/6/15
 */
@Data
@NoArgsConstructor
@AllArgsConstructor
public class DataX {
private String name; // datasource/job name; used in the generated config file name
private String type; // source type key: "MySQL", "Oracle", "CSV" or "Excel" (factory lookup)
private String filePath; // source file path (CSV / Excel sources only)
private String databaseName; // Oracle database (service) name
private String datasourceTable; // source table to extract from
private String host; // database host (MySQL / Oracle)
private Integer port; // database port (MySQL / Oracle)
private String username;
private String password;
private String storageSource; // target storage source (dataCollect)
private String storageSourceTable; // target table name (dataCollect); becomes the HDFS path/file name
private List<Column> tableColumns; // column metadata of the source table
}
我这边需要支持四种类型的数据格式,分别是 MySQL、Oracle、CSV 和 Excel。其中 Excel 其实是先转换成 CSV 再进行操作的,但我还是写了四个实现类;整个模块采用工厂模式来编写。
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.ApplicationContext;
import org.springframework.stereotype.Component;
import javax.annotation.PostConstruct;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
/**
* datax工厂类
*/
@Slf4j
@Component
public class DataXServiceFactory {
@Autowired
private ApplicationContext applicationContext;
/**
* Reader类型: MySQL、Oracle、CSV、Excel
*/
private static Map<String, ReaderConfigGenerator> dbTypeReaderConfigGeneratorMap = new ConcurrentHashMap<>();
@PostConstruct
private void init() throws Exception {
try {
Map<String, ReaderConfigGenerator> dataxServiceMap = applicationContext.getBeansOfType(ReaderConfigGenerator.class);
if (dataxServiceMap == null || dataxServiceMap.isEmpty()) {
throw new Exception("DataX工厂类---初始化加载服务 null!");
}
for (Map.Entry<String, ReaderConfigGenerator> entry : dataxServiceMap.entrySet()) {
ReaderConfigGenerator readerConfigGenerator = entry.getValue();
String dbType = readerConfigGenerator.getDbType();
dbTypeReaderConfigGeneratorMap.put(dbType, readerConfigGenerator);
}
log.info("DataX工厂类---初始化加载服务成功");
} catch (Exception e) {
log.error("DataX工厂类---初始化加载服务 error!");
//记录日志后,重新抛出异常,防止服务不正常启动
throw new Exception();
}
}
public ReaderConfigGenerator getReaderConfigGeneratorByDbType(String dbType) {
return dbTypeReaderConfigGeneratorMap.get(dbType);
}
}
接口
import com.alibaba.fastjson2.JSONObject;
/**
 * @author wjw
 * @date 2024/6/7
 * @description Strategy interface for producing the "reader" section of a
 * DataX job config. One implementation exists per supported source type;
 * implementations are discovered from the Spring context and registered in
 * DataXServiceFactory under their {@link #getDbType()} key.
 */
public interface ReaderConfigGenerator {
/**
 * The source-type key this generator handles (e.g. "MySQL", "Oracle",
 * "CSV", "Excel") — used as the factory lookup key.
 */
String getDbType();
/**
 * Builds the DataX reader JSON fragment for the given extraction request.
 */
JSONObject generateReaderConfig(DataX dataX);
}
具体实现层
/**
 * @ClassName: MySQLReaderConfigGenerator
 * @Description: Builds the DataX "mysqlreader" config section for MySQL sources.
 * @Author wjw
 * @Date 2024/6/17
 */
@Service
public class MySQLReaderConfigGenerator implements ReaderConfigGenerator {

    @Override
    public String getDbType() {
        return "MySQL";
    }

    /**
     * Assembles the mysqlreader block from the connection details and column
     * metadata carried by {@code dataX}.
     */
    @Override
    public JSONObject generateReaderConfig(DataX dataX) {
        JSONObject connection = new JSONObject();
        connection.put("jdbcUrl", "jdbc:mysql://" + dataX.getHost() + ":" + dataX.getPort());
        connection.put("table", new String[]{dataX.getDatasourceTable()});

        JSONObject parameter = new JSONObject();
        parameter.put("username", dataX.getUsername());
        parameter.put("password", dataX.getPassword());
        parameter.put("connection", new JSONObject[]{connection});
        // Column names come from the source-table metadata on the request.
        parameter.put("column", dataX.getTableColumns().stream().map(Column::getName).toArray());
        // NOTE(review): split key is assumed to be "id" on every source table — confirm.
        parameter.put("splitPk", "id");

        JSONObject readerConfig = new JSONObject();
        readerConfig.put("name", "mysqlreader");
        readerConfig.put("parameter", parameter);
        return readerConfig;
    }
}
/**
 * @ClassName: OracleReaderConfigGenerator
 * @Description: Builds the DataX "oraclereader" config section for Oracle sources.
 * @Author wjw
 * @Date 2024/6/17
 */
@Service
public class OracleReaderConfigGenerator implements ReaderConfigGenerator {
@Override
public String getDbType() {
return "Oracle";
}
@Override
public JSONObject generateReaderConfig(DataX dataX) {
// Build the oraclereader section of the DataX job.
JSONObject readerConfig = new JSONObject();
readerConfig.put("name", "oraclereader");
JSONObject parameter = new JSONObject();
parameter.put("username", dataX.getUsername());
parameter.put("password", dataX.getPassword());
// Thin-driver URL in EZConnect form: host:port/serviceName.
parameter.put("connection", new JSONObject[]{new JSONObject()
.fluentPut("jdbcUrl", "jdbc:oracle:thin:@" + dataX.getHost() + ":" + dataX.getPort() + "/" + dataX.getDatabaseName())
.fluentPut("table", new String[]{dataX.getDatasourceTable()})});
parameter.put("column", dataX.getTableColumns().stream().map(Column::getName).toArray());
// NOTE(review): split key is hard-coded to "id" — confirm every source table has it.
parameter.put("splitPk", "id");
readerConfig.put("parameter", parameter);
return readerConfig;
}
}
/**
 * @ClassName: CSVReaderConfigGenerator
 * @Description: Builds the DataX "txtfilereader" config section for CSV sources.
 * @Author wjw
 * @Date 2024/6/17
 */
@Service
public class CSVReaderConfigGenerator implements ReaderConfigGenerator {

    @Override
    public String getDbType() {
        return "CSV";
    }

    /**
     * Points a txtfilereader at the CSV file referenced by {@code dataX}.
     */
    @Override
    public JSONObject generateReaderConfig(DataX dataX) {
        JSONObject parameter = new JSONObject();
        parameter.put("path", dataX.getFilePath());
        // NOTE(review): column list is left empty — txtfilereader typically
        // needs explicit index/type entries; confirm against the DataX docs.
        parameter.put("column", new JSONObject[]{});

        JSONObject readerConfig = new JSONObject();
        readerConfig.put("name", "txtfilereader");
        readerConfig.put("parameter", parameter);
        return readerConfig;
    }
}
excel转csv再进行操作
/**
 * @ClassName: ExcelReaderConfigGenerator
 * @Description: Reader-config generator for Excel sources — converts the
 * workbook's first sheet to a CSV file, then emits a "txtfilereader"
 * configuration pointing at the converted file.
 * @Author wjw
 * @Date 2024/6/17
 */
@Service
public class ExcelReaderConfigGenerator implements ReaderConfigGenerator {

    @Override
    public String getDbType() {
        return "Excel";
    }

    /**
     * Converts the Excel file to CSV next to the original, then builds a
     * txtfilereader config for the converted file.
     */
    @Override
    public JSONObject generateReaderConfig(DataX dataX) {
        try {
            // Derive the CSV path: strip the extension and append the
            // "Excel2CSV.csv" suffix. Guard against extensionless paths —
            // lastIndexOf would return -1 and substring would throw.
            String excelPath = dataX.getFilePath();
            int dot = excelPath.lastIndexOf('.');
            String base = dot >= 0 ? excelPath.substring(0, dot) : excelPath;
            String csvFilePath = base + "Excel2CSV.csv";
            excelToCSV(excelPath, csvFilePath);

            JSONObject parameter = new JSONObject();
            parameter.put("path", csvFilePath);
            parameter.put("column", new JSONObject[]{/* column definitions */});

            JSONObject readerConfig = new JSONObject();
            readerConfig.put("name", "txtfilereader");
            readerConfig.put("parameter", parameter);
            return readerConfig;
        } catch (IOException e) {
            // NOTE(review): the cause is dropped — attach e if ServiceException
            // has a constructor that accepts one.
            throw new ServiceException(ErrorCodeConstants.EXCEL_TO_CSV_FAIL);
        }
    }

    /**
     * Writes the first sheet of the workbook at {@code excelFilePath} to
     * {@code csvFilePath} as UTF-8 CSV, one row per line.
     *
     * @throws IOException if either file cannot be read or written
     */
    public void excelToCSV(String excelFilePath, String csvFilePath) throws IOException {
        try (FileInputStream file = new FileInputStream(new File(excelFilePath));
             Workbook workbook = WorkbookFactory.create(file);
             FileOutputStream fos = new FileOutputStream(new File(csvFilePath));
             OutputStreamWriter osw = new OutputStreamWriter(fos, "UTF-8");
             BufferedWriter bw = new BufferedWriter(osw)) {
            Sheet sheet = workbook.getSheetAt(0);
            for (Row row : sheet) {
                StringBuilder sb = new StringBuilder();
                for (Cell cell : row) {
                    switch (cell.getCellType()) {
                        case STRING:
                            // Escape embedded double quotes per RFC 4180 ("" inside
                            // a quoted field); the original produced malformed CSV
                            // for any cell value containing a quote.
                            sb.append('"')
                              .append(cell.getStringCellValue().replace("\"", "\"\""))
                              .append('"');
                            break;
                        case NUMERIC:
                            if (DateUtil.isCellDateFormatted(cell)) {
                                sb.append(cell.getDateCellValue());
                            } else {
                                sb.append(cell.getNumericCellValue());
                            }
                            break;
                        case BOOLEAN:
                            sb.append(cell.getBooleanCellValue());
                            break;
                        case FORMULA:
                            // NOTE(review): this writes the formula text, not its
                            // evaluated value — confirm that is intended.
                            sb.append(cell.getCellFormula());
                            break;
                        default:
                            sb.append("");
                    }
                    sb.append(',');
                }
                if (sb.length() > 0) {
                    bw.write(sb.substring(0, sb.length() - 1)); // drop trailing comma
                }
                bw.newLine();
            }
        }
    }
}
后面这个方法先调用上面创建 reader 的方法,再生成 writer,然后把两者合并在一起,最后保存 JSON 文件并返回其路径。
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;

import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.util.List;
/**
 * Builds a complete DataX job config (reader + Hive hdfswriter), writes it to
 * a JSON file in the working directory, and returns the file path. The actual
 * DataX run is triggered later by the xxl-job DataCollectTask.
 */
@Component
public class DataXJobService {

    @Autowired
    private DataXServiceFactory dataXServiceFactory;

    // HDFS namenode address used by the hdfswriter, from ${hive.defaultFS}.
    @Value("${hive.defaultFS}")
    private String defaultFS;

    /**
     * Generates and saves the DataX job JSON for the given source.
     *
     * @param dataX extraction request (source type, connection, target table)
     * @return absolute path of the saved JSON config file
     */
    public String executeDataXJob(DataX dataX) {
        try {
            // Look up the reader generator for this source type.
            ReaderConfigGenerator readerConfigGenerator =
                    dataXServiceFactory.getReaderConfigGeneratorByDbType(dataX.getType());
            if (readerConfigGenerator == null) {
                // Fail with a clear message instead of an NPE (which the broad
                // catch below would have masked) when the type is unknown.
                throw new IllegalArgumentException("Unsupported data source type: " + dataX.getType());
            }
            JSONObject readerConfig = readerConfigGenerator.generateReaderConfig(dataX);
            // Build the Hive writer config and merge both into one job.
            JSONObject writerConfig = generateWriterConfig(
                    dataX.getStorageSource(), dataX.getStorageSourceTable(), dataX.getTableColumns());
            JSONObject dataXJobConfig = mergeConfigs(readerConfig, writerConfig);
            // Save locally; execution happens in the xxl-job DataCollectTask.
            return saveDataXConfigToFile(dataXJobConfig, dataX);
        } catch (Exception e) {
            // NOTE(review): the cause is dropped — attach e if ServerException
            // has a (code, cause) constructor.
            throw new ServerException(GlobalErrorCodeConstants.DATAX_FAIL);
        }
    }

    /**
     * Builds the hdfswriter section targeting the given Hive table.
     */
    private JSONObject generateWriterConfig(String targetDbName, String targetTableName, List<Column> tableColumns) {
        JSONObject writerConfig = new JSONObject();
        writerConfig.put("name", "hdfswriter");
        JSONObject parameter = new JSONObject();
        parameter.put("defaultFS", defaultFS);
        parameter.put("fileType", "orc");
        // Add targetDbName + ".db/" to the path when writing into a user-created database.
        parameter.put("path", "/apps/hive/warehouse/" + targetTableName);
        parameter.put("fileName", targetTableName);
        parameter.put("fieldDelimiter", "\t");
        parameter.put("column", tableColumns);
        parameter.put("writeMode", "overwrite");
        writerConfig.put("parameter", parameter);
        return writerConfig;
    }

    /**
     * Merges reader and writer into the standard DataX job envelope.
     */
    private JSONObject mergeConfigs(JSONObject readerConfig, JSONObject writerConfig) {
        JSONObject jobConfig = new JSONObject();
        JSONObject job = new JSONObject();
        job.put("content", new JSONObject[]{
                new JSONObject().fluentPut("reader", readerConfig).fluentPut("writer", writerConfig)});
        job.put("setting", new JSONObject().fluentPut("speed", new JSONObject().fluentPut("channel", 3)));
        jobConfig.put("job", job);
        return jobConfig;
    }

    /**
     * Writes the job config as UTF-8 JSON into the current working directory
     * and returns the file path.
     */
    private String saveDataXConfigToFile(JSONObject dataXJobConfig, DataX datax) throws IOException {
        String currentDir = System.getProperty("user.dir");
        // Use the platform separator; the original hard-coded the Windows backslash.
        String filePath = currentDir + File.separator + datax.getName() + "_" + datax.getDatasourceTable() + ".json";
        // Explicit UTF-8: FileWriter would use the platform default charset.
        try (Writer file = new OutputStreamWriter(new FileOutputStream(filePath), StandardCharsets.UTF_8)) {
            file.write(dataXJobConfig.toJSONString());
        }
        return filePath;
    }
}
后面在需要的地方调用这个方法,把 JSON 文件的保存路径传给 xxl-job,让它创建定时任务。
xxlJob.setFilePath(dataXJobService.executeDataXJob(dataX));
创建定时任务 这样定时任务就会自动执行数据采集任务
import com.xxl.job.core.context.XxlJobHelper;
import com.xxl.job.core.handler.annotation.XxlJob;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.time.LocalDateTime;
/**
 * @ClassName: DataCollectTask
 * @Description: xxl-job handler that runs a DataX job for the given config file.
 * @Author wangjunwei
 * @Date 2024/6/18
 */
public class DataCollectTask {

    // Fixed: the original logger was created for AssetsTask.class.
    private static final Logger logger = LoggerFactory.getLogger(DataCollectTask.class);

    /**
     * Launches DataX via its python launcher against the job JSON at
     * {@code filePath} and waits for it to finish. Fails the job (by
     * rethrowing) on launch errors, interruption, or a non-zero exit code;
     * the original swallowed every error and logged success unconditionally.
     *
     * @param filePath path of the DataX job JSON produced by DataXJobService
     */
    @XxlJob("dataXJobHandler")
    public void dataXJobHandler(String filePath) throws Exception {
        try {
            // NOTE(review): the datax.py location is hard-coded to a local
            // Windows path — move it to configuration.
            ProcessBuilder processBuilder = new ProcessBuilder(
                    "python", "D:\\BaiduNetdiskDownload\\datax.tar\\datax\\bin\\datax.py", filePath);
            processBuilder.inheritIO();
            Process process = processBuilder.start();
            int exitCode = process.waitFor();
            if (exitCode != 0) {
                logger.error("DataX process exited with code {} for config {}", exitCode, filePath);
                throw new IllegalStateException("DataX process exited with code " + exitCode);
            }
        } catch (IOException e) {
            logger.error("Failed to launch DataX process for config {}", filePath, e);
            throw e;
        } catch (InterruptedException e) {
            // Restore the interrupt flag so the scheduler can observe it.
            Thread.currentThread().interrupt();
            logger.error("Interrupted while waiting for DataX process", e);
            throw e;
        }
        XxlJobHelper.log("XXL_JOB,run successfully.");
        logger.info("dataXJobHandler finished at {}", LocalDateTime.now());
    }
}