#非结构化数据同步#
#Oracle#
#MongoDB#
#PowerCenter#
博主在数仓建设过程中,发现部分采集系统在使用结构化数据库存储文档、图片、各种数据体,在数仓建设工程汇总,又会出现非结构化数据剥离的情况,此时需要完成结构化数据库中非结构化数据文件向MongoDB、hdfs等非结构化数据存储文件系统的同步工作,故此结合实际数据ETL工作经验,故此记录此工作过程。
1、添加控件和导入jar包
1.1、添加控件
此功能需要添加java控件实现诶结构化数据获取和传输。2)
1.2、导入必要的jar包
需要导入相应的jar包,如下表:
2、java程序代码
2.1、java组件-导入包
将下列代码贴入 java控件-导入包页面中
import net.sf.json.JSONObject;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.entity.mime.HttpMultipartMode;
import org.apache.http.entity.mime.MultipartEntityBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.entity.ContentType;
import java.io.*;
import java.sql.*;
import java.util.*;
import java.util.Date;
import java.util.List;
import java.util.Map;
import java.text.SimpleDateFormat;
2.2、帮助程序代码
2.3、代码
int updateCount = 0;
int insertCount = 0;
2.3.1、日期类型转换
public static String dateToString(long datalong){
Date date = new Date(datalong);
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
String str = sdf.format(date);
return str;
}
2.3.2、文件上传接口
public String fileToMongodb(byte[] filebyte, String ywjid, String tbname, String fjmc){
String wjid = "";
if(filebyte != null){
String ymd5 = generateMD5(filebyte); //生成源文件的md5
logInfo("源md5:"+ymd5 );
//如果源wjid为空,则执行插入操作
if("".equals(ywjid) || ywjid == null){
String result = UploadFileByHttpClient(filebyte,"Document","insert",ywjid); //请求接口返回的数据
if(!"".equals(result)){
JSONObject json = JSONObject.fromObject(result);
if("true".equals(json.getString("Result")) && json.getString("MD5").equals(ymd5)){
wjid =json.getString("FileID");
updateMongodbJcxx(wjid,ymd5,fjmc,tbname,filebyte,"insert");
}else{
logError("错误:接上传Mongodb失败");
}
}else{
logError("错误:接口返回数据为null");
}
}else{ //如果wjid不为空,则判断文件是否发生了修改
String mbMd5_sql = " SELECT MD5 FROM component.sygc_jcxx_wjjcxx where wjid = '"+ywjid+"' and ssbm = '"+tbname+"' order by FJSCSJ desc";
List<Map<String,Object>> md5list = new ArrayList<>();
try{
md5list = queryForList(mbMd5_sql);
}catch (Exception e){
}
String mbMd5 = "";
if(md5list.size() > 0 ){
mbMd5 = md5list.get(0).get("MD5").toString();
}
logInfo("目标md5:"+mbMd5 );
if(ymd5.equals(mbMd5)){ //如果md5相同,则说明文件无修改,忽略
wjid = ywjid;
logInfo("文件没发生修改");
}
if(!mbMd5.equals(ymd5 )){ //如果不同,则说明文件已修改。这更新操作
logInfo("文件发生了修改");
String result = UploadFileByHttpClient(filebyte,"Document","update",ywjid); //请求接口返回的数据
if(!"".equals(result)){
JSONObject json = JSONObject.fromObject(result);
if("true".equals(json.getString("Result")) && json.getString("MD5").equals(ymd5)){
wjid =json.getString("FileID");
updateMongodbJcxx(wjid,ymd5,fjmc,tbname,filebyte,"update");
}else{
System.err.println("错误:接口上传Mongodb失败");
}
}else{
System.err.println("错误:接口返回数据为null");
}
}
}
}
return wjid;
}
2.3.3、文件更新接口
public static String UploadFileByHttpClient(byte[] filebyte,String fileType,String serviceType,String ywjid) {
String url = "";
if("insert".equals(serviceType)){
url = "http://ip:port/bucketName/PutSignalFileToDB";
}
if("update".equals(serviceType)){
url = "http://IP:port/SYGC_JH/UpdateSignalFileToDB?FileID="+ywjid;
}
String result = "";
CloseableHttpClient httpClient = HttpClients.createDefault();
try {
HttpPost httpPost = new HttpPost(url);
//HttpMultipartMode.RFC6532参数的设定是为避免文件名为中文时乱码
MultipartEntityBuilder builder = MultipartEntityBuilder.create().setMode(HttpMultipartMode.RFC6532);
httpPost.addHeader("header1", "");//头部放文件上传的head可自定义
builder.addBinaryBody("file", filebyte, ContentType.MULTIPART_FORM_DATA,"");
builder.addTextBody("FileType", fileType);//其余参数,可自定义
HttpEntity entity = builder.build();
httpPost.setEntity(entity);
HttpResponse response = httpClient.execute(httpPost);// 执行提交
HttpEntity responseEntity = response.getEntity();//接收调用外部接口返回的内容
if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK){
// 返回的内容都在content中
InputStream content = responseEntity.getContent();
// 定义BufferedReader输入流来读取URL的响应
BufferedReader in = new BufferedReader(new InputStreamReader(content,"UTF-8"));
String line;
while ((line = in.readLine()) != null) {
result += line;
}
}
}catch(Exception e) {
//logError("上传文件失败");
}finally {//处理结束后关闭httpclient的链接
try {
httpClient.close();
} catch (IOException e) {
e.printStackTrace();
}
}
return result;
}
2.3.4、文档存储信息接口
public String updateMongodbJcxx(String wjid,String md5,String fjmc, String tbname,byte[] filebyte,String type){
String wjlx = "";
if(fjmc != null){
wjlx = fjmc.substring(fjmc.lastIndexOf("."));
}
if("insert".equals(type)){
logInfo("执行插入文件基础信息数据方法");
String insert_sql = " insert into component.table_name (wjid,md5,ssbm,wjdx,wjlx,fjscsj,bz)" +
" values ( ?,?,?,?,?,?,?)";
try {
executeUpdate(insert_sql,new Object[]{wjid, md5, tbname,filebyte.length,wjlx,new Date(),""});
}catch (Exception e){
}
}
if("update".equals(type)){
logInfo("执行更新文件基础信息数据方法");
String update_sql = "update component.table_name set md5=?,wjdx=?,wjlx=?,fjscsj=?,bz=? " +
" where wjid=? and ssbm=?";
try {
executeUpdate(update_sql,new Object[]{ md5, filebyte.length,wjlx,new Date(),"",wjid,tbname});
}catch (Exception e){
}
}
return "";
}
2.3.5、文件MD5获取接口
public static String generateMD5(byte[] b){
String s = null;
char hexDigits[] = { // 用来将字节转换成 16 进制表示的字符
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'};
try
{
java.security.MessageDigest md = java.security.MessageDigest.getInstance( "MD5" );
md.update( b );
byte tmp[] = md.digest(); // MD5 的计算结果是一个 128 位的长整数,
// 用字节表示就是 16 个字节
char str[] = new char[16 * 2]; // 每个字节用 16 进制表示的话,使用两个字符,
// 所以表示成 16 进制需要 32 个字符
int k = 0; // 表示转换结果中对应的字符位置
for (int i = 0; i < 16; i++) { // 从第一个字节开始,对 MD5 的每一个字节
// 转换成 16 进制字符的转换
byte byte0 = tmp[i]; // 取第 i 个字节
str[k++] = hexDigits[byte0 >>> 4 & 0xf]; // 取字节中高 4 位的数字转换,
// >>> 为逻辑右移,将符号位一起右移
str[k++] = hexDigits[byte0 & 0xf]; // 取字节中低 4 位的数字转换
}
s = new String(str); // 换后的结果转换为字符串
}catch( Exception e )
{
e.printStackTrace();
}
return s;
}
2.3.5、结构化数据源连接
@SuppressWarnings("unchecked")
public static Connection getConn() throws SDKException{
String driver = "com.mysql.jdbc.Driver";
String url = "jdbc:mysql://ip:port/component_divided?useUnicode=true&useSSL=false&characterEncoding=utf-8&autoReconnect=true";
String username = "user";
String password = "pwd";
Connection conn = null;
try {
Class.forName(driver); // classLoader,加载对应驱动
conn = DriverManager.getConnection(url, username, password);
} catch (Exception e) {
System.out.println("链接错误:"+e.toString());
}
return conn;
}
2.3.6、更新接口
@SuppressWarnings("unchecked")
public static int executeUpdate(String sql, Object[] ars) throws SDKException{
int num = 0;
Connection conn = null;
PreparedStatement pstm = null;
try {
// 1.建立连接
conn = JTXPartitionDriverImplGen.getConn();
//设置事务非自动提交, 提升速度
conn.setAutoCommit(false);
// 2.处理预编译SQL语句
pstm = conn.prepareStatement(sql);
// 3给点位符赋值
for (int i = 0; i < ars.length; i++) {
pstm.setObject(i + 1, ars[i]);
}
// 4.执行预编译SQL语句
num = pstm.executeUpdate();
conn.commit();
} catch (Exception e) {
e.printStackTrace();
JTXPartitionDriverImplGen.closeAll(conn, pstm, null);
} finally {
// 释放资源
JTXPartitionDriverImplGen.closeAll(conn, pstm, null);
}
return num;
}
2.3.7、查询接口
/**
* 查询方法
* @param sql
* @param ars
* @return
*/
@SuppressWarnings("unchecked")
public static List<Map<String,Object>> queryForList(String sql, Object[] ars) throws SDKException{
Connection conn = null;
List<Map<String,Object>> resultList = new ArrayList<>();
ResultSet rs = null;
ResultSetMetaData data = null; //定义ResultSetMetaData对象
int coloumCount = 0; //列的数目
String[] coloumNames = null;//列的名称数组,存储列的名称
PreparedStatement pstm = null;
try {
// 1.建立连接
conn = JTXPartitionDriverImplGen .getConn();
// 2.处理预编译SQL语句
pstm = conn.prepareStatement(sql);
// 3给点位符赋值
for (int i = 0; i < ars.length; i++) {
pstm.setObject(i + 1, ars[i]);
}
// 4.执行预编译SQL语句
rs = pstm.executeQuery();// 查询数据
data = rs.getMetaData();
coloumCount = data.getColumnCount();
coloumNames = new String[coloumCount];
for(int i = 0; i < coloumCount; i++){
coloumNames[i] = data.getColumnName(i+1);
}
//resultList = new ArrayList<Map>();
while(rs.next()){
Map map = new HashMap();
for(int i = 0; i < coloumCount; i++){
map.put(coloumNames[i], rs.getObject(coloumNames[i]));
}
resultList.add(map);
}
} catch (Exception e) {
e.printStackTrace();
} finally {
// 释放资源
JTXPartitionDriverImplGen.closeAll(conn, pstm, null);
}
return resultList;
}
/**
*
* @param sql
* @return
*/
@SuppressWarnings("unchecked")
public static List<Map<String,Object>> queryForList(String sql) throws SDKException{
Connection conn = null;
List<Map<String,Object>> resultList = new ArrayList<>();
ResultSet rs = null;
ResultSetMetaData data = null; //定义ResultSetMetaData对象
int coloumCount = 0; //列的数目
String[] coloumNames = null;//列的名称数组,存储列的名称
PreparedStatement pstm = null;
try {
// 1.建立连接
conn = JTXPartitionDriverImplGen.getConn();
// 2.处理预编译SQL语句
pstm = conn.prepareStatement(sql);
// 3给点位符赋值
// 4.执行预编译SQL语句
rs = pstm.executeQuery();// 查询数据
data = rs.getMetaData();
coloumCount = data.getColumnCount();
coloumNames = new String[coloumCount];
for(int i = 0; i < coloumCount; i++){
coloumNames[i] = data.getColumnName(i+1);
}
//resultList = new ArrayList<Map>();
while(rs.next()){
Map map = new HashMap();
for(int i = 0; i < coloumCount; i++){
map.put(coloumNames[i], rs.getObject(coloumNames[i]));
}
resultList.add(map);
}
} catch (Exception e) {
e.printStackTrace();
} finally {
// 释放资源
JTXPartitionDriverImplGen.closeAll(conn, pstm, null);
}
return resultList;
}
2.3.8、关闭数据库连接
/**
* 关闭连接
*/
@SuppressWarnings("unchecked")
public static void closeAll(Connection conn, Statement stm, ResultSet rs) throws SDKException {
if (rs != null) {
try {
rs.close();
} catch (SQLException e) {
e.printStackTrace();
}
}
if (stm != null) {
try {
stm.close();
} catch (SQLException e) {
e.printStackTrace();
}
}
if (conn != null) {
try {
conn.close();
} catch (SQLException e) {
e.printStackTrace();
}
}
}
2.4、【在输入行】代码
2.4.1、代码
long startTime = System.currentTimeMillis();
O_FJWJID = fileToMongodb(FJWJID,IN_FJWJID,"table_name",FJMC);
if(IN_DBID == null || "".equals(IN_DBID)){
String insert_sql = " INSERT INTO table_name VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?) ";
try {
executeUpdate(insert_sql,new Object[]{DBID, JHDM, JH,FJLBMC,FJMC,dateToString(BXSJ),BXDWMC,O_FJWJID,FJLJ,dateToString(CJSJ),CJR,dateToString(XGSJ),XGR});
insertCount++;
logInfo("结构化数据执行插入成功");
}catch (Exception e){
logInfo("结构化数据插入失败");
}
}else{
String update_sql = " UPDATE table_name SET JHDM =?,JH=?,FJLBMC=?,FJMC=?,BXRQ=?,BXDWMC=?,FJWJID=?," +
" FJLJ=?,CJSJ=?,CJR=?,XGSJ=?,XGR=? WHERE DBID = ?";
try {
executeUpdate(update_sql,new Object[]{ JHDM, JH,FJLBMC,FJMC,dateToString(BXSJ),BXDWMC,O_FJWJID,FJLJ,dateToString(CJSJ),CJR,dateToString(XGSJ),XGR,IN_DBID});
updateCount ++;
logInfo("结构化数据执行更新成功");
}catch (Exception e){
logInfo("结构化数据更新失败");
}
}
logInfo("运行时间: " + (System.currentTimeMillis() - startTime ) + "ms");
2.5、在数据末尾
2.5.1、
代码
// ToDo: Enter code that executes when all the input data is received by the transformation here
//
// logInfo("The number of null rows for this partition is : " + partCountNullRows);
// synchronized(lock)
// {
// logInfo("The total number of null rows across partitions till now is : " + countNullRows);
// }
logInfo("插入成功数据行:"+insertCount);
logInfo("更新成功数据行:"+ updateCount);
3、注意!
代码中需要替换IP和Port,同时根据需要扩展数据表结构!