概述
访问Salesforce,将数据写入到HDFS中。稍加改动就可以写入其他数据源等等
详细
概述
通过代码抽取Salesforce数据,以文本格式写入到HDFS中。
项目结构
├─dataTransfer(项目根节点)
│ ├─com.example.sfdctohdfs 项目主包
│ │ ├─com.example.sfdctohdfs.hdfs hdfs工具包
│ │ │ ├─com.example.sfdctohdfs.hdfs.util 工具类
│ │ ├─com.example.sfdctohdfs.sfdc sfdc处理包
详情
-
配置
配置Salesforce、HDFS配置信息。配置内容存放于项目中resources目录下的application.yml文件中,如下所示
#salesforce 配置
salesforce:
client_id: 3MVG99OxTyEMCQ3jLn42JgU9JP4qJYx5cZgYGcAq4CdAqWd.R6A4L2KHUonNl0schJV07gNk.OahPYvlGpHaq
client_secret: 13CA01010ED07DE764DE083D0C802FB7F69CC2CA23AFB89450626FBE1A8C3095
username: username
password: test1234
url: https://login.salesforce.com/services/oauth2/token?grant_type=password
#是否使用代理
isProxy: true
proxyUrl: proxy.xx.com
proxyPort: 80
hdfs:
url: hdfs://10.1.1.1:9000
user: root
SalesforceDataUtil工具类
package com.example.sfdctohdfs.sfdc;
import cn.hutool.core.date.DateUnit;
import cn.hutool.core.date.DateUtil;
import cn.hutool.core.date.TimeInterval;
import cn.hutool.core.text.csv.CsvWriter;
import cn.hutool.core.util.NumberUtil;
import cn.hutool.core.util.StrUtil;
import cn.hutool.http.HttpRequest;
import cn.hutool.http.HttpUtil;
import cn.hutool.json.JSONArray;
import cn.hutool.json.JSONObject;
import cn.hutool.json.JSONUtil;
import com.example.sfdctohdfs.hdfs.HdfsClient;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.math.BigDecimal;
import java.net.InetSocketAddress;
import java.net.Proxy;
import java.net.URISyntaxException;
import java.util.*;
@Component
@Slf4j
public class SalesforceDataUtil {

    // Salesforce OAuth2 connected-app credentials, bound from application.yml.
    @Value("${salesforce.client_id}")
    private String client_id;
    @Value("${salesforce.client_secret}")
    private String client_secret;
    @Value("${salesforce.username}")
    private String username;
    @Value("${salesforce.password}")
    private String password;
    // Token endpoint, e.g. https://login.salesforce.com/services/oauth2/token?grant_type=password
    @Value("${salesforce.url}")
    private String url;
    // When true, all HTTP traffic goes through the proxy host/port below.
    @Value("${salesforce.isProxy}")
    private Boolean isProxy;
    @Value("${salesforce.proxyUrl}")
    private String proxyUrl;
    @Value("${salesforce.proxyPort}")
    private Integer proxyPort;

    @Autowired
    private HdfsClient hdfsClient;

    /**
     * Logs in to Salesforce using the OAuth2 username-password flow.
     *
     * @return the parsed token response; on success it contains
     *         {@code access_token} and {@code instance_url}
     */
    public JSONObject login() {
        HashMap<String, Object> paramMap = new HashMap<>();
        paramMap.put("client_id", client_id);
        paramMap.put("client_secret", client_secret);
        paramMap.put("username", username);
        paramMap.put("password", password);
        // SECURITY: do not log the full parameter map -- it contains the
        // password and client secret. Log only the non-sensitive user name.
        log.info("Logging in to Salesforce as user {}", username);
        String result;
        if (!isProxy) {
            result = HttpUtil.post(url, paramMap);
        } else {
            result = HttpRequest.post(url)
                    .setProxy(new Proxy(Proxy.Type.HTTP, new InetSocketAddress(proxyUrl, proxyPort)))
                    .form(paramMap)
                    .execute().body();
        }
        return JSONUtil.parseObj(result);
    }

    /**
     * Extracts the select-list from a URL-encoded SOQL statement such as
     * {@code select+id,name+from+Account}.
     * The keywords "select" and "from" must be lower case.
     *
     * @param sfdcSql URL-encoded SOQL with '+' as the token separator
     * @return the comma-separated field list between "select" and "from"
     */
    public String getQuerySalesforceSql(String sfdcSql) {
        return sfdcSql.split("select\\+")[1].split("\\+from")[0];
    }

    /**
     * Converts one page of a Salesforce query response into pipe-delimited
     * text rows, replacing characters that would corrupt the format
     * (newlines and the '|' delimiter itself).
     *
     * @param str   raw JSON body of one query page (must contain "records")
     * @param field comma-separated field list to extract, in output order
     * @return one delimited string per record
     */
    public static List<String> getInsertSqlStr(String str, String field) {
        List<String> list = new ArrayList<>();
        JSONArray jsonArray = JSONUtil.parseArray(JSONUtil.parseObj(str).get("records").toString());
        String[] fields = field.split(",");
        for (int i = 0; i < jsonArray.size(); i++) {
            // Parse each record once, outside the per-field loop
            // (the original re-parsed the record for every field).
            JSONObject record = JSONUtil.parseObj(jsonArray.get(i).toString());
            StringBuilder row = new StringBuilder();
            for (int j = 0; j < fields.length; j++) {
                if (j > 0) {
                    row.append('|');
                }
                Object value = record.get(fields[j]);
                if (value == null) {
                    // Hive-style NULL marker.
                    row.append("\\N");
                } else {
                    row.append(value.toString()
                            .replace("\n", " ")
                            .replace("\r", " ")
                            .replace("|", "-"));
                }
            }
            list.add(row.toString());
        }
        return list;
    }

    /**
     * Writes each row to the writer followed by the platform line separator.
     *
     * @param writer open writer targeting the HDFS file
     * @param list   rows to append
     * @throws IOException if the underlying stream fails
     */
    public void writeString(OutputStreamWriter writer, List<String> list) throws IOException {
        for (int i = 0; i < list.size(); i++) {
            writer.write(list.get(i) + System.getProperty("line.separator"));
        }
    }

    /**
     * Runs a SOQL query against Salesforce and streams the complete result
     * set (following {@code nextRecordsUrl} pagination) into a text file
     * on HDFS, one pipe-delimited row per record, with a header line.
     *
     * @param querySql URL-encoded SOQL, e.g. {@code select+id,name+from+Account}
     * @param filePath HDFS directory the output file is written to
     * @throws URISyntaxException   if the HDFS URL is malformed
     * @throws IOException          on HDFS write failure
     * @throws InterruptedException if the HDFS client is interrupted
     */
    public void executeQuery(String querySql, String filePath) throws URISyntaxException, IOException, InterruptedException {
        String field = getQuerySalesforceSql(querySql);
        // The output file is named after the queried object (the token after "from+").
        OutputStreamWriter writer = hdfsClient.createCsvWriter(filePath, querySql.split("from\\+")[1].split("\\+")[0]);
        // Only build the proxy when it is actually enabled; the original
        // constructed it unconditionally and failed when the proxy settings
        // were absent even though isProxy was false.
        Proxy proxy = isProxy
                ? new Proxy(Proxy.Type.HTTP, new InetSocketAddress(proxyUrl, proxyPort))
                : null;
        try {
            TimeInterval timer = DateUtil.timer();
            JSONObject json = login();
            // SECURITY: log only the instance URL, never the token response
            // (it contains access_token).
            log.info("Salesforce instance_url: {}", json.get("instance_url"));
            String queryUrl = json.get("instance_url") + "/services/data/v52.0/query/?q=";
            log.info("查询语句:" + querySql);
            HttpRequest httpRequest = HttpRequest.get(queryUrl + querySql)
                    .header("Authorization", "Bearer " + json.get("access_token"));
            if (isProxy) {
                httpRequest.setProxy(proxy);
            }
            String queryBody = httpRequest.execute().body();
            if (!JSONUtil.isJsonObj(queryBody)) {
                // Non-JSON means Salesforce returned an error page; surface it
                // before the parse below fails.
                log.error(queryBody);
            }
            JSONObject firstPage = JSONUtil.parseObj(queryBody);
            String totalSize = firstPage.get("totalSize").toString();
            if ("0".equals(totalSize)) {
                log.info("未查询到数据");
            }
            BigDecimal size;
            String nextRecordsUrl = null;
            if (firstPage.containsKey("nextRecordsUrl")) {
                // nextRecordsUrl looks like /services/data/vXX.X/query/01g...-2000;
                // the number after '-' is the offset of the next page, i.e. how
                // many records have been fetched so far.
                size = new BigDecimal(firstPage.get("nextRecordsUrl").toString().split("-")[1]);
                nextRecordsUrl = json.get("instance_url") + firstPage.get("nextRecordsUrl").toString();
                log.info(nextRecordsUrl);
            } else {
                size = new BigDecimal(firstPage.get("totalSize").toString());
            }
            // Header line: the field list from the SOQL select clause.
            writer.write(field + System.getProperty("line.separator"));
            writer.flush();
            writeString(writer, getInsertSqlStr(queryBody, field));
            writer.flush();
            log.info("第一次读取数据:" + size + "条,花费时间:" + timer.interval() + "毫秒。");
            long lastLoginTime = System.currentTimeMillis();
            while (JSONUtil.parseObj(queryBody).containsKey("nextRecordsUrl")) {
                long now = System.currentTimeMillis();
                // Re-login after 110 minutes so long extractions do not hit an
                // expired-token error mid-run.
                if (DateUtil.between(new Date(lastLoginTime), new Date(now), DateUnit.MINUTE) > 110) {
                    json = login();
                    lastLoginTime = now;
                    log.info("token过期-重新登录!");
                }
                httpRequest = HttpRequest.get(nextRecordsUrl)
                        .header("Authorization", "Bearer " + json.get("access_token"));
                if (isProxy) {
                    httpRequest.setProxy(proxy);
                }
                String pageBody = httpRequest.execute().body();
                writeString(writer, getInsertSqlStr(pageBody, field));
                writer.flush();
                JSONObject page = JSONUtil.parseObj(pageBody);
                if (!page.containsKey("nextRecordsUrl")) {
                    // Last page: add its record count to the offset accumulated so far.
                    size = NumberUtil.add(size, new BigDecimal(JSONUtil.parseArray(page.get("records").toString()).size()));
                    log.info("总读取数据:" + size + ",花费时间:" + (timer.interval()) + "毫秒。");
                    break;
                }
                nextRecordsUrl = json.get("instance_url") + page.get("nextRecordsUrl").toString();
                size = new BigDecimal(page.get("nextRecordsUrl").toString().split("-")[1]);
                log.info(nextRecordsUrl);
                log.info("总读取数据:" + size + ",花费时间:" + (timer.interval()) + "毫秒。");
            }
        } finally {
            // Always release the HDFS writer, even on failure.
            // (The original also had a catch(Exception e){throw e;} which did
            // nothing and was removed.)
            writer.close();
        }
    }
}
-
程序主入口SfdctohdfsApplication
本demo需求为jar包运行,故需要通过主程序入口调用方法。最终调用方式为 java -jar jarname args[0] args[1]。代码中可以将自定义日志名称和地址三行注释后本地直接运行,参数也需要手动赋值;
package com.example.sfdctohdfs;
import cn.hutool.core.date.DateUtil;
import com.example.sfdctohdfs.hdfs.util.SpringUtil;
import com.example.sfdctohdfs.sfdc.SalesforceDataUtil;
import lombok.extern.slf4j.Slf4j;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.context.ApplicationContext;
import java.io.FileNotFoundException;
import java.io.PrintStream;
@SpringBootApplication
@Slf4j
public class SfdctohdfsApplication {
public static void main(String[] args) throws FileNotFoundException {
//自定义日志文件名称和地址
String logName=args[0].split("from\\+")[1].split("\\+")[0]+"_"+DateUtil.format(DateUtil.date(),"yyyy-MM-dd_HH-mm-ss");
PrintStream ps2=new PrintStream("./"+logName +".log");// 创建文件输出流2
System.setOut(ps2);
try {
SpringApplication.run(SfdctohdfsApplication.class, args);
ApplicationContext context = SpringUtil.getApplicationContext();
SalesforceDataUtil salesforceDataUtil =context.getBean(SalesforceDataUtil.class);
String querySql=args[0];
//args[0]: 传入sql参数
//String querySql="select+UCM_ID__c,UCM_Status__c,Name,Alternate_Account_Name__c,Alias__c,Facility_Type__c,Facility_Ownership_Type__c,Go_to_Market_P_L_del__c,China_Hospital_Category__c,Account_Segmentation__c,Sales_Channel__c,China_Hospital_Level__c,China_Hospital_Grade__c,Large_Agency__c,Country_Name_2__c,Billing_State__c,Primary_Billing_City__c,Primary_County__c,BillingAddress,BillingCity,BillingCountry,BillingGeocodeAccuracy,BillingLatitude,BillingLongitude,BillingPostalCode,BillingState,BillingStreet,Parent_Commercial_ID__c,Parentid,KYC_Done__c,GSP_Status__c,GE_Show_Site__c,China_Number_of_Outpatient__c,China_Number_of_bed__c,China_Annual_Revenue__c,CreatedDate,LastModifiedDate,Record_Status__c,Description,Comments__c,GE_Site_Free_Field_01__c,GE_Site_Free_Field_02__c,Service_Sales_Market_Segment__c,GD_Name__c,GD_UCMID__c,Parent__c,Ultimate_Parent__c,Active__c,China_Government_Level__c,China_GE_segmentation__c,Is_Top_X__c,Ascend__c,GE_Site_ID__c,GE_Site_Type__c,GE_Site_Abbreviated_Name__c,GE_Site_Service_Station_ID__c,GE_Site_Name_English__c,HUB_Commercial_Entity__c,Termination_Reason__c,Termination_Date__c,UltimateParent_HL__c,Independent_Procurement__c,Japan_Primary_Address_UCM__c,Secondary_Address__c,Phone__c,Alternate_Phone__c,Website,Fax__c,L1_Market_Revenue__c,L1_Position__c,L1_Type__c,L1_Segment__c,ePOP_Account_ID__c,China_Dealer_Id__c,ePOP_Account_Name__c,Data_Steward_Notes__c,MMI_ID__c,Hospital_Certificate_No__c,Hospital_Certificate_End_Date__c,Business_License_End_Date__c,Business_License_No__c,Medical_Device_Operation_License_No__c,Medical_Device_Op_License_End_Date__c,Medical_Device_Filing_LicenseNo_ClassII__c,Other_License_No__c,Other_License_End_Date__c,Region__c,Zone__c,CreatedByid,UCM_Address_ID__c,Site_ID__c,Commercial_ID__c,LastModifiedByid,Ownerid,DUNS__c,Salesforce_ID__c,Oracle_ERP_ID__c,ReadyToPublish__c,Japan_Memo_File__c,RecordTypeid,LS_Global_Region__c,Global_Region__c,Language__c,Test_Record__c,Account_Setting__c,HUB_Identifier__c,GE_Hub_
Country__c,MedicalDevice_FilingLicense_ClassII_Date__c,ShippingAddress,ShippingCity,ShippingCountry,ShippingGeocodeAccuracy,ShippingLatitude,ShippingLongitude,ShippingPostalCode,ShippingState,ShippingStreet,L1_Market_Revenue_Sub_Segment__c,L1_Type_Sub_Category__c,L2_Org_Segment__c,Partner_Account_Status__c,Partner_Account_Status_Description__c,Termination_Category__c,Approval_Status__c,HCIT_PACS_IW_Service_Level__c,IsDataLoad__c+from+account+where+Global_Region__c='China'";
String filePath=args[1];
//args[1]: hdfs文件存放路径
//String filePath="/wtest/";
salesforceDataUtil.executeQuery(querySql,filePath);
System.out.println("程序结束!");
System.exit(0);
}catch (Exception e){
log.error("系统异常:",e);
System.out.println("程序运行出现异常!");
System.exit(1);
}
}
}
-
总结
该程序主要是读取sfdc数据写入hdfs中,稍加改动即可写入其他数据源比如mysql等等。也可以集成到项目中,作为工具类来使用