背景:一般情况下,我们在大数据使用中,我们习惯用Spark来做计算,用JAVA来JDBC MySQL库,随着Spark的成熟,Spark自己也有实现一套连接Mysql的方法。这里主要以Spark Core 来处理数据,然后用JDBC 来批量导入数据。
工程:我们的数据工程通过flume收集nginx上数据,直接落地到Hdfs,然后用SparkCore 解析处理数据,JDBC批量入MySQL库。
DataFrame的方式JDBC批量导入MySQL的地址:https://blog.csdn.net/qq_33792843/article/details/91518940
简化的Hdfs上的一条的原始数据:
{“header”: {“device_id”: “CBI9SMY9ZP”,“type”: “102”,“device_time”: “1564453420207”},
“body”: “[{“aStatus”:“INSTALL”,“apkName”:“移动网络配置”,“createTime”:1563854212000,“icon”:”",
“packageName”:“com.android.providers.telephony”,“updateTime”:1563854212000,“version”:“5.1.1-20190723.094502”,“versionCode”:22},{“aStatus”:“INSTALL”,“apkName”:“日历存储”,“createTime”:1563854209000,“icon”:"",
“packageName”:“com.android.providers.calendar”,“updateTime”:1563854209000,“version”:“5.1.1-20190723.094502”,“versionCode”:22}]"
}
主类代码:
package com.mhl.test
import java.sql.{DriverManager, PreparedStatement, SQLException, Timestamp}
import java.text.SimpleDateFormat
import java.util
import java.util.Date
import com.mhl.utils.{IdWorker, User, Utils}
import com.mysql.jdbc.Connection
import org.apache.commons.lang3.StringUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.storage.StorageLevel
import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods.parse
object Mhl_Device_App_Version {
def main(args: Array[String]): Unit = {
System.setProperty("HADOOP_USER_NAME","hdfs")
var day="2019-07-30"
/**
* 1 、参数验证
* 传入参数是天,所以args.length要为1
*/
if (args.length >= 1) {
day =args(0)
if (args.length >= 2) {
args(1).trim
} else {
""
}
} else {
println("======day is required!")
System.exit(1)
""
}
println(s"======run day=$day !")
//2、初始化程序的入口
val spark =SparkSession.builder()
.appName(Mhl_Device_App_Version.getClass.getSimpleName)
.master("local")
.config("spark.testing.memory","471859200")
.config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
.config("spark.yarn.executor.memoryOverhead","1024")
.config("spark.default.parallelism","26")
.getOrCreate()
spark.sparkContext.setLogLevel("WARN")
val application =spark.sparkContext.applicationId
val inputPath ="hdfs://mhl/mhl/flume/logdfs/hdfs_102"+"/"+s"$day"+"/"+"*"
val logRDD: RDD[String] = spark.sparkContext.textFile(inputPath).persist(StorageLevel.MEMORY_AND_DISK)
val res = logRDD.count()
println(res)
logRDD.foreachPartition(it=>{
/**
* jdbc连接
*/
val driver: String = "com.mysql.jdbc.Driver"
val url: String = "jdbc:mysql://172.20.1.15:3306/mhl_advert"
val username: String = "user_big_wirte"
val password: String = "mhlslave12*"
var conn: Connection = null
var ps :PreparedStatement=null
try {
//classLoader,加载对应驱动
Class.forName(driver)
conn = DriverManager.getConnection(url, username, password).asInstanceOf[ Connection ]
conn.setAutoCommit(false)
val sql = " REPLACE INTO device_app_version (id,package_name,label,create_time,update_time,serial_number,version_name,version_code) VALUES (?,?,?,?,?,?,?,?)"
//批量导入ps一定要写在循环外面
ps = conn.prepareStatement(sql)
it.foreach(x=>{
if (StringUtils.isNotEmpty(x)) {
val device_id = parseDevice_id(x)
val users: util.ArrayList[User] = parseBody(x)
for (i <- 0 to users.size() - 1) {
ps.setLong(1, new IdWorker().nextId().toLong)
ps.setString(2, users.get(i).getPackageName)
ps.setString(3, null)
ps.setTimestamp(4, tranTimeToDate(users.get(i).getCreateTime))
ps.setTimestamp(5, tranTimeToDate(users.get(i).getUpdateTime))
ps.setString(6, device_id)
ps.setString(7, users.get(i).getVersion)
ps.setString(8, users.get(i).getVersionCode)
ps.addBatch()
println(users.get(i).getPackageName + "," + users.get(i).getCreateTime)
}
//Batch写在循环外面
ps.executeBatch()
conn.commit()
ps.clearBatch()
}else{
println("======Data is Empty !")
}
})
} catch {
case e: ClassNotFoundException =>
System.err.println("======need driver class !")
case e: SQLException =>
e.printStackTrace()
}
finally {
if (ps != null) {
ps.close()
}
if (conn != null) {
conn.close()
}
}
})
System.err.println("======Load Data Into MySQL Success!")
spark.stop()
}
/**
* 解析device_id
* @param line
* @return
*/
def parseDevice_id(line:String):String={
if (StringUtils.isNotEmpty(line)) {
val jValue = (parse(line) \ “header” \ “device_id”)
implicit var format = DefaultFormats
//返回的device_id的值
jValue.extract[String]
}else{
println(s"======primary data is exception !")
null
}
}
/**
* 解析device_time
* @param line
* @return
*/
def parseDevice_time(line:String):String={
if (StringUtils.isNotEmpty(line)) {
val jV = (parse(line) \ “header” \ “device_time”)
implicit var format = DefaultFormats
//返回的device_id的值
jV.extract[String]
}else{
println(s"======primary data is exception !")
null
}
}
/**
* 解析body 取id、package_name、label、create_time、update_time、version、version_code
* @param line
* @return
*/
def parseBody(line:String): util.ArrayList[ User ]={
implicit var format = DefaultFormats
val body: String = (parse(line) \ "body").extract[String]
Utils.parseJsonToRDD(body)
}
/**
* 将时间转成Timestamp类型---------因为:MySQL中的datatime类型对应Java中的Timestamp类型
* @param tm
* @return
*/
def tranTimeToDate(tm:String) :java.sql.Timestamp= {
val fm = new SimpleDateFormat(“yyyy-MM-dd HH:mm:ss”)
val tim: String = fm.format(new Date(tm.toLong))
val date: Date = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").parse(tim)
val ts = new Timestamp(date.getTime());
ts
}
}
注意:代码中使用prepareStatement接口来实现SQL语句的预编译时,如果是使用批量的方式导入MySQL,那么批量导入的数据的格式一定要相同。
方法bean类:
package com.mhl.utils;
public class User {
private String aStatus;
private String apkName;
private String createTime;
private String icon;
private String packageName;
private String updateTime;
private String version;
private String versionCode;
public String getaStatus() {
return aStatus;
}
public void setaStatus(String aStatus) {
this.aStatus = aStatus;
}
public String getApkName() {
return apkName;
}
public void setApkName(String apkName) {
this.apkName = apkName;
}
public String getCreateTime() {
return createTime;
}
public void setCreateTime(String createTime) {
this.createTime = createTime;
}
public String getIcon() {
return icon;
}
public void setIcon(String icon) {
this.icon = icon;
}
public String getPackageName() {
return packageName;
}
public void setPackageName(String packageName) {
this.packageName = packageName;
}
public String getUpdateTime() {
return updateTime;
}
public void setUpdateTime(String updateTime) {
this.updateTime = updateTime;
}
public String getVersion() {
return version;
}
public void setVersion(String version) {
this.version = version;
}
public String getVersionCode() {
return versionCode;
}
public void setVersionCode(String versionCode) {
this.versionCode = versionCode;
}
}
Utils方法类:
package com.mhl.utils;
import com.google.gson.Gson;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonParser;
import java.util.ArrayList;
import java.util.List;
public class Utils {
public static String parseJson(String line) {
IdWorker ik = new IdWorker();
Gson gson = new Gson();//创建Gson对象
JsonParser jsonParser = new JsonParser();
//获取JsonArray对象
JsonArray jsonElements = jsonParser.parse(line).getAsJsonArray();
ArrayList<User> users = new ArrayList<>();
for (JsonElement user : jsonElements) {
User user1 = gson.fromJson(user, User.class);//解析
users.add(user1);
}
List<String> lis = new ArrayList<>();
/* System.out.println(users.size());
System.out.println(users.get(1).getPackageName());
System.out.println(users.get(1).getCreateTime());
System.out.println(users.get(1).getUpdateTime());
System.out.println(users.get(1).getVersion());
System.out.println(users.get(1).getVersionCode());*/
return ik.nextId()+","+users.get(1).getPackageName()+","+null+","+users.get(1).getCreateTime()+","+users.get(1).getUpdateTime()+","+users.get(1).getVersion()+","+users.get(1).getVersionCode();
/* for (int i=0;i<users.size();i++) {
return ik.nextId() + "," + users.get(i).getPackageName() + "," + null + "," + users.get(i).getCreateTime() + "," + users.get(i).getUpdateTime() + "," + users.get(i).getVersion() + "," + users.get(i).getVersionCode();
}
*/
}
/**
* 返回数组对象
* @param line
* @return
*/
public static ArrayList<User> parseJsonToRDD(String line) {
IdWorker ik = new IdWorker();
Gson gson = new Gson();//创建Gson对象
JsonParser jsonParser = new JsonParser();
//获取JsonArray对象
JsonArray jsonElements = jsonParser.parse(line).getAsJsonArray();
ArrayList<User> users = new ArrayList<>();
for (JsonElement user : jsonElements) {
User user1 = gson.fromJson(user, User.class);//解析
users.add(user1);
}
List<String> lis = new ArrayList<>();
/* System.out.println(users.size());
System.out.println(users.get(1).getPackageName());
System.out.println(users.get(1).getCreateTime());
System.out.println(users.get(1).getUpdateTime());
System.out.println(users.get(1).getVersion());
System.out.println(users.get(1).getVersionCode());*/
return users;
}
}
参考其他的博主的地址和实现方法:
批量插入:耗时930ms
public static void batchInsert() throws SQLException {
long start = System.currentTimeMillis();
Connection conn = getConnection();
conn.setAutoCommit(false);
PreparedStatement ps = null;
String sql = “insert into xxx values (?,‘1’,‘1’)”;
ps = conn.prepareStatement(sql); // 批量插入时ps对象必须放到for循环外面
for (int i=0;i < 50000;i++){
ps.setString(1, i+"");
ps.addBatch();
// 每1000条记录插入一次
if (i % 1000 == 0){
ps.executeBatch();
conn.commit();
ps.clearBatch();
}
}
// 剩余数量不足1000
ps.executeBatch();
conn.commit();
ps.clearBatch();
long end = System.currentTimeMillis();
System.out.println(end - start);
}