In Spark SQL, a lookup implemented with a broadcast variable and a UDF (user-defined function) usually executes more efficiently than joining the two tables directly, because a two-table join is relatively expensive. For contrast, the join-based version of the IP lookup would look roughly like the sketch below.
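A minimal sketch of the join approach, assuming two DataFrames already exist: accDF with an ipLong column and ipDF with start/end/province columns (these names are illustrative, not from the program below). The range condition forces a non-equi join, so every access record must be compared against many IP ranges instead of being resolved by a cheap broadcast lookup.
import org.apache.spark.sql.DataFrame

// Hypothetical join-based lookup: each access record is matched against
// the IP-range table with a non-equi (range) condition, then counted per province.
def provinceByJoin(accDF: DataFrame, ipDF: DataFrame): DataFrame = {
  accDF.join(ipDF,
      accDF("ipLong") >= ipDF("start") && accDF("ipLong") <= ipDF("end"))
    .groupBy("province")
    .count()
}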
The program below covers Spark SQL 2.x: reading the data, processing it, adding schema information, creating a temporary view, and running a SQL query, and it shows the API parameters for writing the SQL result to MySQL.
Configuration can also be passed in when the SparkSession is created:
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

// Use Kryo serialization and compress serialized RDD partitions
val conf = new SparkConf()
  .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
  .set("spark.rdd.compress", "true")

val session = SparkSession.builder()
  .master("local[*]")
  .appName("xx")
  .config(conf)
  .getOrCreate()
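When Kryo is enabled, the classes that will pass through it (for example the broadcast IP-range tuples used later) can also be registered on the SparkConf before the session is built. A minimal sketch; the class list here is an assumption:
// Register the classes Kryo will serialize (call this on conf before getOrCreate)
conf.registerKryoClasses(Array(
  classOf[Array[(Long, Long, String)]]  // the broadcast IP-range table (assumed)
))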
import java.util.Properties
import kaoshi.IPUtils
import org.apache.spark.SparkContext
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.{DataFrame, Dataset, SaveMode, SparkSession}
/**
  * Looks up the province for each IP in the access log via a broadcast
  * IP-range table and a UDF, then writes the per-province counts to MySQL.
  */
object IpMapJoin {
def main(args: Array[String]): Unit = {
val session: SparkSession = SparkSession.builder()
.master("local")
.appName("xx")
.getOrCreate()
import session.implicits._
//Read the IP-range rules file
val ipFile = session.read.textFile("ip.txt")
val ipData: Dataset[(Long, Long, String)] = ipFile.map({
line =>
val fields = line.split("[|]")
val start = fields(2).toLong
val end = fields(3).toLong
val province = fields(6)
(start, end, province)
})
//Collect the rules to the driver and broadcast them to all executors
val ipCollect: Array[(Long, Long, String)] = ipData.collect()
val ipBC: Broadcast[Array[(Long, Long, String)]] = session.sparkContext.broadcast(ipCollect)
//Read the access-log data
val accFile: Dataset[String] = session.read.textFile("access.log")
val accessData: Dataset[Long] = accFile.map({
line =>
val fields = line.split("[|]")
val ipStr = fields(1)
IPUtils.ip2Long(ipStr)
})
//Add schema information (the column name) and register a temporary view
val accDF: DataFrame = accessData.toDF("ipLong")
accDF.createTempView("v_acc")
//Register a UDF: it takes one argument (the IP as a Long) and returns one value (the province)
session.udf.register("searchip2Prov", (ipLong: Long) => {
val ipBCGet: Array[(Long, Long, String)] = ipBC.value
val index = IPUtils.serach(ipLong, ipBCGet)
var province = "unknown"
if (index != -1) {
province = ipBCGet(index)._3
}
province
})
val sql: DataFrame = session.sql("select searchip2Prov(ipLong) prov,count(*) cns from v_acc group by prov order by cns desc")
//Parameters needed to write the result out to MySQL
val url = "jdbc:mysql://localhost:3306/bbs?characterEncoding=utf-8"
val tname = "t_acc_Ip"
val conn = new Properties()
conn.setProperty("user", "root")
conn.setProperty("password", "123456")
conn.setProperty("driver", "com.mysql.jdbc.Driver")
//SaveMode has four options: Append, Overwrite, ErrorIfExists, Ignore
sql.write.mode(SaveMode.Ignore).jdbc(url, tname, conn)
session.close()
}
}
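The helper kaoshi.IPUtils is not shown in the original. Below is a minimal sketch of what it might look like: the method names ip2Long and serach are copied from the calls above, but the conversion and binary-search logic are assumptions.
object IPUtils {
  // Convert a dotted-decimal IPv4 string such as "1.2.3.4" to a Long
  def ip2Long(ip: String): Long =
    ip.split("[.]").foldLeft(0L)((acc, part) => acc * 256 + part.toLong)

  // Binary search over the (start, end, province) rules, which must be
  // sorted by start; returns the matching index, or -1 if no range matches
  def serach(ip: Long, rules: Array[(Long, Long, String)]): Int = {
    var low = 0
    var high = rules.length - 1
    while (low <= high) {
      val mid = (low + high) / 2
      val (start, end, _) = rules(mid)
      if (ip >= start && ip <= end) return mid
      else if (ip < start) high = mid - 1
      else low = mid + 1
    }
    -1
  }
}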
2.The spark-sql and MySQL JDBC driver jars must be on the classpath; a sample sbt declaration is sketched below.
3.Note how the UDF is registered: the function takes one argument (the IP as a Long) and returns one value (the province).
4.Data is written out through JDBC, and there are four save modes; their behaviour is sketched at the end.
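For item 2, a minimal sbt declaration of the two dependencies; the version numbers are assumptions and should match your cluster:
// build.sbt (versions are illustrative)
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-sql" % "2.3.0",
  "mysql" % "mysql-connector-java" % "5.1.47"
)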
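For item 4, a short sketch of how the four save modes behave when the target table already exists, reusing the url, tname, and conn values from the program above:
// Behaviour when the target table already exists:
//   Append        - add the new rows to the existing table
//   Overwrite     - drop the table and recreate it with the new result
//   ErrorIfExists - throw an exception (this is the default mode)
//   Ignore        - keep the existing table and silently skip the write
sql.write.mode(SaveMode.Append).jdbc(url, tname, conn)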