The previous posts covered how to read a MySQL table into a DataFrame and how to write results back to MySQL.
This example uses Spark SQL to extract user log data and turn it into a DataFrame, registered as the table user. A user configuration table is then loaded from MySQL and registered as userinfo, and the two tables are joined on the imei field to obtain the complete user information.
The environment is described at http://blog.csdn.net/zfszhangyuan/article/details/52593521
Spark 1.5.2 is used.
Here is the code:
package spark_sql

import java.text.SimpleDateFormat

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

import scala.util.matching.Regex

object SparkSql {

  case class user(imei: String, logtime: String, region: String)

  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setMaster("local[4]").setAppName("SparkSql")
    val sc = new SparkContext(sparkConf)
    //val ssc = new StreamingContext(sc, Seconds(10))
    val sqlContext = new SQLContext(sc)
    val data = sc.textFile(args(0))
    // drop records where type equals 3, and records with an empty, all-zero or Unknown imei
    val notContainsType3 = data
      .filter(!_.contains("\\\"type\\\":\\\"3\\\""))
      .filter(!_.contains("\\\"imei\\\":\\\"\\\""))
      .filter(!_.contains("000000000000000"))
      .filter(!_.contains("Unknown"))
    // keep only records that contain logid, imei and areacode
    val cleanData = notContainsType3
      .filter(_.contains("logid"))
      .filter(_.contains("imei"))
      .filter(_.contains("areacode"))
    val cleanMap = cleanData.map { line =>
      val data = formatLine(line).split(",")
      user(data(0), data(1), data(2))
    }
    import sqlContext.implicits._
    val dfuser = cleanMap.toDF()
    dfuser.registerTempTable("user")
    // val testuser = sqlContext.sql("select region, count(distinct imei) from user group by region order by count(distinct imei) desc")
    // testuser.collect().foreach(println)
    // the user configuration table in MySQL
    val jdbcDF = sqlContext.read.format("jdbc").options(
      Map(
        "url" -> "jdbc:mysql://localhost:3306/db_ldjs",
        "dbtable" -> "(select imei,region,city,company,name from tb_user_imei) as some_alias",
        "driver" -> "com.mysql.jdbc.Driver",
        "user" -> "root",
        //"partitionColumn" -> "day_id",
        "lowerBound" -> "0",
        "upperBound" -> "1000",
        //"numPartitions" -> "2",
        "fetchSize" -> "100",
        "password" -> "123456")).load()
    jdbcDF.registerTempTable("userinfo")
    // dfuser.printSchema()
    // jdbcDF.printSchema()
    // dfuser.show()
    // jdbcDF.show()
    val testuser = sqlContext.sql("select A.imei, A.logtime, A.region, B.name, B.city, B.company from userinfo as B left join user as A on A.imei = B.imei")
    testuser.collect().foreach(println)
  }

  /**
   * Parse imei, log time and region from a single log line.
   */
  def formatLine(line: String): String = {
    val logIdRegex = """"logid":"([0-9]+)",""".r
    val imeiRegex = """\\"imei\\":\\"([A-Za-z0-9]+)\\"""".r
    val regionRegex = """"areacode":"([^A-Za-z0-9_]+)",""".r //""""areacode":"[\\u4e00-\\u9fa5]*"""".r
    val logId = getDataByPattern(logIdRegex, line)
    val imei = getDataByPattern(imeiRegex, line)
    val region = getDataByPattern(regionRegex, line)
    // the first 14 characters of logid form a yyyyMMddHHmmss timestamp, used as the log time (to the second)
    imei + "," + logId.substring(0, 14) + "," + region
  }

  /**
   * Return the first capture group matched by the given regex, or "NULL" if there is no match.
   */
  def getDataByPattern(p: Regex, line: String): String = {
    val result = p.findFirstMatchIn(line).map(_.group(1))
    result.getOrElse("NULL")
  }

  /**
   * Parse a yyyyMMddHHmmss time string into epoch time in seconds.
   */
  def getTimeByString(timeString: String): Long = {
    val sf: SimpleDateFormat = new SimpleDateFormat("yyyyMMddHHmmss")
    sf.parse(timeString).getTime / 1000
  }
}
Running the program prints the joined user rows to the console.
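If you want to persist the joined result instead of just printing it, the earlier posts on writing a DataFrame to MySQL apply directly. As a rough sketch of my own (not part of this example; the target table tb_user_joined is hypothetical):
// Sketch only: write the testuser DataFrame from the join above back to MySQL.
import java.util.Properties
val props = new Properties()
props.setProperty("user", "root")
props.setProperty("password", "123456")
props.setProperty("driver", "com.mysql.jdbc.Driver")
testuser.write
  .mode("append") // append if the hypothetical table already exists
  .jdbc("jdbc:mysql://localhost:3306/db_ldjs", "tb_user_joined", props)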
A few remarks on the experience of using this:
1. Spark SQL is clearly not as pleasant to use as Hsql, mainly because you cannot debug the SQL: when the SQL string itself is wrong, the reported error can be completely unrelated to the real cause, which is very hard to trace for someone new to it (I once wrote a broken SQL statement and spent a long time chasing an error message that had nothing to do with the actual mistake). One way to surface such mistakes earlier is to use the DataFrame API instead; see the sketch after this list.
2. This example shows that when doing data analysis with Spark, relatively static user configuration tables can stay in MySQL, and SQL can then be run across data platforms in this way, which is very practical.
3. The commented-out statements below print the schema and the contents of both tables, which makes it easy to check whether the SQL and the table construction are correct:
// dfuser.printSchema()
// jdbcDF.printSchema()
// dfuser.show()
// jdbcDF.show()
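Regarding point 1, here is a sketch of my own (not from the original code): the same left join expressed with the Spark 1.5 DataFrame API instead of an SQL string, so that a misspelled column name fails right where it is written.
// Same left join as the SQL above, using the dfuser and jdbcDF DataFrames already defined.
val joined = jdbcDF
  .join(dfuser, jdbcDF("imei") === dfuser("imei"), "left_outer")
  .select(dfuser("imei"), dfuser("logtime"), dfuser("region"),
          jdbcDF("name"), jdbcDF("city"), jdbcDF("company"))
joined.show()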
I forgot to include the raw log; here are two sample lines:
2016-04-18 16:00:00 {"countAll":0,"countCorrect":0,"logid":"201604181600001605286233","requestip":"36.23.153.219","requesttime":"2016-04-18 16:00:00","requesttype":"0"}
2016-04-18 16:00:00 {"areacode":"浙江省丽水市","countAll":0,"countCorrect":0,"datatime":"4134362","logid":"201604181600001184409476","requestinfo":"{\"sign\":\"4\",\"timestamp\":\"1460966390499\",\"remark\":\"4\",\"subjectPro\":\"123456\",\"interfaceUserName\":\"12345678900987654321\",\"channelno\":\"100\",\"imei\":\"12345678900987654321\",\"subjectNum\":\"13989589062\",\"imsi\":\"1234567