转自:http://blog.csdn.net/zfszhangyuan/article/details/52702238
前面几篇博客说了如何读取mysql数据库中的表到DataFrame中以及如何将结果写入到mysql中
今天这个实例主要实现应用sparksql完成用户日志数据的提取并转换成DataFrame(我们将其定义为表 user)另外我们要从mysql数据库中load一个用户配置表(这里定义为userinfo)我们将这两个表根据imei号进行join获得用户完整的信息数据
具体的环境见http://blog.csdn.net/zfszhangyuan/article/details/52593521
spark用的是1.5.2版本
看代码吧
- package spark_sql
- import java.text.SimpleDateFormat
- import org.apache.spark.{SparkConf, SparkContext}
- import org.apache.spark.sql.SQLContext
- import scala.util.matching.Regex;
- object SparkSql {
- case class user( imei:String,logtime:String, region:String)
- def main(args: Array[String]) {
- val sparkConf = new SparkConf().setMaster("local[4]").setAppName("SparkSql")
- val sc = new SparkContext(sparkConf)
- //val ssc = new StreamingContext(sc, Seconds(10))
- val sqlContext = new SQLContext(sc)
- val data = sc.textFile(args(0))
- //剔除type等于3的数据
- val notContainsType3 = data.filter(!_.contains("\\\"type\\\":\\\"3\\\"")).filter(!_.contains("\\\"imei\\\":\\\"\\\"")).filter(!_.contains("000000000000000")).filter(!_.contains("Unknown"))
- //过滤logid或imei不存在的数据
- val cleanData = notContainsType3.filter(_.contains("logid")).filter(_.contains("imei")).filter(_.contains("areacode"))
- val cleanMap = cleanData.map {
- line =>
- val data = formatLine(line).split(",")
- user(data(0),data(1),data(2))
- }
- import sqlContext.implicits._
- val dfuser= cleanMap.toDF()
- dfuser.registerTempTable("user")
- // val testuser=sqlContext.sql("select region ,count(distinct imei) from user group by region order by count(distinct imei) desc ")
- //testuser.collect().foreach(println)
- //mysql中的配置表
- val jdbcDF = sqlContext.read.format("jdbc").options(
- Map("url"->"jdbc:mysql://localhost:3306/db_ldjs",
- "dbtable"->"(select imei,region,city,company,name from tb_user_imei) as some_alias",
- "driver"->"com.mysql.jdbc.Driver",
- "user"-> "root",
- //"partitionColumn"->"day_id",
- "lowerBound"->"0",
- "upperBound"-> "1000",
- //"numPartitions"->"2",
- "fetchSize"->"100",
- "password"->"123456")).load()
- jdbcDF.registerTempTable("userinfo")
- // dfuser.printSchema()
- // jdbcDF.printSchema()
- // dfuser.show()
- // jdbcDF.show()
- val testuser=sqlContext.sql("select A.imei,A.logtime,A.region,B.name,B.city,B.company from userinfo as B left join user as A on A.imei=B.imei ")
- testuser.collect().foreach(println)
- //
- }
- /**
- * 从每行日志解析出imei和logid
- *
- **/
- def formatLine(line: String): String = {
- val logIdRegex = """"logid":"([0-9]+)",""".r
- val imeiRegex = """\\"imei\\":\\"([A-Za-z0-9]+)\\"""".r
- val regionRegex = """"areacode":"([^A-Za-z0-9_]+)",""".r //""""areacode":"[\\u4e00-\\u9fa5]*"""".r
- val logId = getDataByPattern(logIdRegex, line)
- val imei = getDataByPattern(imeiRegex, line)
- val region = getDataByPattern(regionRegex, line)
- //时间取到秒
- imei + "," + logId.substring(0, 14)+ "," + region
- }
- /**
- * 根据正则表达式,查找相应值
- *
- **/
- def getDataByPattern(p: Regex, line: String): String = {
- val result = (p.findFirstMatchIn(line)).map(item => {
- val s = item group 1
- s
- })
- result.getOrElse("NULL")
- }
- /**
- * 根据时间字符串获取时间,单位(秒)
- *
- **/
- def getTimeByString(timeString: String): Long = {
- val sf: SimpleDateFormat = new SimpleDateFormat("yyyyMMddHHmmss")
- sf.parse(timeString).getTime / 1000
- }
- }
这里主要说下使用的感受:
1.sparksql用起来感觉肯定没有Hsql那么好用了,主要是不能调试,有时候自己的sql写错了,报出来的错误,如果是外行根本无法发现错误的根源(我就是犯过sql写错了,结果报的错误不搭嘎,调试了半天)
2.这个实例证明我们在用spark做数据分析时,一些不变的用户配置表可以放mysql中,完全通过这种新形式实现跨数据平台运行sql(这个很实用)
3.上面提供了一下语句打印表结构和表数据方便检查sql和表数据构建是否写的有问题。
- // dfuser.printSchema()
- // jdbcDF.printSchema()
- // dfuser.show()
- // jdbcDF.show()
- 2016-04-18 16:00:00 {"countAll":0,"countCorrect":0,"logid":"201604181600001605286233","requestip":"36.23.153.219","requesttime":"2016-04-18 16:00:00","requesttype":"0"}
- 2016-04-18 16:00:00 {"areacode":"浙江省丽水市","countAll":0,"countCorrect":0,"datatime":"4134362","logid":"201604181600001184409476","requestinfo":"{\"sign\":\"4\",\"timestamp\":\"1460966390499\",\"remark\":\"4\",\"subjectPro\":\"123456\",\"interfaceUserName\":\"12345678900987654321\",\"channelno\":\"100\",\"imei\":\"12345678900987654321\",\"subjectNum\":\"13989589062\",\"imsi\":\"12345678900987654321\",\"queryNum\":\"13989589062\"}","requestip":"36.16.128.234","requesttime":"2016-04-18 16:00:00","requesttype":"0","responsecode":"010005","responsedata":"无查询结果"}
- 2016-04-18 16:00:00 {"areacode":"宁夏银川市","countAll":0,"countCorrect":0,"datatime":"4715990","logid":"201604181600001858043208","requestinfo":"{\"sign\":\"4\",\"timestamp\":\"1460966400120\",\"remark\":\"4\",\"subjectPro\":\"123456\",\"interfaceUserName\":\"12345678900987654321\",\"channelno\":\"1210\",\"imei\":\"A0000044ABFD25\",\"subjectNum\":\"15379681917\",\"imsi\":\"460036951451601\",\"queryNum\":\"\"}","requestip":"115.168.93.87","requesttime":"2016-04-18 16:00:00","requesttype":"0","responsecode":"010005","responsedata":"无查询结果","userAgent":"ZTE-Me/Mobile"}
- 2016-04-18 16:00:00 {"areacode":"黑龙江省哈尔滨市","countAll":0,"countCorrect":0,"datatime":"5369561","logid":"201604181600001068429609","requestinfo":"{\"interfaceUserName\":\"12345678900987654321\",\"queryNum\":\"\",\"timestamp\":\"1460966400139\",\"sign\":\"4\",\"imsi\":\"460030301212545\",\"imei\":\"35460207765269\",\"subjectNum\":\"55588237\",\"subjectPro\":\"123456\",\"remark\":\"4\",\"channelno\":\"2100\"}","requestip":"42.184.41.180","requesttime":"2016-04-18 16:00:00","requesttype":"0","responsecode":"010005","responsedata":"无查询结果"}
- 2016-04-18 16:00:00 {"areacode":"浙江省丽水市","countAll":0,"countCorrect":0,"datatime":"4003096","logid":"201604181600001648238807","requestinfo":"{\"sign\":\"4\",\"timestamp\":\"1460966391025\",\"remark\":\"4\",\"subjectPro\":\"123456\",\"interfaceUserName\":\"12345678900987654321\",\"channelno\":\"100\",\"imei\":\"12345678900987654321\",\"subjectNum\":\"13989589062\",\"imsi\":\"12345678900987654321\",\"queryNum\":\"13989589062\"}","requestip":"36.16.128.234","requesttime":"2016-04-18 16:00:00","requesttype":"0","responsecode":"010005","responsedata":"无查询结果"}
- 2016-04-18 16:00:00 {"areacode":"广西南宁市","countAll":0,"countCorrect":0,"datatime":"4047993","logid":"201604181600001570024205","requestinfo":"{\"sign\":\"4\",\"timestamp\":\"1460966382871\",\"remark\":\"4\",\"subjectPro\":\"123456\",\"interfaceUserName\":\"12345678900987654321\",\"channelno\":\"1006\",\"imei\":\"A000004853168C\",\"subjectNum\":\"07765232589\",\"imsi\":\"460031210400007\",\"queryNum\":\"13317810717\"}","requestip":"219.159.72.3","requesttime":"2016-04-18 16:00:00","requesttype":"0","responsecode":"010005","responsedata":"无查询结果"}
- 2016-04-18 16:00:00 {"areacode":"海南省五指山市","countAll":0,"countCorrect":0,"datatime":"5164117","logid":"201604181600001227842048","requestinfo":"{\"sign\":\"4\",\"timestamp\":\"1460966399159\",\"remark\":\"4\",\"subjectPro\":\"123456\",\"interfaceUserName\":\"12345678900987654321\",\"channelno\":\"1017\",\"imei\":\"A000005543AFB7\",\"subjectNum\":\"089836329061\",\"imsi\":\"460036380954376\",\"queryNum\":\"13389875751\"}","requestip":"140.240.171.71","requesttime":"2016-04-18 16:00:00","requesttype":"0","responsecode":"010005","responsedata":"无查询结果"}
- 2016-04-18 16:00:00 {"areacode":"山西省","countAll":0,"countCorrect":0,"datatime":"14075772","logid":"201604181600001284030648","requestinfo":"{\"sign\":\"4\",\"timestamp\":\"1460966400332\",\"remark\":\"4\",\"subjectPro\":\"123456\",\"interfaceUserName\":\"12345678900987654321\",\"channelno\":\"1006\",\"imei\":\"A000004FE0218A\",\"subjectNum\":\"03514043633\",\"imsi\":\"460037471517070\",\"queryNum\":\"\"}","requestip":"1.68.5.227","requesttime":"2016-04-18 16:00:00","requesttype":"0","responsecode":"010005","responsedata":"无查询结果"}
- 2016-04-18 16:00:00 {"areacode":"四川省","countAll":0,"countCorrect":0,"datatime":"6270982","logid":"201604181600001173504863","requestinfo":"{\"sign\":\"4\",\"timestamp\":\"1460966398896\",\"remark\":\"4\",\"subjectPro\":\"123456\",\"interfaceUserName\":\"12345678900987654321\",\"channelno\":\"100\",\"imei\":\"12345678900987654321\",\"subjectNum\":\"13666231300\",\"imsi\":\"12345678900987654321\",\"queryNum\":\"13666231300\"}","requestip":"182.144.66.97","requesttime":"2016-04-18 16:00:00","requesttype":"0","responsecode":"010005","responsedata":"无查询结果"}
- 2016-04-18 16:00:00 {"areacode":"浙江省","countAll":0,"countCorrect":0,"datatime":"4198522","logid":"201604181600001390637240","requestinfo":"{\"sign\":\"4\",\"timestamp\":\"1460966399464\",\"remark\":\"4\",\"subjectPro\":\"123456\",\"interfaceUserName\":\"12345678900987654321\",\"channelno\":\"100\",\"imei\":\"12345678900987654321\",\"subjectNum\":\"05533876327\",\"imsi\":\"12345678900987654321\",\"queryNum\":\"05533876327\"}","requestip":"36.23.9.49","requesttime":"2016-04-18 16:00:00","requesttype":"0","responsecode":"000000","responsedata":"操作成功"}
- 2016-04-18 16:00:00 {"areacode":"江苏省连云港市","countAll":0,"countCorrect":0,"datatime":"4408097","logid":"201604181600001249944032","requestinfo":"{\"sign\":\"4\",\"timestamp\":\"1460966395908\",\"remark\":\"4\",\"subjectPro\":\"123456\",\"interfaceUserName\":\"12345678900987654321\",\"channelno\":\"100\",\"imei\":\"12345678900987654321\",\"subjectNum\":\"18361451463\",\"imsi\":\"12345678900987654321\",\"queryNum\":\"18361451463\"}","requestip":"58.223.4.210","requesttime":"2016-04-18 16:00:00","requesttype":"0","responsecode":"010005","responsedata":"无查询结果"}
- 2016-04-18 16:00:00 {"areacode":"浙江省","countAll":0,"countCorrect":0,"datatime":"5154518","logid":"201604181600001714496463","requestinfo":"{\"sign\":\"4\",\"timestamp\":\"1460966399474\",\"remark\":\"4\",\"subjectPro\":\"123456\",\"interfaceUserName\":\"12345678900987654321\",\"channelno\":\"100\",\"imei\":\"12345678900987654321\",\"subjectNum\":\"05533876327\",\"imsi\":\"12345678900987654321\",\"queryNum\":\"05533876327\"}","requestip":"36.23.9.49","requesttime":"2016-04-18 16:00:00","requesttype":"0","responsecode":"000000","responsedata":"操作成功"}
- 2016-04-18 16:00:00 {"areacode":"浙江省","countAll":0,"countCorrect":0,"datatime":"4761269","logid":"201604181600001187577136","requestinfo":"{\"sign\":\"4\",\"timestamp\":\"1460966400191\",\"remark\":\"4\",\"subjectPro\":\"123456\",\"interfaceUserName\":\"12345678900987654321\",\"channelno\":\"100\",\"imei\":\"12345678900987654321\",\"subjectNum\":\"057427895481\",\"imsi\":\"12345678900987654321\",\"queryNum\":\"057427895481\"}","requestip":"36.23.153.219","requesttime":"2016-04-18 16:00:00","requesttype":"0","responsecode":"010005","responsedata":"无查询结果"}
- 2016-04-18 16:00:00 {"areacode":"河北省廊坊市","countAll":0,"countCorrect":0,"datatime":"75408665","logid":"201604181600001020722122","requestinfo":"{\"subjectNum\":\"13582968216\",\"imsi\":\"460031298611058\",\"queryNum\":\"18033684000\",\"channelno\":\"100\",\"imei\":\"99000586096233\"}","requestip":"110.251.61.62","requesttime":"2016-04-18 16:00:00","requesttype":"28","responsecode":"010005","responsedata":"查询结果为空"}
- 2016-04-18 16:00:00 {"areacode":"贵州省黔西南州兴义市","countAll":0,"countCorrect":0,"datatime":"4586950","logid":"201604181600001499837763","requestinfo":"{\"sign\":\"4\",\"timestamp\":\"1460966398600\",\"remark\":\"4\",\"subjectPro\":\"123456\",\"interfaceUserName\":\"12345678900987654321\",\"channelno\":\"1006\",\"imei\":\"865707029710377\",\"subjectNum\":\"509\",\"imsi\":\"460025864693571\",\"queryNum\":\"\"}","requestip":"111.85.45.172","requesttime":"2016-04-18 16:00:00","requesttype":"0","responsecode":"010005","responsedata":"无查询结果"}
- 2016-04-18 16:00:00 {"areacode":"云南省昆明市","countAll":0,"countCorrect":0,"datatime":"4441961","logid":"201604181600001794147521","requestinfo":"{\"interfaceUserName\":\"12345678900987654321\",\"queryNum\":\"13618922555\",\"timestamp\":\"1460966401214\",\"sign\":\"4\",\"imsi\":\"12345678900987654321\",\"imei\":\"12345678900987654321\",\"subjectNum\":\"13618922555\",\"subjectPro\":\"123456\",\"remark\":\"4\",\"channelno\":\"100\"}","requestip":"113.63.132.128","requesttime":"2016-04-18 16:00:00","requesttype":"0","responsecode":"010005","responsedata":"无查询结果"}
- 2016-04-18 16:00:00 {"areacode":"江苏省连云港市","countAll":0,"countCorrect":0,"datatime":"4186305","logid":"201604181600001175993827","requestinfo":"{\"sign\":\"4\",\"timestamp\":\"1460966397309\",\"remark\":\"4\",\"subjectPro\":\"123456\",\"interfaceUserName\":\"12345678900987654321\",\"channelno\":\"100\",\"imei\":\"12345678900987654321\",\"subjectNum\":\"18361451463\",\"imsi\":\"12345678900987654321\",\"queryNum\":\"18361451463\"}","requestip":"58.223.4.210","requesttime":"2016-04-18 16:00:00","requesttype":"0","responsecode":"010005","responsedata":"无查询结果"}
- 2016-04-18 16:00:00 {"areacode":"江苏省","countAll":0,"countCorrect":0,"datatime":"4103662","logid":"201604181600001051944754","requestinfo":"{\"sign\":\"4\",\"timestamp\":\"1460966399642\",\"remark\":\"4\",\"subjectPro\":\"123456\",\"interfaceUserName\":\"12345678900987654321\",\"channelno\":\"1006\",\"imei\":\"a0000059788b71\",\"subjectNum\":\"768\",\"imsi\":\"460036660539168\",\"queryNum\":\"\"}","requestip":"180.98.180.95","requesttime":"2016-04-18 16:00:00","requesttype":"0","responsecode":"010005","responsedata":"无查询结果"}
- 2016-04-18 16:00:00 {"areacode":"山西省","countAll":0,"countCorrect":0,"datatime":"4247256","logid":"201604181600001013319164","requestinfo":"{\"sign\":\"4\",\"timestamp\":\"1460966400334\",\"remark\":\"4\",\"subjectPro\":\"123456\",\"interfaceUserName\":\"12345678900987654321\",\"channelno\":\"1006\",\"imei\":\"A000004FE0218A\",\"subjectNum\":\"03514043633\",\"imsi\":\"460037471517070\",\"queryNum\":\"\"}","requestip":"1.68.5.227","requesttime":"2016-04-18 16:00:00","requesttype":"0","responsecode":"010005","responsedata":"无查询结果"}
- 2016-04-18 16:00:00 {"areacode":"北京市","countAll":0,"countCorrect":0,"datatime":"5401532","logid":"201604181600001469644300","requestinfo":"{\"sign\":\"4\",\"timestamp\":\"1460966399603\",\"remark\":\"4\",\"subjectPro\":\"123456\",\"interfaceUserName\":\"12345678900987654321\",\"channelno\":\"100\",\"imei\":\"12345678900987654321\",\"subjectNum\":\"4001004259\",\"imsi\":\"12345678900987654321\",\"queryNum\":\"\"}","requestip":"106.121.0.143","requesttime":"2016-04-18 16:00:00","requesttype":"0","responsecode":"010005","responsedata":"无查询结果"}
- 2016-04-18 16:00:00 {"areacode":"北京市","countAll":0,"countCorrect":0,"datatime":"4876709","logid":"201604181600001476349766","requestinfo":"{\"sign\":\"4\",\"timestamp\":\"1460966399603\",\"remark\":\"4\",\"subjectPro\":\"123456\",\"interfaceUserName\":\"12345678900987654321\",\"channelno\":\"100\",\"imei\":\"12345678900987654321\",\"subjectNum\":\"4001004259\",\"imsi\":\"12345678900987654321\",\"queryNum\":\"\"}","requestip":"106.121.0.143","requesttime":"2016-04-18 16:00:00","requesttype":"0","responsecode":"010005","responsedata":"无查询结果"}
- 2016-04-18 16:00:00 {"areacode":"江苏省连云港市","countAll":0,"countCorrect":0,"datatime":"4498474","logid":"201604181600001508125886","requestinfo":"{\"sign\":\"4\",\"timestamp\":\"1460966397987\",\"remark\":\"4\",\"subjectPro\":\"123456\",\"interfaceUserName\":\"12345678900987654321\",\"channelno\":\"100\",\"imei\":\"12345678900987654321\",\"subjectNum\":\"18361451463\",\"imsi\":\"12345678900987654321\",\"queryNum\":\"18361451463\"}","requestip":"58.223.4.210","requesttime":"2016-04-18 16:00:00","requesttype":"0","responsecode":"010005","responsedata":"无查询结果"}
- 2016-04-18 16:00:00 {"areacode":"江苏省连云港市","countAll":0,"countCorrect":0,"datatime":"4318254","logid":"201604181600001766447939","requestinfo":"{\"subjectNum\":\"66699\",\"imsi\":\"460036611592505\",\"queryNum\":\"\",\"channelno\":\"100\",\"imei\":\"A00000457ECC28\"}","requestip":"58.223.4.210","requesttime":"2016-04-18 16:00:00","requesttype":"28","responsecode":"000000","responsedata":"操作成功"}
- 2016-04-18 16:00:00 {"areacode":"江西省南昌市","countAll":0,"countCorrect":0,"datatime":"244260927","logid":"201604181559591112708085","requestinfo":"{\"sign\":\"4\",\"timestamp\":\"1460966400525\",\"remark\":\"4\",\"subjectPro\":\"123456\",\"interfaceUserName\":\"12345678900987654321\",\"channelno\":\"1006\",\"imei\":\"a000004f883c2e\",\"subjectNum\":\"813161\",\"imsi\":\"460031392055476\",\"queryNum\":\"\"}","requestip":"182.97.149.145","requesttime":"2016-04-18 15:59:59","requesttype":"0","responsecode":"010005","responsedata":"无查询结果","userAgent":"Dalvik/1.6.0 (Linux; U; Android 4.4.2; HUAWEI P7-L09 Build/HuaweiP7-L09)"}
- 2016-04-18 16:00:00 {"areacode":"上海市黄浦区","countAll":0,"countCorrect":0,"datatime":"4657170","logid":"201604181600001303952983","requestinfo":"{\"interfaceUserName\":\"12345678900987654321\",\"queryNum\":\"\",\"timestamp\":\"1460966400444\",\"sign\":\"4\",\"imei\":\"a000005901fef3\",\"subjectNum\":\"4235\",\"subjectPro\":\"123456\",\"remark\":\"4\",\"channelno\":\"9000\"}","requestip":"124.74.160.162","requesttime":"2016-04-18 16:00:00","requesttype":"0","responsecode":"010005","responsedata":"无查询结果","userAgent":"Dalvik/2.1.0 (Linux; U; Android 6.0; HUAWEI CRR-CL00 Build/HUAWEICRR-CL00)"}
- 2016-04-18 16:00:00 {"areacode":"江西省南昌市","countAll":0,"countCorrect":0,"datatime":"252676235","logid":"201604181559591152287931","requestinfo":"{\"sign\":\"4\",\"timestamp\":\"1460966400399\",\"remark\":\"4\",\"subjectPro\":\"123456\",\"interfaceUserName\":\"12345678900987654321\",\"channelno\":\"1006\",\"imei\":\"a000004f883c2e\",\"subjectNum\":\"813161\",\"imsi\":\"460031392055476\",\"queryNum\":\"\"}","requestip":"182.97.149.145","requesttime":"2016-04-18 15:59:59","requesttype":"0","responsecode":"010005","responsedata":"无查询结果","userAgent":"Dalvik/1.6.0 (Linux; U; Android 4.4.2; HUAWEI P7-L09 Build/HuaweiP7-L09)"}
- 2016-04-18 16:00:00 {"areacode":"局域网","countAll":0,"countCorrect":0,"datatime":"5160006","logid":"201604181600001026793341","requestinfo":"{\"sign\":\"4\",\"timestamp\":\"1460966399352\",\"remark\":\"4\",\"subjectPro\":\"123456\",\"interfaceUserName\":\"12345678900987654321\",\"channelno\":\"1002\",\"imei\":\"A00000457ECC28\",\"subjectNum\":\"66699\",\"imsi\":\"460036611592505\",\"queryNum\":\"\"}","requestip":"10.55.80.187","requesttime":"2016-04-18 16:00:00","requesttype":"0","responsecode":"010005","responsedata":"无查询结果"}
- 2016-04-18 16:00:00 {"areacode":"江苏省","countAll":0,"countCorrect":0,"datatime":"245262271","logid":"201604181559591753547387","requestinfo":"{\"sign\":\"4\",\"timestamp\":\"1460966399846\",\"remark\":\"4\",\"subjectPro\":\"123456\",\"interfaceUserName\":\"12345678900987654321\",\"channelno\":\"1006\",\"imei\":\"A000004F661365\",\"subjectNum\":\"2336\",\"imsi\":\"460036580978572\",\"queryNum\":\"\"}","requestip":"180.98.187.27","requesttime":"2016-04-18 15:59:59","requesttype":"0","responsecode":"010005","responsedata":"无查询结果","userAgent":"Dalvik/1.6.0 (Linux; U; Android 4.4.2; HUAWEI C199 Build/HuaweiC199)"}
- 2016-04-18 16:00:00 {"countAll":0,"countCorrect":0,"logid":"201604181600001605286233","requestip":"36.23.153.219","requesttime":"2016-04-18 16:00:00","requesttype":"0"}
配置: