Solution:
Spark 1.6 has no built-in csv data source (csv only became a built-in format in Spark 2.0), so format("csv") fails unless the spark-csv package is on the classpath. Add the dependency below to the pom (Spark here is 1.6.0); note that the artifact's Scala suffix (spark-csv_2.10) must match your Scala version:
<!-- spark on hive: write CSV-format files -->
<dependency>
<groupId>com.databricks</groupId>
<artifactId>spark-csv_2.10</artifactId>
<version>1.4.0</version>
</dependency>
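If the job is submitted to YARN with spark-submit, an alternative is to pull the package at submit time instead of bundling it into the application jar. A sketch (the jar name here is a placeholder):

spark-submit --packages com.databricks:spark-csv_2.10:1.4.0 \
  --class com.joy.loganalysis.O2oApiInfoLogAnalysis logAnalysis.jar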
package com.joy.loganalysis
import java.sql.{Connection, DriverManager, PreparedStatement}
import org.apache.log4j.{Level, Logger}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode}
import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.mutable.ListBuffer
//import org.apache.spark.sql.hive.orc // ORC output requires importing org.apache.spark.sql.hive.orc
/**
* @Author: xqg
* @Date: 2019/9/5 14:54
* @Description:
*
* O2oApiInfoLogAnalysis: log parsing
*/
object O2oApiInfoLogAnalysis {
case class o2o_api_info(create_date: String, request_address: String, request_person: String, request_ip: String, param0: String,
content: String, business_area: String, city_code: String, brand_id: String, level: String,
level1: String, level2: String, level3: String, app_history: String, parent: String,
top_parent: String, store_id: String, sceening_category: String, product_id: String,
device_id: String, app_id: String, systematic_one_level: String)
case class brand(brand_id: String, brand_name: String, level2: String)
case class store(store_id: String, store_name: String, brand_id: String)
// Database table
case class O2oApiInfo_medical_oral2DB(create_date: String, date_type: String, stat_type: String, ct: String)
def main(args: Array[String]): Unit = {
// Suppress unnecessary framework logging in the terminal
Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
Logger.getLogger("org.apache.eclipse.jetty.server").setLevel(Level.OFF)
val fileUrl1 = "E:\\joyshebaoProject\\logAnalysis\\src\\data\\log\\20190923_o2o.log"
val saveFilePath1 = "E:\\joyshebaoProject\\logAnalysis\\src\\data\\log_analysis\\o2o_api_info"
val conf = new SparkConf().setAppName("O2oApiInfoLogAnalysis").setMaster("local")
// val conf = new SparkConf().setAppName("O2oApiInfoLogAnalysis")
val sc = new SparkContext(conf)
// sc.setLogLevel("ERROR")
val ssc = new SQLContext(sc)
// Implicit conversions (needed for .toDF())
import ssc.implicits._
/** Step 1: parse the JSON logs */
val json_log = InfoLogAnalysis(ssc, fileUrl1)
// json_log.show(false)
/** Step 2: parse the request arguments */
// val o2o_api_info_arguments: DataFrame = O2oApiInfoArgumentsAnalysis(ssc, json_log)
val o2o_api_info_arguments: DataFrame = O2oApiInfoArgumentsAnalysis(ssc, json_log, saveFilePath1)
// O2oApiInfoArgumentsAnalysis(ssc, json_log, saveFilePath1)
o2o_api_info_arguments.show(100, false)
/** Step 3: compute the metrics */
// Read back the saved intermediate data
val file = sc.textFile(saveFilePath1)
val lines = file.map(_.split("\\|")).filter(x => x.length >= 31)
val df = lines.map(t => o2o_api_info(t(0), t(3), t(4), t(5), t(6), t(9), t(10),
t(11), t(12), t(13), t(14),
t(15), t(16), t(20), t(21),
t(22), t(23), t(24), t(25),
t(28), t(29), t(30))).toDF()
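// The indices above follow the 32-column select list produced by
// O2oApiInfoArgumentsAnalysis (0 = create_date, ..., 30 = systematic_one_level,
// 31 = user_id); index 30 is the last one used, hence the length >= 31 filter.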
df.registerTempTable("o2o_api_info")
df.show()
/**
* Read the CSV data back using the custom delimiter
*/
println("--------------start-----------------")
val result: DataFrame = ssc.read.format("csv")
.option("header", "true")
.option("delimiter", "|")
.load(saveFilePath1)
result.show()
/** Inspect which endpoints appear */
// val request_address: DataFrame = LookRequestAddress(ssc, json_log)
// request_address.show(100, false)
// request_address.rdd.count()
sc.stop()
}
/**
* List all endpoints:
* selects the distinct request addresses found in the parsed log.
*/
def LookRequestAddress(ssc: SQLContext, json_log: DataFrame) = {
/** Register as a temporary table */
json_log.registerTempTable("request_address_info")
/**
* Inspect endpoint information
*/
val look_request_address: DataFrame = ssc.sql(
"""
|select
| distinct request_address -- request address
|from request_address_info
|where request_arguments is not null
""".stripMargin)
look_request_address
}
/**
* Step 2:
* Parse the request arguments and expose the extracted fields for later use.
*/
// def O2oApiInfoArgumentsAnalysis(ssc: SQLContext, json_log: DataFrame) = {
def O2oApiInfoArgumentsAnalysis(ssc: SQLContext, json_log: DataFrame, saveFilePath1: String) = {
/** Register as a temporary table */
json_log.registerTempTable("o2o_api_info_detail")
val o2o_api_info_arguments: DataFrame = ssc.sql(
"""
|select substring(create_date, 1, 10) as create_date
| -- ,message
| ,request_version -- request version
| ,method_describe -- method description
| ,request_address -- request address
| ,request_person -- requester
| ,split(request_ip, '\,')[0] AS request_ip -- request IP
| -- ,request_arguments -- raw request arguments
| ,get_json_object(request_arguments, '$.param0') AS param0
| ,get_json_object(request_arguments, '$.param0.lon') AS lon
| ,get_json_object(request_arguments, '$.param0.lat') AS lat
| ,get_json_object(request_arguments, '$.param0.content') AS content
| ,get_json_object(request_arguments, '$.param0.businessArea') AS business_area
| ,get_json_object(request_arguments, '$.param0.cityCode') AS city_code
| ,get_json_object(request_arguments, '$.param0.brandId') AS brand_id
| ,get_json_object(request_arguments, '$.param0.level') AS level
| ,get_json_object(request_arguments, '$.param0.level1') AS level1
| ,get_json_object(request_arguments, '$.param0.level2') AS level2
| ,get_json_object(request_arguments, '$.param0.level3') AS level3
| ,get_json_object(request_arguments, '$.param0.orderby') AS order_by
| ,get_json_object(request_arguments, '$.param0.pageNo') AS page_no
| ,get_json_object(request_arguments, '$.param0.pageSize') AS page_size
| ,get_json_object(request_arguments, '$.param0.appVersion') AS app_history
| ,get_json_object(request_arguments, '$.param0.parent') AS parent
| ,get_json_object(request_arguments, '$.param0.topParent') AS top_parent
| ,case when request_address in ('/app/storeInfo','/app/userViewProductSaleInfo') then get_json_object(request_arguments, '$.param3.storeId')
| when request_address = '/app/getRecommendedProductsByStoreId' then get_json_object(request_arguments, '$.param0.storeId') end AS store_id
| ,get_json_object(request_arguments, '$.param0.sceeningCategory') AS sceening_category
| ,case when request_address = '/app/productSaleInfo' then get_json_object(request_arguments, '$.param1.productId')
| when request_address = '/app/userViewProductSaleInfo' then get_json_object(request_arguments, '$.param3.productId') end AS product_id
| ,get_json_object(request_arguments, '$.param0.nearbyArea') AS nearby_area
| ,get_json_object(request_arguments, '$.param0.area') AS area
| ,case when request_address in ('/app/storeInfo','/app/userViewProductSaleInfo') then get_json_object(request_arguments, '$.param1') end AS device_id
| ,case when request_address in ('/app/storeInfo','/app/userViewProductSaleInfo') then get_json_object(request_arguments, '$.param0') end AS app_id
| ,get_json_object(request_arguments, '$.param0.systematicOneLevel') AS systematic_one_level
| ,case when request_address in ('/app/storeInfo','/app/userViewProductSaleInfo') then get_json_object(request_arguments, '$.param2') end AS user_id
|from o2o_api_info_detail
|where request_address in ('/app/searchStore', '/app/searchNewStore', '/app/storeInfo'
| ,'/app/productSaleInfo' ,'/app/searchProduct' ,'/app/findAllAvailableBrand'
| ,'/app/getProjectSystematicList', '/app/getRecommendedProductsByStoreId'
| ,'/app/userViewProductSaleInfo'
| )
""".stripMargin)
/**
* Save to HDFS: the filtered, cleaned data as a wide detail table.
*/
// o2o_api_info_arguments.show(false)
// val result: RDD[String] = o2o_api_info_arguments.map(_.mkString("|"))
// result.saveAsTextFile(saveFilePath1)
// o2o_api_info_arguments.rdd.repartition(1).saveAsTextFile(saveFilePath1)
//o2o_api_info_arguments.write.mode(SaveMode.Overwrite).orc(saveFilePath1) // ORC output requires importing org.apache.spark.sql.hive.orc -- not used here
/**
* Note:
* df.write
* .mode(SaveMode.Overwrite) // save mode
* .format("csv") // storage format
* .option("header", "true") // treat the first row as a header, not data
* .option("delimiter", "|") // field delimiter; defaults to a comma
* .save(saveFilePath1) // output path
*/
o2o_api_info_arguments.write.mode(SaveMode.Overwrite).format("csv").option("header", "true").option("delimiter", "|").save(saveFilePath1)
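// Fallback sketch: if the short name "csv" still fails to resolve on Spark 1.6,
// spark-csv can also be addressed by its fully qualified source name:
// o2o_api_info_arguments.write.mode(SaveMode.Overwrite)
//   .format("com.databricks.spark.csv")
//   .option("header", "true").option("delimiter", "|")
//   .save(saveFilePath1)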
o2o_api_info_arguments
}
/**
* Step 1:
* Parse the JSON logs and expose the extracted fields for later use.
*/
def InfoLogAnalysis(ssc: SQLContext, fileUrl1: String) = {
/** Parsing, step 1: extract the message field, split it, and save the processed intermediate data */
val json_df: DataFrame = ssc.read.format("json").load(fileUrl1)
json_df.registerTempTable("info_log_analysis")
/**
* Note:
* substring(split(message, '\n')[7], 7) keeps everything after the leading
* '请求参数:' (request arguments) label on that line.
*/
val json_log: DataFrame = ssc.sql(
"""
|select -- split(message, '\n')[0] as message,
| logName as log_name
| -- split(message, '\n')[1] -- this line of the log is blank, so it is skipped
| ,split(split(message, '\n')[2], ':')[1] as request_version -- request version
| ,split(split(message, '\n')[3], ':')[1] as method_describe -- method description
| ,split(split(message, '\n')[4], ':')[1] as request_address -- request address
| ,split(split(message, '\n')[5], ':')[1] as request_person -- requester
| ,split(split(message, '\n')[6], ':')[1] as request_ip -- request IP
| ,substring(split(message, '\n')[7], 7) as request_arguments -- request arguments
| ,substring(`@timestamp`, 1, 10) as create_date
|from info_log_analysis
|where logName = 'o2o'
""".stripMargin)
json_log
}
def insertO2oApiInfo_medical_oral(list: ListBuffer[O2oApiInfo_medical_oral2DB], url: String, usr: String, pwd: String, data_type: String): Unit = {
// def insertO2oApiInfo_medical_oral(iterator: Iterator[(String, String)], url: String, usr: String, pwd: String, date_type: String, stat_type: String): Unit = {
var conn: Connection = null
var ps: PreparedStatement = null
val sql = "insert into o2o_api_info(create_date, date_type, stat_type, ct, data_type) values(?,?,?,?,?)"
try {
Class.forName("com.mysql.jdbc.Driver")
conn = DriverManager.getConnection(url, usr, pwd)
ps = conn.prepareStatement(sql)
// Batch the rows for an efficient bulk insert
for (ele <- list) {
ps.setString(1, ele.create_date)
ps.setString(2, ele.date_type)
ps.setString(3, ele.stat_type)
ps.setString(4, ele.ct)
ps.setString(5, data_type)
ps.addBatch()
}
ps.executeBatch()
// iterator.foreach(data => {
// ps.setString(1, data._1)
// ps.setString(2, date_type)
// ps.setString(3, stat_type)
// ps.setString(4, data._2)
// ps.executeUpdate()
// })
} catch {
case e: Exception => e.printStackTrace()
} finally {
if (ps != null) {
ps.close()
}
if (conn != null) {
conn.close()
}
}
}
def deleteData1(url: String, usr: String, pwd: String, create_date: String): Unit = {
var conn: Connection = null
var ps: PreparedStatement = null
val sql = s"delete from o2o_api_info where create_date = '${create_date}' and data_type = 'o2o'"
try {
Class.forName("com.mysql.jdbc.Driver")
conn = DriverManager.getConnection(url, usr, pwd)
ps = conn.prepareStatement(sql)
ps.executeUpdate()
} catch {
case e: Exception => e.printStackTrace()
} finally {
if (ps != null) {
ps.close()
}
if (conn != null) {
conn.close()
}
}
}
def deleteData2(url: String, usr: String, pwd: String, create_date: String): Unit = {
var conn: Connection = null
var ps: PreparedStatement = null
val sql = s"delete from o2o_api_info_brand_shop where create_date = '${create_date}'"
try {
Class.forName("com.mysql.jdbc.Driver")
conn = DriverManager.getConnection(url, usr, pwd)
ps = conn.prepareStatement(sql)
ps.executeUpdate()
} catch {
case e: Exception => e.printStackTrace()
} finally {
if (ps != null) {
ps.close()
}
if (conn != null) {
conn.close()
}
}
}
}
For reference, the original error thrown on YARN before the dependency was added:
ERROR yarn.ApplicationMaster: User class threw exception: java.lang.ClassNotFoundException: Failed to find data source: csv. Please find packages at http://spark-packages.org
java.lang.ClassNotFoundException: Failed to find data source: csv. Please find packages at http://spark-packages.org
at org.apache.spark.sql.execution.datasources.ResolvedDataSource$.lookupDataSource(ResolvedDataSource.scala:77)
at org.apache.spark.sql.execution.datasources.ResolvedDataSource$.apply(ResolvedDataSource.scala:219)
at org.apache.spark.sql.DataFrameWriter.dataSource$lzycompute$1(DataFrameWriter.scala:181)
at org.apache.spark.sql.DataFrameWriter.org$apache$spark$sql$DataFrameWriter$$dataSource$1(DataFrameWriter.scala:181)
at org.apache.spark.sql.DataFrameWriter$$anonfun$save$1.apply$mcV$sp(DataFrameWriter.scala:188)
at org.apache.spark.sql.DataFrameWriter.executeAndCallQEListener(DataFrameWriter.scala:154)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:188)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:172)
at com.joy.loganalysis.O2oApiInfoLogAnalysis_copy$.O2oApiInfoArgumentsAnalysis(O2oApiInfoLogAnalysis_copy.scala:208)
at com.joy.loganalysis.O2oApiInfoLogAnalysis_copy$.main(O2oApiInfoLogAnalysis_copy.scala:76)
at com.joy.loganalysis.O2oApiInfoLogAnalysis_copy.main(O2oApiInfoLogAnalysis_copy.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.spark.deploy.yarn.ApplicationMaster$$anon$2.run(ApplicationMaster.scala:567)
Caused by: java.lang.ClassNotFoundException: csv.DefaultSource
at java.net.URLClassLoader.findClass(URLClassLoader.java:382)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
at org.apache.spark.sql.execution.datasources.ResolvedDataSource$$anonfun$4$$anonfun$apply$1.apply(ResolvedDataSource.scala:62)
at org.apache.spark.sql.execution.datasources.ResolvedDataSource$$anonfun$4$$anonfun$apply$1.apply(ResolvedDataSource.scala:62)
at scala.util.Try$.apply(Try.scala:161)
at org.apache.spark.sql.execution.datasources.ResolvedDataSource$$anonfun$4.apply(ResolvedDataSource.scala:62)
at org.apache.spark.sql.execution.datasources.ResolvedDataSource$$anonfun$4.apply(ResolvedDataSource.scala:62)
at scala.util.Try.orElse(Try.scala:82)
at org.apache.spark.sql.execution.datasources.ResolvedDataSource$.lookupDataSource(ResolvedDataSource.scala:62)
... 15 more
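Why this happens: Spark 1.6 does not ship a built-in csv source (csv only became built-in in Spark 2.0), so ResolvedDataSource.lookupDataSource tries to load csv.DefaultSource and fails. Once spark-csv is on the classpath it registers "csv" as its short name, which is why the format("csv") calls above resolve after the dependency is added. A minimal read sketch using the fully qualified source name, which also works on 1.6 (assuming spark-csv is on the classpath):

val result = ssc.read
  .format("com.databricks.spark.csv")
  .option("header", "true")
  .option("delimiter", "|")
  .load(saveFilePath1)
result.show()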