Preface:
The messages in Kafka are Nginx access logs collected by Filebeat and shipped into Kafka.
Spark (Structured Streaming) then consumes the data from Kafka.
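For context, a minimal sketch of how Filebeat is typically wired to Kafka for such a pipeline (the log path is an assumption; the broker hosts and topic are taken from sections 1 and 3, not from an original configuration in this post):
filebeat.inputs:
- type: log
  enabled: true
  paths:
    - /var/log/nginx/access.log   # assumed Nginx access log path
output.kafka:
  hosts: ["192.168.25.121:9092", "192.168.25.122:9092", "192.168.25.123:9092"]
  topic: "bigdata_nginx_access"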
1. Source message:
{
"@timestamp":"2020-04-18T09:30:41.525Z",
"@metadata":{
"beat":"filebeat",
"type":"_doc",
"version":"7.2.0",
"topic":"bigdata_nginx_access"
},
"message":"192.168.25.1 - - [18/Apr/2020:14:15:45 +0800] "GET /nocar/Download?a=520&g=520p&b=h&p=123456 HTTP/1.1" 200 103580 "-""
}
2. Dependencies (pom.xml)
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.test.bigdata</groupId>
<artifactId>scalaspark</artifactId>
<version>1.0-SNAPSHOT</version>
<properties>
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
<scala.version>2.11.8</scala.version>
<spark.version>2.4.0</spark.version>
<hadoop.version>3.0.0</hadoop.version>
<jackson.version>2.6.2</jackson.version>
<encoding>UTF-8</encoding>
</properties>
<repositories>
<repository>
<id>scala-tools.org</id>
<name>Scala-Tools Maven2 Repository</name>
<url>http://scala-tools.org/repo-releases</url>
</repository>
</repositories>
<pluginRepositories>
<pluginRepository>
<id>scala-tools.org</id>
<name>Scala-Tools Maven2 Repository</name>
<url>http://scala-tools.org/repo-releases</url>
</pluginRepository>
</pluginRepositories>
<dependencies>
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>${scala.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-streaming-kafka-0-10_2.11 -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql-kafka-0-10_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.44</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-core</artifactId>
<version>${jackson.version}</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
<version>${jackson.version}</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-annotations</artifactId>
<version>${jackson.version}</version>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.1.41</version>
</dependency>
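<!-- Note: the code in section 3 imports com.google.gson.Gson, which none of the
dependencies above provide directly. A Gson dependency along these lines is likely
needed as well (the version is an assumption, not from the original post): -->
<dependency>
<groupId>com.google.code.gson</groupId>
<artifactId>gson</artifactId>
<version>2.8.5</version>
</dependency>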
</dependencies>
<build>
<resources>
<resource>
<directory>src/config</directory>
<includes>
<include>**/*.properties</include>
</includes>
</resource>
</resources>
<plugins>
<plugin>
<artifactId>maven-assembly-plugin</artifactId>
<configuration>
<classifier>dist</classifier>
<appendAssemblyId>true</appendAssemblyId>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
</configuration>
<executions>
<execution>
<id>make-assembly</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>3.2.2</version>
<executions>
<execution>
<id>scala-compile-first</id>
<phase>process-resources</phase>
<goals>
<goal>compile</goal>
</goals>
</execution>
</executions>
<configuration>
<scalaVersion>${scala.version}</scalaVersion>
<recompileMode>incremental</recompileMode>
<useZincServer>true</useZincServer>
<args>
<arg>-unchecked</arg>
<arg>-deprecation</arg>
<arg>-feature</arg>
</args>
<jvmArgs>
<jvmArg>-Xms1024m</jvmArg>
<jvmArg>-Xmx1024m</jvmArg>
</jvmArgs>
<javacArgs>
<javacArg>-source</javacArg>
<javacArg>${maven.compiler.source}</javacArg>
<javacArg>-target</javacArg>
<javacArg>${maven.compiler.target}</javacArg>
<javacArg>-Xlint:all,-serial,-path</javacArg>
</javacArgs>
</configuration>
</plugin>
<plugin>
<groupId>org.antlr</groupId>
<artifactId>antlr4-maven-plugin</artifactId>
<version>4.3</version>
<executions>
<execution>
<id>antlr</id>
<goals>
<goal>antlr4</goal>
</goals>
<phase>none</phase>
</execution>
</executions>
<configuration>
<outputDirectory>src/test/java</outputDirectory>
<listener>true</listener>
<treatWarningsAsErrors>true</treatWarningsAsErrors>
</configuration>
</plugin>
</plugins>
</build>
</project>
3. Code:
package com.test.kafka
import java.text.SimpleDateFormat
import java.util.{Date, Locale}
import java.util.concurrent.TimeUnit
import java.util.regex.{Matcher, Pattern}
import com.google.gson.Gson
import org.apache.spark.sql.streaming.{OutputMode, StreamingQuery, StreamingQueryException, Trigger}
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
object StrutsSparkTest {
def main(args: Array[String]): Unit = {
//System.setProperty("hadoop.home.dir", "D:\\Software\\hadoop-common-2.6.0-bin-master\\")
// Create the SparkSession
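// Note: local[*] runs the job with as many threads as local cores; replace
// .master(...) with a real master URL (or drop it and pass --master to spark-submit)
// when running on a cluster.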
val spark: SparkSession = SparkSession.builder()
.appName("struts_spark")
.master("local[*]")
.getOrCreate()
spark.sparkContext.setLogLevel("WARN")
import spark.implicits._
// Kafka settings; the topic matches the Filebeat output topic from section 1
val topic = "bigdata_nginx_access"
val kafkaCluster = "192.168.25.121:9092,192.168.25.122:9092,192.168.25.123:9092"
// Create the streaming source
val df: DataFrame = spark.readStream.format("kafka")
.option("kafka.bootstrap.servers", kafkaCluster)
.option("subscribe", topic)
.option("startingOffsets", "latest")
.load()
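// The Kafka source exposes the columns key, value (both binary), topic, partition,
// offset, timestamp and timestampType; only key and value are used below, cast to strings.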
val kafkaDF: Dataset[(String, String)] = df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)").as[(String, String)]
// Filter out dirty messages (records whose JSON could not be parsed)
val msg: DataFrame = kafkaDF.map(record => handleMessage2CaseClass(record._2)).filter(_.message != null)
.select("message")
.filter({ row =>
// Extract the first token and check that it is an IP; also require enough fields
val strings: Array[String] = row.toString.split(" ")
isIp(strings(0)) && strings.length>10
})
// Split the Nginx log line into fields
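// Token layout after split(" ") for the sample line in section 1
// (index 0 still carries the leading '[' added by Row.toString):
//   0: client IP      3: [dd/MMM/yyyy:HH:mm:ss   4: +0800]     5: "METHOD
//   6: /path?query    7: HTTP/1.1"               8: status     9: bytes sent   10: "-"]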
val nginx: Dataset[nginxDesc] = msg.map(row => {
val strings: Array[String] = row.toString.split(" ")
if (strings(6).contains("?")) {
val urlStr: Array[String] = strings(6).split("\\?")
strings(6) = urlStr(0) //urlStr
strings(7) = urlStr(1) //param
}
if (strings(3).contains("[")){
val formatter = new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss", Locale.ENGLISH)
val date: Date = formatter.parse(strings(3).replace("[", ""))
val format = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
strings(3)=format.format(date) //nginxTime
}
if (strings(5).contains("\""))
strings(5) = strings(5).replace("\"", "") //get post method
if (strings(0).contains("["))
strings(0) = strings(0).replace("[", "") //ip
nginxDesc(strings(0), strings(3), strings(5), strings(6), strings(7), strings(8).toInt, strings(9).toInt)
})
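// For the sample record in section 1 this yields roughly:
// nginxDesc(192.168.25.1, 2020-04-18 14:15:45, GET, /nocar/Download, a=520&g=520p&b=h&p=123456, 200, 103580)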
// Create the streaming sink
// (the console sink below, commented out, is useful for quick debugging)
// val query: StreamingQuery = nginx.writeStream
// .outputMode("append")
// .format("console")
// .start()
/* Output paths */
val outputPath = "F:\\tmp\\spark\\data"
val checkpointLocation = "F:\\tmp\\spark\\checkpoint"
// Sink: write to local CSV files, mainly for verification
val query: StreamingQuery = nginx.writeStream
.format("csv")
.option("path", outputPath)
.option("checkpointLocation", checkpointLocation)
.outputMode(OutputMode.Append)
.trigger(Trigger.ProcessingTime(1, TimeUnit.MINUTES))
.start()
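// Each trigger (every minute here) appends new CSV part files under outputPath;
// the checkpoint directory records the Kafka offsets so the query can resume after a restart.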
try
query.awaitTermination()
catch {
case e: StreamingQueryException =>
e.printStackTrace()
}
}
/**
* Check whether a string contains a valid IPv4 address (regex based)
*/
def isIp(addr:String): Boolean={
if(addr.length() < 7 || addr.length() > 15 || "".equals(addr))
return false
val pat: Pattern = Pattern.compile("([1-9]|[1-9]\\d|1\\d{2}|2[0-4]\\d|25[0-5])(\\.(\\d|[1-9]\\d|1\\d{2}|2[0-4]\\d|25[0-5])){3}")
val mat: Matcher = pat.matcher(addr)
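// find() with an unanchored pattern also accepts strings that merely contain an
// IPv4 address, which is why the leading '[' left by Row.toString above is tolerated.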
mat.find()
}
/***
*
* @param ip client IP
* @param nginxTime request time
* @param method HTTP method
* @param urlStr request path
* @param http query string (or the HTTP protocol token when the URL has no parameters)
* @param status HTTP status code
* @param flow response bytes (traffic)
*/
case class nginxDesc(ip:String, nginxTime:String, method:String, urlStr:String, http:String,
status:Int,flow:Int )
/** Case class for the Filebeat message envelope
*
* @param timestamp timestamp
* @param metadata metadata
* @param message raw Nginx log line
*/
case class Msg(timestamp:String,metadata:String ,message:String)
/**
* Parse the JSON string into a Msg
* @param jsonStr JSON string
* @return parsed Msg (all fields null on failure)
*/
def handleMessage2CaseClass(jsonStr: String): Msg = {
val gson = new Gson()
try{
gson.fromJson(jsonStr, classOf[Msg])
} catch {
case ex: Exception =>
ex.printStackTrace() // goes to stderr
System.err.println("exception===>: ...") // goes to stderr
// Return a placeholder object so the stream keeps running
Msg(null,null,null)
}
}
}
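As a quick sanity check of the Gson mapping (not from the original post), a minimal standalone sketch that feeds the escaped sample record from section 1 through the same kind of parsing; it assumes Gson is on the classpath. Because "@timestamp" and "@metadata" do not match the field names timestamp/metadata, only message is populated:
import com.google.gson.Gson

case class Msg(timestamp: String, metadata: String, message: String)

object MsgParseCheck {
  def main(args: Array[String]): Unit = {
    // Escaped form of the sample record from section 1
    val sample =
      """{"@timestamp":"2020-04-18T09:30:41.525Z","message":"192.168.25.1 - - [18/Apr/2020:14:15:45 +0800] \"GET /nocar/Download?a=520&g=520p&b=h&p=123456 HTTP/1.1\" 200 103580 \"-\""}"""
    val msg = new Gson().fromJson(sample, classOf[Msg])
    println(msg.message)   // the raw Nginx log line
    println(msg.timestamp) // null: "@timestamp" does not bind to `timestamp`
  }
}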
4. Results:
1. First update
2020/06/09: only printed the message body (console sink).
2. Second update
2020/06/10: improved source-message validation, exception handling, and Nginx log splitting.
3. Third update
2020/06/11: improved the data sink and verified the output data.