这里使用github上二叉树快速搜索IP地址数据库生成Jar包,通过scala调用解析IP
1、将Java项目下载下来打成Jar包,并将Jar包放入scala项目中
ipdatabase-1.0-SNAPSHOT.jar
2、编辑intall-ipdatabase.bat文件,将Jar包install到本地仓库
mvn install:install-file -Dfile=ipdatabase-1.0-SNAPSHOT.jar -DgroupId=com.ggstar -DartifactId=ipdatabase -Dversion=1.0 -Dpackaging=jar
3、将ipDatabase.csv和ipRegion.xlsx文件复制到scala项目的resources目录下
4、引入Jar包
<dependency>
<groupId>com.ggstar</groupId>
<artifactId>ipdatabase</artifactId>
<version>1.0</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>3.14</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>3.14</version>
</dependency>
5、完善 【Spark实战】日志分析(二):日志解析中的代码并测试
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}
/**
* 访问日志转换(输入==>输出)工具类
*/
object AccessConvertUtil {
//定义输出的字段
val struct = StructType(
Array(StructField("url",StringType),
StructField("cmsType",StringType),
StructField("cmsId",LongType),
StructField("traffic",LongType),
StructField("ip",StringType),
StructField("city",StringType),
StructField("time",StringType),
StructField("day",StringType)
)
)
/**
* 根据输入的每一行信息转换成输出的样式
* @param log 输入的每一行记录信息
*/
def parseLog(log:String) = {
try{
val splits = log.split("\t")
val url = splits(1)
val traffic = splits(2).toLong
val ip = splits(3)
val domain = "http://www.imooc.com/"
val cms = url.substring(url.indexOf(domain) + domain.length)
val cmsTypeId = cms.split("/")
var cmsType = ""
var cmsId = 0l
if(cmsTypeId.length > 1) {
cmsType = cmsTypeId(0)
cmsId = cmsTypeId(1).toLong
}
val city = IpUtils.getCity(ip)
// val city = ""
val time = splits(0)
val day = time.substring(0,10).replaceAll("-","")
//这个row里面的字段要和struct中的字段对应上
Row(url, cmsType, cmsId, traffic, ip, city, time, day)
} catch {
case e:Exception => Row(0)
}
}
}