概述
使用Flume
从业务系统中获取到用户的行为日志之后,我们需要对这些日志进行一定的清洗,并从中采集出对之后分析有用的数据。
本篇和下篇将围绕用户登录行为的风险判定,对数据抽取做简单的介绍。
本篇主要介绍原始数据的抽取。
整体框架
引入依赖
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor for the EvaluateModel module. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.baizhi</groupId>
<artifactId>EvaluateModel</artifactId>
<version>1.0-SNAPSHOT</version>
<build>
<plugins>
<!-- Compile the sources at Java 8 language level. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>8</source>
<target>8</target>
</configuration>
</plugin>
</plugins>
</build>
<dependencies>
<!-- Lombok coordinates; "provided" scope because it is only needed at compile time. -->
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.18.8</version>
<scope>provided</scope>
</dependency>
<!-- JUnit 4, available to test code only. -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
<scope>test</scope>
</dependency>
</dependencies>
</project>
处理日志格式
无论是抽取何种数据,都需要首先能够正确地从日志中获取我们需要的数据信息。
使用正则表达式对日志进行格式匹配与提取。
在线正则生成网站:https://regex101.com/
- INFO 2018-03-31 10:12:00 C1S应用1 evaluate [张三] 6ebaf4ac780f40f486359f3ea6934620 "123456" Beijing "116.4,39.5" [1200,15000,2100] "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36"
- 例如对上方文字的正则提取表达式是
^INFO\s(\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2})\s([a-z0-9\u4e00-\u9fa5]*)\s(EVALUATE|SUCCESS)\s\[([a-z0-9\u4e00-\u9fa5]*)\]\s([a-z0-9]{32})\s\"([a-z0-9\.\-\,]{6,12})\"\s([a-z\u4e00-\u9fa5]*)\s\"([0-9\.\,]*)\"\s\[([0-9\,\.]*)\]\s\"(.*)\"
- 部分规则:
^
表示开头
\s
表示匹配空白
\d
表示匹配数字
|
表示或匹配,与JAVA类似
{}
其中填数字,表示前面的内容重复匹配的次数,例如 {4} 表示恰好 4 次,{6,12} 表示 6 到 12 次
()
表示捕获分组,匹配到的内容可以按分组序号提取
[]
表示字符集合,匹配其中列出的任意一个字符
\u4e00-\u9fa5
表示匹配汉字
*
表示前面的内容匹配零次或多次
.
表示匹配除换行符以外的任意单个字符
\,
反斜杠+任意特殊字符 表示转义匹配
开发日志提取工具类
package com.baizhi.util;
import com.baizhi.entities.EvaluateData;
import com.baizhi.entities.GeoPoint;
import com.baizhi.entities.LoginSuccessData;
import com.sun.org.apache.xpath.internal.operations.Bool;
import jdk.nashorn.internal.runtime.regexp.RegExp;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class EvaluateUtil {
/**
 * Regular expression for any login log line whose type field is EVALUATE or SUCCESS.
 *
 * <p>Capture groups, in order: 1 timestamp ({@code yyyy-MM-dd HH:mm:ss}), 2 application name,
 * 3 record type, 4 user identifier (inside square brackets), 5 a 32-character login sequence,
 * 6 the quoted password field (6 to 12 characters), 7 city name, 8 the quoted
 * comma-separated coordinates, 9 the bracketed comma-separated input features,
 * 10 the quoted device information.
 */
public static final String LEGAL_REGEX="^INFO\\s(\\d{4}-\\d{2}-\\d{2}\\s\\d{2}:\\d{2}:\\d{2})\\s([a-z0-9\\u4e00-\\u9fa5]*)\\s(EVALUATE|SUCCESS)\\s\\[([a-z0-9\\u4e00-\\u9fa5]*)\\]\\s([a-z0-9]{32})\\s\\\"([a-z0-9\\.\\-\\,]{6,12})\\\"\\s([a-z\\u4e00-\\u9fa5]*)\\s\\\"([0-9\\.\\,]*)\\\"\\s\\[([0-9\\,\\.]*)\\]\\s\\\"(.*)\\\"";
/** Same layout and group numbering as {@link #LEGAL_REGEX}, but group 3 accepts only EVALUATE. */
public static final String EVALUATE_REGEX="^INFO\\s(\\d{4}-\\d{2}-\\d{2}\\s\\d{2}:\\d{2}:\\d{2})\\s([a-z0-9\\u4e00-\\u9fa5]*)\\s(EVALUATE)\\s\\[([a-z0-9\\u4e00-\\u9fa5]*)\\]\\s([a-z0-9]{32})\\s\\\"([a-z0-9\\.\\-\\,]{6,12})\\\"\\s([a-z\\u4e00-\\u9fa5]*)\\s\\\"([0-9\\.\\,]*)\\\"\\s\\[([0-9\\,\\.]*)\\]\\s\\\"(.*)\\\"";
/** Same layout and group numbering as {@link #LEGAL_REGEX}, but group 3 accepts only SUCCESS. */
public static final String SUCCESS_REGEX="^INFO\\s(\\d{4}-\\d{2}-\\d{2}\\s\\d{2}:\\d{2}:\\d{2})\\s([a-z0-9\\u4e00-\\u9fa5]*)\\s(SUCCESS)\\s\\[([a-z0-9\\u4e00-\\u9fa5]*)\\]\\s([a-z0-9]{32})\\s\\\"([a-z0-9\\.\\-\\,]{6,12})\\\"\\s([a-z\\u4e00-\\u9fa5]*)\\s\\\"([0-9\\.\\,]*)\\\"\\s\\[([0-9\\,\\.]*)\\]\\s\\\"(.*)\\\"";
// The patterns are compiled once and shared. CASE_INSENSITIVE | UNICODE_CASE makes the
// letter classes and the literal keywords (INFO, EVALUATE, SUCCESS) match in either case.
/** Compiled form of {@link #LEGAL_REGEX}. */
public static final Pattern LEGAL_PATTERN = Pattern.compile(LEGAL_REGEX, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
/** Compiled form of {@link #EVALUATE_REGEX}. */
public static final Pattern EVALUATE_PATTERN = Pattern.compile(EVALUATE_REGEX, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
/** Compiled form of {@link #SUCCESS_REGEX}. */
public static final Pattern SUCCESS_PATTERN = Pattern.compile(SUCCESS_REGEX, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
/**
 * Tells whether a log line has the expected login-log layout (EVALUATE or SUCCESS).
 *
 * @param input one raw log line
 * @return {@code true} when the whole line matches {@link #LEGAL_PATTERN}
 */
public static Boolean isLegal(String input){
    return LEGAL_PATTERN.matcher(input).matches();
}
/**
 * Tells whether a log line is an EVALUATE record.
 *
 * @param input one raw log line
 * @return {@code true} when the whole line matches {@link #EVALUATE_PATTERN}
 */
public static Boolean isEvaluate(String input){
    return EVALUATE_PATTERN.matcher(input).matches();
}
/**
 * Tells whether a log line is a SUCCESS (successful login) record.
 *
 * @param input one raw log line
 * @return {@code true} when the whole line matches {@link #SUCCESS_PATTERN}
 */
public static Boolean isLoginSuccess(String input){
    return SUCCESS_PATTERN.matcher(input).matches();
}
/**
 * Parses one EVALUATE log line into an {@code EvaluateData}.
 *
 * <p>The line is matched against {@link #EVALUATE_PATTERN} and each capture group is copied
 * into the result. Group 3 (the record type) is not stored. Only the first two numbers of
 * the coordinate group and the first three numbers of the input-feature group are used.
 *
 * <p>When the line does not match, the object is returned exactly as its no-argument
 * constructor left it; use {@link #isEvaluate(String)} beforehand to tell the cases apart.
 *
 * @param input one raw log line
 * @return the populated data object, or an unpopulated one when {@code input} does not match
 * @throws ParseException if the timestamp cannot be parsed, or if the coordinate or
 *         input-feature group holds too few numbers or a malformed number
 */
public static EvaluateData parseEvaluateData(String input) throws ParseException {
    EvaluateData evaluateData = new EvaluateData();
    Matcher matcher = EvaluateUtil.EVALUATE_PATTERN.matcher(input);
    if (!matcher.find()) {
        return evaluateData;
    }

    // SimpleDateFormat is not thread-safe, so a fresh instance is created per call.
    SimpleDateFormat timestampFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    evaluateData.setEvaluateTime(timestampFormat.parse(matcher.group(1)).getTime());
    evaluateData.setApplicationName(matcher.group(2));
    evaluateData.setUserIdentify(matcher.group(4));
    evaluateData.setLoginSequence(matcher.group(5));
    evaluateData.setOrdernessPassword(matcher.group(6));
    evaluateData.setCityName(matcher.group(7));

    double[] coordinates = leadingDoubles(matcher, 8, 2);
    evaluateData.setGeoPoint(new GeoPoint(coordinates[0], coordinates[1]));

    double[] features = leadingDoubles(matcher, 9, 3);
    Double[] inputFeatures = {features[0], features[1], features[2]};
    evaluateData.setInputFeatures(inputFeatures);

    evaluateData.setDeviceInformation(matcher.group(10));
    return evaluateData;
}

/** Parses the first {@code count} comma-separated numbers of capture group {@code group}. */
private static double[] leadingDoubles(Matcher matcher, int group, int count) throws ParseException {
    String text = matcher.group(group);
    String[] parts = text.split(",");
    if (parts.length < count) {
        // The character classes in the pattern also accept empty or short lists.
        throw new ParseException(
                "Expected at least " + count + " comma-separated numbers but found \"" + text + "\"",
                matcher.start(group));
    }
    double[] values = new double[count];
    for (int i = 0; i < count; i++) {
        try {
            values[i] = Double.parseDouble(parts[i]);
        } catch (NumberFormatException e) {
            // e.g. "1.2.3" passes the regex but is not a number.
            ParseException failure =
                    new ParseException("Malformed number \"" + parts[i] + "\"", matcher.start(group));
            failure.initCause(e);
            throw failure;
        }
    }
    return values;
}
/**
 * Parses one SUCCESS log line into a {@code LoginSuccessData}.
 *
 * <p>The line is matched against {@link #SUCCESS_PATTERN} and each capture group is copied
 * into the result. Group 3 (the record type) is not stored. Only the first two numbers of
 * the coordinate group and the first three numbers of the input-feature group are used.
 *
 * <p>When the line does not match, the object is returned exactly as its no-argument
 * constructor left it; use {@link #isLoginSuccess(String)} beforehand to tell the cases apart.
 *
 * @param input one raw log line
 * @return the populated data object, or an unpopulated one when {@code input} does not match
 * @throws ParseException if the timestamp cannot be parsed, or if the coordinate or
 *         input-feature group holds too few numbers or a malformed number
 */
public static LoginSuccessData parseLoginSuccessData(String input) throws ParseException {
    LoginSuccessData loginSuccessData = new LoginSuccessData();
    Matcher matcher = EvaluateUtil.SUCCESS_PATTERN.matcher(input);
    if (!matcher.find()) {
        return loginSuccessData;
    }

    // SimpleDateFormat is not thread-safe, so a fresh instance is created per call.
    SimpleDateFormat timestampFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    loginSuccessData.setEvaluateTime(timestampFormat.parse(matcher.group(1)).getTime());
    loginSuccessData.setApplicationName(matcher.group(2));
    loginSuccessData.setUserIdentify(matcher.group(4));
    loginSuccessData.setLoginSequence(matcher.group(5));
    loginSuccessData.setOrdernessPassword(matcher.group(6));
    loginSuccessData.setCityName(matcher.group(7));

    String[] geoParts = matcher.group(8).split(",");
    if (geoParts.length < 2) {
        throw new ParseException(
                "Expected two comma-separated coordinates but found \"" + matcher.group(8) + "\"",
                matcher.start(8));
    }
    String[] featureParts = matcher.group(9).split(",");
    if (featureParts.length < 3) {
        throw new ParseException(
                "Expected three comma-separated input features but found \"" + matcher.group(9) + "\"",
                matcher.start(9));
    }
    try {
        loginSuccessData.setGeoPoint(
                new GeoPoint(Double.parseDouble(geoParts[0]), Double.parseDouble(geoParts[1])));
        Double[] inputFeatures = {
                Double.parseDouble(featureParts[0]),
                Double.parseDouble(featureParts[1]),
                Double.parseDouble(featureParts[2])};
        loginSuccessData.setInputFeatures(inputFeatures);
    } catch (NumberFormatException e) {
        // e.g. "1.2.3" passes the regex but is not a number.
        ParseException failure = new ParseException(
                "Malformed number in coordinates or input features", matcher.start(8));
        failure.initCause(e);
        throw failure;
    }

    loginSuccessData.setDeviceInformation(matcher.group(10));
    return loginSuccessData;
}
/**
 * Extracts the application name from a login log line (EVALUATE or SUCCESS).
 *
 * @param input one raw log line
 * @return capture group 2 of {@link #LEGAL_PATTERN}, or {@code null} when the line does not match
 * @throws ParseException never thrown by this implementation; the clause is kept so that
 *         existing callers that catch it still compile
 */
public static String getApplicationName(String input) throws ParseException {
    Matcher matcher = EvaluateUtil.LEGAL_PATTERN.matcher(input);
    return matcher.find() ? matcher.group(2) : null;
}
/**
 * Extracts the user identifier from a login log line (EVALUATE or SUCCESS).
 *
 * @param input one raw log line
 * @return capture group 4 of {@link #LEGAL_PATTERN}, or {@code null} when the line does not match
 * @throws ParseException never thrown by this implementation; the clause is kept so that
 *         existing callers that catch it still compile
 */
public static String getUserIdentify(String input) throws ParseException {
    Matcher matcher = EvaluateUtil.LEGAL_PATTERN.matcher(input);
    return matcher.find() ? matcher.group(4) : null;
}
抽取原始数据
在抽取用户行为数据进行风险判定时,需要首先具有对照数据,即历史登录成功的数据。
引入责任链模式
顾名思义,责任链模式(Chain of Responsibility Pattern)为请求创建了一个接收者对象的链。这种模式基于请求的类型,对请求的发送者和接收者进行解耦。这种类型的设计模式属于行为型模式。
在这种模式中,通常每个接收者都包含对另一个接收者的引用。如果一个对象不能处理该请求,那么它会把相同的请求传给下一个接收者,依此类推。
详见 菜鸟教程:https://www.runoob.com/design-pattern/chain-of-responsibility-pattern.html
当我们需要抽取数据进行风险判定时,需要明确作为判断依据的数据内容,包括:
①
登录城市
②登录设备
③登录密码
④登录次数(当天)
-----> 未来实现时,将此数值保存在流的状态中,不需要代码实现
⑤登录时间
—> 获取登录习惯(一周的每天的何时是常用时间)
⑥登录的地理位置
-----> 结合时间 获取与上次相比的位移量
⑦输入特征
根据以上,我们需要建立相应的实体类与相应的责任链,进行历史数据实体类的更新,即每次登录成功的数据都要存入历史数据中。
① 创建登录成功数据
的实体类
对应解析日志之后的内容,接收全部数据
package com.baizhi.entities;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.experimental.Accessors;
import java.io.Serializable;
/**
 * One successful-login record, holding every field parsed from a SUCCESS log line.
 *
 * <p>Lombok generates the constructors, accessors, equals/hashCode and toString.
 * {@code @Accessors(chain = true)} makes each setter return {@code this}. The field order
 * below is also the parameter order of the all-arguments constructor.
 */
@NoArgsConstructor
@AllArgsConstructor
@Data
@Accessors(chain = true)
public class LoginSuccessData implements Serializable {
// Login time; consumers treat it as epoch milliseconds.
private long evaluateTime;
// Name of the application the user logged in to.
private String applicationName;
// Identifier of the user who logged in.
private String userIdentify;
// Login sequence token from the log line.
private String loginSequence;
// Password field as it appears in the log (presumably already scrambled; confirm with the producer).
private String ordernessPassword;
// City the login came from.
private String cityName;
private GeoPoint geoPoint; // longitude/latitude pair, see GeoPoint
// Numeric input features recorded for this login.
private Double[] inputFeatures;
// Device information string from the log line.
private String deviceInformation;
}
构建一个经纬度实体类
package com.baizhi.entities;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.experimental.Accessors;
import java.io.Serializable;
/**
 * A longitude/latitude pair.
 *
 * <p>Lombok generates the constructors and accessors; the all-arguments constructor takes
 * the longitude first, then the latitude. NOTE(review): the field name {@code longtitude}
 * is misspelt, but the generated accessor names derive from it, so renaming it would
 * break callers.
 */
@NoArgsConstructor
@AllArgsConstructor
@Data
@Accessors(chain = true)
public class GeoPoint implements Serializable {
private double longtitude;// longitude
private double latitude;// latitude
}
②创建历史数据实体类
用于保存所有登录成功后、作为判断依据的属性。
package com.baizhi.entities;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.experimental.Accessors;
import java.io.Serializable;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
/**
 * Accumulated login history of one user, used as the baseline for risk evaluation.
 *
 * <p>Every field starts out unset ({@code null}, or {@code 0} for the timestamp) and is
 * filled in by the updaters as successful logins are processed.
 */
public class HistoryData implements Serializable {
    // Cities the user has logged in from.
    private Set<String> historyCities;
    // Device information strings the user has logged in with.
    private List<String> historyDeviceInformations;
    // Number of logins on the current day.
    private Integer currentDayLoginCount;
    // Login habit: day of week -> hour of day -> number of logins.
    private Map<String, Map<String,Integer>> historyLoginTimeSlot;
    // Scrambled passwords the user has logged in with.
    private Set<String> historyOrdernessPasswords;
    // Most recent input-feature vectors.
    private List<Double[]> latestInputFeatures;
    // Time and coordinates of the last login.
    private long lastLoginTime;
    private GeoPoint lastLoginGeoPoint;

    public Set<String> getHistoryCities() {
        return historyCities;
    }

    public void setHistoryCities(Set<String> historyCities) {
        this.historyCities = historyCities;
    }

    public List<String> getHistoryDeviceInformations() {
        return historyDeviceInformations;
    }

    public void setHistoryDeviceInformations(List<String> historyDeviceInformations) {
        this.historyDeviceInformations = historyDeviceInformations;
    }

    public Integer getCurrentDayLoginCount() {
        return currentDayLoginCount;
    }

    public void setCurrentDayLoginCount(Integer currentDayLoginCount) {
        this.currentDayLoginCount = currentDayLoginCount;
    }

    public Map<String, Map<String, Integer>> getHistoryLoginTimeSlot() {
        return historyLoginTimeSlot;
    }

    public void setHistoryLoginTimeSlot(Map<String, Map<String, Integer>> historyLoginTimeSlot) {
        this.historyLoginTimeSlot = historyLoginTimeSlot;
    }

    public Set<String> getHistoryOrdernessPasswords() {
        return historyOrdernessPasswords;
    }

    public void setHistoryOrdernessPasswords(Set<String> historyOrdernessPasswords) {
        this.historyOrdernessPasswords = historyOrdernessPasswords;
    }

    public List<Double[]> getLatestInputFeatures() {
        return latestInputFeatures;
    }

    public void setLatestInputFeatures(List<Double[]> latestInputFeatures) {
        this.latestInputFeatures = latestInputFeatures;
    }

    public long getLastLoginTime() {
        return lastLoginTime;
    }

    public void setLastLoginTime(long lastLoginTime) {
        this.lastLoginTime = lastLoginTime;
    }

    public GeoPoint getLastLoginGeoPoint() {
        return lastLoginGeoPoint;
    }

    public void setLastLoginGeoPoint(GeoPoint lastLoginGeoPoint) {
        this.lastLoginGeoPoint = lastLoginGeoPoint;
    }

    /**
     * Renders every field. Each feature vector is shown as its values joined by commas.
     * Safe to call on a freshly created instance whose collections are still {@code null}.
     */
    @Override
    public String toString() {
        return "HistoryData{" +
                "historyCities=" + historyCities +
                ", historyDeviceInformations=" + historyDeviceInformations +
                ", currentDayLoginCount=" + currentDayLoginCount +
                ", historyLoginTimeSlot=" + historyLoginTimeSlot +
                ", historyOrdernessPasswords=" + historyOrdernessPasswords +
                ", latestInputFeatures=" + describeFeatures(latestInputFeatures) +
                ", lastLoginTime=" + lastLoginTime +
                ", lastLoginGeoPoint=" + lastLoginGeoPoint +
                '}';
    }

    /** Formats the feature vectors for toString, tolerating a null list, null rows and empty rows. */
    private static String describeFeatures(List<Double[]> features) {
        if (features == null) {
            return "null";
        }
        return features.stream()
                .map(row -> row == null
                        ? "null"
                        : Arrays.stream(row)
                                .map(value -> String.valueOf(value))
                                .collect(Collectors.joining(",")))
                .collect(Collectors.toList())
                .toString();
    }
}
③创建更新历史数据的接口
package com.baizhi.update;
import com.baizhi.entities.HistoryData;
import com.baizhi.entities.LoginSuccessData;
/**
 * One link in the chain that folds a successful login into a user's history.
 */
public interface Updater {
    /**
     * Applies this link's update and decides whether to hand over to the rest of the chain.
     *
     * @param loginSuccessData the successful login being recorded
     * @param historyData      the history object to update
     * @param updaterChain     the chain this link belongs to; the implementations in this
     *                         project call its {@code doChain} to run the next link
     */
    void update(LoginSuccessData loginSuccessData, HistoryData historyData, UpdaterChain updaterChain);
}
构建更新链
package com.baizhi.update;
import com.baizhi.entities.EvaluateData;
import com.baizhi.entities.EvaluateReport;
import com.baizhi.entities.HistoryData;
import com.baizhi.entities.LoginSuccessData;
import com.baizhi.evaluate.Evaluate;
import java.util.List;
/**
 * Runs a list of {@link Updater}s in order, each one handing over to the next by calling
 * {@link #doChain} on this object.
 *
 * <p>The cursor is restored as each link returns, so the same chain instance can be used
 * for any number of records. The cursor is plain mutable state, so one instance must not
 * be used by several threads at the same time.
 */
public class UpdaterChain {
    // Index of the next updater to run.
    private int position=0;
    // The links of the chain, in execution order.
    private final List<Updater> updaters;

    /**
     * @param updaters the links of the chain, in execution order
     */
    public UpdaterChain(List<Updater> updaters) {
        this.updaters = updaters;
    }

    /**
     * Runs the next updater, if any. Does nothing once the end of the chain is reached.
     *
     * @param loginSuccessData the successful login being recorded
     * @param historyData      the history object to update
     */
    public void doChain(LoginSuccessData loginSuccessData,HistoryData historyData){
        if(position>=updaters.size()){
            return;
        }
        Updater updater = updaters.get(position);
        position++;
        try {
            updater.update(loginSuccessData,historyData,this);
        } finally {
            // Step back on the way out, even if a link throws or stops the chain early,
            // so the next top-level call starts again from the first updater.
            position--;
        }
    }
}
④实现更新数据
接口
更新登录城市
package com.baizhi.update.impl;
import com.baizhi.entities.HistoryData;
import com.baizhi.entities.LoginSuccessData;
import com.baizhi.update.Updater;
import com.baizhi.update.UpdaterChain;
import java.util.HashSet;
import java.util.Set;
/**
 * Chain link that records the city of each successful login.
 */
public class CitiesUpdates implements Updater {

    @Override
    public void update(LoginSuccessData loginSuccessData, HistoryData historyData, UpdaterChain updaterChain) {
        doUpdate(loginSuccessData, historyData);
        // Hand over to the next link.
        updaterChain.doChain(loginSuccessData, historyData);
    }

    /**
     * Adds the login's city to the set of cities the user has logged in from,
     * creating the set on first use.
     *
     * @param loginSuccessData the successful login being recorded
     * @param historyData      the history object to update
     */
    private void doUpdate(LoginSuccessData loginSuccessData, HistoryData historyData) {
        String currentCity = loginSuccessData.getCityName();
        Set<String> stored = historyData.getHistoryCities();
        Set<String> cities = (stored != null) ? stored : new HashSet<String>();
        cities.add(currentCity);
        historyData.setHistoryCities(cities);
    }
}
更新登录设备
package com.baizhi.update.impl;
import com.baizhi.entities.HistoryData;
import com.baizhi.entities.LoginSuccessData;
import com.baizhi.update.Updater;
import com.baizhi.update.UpdaterChain;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
/**
 * Chain link that keeps the user's most recently used devices, oldest first.
 */
public class DeviceUpdates implements Updater {
    private static final int DEFAULT_DEVICE_COUNT = 3;

    // Maximum number of devices retained per user.
    private Integer deviceCount=DEFAULT_DEVICE_COUNT;

    /** Creates an updater that retains the default number of devices (3). */
    public DeviceUpdates() {
    }

    /**
     * @param deviceCount maximum number of devices to retain; {@code null} selects the default (3)
     */
    public DeviceUpdates(Integer deviceCount) {
        this.deviceCount = (deviceCount != null) ? deviceCount : DEFAULT_DEVICE_COUNT;
    }

    @Override
    public void update(LoginSuccessData loginSuccessData, HistoryData historyData, UpdaterChain updaterChain) {
        doUpdate(loginSuccessData,historyData);
        // Hand over to the next link.
        updaterChain.doChain(loginSuccessData,historyData);
    }

    /**
     * Records the login's device as the most recent one and evicts the oldest entries
     * beyond {@code deviceCount}.
     *
     * @param loginSuccessData the successful login being recorded
     * @param historyData      the history object to update
     */
    private void doUpdate(LoginSuccessData loginSuccessData, HistoryData historyData){
        String deviceInformation = loginSuccessData.getDeviceInformation();
        List<String> deviceInformations = historyData.getHistoryDeviceInformations();
        if(deviceInformations==null){
            deviceInformations=new ArrayList<String>();
        }
        // Move a device that is already known to the tail; otherwise a device in active
        // use would be the first one evicted when a new device shows up.
        deviceInformations.remove(deviceInformation);
        deviceInformations.add(deviceInformation);
        // Trim from the oldest end until the list fits the limit.
        while(!deviceInformations.isEmpty() && deviceInformations.size()>deviceCount){
            deviceInformations.remove(0);
        }
        historyData.setHistoryDeviceInformations(deviceInformations);
    }
}
更新登录密码
package com.baizhi.update.impl;
import com.baizhi.entities.HistoryData;
import com.baizhi.entities.LoginSuccessData;
import com.baizhi.update.Updater;
import com.baizhi.update.UpdaterChain;
import java.util.HashSet;
import java.util.Set;
/**
 * Chain link that records the (scrambled) password of each successful login.
 */
public class PasswordsUpdates implements Updater {

    @Override
    public void update(LoginSuccessData loginSuccessData, HistoryData historyData, UpdaterChain updaterChain) {
        doUpdate(loginSuccessData, historyData);
        // Hand over to the next link.
        updaterChain.doChain(loginSuccessData, historyData);
    }

    /**
     * Adds the login's password value to the set of passwords the user has logged in with,
     * creating the set on first use.
     *
     * @param loginSuccessData the successful login being recorded
     * @param historyData      the history object to update
     */
    private void doUpdate(LoginSuccessData loginSuccessData, HistoryData historyData) {
        String currentPassword = loginSuccessData.getOrdernessPassword();
        Set<String> stored = historyData.getHistoryOrdernessPasswords();
        Set<String> passwords = (stored != null) ? stored : new HashSet<String>();
        passwords.add(currentPassword);
        historyData.setHistoryOrdernessPasswords(passwords);
    }
}
更新登录特征
package com.baizhi.update.impl;
import com.baizhi.entities.HistoryData;
import com.baizhi.entities.LoginSuccessData;
import com.baizhi.update.Updater;
import com.baizhi.update.UpdaterChain;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
/**
 * Chain link that keeps the most recent input-feature vectors, oldest first.
 */
public class InputFeaturesUpdater implements Updater {
    // Maximum number of feature vectors retained per user.
    private Integer numCount=10;

    /**
     * @param numCount maximum number of feature vectors to retain
     */
    public void setNumCount(Integer numCount) {
        this.numCount = numCount;
    }

    @Override
    public void update(LoginSuccessData loginSuccessData, HistoryData historyData, UpdaterChain updaterChain) {
        doUpdate(loginSuccessData, historyData);
        // Hand over to the next link.
        updaterChain.doChain(loginSuccessData, historyData);
    }

    /**
     * Appends the login's feature vector and drops the oldest one once the list
     * holds more than {@code numCount} entries.
     */
    private void doUpdate(LoginSuccessData loginSuccessData, HistoryData historyData) {
        Double[] currentFeatures = loginSuccessData.getInputFeatures();
        List<Double[]> stored = historyData.getLatestInputFeatures();
        // First login for this history: start a new list.
        List<Double[]> recent = (stored != null) ? stored : new ArrayList<Double[]>();
        recent.add(currentFeatures);
        boolean overLimit = recent.size() > numCount;
        if (overLimit) {
            recent.remove(0);
        }
        historyData.setLatestInputFeatures(recent);
    }
}
更新登录时间
package com.baizhi.update.impl;
import com.baizhi.entities.HistoryData;
import com.baizhi.entities.LoginSuccessData;
import com.baizhi.update.Updater;
import com.baizhi.update.UpdaterChain;
/**
 * Chain link that remembers when the user last logged in.
 */
public class LoginTimeUpdater implements Updater {

    @Override
    public void update(LoginSuccessData loginSuccessData, HistoryData historyData, UpdaterChain updaterChain) {
        doUpdate(loginSuccessData, historyData);
        // Hand over to the next link.
        updaterChain.doChain(loginSuccessData, historyData);
    }

    /**
     * Copies the login's time into the history as the last login time.
     *
     * @param loginSuccessData the successful login being recorded
     * @param historyData      the history object to update
     */
    public void doUpdate(LoginSuccessData loginSuccessData, HistoryData historyData) {
        long loginTime = loginSuccessData.getEvaluateTime();
        historyData.setLastLoginTime(loginTime);
    }
}
更新登录经纬度
package com.baizhi.update.impl;
import com.baizhi.entities.HistoryData;
import com.baizhi.entities.LoginSuccessData;
import com.baizhi.update.Updater;
import com.baizhi.update.UpdaterChain;
/**
 * Chain link that remembers where the user last logged in.
 */
public class LastLoginGeoPoint implements Updater {

    @Override
    public void update(LoginSuccessData loginSuccessData, HistoryData historyData, UpdaterChain updaterChain) {
        doUpdate(loginSuccessData, historyData);
        // Hand over to the next link.
        updaterChain.doChain(loginSuccessData, historyData);
    }

    /**
     * Copies the login's coordinates into the history as the last login position.
     *
     * @param loginSuccessData the successful login being recorded
     * @param historyData      the history object to update
     */
    public void doUpdate(LoginSuccessData loginSuccessData, HistoryData historyData) {
        historyData.setLastLoginGeoPoint(
                loginSuccessData.getGeoPoint());
    }
}
更新登录习惯
根据登录时间,进行广义数据挖掘
package com.baizhi.update.impl;
import com.baizhi.entities.HistoryData;
import com.baizhi.entities.LoginSuccessData;
import com.baizhi.update.Updater;
import com.baizhi.update.UpdaterChain;
import java.text.DecimalFormat;
import java.util.Calendar;
import java.util.HashMap;
import java.util.Map;
/**
 * Chain link that maintains the user's login habit: for each day of the week,
 * how many logins happened in each hour of the day.
 */
public class TimeSlotUpdater implements Updater {
    // Indexed by Calendar.DAY_OF_WEEK - 1 (Calendar.SUNDAY is 1).
    private static final String[] DAY_NAMES={"星期日","星期一","星期二","星期三","星期四","星期五","星期六"};

    @Override
    public void update(LoginSuccessData loginSuccessData, HistoryData historyData, UpdaterChain updaterChain) {
        doUpdate(loginSuccessData, historyData);
        // Hand over to the next link.
        updaterChain.doChain(loginSuccessData, historyData);
    }

    /**
     * Increments the counter for the day of week and hour of day of this login.
     * The time is interpreted with the default {@link Calendar}, so the JVM's
     * default time zone and locale apply.
     *
     * @param loginSuccessData the successful login being recorded
     * @param historyData      the history object to update
     */
    public void doUpdate(LoginSuccessData loginSuccessData, HistoryData historyData) {
        Calendar loginMoment = Calendar.getInstance();
        loginMoment.setTimeInMillis(loginSuccessData.getEvaluateTime());
        String day = DAY_NAMES[loginMoment.get(Calendar.DAY_OF_WEEK) - 1];
        // Two-digit hour key, "00" through "23".
        String hour = new DecimalFormat("00").format(loginMoment.get(Calendar.HOUR_OF_DAY));

        Map<String, Map<String, Integer>> slotsByDay = historyData.getHistoryLoginTimeSlot();
        if (slotsByDay == null) {
            slotsByDay = new HashMap<String, Map<String, Integer>>();
        }

        // Find the per-hour counters for this day, creating them on first use.
        Map<String, Integer> hourCounts;
        if (slotsByDay.containsKey(day)) {
            hourCounts = slotsByDay.get(day);
        } else {
            hourCounts = new HashMap<String, Integer>();
            slotsByDay.put(day, hourCounts);
        }
        hourCounts.put(hour, hourCounts.getOrDefault(hour, 0) + 1);

        historyData.setHistoryLoginTimeSlot(slotsByDay);
    }
}
至此,我们的原始数据已经可以获取。
在下一篇中,将介绍风险判定评估数据的抽取。