1. Building the Base Project
Create the parent project
pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.atguigu.gmall2019.dw</groupId>
<artifactId>gmall2019-dw</artifactId>
<version>1.0-SNAPSHOT</version>
<parent>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-parent</artifactId>
<version>1.5.10.RELEASE</version>
<relativePath/> <!-- lookup parent from repository -->
</parent>
<properties>
<spark.version>2.1.1</spark.version>
<scala.version>2.11.8</scala.version>
<log4j.version>1.2.17</log4j.version>
<slf4j.version>1.7.22</slf4j.version>
<fastjson.version>1.2.47</fastjson.version>
<httpclient.version>4.5.5</httpclient.version>
<httpmime.version>4.3.6</httpmime.version>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
<java.version>1.8</java.version>
</properties>
<dependencies>
<!-- Logging dependencies go here; every module inherits them -->
<!-- Logging facade shared by all child modules -->
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>jcl-over-slf4j</artifactId>
<version>${slf4j.version}</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
<version>${slf4j.version}</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
<version>${slf4j.version}</version>
</dependency>
<!-- The concrete logging implementation -->
<dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
<version>${log4j.version}</version>
</dependency>
</dependencies>
<dependencyManagement>
<dependencies>
<!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>${httpclient.version}</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpmime</artifactId>
<version>${httpmime.version}</version>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>${fastjson.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
</dependencies>
</dependencyManagement>
</project>
Create the common module
pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>gmall2019-dw</artifactId>
<groupId>com.atguigu.gmall2019.dw</groupId>
<version>1.0-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>gmall2019-common</artifactId>
<dependencies>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpmime</artifactId>
</dependency>
</dependencies>
</project>
Create the mock module
pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>gmall2019-dw</artifactId>
<groupId>com.atguigu.gmall2019.dw</groupId>
<version>1.0-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>dw-mocker</artifactId>
<dependencies>
<dependency>
<groupId>com.atguigu.gmall2019.dw</groupId>
<artifactId>gmall2019-common</artifactId>
<version>1.0-SNAPSHOT</version>
</dependency>
</dependencies>
</project>
Create the utility classes
RandomDate
import java.util.Date;
import java.util.Random;
public class RandomDate {
Long logDateTime = 0L;
int maxTimeStep = 0;
public RandomDate (Date startDate , Date endDate,int num) {
Long avgStepTime = (endDate.getTime()- startDate.getTime())/num;
this.maxTimeStep=avgStepTime.intValue()*2;
this.logDateTime=startDate.getTime();
}
public Date getRandomDate() {
int timeStep = new Random().nextInt(maxTimeStep);
logDateTime = logDateTime+timeStep;
return new Date( logDateTime);
}
}
RanOpt
public class RanOpt<T>{
T value ;
int weight;
public RanOpt ( T value, int weight ){
this.value=value ;
this.weight=weight;
}
public T getValue() {
return value;
}
public int getWeight() {
return weight;
}
}
RandomOptionGroup
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
public class RandomOptionGroup<T> {
int totalWeight=0;
List<RanOpt> optList=new ArrayList();
public RandomOptionGroup(RanOpt<T>... opts) {
for (RanOpt opt : opts) {
totalWeight+=opt.getWeight();
for (int i = 0; i <opt.getWeight() ; i++) {
optList.add(opt);
}
}
}
public RanOpt<T> getRandomOpt() {
int i = new Random().nextInt(totalWeight);
return optList.get(i);
}
public static void main(String[] args) {
RanOpt[] opts= {new RanOpt("zhang3",20),new RanOpt("li4",30),new RanOpt("wang5",50)};
RandomOptionGroup randomOptionGroup = new RandomOptionGroup(opts);
for (int i = 0; i <10 ; i++) {
System.out.println(randomOptionGroup.getRandomOpt().getValue());
}
}
}
RandomNum
import java.util.Random;
public class RandomNum {
public static final int getRandInt(int fromNum,int toNum){
return fromNum+ new Random().nextInt(toNum-fromNum+1);
}
}
Log upload utility class: LogUploader
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
public class LogUploader {
public static void sendLogStream(String log){
try{
//different log types could be sent to different URLs
URL url =new URL("http://logserver/log");
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
//use the POST method
conn.setRequestMethod("POST");
//timestamp header, used by the server for clock correction
conn.setRequestProperty("clientTime",System.currentTimeMillis() + "");
//allow a request body to be written
conn.setDoOutput(true);
//set the request header: the log is sent as a form-encoded parameter
conn.setRequestProperty("Content-Type", "application/x-www-form-urlencoded");
System.out.println("upload " + log);
//request body output stream
OutputStream out = conn.getOutputStream();
out.write(("logString="+log).getBytes());
out.flush();
out.close();
int code = conn.getResponseCode();
System.out.println(code);
}
catch (Exception e){
e.printStackTrace();
}
}
}
Log generator class: JsonMocker
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import com.atguigu.gmall.dw.mock.utils.*;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Random;
public class JsonMocker {
int startupNum=100000;
int eventNum=200000 ;
RandomDate logDateUtil= null;
RanOpt[] osOpts= {new RanOpt("ios",3),new RanOpt("android",7) };
RandomOptionGroup<String> osOptionGroup= new RandomOptionGroup(osOpts);
Date startTime= null;
Date endTime= null;
RanOpt[] areaOpts= {new RanOpt("beijing",10),
new RanOpt("shanghai",10),new RanOpt("guangdong",20),new RanOpt("hebei",5),
new RanOpt("heilongjiang",5),new RanOpt("shandong",5),new RanOpt("tianjin",5),
new RanOpt("shan3xi",5),new RanOpt("shan1xi",5),new RanOpt("sichuan",5)
};
RandomOptionGroup<String> areaOptionGroup= new RandomOptionGroup(areaOpts);
String appId="gmall2019";
RanOpt[] vsOpts= {new RanOpt("1.2.0",50),new RanOpt("1.1.2",15),
new RanOpt("1.1.3",30),
new RanOpt("1.1.1",5)
};
RandomOptionGroup<String> vsOptionGroup= new RandomOptionGroup(vsOpts);
RanOpt[] eventOpts= {new RanOpt("addFavor",10),new RanOpt("addComment",30),
new RanOpt("addCart",20), new RanOpt("clickItem",40)
};
RandomOptionGroup<String> eventOptionGroup= new RandomOptionGroup(eventOpts);
RanOpt[] channelOpts= {new RanOpt("xiaomi",10),new RanOpt("huawei",20),
new RanOpt("wandoujia",30), new RanOpt("360",20), new RanOpt("tencent",20)
, new RanOpt("baidu",10), new RanOpt("website",10)
};
RandomOptionGroup<String> channelOptionGroup= new RandomOptionGroup(channelOpts);
RanOpt[] quitOpts= { new RanOpt(true,20),new RanOpt(false,80)};
RandomOptionGroup<Boolean> isQuitGroup= new RandomOptionGroup(quitOpts);
public JsonMocker( ) {
}
public JsonMocker(String startTimeString ,String endTimeString,int startupNum,int eventNum) {
// keep the configured volumes so they are not silently ignored
this.startupNum=startupNum;
this.eventNum=eventNum;
try {
startTime= new SimpleDateFormat("yyyy-MM-dd").parse(startTimeString);
endTime= new SimpleDateFormat("yyyy-MM-dd").parse(endTimeString);
} catch (ParseException e) {
e.printStackTrace();
}
logDateUtil= new RandomDate(startTime,endTime,startupNum+eventNum);
}
String initEventLog(String startLogJson){
/*`type` string COMMENT 'log type',
`mid` string COMMENT 'unique device id',
`uid` string COMMENT 'user id',
`os` string COMMENT 'operating system',
`appid` string COMMENT 'app id',
`area` string COMMENT 'region',
`evid` string COMMENT 'event id',
`pgid` string COMMENT 'current page',
`npgid` string COMMENT 'next page',
`itemid` string COMMENT 'item id',
`ts` bigint COMMENT 'timestamp',*/
JSONObject startLog = JSON.parseObject(startLogJson);
String mid= startLog.getString("mid");
String uid= startLog.getString("uid");
String os= startLog.getString("os");
String appid=this.appId;
String area=startLog.getString("area");
String evid = eventOptionGroup.getRandomOpt().getValue();
int pgid = new Random().nextInt(50)+1;
int npgid = new Random().nextInt(50)+1;
int itemid = new Random().nextInt(50);
// long ts= logDateUtil.getRandomDate().getTime();
JSONObject jsonObject = new JSONObject();
jsonObject.put("type","event");
jsonObject.put("mid",mid);
jsonObject.put("uid",uid);
jsonObject.put("os",os);
jsonObject.put("appid",appid);
jsonObject.put("area",area);
jsonObject.put("evid",evid);
jsonObject.put("pgid",pgid);
jsonObject.put("npgid",npgid);
jsonObject.put("itemid",itemid);
return jsonObject.toJSONString();
}
String initStartupLog( ){
/*`type` string COMMENT 'log type',
`mid` string COMMENT 'unique device id',
`uid` string COMMENT 'user id',
`os` string COMMENT 'operating system',
`appId` string COMMENT 'app id',
`vs` string COMMENT 'version',
`ts` bigint COMMENT 'startup time',
`area` string COMMENT 'city' */
String mid= "mid_"+ RandomNum.getRandInt(1,500);
String uid=""+ RandomNum.getRandInt(1,500);
String os=osOptionGroup.getRandomOpt().getValue();
String appid=this.appId;
String area=areaOptionGroup.getRandomOpt().getValue();
String vs = vsOptionGroup.getRandomOpt().getValue();
//long ts= logDateUtil.getRandomDate().getTime();
String ch=os.equals("ios")?"appstore": channelOptionGroup.getRandomOpt().getValue();
JSONObject jsonObject = new JSONObject();
jsonObject.put("type","startup");
jsonObject.put("mid",mid);
jsonObject.put("uid",uid);
jsonObject.put("os",os);
jsonObject.put("appid",appid);
jsonObject.put("area",area);
jsonObject.put("ch",ch);
jsonObject.put("vs",vs);
return jsonObject.toJSONString();
}
public static void genLog() {
JsonMocker jsonMocker = new JsonMocker();
jsonMocker.startupNum = 1000000;
for (int i = 0; i < jsonMocker.startupNum; i++) {
String startupLog = jsonMocker.initStartupLog();
jsonMocker.sendLog(startupLog);
while (!jsonMocker.isQuitGroup.getRandomOpt().getValue()) {
String eventLog = jsonMocker.initEventLog(startupLog);
jsonMocker.sendLog(eventLog);
}
try {
Thread.sleep(20);
} catch (InterruptedException e) {
e.printStackTrace();
}
}
}
public void sendLog(String log) {
LogUploader.sendLogStream(log);
}
public static void main(String[] args) {
genLog();
}
}
2. Building the Log Collection System
Receive log data over HTTP with Spring Boot and forward it to Kafka
Spring Boot + Kafka integration
application.properties
#============== kafka ===================
# Kafka broker addresses; multiple brokers may be listed, comma-separated
spring.kafka.bootstrap-servers=hadoop1:9092
# Serializers for the message key and the message value
spring.kafka.producer.key-serializer=org.apache.kafka.common.serialization.StringSerializer
spring.kafka.producer.value-serializer=org.apache.kafka.common.serialization.StringSerializer
LoggerController
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.kafka.core.KafkaTemplate;
import org.springframework.web.bind.annotation.*;
import com.atguigu.gmall.constant.GmallConstants;
@Slf4j
@RestController // = @Controller + @ResponseBody
public class LoggerController {
@Autowired
KafkaTemplate<String,String> kafkaTemplate;
@PostMapping("log")
public String doLog(@RequestParam("logString") String logString ){
// 0. add a server-side timestamp
JSONObject jsonObject = JSON.parseObject(logString);
jsonObject.put("ts",System.currentTimeMillis());
// 1. write to the local log file (via log4j)
String jsonString = jsonObject.toJSONString();
log.info(jsonString);
// 2. send to Kafka
if( "startup".equals( jsonObject.getString("type"))){
kafkaTemplate.send(GmallConstants.KAFKA_TOPIC_STARTUP,jsonString);
}else{
kafkaTemplate.send(GmallConstants.KAFKA_TOPIC_EVENT,jsonString);
}
return "success";
}
}
Add the constants to the common module
public class GmallConstants {
public static final String KAFKA_TOPIC_STARTUP="GMALL_STARTUP";
public static final String KAFKA_TOPIC_EVENT="GMALL_EVENT";
public static final String KAFKA_TOPIC_NEW_ORDER="GMALL_NEW_ORDER";
public static final String KAFKA_TOPIC_ORDER_DETAIL="GMALL_ORDER_DETAIL";
public static final String ES_INDEX_DAU="gmall2019_dau";
public static final String ES_INDEX_NEW_MID="gmall2019_new_mid";
public static final String ES_INDEX_NEW_ORDER="gmall2019_new_order";
public static final String ES_INDEX_SALE_DETAIL="gmall2019_sale_detail";
}
Add log4j.properties
log4j.appender.atguigu.MyConsole=org.apache.log4j.ConsoleAppender
log4j.appender.atguigu.MyConsole.target=System.err
log4j.appender.atguigu.MyConsole.layout=org.apache.log4j.PatternLayout
log4j.appender.atguigu.MyConsole.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %10p (%c:%M) - %m%n
log4j.appender.atguigu.File=org.apache.log4j.DailyRollingFileAppender
log4j.appender.atguigu.File.file=d:/applog/gmall2019/log/app.log
log4j.appender.atguigu.File.DatePattern='.'yyyy-MM-dd
log4j.appender.atguigu.File.layout=org.apache.log4j.PatternLayout
log4j.appender.atguigu.File.layout.ConversionPattern=%m%n
log4j.logger.com.atguigu.xxxxxx.XXXXcontroller=info,atguigu.File,atguigu.MyConsole
Package and deploy to the servers
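A possible packaging and deployment flow, shown as a rough sketch; the module name, jar name, target directory, and hostnames below are assumptions and should be adapted to your environment (Appendix 1 shows a cluster start script for the same service):
# build an executable jar with the Spring Boot Maven plugin (run from the project root)
mvn clean package -DskipTests
# copy the jar to each log-server node (module/jar names assumed)
for i in hadoop102 hadoop103 hadoop104; do
scp gmall2019-logger/target/gmall2019-logger-0.0.1-SNAPSHOT.jar $i:/opt/module/gmall2019/
done
# start it on one node to verify (the port must match the Nginx upstream configuration)
ssh hadoop102 "java -jar /opt/module/gmall2019/gmall2019-logger-0.0.1-SNAPSHOT.jar --server.port=8080 >/dev/null 2>&1 &"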
Install Nginx for load balancing and edit its configuration file nginx.conf
http{
..........
upstream logserver{
server hadoop1:8080 weight=1;
server hadoop2:8080 weight=1;
server hadoop3:8080 weight=1;
}
server {
listen 80;
server_name logserver;
location / {
root html;
index index.html index.htm;
proxy_pass http://logserver;
proxy_connect_timeout 10;
}
..........
}
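To verify the whole chain, post a sample startup log through Nginx; it should be proxied to one of the logger instances, stamped with ts, and written to the GMALL_STARTUP topic. This assumes the name logserver resolves to the Nginx host (for example via /etc/hosts):
curl -X POST -d 'logString={"type":"startup","mid":"mid_1","uid":"1","os":"ios","appid":"gmall2019","area":"beijing","ch":"appstore","vs":"1.2.0"}' http://logserver/log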
3. Real-Time Processing: Create the realtime Module
- Consume the data in Kafka.
- Use Redis to filter out devices already counted in today's DAU.
- Save each batch's newly added DAU records to HBase or ES.
- Query the data from ES and publish it as a data API for the visualization project to call.
3.1 Code: Consuming Kafka
pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>gmall2019-dw</artifactId>
<groupId>com.atguigu.gmall2019.dw</groupId>
<version>1.0-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>dw-realtime</artifactId>
<dependencies>
<dependency>
<groupId>com.atguigu.gmall2019.dw</groupId>
<artifactId>gmall2019-common</artifactId>
<version>1.0-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_2.11</artifactId>
</dependency>
<dependency>
<groupId>org.apache.kafka</groupId>
<artifactId>kafka-clients</artifactId>
<version>0.10.2.1</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
</dependency>
<dependency>
<groupId>redis.clients</groupId>
<artifactId>jedis</artifactId>
<version>2.9.0</version>
</dependency>
</dependencies>
<build>
<plugins>
<!-- This plugin compiles Scala source code into class files -->
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>3.2.2</version>
<executions>
<execution>
<!-- Bind to Maven's compile phase -->
<goals>
<goal>compile</goal>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>
3.2 Utility Classes
3.2.1 MyKafkaUtil
import org.apache.kafka.common.serialization.StringDeserializer
import java.util.Properties
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
object MyKafkaUtil {
private val properties: Properties = PropertiesUtil.load("config.properties")
val broker_list = properties.getProperty("kafka.broker.list")
// Kafka consumer configuration
val kafkaParam = Map(
"bootstrap.servers" -> broker_list,// addresses used to bootstrap the connection to the cluster
"key.deserializer" -> classOf[StringDeserializer],
"value.deserializer" -> classOf[StringDeserializer],
// identifies the consumer group this consumer belongs to
"group.id" -> "gmall_consumer_group",
// used when there is no initial offset, or the current offset no longer exists on the server;
// latest resets the offset to the latest available offset
"auto.offset.reset" -> "latest",
// if true, the consumer's offsets are committed automatically in the background, but data can be lost on failure;
// if false, offsets have to be maintained manually
"enable.auto.commit" -> (true: java.lang.Boolean)
)
// Create a DStream that returns the received input data
// LocationStrategies: creates consumers from the given topics and cluster addresses
// LocationStrategies.PreferConsistent: distributes partitions evenly across all executors
// ConsumerStrategies: chooses how to create and configure Kafka consumers on the driver and executors
// ConsumerStrategies.Subscribe: subscribes to a collection of topics
def getKafkaStream(topic: String,ssc:StreamingContext): InputDStream[ConsumerRecord[String,String]]={
val dStream = KafkaUtils.createDirectStream[String,String](ssc, LocationStrategies.PreferConsistent,ConsumerStrategies.Subscribe[String,String](Array(topic),kafkaParam))
dStream
}
}
3.2.2 PropertiesUtil
import java.io.InputStreamReader
import java.util.Properties
object PropertiesUtil {
def main(args: Array[String]): Unit = {
val properties: Properties = PropertiesUtil.load("config.properties")
println(properties.getProperty("kafka.broker.list"))
}
def load(propertieName:String): Properties ={
val prop=new Properties();
prop.load(new InputStreamReader(Thread.currentThread().getContextClassLoader.getResourceAsStream(propertieName) , "UTF-8"))
prop
}
}
3.2.3 RedisUtil
import redis.clients.jedis.{Jedis, JedisPool, JedisPoolConfig}
object RedisUtil {
var jedisPool:JedisPool=null
def getJedisClient: Jedis = {
if(jedisPool==null){
// println("creating the connection pool")
val config = PropertiesUtil.load("config.properties")
val host = config.getProperty("redis.host")
val port = config.getProperty("redis.port")
val jedisPoolConfig = new JedisPoolConfig()
jedisPoolConfig.setMaxTotal(100) // maximum number of connections
jedisPoolConfig.setMaxIdle(20) // maximum idle connections
jedisPoolConfig.setMinIdle(20) // minimum idle connections
jedisPoolConfig.setBlockWhenExhausted(true) // whether to wait when the pool is exhausted
jedisPoolConfig.setMaxWaitMillis(500) // how long to wait when exhausted, in milliseconds
jedisPoolConfig.setTestOnBorrow(true) // test each connection when it is borrowed
jedisPool=new JedisPool(jedisPoolConfig,host,port.toInt)
}
// println(s"jedisPool.getNumActive = ${jedisPool.getNumActive}")
// println("borrowed a connection")
jedisPool.getResource
}
}
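PropertiesUtil and RedisUtil load config.properties from the classpath (src/main/resources), and MyKafkaUtil reads the broker list from it. A minimal sketch with the keys used above; the hostnames and the Redis port are assumptions:
# Kafka broker list read by MyKafkaUtil
kafka.broker.list=hadoop102:9092,hadoop103:9092,hadoop104:9092
# Redis connection used by RedisUtil
redis.host=hadoop102
redis.port=6379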
3.3 Create the case class StartupLog
case class StartupLog(mid:String,
uid:String,
appid:String,
area:String,
os:String,
ch:String,
logType:String,
vs:String,
var logDate:String,
var logHour:String,
var ts:Long
)
3.4 Business Class: Consuming Kafka
import com.alibaba.fastjson.JSON
import com.atguigu.gmall.constant.GmallConstants
import com.atguigu.gmall2019.realtime.bean.StartupLog
import com.atguigu.gmall2019.realtime.util.MyKafkaUtil
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
object RealtimeStartupApp {
def main(args: Array[String]): Unit = {
val sparkConf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("gmall2019")
val sc = new SparkContext(sparkConf)
val ssc = new StreamingContext(sc,Seconds(10))
val startupStream: InputDStream[ConsumerRecord[String, String]] = MyKafkaUtil.getKafkaStream(GmallConstants.KAFKA_TOPIC_STARTUP,ssc)
// print each batch to verify that consumption works
startupStream.map(_.value()).foreachRDD{ rdd=>
println(rdd.collect().mkString("\n"))
}
val startupLogDstream: DStream[StartupLog] = startupStream.map(_.value()).map { log =>
// println(s"log = ${log}")
val startupLog: StartupLog = JSON.parseObject(log, classOf[StartupLog])
startupLog
}
ssc.start()
ssc.awaitTermination()
}
}
3.5 Code Part 2: Deduplication
- Save today's newly active devices (mid) to Redis.
- Filter every record, dropping devices already recorded in Redis.
- Deduplicate devices within the current batch.
import java.util
import java.text.SimpleDateFormat
import java.util.Date
import com.alibaba.fastjson.JSON
import com.atguigu.gmall.constant.GmallConstants
import com.atguigu.gmall2019.realtime.bean.StartupLog
import com.atguigu.gmall2019.realtime.util.{MyKafkaUtil, RedisUtil}
import org.apache.hadoop.conf.Configuration
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.spark.SparkConf
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import redis.clients.jedis.Jedis
import org.apache.phoenix.spark._
object DauApp {
def main(args: Array[String]): Unit = {
val sparkConf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("dau_app")
val ssc = new StreamingContext(sparkConf,Seconds(5))
// 1. consume Kafka
val inputDstream: InputDStream[ConsumerRecord[String, String]] = MyKafkaUtil.getKafkaStream(GmallConstants.KAFKA_TOPIC_STARTUP,ssc)
// 2. transform the stream into the case class and add the two date fields
val startuplogDstream: DStream[StartupLog] = inputDstream.map { record =>
val jsonStr: String = record.value()
val startupLog: StartupLog = JSON.parseObject(jsonStr, classOf[StartupLog])
val dateTimeStr: String = new SimpleDateFormat("yyyy-MM-dd HH").format(new Date(startupLog.ts))
val dateArr: Array[String] = dateTimeStr.split(" ")
startupLog.logDate = dateArr(0)
startupLog.logHour = dateArr(1)
startupLog
}
startuplogDstream.cache()
// 3. filter against the DAU list in Redis, keeping only devices not yet in the list
val filteredDstream: DStream[StartupLog] = startuplogDstream.transform { rdd =>
val jedis: Jedis = RedisUtil.getJedisClient // driver, executed once per batch
val dateStr: String = new SimpleDateFormat("yyyy-MM-dd").format(new Date())
val key = "dau:" + dateStr
val dauMidSet: util.Set[String] = jedis.smembers(key)
jedis.close()
val dauMidBC: Broadcast[util.Set[String]] = ssc.sparkContext.broadcast(dauMidSet)
println("过滤前:" + rdd.count())
val filteredRDD: RDD[StartupLog] = rdd.filter { startuplog => //executor
val dauMidSet: util.Set[String] = dauMidBC.value
!dauMidSet.contains(startuplog.mid)
}
println("过滤后:" + filteredRDD.count())
filteredRDD
}
// 4. deduplicate within the batch: group by mid and take the first record of each group
val groupbyMidDstream: DStream[(String, Iterable[StartupLog])] = filteredDstream.map(startuplog=>(startuplog.mid,startuplog)).groupByKey()
val distictDstream: DStream[StartupLog] = groupbyMidDstream.flatMap { case (mid, startupLogItr) =>
startupLogItr.toList.take(1)
}
// 5. save today's visited devices (mid) to Redis. key type: set; key: dau:2019-xx-xx; value: mid
distictDstream.foreachRDD{rdd=>
// driver
rdd.foreachPartition{ startuplogItr=>
val jedis:Jedis=RedisUtil.getJedisClient // executor
for (startuplog <- startuplogItr ) {
val key= "dau:"+startuplog.logDate
jedis.sadd(key,startuplog.mid)
println(startuplog)
}
jedis.close()
}
}
ssc.start()
ssc.awaitTermination()
}
}
3.6 Saving to HBase
Add dependencies
<dependency>
<groupId>org.apache.phoenix</groupId>
<artifactId>phoenix-spark</artifactId>
<version>4.14.2-HBase-1.3</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
</dependency>
Save to HBase via Phoenix
Import the package that provides the implicit conversions:
import org.apache.phoenix.spark._
// write the data to HBase through Phoenix
distictDstream.foreachRDD{rdd=>
rdd.saveToPhoenix("GMALL2019_DAU",Seq("MID", "UID", "APPID", "AREA", "OS", "CH", "TYPE", "VS", "LOGDATE", "LOGHOUR", "TS") ,new Configuration,Some("hadoop1,hadoop2,hadoop3:2181"))
}
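saveToPhoenix requires the target table to already exist in Phoenix with matching column names. A possible DDL to run in sqlline; the column types and the primary-key choice are assumptions derived from the StartupLog case class, not taken from the original material:
create table GMALL2019_DAU (
mid varchar not null,
uid varchar,
appid varchar,
area varchar,
os varchar,
ch varchar,
type varchar,
vs varchar,
logdate varchar not null,
loghour varchar,
ts bigint
CONSTRAINT dau_pk PRIMARY KEY (mid, logdate));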
4. Developing the DAU Query API with Spring Boot
4.1 Access Paths
Total:  http://publisher:8070/realtime-total?date=2019-02-01
Hourly: http://publisher:8070/realtime-hours?id=dau&date=2019-02-01
4.2 Required Data Format
Total:  [{"id":"dau","name":"DAU","value":1200}, {"id":"new_mid","name":"New devices","value":233}]
Hourly: {"yesterday":{"11":383,"12":123,"17":88,"19":200}, "today":{"12":38,"13":1233,"17":123,"19":688}}
4.3 Create the Application Class Gmall2019PublisherApplication and Add the Mapper Scan Package
@SpringBootApplication
@MapperScan(basePackages = "com.atguigu.gmallXXXXXXX.publisher.mapper")
public class Gmall2019PublisherApplication{
public static void main(String[] args) {
SpringApplication.run(Gmall2019PublisherApplication.class, args);
}
}
4.4 Controller Layer
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import com.atguigu.gmall2019.dw.publisher.service.PublisherService;
import org.apache.commons.lang.time.DateUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;
@RestController
public class PublisherController {
@Autowired
PublisherService publisherService;
@GetMapping("realtime-total")
public String realtimeHourDate(@RequestParam("date") String date) {
List<Map> list = new ArrayList<Map>();
// total DAU
int dauTotal = publisherService.getDauTotal(date);
Map dauMap=new HashMap<String,Object>();
dauMap.put("id","dau");
dauMap.put("name","DAU");
dauMap.put("value",dauTotal);
list.add(dauMap);
// newly added devices
int newMidTotal = publisherService.getNewMidTotal(date);
Map newMidMap=new HashMap<String,Object>();
newMidMap.put("id","new_mid");
newMidMap.put("name","New devices");
newMidMap.put("value",newMidTotal);
list.add(newMidMap);
return JSON.toJSONString(list);
}
@GetMapping("realtime-hours")
public String realtimeHourDate(@RequestParam("id") String id,@RequestParam("date") String date){
if( "dau".equals(id)){
Map dauHoursToday = publisherService.getDauHours(date);
JSONObject jsonObject = new JSONObject();
jsonObject.put("today",dauHoursToday);
String yesterdayDateString="";
try {
Date dateToday = new SimpleDateFormat("yyyy-MM-dd").parse(date);
Date dateYesterday = DateUtils.addDays(dateToday, -1);
yesterdayDateString=new SimpleDateFormat("yyyy-MM-dd").format(dateYesterday);
} catch (ParseException e) {
e.printStackTrace();
}
Map dauHoursYesterday = publisherService.getDauHours(yesterdayDateString);
jsonObject.put("yesterday",dauHoursYesterday);
return jsonObject.toJSONString();
}
if( "new_order_totalamount".equals(id)){
String newOrderTotalamountJson = publisherService.getNewOrderTotalAmountHours(date);
return newOrderTotalamountJson;
}
return null;
}
}
4.5 Service Layer
public interface PublisherService {
Integer getDauTotal(String date);
Map getDauHours(String date);
Integer getNewMidTotal(String date);
String getNewOrderTotalAmountHours(String date);
}
4.6 Service Layer Implementation
@Service
public class PublisherServiceImpl implements PublisherService{
@Autowired
DauMapper dauMapper;
@Override
public Integer getDauTotal(String date) {
return dauMapper.selectDauTotal(date);
}
@Override
public Map getDauHours(String date) {
HashMap dauHourMap=new HashMap();
List<Map> dauHourList = dauMapper.selectDauTotalHourMap(date);
for (Map map : dauHourList) {
dauHourMap.put(map.get("LH"),map.get("CT"));
}
return dauHourMap;
}
}
4.7 Data Layer: Mapper Interface
import java.util.List;
import java.util.Map;
public interface DauMapper {
public Integer selectDauTotal(String date);
public List<Map> selectDauTotalHourMap(String date);
}
4.8 Data Layer: Mapper XML
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE mapper SYSTEM "http://mybatis.org/dtd/mybatis-3-mapper.dtd" >
<mapper namespace="com.atguigu.gmall2019.publisher.mapper.DauMapper">
<select id="selectDauTotal" resultType="Integer">
select count(*) from gmall2019_dau where logdate=#{date}
</select>
<select id="selectDauTotalHourMap" resultMap="dauTotalHour">
select LOGHOUR lh, count(*) ct from gmall2019_dau where LOGDATE=#{date}
group by LOGHOUR
</select>
<resultMap id="dauTotalHour" type="java.util.Map" autoMapping="true">
</resultMap>
</mapper>
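For the mapper to work, the publisher module also needs a DataSource pointing at Phoenix and a MyBatis mapper location. A minimal application.properties sketch; the ZooKeeper hosts and the mapper file location are assumptions (the port matches the access paths in 4.1):
server.port=8070
spring.datasource.driver-class-name=org.apache.phoenix.jdbc.PhoenixDriver
spring.datasource.url=jdbc:phoenix:hadoop1,hadoop2,hadoop3:2181
mybatis.mapper-locations=classpath:mapper/*.xml
mybatis.configuration.map-underscore-to-camel-case=true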
Appendix 1
# Kafka cluster start/stop script
#! /bin/bash
case $1 in
"start"){
for i in hadoop102 hadoop103 hadoop104
do
echo " --------启动 $i Kafka-------"
# 用于KafkaManager监控
ssh $i "export JMX_PORT=9988 && /opt/module/kafka-default/bin/kafka-server-start.sh -daemon /opt/module/kafka-default/config/server.properties "
done
};;
"stop"){
for i in hadoop102 hadoop103 hadoop104
do
echo " --------停止 $i Kafka-------"
ssh $i "/opt/module/kafka-default/bin/kafka-server-stop.sh stop"
done
};;
esac
Start: ./kafka start
Stop: ./kafka stop
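Both topics have to exist before the console consumers and the streaming job can read from them. A sketch of the creation commands; the partition and replication counts are assumptions, and on older Kafka releases --bootstrap-server has to be replaced with --zookeeper hadoop102:2181:
/opt/module/kafka-default/bin/kafka-topics.sh --create --bootstrap-server hadoop102:9092 --topic GMALL_STARTUP --partitions 3 --replication-factor 2
/opt/module/kafka-default/bin/kafka-topics.sh --create --bootstrap-server hadoop102:9092 --topic GMALL_EVENT --partitions 3 --replication-factor 2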
# Start console consumers to watch the logs
/opt/module/kafka-default/bin/kafka-console-consumer.sh --bootstrap-server hadoop102:9092,hadoop103:9092,hadoop104:9092 --topic GMALL_STARTUP
/opt/module/kafka-default/bin/kafka-console-consumer.sh --bootstrap-server hadoop102:9092,hadoop103:9092,hadoop104:9092 --topic GMALL_EVENT
# Start the Spring Boot logger application
[deploy@hadoop102 gmall0513]$ java -jar /opt/module/gmall0513/gmall0513-logger-0.0.1-SNAPSHOT.jar --server.port=8090
# Log-service cluster start/stop script: logger-cluster.sh
#!/bin/bash
JAVA_BIN=/opt/module/jdk-default/bin/java
PROJECT=gmall0513
APPNAME=gmall0513-logger-0.0.1-SNAPSHOT.jar
SERVER_PORT=8090
case $1 in
"start")
{
echo "========START NGINX==============="
ssh hadoop103 "/opt/module/nginx/sbin/nginx"
for i in hadoop102 hadoop103 hadoop104
do
echo "========start: $i============="
ssh $i "$JAVA_BIN -Xms32m -Xmx64m -jar /opt/module/$PROJECT/$APPNAME --server.port=$SERVER_PORT >/dev/null 2>&1 &"
done
echo "========NGINX STARTED============="
};;
"stop")
{
echo "========STOP NGINX==============="
ssh hadoop103 "/opt/module/nginx/sbin/nginx -s stop"
for i in hadoop102 hadoop103 hadoop104
do
echo "========stop: $i==============="
ssh $i "ps -ef|grep $APPNAME |grep -v grep|awk '{print \$2}'|xargs kill" >/dev/null 2>&1
done
echo "========NGINX STOPED============="
};;
esac
Appendix 2: Installing and Starting Phoenix
[deploy@hadoop102 phoenix-default]$ cp phoenix-server-hbase-2.2-5.1.2.jar /opt/module/hbase-default/lib/
[deploy@hadoop102 phoenix-default]$ cp phoenix-client-hbase-2.2-5.1.2.jar /opt/module/hbase-default/lib/
[deploy@hadoop102 lib]$ xsync phoenix-server-hbase-2.2-5.1.2.jar
[deploy@hadoop102 lib]$ xsync phoenix-client-hbase-2.2-5.1.2.jar
#PHOENIX
export PHOENIX_HOME=/opt/module/phoenix-default
export PHOENIX_CLASSPATH=$PHOENIX_HOME
export PATH=$PATH:$PHOENIX_HOME/bin
[root@hadoop102 ~]# xsync /etc/profile
JAVA_HOME=/opt/module/jdk-default
PATH=$JAVA_HOME/bin:$PATH
CLASSPATH=$JAVA_HOME/jre/lib/ext:$JAVA_HOME/lib/tools.jar
export PATH JAVA_HOME CLASSPATH
export JRE_HOME=$JAVA_HOME/jre
#HADOOP_HOME
export HADOOP_HOME=/opt/module/hadoop-default
export HADOOP_INSTALL=$HADOOP_HOME
export HADOOP_MAPRED_HOME=$HADOOP_HOME
export HADOOP_COMMON_HOME=$HADOOP_HOME
export HADOOP_HDFS_HOME=$HADOOP_HOME
export YARN_HOME=$HADOOP_HOME
export HADOOP_COMMON_LIB_NATIVE_DIR=$HADOOP_HOME/lib/native
export PATH=$PATH:$HADOOP_HOME/sbin:$HADOOP_HOME/bin
export HADOOP_PREFIX=$HADOOP_HOME
export HADOOP_LIBEXEC_DIR=$HADOOP_HOME/libexec
export JAVA_LIBRARY_PATH=$HADOOP_HOME/lib/native:$JAVA_LIBRARY_PATH
export HADOOP_CONF_DIR=$HADOOP_PREFIX/etc/hadoop
export HADOOP_CLASSPATH=${JAVA_HOME}/lib/tools.jar:${HADOOP_CLASSPATH}
#SCALA_HOME
export SCALA_HOME=/opt/module/scala-default
export PATH=$PATH:$SCALA_HOME/bin
#HIVE_HOME
export HIVE_HOME=/opt/module/hive-default
export PATH=$HIVE_HOME/bin:$PATH
export HADOOP_CLASSPATH=$HIVE_HOME/conf:$HIVE_HOME/lib
#SPARK_HOME
export SPARK_HOME=/opt/module/spark-default
export PATH=$PATH:$SPARK_HOME/bin:$SPARK_HOME
#SQOOP_HOME
export SQOOP_HOME=/opt/module/sqoop-default
export PATH=$SQOOP_HOME/bin:$PATH
export LOGDIR=$SQOOP_HOME/logs
#ES_HOME
export ES_HOME=/opt/module/elasticsearch-default
export PATH=$PATH:$ES_HOME/bin
#KIBANA_HOME
export KIBANA_HOME=/opt/module/kibana-default
export PATH=$PATH:$KIBANA_HOME/bin
#ZK_HOME
export ZK_HOME=/opt/module/zookeeper-default
export PATH=$PATH:$ZK_HOME/bin
#HBASE_HOME
export HBASE_HOME=/opt/module/hbase-default
export PATH=$PATH:$HBASE_HOME/bin:$HBASE_HOME/sbin
#AIRFLOW_HOME
export AIRFLOW_HOME=/home/deploy/airflow
#PRESTO_HOME
export PRESTO_HOME=/opt/module/presto-default
export PATH=$PATH:$PRESTO_HOME/bin
#FLINK_HOME
export FLINK_HOME=/opt/module/flink_default
export PATH=$PATH:$FLINK_HOME/bin
#PHOENIX
export PHOENIX_HOME=/opt/module/phoenix-default
export PHOENIX_CLASSPATH=$PHOENIX_HOME
export PATH=$PATH:$PHOENIX_HOME/bin
Start Phoenix
sqlline.py hadoop104,hadoop105,hadoop106:2181
# If HBase reports "master.HMaster: hbase:meta,,1.1588230740 is NOT online", remove the stale HBase znode in ZooKeeper and restart HBase:
[deploy@hadoop104 bin]$ ./zkCli.sh -server hadoop104:2181
[zk: hadoop104:2181(CONNECTED) 4] deleteall /hbase