Big Data Practical Training Project
This post is just a personal record.
Data flow diagram
Offline data processing
Requirement
Mobile app logs: compute the distribution of app versions across users over one week.
Approach
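The batch side boils down to a word count over the version field: parse each JSON log line, pull out vs, count occurrences per version across the week's files, and write (vs, num) rows into the MySQL table that the web layer charts. The following is only a minimal sketch under those assumptions; the HDFS input path is a placeholder, and the android_table(vs, num) schema is inferred from the visualization code below.

import java.sql.DriverManager

import com.alibaba.fastjson.JSON
import org.apache.spark.{SparkConf, SparkContext}

object VersionCount {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("VersionCount").setMaster("local[*]"))
    // parse each JSON log line and extract the version field "vs" (hypothetical input path)
    val counts = sc.textFile("hdfs://hdp-1:9000/applog/week/*")
      .map(line => (JSON.parseObject(line).getString("vs"), 1))
      .reduceByKey(_ + _)
    // write one (vs, num) row per version into the table the web layer reads
    counts.foreachPartition { iter =>
      val conn = DriverManager.getConnection(
        "jdbc:mysql://localhost:3306/spark?serverTimezone=GMT%2B8", "root", "403411")
      val ps = conn.prepareStatement("insert into android_table (vs, num) values (?, ?)")
      for ((vs, num) <- iter) {
        ps.setString(1, vs)
        ps.setInt(2, num)
        ps.executeUpdate()
      }
      ps.close()
      conn.close()
    }
    sc.stop()
  }
}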
Data visualization
The visualization project is a Spring Boot application and uses MyBatis Generator's reverse engineering to scaffold the persistence layer.
application.yml
server:
  port: 8888
spring:
  datasource:
    url: jdbc:mysql://localhost:3306/spark?characterEncoding=utf-8&serverTimezone=GMT%2B8
    username: root
    password: 403411
    driver-class-name: com.mysql.cj.jdbc.Driver
  thymeleaf:
    cache: false
mybatis:
  typeAliasesPackage: com.stu.console.model
  mapperLocations: classpath:mapper/*.xml
generatorConfig.xml
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE generatorConfiguration
        PUBLIC "-//mybatis.org//DTD MyBatis Generator Configuration 1.0//EN"
        "http://mybatis.org/dtd/mybatis-generator-config_1_0.dtd">
<!-- Run As Maven build with goals: mybatis-generator:generate -Dmybatis.generator.overwrite=true -->
<!-- set tableName below, then run the Maven build to generate the dao and model layers -->
<generatorConfiguration>
    <!-- path to the MySQL driver jar -->
    <classPathEntry
            location="C:\Users\DELL\.m2\repository\mysql\mysql-connector-java\8.0.16\mysql-connector-java-8.0.16.jar"/>
    <context id="DB2Tables" targetRuntime="MyBatis3">
        <!-- suppress generated comments -->
        <commentGenerator>
            <property name="suppressAllComments" value="true"/>
        </commentGenerator>
        <!-- database connection settings (note that & must be escaped as &amp; in XML) -->
        <jdbcConnection driverClass="com.mysql.cj.jdbc.Driver"
                        connectionURL="jdbc:mysql://localhost:3306/spark?useUnicode=true&amp;characterEncoding=utf-8&amp;useSSL=false&amp;serverTimezone=UTC"
                        userId="root" password="403411">
        </jdbcConnection>
        <!-- package for the generated model classes -->
        <javaModelGenerator targetPackage="com.example.bgdteam2echarts.model"
                            targetProject="src/main/java">
            <property name="enableSubPackages" value="true"/>
            <property name="trimStrings" value="true"/>
        </javaModelGenerator>
        <!-- location of the generated mapper XML files -->
        <sqlMapGenerator targetPackage="mapper"
                         targetProject="src/main/resources">
            <property name="enableSubPackages" value="true"/>
        </sqlMapGenerator>
        <!-- package for the generated DAO interfaces -->
        <javaClientGenerator type="XMLMAPPER"
                             targetPackage="com.example.bgdteam2echarts.dao" targetProject="src/main/java">
            <property name="enableSubPackages" value="true"/>
        </javaClientGenerator>
        <!-- one <table> entry per database table; copy it for additional tables -->
        <!-- maps a table name to the generated class name -->
        <!-- <table tableName="android_table" domainObjectName="Android_table"></table> -->
        <table tableName="aaaaaa" domainObjectName="AAAAAAA"></table>
    </context>
</generatorConfiguration>
Running the reverse-engineering generator automatically produces the code in the dao, model, and mapper packages.
index.html
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8"/>
    <title>ECharts</title>
    <script src="ec/echarts.min.js"></script>
    <script src="ec/jquery.min.js"></script>
</head>
<body>
<!-- ECharts needs a DOM node with an explicit width and height -->
<div id="main" style="width: 600px;height:400px;"></div>
<script>
    $(document).ready(function(){
        // initialize the echarts instance on the prepared DOM node
        var myChart = echarts.init(document.getElementById('main'));
        // show a simple loading animation until the data arrives
        myChart.showLoading();
        var names=[];  // x-axis values (the version strings)
        var values=[]; // y-axis values (the user counts)
        $.ajax({
            type : "post",
            async : true, // asynchronous request (a synchronous one would block the browser until it completes)
            url : "data.json", // handled by getAndroidData in IndexController
            data : {},
            dataType : "json", // expect a JSON response
            success : function(result) {
                // runs on success; result is the JSON array returned by the server
                if (result) {
                    for(var i in result){
                        names.push(result[i].vs);
                        values.push(result[i].num);
                    }
                    myChart.hideLoading(); // hide the loading animation
                    myChart.setOption({    // load the data into the chart
                        tooltip: {},
                        legend: {
                            data:['Android users per app version']
                        },
                        xAxis: {
                            data: names
                        },
                        yAxis: {
                            type: 'value'
                        },
                        series: [{
                            // matched to the legend entry by name
                            name: 'Android users per app version',
                            type: 'bar',
                            data: values
                        }]
                    });
                }
            },
            error : function(errorMsg) {
                // runs if the request fails
                alert("Failed to load chart data!");
                myChart.hideLoading();
            }
        }); // end ajax
    });
</script>
</body>
</html>
IndexController
@Controller
public class IndexController {
    @Autowired
    IAndroid_tableService android_tableService;
    @Autowired // without its own @Autowired this field would stay null
    IOs_tableService iOs_tableService;

    @RequestMapping("")
    public String getIndex(){
        return "index";
    }

    @RequestMapping(value = "data.json")
    @ResponseBody
    public List<Android_table> getAndroidData(){
        List<Android_table> android_tables = android_tableService.selectByExample();
        System.out.println(android_tables);
        return android_tables;
    }
}
service
IAndroid_tableService
public interface IAndroid_tableService {
    List<Android_table> selectByExample();
}
Android_tableServiceImpl
@Service
public class Android_tableServiceImpl implements IAndroid_tableService {
    @Autowired
    Android_tableMapper android_tableMapper;

    @Override
    public List<Android_table> selectByExample() {
        Android_tableExample android_tableExample = new Android_tableExample();
        return android_tableMapper.selectByExample(android_tableExample);
    }
}
Real-time data processing
Requirement
Count, in real time, the number of online users in the Beijing area and the ratio of their operating systems (a user counts as online once they have logged in; identical ids are counted only once).
Flow
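The pipeline is: app log events are pushed as JSON into the Kafka topic bigdata, the Spark Streaming job below consumes them, de-duplicates uids through Redis, and writes the running counts back to Redis. For reference, a minimal sketch of an upstream producer; the broker list, topic name, and JSON layout come from the project files below, while the event body itself is illustrative.

import java.util.Properties

import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}

object LogProducer {
  def main(args: Array[String]): Unit = {
    val props = new Properties()
    props.put("bootstrap.servers", "hdp-1:9092,hdp-2:9092,hdp-3:9092")
    props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    val producer = new KafkaProducer[String, String](props)
    // one illustrative startup event in the project's JSON layout
    val event = """{"area":"beijing","uid":"121","os":"andriod","ch":"baidu","appid":"Hello-Vs","mid":"mid_162","type":"startup","vs":"1.3.1","ts":1576044095950}"""
    producer.send(new ProducerRecord[String, String]("bigdata", event))
    producer.close()
  }
}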
Project structure
pom.xml
<dependencies>
    <dependency>
        <groupId>org.codehaus.janino</groupId>
        <artifactId>janino</artifactId>
        <version>3.0.8</version>
    </dependency>
    <!-- Scala -->
    <dependency>
        <groupId>org.scala-lang</groupId>
        <artifactId>scala-library</artifactId>
        <version>${scala.version}</version>
    </dependency>
    <!-- Spark core -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-core_2.11</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <!-- hadoop-client API version -->
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
    <!-- MySQL -->
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <version>8.0.15</version>
    </dependency>
    <!-- Spark Streaming -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-streaming_2.11</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <!-- Redis -->
    <dependency>
        <groupId>redis.clients</groupId>
        <artifactId>jedis</artifactId>
        <version>2.9.0</version>
    </dependency>
    <!-- Kafka -->
    <dependency>
        <groupId>org.apache.kafka</groupId>
        <artifactId>kafka-clients</artifactId>
        <version>2.2.0</version>
    </dependency>
    <!-- HBase -->
    <dependency>
        <groupId>org.apache.hbase</groupId>
        <artifactId>hbase-client</artifactId>
        <version>1.2.1</version>
    </dependency>
    <!-- spark-streaming-kafka -->
    <!-- <dependency>-->
    <!--     <groupId>org.apache.spark</groupId>-->
    <!--     <artifactId>spark-streaming-kafka-0-8_2.11</artifactId>-->
    <!--     <version>2.4.4</version>-->
    <!-- </dependency>-->
    <!-- <dependency>-->
    <!--     <groupId>org.apache.spark</groupId>-->
    <!--     <artifactId>spark-streaming-kafka_2.11</artifactId>-->
    <!--     <version>1.6.3</version>-->
    <!-- </dependency>-->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
        <version>2.4.4</version>
    </dependency>
    <!-- Streaming KMeans -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-mllib_2.11</artifactId>
        <version>2.4.4</version>
    </dependency>
    <dependency>
        <groupId>com.alibaba</groupId>
        <artifactId>fastjson</artifactId>
    </dependency>
</dependencies>
config.properties
# Kafka settings
kafka.broker.list=hdp-1:9092,hdp-2:9092,hdp-3:9092
# Redis settings
redis.host=hdp-1
redis.port=6379
# auth flag: 0 = no password required; 1 = password required
redis.authFlag=1
redis.password=403411
utils (utility classes)
KafkaUtil
package com.zpark.sparksteam.utils

import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent

/**
 *
 * @Date 2019/12/11 9:05
 * @Version 1.0
 */
class KafkaUtil {
  def getKafka(ssc: StreamingContext, topic: String, groupId: String): InputDStream[ConsumerRecord[String, String]] = {
    // map the Kafka consumer parameters
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "hdp-1:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> groupId,
      "auto.offset.reset" -> "latest",
      "fetch.max.wait.ms" -> Integer.valueOf(500),
      "enable.auto.commit" -> java.lang.Boolean.valueOf(false)
    )
    val topics = Set(topic)
    // create the direct stream through KafkaUtils
    val stream: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream[String, String](
      ssc, PreferConsistent, Subscribe[String, String](topics, kafkaParams))
    stream
  }
}
PropertiesUtil (loads the configuration file)
package com.zpark.sparksteam.utils

import java.io.InputStreamReader
import java.util.Properties

object PropertiesUtil {
  def main(args: Array[String]): Unit = {
    val properties: Properties = PropertiesUtil.load("config.properties")
    println(properties.getProperty("kafka.broker.list"))
  }

  // load a properties file from the classpath
  def load(propertiesName: String): Properties = {
    val prop = new Properties()
    prop.load(new InputStreamReader(
      Thread.currentThread().getContextClassLoader.getResourceAsStream(propertiesName), "UTF-8"))
    prop
  }
}
RedisUtil (Redis connection pool utility)
public class RedisUtil {
    private static JedisPool jedisPool;
    private static Properties properties;
    private static String proPath = "config.properties";
    private static boolean authFlag;
    private static String password;

    static {
        // load the configuration file
        try {
            readConfigFile();
        } catch (IOException e) {
            e.printStackTrace();
        }
        // configure the connection pool
        JedisPoolConfig poolConfig = new JedisPoolConfig();
        poolConfig.setMaxIdle(20);
        poolConfig.setMinIdle(10);
        poolConfig.setMaxTotal(30);
        poolConfig.setMaxWaitMillis(3000);
        poolConfig.setTestOnBorrow(true);
        poolConfig.setTestOnReturn(true);
        // pass poolConfig to the pool; otherwise the settings above are ignored
        jedisPool = new JedisPool(poolConfig, properties.getProperty("redis.host"),
                Integer.valueOf(properties.getProperty("redis.port")));
        // check whether password authentication is required
        if ("1".equals(properties.getProperty("redis.authFlag"))) {
            authFlag = true;
            password = properties.getProperty("redis.password");
            if (password == null) {
                new Exception("redis password not found in the config file!").printStackTrace();
            }
        }
    }

    private static Properties readConfigFile() throws IOException {
        // RedisUtil.class.getClass() would resolve against the bootstrap class
        // loader and never find the file; load it relative to RedisUtil instead
        InputStream in = RedisUtil.class.getResourceAsStream("/" + proPath);
        if (in == null) {
            throw new IOException("cannot find the configuration file: " + proPath);
        }
        properties = new Properties();
        properties.load(in);
        in.close();
        return properties;
    }

    public static Jedis getJedisClient() {
        Jedis jedis = jedisPool.getResource();
        if (authFlag) {
            jedis.auth(password);
        }
        return jedis;
    }

    public static void close() {
        jedisPool.close();
    }
}
domain
Arealog
case class Arealog (area: String, sum: Int)
OSlog
case class OSlog (os: String, sum: Int)
Startuplog
case class Startuplog (uid:String, area:String, os:String)
application
SparkStreamDemo
import java.text.SimpleDateFormat
import java.util
import java.util.Date

import com.alibaba.fastjson.JSON
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import redis.clients.jedis.Jedis

object SparkStreamDemo {
  Logger.getLogger("org").setLevel(Level.WARN)

  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setAppName("SparkStreamDemo").setMaster("local[*]")
    val ssc: StreamingContext = new StreamingContext(conf, Seconds(7))
    // read data from Kafka
    val kafkaUtil: KafkaUtil = new KafkaUtil
    val kafkaValues: InputDStream[ConsumerRecord[String, String]] = kafkaUtil.getKafka(ssc, "bigdata", "spark")
    // kafkaValues.window(Seconds(14), Seconds(7))
    // transformation
    /**
     * Sample input:
     * {"area":"tianjin","uid":"121","os":"andriod","ch":"baidu","appid":"Hello-Vs","mid":"mid_162","type":"startup","vs":"1.3.1","ts":1576044095950}
     * {"area":"guangdong","uid":"160","os":"andriod","ch":"tencent","appid":"Hello-Vs","mid":"mid_476","type":"startup","vs":"1.1.1","ts":1576044096189}
     */
    val startuplogStream: DStream[Startuplog] = kafkaValues.map(rdd => {
      val values: String = rdd.value()
      val startuplog: Startuplog = JSON.parseObject(values, classOf[Startuplog])
      startuplog
    }).filter(_.area.equals("beijing")) // e.g. Startuplog(37,beijing,andriod)
    startuplogStream.print()
    // use Redis to filter out uids already seen today;
    // part of this runs on the driver and part on the executors, hence the transform operator
    val filteredDstream: DStream[Startuplog] = startuplogStream.transform { rdd =>
      println("before filtering: " + rdd.count())
      // runs periodically on the driver
      val curdate: String = new SimpleDateFormat("yyyy-MM-dd").format(new Date)
      // RedisUtil is the Java utility class defined above
      val jedis: Jedis = RedisUtil.getJedisClient
      val key = "dau:" + curdate
      /**
       * A Redis Set is an unordered collection of strings. Like a List it
       * supports adding, removing, and membership tests, and these operations
       * run in O(1) time; a Set can hold up to 4294967295 elements. Unlike a
       * List, a Set never contains duplicates: adding the same element again
       * keeps only one copy, which is exactly what uid de-duplication needs.
       */
      val dauSet: util.Set[String] = jedis.smembers(key)
      jedis.close()
      // broadcast the set of already-seen uids to the executors
      val dauBC: Broadcast[util.Set[String]] = ssc.sparkContext.broadcast(dauSet)
      // drop records whose uid has already been seen today
      val filteredRDD: RDD[Startuplog] = rdd.filter { startuplog =>
        // executor
        val dauSet: util.Set[String] = dauBC.value
        !dauSet.contains(startuplog.uid)
      }
      println("after filtering: " + filteredRDD.count())
      filteredRDD
    }
    // de-duplicate within the batch: group records by uid and take the first of each group
    val groupbyUidDstream: DStream[(String, Iterable[Startuplog])] = filteredDstream.map(startuplog => (startuplog.uid, startuplog)).groupByKey()
    val distinctDstream: DStream[Startuplog] = groupbyUidDstream.flatMap { case (uid, startuplogItr) =>
      startuplogItr.take(1)
    }
    // record the new uids in the day's Redis set; without this step the filter
    // above never learns about them and the same uid would be counted again next batch
    distinctDstream.foreachRDD { rdd =>
      rdd.foreachPartition { iter =>
        val jedis: Jedis = RedisUtil.getJedisClient
        val key = "dau:" + new SimpleDateFormat("yyyy-MM-dd").format(new Date)
        for (startuplog <- iter) {
          jedis.sadd(key, startuplog.uid)
        }
        jedis.close()
      }
    }
    // count this batch's newly online users in the Beijing area
    val areaValues: DStream[Arealog] = distinctDstream.map(arr => {
      (arr.area, 1)
    }).reduceByKey(_ + _).map(arr => {
      Arealog(arr._1, arr._2)
    })
    // save the running total of online users per area to Redis
    areaValues.foreachRDD { rdd =>
      rdd.foreachPartition { iter =>
        // executor
        val jedis: Jedis = RedisUtil.getJedisClient
        val list: List[Arealog] = iter.toList
        for (arealog <- list) {
          val key = "dau:" + arealog.area
          // accumulate: each batch contributes its count of newly seen users
          // (sadd would collapse equal per-batch counts into one set member and lose data)
          jedis.incrBy(key, arealog.sum)
        }
        jedis.close()
      }
    }
    // count the operating systems of this batch's newly online users
    val osValues: DStream[OSlog] = distinctDstream.map(arr => {
      (arr.os, 1)
    }).reduceByKey(_ + _).map(arr => {
      OSlog(arr._1, arr._2)
    })
    // save the running per-OS totals to Redis
    osValues.foreachRDD { rdd =>
      rdd.foreachPartition { iter =>
        // executor
        val jedis: Jedis = RedisUtil.getJedisClient
        val list: List[OSlog] = iter.toList
        for (oslog <- list) {
          val key = "dau:" + oslog.os
          // accumulate the running total, as for the area counts above
          jedis.incrBy(key, oslog.sum)
        }
        jedis.close()
      }
    }
    ssc.start()
    ssc.awaitTermination()
  }
}
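To chart the real-time numbers, a display layer only needs to read the counters back out of Redis. A minimal sketch of such a consumer, reusing RedisUtil; the key names match the ones written by the job above, while the printed labels are purely illustrative.

import redis.clients.jedis.Jedis

object RedisReadDemo {
  def main(args: Array[String]): Unit = {
    val jedis: Jedis = RedisUtil.getJedisClient
    // running total of distinct users online in Beijing today
    println("online in beijing: " + jedis.get("dau:beijing"))
    // per-OS totals; note the sample data spells the value "andriod"
    println("android users: " + jedis.get("dau:andriod"))
    jedis.close()
  }
}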