1. Building the Project
1.1 Creating the Spring Boot Project
I used IDEA for this project. When creating the project, choose Spring Initializr, select Maven for dependency management, fill in the project information, and click Next through the rest of the wizard. This produces a Spring project with the standard directory structure.
1.2 pom.xml
Since the project uses Maven for dependency management, the third-party dependencies naturally go into pom.xml. The complete pom.xml from my project is shown below. (If you copy it, remember to change the groupId, artifactId, name, and other project information, as well as the Java version.)
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-parent</artifactId>
<version>2.4.4</version>
<relativePath/> <!-- lookup parent from repository -->
</parent>
<groupId>cn.edu.neu</groupId>
<artifactId>movie</artifactId>
<version>0.0.1-SNAPSHOT</version>
<name>movie</name>
<description>movie recommend system</description>
<properties>
<java.version>1.8</java.version>
<scala.version>2.12</scala.version>
<flink.version>1.12.0</flink.version>
<encoding>UTF-8</encoding>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
</properties>
<dependencies>
<!-- spring boot -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
<scope>test</scope>
</dependency>
<!-- flink -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-clients_2.12</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-scala_2.12</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-java</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-scala_2.12</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-java_2.12</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-api-scala-bridge_2.12</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-api-java-bridge_2.12</artifactId>
<version>${flink.version}</version>
</dependency>
<!-- Flink table planner (the legacy planner, used before 1.9) -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-planner_2.12</artifactId>
<version>${flink.version}</version>
</dependency>
<!-- Blink table planner (the default since 1.11) -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-planner-blink_2.12</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-common</artifactId>
<version>${flink.version}</version>
</dependency>
<!-- Flink connectors -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-kafka_2.12</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-sql-connector-kafka_2.12</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-jdbc_2.12</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-csv</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-json</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-hive_2.12</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-shaded-hadoop-2-uber</artifactId>
<version>2.7.5-10.0</version>
</dependency>
<dependency>
<groupId>org.apache.kafka</groupId>
<artifactId>kafka-clients</artifactId>
<version>2.0.0</version>
</dependency>
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-metastore</artifactId>
<version>3.1.2</version>
</dependency>
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-exec</artifactId>
<version>3.1.2</version>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-client</artifactId>
<version>2.3.2</version>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.49</version>
</dependency>
<!-- High-performance async toolkit: Vert.x -->
<dependency>
<groupId>io.vertx</groupId>
<artifactId>vertx-core</artifactId>
<version>3.9.0</version>
</dependency>
<dependency>
<groupId>io.vertx</groupId>
<artifactId>vertx-jdbc-client</artifactId>
<version>3.9.0</version>
</dependency>
<dependency>
<groupId>io.vertx</groupId>
<artifactId>vertx-redis-client</artifactId>
<version>3.9.0</version>
</dependency>
<!-- Logging -->
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
<version>1.7.7</version>
<scope>runtime</scope>
</dependency>
<dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
<version>1.2.12</version>
<scope>runtime</scope>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.44</version>
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.18.2</version>
<scope>provided</scope>
</dependency>
</dependencies>
<build>
<sourceDirectory>src/main/java</sourceDirectory>
<plugins>
<plugin>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-maven-plugin</artifactId>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.1</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
<!--<encoding>${project.build.sourceEncoding}</encoding>-->
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>2.12.4</version>
<configuration>
<useFile>false</useFile>
<disableXmlReport>true</disableXmlReport>
<includes>
<include>**/*Test.*</include>
<include>**/*Suite.*</include>
</includes>
</configuration>
</plugin>
<!-- Packaging plugin (bundles all dependencies into the jar) -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<!-- zip -d learn_spark.jar META-INF/*.RSA META-INF/*.DSA META-INF/*.SF -->
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
<transformers>
<transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<!-- Set the jar's main class here (optional) -->
</transformer>
</transformers>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>
2. Setting Up the Database
My project uses MySQL 5.7.31. If your version is 8 or later, be sure to change the JDBC dependency version in pom.xml and the JDBC Driver class in the backend code (MySQL 8 and later use com.mysql.cj.jdbc.Driver instead of com.mysql.jdbc.Driver). Simply set the static DRIVER field of the DataBase class in the dao package to "com.mysql.cj.jdbc.Driver".
First, create a database named movie: CREATE DATABASE IF NOT EXISTS movie. This database contains the following tables:
hot: stores the recent hot-movie data that Flink computes in real time
movies: stores the movie data
ratings: stores the rating data
ts_ratings: stores the number of ratings received in each ten-second window, for the last ten windows
Since this is a small project, the tables have no foreign keys or other constraints; add them yourself if you need them.
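The article never shows the table definitions, so here is a minimal DDL sketch. The movies and ratings columns are inferred from the MovieBean and RatingBean classes and the insert statement shown later; the hot and ts_ratings layouts are my assumptions based on the descriptions above, so adjust them to match your own Flink jobs.
CREATE TABLE movies (
    movieId INT PRIMARY KEY,
    title VARCHAR(255),
    genres VARCHAR(64),
    avgRating DOUBLE,
    `count` INT
);
CREATE TABLE ratings (
    userId INT,
    movieId INT,
    rating DOUBLE
);
-- Assumed layout: one row per recently hot movie.
CREATE TABLE hot (
    movieId INT,
    title VARCHAR(255),
    `count` INT
);
-- Assumed layout: one row per ten-second window.
CREATE TABLE ts_ratings (
    ts BIGINT,
    `count` INT
);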
3. Preparing the Data
3.1 Initializing the Database
First, download the dataset from the MovieLens website. We use movies.csv from the MovieLens dataset to initialize the movie database. Once downloaded, place movies.csv under resources/data.
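Note that MovieDataset below splits each line on ";" rather than ",". The official MovieLens movies.csv is comma-separated and movie titles themselves contain commas, so I assume the file has been preprocessed into a semicolon-separated layout (movieId;title;genres) before being placed under resources/data.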
Let's first look at two utility classes, Dataset and its subclass MovieDataset:
import java.util.Scanner;

// Generic dataset reader: wraps a Scanner over a classpath resource and
// hands out one parsed record at a time.
public abstract class Dataset<T> {
    protected Scanner in;

    public Dataset(String url) {
        in = new Scanner(Dataset.class.getResourceAsStream(url));
    }

    public boolean hasNext() {
        return in != null && in.hasNextLine();
    }

    public abstract T next();
}
public class MovieDataset extends Dataset<MovieBean> {

    public MovieDataset(String url) {
        super(url);
    }

    @Override
    public MovieBean next() {
        if (hasNext()) {
            String csv = in.nextLine();
            if (!csv.isEmpty()) {
                // Each line is semicolon-separated: movieId;title;genres
                return (MovieBean) new MovieBean().initFromCSV(csv, ";");
            } else return next(); // skip blank lines
        } else return null;
    }
}
As you can see, MovieDataset reads the given CSV file line by line and uses each line to initialize and return a MovieBean object.
Now let's look at the MovieBean class:
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;

@Data
@NoArgsConstructor
@AllArgsConstructor
public class MovieBean implements CSVObjectInitializable {
    private int movieId;
    private String title;
    private String genres;
    private double avgRating;
    private int count;

    @Override
    public CSVObjectInitializable initFromCSV(String csv, String sep) {
        String[] tokens = csv.trim().split(sep);
        try {
            movieId = Integer.parseInt(tokens[0]);
        } catch (NumberFormatException e) {
            e.printStackTrace();
            movieId = -1; // mark unparseable ids
        }
        title = tokens[1];
        // Keep only the first genre of the pipe-separated list.
        genres = tokens[2].split("\\|")[0];
        avgRating = 0;
        count = 0;
        return this;
    }
}
As you can see, one MovieBean corresponds to one complete row of the movies table, while the CSVObjectInitializable interface marks a class as initializable from a line of CSV data.
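The interface itself is not shown in this article; a minimal sketch consistent with how MovieBean uses it might look like this:
// Hypothetical reconstruction of CSVObjectInitializable, inferred from its
// usage in MovieBean; the original interface is not shown in the article.
public interface CSVObjectInitializable {
    CSVObjectInitializable initFromCSV(String csv, String sep);
}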
Database initialization clears all the tables and then populates only the movies table; the other tables are filled while the program runs. The code is as follows:
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.Statement;

public class DataBase {
    public static final String DRIVER = "com.mysql.jdbc.Driver";
    public static final String URL = "jdbc:mysql://localhost:3306/movie?characterEncoding=UTF-8";
    public static final String USR = "root";
    public static final String PSW = "root";

    public static void init() {
        try {
            Class.forName(DRIVER);
            Connection conn = DriverManager.getConnection(URL, USR, PSW);
            // Clear all tables first.
            Statement s = conn.createStatement();
            s.execute("delete from movies");
            s.execute("delete from ratings");
            s.execute("delete from hot");
            s.execute("delete from ts_ratings");
            s.close();
            // Re-populate movies from the CSV dataset.
            PreparedStatement ps = conn.prepareStatement("insert into movies values (?, ?, ?, ?, ?)");
            Dataset<MovieBean> movies = new MovieDataset("/data/movies.csv");
            while (movies.hasNext()) {
                MovieBean movie = movies.next();
                ps.setInt(1, movie.getMovieId());
                ps.setString(2, movie.getTitle());
                ps.setString(3, movie.getGenres());
                ps.setDouble(4, movie.getAvgRating());
                ps.setInt(5, movie.getCount());
                ps.execute();
            }
            ps.close();
            conn.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    ...
}
DataSimulator then calls DataBase's init method, so you can choose whether to initialize the database. The code is as follows:
public class DataSimulator {
public static void main(String[] args) {
Scanner in = new Scanner(System.in);
System.out.print(">>> ");
String code = in.next();
if (code.equals("init")) {
initDataBase();
Message.info("Database init finished");
} else produce();
}
public static void produce() {
...
}
public static void initDataBase() {
DataBase.init();
}
}
3.2 Creating the Kafka Topic
This project uses Kafka as its message queue, so the corresponding topic needs to be created.
After installing Kafka and starting ZooKeeper, go to the Kafka root directory and start the Kafka server with the following command:
kafka-server-start.sh config/server.properties
Then create a topic named movie with the following command:
kafka-topics.sh --create --bootstrap-server localhost:9092 --replication-factor 1 --partitions 1 --topic movie
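If you want to double-check, listing the topics with kafka-topics.sh --list --bootstrap-server localhost:9092 should now show movie.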
3.3 The Data Simulator DataSimulator (Optional)
The data produced in this project is rating data, RatingBean, which has three attributes: userId, movieId, and rating. Since this project is a real-time statistics platform for movie data, manually submitting ratings one at a time through the frontend is clearly impractical for a demo. It is therefore worth writing a data simulator that can mass-produce random ratings in a short time.
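RatingBean itself is not listed in this article; a minimal sketch consistent with how the simulator constructs it (a Lombok bean with the three attributes above) might look like this:
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;

// Hypothetical sketch of RatingBean, inferred from its usage in the
// simulator code below; the original class is not shown in the article.
@Data
@NoArgsConstructor
@AllArgsConstructor
public class RatingBean {
    private int userId;
    private int movieId;
    private double rating;
}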
The core class of the rating simulator is Simulator. It randomly pairs a user id with a movie id and draws the rating from a Gaussian distribution centered at 3.2. The code is as follows:
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Random;

public class Simulator implements Serializable {
    private ArrayList<Integer> movies;
    private int users;
    private boolean inited;
    private int movieIdx;

    public Simulator() {
        this.movies = new ArrayList<>();
        // Load all movie ids from the CSV dataset.
        Dataset<MovieBean> dataset = new MovieDataset("/data/movies.csv");
        while (dataset.hasNext()) {
            MovieBean movie = dataset.next();
            if (movie != null) movies.add(movie.getMovieId());
        }
        this.users = 0;
        this.inited = false;
    }

    public Simulator run(int users) {
        this.users = users;
        this.inited = true;
        return this;
    }

    // Draw a rating from a Gaussian centered at 3.2, rounded to an integer
    // and re-drawn until it falls in [1, 5].
    public double nextRating(Random random) {
        double rating = (double) Math.round(Math.abs(random.nextGaussian() + 3.2));
        while (rating < 1 || rating > 5) {
            rating = (double) Math.round(Math.abs(random.nextGaussian() + 3.2));
        }
        return rating;
    }

    public RatingBean next() {
        if (inited) {
            Random random = new Random();
            int userId = random.nextInt(users) + 1;
            // With 50% probability pick a new random movie, otherwise repeat
            // the previous one, so that some movies become "hot".
            movieIdx = random.nextDouble() > 0.5 ? random.nextInt(movies.size()) : movieIdx;
            double rating = nextRating(random);
            return new RatingBean(userId, movies.get(movieIdx), rating);
        } else return null;
    }
}
Then create a Flink source class:
import java.util.Random;

import org.apache.flink.streaming.api.functions.source.RichParallelSourceFunction;
import org.apache.flink.streaming.api.functions.source.SourceFunction.SourceContext;

public class SimSource extends RichParallelSourceFunction<RatingBean> {
    private Simulator simulator = new Simulator().run(100);
    private boolean flag = true;

    @Override
    public void run(SourceContext<RatingBean> sourceContext) throws Exception {
        while (flag) {
            RatingBean bean = simulator.next();
            if (bean != null) {
                // Wait a random 500-2500 ms between ratings.
                Random random = new Random();
                long gap = random.nextInt(3) * 1000 + 500;
                Thread.sleep(gap);
                sourceContext.collect(bean);
            } else break;
        }
    }

    @Override
    public void cancel() {
        flag = false;
    }
}
Finally, DataSimulator uses this Flink source to send data to Kafka's movie topic. The complete DataSimulator code is as follows:
import java.util.Properties;
import java.util.Scanner;

import com.alibaba.fastjson.JSON;
import org.apache.flink.api.common.RuntimeExecutionMode;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer;

public class DataSimulator {
    public static void main(String[] args) {
        Scanner in = new Scanner(System.in);
        System.out.print(">>> ");
        String code = in.next();
        if (code.equals("init")) {
            initDataBase();
            Message.info("Database init finished"); // Message is the project's own logging helper
        } else produce();
    }

    public static void produce() {
        try {
            StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
            env.setRuntimeMode(RuntimeExecutionMode.AUTOMATIC);
            DataStreamSource<RatingBean> ds = env.addSource(new SimSource()).setParallelism(2);
            SingleOutputStreamOperator<String> jsonDS = ds.map((MapFunction<RatingBean, String>) JSON::toJSONString);
            // Print to the console.
            jsonDS.print();
            // Create the Kafka producer.
            Properties prop = new Properties();
            prop.setProperty("bootstrap.servers", "localhost:9092");
            FlinkKafkaProducer<String> kafka = new FlinkKafkaProducer<>("movie", new SimpleStringSchema(), prop);
            jsonDS.addSink(kafka);
            env.execute();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public static void initDataBase() {
        DataBase.init();
    }
}
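To check everything end to end, start DataSimulator, type anything other than init at the >>> prompt, and the console should print a JSON-encoded rating roughly every 0.5 to 2.5 seconds. You can also watch the topic directly with kafka-console-consumer.sh --bootstrap-server localhost:9092 --topic movie.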