大数据电商数仓(5.0)相关文档
一、虚拟机的准备
1、进入虚拟机后,配置网卡(静态IP)和主机名称
#配置网卡信息,保证虚拟机可以连接网络
vim /etc/sysconfig/network-scripts/ifcfg-ens33
#编写完配置信息后wq保存,重启网卡
service network restart
网卡配置信息(ifcfg-ens33)
BOOTPROTO=static
ONBOOT=yes
#ip地址
IPADDR=192.168.147.11
#子网掩码
NETMASK=255.255.255.0
#网关
GATEWAY=192.168.147.2
#dns配置
DNS1=8.8.8.8
2、配置主机名称
#编辑 /etc/hostname 文件,直接写入主机名即可(本集群第一台机器为hadoop102)
vim /etc/hostname
hadoop102
3、配置/etc/hosts文件,同时也需要配置Windows的hosts文件
#打开hosts文件后,将下面信息粘贴进去,并且在Windows对应目录下的hosts文件中也粘贴同样的内容
vim /etc/hosts
192.168.147.100 hadoop100
192.168.147.102 hadoop102
192.168.147.103 hadoop103
192.168.147.104 hadoop104
#Windows中hosts文件的位置
C:\Windows\System32\drivers\etc\hosts
4、关闭防火墙和防火墙开机自启动
systemctl stop firewalld
systemctl disable firewalld.service
5、修改dns解析地址,为防止部分镜像下载不成功
vim /etc/resolv.conf
# 将内容换成公共DNS解析地址(此处为谷歌的8.8.8.8,也可换成阿里的223.5.5.5)
nameserver 8.8.8.8
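配置完成后,可以用下面的命令做一个简单的连通性验证(仅为示意,域名可以换成任意可访问的地址):
#查看ens33网卡是否已获取到配置的静态IP
ip addr show ens33
#测试外网连通性与DNS解析是否正常
ping -c 3 www.baidu.com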
二、配置Java环境
首先上传JDK 1.8的.rpm安装包
#安装rpm包
rpm -ivh jdk-8u221-linux-x64.rpm
#进入到/usr/java 目录下
cd /usr/java
#编写java环境变量
vim /etc/profile
#编写完后执行脚本
source /etc/profile
在/etc/profile文件中将下列配置信息粘贴到最下面
export JAVA_HOME=/usr/java/default
export PATH=$PATH:$JAVA_HOME/bin
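环境变量生效后,可以用下面的命令简单验证JDK是否安装成功(示意):
java -version
#预期输出中包含类似 java version "1.8.0_221" 的版本信息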
三、克隆虚拟机
#克隆虚拟机,分别以hadoop103、hadoop104命名,并且修改好对应的IP地址
vim /etc/sysconfig/network-scripts/ifcfg-ens33
#修改主机名称
vi /etc/hostname
在克隆的主机上,只需要修改这两个地方即可。
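下面给出克隆机上需要修改内容的一个示意(IP按照/etc/hosts中的规划填写,具体以自己的网段为准):
#hadoop103的ifcfg-ens33中只需修改IP地址
IPADDR=192.168.147.103
#hadoop104的ifcfg-ens33中只需修改IP地址
IPADDR=192.168.147.104
#对应的/etc/hostname分别写入 hadoop103 和 hadoop104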
四、免密登录
#编辑ssh客户端配置文件
vim /etc/ssh/ssh_config
#将信息修改成下列
StrictHostKeyChecking no
#将配置文件发送到其他服务器上(hadoop104同理)
scp /etc/ssh/ssh_config hadoop103:/etc/ssh/
#生成ssh密钥
ssh-keygen -t dsa -P '' -f ~/.ssh/id_dsa
#将每台服务器的公钥拷贝到authorized_keys文件中
cat id_dsa.pub >> authorized_keys
#所有服务器拷贝完成后,将完整的公钥分发给所有机器
scp authorized_keys 192.168.147.12:`pwd` #ip地址需要更换到自己服务器的ip
#完成后可以用ssh连接其他虚拟机测试
ssh hadoop103
五、安装Hadoop
1、Hadoop规划部署(版本hadoop3.1.3)
| | hadoop102 | hadoop103 | hadoop104 |
|---|---|---|---|
| HDFS | NameNode、DataNode | DataNode | SecondaryNameNode、DataNode |
| YARN | NodeManager | ResourceManager、NodeManager | NodeManager |
2、配置hadoop环境
#先将Hadoop上传到服务器中,然后对压缩包进行解压
tar -zxvf hadoop-3.1.3.tar.gz -C /opt/module/
#将解压后的文件夹改名
mv /opt/module/hadoop-3.1.3 /opt/module/hadoop
#将Hadoop添加到环境变量中去
vim /etc/profile
环境变量配置
export HADOOP_HOME=/opt/module/hadoop
export PATH=$PATH:$JAVA_HOME/bin:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
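修改完/etc/profile后同样需要使其生效,并可以用版本命令做一个简单验证(示意):
source /etc/profile
hadoop version
#预期输出中包含 Hadoop 3.1.3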
3、核心配置文件
1、core-site.xml配置文件
<configuration>
<!-- 指定NameNode的地址 -->
<property>
<name>fs.defaultFS</name>
<value>hdfs://hadoop102:8020</value>
</property>
<!-- 指定hadoop数据的存储目录 -->
<property>
<name>hadoop.tmp.dir</name>
<value>/opt/module/hadoop/data</value>
</property>
<!-- 配置HDFS网页登录使用的静态用户为root -->
<property>
<name>hadoop.http.staticuser.user</name>
<value>root</value>
</property>
<!-- 配置root(superUser)允许通过代理访问的主机节点 -->
<property>
<name>hadoop.proxyuser.root.hosts</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.root.groups</name>
<value>root</value>
</property>
</configuration>
2、hdfs-site.xml配置文件
<configuration>
<!-- nn web端访问地址-->
<property>
<name>dfs.namenode.http-address</name>
<value>hadoop102:9870</value>
</property>
<!-- 2nn web端访问地址-->
<property>
<name>dfs.namenode.secondary.http-address</name>
<value>hadoop104:9868</value>
</property>
<!-- 指定HDFS副本的数量为3 -->
<property>
<name>dfs.replication</name>
<value>3</value>
</property>
</configuration>
3、mapred-site.xml配置文件
<configuration>
<!-- 指定MapReduce程序运行在Yarn上 -->
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
<!-- 历史服务器端地址 -->
<property>
<name>mapreduce.jobhistory.address</name>
<value>hadoop102:10020</value>
</property>
<!-- 历史服务器web端地址 -->
<property>
<name>mapreduce.jobhistory.webapp.address</name>
<value>hadoop102:19888</value>
</property>
</configuration>
4、yarn-site.xml配置文件
<configuration>
<!-- Site specific YARN configuration properties -->
<!-- 指定MR走shuffle -->
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<!-- 指定ResourceManager的地址-->
<property>
<name>yarn.resourcemanager.hostname</name>
<value>hadoop103</value>
</property>
<!-- 环境变量的继承 -->
<property>
<name>yarn.nodemanager.env-whitelist</name>
<value>JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_MAPRED_HOME</value>
</property>
<!-- yarn单个容器允许分配的最小、最大内存 -->
<property>
<name>yarn.scheduler.minimum-allocation-mb</name>
<value>512</value>
</property>
<property>
<name>yarn.scheduler.maximum-allocation-mb</name>
<value>4096</value>
</property>
<!-- yarn容器允许管理的物理内存大小 -->
<property>
<name>yarn.nodemanager.resource.memory-mb</name>
<value>4096</value>
</property>
<!-- 关闭yarn对虚拟内存的限制检查(物理内存检查保留) -->
<property>
<name>yarn.nodemanager.pmem-check-enabled</name>
<value>true</value>
</property>
<property>
<name>yarn.nodemanager.vmem-check-enabled</name>
<value>false</value>
</property>
<!-- 开启日志聚集功能 -->
<property>
<name>yarn.log-aggregation-enable</name>
<value>true</value>
</property>
<!-- 设置日志聚集服务器地址 -->
<property>
<name>yarn.log.server.url</name>
<value>http://hadoop102:19888/jobhistory/logs</value>
</property>
<!-- 设置日志保留时间为7天 -->
<property>
<name>yarn.log-aggregation.retain-seconds</name>
<value>604800</value>
</property>
</configuration>
5、workers配置文件
hadoop102
hadoop103
hadoop104
6、hadoop-env.sh配置文件
#在该文件中需要配置Java的路径,否则Hadoop找不到Java路径
export JAVA_HOME=/usr/java/default
7、权限配置
start-dfs.sh 和 stop-dfs.sh两个文件
#在/opt/module/hadoop/sbin目录中,需要对start-dfs.sh 和 stop-dfs.sh 添加以下参数
vim /opt/module/hadoop/sbin/start-dfs.sh
or
vim /opt/module/hadoop/sbin/stop-dfs.sh
#将下列配置输入进去
#旧版配置
HDFS_DATANODE_USER=root
HADOOP_SECURE_DN_USER=hdfs
HDFS_NAMENODE_USER=root
HDFS_SECONDARYNAMENODE_USER=root
#新版配置(无警告)
HDFS_DATANODE_USER=root
HDFS_DATANODE_SECURE_USER=hdfs
HDFS_NAMENODE_USER=root
HDFS_SECONDARYNAMENODE_USER=root
start-yarn.sh 和 stop-yarn.sh两个文件
#在/opt/module/hadoop/sbin目录中,需要对start-yarn.sh 和 stop-yarn.sh 添加以下参数
vim /opt/module/hadoop/sbin/start-yarn.sh
or
vim /opt/module/hadoop/sbin/stop-yarn.sh
YARN_RESOURCEMANAGER_USER=root
HADOOP_SECURE_DN_USER=yarn
YARN_NODEMANAGER_USER=root
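配置文件和启停脚本修改完成后,需要把整个hadoop目录以及/etc/profile分发到另外两台机器(以下scp命令仅为示意,也可以换成自己的分发脚本):
scp -r /opt/module/hadoop hadoop103:/opt/module/
scp -r /opt/module/hadoop hadoop104:/opt/module/
scp /etc/profile hadoop103:/etc/
scp /etc/profile hadoop104:/etc/
#分发后在每台机器上执行 source /etc/profile 使环境变量生效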
4、Hadoop启动
1、Hadoop启动
#第一次启动,需要在hadoop102上对NameNode进行格式化
hdfs namenode -format #仅第一次启动前执行,后续无需再执行
#在hadoop102上启动/停止HDFS
start-dfs.sh
stop-dfs.sh
#在hadoop103上启动/停止YARN
start-yarn.sh
stop-yarn.sh
#集群一键启动/停止脚本(见下文)
hdp.sh start
hdp.sh stop
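启动完成后,可以在各节点上执行jps,检查进程是否与前面的部署规划一致(示意):
jps
#hadoop102上预期有:NameNode、DataNode、NodeManager(以及历史服务器JobHistoryServer)
#hadoop103上预期有:ResourceManager、NodeManager、DataNode
#hadoop104上预期有:SecondaryNameNode、DataNode、NodeManager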
2、编写脚本启动Hadoop集群
在/root/bin目录中创建一个hdp.sh文件
#!/bin/bash
if [ $# -lt 1 ]
then
echo "No Args Input..."
exit ;
fi
case $1 in
"start")
echo " =================== 启动 hadoop集群 ==================="
echo " --------------- 启动 hdfs ---------------"
ssh hadoop102 "/opt/module/hadoop/sbin/start-dfs.sh"
echo " --------------- 启动 yarn ---------------"
ssh hadoop103 "/opt/module/hadoop/sbin/start-yarn.sh"
echo " --------------- 启动 historyserver ---------------"
ssh hadoop102 "/opt/module/hadoop/bin/mapred --daemon start historyserver"
;;
"stop")
echo " =================== 关闭 hadoop集群 ==================="
echo " --------------- 关闭 historyserver ---------------"
ssh hadoop102 "/opt/module/hadoop/bin/mapred --daemon stop historyserver"
echo " --------------- 关闭 yarn ---------------"
ssh hadoop103 "/opt/module/hadoop/sbin/stop-yarn.sh"
echo " --------------- 关闭 hdfs ---------------"
ssh hadoop102 "/opt/module/hadoop/sbin/stop-dfs.sh"
;;
*)
echo "Input Args Error..."
;;
esac
给文件hdp.sh加权限
chmod 777 hdp.sh
3、解决Hadoop104上SecondaryNameNode网页显示异常的问题
#在/opt/module/hadoop/share/hadoop/hdfs/webapps/static目录中寻找dfs-dust.js文件,进行编辑
vim /opt/module/hadoop/share/hadoop/hdfs/webapps/static/dfs-dust.js
#原文件中函数方法
'date_tostring' : function (v) {
return moment(Number(v)).format('ddd MMM DD HH:mm:ss ZZ YYYY');
}
#替换后的函数方法
'date_tostring' : function (v) {
  return new Date(Number(v)).toLocaleString();
}
六、安装zookeeper(3.5.7)
在配置zookeeper之前需要将其他主机用ssh连接,确保能够正常访问。
1、安装环境
先将apache-zookeeper-3.5.7-bin.tar.gz解压到 /opt/module目录下
#解压文件并改名
tar -zxvf apache-zookeeper-3.5.7-bin.tar.gz -C /opt/module/
mv /opt/module/apache-zookeeper-3.5.7-bin /opt/module/zookeeper
#创建存放节点编号的文件夹,并写入myid(hadoop102对应zoo.cfg中的server.2,编号为2)
mkdir -p /opt/module/zookeeper/zkData
cd /opt/module/zookeeper/zkData
echo 2 > myid
#进入到zookeeper配置文件中,并修改配置文件
cd /opt/module/zookeeper/conf
cp zoo_sample.cfg zoo.cfg
vim zoo.cfg
zoo.cfg配置文件
dataDir=/opt/module/zookeeper/zkData
server.2=hadoop102:2888:3888
server.3=hadoop103:2888:3888
server.4=hadoop104:2888:3888
修改完配置文件后,需要将zookeeper分发到Hadoop103和Hadoop104上,并修改机器上的myid编号
#将zookeeper进行分发
scp -r /opt/module/zookeeper/ hadoop103:/opt/module/
#其他机器进入zookeeper目录
cd /opt/module/zookeeper/zkData/
#修改myid中对应的编号(hadoop103为3,hadoop104为4)
echo 3 > myid
#查看myid文件
cat myid
2、zookeeper启动
#zookeeper 编写脚本启动
vim /root/bin/zk.sh
#设置zk.sh的权限
chmod 777 zk.sh
#zookeeper的启动
zk.sh start
#zookeeper的停止
zk.sh stop
#zookeeper查看状态
zk.sh status
zk.sh配置文件
#!/bin/bash
case $1 in
"start"){
for i in hadoop102 hadoop103 hadoop104
do
echo ---------- zookeeper $i 启动 ------------
ssh $i "/opt/module/zookeeper/bin/zkServer.sh start"
done
};;
"stop"){
for i in hadoop102 hadoop103 hadoop104
do
echo ---------- zookeeper $i 停止 ------------
ssh $i "/opt/module/zookeeper/bin/zkServer.sh stop"
done
};;
"status"){
for i in hadoop102 hadoop103 hadoop104
do
echo ---------- zookeeper $i 状态 ------------
ssh $i "/opt/module/zookeeper/bin/zkServer.sh status"
done
};;
esac
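集群启动后,也可以在任意一台机器上单独查看zookeeper状态,正常情况下一台为leader、其余为follower(示意):
/opt/module/zookeeper/bin/zkServer.sh status
#输出中 Mode: leader 或 Mode: follower 即表示集群工作正常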
七、安装Kafka2.12-3.0.0
1、安装环境
#首先将kafka解压到/opt/module目录中,并改名
tar -zxvf kafka_2.12-3.0.0.tgz -C /opt/module/
mv /opt/module/kafka_2.12-3.0.0 /opt/module/kafka
#修改/opt/module/kafka/config目录中的server.properties文件
vim server.properties
#将文件中的以下配置项修改为如下内容
broker.id=0 #hadoop103中id为1,hadoop104中id为2
log.dirs=/opt/module/kafka/datas
zookeeper.connect=hadoop102:2181,hadoop103:2181,hadoop104:2181/kafka
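server.properties修改完成后,需要把kafka目录分发到hadoop103和hadoop104,并分别修改broker.id(以下命令仅为示意):
scp -r /opt/module/kafka hadoop103:/opt/module/
scp -r /opt/module/kafka hadoop104:/opt/module/
#在hadoop103上将broker.id改为1,在hadoop104上将broker.id改为2
vim /opt/module/kafka/config/server.properties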
配置kafka环境变量
#打开文件,将下面两行追加到文件末尾
vim /etc/profile
export KAFKA_HOME=/opt/module/kafka
export PATH=$PATH:$JAVA_HOME/bin:$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$KAFKA_HOME/bin
#更新文件使配置生效
source /etc/profile
2、kafka集群启停脚本
#kafka启动
kf.sh start
#kafka停止
kf.sh stop
kf.sh文件
#! /bin/bash
case $1 in
"start"){
for i in hadoop102 hadoop103 hadoop104
do
echo " --------启动 $i Kafka-------"
ssh $i "/opt/module/kafka/bin/kafka-server-start.sh -daemon /opt/module/kafka/config/server.properties"
done
};;
"stop"){
for i in hadoop102 hadoop103 hadoop104
do
echo " --------停止 $i Kafka-------"
ssh $i "/opt/module/kafka/bin/kafka-server-stop.sh "
done
};;
esac
#编写完成后需要增加权限
chmod 777 kf.sh
3、kafka基本操作命令
| 参数 | 描述 |
|---|---|
| --bootstrap-server <String: server to connect to> | 连接的Kafka Broker主机名称和端口号。 |
| --topic <String: topic> | 操作的topic名称。 |
| --create | 创建主题。 |
| --delete | 删除主题。 |
| --alter | 修改主题。 |
| --list | 查看所有主题。 |
| --describe | 查看主题详细描述。 |
| --partitions <Integer: # of partitions> | 设置分区数。 |
| --replication-factor <Integer: replication factor> | 设置分区副本。 |
| --config <String: name=value> | 更新系统默认的配置。 |
#查看当前服务器中的所有topic
kafka-topics.sh --bootstrap-server hadoop102:9092 --list
#创建first topic(为方便后续操作,先创建该主题)
kafka-topics.sh --bootstrap-server hadoop102:9092 --create --partitions 1 --replication-factor 3 --topic first
#再次查看first主题的详情
kafka-topics.sh --bootstrap-server hadoop102:9092 --describe --topic first
#删除topic
kafka-topics.sh --bootstrap-server hadoop102:9092 --delete --topic first
#生产者命令操作
kafka-console-producer.sh --bootstrap-server hadoop102:9092 --topic first
#消费者命令操作
kafka-console-consumer.sh --bootstrap-server hadoop102:9092 --topic first
八、安装flume-1.9.0
1、安装flume环境
#首先将安装包解压到目录下
tar -zxvf apache-flume-1.9.0-bin.tar.gz -C /opt/module/
mv apache-flume-1.9.0-bin flume
#解压后需要将/opt/module/flume/lib/guava-11.0.2.jar删除掉(与Hadoop 3.1.3自带的guava版本冲突)
rm -f /opt/module/flume/lib/guava-11.0.2.jar
#编写flume的配置信息
vim /opt/module/flume/conf/log4j.properties
#复制flume-env.sh模板文件
cp flume-env.sh.template flume-env.sh
vim /opt/module/flume/conf/flume-env.sh
#配置完成后,需要对其他机器进行分发
scp -r flume/ hadoop104:/opt/module/
log4j.properties配置文件
flume.log.dir=/opt/module/flume/logs
flume-env.sh配置文件
#需要将这条注释打开
export JAVA_OPTS="-Xms100m -Xmx2000m -Dcom.sun.management.jmxremote"
2、配置flume
#需要在flume目录中创建job目录
mkdir -p /opt/module/flume/job
#在job目录下编写file_to_kafka.conf文件
vim /opt/module/flume/job/file_to_kafka.conf
file_to_kafka.conf配置文件
#定义组件
a1.sources = r1
a1.channels = c1
#配置source
a1.sources.r1.type = TAILDIR
a1.sources.r1.filegroups = f1
a1.sources.r1.filegroups.f1 = /opt/module/applog/log/app.*
a1.sources.r1.positionFile = /opt/module/flume/taildir_position.json
a1.sources.r1.interceptors = i1
a1.sources.r1.interceptors.i1.type = com.atguigu.gmall.flume.interceptor.ETLInterceptor$Builder
#配置channel
a1.channels.c1.type = org.apache.flume.channel.kafka.KafkaChannel
a1.channels.c1.kafka.bootstrap.servers = hadoop102:9092,hadoop103:9092
a1.channels.c1.kafka.topic = topic_log
a1.channels.c1.parseAsFlumeEvent = false
#组装
a1.sources.r1.channels = c1
3、测试flume是否安装成功
#在测试之前要确定Hadoop、zookeeper、kafka集群已经启动
#在Hadoop102上的flume目录中进行采集
bin/flume-ng agent -n a1 -c conf/ -f job/file_to_kafka.conf -Dflume.root.logger=info,console
#在Hadoop103上启动一个Kafka的console-consumer
kafka-console-consumer.sh --bootstrap-server hadoop102:9092 --topic topic_log
#在Hadoop102上生成数据,hadoop103 就可以查看到
lg.sh
4、创建flume Java拦截器
1、创建maven项目
pom.xml文件
<dependencies>
<dependency>
<groupId>org.apache.flume</groupId>
<artifactId>flume-ng-core</artifactId>
<version>1.9.0</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.62</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<version>2.3.2</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
<plugin>
<artifactId>maven-assembly-plugin</artifactId>
<configuration>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
</configuration>
<executions>
<execution>
<id>make-assembly</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
2、创建ETLInterceptor类
在com.atguigu.gmall.flume.interceptor包下创建ETLInterceptor类
package com.atguigu.gmall.flume.interceptor;
import com.atguigu.gmall.flume.utils.JSONUtil;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.interceptor.Interceptor;
import java.nio.charset.StandardCharsets;
import java.util.Iterator;
import java.util.List;
/**
* @author 东东
* @date 2022-11-25 0025 14:48
*/
public class ETLInterceptor implements Interceptor {
@Override
public void initialize() {
}
@Override
public Event intercept(Event event) {
//1、获取body当中的数据
byte[] body = event.getBody();
String log = new String(body, StandardCharsets.UTF_8);
//2、判断是不是合法的json
//3 、 是: return event 不是: return null
if (JSONUtil.isJSONValidate(log)){
return event;
}else {
return null;
}
}
@Override
public List<Event> intercept(List<Event> list) {
Iterator<Event> iterator = list.iterator();
while (iterator.hasNext()) {
Event event = iterator.next();
if (intercept(event) == null) {
iterator.remove();
}
}
return list;
}
@Override
public void close() {
}
public static class Builder implements Interceptor.Builder{
@Override
public Interceptor build() {
return new ETLInterceptor();
}
@Override
public void configure(Context context) {
}
}
}
3、创建JSONUtil类
在com.atguigu.gmall.flume.utils包下创建JSONUtil类
package com.atguigu.gmall.flume.utils;
import com.alibaba.fastjson.JSONObject;
/**
* @author 东东
* @date 2022-11-25 0025 14:54
*/
public class JSONUtil {
//校验json是不是合法的json数据
//通过异常捕捉
public static boolean isJSONValidate(String log) {
try {
JSONObject.parseObject(log);
return true;
}catch (Exception e){
return false;
}
}
}
4、将jar包上传到服务器中
#程序打包后,将文件名带有with-dependencies的jar包上传到/opt/module/flume/lib目录中
#上传后继续测试:在hadoop102的/opt/module/applog/log目录下执行以下命令,可在hadoop103的消费者中实时观察到
echo "{id:1}" >> app.2022-11-25.log
#发送错误的json格式,hadoop103上不显示,则拦截成功
echo "{id:" >> app.2022-11-25.log
5、创建flume的启停脚本
在/root/bin目录下创建f1.sh脚本
#!/bin/bash
case $1 in
"start"){
for i in hadoop102 hadoop103
do
echo " --------启动 $i 采集flume-------"
ssh $i "nohup /opt/module/flume/bin/flume-ng agent -n a1 -c /opt/module/flume/conf/ -f /opt/module/flume/job/file_to_kafka.conf >/dev/null 2>&1 &"
done
};;
"stop"){
for i in hadoop102 hadoop103
do
echo " --------停止 $i 采集flume-------"
ssh $i "ps -ef | grep file_to_kafka | grep -v grep |awk '{print \$2}' | xargs -n1 kill -9 "
done
};;
esac
九、安装MySQL数据库
#首先检查机器上是否安装过MySQL/mariadb,一条命令查找并卸载
rpm -qa | grep -i -E "mysql|mariadb" | xargs -n1 sudo rpm -e --nodeps
#1、首先检查自己Linux是否安装过MySQL
rpm -qa | grep mysql
#如果有的话,就可以删除
rpm -e --nodeps mysql-xxxx
#2、查询所有MySQL对应的文件夹
whereis mysql
find / -name mysql
#如果有的话,删除相关目录或者文件
rm -rf /usr/bin/mysql /usr/include/mysql /data/mysql /data/mysql/mysql
#检查一下 是否删除干净
whereis mysql
find / -name mysql
#开始安装MySQL
rpm -ivh 01_mysql-community-common-5.7.16-1.el7.x86_64.rpm
rpm -ivh 02_mysql-community-libs-5.7.16-1.el7.x86_64.rpm
rpm -ivh 03_mysql-community-libs-compat-5.7.16-1.el7.x86_64.rpm
rpm -ivh 04_mysql-community-client-5.7.16-1.el7.x86_64.rpm
rpm -ivh 05_mysql-community-server-5.7.16-1.el7.x86_64.rpm
#安装05_mysql-community-server-5.7.16-1.el7.x86_64.rpm如果报错,则用下面这行命令
rpm -ivh 05_mysql-community-server-5.7.16-1.el7.x86_64.rpm --force --nodeps
#启动MySQL程序
systemctl start mysqld.service
#查看MySQL登录密码
cat /var/log/mysqld.log |grep password
#使用上一步查到的临时密码登录(示例密码,需替换为自己日志中的密码)
mysql -uroot -p'!0eesmRZh;sP'
#在MySQL内输入
#修改密码策略
set global validate_password_length=4;
set global validate_password_policy=0;
set password = password("root");
#作者连接密码是000000,如果要运行作者的jar包,需要修改密码
set password = password("000000");
#设置远程访问
show databases;
use mysql;
select user,host from user;
update user set host='%' where user="root";
#设置完刷新
flush privileges;
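设置完成后,可以从其他机器验证远程登录是否生效(示意,假设密码已改为root):
mysql -h hadoop102 -uroot -proot -e "show databases;"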
(可选)ezdml_setup_win64_v322.exe 可在Windows上安装,作为数据表结构/数据流程的可视化工具
十、安装maxwell环境
版本:maxwell-1.29.2.tar.gz
1、配置Maxwell安装环境
#将安装包解压到module目录下,并改名
tar -zxvf maxwell-1.29.2.tar.gz -C /opt/module/
mv maxwell-1.29.2/ maxwell
#修改数据库my.cnf文件
vim /etc/my.cnf
#将下列信息复制到my.cnf 文件中
[mysqld]
#数据库id
server-id = 1
#启动binlog,该参数的值会作为binlog的文件名
log-bin=mysql-bin
#binlog类型,maxwell要求为row类型
binlog_format=row
#启用binlog的数据库,需根据实际情况作出修改
binlog-do-db=gmall
#配置完成后重启MySQL
systemctl restart mysqld
#检验binlog是否配置成功
#首先进入MySQL命令行
mysql -uroot -p"root";
#正确显示信息则说明配置成功
show master status;
2、创建Maxwell所需数据库和用户
#创建数据库
CREATE DATABASE maxwell;
#调整MySQL数据库的密码级别
set global validate_password_policy=0;
set global validate_password_length=4;
#创建Maxwell用户并赋予其必要权限
CREATE USER 'maxwell'@'%' IDENTIFIED BY 'maxwell';
GRANT ALL ON maxwell.* TO 'maxwell'@'%';
GRANT SELECT, REPLICATION CLIENT, REPLICATION SLAVE ON *.* TO 'maxwell'@'%';
3、配置Maxwell
#进入到Maxwell目录下,复制一份文件
cp config.properties.example config.properties
#对文件进行编辑
vim config.properties
config.properties配置文件
#Maxwell数据发送目的地,可选配置有stdout|file|kafka|kinesis|pubsub|sqs|rabbitmq|redis
producer=kafka
kafka.bootstrap.servers=hadoop102:9092,hadoop103:9092
#目标Kafka topic,可静态配置,例如:maxwell,也可动态配置,例如:%{database}_%{table}
kafka_topic=topic_db
# mysql login info
#MySQL相关配置
host=hadoop102
user=maxwell
password=maxwell
jdbc_options=useSSL=false&serverTimezone=Asia/Shanghai
4、Maxwell的启停操作
#在hadoop103上启动kafka消费者,检测topic_db中的数据
bin/kafka-console-consumer.sh --bootstrap-server hadoop102:9092 --topic topic_db
#增量数据同步 在Hadoop102上运行
/opt/module/maxwell/bin/maxwell --config /opt/module/maxwell/config.properties --daemon
or
#历史数据全量同步
/opt/module/maxwell/bin/maxwell-bootstrap --database gmall --table user_info --config /opt/module/maxwell/config.properties
#执行数据生成
java -jar gmall2020-mock-db-2021-01-22.jar
启停脚本mxw.sh
#!/bin/bash
MAXWELL_HOME=/opt/module/maxwell
status_maxwell(){
result=`ps -ef | grep com.zendesk.maxwell.Maxwell | grep -v grep | wc -l`
return $result
}
start_maxwell(){
status_maxwell
if [[ $? -lt 1 ]]; then
echo "启动Maxwell"
$MAXWELL_HOME/bin/maxwell --config $MAXWELL_HOME/config.properties --daemon
else
echo "Maxwell正在运行"
fi
}
stop_maxwell(){
status_maxwell
if [[ $? -gt 0 ]]; then
echo "停止Maxwell"
ps -ef | grep com.zendesk.maxwell.Maxwell | grep -v grep | awk '{print $2}' | xargs kill -9
else
echo "Maxwell未在运行"
fi
}
case $1 in
start )
start_maxwell
;;
stop )
stop_maxwell
;;
restart )
stop_maxwell
start_maxwell
;;
esac
十一、用户行为数据同步(flume)
flume配置文件版本(1.9.0)
1、kafka_to_hdfs_log.conf配置文件
#前面已在各机器上配置好了flume,本配置需要在hadoop104上进行
#在/opt/module/flume/job目录下编写kafka_to_hdfs_log.conf文件
vim /opt/module/flume/job/kafka_to_hdfs_log.conf
#定义组件
a1.sources=r1
a1.channels=c1
a1.sinks=k1
#配置source1
a1.sources.r1.type = org.apache.flume.source.kafka.KafkaSource
a1.sources.r1.kafka.bootstrap.servers = hadoop102:9092,hadoop103:9092
a1.sources.r1.kafka.topics=topic_log
a1.sources.r1.batchSize = 5000
a1.sources.r1.batchDurationMillis = 1000
a1.sources.r1.interceptors = i1
a1.sources.r1.interceptors.i1.type = com.atguigu.gmall.flume.interceptor.TimestampInterceptor$Builder
#配置channel
a1.channels.c1.type = file
a1.channels.c1.checkpointDir = /opt/module/flume/checkpoint/behavior1
a1.channels.c1.dataDirs = /opt/module/flume/data/behavior1
a1.channels.c1.maxFileSize = 2146435071
a1.channels.c1.capacity = 1000000
a1.channels.c1.keep-alive = 3
#配置sink
a1.sinks.k1.type = hdfs
a1.sinks.k1.hdfs.path = /origin_data/gmall/log/topic_log/%Y-%m-%d
a1.sinks.k1.hdfs.filePrefix = log
a1.sinks.k1.hdfs.round = false
a1.sinks.k1.hdfs.rollInterval = 10
a1.sinks.k1.hdfs.rollSize = 134217728
a1.sinks.k1.hdfs.rollCount = 0
#控制输出文件类型
a1.sinks.k1.hdfs.fileType = CompressedStream
a1.sinks.k1.hdfs.codeC = gzip
#组装
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
2、Java中TimestampInterceptor类
在上述的Java工程中增加一个TimestampInterceptor类,重新打包后上传到/opt/module/flume/lib目录
package com.atguigu.gmall.flume.interceptor;
import com.alibaba.fastjson.JSONObject;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.interceptor.Interceptor;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Map;
/**
* @author 东东
* @date 2022-11-28 0028 15:59
*/
public class TimestampInterceptor implements Interceptor {
@Override
public void initialize() {
}
@Override
public Event intercept(Event event) {
//1、获取header和body当中的数据
Map<String, String> headers = event.getHeaders();
byte[] body = event.getBody();
String log = new String(body, StandardCharsets.UTF_8);
//2、解析json(log)的ts时间戳
JSONObject jsonObject = JSONObject.parseObject(log);
String ts = jsonObject.getString("ts");
//3、把解析出来的ts放入到header头当中的timestamp
headers.put("timestamp",ts);
return event;
}
@Override
public List<Event> intercept(List<Event> list) {
for (Event event : list) {
intercept(event);
}
return list;
}
@Override
public void close() {
}
public static class Builder implements Interceptor.Builder{
@Override
public Interceptor build() {
return new TimestampInterceptor();
}
@Override
public void configure(Context context) {
}
}
}
3、flume测试
#测试之前需要确保kafka、hadoop、zookeeper集群启动,在Hadoop104上启动
bin/flume-ng agent -n a1 -c conf/ -f job/kafka_to_hdfs_log.conf -Dflume.root.logger=info,console
#在Hadoop102和Hadoop103上运行
nohup /opt/module/flume/bin/flume-ng agent -n a1 -c /opt/module/flume/conf/ -f /opt/module/flume/job/file_to_kafka.conf >/dev/null 2>&1 &
#启动日志生成操作
lg.sh
flume在Hadoop104上的启停脚本(即后文的f2.sh)
在Hadoop102的/root/bin目录下编写,编写完后需要对脚本文件添加执行权限
#!/bin/bash
case $1 in
"start")
echo " --------启动 hadoop104 日志数据flume-------"
ssh hadoop104 "nohup /opt/module/flume/bin/flume-ng agent -n a1 -c /opt/module/flume/conf -f /opt/module/flume/job/kafka_to_hdfs_log.conf >/dev/null 2>&1 &"
;;
"stop")
echo " --------停止 hadoop104 日志数据flume-------"
ssh hadoop104 "ps -ef | grep kafka_to_hdfs_log | grep -v grep |awk '{print \$2}' | xargs -n1 kill"
;;
esac
十二、安装datax
#将datax解压到module目录下
tar -zxvf datax.tar.gz -C /opt/module/
#使用下列命令检测datax是否可以使用
python /opt/module/datax/bin/datax.py /opt/module/datax/job/job.json
#当出现下列信息时,说明datax安装成功
2022-11-29 17:19:54.534 [job-0] INFO JobContainer -
任务启动时刻 : 2022-11-29 17:19:44
任务结束时刻 : 2022-11-29 17:19:54
任务总计耗时 : 10s
任务平均流量 : 253.91KB/s
记录写入速度 : 10000rec/s
读出记录总数 : 100000
读写失败总数 : 0
1、MySQL与HDFS数据同步案例
1、首先在MySQL的gmall库中创建test_province表
DROP TABLE IF EXISTS `test_province`;
CREATE TABLE `test_province` (
`id` bigint(20) NOT NULL,
`name` varchar(20) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
`region_id` varchar(20) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
`area_code` varchar(20) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
`iso_code` varchar(20) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
`iso_3166_2` varchar(20) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
PRIMARY KEY (`id`)
) ENGINE = InnoDB CHARACTER SET = utf8 COLLATE = utf8_general_ci ROW_FORMAT = Dynamic;
2、数据同步策略
在/opt/module/datax/job目录下创建相应的json配置文件
1、MySQL同步到hdfs策略
1、无传参
创建一个base_province.json
{
"job": {
"content": [
{
"reader": {
"name": "mysqlreader",
"parameter": {
"connection": [
{
"jdbcUrl": [
"jdbc:mysql://hadoop102:3306/gmall"
],
"querySql": [
"select id,name,region_id,area_code,iso_code,iso_3166_2 from base_province where id>=3"
]
}
],
"password": "root",
"username": "root"
}
},
"writer": {
"name": "hdfswriter",
"parameter": {
"column": [
{
"name": "id",
"type": "bigint"
},
{
"name": "name",
"type": "string"
},
{
"name": "region_id",
"type": "string"
},
{
"name": "area_code",
"type": "string"
},
{
"name": "iso_code",
"type": "string"
},
{
"name": "iso_3166_2",
"type": "string"
}
],
"compress": "gzip",
"defaultFS": "hdfs://hadoop102:8020",
"fieldDelimiter": "\t",
"fileName": "base_province",
"fileType": "text",
"path": "/base_province",
"writeMode": "append"
}
}
}
],
"setting": {
"speed": {
"channel": 1
}
}
}
}
2、传参
创建一个base_province.json
#有传参的运行命令为 -Ddt后面为需要传参的值
python bin/datax.py -p"-Ddt=2020-06-14" job/base_province.json
{
"job": {
"content": [
{
"reader": {
"name": "mysqlreader",
"parameter": {
"connection": [
{
"jdbcUrl": [
"jdbc:mysql://hadoop102:3306/gmall"
],
"querySql": [
"select id,name,region_id,area_code,iso_code,iso_3166_2 from base_province where id>=3"
]
}
],
"password": "root",
"username": "root"
}
},
"writer": {
"name": "hdfswriter",
"parameter": {
"column": [
{
"name": "id",
"type": "bigint"
},
{
"name": "name",
"type": "string"
},
{
"name": "region_id",
"type": "string"
},
{
"name": "area_code",
"type": "string"
},
{
"name": "iso_code",
"type": "string"
},
{
"name": "iso_3166_2",
"type": "string"
}
],
"compress": "gzip",
"defaultFS": "hdfs://hadoop102:8020",
"fieldDelimiter": "\t",
"fileName": "base_province",
"fileType": "text",
"path": "/base_province/${dt}",
"writeMode": "append"
}
}
}
],
"setting": {
"speed": {
"channel": 1
}
}
}
}
2、hdfs数据同步到MySQL策略
创建test_provice.json文件
{
"job": {
"content": [
{
"reader": {
"name": "hdfsreader",
"parameter": {
"defaultFS": "hdfs://hadoop102:8020",
"path": "/base_province",
"column": [
"*"
],
"fileType": "text",
"compress": "gzip",
"encoding": "UTF-8",
"nullFormat": "\\N",
"fieldDelimiter": "\t",
}
},
"writer": {
"name": "mysqlwriter",
"parameter": {
"username": "root",
"password": "root",
"connection": [
{
"table": [
"test_province"
],
"jdbcUrl": "jdbc:mysql://hadoop102:3306/gmall?useUnicode=true&characterEncoding=utf-8"
}
],
"column": [
"id",
"name",
"region_id",
"area_code",
"iso_code",
"iso_3166_2"
],
"writeMode": "replace"
}
}
}
],
"setting": {
"speed": {
"channel": 1
}
}
}
}
3、运行测试
#在运行前需要在HDFS上创建/base_province目录
hdfs dfs -mkdir /base_province
#先执行MySQL同步到HDFS的任务,向/base_province写入数据
python /opt/module/datax/bin/datax.py /opt/module/datax/job/base_province.json
#查看HDFS /base_province 下的文件内容
hdfs dfs -cat /base_province/* | zcat
#再执行HDFS同步到MySQL的任务
python /opt/module/datax/bin/datax.py /opt/module/datax/job/test_provice.json
4、datax配置文件生成脚本(全量同步)
1、python脚本
需要创建的gen_import_config.py文件内容见下
#运行前需要安装python的MySQL驱动包
yum install -y MySQL-python
#启动脚本命令
python /root/bin/gen_import_config.py -d gmall -t base_province
#用生成后的json文件实现MySQL同步到hdfs上,在datax目录上输入
python bin/datax.py -p"-Dtargetdir=/base_province" job/import/gmall.base_province.json
gen_import_config.py 文件
# coding=utf-8
import json
import getopt
import os
import sys
import MySQLdb
#MySQL相关配置,需根据实际情况作出修改
mysql_host = "hadoop102"
mysql_port = "3306"
mysql_user = "root"
mysql_passwd = "root"
#HDFS NameNode相关配置,需根据实际情况作出修改
hdfs_nn_host = "hadoop102"
hdfs_nn_port = "8020"
#生成配置文件的目标路径,可根据实际情况作出修改
output_path = "/opt/module/datax/job/import"
def get_connection():
return MySQLdb.connect(host=mysql_host, port=int(mysql_port), user=mysql_user, passwd=mysql_passwd)
def get_mysql_meta(database, table):
connection = get_connection()
cursor = connection.cursor()
sql = "SELECT COLUMN_NAME,DATA_TYPE from information_schema.COLUMNS WHERE TABLE_SCHEMA=%s AND TABLE_NAME=%s ORDER BY ORDINAL_POSITION"
cursor.execute(sql, [database, table])
fetchall = cursor.fetchall()
cursor.close()
connection.close()
return fetchall
def get_mysql_columns(database, table):
return map(lambda x: x[0], get_mysql_meta(database, table))
def get_hive_columns(database, table):
def type_mapping(mysql_type):
mappings = {
"bigint": "bigint",
"int": "bigint",
"smallint": "bigint",
"tinyint": "bigint",
"decimal": "string",
"double": "double",
"float": "float",
"binary": "string",
"char": "string",
"varchar": "string",
"datetime": "string",
"time": "string",
"timestamp": "string",
"date": "string",
"text": "string"
}
return mappings[mysql_type]
meta = get_mysql_meta(database, table)
return map(lambda x: {"name": x[0], "type": type_mapping(x[1].lower())}, meta)
def generate_json(source_database, source_table):
job = {
"job": {
"setting": {
"speed": {
"channel": 3
},
"errorLimit": {
"record": 0,
"percentage": 0.02
}
},
"content": [{
"reader": {
"name": "mysqlreader",
"parameter": {
"username": mysql_user,
"password": mysql_passwd,
"column": get_mysql_columns(source_database, source_table),
"splitPk": "",
"connection": [{
"table": [source_table],
"jdbcUrl": ["jdbc:mysql://" + mysql_host + ":" + mysql_port + "/" + source_database]
}]
}
},
"writer": {
"name": "hdfswriter",
"parameter": {
"defaultFS": "hdfs://" + hdfs_nn_host + ":" + hdfs_nn_port,
"fileType": "text",
"path": "${targetdir}",
"fileName": source_table,
"column": get_hive_columns(source_database, source_table),
"writeMode": "append",
"fieldDelimiter": "\t",
"compress": "gzip"
}
}
}]
}
}
if not os.path.exists(output_path):
os.makedirs(output_path)
with open(os.path.join(output_path, ".".join([source_database, source_table, "json"])), "w") as f:
json.dump(job, f)
def main(args):
source_database = ""
source_table = ""
options, arguments = getopt.getopt(args, '-d:-t:', ['sourcedb=', 'sourcetbl='])
for opt_name, opt_value in options:
if opt_name in ('-d', '--sourcedb'):
source_database = opt_value
if opt_name in ('-t', '--sourcetbl'):
source_table = opt_value
generate_json(source_database, source_table)
if __name__ == '__main__':
main(sys.argv[1:])
2、sh脚本
#编写gen_import_config.sh脚本,编写完后需要设置权限并执行
chmod 777 gen_import_config.sh
1、生成json配置文件的脚本(gen_import_config.sh)
#!/bin/bash
python ~/bin/gen_import_config.py -d gmall -t activity_info
python ~/bin/gen_import_config.py -d gmall -t activity_rule
python ~/bin/gen_import_config.py -d gmall -t base_category1
python ~/bin/gen_import_config.py -d gmall -t base_category2
python ~/bin/gen_import_config.py -d gmall -t base_category3
python ~/bin/gen_import_config.py -d gmall -t base_dic
python ~/bin/gen_import_config.py -d gmall -t base_province
python ~/bin/gen_import_config.py -d gmall -t base_region
python ~/bin/gen_import_config.py -d gmall -t base_trademark
python ~/bin/gen_import_config.py -d gmall -t cart_info
python ~/bin/gen_import_config.py -d gmall -t coupon_info
python ~/bin/gen_import_config.py -d gmall -t sku_attr_value
python ~/bin/gen_import_config.py -d gmall -t sku_info
python ~/bin/gen_import_config.py -d gmall -t sku_sale_attr_value
python ~/bin/gen_import_config.py -d gmall -t spu_info
2、执行数据库全量同步的脚本
#编写sh脚本,需要设置权限后再执行
chmod 777 mysql_to_hdfs_full.sh
#执行脚本,执行完成后全量同步即告结束
mysql_to_hdfs_full.sh all 2020-06-14
mysql_to_hdfs_full.sh文件
#!/bin/bash
DATAX_HOME=/opt/module/datax
# 如果传入日期则do_date等于传入的日期,否则等于前一天日期
if [ -n "$2" ] ;then
do_date=$2
else
do_date=`date -d "-1 day" +%F`
fi
#处理目标路径,此处的处理逻辑是,如果目标路径不存在,则创建;若存在,则清空,目的是保证同步任务可重复执行
handle_targetdir() {
hadoop fs -test -e $1
if [[ $? -eq 1 ]]; then
echo "路径$1不存在,正在创建......"
hadoop fs -mkdir -p $1
else
echo "路径$1已经存在"
fs_count=$(hadoop fs -count $1)
content_size=$(echo $fs_count | awk '{print $3}')
if [[ $content_size -eq 0 ]]; then
echo "路径$1为空"
else
echo "路径$1不为空,正在清空......"
hadoop fs -rm -r -f $1/*
fi
fi
}
#数据同步
import_data() {
datax_config=$1
target_dir=$2
handle_targetdir $target_dir
python $DATAX_HOME/bin/datax.py -p"-Dtargetdir=$target_dir" $datax_config
}
case $1 in
"activity_info")
import_data /opt/module/datax/job/import/gmall.activity_info.json /origin_data/gmall/db/activity_info_full/$do_date
;;
"activity_rule")
import_data /opt/module/datax/job/import/gmall.activity_rule.json /origin_data/gmall/db/activity_rule_full/$do_date
;;
"base_category1")
import_data /opt/module/datax/job/import/gmall.base_category1.json /origin_data/gmall/db/base_category1_full/$do_date
;;
"base_category2")
import_data /opt/module/datax/job/import/gmall.base_category2.json /origin_data/gmall/db/base_category2_full/$do_date
;;
"base_category3")
import_data /opt/module/datax/job/import/gmall.base_category3.json /origin_data/gmall/db/base_category3_full/$do_date
;;
"base_dic")
import_data /opt/module/datax/job/import/gmall.base_dic.json /origin_data/gmall/db/base_dic_full/$do_date
;;
"base_province")
import_data /opt/module/datax/job/import/gmall.base_province.json /origin_data/gmall/db/base_province_full/$do_date
;;
"base_region")
import_data /opt/module/datax/job/import/gmall.base_region.json /origin_data/gmall/db/base_region_full/$do_date
;;
"base_trademark")
import_data /opt/module/datax/job/import/gmall.base_trademark.json /origin_data/gmall/db/base_trademark_full/$do_date
;;
"cart_info")
import_data /opt/module/datax/job/import/gmall.cart_info.json /origin_data/gmall/db/cart_info_full/$do_date
;;
"coupon_info")
import_data /opt/module/datax/job/import/gmall.coupon_info.json /origin_data/gmall/db/coupon_info_full/$do_date
;;
"sku_attr_value")
import_data /opt/module/datax/job/import/gmall.sku_attr_value.json /origin_data/gmall/db/sku_attr_value_full/$do_date
;;
"sku_info")
import_data /opt/module/datax/job/import/gmall.sku_info.json /origin_data/gmall/db/sku_info_full/$do_date
;;
"sku_sale_attr_value")
import_data /opt/module/datax/job/import/gmall.sku_sale_attr_value.json /origin_data/gmall/db/sku_sale_attr_value_full/$do_date
;;
"spu_info")
import_data /opt/module/datax/job/import/gmall.spu_info.json /origin_data/gmall/db/spu_info_full/$do_date
;;
"all")
import_data /opt/module/datax/job/import/gmall.activity_info.json /origin_data/gmall/db/activity_info_full/$do_date
import_data /opt/module/datax/job/import/gmall.activity_rule.json /origin_data/gmall/db/activity_rule_full/$do_date
import_data /opt/module/datax/job/import/gmall.base_category1.json /origin_data/gmall/db/base_category1_full/$do_date
import_data /opt/module/datax/job/import/gmall.base_category2.json /origin_data/gmall/db/base_category2_full/$do_date
import_data /opt/module/datax/job/import/gmall.base_category3.json /origin_data/gmall/db/base_category3_full/$do_date
import_data /opt/module/datax/job/import/gmall.base_dic.json /origin_data/gmall/db/base_dic_full/$do_date
import_data /opt/module/datax/job/import/gmall.base_province.json /origin_data/gmall/db/base_province_full/$do_date
import_data /opt/module/datax/job/import/gmall.base_region.json /origin_data/gmall/db/base_region_full/$do_date
import_data /opt/module/datax/job/import/gmall.base_trademark.json /origin_data/gmall/db/base_trademark_full/$do_date
import_data /opt/module/datax/job/import/gmall.cart_info.json /origin_data/gmall/db/cart_info_full/$do_date
import_data /opt/module/datax/job/import/gmall.coupon_info.json /origin_data/gmall/db/coupon_info_full/$do_date
import_data /opt/module/datax/job/import/gmall.sku_attr_value.json /origin_data/gmall/db/sku_attr_value_full/$do_date
import_data /opt/module/datax/job/import/gmall.sku_info.json /origin_data/gmall/db/sku_info_full/$do_date
import_data /opt/module/datax/job/import/gmall.sku_sale_attr_value.json /origin_data/gmall/db/sku_sale_attr_value_full/$do_date
import_data /opt/module/datax/job/import/gmall.spu_info.json /origin_data/gmall/db/spu_info_full/$do_date
;;
esac
5、flume配置(增量同步)
1、kafka_to_hdfs_db.conf
在hadoop104上/opt/module/flume/job目录下创建一个kafka_to_hdfs_db.conf 文件
a1.sources = r1
a1.channels = c1
a1.sinks = k1
a1.sources.r1.type = org.apache.flume.source.kafka.KafkaSource
a1.sources.r1.batchSize = 5000
a1.sources.r1.batchDurationMillis = 2000
a1.sources.r1.kafka.bootstrap.servers = hadoop102:9092,hadoop103:9092
a1.sources.r1.kafka.topics = topic_db
a1.sources.r1.kafka.consumer.group.id = topic_db
a1.sources.r1.setTopicHeader = false
a1.sources.r1.interceptors = i1
a1.sources.r1.interceptors.i1.type = com.atguigu.gmall.flume.interceptor.TimestampAndTableNameInterceptor$Builder
a1.channels.c1.type = file
a1.channels.c1.checkpointDir = /opt/module/flume/checkpoint/behavior2
a1.channels.c1.dataDirs = /opt/module/flume/data/behavior2/
a1.channels.c1.maxFileSize = 2146435071
a1.channels.c1.capacity = 1000000
a1.channels.c1.keep-alive = 6
## sink1
a1.sinks.k1.type = hdfs
a1.sinks.k1.hdfs.path = /origin_data/gmall/db/%{tableName}_inc/%Y-%m-%d
a1.sinks.k1.hdfs.filePrefix = db
a1.sinks.k1.hdfs.round = false
a1.sinks.k1.hdfs.rollInterval = 10
a1.sinks.k1.hdfs.rollSize = 134217728
a1.sinks.k1.hdfs.rollCount = 0
a1.sinks.k1.hdfs.fileType = CompressedStream
a1.sinks.k1.hdfs.codeC = gzip
## 拼装
a1.sources.r1.channels = c1
a1.sinks.k1.channel= c1
2、创建一个TimestampAndTableNameInterceptor类
在之前的maven工程基础上,创建一个TimestampAndTableNameInterceptor类,打包后上传
package com.atguigu.gmall.flume.interceptor;
import com.alibaba.fastjson.JSONObject;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.interceptor.Interceptor;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Map;
/**
* @author 东东
* @date 2022-11-30 0030 17:07
*/
public class TimestampAndTableNameInterceptor implements Interceptor {
@Override
public void initialize() {
}
@Override
public Event intercept(Event event) {
//1、获取header和body当中的数据
Map<String, String> headers = event.getHeaders();
byte[] body = event.getBody();
String log = new String(body, StandardCharsets.UTF_8);
//2、解析log当中的ts和table字段
JSONObject jsonObject = JSONObject.parseObject(log);
String table = jsonObject.getString("table");
String ts = jsonObject.getString("ts");
//3、把ts和table放到header当中的tablename和timestamp
headers.put("tableName",table);
headers.put("timestamp",ts+"000");
return event;
}
@Override
public List<Event> intercept(List<Event> list) {
for (Event event : list) {
intercept(event);
}
return list;
}
@Override
public void close() {
}
public static class Builder implements Interceptor.Builder{
@Override
public Interceptor build() {
return new TimestampAndTableNameInterceptor();
}
@Override
public void configure(Context context) {
}
}
}
文件打包后,上传到hadoop104上/opt/module/flume/lib目录下
3、测试运行
测试本程序需要将Hadoop、zookeeper、Kafka、Maxwell提前启动
#在hadoop104上将flume启动
bin/flume-ng agent -n a1 -c conf/ -f job/kafka_to_hdfs_db.conf -Dflume.root.logger=info,console
#在hadoop102上进行数据采集
java -jar gmall2020-mock-db-2021-11-14.jar
#Hadoop104的控制台会有输出,HDFS上以_inc结尾的目录即为增量同步的数据
4、增量数据启停脚本
f3.sh文件 flume脚本
#!/bin/bash
case $1 in
"start")
echo " --------启动 hadoop104 业务数据flume-------"
ssh hadoop104 "nohup /opt/module/flume/bin/flume-ng agent -n a1 -c /opt/module/flume/conf -f /opt/module/flume/job/kafka_to_hdfs_db.conf >/dev/null 2>&1 &"
;;
"stop")
echo " --------停止 hadoop104 业务数据flume-------"
ssh hadoop104 "ps -ef | grep kafka_to_hdfs_db | grep -v grep |awk '{print \$2}' | xargs -n1 kill"
;;
esac
#编写完成后,需要增加权限
chmod 777 f3.sh
6、解决maxwell时间戳问题
#在Hadoop102上编辑Maxwell的配置文件
vim /opt/module/maxwell/config.properties
#在config.properties文件中加入下面这一行配置
mock_date=2020-06-14
#重启Maxwell
mxw.sh restart
7、增量表首日全量同步
mysql_to_kafka_inc_init.sh脚本
在/root/bin目录下创建一个脚本
#!/bin/bash
# 该脚本的作用是初始化所有的增量表,只需执行一次
MAXWELL_HOME=/opt/module/maxwell
import_data() {
$MAXWELL_HOME/bin/maxwell-bootstrap --database gmall --table $1 --config $MAXWELL_HOME/config.properties
}
case $1 in
"cart_info")
import_data cart_info
;;
"comment_info")
import_data comment_info
;;
"coupon_use")
import_data coupon_use
;;
"favor_info")
import_data favor_info
;;
"order_detail")
import_data order_detail
;;
"order_detail_activity")
import_data order_detail_activity
;;
"order_detail_coupon")
import_data order_detail_coupon
;;
"order_info")
import_data order_info
;;
"order_refund_info")
import_data order_refund_info
;;
"order_status_log")
import_data order_status_log
;;
"payment_info")
import_data payment_info
;;
"refund_payment")
import_data refund_payment
;;
"user_info")
import_data user_info
;;
"all")
import_data cart_info
import_data comment_info
import_data coupon_use
import_data favor_info
import_data order_detail
import_data order_detail_activity
import_data order_detail_coupon
import_data order_info
import_data order_refund_info
import_data order_status_log
import_data payment_info
import_data refund_payment
import_data user_info
;;
esac
#给文件超级权限
chmod 777 mysql_to_kafka_inc_init.sh
#启动脚本
mysql_to_kafka_inc_init.sh all
8、采集通道的启动/停止脚本
cluster.sh文件
#!/bin/bash
case $1 in
"start"){
echo ================== 启动 集群 ==================
#启动 Zookeeper集群
zk.sh start
#启动 Hadoop集群
hdp.sh start
#启动 Kafka采集集群
kf.sh start
#启动采集 Flume
f1.sh start
#启动日志消费 Flume
f2.sh start
#启动业务消费 Flume
f3.sh start
#启动 maxwell
mxw.sh start
};;
"stop"){
echo ================== 停止 集群 ==================
#停止 Maxwell
mxw.sh stop
#停止 业务消费Flume
f3.sh stop
#停止 日志消费Flume
f2.sh stop
#停止 日志采集Flume
f1.sh stop
#停止 Kafka采集集群
kf.sh stop
#停止 Hadoop集群
hdp.sh stop
#停止 Zookeeper集群
zk.sh stop
};;
esac
#给文件超级权限
chmod 777 cluster.sh
#启动脚本
cluster.sh start
#停止脚本
cluster.sh stop
十三、hive安装部署
1、安装环境
apache-hive-3.1.2-bin.tar.gz
#首先将安装包放入虚拟机中
tar -zxvf apache-hive-3.1.2-bin.tar.gz -C /opt/module/
cd /opt/module/
#给安装包更换名称
mv apache-hive-3.1.2-bin/ hive
#添加hive 的环境变量
vim /etc/profile
#环境变量的配置
export HIVE_HOME=/opt/module/hive
export PATH=$PATH:$JAVA_HOME/bin:$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$KAFKA_HOME/bin:$HIVE_HOME/bin
#更新配置文件
source /etc/profile
#为解决日志jar包冲突问题,进入到/opt/module/hive/lib目录中将该jar包重命名使其失效
mv log4j-slf4j-impl-2.10.0.jar log4j-slf4j-impl-2.10.0.jar.bak
#hive元数据配置到MySQL中
cp /opt/software/mysql-connector-java-5.1.27-bin.jar /opt/module/hive/lib/
#在/opt/module/hive/conf目录下创建hive-site.xml 文件
vim hive-site.xml
hive-site.xml文件
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<property>
<name>javax.jdo.option.ConnectionURL</name>
<value>jdbc:mysql://hadoop102:3306/metastore?useSSL=false&amp;useUnicode=true&amp;characterEncoding=UTF-8</value>
</property>
<property>
<name>javax.jdo.option.ConnectionDriverName</name>
<value>com.mysql.jdbc.Driver</value>
</property>
<property>
<name>javax.jdo.option.ConnectionUserName</name>
<value>root</value>
</property>
<property>
<name>javax.jdo.option.ConnectionPassword</name>
<value>root</value>
</property>
<property>
<name>hive.metastore.warehouse.dir</name>
<value>/user/hive/warehouse</value>
</property>
<property>
<name>hive.metastore.schema.verification</name>
<value>false</value>
</property>
<property>
<name>hive.server2.thrift.port</name>
<value>10000</value>
</property>
<property>
<name>hive.server2.thrift.bind.host</name>
<value>hadoop102</value>
</property>
<property>
<name>hive.metastore.event.db.notification.api.auth</name>
<value>false</value>
</property>
<property>
<name>hive.cli.print.header</name>
<value>true</value>
</property>
<property>
<name>hive.cli.print.current.db</name>
<value>true</value>
</property>
</configuration>
2、启动hive
#首先要进入MySQL数据库创建hive元数据库
mysql -uroot -p"root"
#创建hive元数据
create database metastore;
#退出操作
quit;
#初始化hive元数据库
schematool -initSchema -dbType mysql -verbose
#修改元数据字符集
#首先进入metastore数据中
use metastore;
#字段注释
alter table COLUMNS_V2 modify column COMMENT varchar(256) character set utf8;
#表注释
alter table TABLE_PARAMS modify column PARAM_VALUE mediumtext character set utf8;
#退出mysql
quit;
#直接输入hive启动
hive
#输入下面命令,有显示说明安装成功
show databases;
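也可以执行一个简单的建库语句做冒烟测试(仅为示意,test_db为临时测试库,可自行命名):
hive -e "create database if not exists test_db; show databases;"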
十四、安装脚本
f1.sh脚本
#!/bin/bash
case $1 in
"start"){
for i in hadoop102 hadoop103
do
echo " --------启动 $i 采集flume-------"
ssh $i "nohup /opt/module/flume/bin/flume-ng agent -n a1 -c /opt/module/flume/conf/ -f /opt/module/flume/job/file_to_kafka.conf >/dev/null 2>&1 &"
done
};;
"stop"){
for i in hadoop102 hadoop103
do
echo " --------停止 $i 采集flume-------"
ssh $i "ps -ef | grep file_to_kafka | grep -v grep |awk '{print \$2}' | xargs -n1 kill -9 "
done
};;
esac
f2.sh脚本
#!/bin/bash
case $1 in
"start")
echo " --------启动 hadoop104 日志数据flume-------"
ssh hadoop104 "nohup /opt/module/flume/bin/flume-ng agent -n a1 -c /opt/module/flume/conf -f /opt/module/flume/job/kafka_to_hdfs_log.conf >/dev/null 2>&1 &"
;;
"stop")
echo " --------停止 hadoop104 日志数据flume-------"
ssh hadoop104 "ps -ef | grep kafka_to_hdfs_log | grep -v grep |awk '{print \$2}' | xargs -n1 kill"
;;
esac
f3.sh脚本
#!/bin/bash
case $1 in
"start")
echo " --------启动 hadoop104 业务数据flume-------"
ssh hadoop104 "nohup /opt/module/flume/bin/flume-ng agent -n a1 -c /opt/module/flume/conf -f /opt/module/flume/job/kafka_to_hdfs_db.conf >/dev/null 2>&1 &"
;;
"stop")
echo " --------停止 hadoop104 业务数据flume-------"
ssh hadoop104 "ps -ef | grep kafka_to_hdfs_db | grep -v grep |awk '{print \$2}' | xargs -n1 kill"
;;
esac