- Maxwell采集原理:伪装成MySQL的从节点,从binlog日志中获取变化的数据,我们把binlog的日志设置为了记录数据的方式,而不是记录命令,这样就不用再查询一遍了
- Maxwell可以配置采集哪个库的数据,默认采集这个库的所有表的变化数据,如果不想采集某个表,可以排除
- 输出为JSON格式;update操作的记录中包含old字段,记录被修改列的旧值
Maxwell配置文件
- config.properties
# tl;dr config
log_level=info
#Maxwell数据发送目的地,可选配置有stdout|file|kafka|kinesis|pubsub|sqs|rabbitmq|redis
producer=kafka
# 目标Kafka集群地址
kafka.bootstrap.servers=hadoop102:9092,hadoop103:9092,hadoop104:9092
#目标Kafka topic,可静态配置,例如:maxwell,也可动态配置,例如:%{database}_%{table}
kafka_topic=topic_db
# MySQL相关配置
host=hadoop100
user=maxwell
password=maxwell
jdbc_options=useSSL=false&allowPublicKeyRetrieval=true&serverTimezone=Asia/Shanghai
# 过滤gmall中的z_log表数据,该表是日志数据的备份,无须采集
#filter=exclude:gmall.z_log
# 指定数据按照主键分组进入Kafka不同分区,避免数据倾斜
producer_partition_by=primary_key
# 修改数据时间戳的日期部分
mock_date=2022-06-08
# *** general ***
# choose where to produce data to. stdout|file|kafka|kinesis|pubsub|sqs|rabbitmq|redis
#producer=kafka
# set the log level. note that you can configure things further in log4j2.xml
#log_level=DEBUG # [DEBUG, INFO, WARN, ERROR]
# if set, maxwell will look up the scoped environment variables, strip off the prefix and inject the configs
#env_config_prefix=MAXWELL_
# *** mysql ***
# mysql host to connect to
#host=hostname
# mysql port to connect to
#port=3306
# mysql user to connect as. This user must have REPLICATION SLAVE permissions,
# as well as full access to the `maxwell` (or schema_database) database
#user=maxwell
# mysql password
#password=maxwell
# options to pass into the jdbc connection, given as opt=val&opt2=val2
#jdbc_options=opt1=100&opt2=hello
# name of the mysql database where maxwell keeps its own state
#schema_database=maxwell
# whether to use GTID or not for positioning
#gtid_mode=true
# maxwell will capture an initial "base" schema containing all table and column information,
# and then keep delta-updates on top of that schema. If you have an inordinate amount of DDL changes,
# the table containing delta changes will grow unbounded (and possibly too large) over time. If you
# enable this option Maxwell will periodically compact its tables.
#max_schemas=10000
# SSL/TLS options
# To use VERIFY_CA or VERIFY_IDENTITY, you must set the trust store with Java opts:
# -Djavax.net.ssl.trustStore=<truststore> -Djavax.net.ssl.trustStorePassword=<password>
# or import the MySQL cert into the global Java cacerts.
# MODE must be one of DISABLED, PREFERRED, REQUIRED, VERIFY_CA, or VERIFY_IDENTITY
#
# turns on ssl for the maxwell-store connection, other connections inherit this setting unless specified
#ssl=DISABLED
# for binlog-connector
#replication_ssl=DISABLED
# for the schema-capture connection, if used
#schema_ssl=DISABLED
# maxwell can optionally replicate from a different server than where it stores
# schema and binlog position info. Specify that different server here:
#replication_host=other
#replication_user=username
#replication_password=password
#replication_port=3306
# This may be useful when using MaxScale's binlog mirroring host.
# Specifies that Maxwell should capture schema from a different server than
# it replicates from:
#schema_host=other
#schema_user=username
#schema_password=password
#schema_port=3306
# *** output format ***
# records include binlog position (default false)
#output_binlog_position=true
# records include a gtid string (default false)
#output_gtid_position=true
# records include fields with null values (default true). If this is false,
# fields where the value is null will be omitted entirely from output.
#output_nulls=true
# records include server_id (default false)
#output_server_id=true
# records include thread_id (default false)
#output_thread_id=true
# records include schema_id (default false)
#output_schema_id=true
# records include row query, binlog option "binlog_rows_query_log_events" must be enabled (default false)
#output_row_query=true
# DML records include list of values that make up a row's primary key (default false)
#output_primary_keys=true
# DML records include list of columns that make up a row's primary key (default false)
#output_primary_key_columns=true
# records include commit and xid (default true)
#output_commit_info=true
# This controls whether maxwell will output JSON information containing
# DDL (ALTER/CREATE TABLE/ETC) information. (default: false)
# See also: ddl_kafka_topic
#output_ddl=true
# turns underscore naming style of fields to camel case style in JSON output
# default is none, which means the field name in JSON is the exact name in MySQL table
#output_naming_strategy=underscore_to_camelcase
# *** kafka ***
# list of kafka brokers
#kafka.bootstrap.servers=hosta:9092,hostb:9092
# kafka topic to write to
# this can be static, e.g. 'maxwell', or dynamic, e.g. namespace_%{database}_%{table}
# in the latter case 'database' and 'table' will be replaced with the values for the row being processed
#kafka_topic=maxwell
# alternative kafka topic to write DDL (alter/create/drop) to. Defaults to kafka_topic
#ddl_kafka_topic=maxwell_ddl
# hash function to use. "default" is just the JVM's 'hashCode' function.
#kafka_partition_hash=default # [default, murmur3]
# how maxwell writes its kafka key.
#
# 'hash' looks like:
# {"database":"test","table":"tickets","pk.id":10001}
#
# 'array' looks like:
# ["test","tickets",[{"id":10001}]]
#
# default: "hash"
#kafka_key_format=hash # [hash, array]
# extra kafka options. Anything prefixed "kafka." will get
# passed directly into the kafka-producer's config.
# a few defaults.
# These are 0.11-specific. They may or may not work with other versions.
kafka.compression.type=snappy
kafka.retries=0
kafka.acks=1
#kafka.batch.size=16384
# kafka+SSL example
# kafka.security.protocol=SSL
# kafka.ssl.truststore.location=/var/private/ssl/kafka.client.truststore.jks
# kafka.ssl.truststore.password=test1234
# kafka.ssl.keystore.location=/var/private/ssl/kafka.client.keystore.jks
# kafka.ssl.keystore.password=test1234
# kafka.ssl.key.password=test1234#
# controls a heuristic check that maxwell may use to detect messages that
# we never heard back from. The heuristic check looks for "stuck" messages, and
# will timeout maxwell after this many milliseconds.
#
# See https://github.com/zendesk/maxwell/blob/master/src/main/java/com/zendesk/maxwell/producer/InflightMessageList.java
# if you really want to get into it.
#producer_ack_timeout=120000 # default 0
# *** partitioning ***
# What part of the data do we partition by?
#producer_partition_by=database # [database, table, primary_key, transaction_id, thread_id, column]
# specify what fields to partition by when using producer_partition_by=column
# comma separated list.
#producer_partition_columns=id,foo,bar
# when using producer_partition_by=column, partition by this when
# the specified column(s) don't exist.
#producer_partition_by_fallback=database
# *** kinesis ***
#kinesis_stream=maxwell
# AWS places a 256 unicode character limit on the max key length of a record
# http://docs.aws.amazon.com/kinesis/latest/APIReference/API_PutRecord.html
#
# Setting this option to true enables hashing the key with the md5 algorithm
# before we send it to kinesis so all the keys work within the key size limit.
# Values: true, false
# Default: false
#kinesis_md5_keys=true
# *** sqs ***
#sqs_queue_uri=aws_sqs_queue_uri
# The sqs producer will need aws credentials configured in the default
# root folder and file format. Please check below link on how to do it.
# http://docs.aws.amazon.com/sdk-for-java/v1/developer-guide/setup-credentials.html
# *** pub/sub ***
#pubsub_project_id=maxwell
#pubsub_topic=maxwell
#ddl_pubsub_topic=maxwell_ddl
# *** rabbit-mq ***
#rabbitmq_host=rabbitmq_hostname
#rabbitmq_port=5672
#rabbitmq_user=guest
#rabbitmq_pass=guest
#rabbitmq_virtual_host=/
#rabbitmq_exchange=maxwell
#rabbitmq_exchange_type=fanout
#rabbitmq_exchange_durable=false
#rabbitmq_exchange_autodelete=false
#rabbitmq_routing_key_template=%db%.%table%
#rabbitmq_message_persistent=false
#rabbitmq_declare_exchange=true
# *** redis ***
#redis_host=redis_host
#redis_port=6379
#redis_auth=redis_auth
#redis_database=0
# name of pubsub/list/whatever key to publish to
#redis_key=maxwell
# this can be static, e.g. 'maxwell', or dynamic, e.g. namespace_%{database}_%{table}
#redis_pub_channel=maxwell
# this can be static, e.g. 'maxwell', or dynamic, e.g. namespace_%{database}_%{table}
#redis_list_key=maxwell
# this can be static, e.g. 'maxwell', or dynamic, e.g. namespace_%{database}_%{table}
# Valid values for redis_type = pubsub|lpush. Defaults to pubsub
#redis_type=pubsub
# *** custom producer ***
# the fully qualified class name for custom ProducerFactory
# see the following link for more details.
# http://maxwells-daemon.io/producers/#custom-producer
#custom_producer.factory=
# custom producer properties can be configured using the custom_producer.* property namespace
#custom_producer.custom_prop=foo
# *** filtering ***
# filter rows out of Maxwell's output. Comma separated list of filter-rules, evaluated in sequence.
# A filter rule is:
# <type> ":" <db> "." <tbl> [ "." <col> "=" <col_val> ]
# type ::= [ "include" | "exclude" | "blacklist" ]
# db ::= [ "/regexp/" | "string" | "`string`" | "*" ]
# tbl ::= [ "/regexp/" | "string" | "`string`" | "*" ]
# col_val ::= "column_name"
#
# See http://maxwells-daemon.io/filtering for more details
#
#filter= exclude: *.*, include: foo.*, include: bar.baz, include: foo.bar.col_eg = "value_to_match"
# javascript filter
# maxwell can run a bit of javascript for each row if you need very custom filtering/data munging.
# See http://maxwells-daemon.io/filtering/#javascript_filters for more details
#
#javascript=/path/to/javascript_filter_file
# *** encryption ***
# Encryption mode. Possible values are none, data, and all. (default none)
#encrypt=none
# Specify the secret key to be used
#secret_key=RandomInitVector
# *** monitoring ***
# Maxwell collects metrics via dropwizard. These can be exposed through the
# base logging mechanism (slf4j), JMX, HTTP or pushed to Datadog.
# Options: [jmx, slf4j, http, datadog]
# Supplying multiple is allowed.
#metrics_type=jmx,slf4j
# The prefix maxwell will apply to all metrics
#metrics_prefix=MaxwellMetrics # default MaxwellMetrics
# Enable (dropwizard) JVM metrics, default false
#metrics_jvm=true
# When metrics_type includes slf4j this is the frequency metrics are emitted to the log, in seconds
#metrics_slf4j_interval=60
# When metrics_type includes http or diagnostic is enabled, this is the port the server will bind to.
#http_port=8080
# When metrics_type includes http or diagnostic is enabled, this is the http path prefix, default /.
#http_path_prefix=/some/path/
# ** The following are Datadog specific. **
# When metrics_type includes datadog this is the way metrics will be reported.
# Options: [udp, http]
# Supplying multiple is not allowed.
#metrics_datadog_type=udp
# datadog tags that should be supplied
#metrics_datadog_tags=tag1:value1,tag2:value2
# The frequency metrics are pushed to datadog, in seconds
#metrics_datadog_interval=60
# required if metrics_datadog_type = http
#metrics_datadog_apikey=API_KEY
# required if metrics_datadog_type = udp
#metrics_datadog_host=localhost # default localhost
#metrics_datadog_port=8125 # default 8125
# Maxwell exposes http diagnostic endpoint to check below in parallel:
# 1. binlog replication lag
# 2. producer (currently kafka) lag
# To enable Maxwell diagnostic
#http_diagnostic=true # default false
# Diagnostic check timeout in milliseconds, required if diagnostic = true
#http_diagnostic_timeout=10000 # default 10000
# *** misc ***
# maxwell's bootstrapping functionality has a couple of modes.
#
# In "async" mode, maxwell will output the replication stream while it
# simultaneously outputs the database to the topic. Note that it won't
# output replication data for any tables it is currently bootstrapping -- this
# data will be buffered and output after the bootstrap is complete.
#
# In "sync" mode, maxwell stops the replication stream while it
# outputs bootstrap data.
#
# async mode keeps ops live while bootstrapping, but carries the possibility of
# data loss (due to buffering transactions). sync mode is safer but you
# have to stop replication.
#bootstrapper=async [sync, async, none]
# output filename when using the "file" producer
#output_file=/path/to/file
自动安装脚本
common.sh
# Shared configuration sourced by the auto-install scripts: credentials,
# package locations, install paths and per-component host lists.
# NOTE(review): the original comment here claimed "exit immediately on error",
# but `set -x` only enables command tracing (debug echo of each command);
# it is `set -e` and `set -o pipefail` below that abort the script on failure.
set -x
set -e
set -o pipefail
# Refuse to run unless invoked as root.
if [[ "$(whoami)" != "root" ]]; then
echo "请使用root用户运行此脚本。"
exit 1
fi
# Credentials for the root account and the regular deploy user.
# NOTE(review): plaintext passwords in a sourced file — lab use only.
root_password="123456"
new_user="atguigu"
new_user_password="123456"
# Replace 'mysql_password' with your actual MySQL root password.
mysql_user="root"
mysql_password="000000"
# Location of the JDK install package
jdk_package_path="/opt/software/jdk-8u212-linux-x64.tar.gz"
# Location of the Zookeeper install package
zookeeper_package_path="/opt/software/apache-zookeeper-3.7.1-bin.tar.gz"
# Location of the Hadoop install package
hadoop_package_path="/opt/software/hadoop-3.3.4.tar.gz"
# Location of the Kafka install package
kafka_package_path="/opt/software/kafka_2.12-3.3.1.tgz"
# Location of the Flume install package
flume_package_path="/opt/software/apache-flume-1.10.1-bin.tar.gz"
# Location of the Hive install package
hive_package_path="/opt/software/hive-3.1.3.tar.gz"
# Location of the Spark install package
spark_package_path="/opt/software/spark-3.3.1-bin-without-hadoop.tgz"
# Location of the Maxwell install package
maxwell_package_path="/opt/software/maxwell-1.29.2.tar.gz"
# Location of the DataX install package
datax_package_path="/opt/software/datax.tar.gz"
# Packages are copied into this directory on every target host
software_directory="/opt/software/"
# Packages are extracted into this directory
module_path="/opt/module"
# Directory holding the automation scripts
autoshell_path="/opt/auto_script"
# Database for which binlog is enabled; adjust to your environment
binlog_do_db="edu"
# Full list of cluster hosts (passwords/usernames are defined above)
servers=(
"hadoop100"
"hadoop101"
"hadoop102"
"hadoop103"
"hadoop104"
)
# Hosts that run Zookeeper
zookeeper_servers=(
"hadoop102"
"hadoop103"
"hadoop104"
)
# Hosts that run JournalNode
JournalNode_servers=(
"hadoop102"
"hadoop103"
"hadoop104"
)
# Hosts that run NameNode
NameNode_servers=(
"hadoop100"
"hadoop101"
)
# Hosts that run DataNode
DataNode_servers=(
"hadoop102"
"hadoop103"
"hadoop104"
)
# Hosts that run Kafka
kafka_servers=(
"hadoop102"
"hadoop103"
"hadoop104"
)
# Hosts that run Flume
flume_servers=(
"hadoop100"
"hadoop101"
)
# Hosts that run Hive
hive_servers=(
"hadoop100"
"hadoop101"
"hadoop102"
"hadoop103"
"hadoop104"
)
# Hosts that run hiveserver2
hiveserver2_servers=(
"hadoop100"
)
# Hosts that run the Hive metastore
metastore_servers=(
"hadoop101"
)
# Hosts that run MySQL
MySQL_servers=(
"hadoop100"
)
# Hosts that run Spark
spark_servers=(
"hadoop100"
"hadoop102"
"hadoop103"
"hadoop104"
)
# Hosts that run Maxwell
maxwell_servers=(
"hadoop100"
)
# Hosts that run DataX
datax_servers=(
"hadoop100"
)
# All software packages are copied from this host
master_server="${servers[0]}"
maxwell_conf目录
把config.properties配置文件放到此目录中
10_auto_maxwell_install.sh
#!/bin/bash
# Install Maxwell on the hosts in ${maxwell_servers[@]} and prepare the
# MySQL hosts (${MySQL_servers[@]}): enable row-format binlog for
# ${binlog_do_db}, (re)create the maxwell metadata database and replication
# user, then deploy the prepared config.properties.
# Host lists, paths, credentials and strict-mode settings come from common.sh.
source ./common.sh

# Directory name produced by extracting the tarball,
# e.g. maxwell-1.29.2.tar.gz -> maxwell-1.29.2
maxwell_version1=$(basename "${maxwell_package_path}" | awk -F'.tar.gz' '{print $1}')
# Version-independent directory name the install is renamed to.
maxwell_version="maxwell"

# Ensure /etc/my.cnf on ${server} contains exactly one up-to-date copy of a
# configuration line: delete the first line matching $1 (if any), then append $2.
#   $1 - awk pattern identifying the existing line
#   $2 - replacement line to append
# Reads ${server}, ${root_password}, ${new_user}, ${new_user_password} from
# the caller's scope.
set_mycnf_option() {
  local pattern=$1
  local line=$2
  local line_number
  line_number=$(sshpass -p "${root_password}" ssh -o StrictHostKeyChecking=no "root@${server}" "awk '/${pattern}/{print NR; exit}' /etc/my.cnf")
  if [[ -n "${line_number}" ]]; then
    # Remove the stale setting so the appended value is the only one in effect.
    sshpass -p "${root_password}" ssh -o StrictHostKeyChecking=no "root@${server}" "sed -i \"${line_number}d\" /etc/my.cnf"
  fi
  sshpass -p "${new_user_password}" ssh -o StrictHostKeyChecking=no "${new_user}@${server}" "echo '${line}' | sudo tee -a /etc/my.cnf"
}

# Distribute the Maxwell package from the master host to every target host.
for server in "${maxwell_servers[@]}"; do
  sshpass -p "${new_user_password}" ssh -o StrictHostKeyChecking=no "${new_user}@${master_server}" "rsync -av ${maxwell_package_path} ${new_user}@${server}:${software_directory}"
done

# Remove any previous Maxwell installation.
for server in "${maxwell_servers[@]}"; do
  sshpass -p "${new_user_password}" ssh -o StrictHostKeyChecking=no "${new_user}@${server}" "sudo rm -rf ${module_path}/maxwell*"
done

# Extract the package on every target host.
for server in "${maxwell_servers[@]}"; do
  echo "开始解压 ${server}"
  sshpass -p "${new_user_password}" ssh -o StrictHostKeyChecking=no "${new_user}@${server}" "tar -zxf ${maxwell_package_path} -C ${module_path}/"
done

# Rename the extracted directory to the version-independent name.
for server in "${maxwell_servers[@]}"; do
  sshpass -p "${new_user_password}" ssh -o StrictHostKeyChecking=no "${new_user}@${server}" "mv ${module_path}/${maxwell_version1} ${module_path}/${maxwell_version}"
done

# Configure binlog-related settings in /etc/my.cnf on the MySQL hosts.
for server in "${MySQL_servers[@]}"; do
  set_mycnf_option 'server-id' 'server-id = 1'
  set_mycnf_option 'log-bin' 'log-bin=mysql-bin'
  set_mycnf_option 'binlog_format' 'binlog_format=row'
  set_mycnf_option 'binlog-do-db' "binlog-do-db=${binlog_do_db}"
done

# Restart MySQL so the binlog settings take effect.
for server in "${MySQL_servers[@]}"; do
  sshpass -p "${new_user_password}" ssh -o StrictHostKeyChecking=no "${new_user}@${server}" "sudo systemctl restart mysqld"
done

# (Re)create the maxwell metadata database and replication user.
# Drop/create steps are best-effort (|| true) so a fresh install also works.
for server in "${MySQL_servers[@]}"; do
  sshpass -p "${new_user_password}" ssh -o StrictHostKeyChecking=no "${new_user}@${server}" "mysql -u${mysql_user} -p${mysql_password} -e \"drop database maxwell;\"" 2>/dev/null || true
  sshpass -p "${root_password}" ssh -o StrictHostKeyChecking=no "root@${server}" "mysql -u${mysql_user} -p${mysql_password} -e \"CREATE DATABASE maxwell;\"" 2>/dev/null || true
  sshpass -p "${new_user_password}" ssh -o StrictHostKeyChecking=no "${new_user}@${server}" "mysql -u${mysql_user} -p${mysql_password} -e \"drop USER maxwell;\"" 2>/dev/null || true
  sshpass -p "${root_password}" ssh -o StrictHostKeyChecking=no "root@${server}" "mysql -u${mysql_user} -p${mysql_password} -e \"CREATE USER 'maxwell'@'%' IDENTIFIED BY 'maxwell';\"" 2>/dev/null || true
  sshpass -p "${root_password}" ssh -o StrictHostKeyChecking=no "root@${server}" "mysql -u${mysql_user} -p${mysql_password} -e \"GRANT ALL ON maxwell.* TO 'maxwell'@'%';\""
  sshpass -p "${root_password}" ssh -o StrictHostKeyChecking=no "root@${server}" "mysql -u${mysql_user} -p${mysql_password} -e \"GRANT SELECT, REPLICATION CLIENT, REPLICATION SLAVE ON *.* TO 'maxwell'@'%';\""
done

# Create config.properties from the shipped example...
for server in "${maxwell_servers[@]}"; do
  sshpass -p "${new_user_password}" ssh -o StrictHostKeyChecking=no "${new_user}@${server}" "cp ${module_path}/${maxwell_version}/config.properties.example ${module_path}/${maxwell_version}/config.properties"
done

# ...then overwrite it with the prepared configuration from the master host.
for server in "${maxwell_servers[@]}"; do
  sshpass -p "${new_user_password}" ssh -o StrictHostKeyChecking=no "${new_user}@${server}" "scp ${new_user}@${master_server}:${autoshell_path}/maxwell_conf/config.properties ${module_path}/${maxwell_version}/"
done

echo "maxwell 安装成功"
启动脚本
run_common.sh
# Shared configuration sourced by the startup scripts.
# NOTE(review): the original comment here claimed "exit immediately on error",
# but `set -x` only enables command tracing; `set -e` and `set -o pipefail`
# are what make the script abort on failures.
set -x
set -e
set -o pipefail
# Replace these values with your MySQL username and password.
mysql_user="root"
mysql_password="000000"
DATABASE_NAME="gmall"
# Full list of cluster hosts
servers=(
"hadoop102"
"hadoop103"
"hadoop104"
)
# Hosts that run Zookeeper
zookeeper_servers=(
"hadoop102"
"hadoop103"
"hadoop104"
)
# Hosts that run Kafka
kafka_servers=(
"hadoop102"
"hadoop103"
"hadoop104"
)
# Hosts that run NameNode
NameNode_servers=(
"hadoop102"
)
# Hosts that run MySQL
MySQL_servers=(
"hadoop102"
)
mxw.sh
#!/bin/bash
# Maxwell control script: mxw.sh {start|stop|restart}
MAXWELL_HOME=/opt/module/maxwell

# Print the number of running Maxwell processes on stdout.
# (The original passed the count back via the exit status, which wraps at
# 256 and relies on reading $? before anything else runs; printing the count
# and capturing it with $(...) is unambiguous.)
maxwell_count(){
    ps -ef | grep com.zendesk.maxwell.Maxwell | grep -v grep | wc -l
}

# Start Maxwell as a daemon unless it is already running.
start_maxwell(){
    if [[ "$(maxwell_count)" -lt 1 ]]; then
        echo "启动Maxwell"
        "$MAXWELL_HOME"/bin/maxwell --config "$MAXWELL_HOME"/config.properties --daemon
    else
        echo "Maxwell正在运行"
    fi
}

# Stop every running Maxwell process.
stop_maxwell(){
    if [[ "$(maxwell_count)" -gt 0 ]]; then
        echo "停止Maxwell"
        # NOTE(review): kill -9 gives Maxwell no chance to shut down cleanly;
        # kept for behavioral compatibility with the original script.
        ps -ef | grep com.zendesk.maxwell.Maxwell | grep -v grep | awk '{print $2}' | xargs kill -9
    else
        echo "Maxwell未在运行"
    fi
}

case "$1" in
start )
    start_maxwell
;;
stop )
    stop_maxwell
;;
restart )
    stop_maxwell
    start_maxwell
;;
* )
    echo "Usage: $0 {start|stop|restart}"
;;
esac